Pretraining Llama Model on Intel Gaudi#
In this Jupyter notebook, we pretrain a huggyllama/llama-7b model on Intel Gaudi accelerators.
We use PyTorch for model training and Ray for distributed training.
Intel Gaudi AI Processors (HPUs) are AI hardware accelerators designed by Habana Labs. For more information, see Gaudi Architecture and Gaudi Developer Documentation.
The basic features of this pretraining example are:
- Running on HPUs, with three execution modes supported: "lazy", "eager", and "eager.compile".
- Pretraining a Llama model with the huggyllama/llama-7b configuration.
- GaudiTrainer-based training.
- DeepSpeed-based pretraining.
- Ray-based resource scheduling and management.
Prepare the environment#
This example runs on a single node with 4 HPUs.
We recommend using a prebuilt container to run these examples. To run a container, you need Docker. See Install Docker Engine for installation instructions.
Next, follow Run Using Containers to install the Habana drivers and container runtime.
Get the docker image#
# more available docker images can be found here: https://vault.habana.ai/ui/native/gaudi-docker
docker pull vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
Run the docker image#
docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
# you may also want to map your workspace volumes into the container, e.g. with -v <host_dir>:<container_dir>
Install dependencies#
# "optimum-habana>1.11.1" if exection mode "eager" or "eager.compile"
# "ray>=2.20.0"
pip install ray[train] notebook transformers datasets evaluate peft accelerate scikit-learn optimum-habana
# install deepspeed
pip install git+https://github.com/HabanaAI/[email protected]
# this notebook was verified with the following package versions:
# transformers==4.45.2
# datasets==3.3.2
# evaluate==0.4.3
# peft==0.14.0
# accelerate==0.33.0
# scikit-learn==1.6.1
# optimum-habana==1.15.0
# deepspeed==0.16.1+hpu.synapse.v1.20.0
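Before moving on, it can be worth a quick sanity check that the HPUs are visible from Python. This is a minimal sketch that assumes the Habana PyTorch bridge shipped with the container above:
# quick check that the Habana PyTorch bridge can see the HPUs
import habana_frameworks.torch.hpu as hthpu
print("HPU available:", hthpu.is_available())
print("HPU device count:", hthpu.device_count())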
Import necessary libraries#
#!/usr/bin/env python
import os
from typing import Any, Dict
from torch.utils.data import DataLoader
import transformers
from itertools import chain
from datasets import load_dataset
from transformers import default_data_collator
from transformers.testing_utils import CaptureLogger
from optimum.habana import GaudiConfig, GaudiTrainer, GaudiTrainingArguments
from optimum.habana.utils import set_seed
Build the datasets#
Download and load the datasets from huggingface.co.
def load_datasets(config):
    dataset_name = config["name"]
    dataset_config_name = config["config_name"]
    # Percentage of the train split to carve out as validation when the dataset
    # has no validation split of its own (an assumed default of 5%).
    validation_split_percentage = config.get("validation_split_percentage", 5)
# Downloading and loading a dataset from the hub.
raw_datasets = load_dataset(
dataset_name,
dataset_config_name,
cache_dir=None,
token=None,
streaming=False,
)
if "validation" not in raw_datasets.keys():
raw_datasets["validation"] = load_dataset(
dataset_name,
dataset_config_name,
split=f"train[:{data_args.validation_split_percentage}%]",
cache_dir=None,
token=None,
streaming=False,
)
raw_datasets["train"] = load_dataset(
dataset_name,
dataset_config_name,
split=f"train[{data_args.validation_split_percentage}%:]",
cache_dir=None,
token=None,
streaming=False,
)
return raw_datasets
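As a quick usage sketch (using the same wikitext configuration that appears later in this notebook), the function can be called directly to inspect the resulting splits:
# example invocation; wikitext-2-raw-v1 already ships train/validation/test splits
raw_datasets = load_datasets({"name": "wikitext", "config_name": "wikitext-2-raw-v1"})
print(raw_datasets)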
Load the tokenizer#
Download the vocabulary from huggingface.co.
def load_tokenizer(config):
name = config["name"]
tokenizer_kwargs = {
"cache_dir": None,
"use_fast": True,
"revision": "main",
"token": None,
"trust_remote_code": False,
}
return transformers.AutoTokenizer.from_pretrained(name, **tokenizer_kwargs)
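For a quick check, the tokenizer can be exercised on its own. This is a small sketch; downloading huggyllama/llama-7b requires network access to huggingface.co:
# load the Llama tokenizer and encode a short sentence
tokenizer = load_tokenizer({"name": "huggyllama/llama-7b"})
print(tokenizer("Ray Train on Intel Gaudi")["input_ids"])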
Tokenize the datasets#
Tokenize the words into token ids.
def tokenize_dataset(datasets, tokenizer):
column_names = list(datasets["train"].features)
text_column_name = "text" if "text" in column_names else column_names[0]
tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base")
def tokenize_function(examples):
with CaptureLogger(tok_logger) as cl:
output = tokenizer(examples[text_column_name])
# clm input could be much much longer than block_size
if "Token indices sequence length is longer than the" in cl.out:
tok_logger.warning(
"^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits"
" before being passed to the model."
)
return output
tokenized_datasets = datasets.map(
tokenize_function,
batched=True,
num_proc=None,
remove_columns=column_names,
load_from_cache_file=True,
desc="Running tokenizer on dataset",
)
return tokenized_datasets
Group the datasets#
This preprocessing concatenates all the texts in the dataset and generates chunks of block_size, which significantly speeds up model pretraining.
def group_dataset(config, datasets, tokenizer):
config_name = config["name"]
auto_config = transformers.AutoConfig.from_pretrained(config_name)
max_pos_embeddings = auto_config.max_position_embeddings
block_size = tokenizer.model_max_length
if block_size > max_pos_embeddings:
print(
f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
f"Using block_size={min(1024, max_pos_embeddings)} instead. You can change that default value by passing --block_size xxx."
)
if max_pos_embeddings > 0:
block_size = min(1024, max_pos_embeddings)
else:
block_size = 1024
# Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
def group_texts(examples):
# Concatenate all texts.
concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
total_length = len(concatenated_examples[list(examples.keys())[0]])
# We drop the small remainder, and if the total_length < block_size we exclude this batch and return an empty dict.
# We could add padding if the model supported it instead of this drop, you can customize this part to your needs.
total_length = (total_length // block_size) * block_size
# Split by chunks of max_len.
result = {
k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
for k, t in concatenated_examples.items()
}
result["labels"] = result["input_ids"].copy()
return result
lm_datasets = datasets.map(
group_texts,
batched=True,
num_proc=None,
load_from_cache_file=True,
desc=f"Grouping texts in chunks of {block_size}",
)
return lm_datasets
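To make the chunking behaviour concrete, here is a small self-contained illustration of the same grouping logic on toy token ids (hypothetical values, with a tiny block size chosen only for readability):
# toy illustration of the group_texts logic with block_size = 4
from itertools import chain

examples = {"input_ids": [[1, 2, 3], [4, 5, 6, 7], [8, 9]]}
block_size = 4
concatenated = {k: list(chain(*v)) for k, v in examples.items()}
total_length = (len(concatenated["input_ids"]) // block_size) * block_size
chunks = [concatenated["input_ids"][i : i + block_size] for i in range(0, total_length, block_size)]
print(chunks)  # [[1, 2, 3, 4], [5, 6, 7, 8]] -- the trailing remainder [9] is dropped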
Load the model#
Download the model configuration from huggingface.co and build the model from it; the detailed model configuration is in config.json. Because the model is instantiated from the configuration only, its weights are randomly initialized, which is what we want for pretraining.
def load_model(config):
name = config["name"]
model_config = config.get("config", {})
auto_config = transformers.AutoConfig.from_pretrained(
pretrained_model_name_or_path=name, **model_config
)
model = transformers.AutoModelForCausalLM.from_config(auto_config, trust_remote_code=False)
return model
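If you only want to inspect the architecture before building the full 7B model, you can load the configuration on its own (a small sketch; the fields shown are standard LlamaConfig attributes):
# inspect the pretraining architecture without instantiating the 7B model
cfg = transformers.AutoConfig.from_pretrained("huggyllama/llama-7b")
print(cfg.hidden_size, cfg.num_hidden_layers, cfg.num_attention_heads, cfg.vocab_size)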
Prepare the trainer#
Instantiate the Trainer with model, gaudi_config, training_args, and tokenizer.
No evaluation dataset is passed; only training is performed.
def get_trainer(training_args, datasets, tokenizer, model):
gaudi_config = GaudiConfig.from_pretrained(
training_args.gaudi_config_name, revision="main",
)
trainer = GaudiTrainer(
model=model,
gaudi_config=gaudi_config,
args=training_args,
train_dataset=datasets["train"],
eval_dataset=None,
tokenizer=tokenizer,
data_collator=default_data_collator,
)
return trainer
Training function#
This function is executed by each worker during training, with the following steps:
- Prepare a GaudiTrainingArguments object.
- Load the datasets from huggingface.co.
- Load the pre-configured tokenizer from huggingface.co.
- Tokenize the datasets with the loaded tokenizer.
- Concatenate all texts in the dataset and generate chunks of block_size.
- Instantiate a GaudiTrainer object with training_args, datasets, tokenizer, and model.
- Call the train method of the trainer.
- Save the model.
def pretrain_llama(config: Dict[str, Any]):
training_args = GaudiTrainingArguments(**config["training_args"])
set_seed(training_args.seed)
raw_datasets = load_datasets(config["datasets"])
tokenizer = load_tokenizer(config["tokenizer"])
tokenized_datasets = tokenize_dataset(raw_datasets, tokenizer)
tokenized_datasets = group_dataset(config["model"], tokenized_datasets, tokenizer)
model = load_model(config["model"])
trainer = get_trainer(training_args, tokenized_datasets, tokenizer, model)
result = trainer.train()
trainer.save_model()
print(result)
Main training function#
The main function sets up the distributed training environment with Ray and launches the training process. To train with HPUs, we only need to make the following changes:
- Set the execution mode for training. The supported execution modes are:
  - "lazy": Deferred execution of graphs, consisting of ops delivered from the script op by op, similar to Eager mode. It gives the Eager-mode experience with good performance on Gaudi. Unlike Eager mode with torch.compile, the graph is analyzed at each iteration, leading to higher CPU usage.
  - "eager": Op-by-op execution as defined in a standard PyTorch Eager mode script.
  - "eager.compile": Eager mode extended with torch.compile - similar to Eager mode, but extended by wrapping the whole model, or parts of it (such as functions), into a graph. The parts that are not wrapped are executed eagerly.
- Require an HPU for each worker in ScalingConfig (a quick way to confirm that Ray detects the HPUs is shown right after this list).
- Set the backend to hccl in TorchConfig.
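Before launching the trainer, you can check that Ray registers the HPUs on the node. This is a small, optional check; on a 4-HPU node the output should include an "HPU" entry if Ray detected the accelerators:
# confirm that Ray sees the HPU resources before starting training
import ray
ray.init()
print(ray.available_resources())  # expect an "HPU": 4.0 entry on a 4-HPU node
ray.shutdown()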
def main(num_workers, execution_mode):
import ray
from ray.train import ScalingConfig
from ray.train.torch import TorchTrainer, TorchConfig
pretrain_config = {
"datasets": {
"name": "wikitext",
"config_name": "wikitext-2-raw-v1",
},
"tokenizer": {
"name": "huggyllama/llama-7b",
"config": {}
},
"model": {
"name": "huggyllama/llama-7b",
"config": {
"torch_dtype": "bfloat16",
},
},
"training_args": {
"per_device_train_batch_size": 1,
"do_train": True,
"save_strategy": "no",
"output_dir": "/tmp/ray/pretrain-llama-2",
"logging_steps": 1,
"gaudi_config_name": "Habana/llama",
"use_habana": True,
"throughput_warmup_steps": 3,
"use_lazy_mode": True,
"overwrite_output_dir": True,
"seed": 42,
"bf16": True,
"report_to":'tensorboard',
"deepspeed": {
"steps_per_print": 64,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"gradient_accumulation_steps": "auto",
"bf16": {
"enabled": True
},
"gradient_clipping": 1.0,
"zero_optimization": {
"stage": 3,
"overlap_comm": False,
"reduce_scatter": False,
"contiguous_gradients": False,
"stage3_gather_16bit_weights_on_model_save": True
}
},
},
}
    # if the execution mode is eager with compile, a compile backend must be specified
if execution_mode == "eager.compile":
pretrain_config["training_args"].update({"torch_compile_backend": "hpu_backend"})
scaling_config = ScalingConfig(num_workers=num_workers,
use_gpu=False,
resources_per_worker={"CPU": 1, "HPU": 1})
# Set backend to hccl in TorchConfig
torch_config = TorchConfig(backend="hccl")
ray.init()
# Initialize a Ray TorchTrainer
trainer = TorchTrainer(
train_loop_per_worker=pretrain_llama,
train_loop_config=pretrain_config,
torch_config=torch_config,
scaling_config=scaling_config
)
result = trainer.fit()
print(result)
Start the training#
Finally, we call the main function to start the pretraining process.
Before calling the main function, you must set some environment variables.
- Visible devices. The environment variables HABANA_VISIBLE_DEVICES and HABANA_VISIBLE_MODULES are used to control which HPU devices are visible to the application; you must set these two environment variables properly. For more detailed usage of HABANA_VISIBLE_DEVICES and HABANA_VISIBLE_MODULES, please visit here.
- Execution mode. Different execution modes have different runtime performance. The default execution mode is lazy mode.
# set some environment variables
os.environ["RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES"] = "0"
# if RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES is set to "1", Ray does not set
# HABANA_VISIBLE_MODULES for the workers, so you must set it yourself, such as
# os.environ["HABANA_VISIBLE_MODULES"] = "0,1,2,3"
# supported execution_mode values are ["lazy", "eager", "eager.compile"]
execution_mode = "lazy"
os.environ["PT_HPU_LAZY_MODE"] = "1" if execution_mode == "lazy" else "0"
main(num_workers=4, execution_mode=execution_mode)
Possible outputs#
...
(RayTrainWorker pid=36561) Setting up process group for: env:// [rank=0, world_size=4]
(TorchTrainer pid=36054) Started distributed worker processes:
(TorchTrainer pid=36054) - (node_id=409da2dba1dc3e5b8e58a2b766a4a19d90e7879c28c2fb13644148b8, ip=100.83.111.228, pid=36561) world_rank=0, local_rank=0, node_rank=0
(TorchTrainer pid=36054) - (node_id=409da2dba1dc3e5b8e58a2b766a4a19d90e7879c28c2fb13644148b8, ip=100.83.111.228, pid=36562) world_rank=1, local_rank=1, node_rank=0
(TorchTrainer pid=36054) - (node_id=409da2dba1dc3e5b8e58a2b766a4a19d90e7879c28c2fb13644148b8, ip=100.83.111.228, pid=36563) world_rank=2, local_rank=2, node_rank=0
(TorchTrainer pid=36054) - (node_id=409da2dba1dc3e5b8e58a2b766a4a19d90e7879c28c2fb13644148b8, ip=100.83.111.228, pid=36564) world_rank=3, local_rank=3, node_rank=0
...
(RayTrainWorker pid=36561) ============================= HABANA PT BRIDGE CONFIGURATION ===========================
(RayTrainWorker pid=36561) PT_HPU_LAZY_MODE = 1
(RayTrainWorker pid=36561) PT_HPU_RECIPE_CACHE_CONFIG = ,false,1024
(RayTrainWorker pid=36561) PT_HPU_MAX_COMPOUND_OP_SIZE = 9223372036854775807
(RayTrainWorker pid=36561) PT_HPU_LAZY_ACC_PAR_MODE = 0
(RayTrainWorker pid=36561) PT_HPU_ENABLE_REFINE_DYNAMIC_SHAPES = 0
(RayTrainWorker pid=36561) PT_HPU_EAGER_PIPELINE_ENABLE = 1
(RayTrainWorker pid=36561) PT_HPU_EAGER_COLLECTIVE_PIPELINE_ENABLE = 1
(RayTrainWorker pid=36561) PT_HPU_ENABLE_LAZY_COLLECTIVES = 0
(RayTrainWorker pid=36561) ---------------------------: System Configuration :---------------------------
(RayTrainWorker pid=36561) Num CPU Cores : 160
(RayTrainWorker pid=36561) CPU RAM : 1056374420 KB
(RayTrainWorker pid=36561) ------------------------------------------------------------------------------
...
(RayTrainWorker pid=36561) {'loss': 4.1052, 'grad_norm': 2.225008249282837, 'learning_rate': 8.26086956521739e-06, 'epoch': 2.5, 'memory_allocated (GB)': 28.87, 'max_memory_allocated (GB)': 94.26, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=36561) {'loss': 4.0472, 'grad_norm': 2.0701019763946533, 'learning_rate': 8.212560386473431e-06, 'epoch': 2.51, 'memory_allocated (GB)': 28.87, 'max_memory_allocated (GB)': 94.26, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=36561) {'loss': 4.097, 'grad_norm': 2.119075059890747, 'learning_rate': 8.164251207729469e-06, 'epoch': 2.51, 'memory_allocated (GB)': 28.87, 'max_memory_allocated (GB)': 94.26, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=36561) {'loss': 3.7035, 'grad_norm': 2.1802899837493896, 'learning_rate': 8.115942028985508e-06, 'epoch': 2.51, 'memory_allocated (GB)': 28.87, 'max_memory_allocated (GB)': 94.26, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=36561) {'loss': 4.242, 'grad_norm': 1.9516953229904175, 'learning_rate': 8.067632850241547e-06, 'epoch': 2.52, 'memory_allocated (GB)': 28.87, 'max_memory_allocated (GB)': 94.26, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=36561) {'loss': 3.9594, 'grad_norm': 2.0580222606658936, 'learning_rate': 8.019323671497584e-06, 'epoch': 2.52, 'memory_allocated (GB)': 28.87, 'max_memory_allocated (GB)': 94.26, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=36561) {'loss': 4.3415, 'grad_norm': 2.192605495452881, 'learning_rate': 7.971014492753623e-06, 'epoch': 2.52, 'memory_allocated (GB)': 28.87, 'max_memory_allocated (GB)': 94.26, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=36561) {'loss': 3.9739, 'grad_norm': 2.0198025703430176, 'learning_rate': 7.922705314009662e-06, 'epoch': 2.52, 'memory_allocated (GB)': 28.87, 'max_memory_allocated (GB)': 94.26, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=36561) {'loss': 4.1624, 'grad_norm': 2.0957565307617188, 'learning_rate': 7.874396135265701e-06, 'epoch': 2.53, 'memory_allocated (GB)': 28.87, 'max_memory_allocated (GB)': 94.26, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=36561) {'loss': 3.9744, 'grad_norm': 2.1159448623657227, 'learning_rate': 7.82608695652174e-06, 'epoch': 2.53, 'memory_allocated (GB)': 28.87, 'max_memory_allocated (GB)': 94.26, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=36561) {'loss': 4.1127, 'grad_norm': 2.159834623336792, 'learning_rate': 7.777777777777777e-06, 'epoch': 2.53, 'memory_allocated (GB)': 28.87, 'max_memory_allocated (GB)': 94.26, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=36561) {'loss': 4.0588, 'grad_norm': 2.106534004211426, 'learning_rate': 7.729468599033817e-06, 'epoch': 2.54, 'memory_allocated (GB)': 28.87, 'max_memory_allocated (GB)': 94.26, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=36561) {'loss': 3.8734, 'grad_norm': 2.445814371109009, 'learning_rate': 7.681159420289856e-06, 'epoch': 2.54, 'memory_allocated (GB)': 28.87, 'max_memory_allocated (GB)': 94.26, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=36561) {'loss': 4.0278, 'grad_norm': 2.0376927852630615, 'learning_rate': 7.632850241545895e-06, 'epoch': 2.54, 'memory_allocated (GB)': 28.87, 'max_memory_allocated (GB)': 94.26, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=36561) {'loss': 3.9643, 'grad_norm': 2.1097891330718994, 'learning_rate': 7.584541062801932e-06, 'epoch': 2.54, 'memory_allocated (GB)': 28.87, 'max_memory_allocated (GB)': 94.26, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=36561) {'loss': 4.1384, 'grad_norm': 2.157325267791748, 'learning_rate': 7.536231884057972e-06, 'epoch': 2.55, 'memory_allocated (GB)': 28.87, 'max_memory_allocated (GB)': 94.26, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=36561) {'loss': 3.9982, 'grad_norm': 2.230065107345581, 'learning_rate': 7.48792270531401e-06, 'epoch': 2.55, 'memory_allocated (GB)': 28.87, 'max_memory_allocated (GB)': 94.26, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=36561) {'loss': 4.0988, 'grad_norm': 2.355875015258789, 'learning_rate': 7.439613526570048e-06, 'epoch': 2.55, 'memory_allocated (GB)': 28.87, 'max_memory_allocated (GB)': 94.26, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=36561) {'loss': 4.0514, 'grad_norm': 2.1178295612335205, 'learning_rate': 7.391304347826088e-06, 'epoch': 2.56, 'memory_allocated (GB)': 28.87, 'max_memory_allocated (GB)': 94.26, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=36561) {'loss': 3.9858, 'grad_norm': 2.089723825454712, 'learning_rate': 7.342995169082126e-06, 'epoch': 2.56, 'memory_allocated (GB)': 28.87, 'max_memory_allocated (GB)': 94.26, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=36561) {'loss': 4.1548, 'grad_norm': 2.308490753173828, 'learning_rate': 7.294685990338164e-06, 'epoch': 2.56, 'memory_allocated (GB)': 28.87, 'max_memory_allocated (GB)': 94.26, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=36561) {'loss': 4.0356, 'grad_norm': 1.9994627237319946, 'learning_rate': 7.246376811594203e-06, 'epoch': 2.57, 'memory_allocated (GB)': 28.87, 'max_memory_allocated (GB)': 94.26, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=36561) {'loss': 3.7696, 'grad_norm': 1.9719663858413696, 'learning_rate': 7.1980676328502416e-06, 'epoch': 2.57, 'memory_allocated (GB)': 28.87, 'max_memory_allocated (GB)': 94.26, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=36561) {'loss': 4.0157, 'grad_norm': 2.1598856449127197, 'learning_rate': 7.1497584541062814e-06, 'epoch': 2.57, 'memory_allocated (GB)': 28.87, 'max_memory_allocated (GB)': 94.26, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=36561) {'loss': 4.0113, 'grad_norm': 1.997869849205017, 'learning_rate': 7.10144927536232e-06, 'epoch': 2.57, 'memory_allocated (GB)': 28.87, 'max_memory_allocated (GB)': 94.26, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=36561) {'loss': 4.1048, 'grad_norm': 2.099222183227539, 'learning_rate': 7.053140096618358e-06, 'epoch': 2.58, 'memory_allocated (GB)': 28.87, 'max_memory_allocated (GB)': 94.26, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=36561) {'loss': 4.0048, 'grad_norm': 2.100231885910034, 'learning_rate': 7.004830917874397e-06, 'epoch': 2.58, 'memory_allocated (GB)': 28.87, 'max_memory_allocated (GB)': 94.26, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=36561) {'loss': 4.0302, 'grad_norm': 2.18204402923584, 'learning_rate': 6.956521739130435e-06, 'epoch': 2.58, 'memory_allocated (GB)': 28.87, 'max_memory_allocated (GB)': 94.26, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=36561) {'loss': 3.7227, 'grad_norm': 2.190962553024292, 'learning_rate': 6.908212560386473e-06, 'epoch': 2.59, 'memory_allocated (GB)': 28.87, 'max_memory_allocated (GB)': 94.26, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=36561) {'loss': 4.1111, 'grad_norm': 2.349518060684204, 'learning_rate': 6.859903381642513e-06, 'epoch': 2.59, 'memory_allocated (GB)': 28.87, 'max_memory_allocated (GB)': 94.26, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=36561) {'loss': 4.024, 'grad_norm': 2.5497331619262695, 'learning_rate': 6.811594202898551e-06, 'epoch': 2.59, 'memory_allocated (GB)': 28.87, 'max_memory_allocated (GB)': 94.26, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=36561) {'loss': 3.8844, 'grad_norm': 2.3125178813934326, 'learning_rate': 6.7632850241545894e-06, 'epoch': 2.59, 'memory_allocated (GB)': 28.87, 'max_memory_allocated (GB)': 94.26, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=36561) {'loss': 4.2208, 'grad_norm': 2.1103923320770264, 'learning_rate': 6.7149758454106285e-06, 'epoch': 2.6, 'memory_allocated (GB)': 28.87, 'max_memory_allocated (GB)': 94.26, 'total_memory_available (GB)': 94.62}
...