使用 Intel Gaudi 微调 Llama-2 模型#

在本 Jupyter Notebook 中，我们将

使用 DDP 方法，通过 Intel Gaudi 加速器微调 Llama-2-7b 模型
使用 DeepSpeed 方法，通过 Intel Gaudi 加速器微调 Llama-2-70b 模型

我们将使用 PyTorch 进行模型训练，使用 Ray 进行分布式训练。我们将使用数据集 tatsu-lab/alpaca。

Intel Gaudi AI 处理器 (HPUs) 是 Habana Labs 设计的 AI 硬件加速器。更多信息，请参阅 Gaudi 架构和 Gaudi 开发者文档。

本微调示例的基本特性包括

运行在 HPUs 上，支持三种执行模式：“lazy”, “eager”, “eager.compile”。
LoRA 训练。
基于 DDP 或 DeepSpeed 的方法。
基于 GaudiTrainer 的训练。
Llama-2-7b/Llama-2-70b 模型。
基于 Ray 的资源调度和管理。

准备环境#

本示例在具有 4 个 HPUs 的单个节点上运行。

我们推荐使用预构建的容器来运行这些示例。运行容器需要 Docker，安装说明请参阅“安装 Docker Engine”文档。

接下来，按照“使用容器运行”文档安装 Habana 驱动程序和容器运行时。

获取 Docker 镜像#

docker pull vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest

运行 Docker 镜像#

docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
# You may also want to mount your workspace directory into the container as a volume (e.g. add `-v <host_dir>:<container_dir>`)

安装依赖项#

# Requires "optimum-habana>1.11.1" when the execution mode is "eager" or "eager.compile"
# Requires "ray>=2.20.0"
pip install ray[train] notebook transformers datasets evaluate peft accelerate scikit-learn optimum-habana

# install deepspeed
pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.20.0

# This notebook was verified with the following package versions:
# transformers==4.45.2
# datasets==3.3.2
# evaluate==0.4.3
# peft==0.14.0
# accelerate==0.33.0
# scikit-learn==1.6.1
# optimum-habana==1.15.0

# deepspeed==0.16.1+hpu.synapse.v1.20.0

导入必需的库#

import os
import copy
from typing import Dict

import torch

import datasets
import transformers
from transformers import DataCollatorForLanguageModeling

from tqdm import tqdm

import peft

from optimum.habana import GaudiTrainer, GaudiConfig, GaudiTrainingArguments
from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi

准备数据集函数#

使用指定格式预处理原始数据集的每一行。

def preprocess_dataset(raw_datasets):
    """Rewrite every split of ``raw_datasets`` in place into two prompt columns.

    Each row is rendered into an Alpaca-style prompt. When the call finishes,
    every split holds exactly two columns: ``prompt_sources`` (the rendered
    prompt) and ``prompt_targets`` (the row's ``output`` field). All columns
    that existed beforehand are dropped. The function returns nothing; the
    result is stored back into ``raw_datasets``.
    """
    template_with_input = (
        "Below is an instruction that describes a task, paired with an input that provides further context. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:"
    )
    template_without_input = (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Response:"
    )

    def render_prompt(row):
        # Rows whose "input" field is the empty string use the shorter template.
        if row["input"] != "":
            template = template_with_input
        else:
            template = template_without_input
        return template.format_map(row)

    for split_name in raw_datasets:
        split = raw_datasets[split_name]
        # Remember the pre-existing columns before the prompt columns are added,
        # so that only the two new columns survive.
        original_columns = list(split.features.keys())

        # Single pass over the split: (rendered prompt, expected answer) per row.
        pairs = [(render_prompt(row), row["output"]) for row in split]

        split = split.add_column("prompt_sources", [source for source, _ in pairs])
        split = split.add_column("prompt_targets", [target for _, target in pairs])
        raw_datasets[split_name] = split.remove_columns(original_columns)

数据集到 Tokenizer 函数#

使用模型 tokenizer 对数据集中的每一行进行 tokenize。

在示例代码中，我们连接了数据集的行内容以加快训练速度。

所有数据集都作为“train”数据集处理，不从 raw_datasets 中采样评估数据集。

def preprocess_dataset_to_tokenizer(raw_datasets, tokenizer, max_seq_length=512):
    """Tokenize the prompt columns and pack the "train" split into fixed-length rows.

    Args:
        raw_datasets: Dataset dict whose splits each hold exactly two text
            columns (prompt first, target second). The "train" split must
            contain the columns "prompt_sources" and "prompt_targets".
        tokenizer: Tokenizer used to encode the text. Its pad/bos/eos token ids
            are overwritten with the Llama-2 ids (0/1/2).
        max_seq_length: Truncation length for a single example and row length
            of the packed "train" split. Defaults to 512.

    Returns:
        The tokenized dataset dict. Its "train" split is replaced by a dataset
        with the columns "input_ids", "labels" and "attention_mask", where all
        examples are concatenated and cut into rows of ``max_seq_length``
        tokens; a trailing remainder shorter than one row is dropped.

    Raises:
        ValueError: If ``max_seq_length`` is not positive, or if a batch does
            not have exactly two columns.
    """
    if max_seq_length <= 0:
        raise ValueError("max_seq_length must be a positive integer")

    # Llama-2 vocabulary: id 1 is BOS ("<s>") and id 2 is EOS ("</s>"). Id 0
    # ("<unk>") is reused for padding so that padding stays distinct from EOS.
    tokenizer.pad_token_id = 0
    tokenizer.bos_token_id = 1
    tokenizer.eos_token_id = 2

    def tokenize(prompt, add_eos_token=True):
        results = tokenizer(
            prompt,
            truncation=True,
            max_length=max_seq_length,
            padding=False,
            return_tensors=None,
        )
        if add_eos_token:
            eos_id = tokenizer.eos_token_id
            for ids, mask in zip(results["input_ids"], results["attention_mask"]):
                # Terminate the example with EOS unless it already ends with one
                # or truncation left no room for another token.
                if len(ids) < max_seq_length and (not ids or ids[-1] != eos_id):
                    ids.append(eos_id)
                    mask.append(1)

        # Labels start out as an independent copy of the input ids.
        results["labels"] = copy.deepcopy(results["input_ids"])
        return results

    def preprocess_function(examples):
        keys = list(examples.data.keys())
        if len(keys) != 2:
            raise ValueError("Unsupported dataset format")
        prompt_column, target_column = keys

        # Each training text is the prompt immediately followed by its target.
        texts = [
            source + target
            for source, target in zip(examples[prompt_column], examples[target_column])
        ]

        tokenized = tokenize(texts)
        return {
            "input_ids": tokenized["input_ids"],
            "labels": tokenized["labels"],
            "attention_mask": tokenized["attention_mask"],
        }

    tokenized_datasets = raw_datasets.map(
        preprocess_function,
        batched=True,
        load_from_cache_file=True,
    )

    def concatenate_data(dataset, block_size):
        # Flatten every column into one long token stream, then cut it into
        # rows of exactly block_size tokens; the incomplete tail is discarded.
        packed = {}
        for column in dataset.features:
            stream = [token for sample in dataset[column] for token in sample]
            packed[column] = [
                stream[start : start + block_size]
                for start in range(0, len(stream) - block_size + 1, block_size)
            ]
        return datasets.Dataset.from_dict(packed)

    # Only the token columns are packed; the raw text columns are dropped first.
    train_tokens = tokenized_datasets["train"].remove_columns(["prompt_sources", "prompt_targets"])
    tokenized_datasets["train"] = concatenate_data(train_tokens, max_seq_length)

    return tokenized_datasets

准备训练参数#

这里有些参数是硬编码的，你可以从 config 中传递参数

def prepare_training_args(config: Dict):
    """Build the ``GaudiTrainingArguments`` for a run described by ``config``.

    Reads ``execution_mode``, ``output``, ``batch_size_per_worker``, ``lr`` and
    ``epochs`` from ``config``, plus the optional ``deepspeed`` entry (``None``
    when absent). Every other argument is hard-coded below.
    """
    mode = config["execution_mode"]

    # Values that come from the caller's configuration.
    configurable = dict(
        deepspeed=config.get("deepspeed"),
        output_dir=config["output"],
        per_device_train_batch_size=config["batch_size_per_worker"],
        learning_rate=config["lr"],
        num_train_epochs=config["epochs"],
        # Lazy mode is switched on only for "lazy"; the compile backend is
        # set only for "eager.compile".
        use_lazy_mode=(mode == "lazy"),
        torch_compile_backend=("hpu_backend" if mode == "eager.compile" else None),
    )

    # Settings fixed for this example.
    fixed = dict(
        do_train=True,
        do_eval=False,
        bf16=True,
        save_strategy="no",
        evaluation_strategy="no",
        lr_scheduler_type="cosine",
        use_habana=True,
        pipelining_fwd_bwd=True,
        save_only_model=True,
        gradient_checkpointing=True,
        warmup_ratio=0.03,
        throughput_warmup_steps=3,
        logging_steps=5,
    )

    return GaudiTrainingArguments(**configurable, **fixed)

准备模型#

从 huggingface 下载模型或从本地目录读取模型。
将模型转换为 lora 模型。
将模型移动到 HPU 设备。

如果您不想使用 LoRA 进行微调，只需删除 LoRA 转换步骤即可。

def prepare_model(config: Dict, device):
    """Load the causal LM named in ``config``, wrap it with LoRA and move it to ``device``.

    ``config["model"]`` is the model name or path, ``config["model_config"]``
    holds extra keyword arguments for ``from_pretrained`` (its ``torch_dtype``
    is also the dtype the final model is cast to) and ``config["lora_config"]``
    holds the keyword arguments for ``peft.LoraConfig``. When a non-``None``
    ``config["deepspeed"]`` entry is present, the model is loaded with an
    explicit ``AutoConfig`` (``use_cache=False``) and two generation-config
    flags are switched on.

    Returns:
        The PEFT-wrapped model.
    """
    with_deepspeed = config.get("deepspeed") is not None
    model_id = config["model"]
    load_kwargs = config["model_config"]

    if not with_deepspeed:
        model = transformers.AutoModelForCausalLM.from_pretrained(model_id, **load_kwargs)
    else:
        auto_config = transformers.AutoConfig.from_pretrained(
            model_id,
            use_cache=False,
            revision="main",
            use_auth_token=None,
            trust_remote_code=None,
        )
        model = transformers.AutoModelForCausalLM.from_pretrained(
            model_id, config=auto_config, **load_kwargs
        )
        generation = model.generation_config
        generation.attn_softmax_bf16 = True
        generation.use_flash_attention = True
    model.enable_input_require_grads()

    # Wrap the base model for LoRA training.
    lora_settings = peft.LoraConfig(**config["lora_config"])
    lora_model = peft.get_peft_model(model, lora_settings)

    lora_model.to(dtype=load_kwargs["torch_dtype"], device=device)

    return lora_model

训练函数#

此函数将在训练期间由每个 worker 执行，步骤如下：

准备训练参数，即 GaudiTrainingArguments 的一个实例。
加载数据集并预处理数据集，只加载前 4096 项作为训练数据集。
加载预训练模型对应的 tokenizer，并用它对数据集进行 tokenize。
加载预训练模型。
准备 data collator 和 gaudi_config。
准备 GaudiTrainer 的实例。
调用 train() 训练模型。
保存模型结果。

与 GPU 的训练函数相比，移植到 HPU 无需更改。 Ray Train 在内部执行以下操作：

检测 HPU 并设置设备。
初始化 habana PyTorch 后端。
初始化 habana 分布式后端。

def train_func_per_worker(config: Dict):
    """Run one fine-tuning job on the calling worker and return the training result.

    Steps: patch transformers for Gaudi, build the training arguments, load and
    preprocess the first 4096 rows of "tatsu-lab/alpaca", tokenize them, build
    the LoRA model, train with ``GaudiTrainer`` and save the model.
    """
    # Patch transformers for Gaudi before anything else is constructed.
    adapt_transformers_to_gaudi()

    training_args = prepare_training_args(config)

    # Dataset: a slice of "tatsu-lab/alpaca" from the Hugging Face hub.
    alpaca_slice = datasets.load_dataset("tatsu-lab/alpaca", split='train[0:4096]')
    raw_datasets = datasets.DatasetDict({"train": alpaca_slice})
    preprocess_dataset(raw_datasets)  # the splits are rewritten in place

    # Tokenizer and tokenized data.
    tokenizer = transformers.AutoTokenizer.from_pretrained(config["model"])
    tokenized_datasets = preprocess_dataset_to_tokenizer(raw_datasets, tokenizer)

    # Model, placed on the device chosen by the training arguments.
    model = prepare_model(config, training_args.device)

    # Collator for causal language modeling (mlm disabled).
    data_collator = DataCollatorForLanguageModeling(
        tokenizer,
        pad_to_multiple_of=8,
        return_tensors="pt",
        mlm=False,
    )

    # Gaudi configuration with the fused Adam / fused clip-norm flags enabled.
    gaudi_config = GaudiConfig()
    gaudi_config.use_fused_adam = True
    gaudi_config.use_fused_clip_norm = True

    # Training only: no evaluation dataset and no metrics.
    trainer_kwargs = {
        "model": model,
        "gaudi_config": gaudi_config,
        "args": training_args,
        "train_dataset": tokenized_datasets["train"],
        "eval_dataset": None,
        "tokenizer": tokenizer,
        "data_collator": data_collator,
        "compute_metrics": None,
        "preprocess_logits_for_metrics": None,
    }
    trainer = GaudiTrainer(**trainer_kwargs)

    train_result = trainer.train()
    print(f"train_result = {train_result}")
    trainer.save_model()

    return train_result

主训练函数#

train_llama 函数使用 Ray 设置分布式训练环境并启动训练过程。要使用 HPU 进行训练，我们只需进行以下更改：

设置训练的执行模式，支持的执行模式包括
- “lazy”：延迟执行图模式。脚本中的 op 被逐个下发（与 Eager 模式类似），累积成图后延迟执行，在 Gaudi 上兼顾 Eager 模式的使用体验与性能。与使用 torch.compile 的 Eager 模式不同，图在每次迭代中都会被重新分析，因此 CPU 使用率较高。
- “eager”：按照标准 PyTorch Eager 模式脚本中定义的方式逐个 op 执行。
- “eager.compile”：使用 torch.compile 扩展的 Eager 模式——与 Eager 模式类似，但可以将模型的全部或部分（例如某个函数）包装成图；未被包装的部分仍以 eager 方式执行。
更多详细理论可以在这里找到，详细的性能结果可以在这里找到。
设置训练方法，支持的方法包括
- “ddp”
- “deepspeed”
在 ScalingConfig 中为每个 worker 分配一个 HPU
在 TorchConfig 中将 backend 设置为 hccl

def train_llama(num_workers, execution_mode, training_method, model_name_or_path=None):
    """Launch distributed LoRA fine-tuning on HPUs with Ray Train.

    Args:
        num_workers: Number of Ray Train workers; each worker requests one CPU
            and one HPU.
        execution_mode: Passed to the workers as ``config["execution_mode"]``.
        training_method: ``"deepspeed"`` attaches the DeepSpeed configuration
            defined below (ZeRO stage 3, bf16); any other value leaves the
            ``deepspeed`` entry as ``None``.
        model_name_or_path: Value passed to the workers as ``config["model"]``.
            When ``None`` (the default), the local Llama-2-70b-chat-hf snapshot
            path used by this example is taken.

    The fit result is printed; nothing is returned.
    """
    import ray
    from ray.train import ScalingConfig
    from ray.train.torch import TorchTrainer, TorchConfig

    if model_name_or_path is None:
        model_name_or_path = (
            "/root/models/models--meta-llama--Llama-2-70b-chat-hf/snapshots/e9149a12809580e8602995856f8098ce973d1080/"
        )

    # DeepSpeed configuration; it could equally live in a config file.
    deepspeed_config = {
        "steps_per_print": 64,
        "train_batch_size": "auto",
        "train_micro_batch_size_per_gpu": "auto",
        "gradient_accumulation_steps": "auto",
        "bf16": {
            "enabled": True
        },
        "gradient_clipping": 1.0,
        "zero_optimization": {
            "stage": 3,
            "overlap_comm": False,
            "contiguous_gradients": False,
            "stage3_gather_16bit_weights_on_model_save": True
        }
    }

    # Configuration handed to every worker's training function.
    train_config = {
        "execution_mode": execution_mode,
        "model": model_name_or_path,
        "model_config": {"torch_dtype": torch.bfloat16, "trust_remote_code": False, "use_auth_token": None},
        "lora_config": {"task_type": "CAUSAL_LM", "r": 8, "lora_alpha": 32, "lora_dropout": 0.1, "target_modules": ["q_proj", "v_proj"]},
        "lr": 1e-4,
        "epochs": 2,
        "batch_size_per_worker": 8,
        "output": "/tmp/ray/",
        "deepspeed": deepspeed_config if training_method == "deepspeed" else None,
    }

    # Each worker requires one HPU.
    scaling_config = ScalingConfig(num_workers=num_workers, resources_per_worker={"CPU": 1, "HPU": 1})
    # Use the hccl backend for torch distributed.
    torch_config = TorchConfig(backend="hccl")

    # Start (or reuse) the Ray runtime. ignore_reinit_error lets this function
    # be called again in the same process, e.g. when a notebook cell is re-run.
    ray.init(ignore_reinit_error=True)

    trainer = TorchTrainer(
        train_loop_per_worker=train_func_per_worker,
        train_loop_config=train_config,
        torch_config=torch_config,
        scaling_config=scaling_config,
    )

    result = trainer.fit()
    print(f"Training result: {result}")

开始训练#

最后，我们调用 train_llama 函数来启动训练过程。您可以调整使用的 worker 数量以及 HPU 的执行模式。

# Environment variables for the run.
os.environ["RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES"] = "0"
# When RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES is used,
# HABANA_VISIBLE_DEVICES must be set as well, for example:
# os.environ["HABANA_VISIBLE_DEVICES"] = "0,1,2,3"

# Supported execution modes: "lazy", "eager", "eager.compile".
execution_mode = "lazy"
# PT_HPU_LAZY_MODE is "1" for lazy mode and "0" for both eager variants.
os.environ["PT_HPU_LAZY_MODE"] = str(int(execution_mode == "lazy"))

# Supported training methods: "ddp", "deepspeed".
training_method = "deepspeed"
if training_method == "deepspeed":
    # Extra variables set only for DeepSpeed runs.
    os.environ.update(
        {
            "PT_HPU_MAX_COMPOUND_OP_SIZE": "10",
            "DEEPSPEED_HPU_ZERO3_SYNC_MARK_STEP_REQUIRED": "1",
        }
    )

# Train on 4 HPUs (one worker per HPU).
train_llama(
    num_workers=4,
    execution_mode=execution_mode,
    training_method=training_method,
)

最终输出#

适用于 HPU 上的 DDP#

Llama-2-70b-chat-hf
4 HPU
LoRA

(RayTrainWorker pid=123181) {'loss': 1.8051, 'grad_norm': 0.6015625, 'learning_rate': 9.938441702975689e-05, 'epoch': 0.16, 'memory_allocated (GB)': 13.64, 'max_memory_allocated (GB)': 48.92, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=123181) {'loss': 1.6754, 'grad_norm': 0.408203125, 'learning_rate': 9.567727288213005e-05, 'epoch': 0.32, 'memory_allocated (GB)': 13.64, 'max_memory_allocated (GB)': 48.92, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=123181) {'loss': 1.568, 'grad_norm': 0.4453125, 'learning_rate': 8.885729807284856e-05, 'epoch': 0.48, 'memory_allocated (GB)': 13.64, 'max_memory_allocated (GB)': 48.92, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=123181) {'loss': 1.4934, 'grad_norm': 0.4609375, 'learning_rate': 7.938926261462366e-05, 'epoch': 0.65, 'memory_allocated (GB)': 13.64, 'max_memory_allocated (GB)': 48.92, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=123181) {'loss': 1.3965, 'grad_norm': 0.3515625, 'learning_rate': 6.7918397477265e-05, 'epoch': 0.81, 'memory_allocated (GB)': 13.64, 'max_memory_allocated (GB)': 48.92, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=123181) {'loss': 1.3461, 'grad_norm': 0.34765625, 'learning_rate': 5.522642316338268e-05, 'epoch': 0.97, 'memory_allocated (GB)': 13.64, 'max_memory_allocated (GB)': 48.92, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=123181) {'loss': 1.2924, 'grad_norm': 0.32421875, 'learning_rate': 4.2178276747988446e-05, 'epoch': 1.13, 'memory_allocated (GB)': 13.64, 'max_memory_allocated (GB)': 48.92, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=123181) {'loss': 1.2643, 'grad_norm': 0.33203125, 'learning_rate': 2.9663167846209998e-05, 'epoch': 1.29, 'memory_allocated (GB)': 13.64, 'max_memory_allocated (GB)': 48.92, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=123181) {'loss': 1.263, 'grad_norm': 0.318359375, 'learning_rate': 1.8533980447508137e-05, 'epoch': 1.45, 'memory_allocated (GB)': 13.64, 'max_memory_allocated (GB)': 48.92, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=123181) {'loss': 1.2502, 'grad_norm': 0.275390625, 'learning_rate': 9.549150281252633e-06, 'epoch': 1.61, 'memory_allocated (GB)': 13.64, 'max_memory_allocated (GB)': 48.92, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=123181) {'loss': 1.2161, 'grad_norm': 0.2734375, 'learning_rate': 3.3209786751399187e-06, 'epoch': 1.77, 'memory_allocated (GB)': 13.64, 'max_memory_allocated (GB)': 48.92, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=123181) {'loss': 1.2517, 'grad_norm': 0.294921875, 'learning_rate': 2.7390523158633554e-07, 'epoch': 1.94, 'memory_allocated (GB)': 13.64, 'max_memory_allocated (GB)': 48.92, 'total_memory_available (GB)': 94.62}

适用于 HPU 上的 DeepSpeed#

Llama-2-70b-chat-hf
4 HPU
LoRA

(RayTrainWorker pid=50067) {'loss': 1.662, 'grad_norm': 0.36514782905578613, 'learning_rate': 9.938441702975689e-05, 'epoch': 0.16, 'memory_allocated (GB)': 32.86, 'max_memory_allocated (GB)': 94.46, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=50067) {'loss': 1.6047, 'grad_norm': 0.396455317735672, 'learning_rate': 9.567727288213005e-05, 'epoch': 0.32, 'memory_allocated (GB)': 32.86, 'max_memory_allocated (GB)': 94.57, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=50067) {'loss': 1.4974, 'grad_norm': 0.49250370264053345, 'learning_rate': 8.885729807284856e-05, 'epoch': 0.48, 'memory_allocated (GB)': 32.86, 'max_memory_allocated (GB)': 94.57, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=50067) {'loss': 1.4078, 'grad_norm': 0.49840453267097473, 'learning_rate': 7.938926261462366e-05, 'epoch': 0.65, 'memory_allocated (GB)': 32.86, 'max_memory_allocated (GB)': 94.57, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=50067) {'loss': 1.315, 'grad_norm': 0.3432576656341553, 'learning_rate': 6.7918397477265e-05, 'epoch': 0.81, 'memory_allocated (GB)': 32.86, 'max_memory_allocated (GB)': 94.59, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=50067) {'loss': 1.2651, 'grad_norm': 0.32175061106681824, 'learning_rate': 5.522642316338268e-05, 'epoch': 0.97, 'memory_allocated (GB)': 32.86, 'max_memory_allocated (GB)': 94.59, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=50067) {'loss': 1.1947, 'grad_norm': 0.3646097481250763, 'learning_rate': 4.2178276747988446e-05, 'epoch': 1.13, 'memory_allocated (GB)': 32.86, 'max_memory_allocated (GB)': 94.59, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=50067) {'loss': 1.1534, 'grad_norm': 0.4598522186279297, 'learning_rate': 2.9663167846209998e-05, 'epoch': 1.29, 'memory_allocated (GB)': 32.86, 'max_memory_allocated (GB)': 94.59, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=50067) {'loss': 1.1404, 'grad_norm': 0.2677183449268341, 'learning_rate': 1.8533980447508137e-05, 'epoch': 1.45, 'memory_allocated (GB)': 32.75, 'max_memory_allocated (GB)': 94.59, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=50067) {'loss': 1.1283, 'grad_norm': 0.32087600231170654, 'learning_rate': 9.549150281252633e-06, 'epoch': 1.61, 'memory_allocated (GB)': 32.86, 'max_memory_allocated (GB)': 94.59, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=50067) {'loss': 1.0877, 'grad_norm': 0.28305548429489136, 'learning_rate': 3.3209786751399187e-06, 'epoch': 1.77, 'memory_allocated (GB)': 32.86, 'max_memory_allocated (GB)': 94.59, 'total_memory_available (GB)': 94.62}
(RayTrainWorker pid=50067) {'loss': 1.1238, 'grad_norm': 0.25713953375816345, 'learning_rate': 2.7390523158633554e-07, 'epoch': 1.94, 'memory_allocated (GB)': 32.86, 'max_memory_allocated (GB)': 94.59, 'total_memory_available (GB)': 94.62}