Source code for lmflow.args

#!/usr/bin/env python
# coding=utf-8
"""This script defines dataclasses: ModelArguments and DatasetArguments,
that contain the arguments for the model and dataset used in training.

It imports several modules, including dataclasses, field from typing, Optional from typing,
require_version from transformers.utils.versions, MODEL_FOR_CAUSAL_LM_MAPPING,
and TrainingArguments from transformers.

MODEL_CONFIG_CLASSES is assigned a list of the model config classes from
MODEL_FOR_CAUSAL_LM_MAPPING. MODEL_TYPES is assigned a tuple of the model types
extracted from the MODEL_CONFIG_CLASSES.
"""
import logging
from dataclasses import dataclass, field, fields, Field, make_dataclass
from pathlib import Path
from typing import Optional, List, Union, Dict

from transformers import (
    MODEL_FOR_CAUSAL_LM_MAPPING,
    TrainingArguments,
)
from transformers.utils.versions import require_version

from lmflow.utils.versioning import is_flash_attn_available

MODEL_CONFIG_CLASSES = list(MODEL_FOR_CAUSAL_LM_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)

logger = logging.getLogger(__name__)

class OptimizerNames:
    DUMMY = "dummy"
    ADABELIEF = "adabelief"
    ADABOUND = "adabound"
    LARS = "lars"
    LAMB = "lamb"
    ADAMAX = "adamax"
    NADAM = "nadam"
    RADAM = "radam"
    ADAMP = "adamp"
    SGDP = "sgdp"
    YOGI = "yogi"
    SOPHIA = "sophia"
    ADAN = "adan"
    ADAM = "adam"
    NOVOGRAD = "novograd"
    ADADELTA = "adadelta"
    ADAGRAD = "adagrad"
    ADAMW_SCHEDULE_FREE = "adamw_schedule_free"
    SGD_SCHEDULE_FREE = "sgd_schedule_free"
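
# Illustrative sketch (added for exposition; not part of lmflow): OptimizerNames
# is a plain namespace of string constants, so a customized optimizer name such
# as FinetunerArguments.customized_optim can be validated against it like this:
def _example_is_known_optimizer(name: str) -> bool:
    """Return True if `name` matches one of the OptimizerNames constants."""
    known_names = {
        value for attr, value in vars(OptimizerNames).items()
        if not attr.startswith("_")
    }
    return name in known_names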

@dataclass
class ModelArguments:
    """
    Define a class ModelArguments using the dataclass decorator. The class contains
    several optional parameters that can be used to configure a model.

    model_name_or_path : str
        a string representing the path or name of a pretrained model checkpoint for
        weights initialization. If None, a model will be trained from scratch.
    model_type : str
        a string representing the type of model to use if training from scratch. If
        not provided, a pretrained model will be used.
    config_overrides : str
        a string representing the default config settings to override when training
        a model from scratch.
    config_name : str
        a string representing the name or path of the pretrained config to use, if
        different from the model_name_or_path.
    tokenizer_name : str
        a string representing the name or path of the pretrained tokenizer to use,
        if different from the model_name_or_path.
    cache_dir : str
        a string representing the path to the directory where pretrained models
        downloaded from huggingface.co will be stored.
    use_fast_tokenizer : bool
        a boolean indicating whether to use a fast tokenizer (backed by the
        tokenizers library) or not.
    model_revision : str
        a string representing the specific model version to use (can be a branch
        name, tag name, or commit id).
    token : Optional[str]
        necessary when accessing a private model/dataset.
    torch_dtype : str
        a string representing the dtype to load the model under. If `auto` is
        passed, the dtype will be automatically derived from the model's weights.
    use_ram_optimized_load : bool
        a boolean indicating whether to use disk mapping when memory is not enough.
    use_int8 : bool
        a boolean indicating whether to load int8 quantization for inference.
    load_in_4bit : bool
        whether to load the model in 4bit.
    model_max_length : int
        the maximum length of the model.
    truncation_side : str
        the side on which the model should have truncation applied.
    arch_type : str
        model architecture type.
    padding_side : str
        the side on which the tokenizer should have padding applied.
    eos_padding : bool
        whether to pad with the eos token instead of the pad token.
    ignore_bias_buffers : bool
        fix for DDP issues with LM bias/mask buffers - invalid scalar type, inplace
        operation.
    """
    model_name_or_path: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "The model checkpoint for weights initialization. Don't set if you "
                "want to train a model from scratch."
            )
        },
    )
    lora_model_path: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "The incremental model diff introduced by LoRA finetuning."
                " Along with the original non-finetuned model forms the whole"
                " finetuned model."
            )
        },
    )
    model_type: Optional[str] = field(
        default=None,
        metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)},
    )
    config_overrides: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "Override some existing default config settings when a model is trained from scratch. Example: "
                "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index"
            )
        },
    )
    arch_type: Optional[str] = field(
        default="decoder_only",
        metadata={
            "help": "Model architecture type.",
            "choices": ["decoder_only", "encoder_decoder", "text_regression", "vision_encoder_decoder"],
        },
    )
    config_name: Optional[str] = field(
        default=None,
        metadata={"help": "Pretrained config name or path if not the same as model_name"},
    )
    tokenizer_name: Optional[str] = field(
        default=None,
        metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"},
    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
    )
    use_fast_tokenizer: bool = field(
        default=True,
        metadata={"help": "Whether to use one of the fast tokenizers (backed by the tokenizers library) or not."},
    )
    model_revision: str = field(
        default="main",
        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
    )
    token: Optional[str] = field(
        default=None,
        metadata={"help": "Necessary to specify when accessing a private model/dataset."},
    )
    trust_remote_code: bool = field(
        default=False,
        metadata={"help": "Whether to trust remote code when loading the model."},
    )
    torch_dtype: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "Override the default `torch.dtype` and load the model under this dtype. If `auto` is passed, the "
                "dtype will be automatically derived from the model's weights."
            ),
            "choices": ["auto", "bfloat16", "float16", "float32"],
        },
    )
    use_lora: bool = field(
        default=False,
        metadata={"help": "Whether to use LoRA."},
    )
    use_qlora: bool = field(
        default=False,
        metadata={"help": "Whether to use QLoRA."},
    )
    bits: int = field(
        default=4,
        metadata={
            "help": "The number of bits for quantization.",
            "choices": [4, 8],
        },
    )
    quant_type: str = field(
        default='nf4',
        metadata={
            "help": "The quantization type for quantization.",
            "choices": ["nf4", "fp4"],
        },
    )
    double_quant: bool = field(
        default=True,
        metadata={"help": "Whether to use double quantization."},
    )
    lora_r: int = field(
        default=8,
        metadata={"help": "The rank of the LoRA parameters. The smaller lora_r is, the fewer parameters LoRA has."},
    )
    lora_alpha: int = field(
        default=32,
        metadata={
            "help": (
                "Merging ratio between the fine-tuned model and the original. "
                "This is controlled by a parameter called alpha in the paper."
            )
        },
    )
    lora_target_modules: List[str] = field(
        default=None,
        metadata={"help": "The names of the modules to apply LoRA to."},
    )
    lora_dropout: float = field(
        default=0.1,
        metadata={"help": "The dropout rate in LoRA linear layers."},
    )
    save_aggregated_lora: bool = field(
        default=False,
        metadata={"help": "Whether to save the aggregated LoRA model."},
    )
    use_ram_optimized_load: bool = field(
        default=True,
        metadata={"help": "Whether to use disk mapping when memory is not enough."},
    )
    use_flash_attention: bool = field(
        default=False,
        metadata={
            "help": (
                "Whether to use the flash attention layer to reduce GPU memory, at"
                " a higher time cost."
            )
        },
    )
    truncate_to_model_max_length: bool = field(
        default=True,
        metadata={"help": "Whether to truncate the dataset to the model max length."},
    )
    do_rope_scaling: bool = field(
        default=False,
        metadata={
            "help": (
                "Whether to do RoPE scaling for llama models. "
                "Linear scaling credits to the Reddit user /u/kaiokendev: "
                "https://arxiv.org/abs/2306.15595 "
                "NTK scaling credits to the Reddit users /u/bloc97 and /u/emozilla: "
                "https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/"
            )
        },
    )
    rope_pi_ratio: int = field(
        default=1,
        metadata={"help": "The ratio of pi in RoPE scaling."},
    )
    rope_ntk_ratio: int = field(
        default=1,
        metadata={"help": "The ratio of NTK in RoPE scaling."},
    )
    use_int8: bool = field(
        default=False,
        metadata={"help": "Whether to load int8 quantization for inference."},
    )
    load_in_4bit: Optional[bool] = field(
        default=True,
        metadata={"help": "Whether to load the model in 4bit."},
    )
    model_max_length: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "The maximum length of the model. When not specified, "
                "will follow the model's default max length (i.e., tokenizer.model_max_length)."
            )
        },
    )
    truncation_side: str = field(
        default=None,
        metadata={
            "help": (
                "The side on which the tokenizer should have truncation applied. "
                "When not specified, will follow the tokenizer's default truncation strategy "
                "(i.e., tokenizer.truncation_side)."
            ),
            "choices": [None, "left", "right"],
        },
    )
    padding_side: str = field(
        default='right',
        metadata={
            "help": (
                "The side on which the tokenizer should have padding applied. "
                "LMFlow uses right padding by default. When set to `auto`, will "
                "use padding_side from tokenizer.padding_side."
            ),
            "choices": ["right", "left", "auto"],
        },
    )
    eos_padding: Optional[bool] = field(
        default=False,
        metadata={"help": "Whether to pad with the eos token instead of the pad token."},
    )
    ignore_bias_buffers: Optional[bool] = field(
        default=False,
        metadata={
            # debug argument for distributed training
            "help": (
                "Fix for DDP issues with LM bias/mask buffers - invalid scalar type, inplace operation. See "
                "https://github.com/huggingface/transformers/issues/22482#issuecomment-1595790992"
            )
        },
    )

    def __post_init__(self):
        if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None):
            raise ValueError(
                "--config_overrides can't be used in combination with --config_name or --model_name_or_path"
            )

        if self.use_qlora:
            if not self.use_lora:
                logger.warning("use_qlora is set to True, but use_lora is not set to True. Setting use_lora to True.")
                self.use_lora = True

        if self.use_flash_attention:
            if not is_flash_attn_available():
                self.use_flash_attention = False
                logger.warning("Flash attention is not available in the current environment. Disabling flash attention.")
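
# Illustrative usage sketch (not part of this module): like other transformers
# argument dataclasses, ModelArguments is designed to be parsed with
# transformers.HfArgumentParser, e.g.:
#
#     from transformers import HfArgumentParser
#     parser = HfArgumentParser((ModelArguments, DatasetArguments))
#     model_args, data_args = parser.parse_args_into_dataclasses(
#         args=["--model_name_or_path", "gpt2", "--dataset_path", "data/alpaca"]
#     )
#
# The "--dataset_path" value here is a hypothetical example path.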

@dataclass
class VisModelArguments(ModelArguments):
    low_resource: Optional[bool] = field(
        default=False,
        metadata={"help": "Use 8 bit and float16 when loading the llm."},
    )
    custom_model: bool = field(
        default=False,
        metadata={"help": "Flag for whether the model is from huggingface or not."},
    )
    pretrained_language_projection_path: str = field(
        default=None,
        metadata={"help": "Path to the pretrained language projection layer."},
    )
    custom_vision_model: bool = field(
        default=False,
        metadata={"help": "Flag for whether the vision model is from huggingface or not."},
    )
    image_encoder_name_or_path: Optional[str] = field(
        default=None,
        metadata={"help": "The name or path of the image encoder to use."},
    )
    qformer_name_or_path: Optional[str] = field(
        default=None,
        metadata={"help": "The name or path of the Q-Former in the multi-modality model."},
    )
    llm_model_name_or_path: Optional[str] = field(
        default=None,
        metadata={"help": "The llm model in the multi-modality model."},
    )
    use_prompt_cache: bool = field(
        default=False,
        metadata={"help": "Whether to use prompt cache."},
    )
    prompt_cache_path: Optional[str] = field(
        default=None,
        metadata={"help": "Path to the prompt cache."},
    )
    llava_loading: Optional[bool] = field(
        default=False,
        metadata={"help": "Whether to load module by module from the pretrained model."},
    )
    with_qformer: Optional[bool] = field(
        default=False,
        metadata={"help": "Whether to use the Q-Former."},
    )
    vision_select_layer: Optional[int] = field(
        default=-2,
        metadata={"help": "Which layer to select in the vision model."},
    )
    llava_pretrain_model_path: Optional[str] = field(
        default=None,
        metadata={"help": "Path to the llava pretrained model."},
    )
    save_pretrain_model_path: Optional[str] = field(
        default=None,
        metadata={"help": "Path to save the pretrained model."},
    )

@dataclass
class DatasetArguments:
    """
    Define a class DatasetArguments using the dataclass decorator. The class contains
    several optional parameters that can be used to configure a dataset for a language model.

    dataset_path : str
        a string representing the path of the dataset to use.
    dataset_name : str
        a string representing the name of the dataset to use. The default value is
        "customized".
    is_custom_dataset : bool
        a boolean indicating whether to use custom data. The default value is False.
    customized_cache_dir : str
        a string representing the path to the directory where customized dataset caches
        will be stored.
    dataset_config_name : str
        a string representing the configuration name of the dataset to use (via the
        datasets library).
    train_file : str
        a string representing the path to the input training data file (a text file).
    validation_file : str
        a string representing the path to the input evaluation data file to evaluate the
        perplexity on (a text file).
    max_train_samples : int
        an integer indicating the maximum number of training examples to use for debugging
        or quicker training. If set, the training dataset will be truncated to this number.
    max_eval_samples : int
        an integer indicating the maximum number of evaluation examples to use for
        debugging or quicker training. If set, the evaluation dataset will be truncated to
        this number.
    streaming : bool
        a boolean indicating whether to enable streaming mode.
    block_size : int
        an integer indicating the optional input sequence length after tokenization. The
        training dataset will be truncated in blocks of this size for training.
    train_on_prompt : bool
        a boolean indicating whether to train on the prompt for conversation datasets such
        as ShareGPT.
    conversation_template : str
        a string representing the template for conversation datasets.

    The class also includes some additional parameters that can be used to configure the
    dataset further, such as `overwrite_cache`, `validation_split_percentage`,
    `preprocessing_num_workers`, `disable_group_texts`, `demo_example_in_prompt`,
    `explanation_in_prompt`, `keep_linebreaks`, and `prompt_structure`.

    The field function is used to set default values and provide help messages for each
    parameter. The Optional type hint is used to indicate that a parameter is optional.
    The metadata argument is used to provide additional information about each parameter,
    such as a help message.
    """
    dataset_path: Optional[str] = field(
        default=None,
        metadata={"help": "The path of the dataset to use."},
    )
    dataset_name: Optional[str] = field(
        default="customized",
        metadata={"help": "Should be \"customized\""},
    )
    is_custom_dataset: Optional[bool] = field(
        default=False,
        metadata={"help": "Whether to use custom data."},
    )
    customized_cache_dir: Optional[str] = field(
        default=".cache/llm-ft/datasets",
        metadata={"help": "Where do you want to store the customized dataset caches"},
    )
    dataset_config_name: Optional[str] = field(
        default=None,
        metadata={"help": "The configuration name of the dataset to use (via the datasets library)."},
    )
    train_file: Optional[str] = field(
        default=None,
        metadata={"help": "The input training data file (a text file)."},
    )
    validation_file: Optional[str] = field(
        default=None,
        metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."},
    )
    max_train_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of training examples to this "
                "value if set."
            )
        },
    )
    max_eval_samples: Optional[int] = field(
        default=1e10,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
                "value if set."
            )
        },
    )
    streaming: bool = field(default=False, metadata={"help": "Enable streaming mode"})
    block_size: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "Optional input sequence length after tokenization. "
                "The training dataset will be truncated in blocks of this size for training. "
                "Defaults to the model max input length for single sentence inputs (take into account special tokens)."
            )
        },
    )
    overwrite_cache: bool = field(
        default=False,
        metadata={"help": "Overwrite the cached training and evaluation sets"},
    )
    validation_split_percentage: Optional[int] = field(
        default=5,
        metadata={"help": "The percentage of the train set used as validation set in case there's no validation split"},
    )
    preprocessing_num_workers: Optional[int] = field(
        default=None,
        metadata={"help": "The number of processes to use for the preprocessing."},
    )
    group_texts_batch_size: int = field(
        default=1000,
        metadata={
            "help": (
                "Number of samples that will be grouped together to go through"
                " the `group_texts` operation. See `--disable_group_texts` for"
                " a detailed explanation of this operation."
            )
        },
    )
    disable_group_texts: bool = field(
        default=True,
        metadata={
            "help": (
                "Whether to disable grouping of original samples together to"
                " generate sample sequences of length `block_size`."
                " By default it is True, which means long samples"
                " are truncated to `block_size` tokens"
                " and short samples are padded to `block_size` tokens."
                " If set to False, we group every 1000 tokenized"
                " sequences together, divide them into"
                " [{total_num_tokens} / {block_size}] sequences,"
                " each with `block_size` tokens"
                " (the remaining tokens are omitted)."
                " This group-text behavior is useful"
                " for continual pretraining or pretraining."
            )
        },
    )
    # An illustrative sketch of the `group_texts` operation appears after this class.
    keep_linebreaks: bool = field(
        default=True,
        metadata={"help": "Whether to keep line breaks when using TXT files or not."},
    )
    test_file: Optional[str] = field(
        default=None,
        metadata={"help": "Evaluation file path."},
    )
    train_on_prompt: bool = field(
        default=False,
        metadata={"help": "Whether to train on the prompt for conversation datasets such as ShareGPT."},
    )
    conversation_template: Optional[str] = field(
        default=None,
        metadata={"help": "The template for conversation datasets."},
    )

    def __post_init__(self):
        if self.streaming:
            require_version("datasets>=2.0.0", "The streaming feature requires `datasets>=2.0.0`")

        if self.dataset_name is None and self.train_file is None and self.validation_file is None:
            raise ValueError("Need either a dataset name or a training/validation file.")
        else:
            if self.train_file is not None:
                extension = self.train_file.split(".")[-1]
                assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file."
            if self.validation_file is not None:
                extension = self.validation_file.split(".")[-1]
                assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file."
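
# Illustrative sketch (added for exposition; not part of lmflow): the
# `group_texts` operation referenced by `group_texts_batch_size` and
# `disable_group_texts` concatenates a batch of tokenized samples and re-chunks
# the result into `block_size` pieces, roughly as follows:
def _example_group_texts(tokenized_samples: List[List[int]], block_size: int) -> List[List[int]]:
    """Concatenate token-id lists, then split the result into `block_size` chunks."""
    concatenated = [token for sample in tokenized_samples for token in sample]
    # The remainder shorter than `block_size` is omitted, as the help text notes.
    total_length = (len(concatenated) // block_size) * block_size
    return [concatenated[i:i + block_size] for i in range(0, total_length, block_size)]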

@dataclass
class MultiModalDatasetArguments(DatasetArguments):
    image_folder: Optional[str] = field(
        default=None,
        metadata={"help": "The folder of the image files."},
    )
    image_aspect_ratio: Optional[str] = field(
        default="pad",
        metadata={"help": "The image aspect ratio handling type."},
    )
    is_multimodal: Optional[bool] = field(
        default=True,
        metadata={"help": "Flag for the modality type."},
    )
    use_image_start_end: Optional[bool] = field(
        default=True,
        metadata={"help": "Whether to add image start/end tokens."},
    )
    sep_style: Optional[str] = field(
        default="plain",
        metadata={"help": "Separator style in the multi-modality dataset."},
    )

@dataclass
class FinetunerArguments(TrainingArguments):
    """
    Adapt transformers.TrainingArguments
    """
    eval_dataset_path: Optional[str] = field(
        default=None,
        metadata={"help": "The path of the eval dataset to use."},
    )
    remove_unused_columns: Optional[bool] = field(
        default=False,
        metadata={"help": "Whether to remove the unused columns in the collate fn."},
    )
    finetune_part: Optional[str] = field(
        default="language_projection",
        metadata={"help": "The module to finetune."},
    )
    save_language_projection: Optional[str] = field(
        default=False,
        metadata={"help": "Whether to save the language projection layer in multi-modal models."},
    )
    use_lisa: bool = field(
        default=False,
        metadata={"help": "Whether to use the LISA training strategy."},
    )
    lisa_activated_layers: int = field(
        default=2,
        metadata={"help": "The number of activated layers in LISA."},
    )
    lisa_interval_steps: int = field(
        default=20,
        metadata={
            "help": (
                "The number of steps in each freezing interval of LISA, i.e. the selected"
                " unfrozen layers are randomly switched every {lisa_interval_steps} steps."
            )
        },
    )
    lisa_layers_attribute: str = field(
        default="model.model.layers",
        metadata={"help": "Where the layer attribute is stored, e.g. model.model.layers"},
    )
    # An illustrative sketch of the LISA layer switching appears after this class.
    use_customized_optim: bool = field(
        default=False,
        metadata={"help": "Whether to use customized optimizers."},
    )
    customized_optim: str = field(
        default="sign_sgd",
        metadata={"help": "Name of the customized optimizer."},
    )
    customized_optim_args: str = field(
        default=None,
        metadata={"help": "Optional arguments that are supplied to the customized optimizer."},
    )
    optim_dummy_beta1: float = field(
        default=0.9,
        metadata={"help": "A useless argument for the dummy optimizer, just for tutorial purposes."},
    )
    optim_dummy_beta2: float = field(
        default=0.999,
        metadata={"help": "A useless argument for the dummy optimizer, just for tutorial purposes."},
    )
    optim_adam_beta1: float = field(
        default=0.9,
        metadata={"help": "Coefficient used for computing running averages of the gradient."},
    )
    optim_adam_beta2: float = field(
        default=0.999,
        metadata={"help": "Coefficient used for computing running averages of the squared gradient."},
    )
    optim_beta1: float = field(
        default=0.9,
        metadata={"help": "Coefficient used for computing running averages of the gradient."},
    )
    optim_beta2: float = field(
        default=0.999,
        metadata={"help": "Coefficient used for computing running averages of the squared gradient."},
    )
    optim_beta3: float = field(
        default=0.9,
        metadata={"help": "Coefficient used for computing running averages of the gradient."},
    )
    optim_momentum: float = field(
        default=0.999,
        metadata={"help": "Coefficient used for the momentum term in optimizers like SGD with momentum."},
    )
    optim_weight_decay: float = field(
        default=0,
        metadata={"help": "Weight decay (L2 penalty) added to the loss to prevent overfitting."},
    )
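
# Illustrative sketch (added for exposition; not part of lmflow): the LISA
# strategy described by `lisa_activated_layers` and `lisa_interval_steps`
# keeps only a small random subset of layers trainable and re-draws that
# subset every `lisa_interval_steps` optimizer steps, roughly like this:
def _example_lisa_pick_active_layers(num_layers: int, lisa_activated_layers: int) -> List[int]:
    """Randomly choose which layer indices stay unfrozen for the next interval."""
    import random  # local import to keep this sketch self-contained

    return sorted(random.sample(range(num_layers), k=min(lisa_activated_layers, num_layers)))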

@dataclass
class RewardModelTunerArguments(FinetunerArguments):
    """
    Arguments for reward modeling.
    """
    pass

@dataclass
class EvaluatorArguments:
    """
    Define a class EvaluatorArguments using the dataclass decorator. The class contains
    several optional parameters that can be used to configure an evaluator.

    local_rank : str
        For distributed training: local_rank
    random_shuffle : bool
    use_wandb : bool
    random_seed : int, default = 1
    output_dir : str, default = './output_dir'
    mixed_precision : str, choice from ["bf16", "fp16"]
        Mixed precision mode, whether to use bf16 or fp16.
    deepspeed :
        Enable deepspeed and pass the path to the deepspeed json config file
        (e.g. ds_config.json) or an already loaded json file as a dict.
    temperature : float
        An argument of model.generate in huggingface to control the diversity of generation.
    repetition_penalty : float
        An argument of model.generate in huggingface to penalize repetitions.
    """
    local_rank: int = field(
        default=-1,
        metadata={"help": "For distributed training: local_rank"},
    )
    random_shuffle: Optional[bool] = field(
        default=False,
        metadata={"help": "Whether to randomly shuffle the evaluation dataset."},
    )
    use_wandb: Optional[bool] = field(
        default=False,
        metadata={"help": "When this flag is True, wandb will be enabled."},
    )
    random_seed: Optional[int] = field(
        default=1,
        metadata={"help": "Used to set the random seed."},
    )
    output_dir: Optional[str] = field(
        default="./output_dir",
        metadata={"help": "Output path for the inferenced results"},
    )
    mixed_precision: Optional[str] = field(
        default="bf16",
        metadata={
            "help": "Mixed precision mode, whether to use bf16 or fp16.",
            "choices": ["bf16", "fp16"],
        },
    )
    deepspeed: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "Enable deepspeed and pass the path to the deepspeed json config file (e.g. ds_config.json) or an"
                " already loaded json file as a dict"
            )
        },
    )
    answer_type: Optional[str] = field(
        default="text",
        metadata={
            "help": (
                'Question type for answer extraction from the decoder output.'
                ' Supported types: \n'
                '   1) "multiple_choice", e.g. A, B, C, D, ...\n'
                '   2) "binary_choice", e.g. yes, no, maybe\n'
                '   3) "math", e.g. 1.0, -3.52\n'
                '   4) "text", e.g. "I think that it is okay"\n'
                '   5) Special treatment for several datasets\n'
                '     - "gsm8k"\n'
                '     - "svamp"\n'
                '     - "asdiv"\n'
                '     - "addsub"\n'
                '     - "singleeq"\n'
                '     - "multiarith"\n'
                '     - "aqua"\n'
                '     - "csqa"\n'
                '     - "strategyqa"\n'
                '     - "pubmedqa"\n'
                '     - "medmcqa"\n'
                '     - "usmle"\n'
            )
        },
    )
    prompt_structure: Optional[str] = field(
        default="{input}",
        metadata={
            "help": (
                'Prompt structure to facilitate prompt engineering during'
                ' inference. The model will receive'
                ' `prompt_structure.format(input=input)` as its input.'
            )
        },
    )
    evaluate_block_size: Optional[int] = field(
        default=512,
        metadata={
            "help": (
                "The model will have at least block_size tokens for context when calculating the conditional"
                " likelihood of any one token (provided there are block_size preceding tokens available to"
                " condition on)."
            )
        },
    )
    metric: Optional[str] = field(
        default="accuracy",
        metadata={
            "help": "The metric the model will be evaluated on.",
            "choices": ["ppl", "perplexity", "acc", "accuracy", "nll", "neg_log_likelihood"],
        },
    )
    inference_batch_size_per_device: Optional[int] = field(
        default=1,
        metadata={
            "help": (
                "Every device will infer {inference_batch_size_per_device}"
                " samples in parallel. The inferred results will be concatenated"
                " with inputs and attached a reward."
            ),
        },
    )
    use_accelerator_for_evaluator: bool = field(
        default=False,
        metadata={"help": "Whether to use Huggingface Accelerator instead of Deepspeed."},
    )
    temperature: float = field(
        default=0,
        metadata={"help": "Temperature during inference."},
    )
    repetition_penalty: float = field(
        default=1,
        metadata={"help": "Repetition penalty during inference."},
    )
    max_new_tokens: int = field(
        default=100,
        metadata={"help": "Maximum number of new tokens during inference."},
    )

@dataclass
class InferencerArguments:
    """
    Define a class InferencerArguments using the dataclass decorator. The class contains
    several optional parameters that can be used to configure an inferencer.

    local_rank : str
        For distributed training: local_rank
    random_seed : int, default = 1
    inference_batch_size : int, default = 1
    deepspeed :
        Enable deepspeed and pass the path to the deepspeed json config file
        (e.g. ds_config.json) or an already loaded json file as a dict.
    mixed_precision : str, choice from ["bf16", "fp16"]
        Mixed precision mode, whether to use bf16 or fp16.
    temperature : float
        An argument of model.generate in huggingface to control the diversity of generation.
    repetition_penalty : float
        An argument of model.generate in huggingface to penalize repetitions.
    use_beam_search : Optional[bool]
        Whether to use beam search during inference. By default False.
    num_output_sequences : Optional[int]
        Number of output sequences to return for the given prompt, currently only used in
        vllm inference. By default 8.
    top_p : Optional[float]
        top_p for sampling. By default 1.0.
    top_k : Optional[int]
        top_k for sampling. By default -1 (no top_k).
    additional_stop_token_ids : Optional[List[int]]
        The ids of the end-of-sentence tokens. By default [].
    apply_chat_template : Optional[bool]
        Whether to apply the chat template. By default True.
    save_results : Optional[bool]
        Whether to save inference results. By default False.
    results_path : Optional[str]
        The **json file** path of inference results. By default None.
    enable_decode_inference_result : Optional[bool]
        Whether to detokenize the inference results.

        NOTE: For iterative align pipelines, whether to detokenize depends on the
        homogeneity of the policy model and the reward model (i.e., if they have the
        same tokenizer).
    use_vllm : bool, optional
        Whether to use VLLM for inference. By default False.
    vllm_tensor_parallel_size : int, optional
        The tensor parallel size for VLLM inference.
    vllm_gpu_memory_utilization : float, optional
        The proportion of GPU memory (per GPU) to use for VLLM inference.
    """
    device: str = field(
        default="gpu",
        metadata={
            "help": "The device of the chatbot.",
            "choices": ["gpu", "cpu"],
        },
    )
    local_rank: int = field(
        default=-1,
        metadata={"help": "For distributed training: local_rank"},
    )
    inference_batch_size: int = field(
        default=1,
        metadata={"help": "Batch size for inference."},
    )
    vllm_inference_batch_size: int = field(
        default=1,
        metadata={"help": "The batch size for VLLM inference."},
    )
    temperature: float = field(
        default=0.0,
        metadata={"help": "Temperature during inference."},
    )
    repetition_penalty: float = field(
        default=1,
        metadata={"help": "Repetition penalty during inference."},
    )
    max_new_tokens: int = field(
        default=100,
        metadata={"help": "Maximum number of new tokens during inference."},
    )
    random_seed: Optional[int] = field(
        default=1,
        metadata={"help": "Used to set the random seed."},
    )
    deepspeed: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "Enable deepspeed and pass the path to the deepspeed json config file (e.g. ds_config.json) or an"
                " already loaded json file as a dict"
            )
        },
    )
    mixed_precision: Optional[str] = field(
        default="bf16",
        metadata={
            "help": "Mixed precision mode, whether to use bf16 or fp16.",
            "choices": ["bf16", "fp16"],
        },
    )
    do_sample: Optional[bool] = field(
        default=False,
        metadata={"help": "Whether to turn on true random sampling during inference."},
    )
    use_accelerator: bool = field(
        default=False,
        metadata={"help": "Whether to use Huggingface Accelerator instead of Deepspeed."},
    )
    num_output_sequences: Optional[int] = field(
        default=8,
        metadata={
            "help": (
                "Number of output sequences to return for the given prompt, "
                "currently only used in vllm inference."
            )
        },
    )
    top_p: Optional[float] = field(
        default=1.0,
        metadata={"help": "top_p for sampling."},
    )
    top_k: Optional[int] = field(
        default=-1,
        metadata={"help": "top_k for sampling."},
    )
    additional_stop_token_ids: Optional[List[int]] = field(
        default_factory=lambda: [],
        metadata={"help": "The ids of the end-of-sentence tokens."},
    )
    apply_chat_template: Optional[bool] = field(
        default=True,
        metadata={"help": "Whether to apply the chat template."},
    )
    enable_decode_inference_result: Optional[bool] = field(
        default=False,
        metadata={"help": "Whether to decode the inference results."},
    )
    tensor_parallel_size: Optional[int] = field(
        default=1,
        metadata={"help": "The tensor parallel size for distributed (multi-instance) inference."},
    )
    enable_distributed_inference: Optional[bool] = field(
        default=False,
        metadata={"help": "Whether to use multi-instance VLLM inference."},
    )
    distributed_inference_num_instances: Optional[int] = field(
        default=1,
        metadata={"help": "The number of instances for multi-instance VLLM inference."},
    )

    # vllm inference args
    use_vllm: bool = field(
        default=False,
        metadata={"help": "Whether to use VLLM for inference. By default False."},
    )
    vllm_tensor_parallel_size: Optional[int] = field(
        default=1,
        metadata={"help": "The tensor parallel size for VLLM inference."},
    )
    vllm_gpu_memory_utilization: Optional[float] = field(
        default=0.95,
        metadata={"help": "The GPU memory utilization for VLLM inference."},
    )

    # Args for result saving
    save_results: Optional[bool] = field(
        default=False,
        metadata={"help": "Whether to save inference results."},
    )
    results_path: Optional[str] = field(
        default=None,
        metadata={"help": "The path of inference results."},
    )

    def __post_init__(self):
        if self.save_results:
            if self.results_path is None:
                raise ValueError("Need to specify results_path when save_results is True.")
            else:
                if not self.results_path.endswith(".json"):
                    raise ValueError("The results_path must be a json file.")
                else:
                    Path(self.results_path).parent.mkdir(parents=True, exist_ok=True)
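
# Illustrative usage sketch (not part of this module): when `use_vllm` is True,
# the vllm-related fields above map naturally onto vLLM's engine and sampling
# arguments, e.g. (the names `engine` and `sampling` are hypothetical):
#
#     from vllm import LLM, SamplingParams
#     engine = LLM(
#         model="gpt2",  # hypothetical model name
#         tensor_parallel_size=inferencer_args.vllm_tensor_parallel_size,
#         gpu_memory_utilization=inferencer_args.vllm_gpu_memory_utilization,
#     )
#     sampling = SamplingParams(
#         n=inferencer_args.num_output_sequences,
#         temperature=inferencer_args.temperature,
#         top_p=inferencer_args.top_p,
#         top_k=inferencer_args.top_k,
#         max_tokens=inferencer_args.max_new_tokens,
#     )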

@dataclass
class RaftAlignerArguments(TrainingArguments):
    """
    Define a class RaftAlignerArguments to configure the raft aligner.
    """
    output_reward_path: Optional[str] = field(
        default="tmp/raft_aligner/",
        metadata={"help": "The path of output rewards."},
    )
    output_min_length: Optional[int] = field(
        default=64,
        metadata={
            "help": (
                "Minimum length of the output token sequence generated from"
                " the model given an input."
            ),
        },
    )
    output_max_length: Optional[int] = field(
        default=128,
        metadata={
            "help": (
                "Maximum length of the output token sequence generated from"
                " the model given an input."
            ),
        },
    )
    num_raft_iteration: Optional[int] = field(
        default=20,
        metadata={"help": "Number of iterations of the raft aligner."},
    )
    raft_batch_size: Optional[int] = field(
        default=1024,
        metadata={"help": "Only select {raft_batch_size} samples each time for SFT training."},
    )
    top_reward_percentage: Optional[float] = field(
        default=0.2,
        metadata={
            "help": (
                "Only the top {top_reward_percentage} samples in the raft batch"
                " (in terms of rewards) will be used for SFT of the model."
            ),
        },
    )
    inference_batch_size_per_device: Optional[int] = field(
        default=1,
        metadata={
            "help": (
                "Every device will infer {inference_batch_size_per_device}"
                " samples in parallel. The inferred results will be concatenated"
                " with inputs and attached a reward."
            ),
        },
    )
    collection_strategy: Optional[str] = field(
        default="top",
        metadata={
            "help": (
                "{collection_strategy} is either top or local:"
                " top means that we rank the samples globally regardless of the prompts;"
                " local means that we only rank the samples with the same prompt."
            ),
        },
    )
    # An illustrative sketch of the "top" selection appears after this class.
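
# Illustrative sketch (added for exposition; not part of lmflow): with the
# "top" collection strategy, candidates are ranked globally by reward and only
# the top `top_reward_percentage` fraction is kept for SFT, roughly:
def _example_select_top_samples(samples: List[Dict], top_reward_percentage: float) -> List[Dict]:
    """Keep the highest-reward fraction of samples, ranked globally by 'reward'."""
    ranked = sorted(samples, key=lambda sample: sample["reward"], reverse=True)
    num_kept = max(1, int(len(ranked) * top_reward_percentage))
    return ranked[:num_kept]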

@dataclass
class BenchmarkingArguments:
    dataset_name: Optional[str] = field(
        default=None,
        metadata={"help": "Benchmark dataset name provided by lmflow."},
    )
    lm_evaluation_metric: Optional[str] = field(
        default="accuracy",
        metadata={
            "help": "The metric the model will be evaluated on.",
            "choices": [
                "acc", "acc_norm", "bleu", "chrf", "em", "f1", "ppl",
                "ter", "r@1", "r@2", "mrr", "mc1", "mc2", "word_perplexity",
                "byte_perplexity", "bits_per_byte",
            ],
        },
    )

@dataclass
class DPOAlignerArguments:
    """
    The arguments for the DPO training script.
    """
    local_rank: int = field(
        default=-1,
        metadata={"help": "For distributed training: local_rank"},
    )

    # data parameters
    beta: Optional[float] = field(
        default=0.1,
        metadata={"help": "The beta parameter for the DPO loss."},
    )

    # training parameters
    learning_rate: Optional[float] = field(
        default=5e-4,
        metadata={"help": "Optimizer learning rate."},
    )
    lr_scheduler_type: Optional[str] = field(
        default="cosine",
        metadata={"help": "The lr scheduler type."},
    )
    warmup_steps: Optional[int] = field(
        default=100,
        metadata={"help": "The number of warmup steps."},
    )
    weight_decay: Optional[float] = field(
        default=0.05,
        metadata={"help": "The weight decay."},
    )
    optimizer_type: Optional[str] = field(
        default="paged_adamw_32bit",
        metadata={"help": "The optimizer type."},
    )
    per_device_train_batch_size: Optional[int] = field(
        default=4,
        metadata={"help": "Train batch size per device."},
    )
    per_device_eval_batch_size: Optional[int] = field(
        default=1,
        metadata={"help": "Eval batch size per device."},
    )
    gradient_accumulation_steps: Optional[int] = field(
        default=4,
        metadata={"help": "The number of gradient accumulation steps."},
    )
    gradient_checkpointing: Optional[bool] = field(
        default=True,
        metadata={"help": "Whether to use gradient checkpointing."},
    )
    gradient_checkpointing_use_reentrant: Optional[bool] = field(
        default=False,
        metadata={"help": "Whether to use reentrant mode for gradient checkpointing."},
    )
    max_prompt_length: Optional[int] = field(
        default=512,
        metadata={"help": "The maximum prompt length."},
    )
    max_length: Optional[int] = field(
        default=1024,
        metadata={"help": "The maximum sequence length."},
    )
    max_steps: Optional[int] = field(
        default=1000,
        metadata={"help": "Max number of training steps."},
    )
    logging_steps: Optional[int] = field(
        default=10,
        metadata={"help": "The logging frequency."},
    )
    save_steps: Optional[int] = field(
        default=100,
        metadata={"help": "The saving frequency."},
    )
    eval_steps: Optional[int] = field(
        default=100,
        metadata={"help": "The evaluation frequency."},
    )
    output_dir: Optional[str] = field(
        default="./results",
        metadata={"help": "The output directory."},
    )
    log_freq: Optional[int] = field(
        default=1,
        metadata={"help": "The logging frequency."},
    )
    sanity_check: Optional[bool] = field(
        default=False,
        metadata={"help": "Only train on 1000 samples."},
    )
    report_to: Optional[str] = field(
        default="wandb",
        metadata={
            "help": (
                'The list of integrations to report the results and logs to. Supported platforms are `"azure_ml"`, '
                '`"comet_ml"`, `"mlflow"`, `"neptune"`, `"tensorboard"`, `"clearml"` and `"wandb"`. '
                'Use `"all"` to report to all integrations installed, `"none"` for no integrations.'
            )
        },
    )
    seed: Optional[int] = field(
        default=0,
        metadata={"help": "Random seed that will be set at the beginning of training."},
    )
    run_name: Optional[str] = field(
        default="dpo",
        metadata={"help": "The name of the run."},
    )
    eval_dataset_path: Optional[str] = field(
        default=None,
        metadata={"help": "The path of the eval dataset."},
    )

@dataclass
class DPOv2AlignerArguments(FinetunerArguments):
    """
    The arguments for the DPOv2 training script.
    """

    # general args
    random_seed: Optional[int] = field(
        default=42,
        metadata={"help": "The random seed."},
    )
    accelerate_config_file: Optional[str] = field(
        default=None,
        metadata={"help": "File path for the accelerate config file, only used in memory-safe dpov2 align."},
    )

    # pair sampling args
    margin_scale: Optional[float] = field(
        default=1.0,
        metadata={"help": "The margin scale."},
    )
    sampling_paired_method: Optional[str] = field(
        default="max_random",
        metadata={"help": "The pair sampling method."},
    )
    length_penalty: Optional[float] = field(
        default=0,
        metadata={"help": "The length penalty."},
    )

    # data collator args
    max_length: Optional[int] = field(
        default=2048,
        metadata={"help": "The maximum sequence length, prompt + output."},
    )
    max_prompt_length: Optional[int] = field(
        default=1000,
        metadata={"help": "The maximum prompt length."},
    )
    mask_prompt: Optional[bool] = field(
        default=False,
        metadata={"help": "Whether to mask the prompt."},
    )

    # dpov2 aligner args
    beta: Optional[float] = field(
        default=0.1,
        metadata={"help": "The beta parameter for the DPO loss."},
    )
    loss_type: Optional[str] = field(
        default="sigmoid",
        metadata={"help": "The loss type."},
    )
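
# Illustrative sketch (added for exposition; not part of lmflow): with
# loss_type="sigmoid", the standard DPO objective for a single preference pair
# is -log(sigmoid(beta * margin)), where the margin is the difference of the
# policy and reference log-probability gaps between the chosen and rejected
# responses:
def _example_dpo_sigmoid_loss(
    policy_chosen_logp: float,
    policy_rejected_logp: float,
    ref_chosen_logp: float,
    ref_rejected_logp: float,
    beta: float = 0.1,
) -> float:
    """Sigmoid DPO loss for one pair; logps are summed token log-probabilities."""
    import math  # local import to keep this sketch self-contained

    margin = (policy_chosen_logp - policy_rejected_logp) - (ref_chosen_logp - ref_rejected_logp)
    # -log(sigmoid(x)) == log(1 + exp(-x))
    return math.log1p(math.exp(-beta * margin))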

@dataclass
class IterativeAlignerArguments(InferencerArguments):
    """
    Arguments for iterative aligners.
    """
    dataset_path_list: List[str] = field(
        default_factory=list,
        metadata={"help": "The list of dataset paths for iterative aligners."},
    )
    initial_iter_idx: int = field(
        default=0,
        metadata={"help": "The initial iteration index; 0 refers to the first dataset in dataset_path_list."},
    )

@dataclass
class IterativeDPOAlignerArguments(IterativeAlignerArguments, DPOv2AlignerArguments):
    """
    Arguments for iterative DPO aligners.
    """
    output_dir: Optional[str] = field(
        default="./runs",
        metadata={"help": "Output path for the inferenced results"},
    )
    reward_model_inference_batch_size: int = field(
        default=1,
        metadata={"help": "The batch size for reward model inference."},
    )
    reward_model_inference_block_size: int = field(
        default=2048,
        metadata={"help": "The block size for reward model inference."},
    )
    do_response_generation: bool = field(
        default=True,
        metadata={"help": "Whether to generate responses using the model."},
    )
    do_scoring: bool = field(
        default=True,
        metadata={"help": "Whether to score the responses using the reward model."},
    )
    do_dpo_align: bool = field(
        default=True,
        metadata={"help": "Whether to perform DPO alignment."},
    )

PIPELINE_ARGUMENT_MAPPING = {
    "finetuner": FinetunerArguments,
    "evaluator": EvaluatorArguments,
    "inferencer": InferencerArguments,
    "vllm_inferencer": InferencerArguments,
    "rm_inferencer": InferencerArguments,
    "raft_aligner": RaftAlignerArguments,
    "dpo_aligner": DPOAlignerArguments,
    "rm_tuner": RewardModelTunerArguments,
    "dpov2_aligner": DPOv2AlignerArguments,
    "iterative_dpo_aligner": IterativeDPOAlignerArguments,
}

class AutoArguments:
    """
    Automatically choose the pipeline argument class from PIPELINE_ARGUMENT_MAPPING,
    e.g. FinetunerArguments for "finetuner" or EvaluatorArguments for "evaluator".
    """

    def get_pipeline_args_class(pipeline_name: str):
        return PIPELINE_ARGUMENT_MAPPING[pipeline_name]
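
# Illustrative usage sketch (not part of this module): a driver script would
# typically combine AutoArguments with transformers.HfArgumentParser, e.g.:
#
#     pipeline_name = "finetuner"
#     PipelineArguments = AutoArguments.get_pipeline_args_class(pipeline_name)
#
#     from transformers import HfArgumentParser
#     parser = HfArgumentParser((ModelArguments, DatasetArguments, PipelineArguments))
#     model_args, data_args, pipeline_args = parser.parse_args_into_dataclasses()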