# NOTE(review): removed paste residue ("Spaces:" / "Runtime error") that was
# not part of the original source file.
#!/usr/bin/env python
# coding=utf-8
"""Argument dataclasses for the training / evaluation / inference pipelines.

Defines ModelArguments, DatasetArguments, and the per-pipeline argument
classes, plus AutoArguments to select one by pipeline name.

MODEL_CONFIG_CLASSES is the list of config classes registered for causal LM
in transformers; MODEL_TYPES is the tuple of their `model_type` identifiers,
used to populate the `--model_type` help string.
"""
from dataclasses import dataclass, field
from typing import List, Optional

from transformers import (
    MODEL_FOR_CAUSAL_LM_MAPPING,
    TrainingArguments,
)
from transformers.utils.versions import require_version

MODEL_CONFIG_CLASSES = list(MODEL_FOR_CAUSAL_LM_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
@dataclass
class ModelArguments:
    """Arguments that configure which model to load and how.

    model_name_or_path : str
        path or name of a pretrained model checkpoint for weights
        initialization. If None, a model will be trained from scratch.
    lora_model_path : str
        incremental model diff introduced by LoRA finetuning; combined with
        the original non-finetuned model it forms the whole finetuned model.
    model_type : str
        type of model to use when training from scratch.
    arch_type : str
        architecture type of the model ("decoder_only", "encoder_decoder",
        or "text_regression").
    config_overrides : str
        default config settings to override when training from scratch.
    config_name : str
        name or path of the pretrained config, if different from
        model_name_or_path.
    tokenizer_name : str
        name or path of the pretrained tokenizer, if different from
        model_name_or_path.
    cache_dir : str
        directory where pretrained models downloaded from huggingface.co
        will be stored.
    use_fast_tokenizer : bool
        whether to use a fast tokenizer (backed by the tokenizers library).
    model_revision : str
        specific model version to use (branch name, tag name, or commit id).
    use_auth_token : bool
        whether to use the token generated by `huggingface-cli login`
        (necessary for private models).
    torch_dtype : str
        dtype to load the model under; "auto" derives it from the weights.
    use_lora / lora_r / lora_alpha / lora_target_modules / lora_dropout /
    save_aggregated_lora :
        LoRA finetuning configuration.
    use_ram_optimized_load : bool
        whether to use disk mapping when memory is not enough.
    """

    model_name_or_path: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "The model checkpoint for weights initialization."
                "Don't set if you want to train a model from scratch."
            )
        },
    )
    lora_model_path: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "The incremental model diff introduced by LoRA finetuning."
                " Along with the original non-finetuned model forms the whole"
                " finetuned model."
            )
        },
    )
    model_type: Optional[str] = field(
        default=None,
        metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)},
    )
    config_overrides: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "Override some existing default config settings when a model is trained from scratch. Example: "
                "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index"
            )
        },
    )
    # NOTE: `arch_type` was declared twice in an earlier revision; only this
    # richer declaration (with explicit choices) is kept.
    arch_type: Optional[str] = field(
        default="decoder_only",
        metadata={
            "help": (
                "Model architecture type, e.g. \"decoder_only\","
                " \"encoder_decoder\""
            ),
            "choices": ["decoder_only", "encoder_decoder", "text_regression"],
        },
    )
    config_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
    )
    tokenizer_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
    )
    use_fast_tokenizer: bool = field(
        default=True,
        metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
    )
    model_revision: str = field(
        default="main",
        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
    )
    use_auth_token: bool = field(
        default=False,
        metadata={
            "help": (
                "Will use the token generated when running `huggingface-cli login` (necessary to use this script "
                "with private models)."
            )
        },
    )
    torch_dtype: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "Override the default `torch.dtype` and load the model under this dtype. If `auto` is passed, the "
                "dtype will be automatically derived from the model's weights."
            ),
            "choices": ["auto", "bfloat16", "float16", "float32"],
        },
    )
    use_lora: bool = field(
        default=False,
        metadata={"help": "Whether to lora."},
    )
    lora_r: int = field(
        default=8,
        metadata={"help": "the rank of the lora parameters. The smaller lora_r is , the fewer parameters lora has."},
    )
    lora_alpha: int = field(
        default=32,
        metadata={"help": "Merging ratio between the fine-tuned model and the original. This is controlled by a parameter called alpha in the paper."},
    )
    # FIX: was annotated `List[str]` with a None default, and its help text
    # was a copy-paste of config_name's help.
    lora_target_modules: Optional[List[str]] = field(
        default=None,
        metadata={"help": "Names of the modules to apply LoRA to."},
    )
    lora_dropout: float = field(
        default=0.1,
        metadata={"help": "The dropout rate in lora.linear."},
    )
    save_aggregated_lora: bool = field(
        default=False,
        metadata={"help": "Whether to save aggregated lora."},
    )
    use_ram_optimized_load: bool = field(
        default=True,
        metadata={"help": "Whether use disk mapping when memory is not enough."},
    )

    def __post_init__(self):
        # `--config_overrides` only makes sense when building a fresh config,
        # so it is mutually exclusive with loading an existing config/model.
        if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None):
            raise ValueError(
                "--config_overrides can't be used in combination with --config_name or --model_name_or_path"
            )
@dataclass
class DatasetArguments:
    """Arguments that configure the dataset used for training/evaluation.

    dataset_path : str
        path of the dataset to use.
    dataset_name : str
        name of the dataset to use; default "customized".
    is_custom_dataset : bool
        whether to use custom data; default False.
    customized_cache_dir : str
        directory where customized dataset caches will be stored.
    dataset_config_name : str
        configuration name of the dataset (via the datasets library).
    train_file / validation_file / test_file : str
        input data files (csv, json or txt).
    max_train_samples / max_eval_samples : int
        truncate the train/eval dataset to this many examples (debugging /
        quicker runs).
    streaming : bool
        enable streaming mode (requires datasets>=2.0.0).
    block_size : int
        optional input sequence length after tokenization; training data is
        truncated in blocks of this size.

    Further knobs: `overwrite_cache`, `validation_split_percentage`,
    `preprocessing_num_workers`, `disable_group_texts`, `keep_linebreaks`.
    """

    dataset_path: Optional[str] = field(
        default=None, metadata={"help": "The path of the dataset to use."}
    )
    dataset_name: Optional[str] = field(
        default="customized", metadata={"help": "Should be \"customized\""}
    )
    is_custom_dataset: Optional[bool] = field(
        default=False, metadata={"help": "whether to use custom data"}
    )
    customized_cache_dir: Optional[str] = field(
        default=".cache/llm-ft/datasets",
        metadata={"help": "Where do you want to store the customized dataset caches"},
    )
    dataset_config_name: Optional[str] = field(
        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
    )
    train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."})
    validation_file: Optional[str] = field(
        default=None,
        metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."},
    )
    max_train_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of training examples to this "
                "value if set."
            )
        },
    )
    # FIX: default was the float literal 1e10 despite the Optional[int]
    # annotation; use an exact int so downstream integer arithmetic is safe.
    max_eval_samples: Optional[int] = field(
        default=int(1e10),
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
                "value if set."
            )
        },
    )
    streaming: bool = field(default=False, metadata={"help": "Enable streaming mode"})
    block_size: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "Optional input sequence length after tokenization. "
                "The training dataset will be truncated in block of this size for training. "
                "Default to the model max input length for single sentence inputs (take into account special tokens)."
            )
        },
    )
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
    )
    validation_split_percentage: Optional[int] = field(
        default=5,
        metadata={
            "help": "The percentage of the train set used as validation set in case there's no validation split"
        },
    )
    preprocessing_num_workers: Optional[int] = field(
        default=None,
        metadata={"help": "The number of processes to use for the preprocessing."},
    )
    disable_group_texts: bool = field(
        default=False,
        metadata={
            "help": (
                "Whether we group original samples together to generate sample"
                " sequences of length `block_size`. By default, we group every"
                " 1000 tokenized sequences together, divide them into "
                " [{total_num_tokens} / {block_size}] sequences, each with"
                " `block_size` tokens (the remaining tokens are omitted."
                " If this flag is set to True, we only group 1 tokenized"
                " sequence, i.e. cutting long sequence into chunks."
            )
        },
    )
    keep_linebreaks: bool = field(
        default=True, metadata={"help": "Whether to keep line breaks when using TXT files or not."}
    )
    test_file: Optional[str] = field(
        default=None,
        metadata={"help": "Evaluation File Path"},
    )

    def __post_init__(self):
        if self.streaming:
            require_version("datasets>=2.0.0", "The streaming feature requires `datasets>=2.0.0`")
        if self.dataset_name is None and self.train_file is None and self.validation_file is None:
            raise ValueError("Need either a dataset name or a training/validation file.")
        else:
            # Only csv/json/txt files are supported by the loading code.
            if self.train_file is not None:
                extension = self.train_file.split(".")[-1]
                assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file."
            if self.validation_file is not None:
                extension = self.validation_file.split(".")[-1]
                assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file."
@dataclass
class FinetunerArguments(TrainingArguments):
    """
    Adapt transformers.TrainingArguments for the finetuner pipeline.

    No extra fields yet; the subclass exists so pipeline-specific options can
    be added without touching the other argument classes.
    """

    pass
@dataclass
class EvaluatorArguments:
    """Optional parameters configuring an evaluator.

    local_rank : int
        for distributed training: local_rank.
    random_shuffle : bool
    use_wandb : bool
    random_seed : int, default = 1
    output_dir : str, default = './output_dir'
    mixed_precision : str, one of ["bf16", "fp16"]
        mixed precision mode, whether to use bf16 or fp16.
    deepspeed :
        enable deepspeed and pass the path to a deepspeed json config file
        (e.g. ds_config.json) or an already loaded json file as a dict.
    answer_type / prompt_structure / evaluate_block_size / metric :
        answer-extraction, prompt-engineering and scoring configuration.
    """

    local_rank: int = field(
        default=-1,
        metadata={"help": "For distributed training: local_rank"},
    )
    random_shuffle: Optional[bool] = field(
        default=False,
        metadata={"help": ""},
    )
    use_wandb: Optional[bool] = field(
        default=False,
        metadata={
            "help": (
                "When this flag is True, wandb will be enabled"
            )
        },
    )
    random_seed: Optional[int] = field(
        default=1,
        metadata={
            "help": (
                "used to set random seed"
            )
        },
    )
    output_dir: Optional[str] = field(
        default="./output_dir",
        metadata={"help": "Output path for the inferenced results"},
    )
    mixed_precision: Optional[str] = field(
        default="bf16",
        metadata={
            "help": (
                "mixed precision mode, whether to use bf16 or fp16"
            ),
            "choices": ["bf16","fp16"],
        },
    )
    deepspeed: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "Enable deepspeed and pass the path to deepspeed json config file (e.g. ds_config.json) or an already"
                " loaded json file as a dict"
            )
        },
    )
    answer_type: Optional[str] = field(
        default="text",
        metadata={
            "help": (
                'Question type for answer extraction from the decoder output.'
                ' Supported types: \n'
                '   1) "multiple_choice", e.g. A, B, C, D, ...\n'
                '   2) "binary_choice", e.g. yes, no, maybe\n'
                '   3) "math", e.g. 1.0, -3.52\n'
                '   4) "text", e.g. "I think that it is okay"\n'
                '   5) Special treatment for several datasets\n'
                '     - "gsm8k"\n'
                '     - "svamp"\n'
                '     - "asdiv"\n'
                '     - "addsub"\n'
                '     - "singleeq"\n'
                '     - "multiarith"\n'
                '     - "aqua"\n'
                '     - "csqa"\n'
                '     - "strategyqa"\n'
                '     - "pubmedqa"\n'
                '     - "medmcqa"\n'
                '     - "usmle"\n'
            )
        },
    )
    prompt_structure: Optional[str] = field(
        default="{input}",
        metadata={
            "help": (
                'Prompt structure to facilitate prompt engineering during'
                ' inference. The model will receive'
                ' `prompt_structure.format(input=input)` as its input.'
            )
        },
    )
    evaluate_block_size: Optional[int] = field(
        default=512,
        metadata={
            "help": (
                "the model will have at least block_size tokens for context when calculating the conditional likelihood of any one token"
                " (provided there are block_size preceding tokens available to condition on)"
            )
        },
    )
    metric: Optional[str] = field(
        default="accuracy",
        metadata={
            "help": "the metric the model will be evaluated on",
            "choices": ["ppl", "perplexity", "acc", "accuracy", "nll", "neg_log_likelihood"],
        },
    )
@dataclass
class InferencerArguments:
    """Optional parameters configuring an inferencer.

    device : str, one of ["gpu", "cpu"]
        device of chatbot.
    local_rank : int
        for distributed training: local_rank.
    random_seed : int, default = 1
    deepspeed :
        enable deepspeed and pass the path to a deepspeed json config file
        (e.g. ds_config.json) or an already loaded json file as a dict.
    mixed_precision : str, one of ["bf16", "fp16"]
        mixed precision mode, whether to use bf16 or fp16.
    """

    device: str = field(
        default="gpu",
        metadata={
            "help": "device of chatbot",
            "choices": ["gpu", "cpu"],
        },
    )
    local_rank: int = field(
        default=-1,
        metadata={"help": "For distributed training: local_rank"},
    )
    random_seed: Optional[int] = field(
        default=1,
        metadata={
            "help": (
                "used to set random seed"
            )
        },
    )
    deepspeed: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "Enable deepspeed and pass the path to deepspeed json config file (e.g. ds_config.json) or an already"
                " loaded json file as a dict"
            )
        },
    )
    mixed_precision: Optional[str] = field(
        default="bf16",
        metadata={
            "help": (
                "mixed precision mode, whether to use bf16 or fp16"
            ),
            "choices": ["bf16","fp16"],
        },
    )
@dataclass
class RaftAlignerArguments(TrainingArguments):
    """Configuration for the RAFT aligner, extending TrainingArguments."""

    output_reward_path: Optional[str] = field(
        default="tmp/raft_aligner/",
        metadata={
            "help": "The path of output rewards."
        },
    )
    output_min_length: Optional[int] = field(
        default=16,
        metadata={
            "help": (
                "minimum length of the output token sequence generated from"
                " model given an input."
            ),
        },
    )
    output_max_length: Optional[int] = field(
        default=48,
        metadata={
            "help": (
                "maximum length of the output token sequence generated from"
                " model given an output."
            ),
        },
    )
    num_raft_iteration: Optional[int] = field(
        default=20,
        metadata={
            "help": "number of iterations of the raft aligner."
        },
    )
    raft_batch_size: Optional[int] = field(
        default=320,
        metadata={
            "help": (
                "only select {raft_batch_size} samples each time to"
                " generate rewards and be ranked for SFT training."
            )
        },
    )
    # FIX: annotation was Optional[int] while the default (0.2) and the
    # semantics ("percentage of the batch") are fractional.
    top_reward_percentage: Optional[float] = field(
        default=0.2,
        metadata={
            "help": (
                "only top {top_reward_percentage} samples in the raft batch,"
                " (in terms of rewards), will be used for SFT the model."
            ),
        },
    )
    inference_batch_size_per_device: Optional[int] = field(
        default=1,
        metadata={
            "help": (
                "every device will infer {inference_batch_size_per_device}"
                " samples in parallel. The inferred results will be concatenated"
                " with inputs and attach a reward."
            ),
        },
    )
# Registry mapping each pipeline name to the argument dataclass its
# runner expects; consumed by AutoArguments.get_pipeline_args_class.
PIPELINE_ARGUMENT_MAPPING = {
    "finetuner": FinetunerArguments,
    "evaluator": EvaluatorArguments,
    "inferencer": InferencerArguments,
    "raft_aligner": RaftAlignerArguments,
}
class AutoArguments:
    """
    Automatically choose arguments from FinetunerArguments, EvaluatorArguments,
    InferencerArguments or RaftAlignerArguments by pipeline name.
    """

    # FIX: the original def had neither `self` nor @staticmethod and only
    # worked when called on the class object; make the intent explicit.
    @staticmethod
    def get_pipeline_args_class(pipeline_name: str):
        """Return the argument class registered for `pipeline_name`.

        Raises KeyError if the pipeline name is unknown.
        """
        return PIPELINE_ARGUMENT_MAPPING[pipeline_name]