"""This is a class called HFEncoderDecoderModel which is a wrapper around the transformers
model and tokenizer classes. It has several methods, such as __init__, tokenize, and
inference, that are used for loading and running the model. The __init__ method takes in
several arguments, such as model_args, tune_strategy, and ds_config, which are used to load
the pretrained model and tokenizer and to initialize the runtime settings.

The tokenize method is intended to tokenize input text and return the input IDs and
attention masks that can be fed to the model for training or inference; it is not yet
implemented and currently raises NotImplementedError.

This class accepts different tune_strategy options such as 'normal', 'none', 'lora', and
'adapter', which are meant to allow different fine-tuning settings of the model. However,
only 'none' (loading for inference) is currently implemented; the other strategies raise
NotImplementedError.

Overall, this class provides a convenient interface for loading and running encoder-decoder
transformer models and can be used for various NLP tasks such as translation, summarization,
and question answering.
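
A minimal usage sketch (hypothetical: assumes `model_args` is a populated model-arguments
object with the attributes referenced in __init__, and `ds_config` is a valid deepspeed
config dict):

    model = HFEncoderDecoderModel(model_args, tune_strategy='none',
                                  ds_config=ds_config, device='gpu')
    input_ids = torch.tensor([model.encode("translate English to German: Hello")])
    output_ids = model.inference(input_ids, max_new_tokens=32)
    print(model.decode(output_ids[0]))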
"""

import logging
from typing import List, Union

import deepspeed
import torch
from peft import PeftModel
from transformers import (
    AutoModel,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
)
from transformers.deepspeed import HfDeepSpeedConfig

from lmflow.models.encoder_decoder_model import EncoderDecoderModel
from lmflow.models.interfaces.tunable import Tunable


logger = logging.getLogger(__name__)


class HFEncoderDecoderModel(EncoderDecoderModel, Tunable):
    r"""
    Initializes a HFEncoderDecoderModel instance.

    Parameters
    ------------

    model_args :
        Model arguments such as model name, path, revision, etc.

    tune_strategy : str or none, default="normal"
        A string representing the tuning strategy: "normal", "none", "lora",
        or "adapter". Currently only "none" (load for inference) is
        implemented; the others raise NotImplementedError.

    ds_config :
        Deepspeed configurations.

    device : str, default="gpu"
        The device to run the model on, either "gpu" or "cpu".

    args : Optional.
        Positional arguments.

    kwargs : Optional.
        Keyword arguments.
    """

    def __init__(
        self,
        model_args,
        tune_strategy='normal',
        ds_config=None,
        device="gpu",
        *args,
        **kwargs
    ):
        """
        Initializes a HFEncoderDecoderModel instance.
        :param model_args: dictionary with model arguments such as model name, path, revision, etc.
        :param tune_strategy: tuning strategy: normal, none, lora or adapter
        :param ds_config: deepspeed configuration for distributed training
        :param device: device to run the model on, "gpu" or "cpu"
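
        A minimal ds_config sketch for ZeRO-3 inference (an assumption for
        illustration, not the project's shipped config):

            ds_config = {
                "fp16": {"enabled": True},
                "zero_optimization": {"stage": 3},
                "train_micro_batch_size_per_gpu": 1,
            }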
        """
        # Keep the model arguments around; merge_lora_weights() and save()
        # read self.model_args.use_lora later.
        self.model_args = model_args
        self.device = device

        if tune_strategy == 'normal':
            raise NotImplementedError(
                f"tune_strategy \"{tune_strategy}\" is not supported"
            )
        elif tune_strategy == 'none':
            # Keep the deepspeed config object alive so that transformers
            # picks up the ZeRO-3 settings during from_pretrained.
            dschf = HfDeepSpeedConfig(ds_config)
            peft_model_id = model_args.lora_model_path

            if "llama" in model_args.model_name_or_path and model_args.use_ram_optimized_load:
                logger.warning(
                    "llama does not support RAM optimized load. Automatically"
                    " use original load instead."
                )
                model_args.use_ram_optimized_load = False

            if model_args.model_name_or_path == 'THUDM/chatglm-6b':
                self.backend_model = AutoModel.from_pretrained(
                    model_args.model_name_or_path,
                    trust_remote_code=True,
                )
            elif model_args.use_ram_optimized_load and peft_model_id is None:
                try:
                    # RAM-optimized load: shard across available devices and
                    # offload weights that do not fit to disk.
                    self.backend_model = AutoModelForSeq2SeqLM.from_pretrained(
                        model_args.model_name_or_path,
                        device_map="auto",
                        offload_folder="offload",
                        offload_state_dict=True,
                    )
                except Exception:
                    logger.warning(
                        "Failed to use RAM optimized load. Automatically"
                        " use original load instead."
                    )
                    self.backend_model = AutoModelForSeq2SeqLM.from_pretrained(
                        model_args.model_name_or_path,
                    )
            else:
                if peft_model_id is not None:
                    logger.warning(
                        "LoRA does not support RAM optimized load currently."
                        " Automatically use original load instead."
                    )
                self.backend_model = AutoModelForSeq2SeqLM.from_pretrained(
                    model_args.model_name_or_path,
                )

            self.tokenizer = AutoTokenizer.from_pretrained(
                model_args.model_name_or_path,
                trust_remote_code=True,
            )
            # Keep a handle to the full model so that save() can still reach
            # it after the PEFT wrapper is placed around backend_model.
            self.backend_model_full = self.backend_model
            if peft_model_id is not None:
                self.backend_model = PeftModel.from_pretrained(
                    self.backend_model, peft_model_id
                )

            if device == "gpu":
                deepspeed.init_distributed()
                self.ds_engine = deepspeed.initialize(
                    model=self.backend_model,
                    config_params=ds_config,
                )[0]
                self.ds_engine.module.eval()

        elif tune_strategy == 'adapter':
            raise NotImplementedError('adapter tune strategy not implemented')
        else:
            # 'lora' and any other unknown strategies are not implemented for
            # encoder-decoder models yet; fail loudly instead of silently
            # skipping model loading.
            raise NotImplementedError(
                f"tune_strategy \"{tune_strategy}\" is not supported"
            )

    def tokenize(self, dataset, *args, **kwargs):
        """
        Tokenize the full dataset.

        Parameters
        ------------
        dataset :
            Text dataset.

        args : Optional.
            Positional arguments.

        kwargs : Optional.
            Keyword arguments.

        Returns
        ------------
        tokenized_datasets :
            The tokenized dataset.
        """
        raise NotImplementedError('tokenize not implemented')

    def encode(self, input: Union[str, List[str]], *args, **kwargs) -> Union[List[int], List[List[int]]]:
        """
        Perform encoding process of the tokenizer.

        Parameters
        ------------
        input : str or list.
            The text sequence.

        args : Optional.
            Positional arguments.

        kwargs : Optional.
            Keyword arguments.

        Returns
        ------------
        outputs :
            The tokenized inputs.
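
        Examples
        ------------
        A minimal sketch (token IDs depend on the loaded tokenizer, so no
        output values are shown):

        >>> ids = model.encode("Hello world")          # str -> List[int]
        >>> batch = model.encode(["Hello", "world"])   # list -> List[List[int]]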
        """
        if isinstance(input, list):
            # Encode each sequence separately and preserve the batch order.
            output = []
            for single_input in input:
                single_output = self.encode(single_input, *args, **kwargs)
                output.append(single_output)
            return output
        elif isinstance(input, str):
            return self.tokenizer.encode(input, *args, **kwargs)
        else:
            raise NotImplementedError(f'type "{type(input)}" cannot be encoded')

    def decode(self, input, *args, **kwargs) -> Union[str, List[str]]:
        """
        Perform decoding process of the tokenizer.

        Parameters
        ------------
        input : list.
            The token sequence.

        args : Optional.
            Positional arguments.

        kwargs : Optional.
            Keyword arguments.

        Returns
        ------------
        outputs :
            The text decoded from the token inputs.
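
        Examples
        ------------
        A minimal sketch, continuing the encode() example above:

        >>> text = model.decode(ids)      # List[int] -> str
        >>> texts = model.decode(batch)   # List[List[int]] -> List[str]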
        """
        if isinstance(input, list) and input and isinstance(input[0], list):
            # A batch of token sequences: decode each one separately.
            output = []
            for single_input in input:
                single_output = self.decode(single_input, *args, **kwargs)
                output.append(single_output)
            return output
        else:
            # A single token sequence.
            return self.tokenizer.decode(input, *args, **kwargs)

    def inference(self, inputs, *args, **kwargs):
        """
        Perform generation process of the model.

        Parameters
        ------------
        inputs :
            The sequence used as a prompt for the generation or as model inputs to the model.

        args : Optional.
            Positional arguments.

        kwargs : Optional.
            Keyword arguments.

        Returns
        ------------
        outputs :
            The generated sequence output
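
        Examples
        ------------
        A minimal sketch (max_new_tokens is an ordinary generate() keyword
        argument, shown here only for illustration):

        >>> input_ids = torch.tensor([model.encode("translate: hi")])
        >>> output_ids = model.inference(input_ids, max_new_tokens=32)
        >>> model.decode(output_ids[0].tolist())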
        """
        with torch.no_grad():
            if self.device == "gpu":
                outputs = self.ds_engine.module.generate(
                    *args,
                    input_ids=inputs,
                    synced_gpus=True,
                    pad_token_id=self.tokenizer.eos_token_id,
                    **kwargs
                )
            elif self.device == "cpu":
                outputs = self.backend_model.generate(
                    *args,
                    input_ids=inputs,
                    # synced_gpus only applies to multi-GPU ZeRO-3 generation;
                    # enabling it without torch.distributed would fail on CPU.
                    synced_gpus=False,
                    pad_token_id=self.tokenizer.eos_token_id,
                    **kwargs
                )
            else:
                raise NotImplementedError(
                    f"device \"{self.device}\" is not supported"
                )
        return outputs

    def merge_lora_weights(self):
        """
        Merge the LoRA weights into the backend model, if LoRA is enabled.
        """
        if self.model_args.use_lora:
            self.get_backend_model().merge_and_unload()
        else:
            logger.warning("LoRA training is NOT enabled. Merging LoRA weights is not applicable.")

    def save(self, dir, save_full_model=False, *args, **kwargs):
        """
        Save the model and tokenizer to the given directory.

        Parameters
        ------------
        dir :
            The directory to save model and tokenizer

        save_full_model : Optional.
            Whether to save the full model instead of only the LoRA weights.

        kwargs : Optional.
            Keyword arguments.
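
        Examples
        ------------
        A minimal sketch (the output directory name is arbitrary):

        >>> model.save("./output_dir")                        # tokenizer + adapter/model
        >>> model.save("./output_dir", save_full_model=True)  # tokenizer + full model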
        """
        self.get_tokenizer().save_pretrained(dir)
        if save_full_model and self.model_args.use_lora:
            # Save the underlying full model rather than the PEFT wrapper.
            self.backend_model_full.save_pretrained(dir)
        else:
            self.get_backend_model().save_pretrained(dir)

    def get_max_length(self):
        """
        Return max acceptable input length in terms of tokens.
        """
        return self.tokenizer.model_max_length

    def get_tokenizer(self):
        """
        Return the tokenizer of the model.
        """
        return self.tokenizer

    def get_backend_model(self):
        """
        Return the backend model.
        """
        return self.backend_model