|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| import inspect
|
| import warnings
|
| from dataclasses import FrozenInstanceError, replace
|
| from typing import Any, Callable, Dict, List, Optional, Tuple, Union
|
|
|
| import torch
|
| import torch.nn as nn
|
| from datasets import Dataset
|
| from transformers import DataCollator, PreTrainedModel, PreTrainedTokenizerBase, Trainer, TrainingArguments
|
| from transformers.trainer_callback import TrainerCallback
|
| from transformers.trainer_pt_utils import nested_detach
|
| from transformers.trainer_utils import EvalPrediction
|
|
|
| from ..import_utils import is_peft_available
|
| from .reward_config import RewardConfig
|
| from .utils import RewardDataCollatorWithPadding, compute_accuracy
|
|
|
|
|
# PEFT is an optional dependency: import its helpers only when the package is
# installed, so this module stays importable without `peft`.
if is_peft_available():

    from peft import PeftModel, get_peft_model, prepare_model_for_kbit_training
|
|
|
|
|
class RewardTrainer(Trainer):
    r"""
    The RewardTrainer can be used to train your custom Reward Model. It is a subclass of the
    `transformers.Trainer` class and inherits all of its attributes and methods. It is recommended to use
    an `AutoModelForSequenceClassification` as the reward model. The reward model should be trained on a dataset
    of paired examples, where each example is a tuple of two sequences. The reward model should be trained to
    predict which example in the pair is more relevant to the task at hand.

    The reward trainer expects a very specific format for the dataset. The dataset should contain at least 4 entries
    if you don't use the default `RewardDataCollatorWithPadding` data collator. The entries should be named
    - `input_ids_chosen`
    - `attention_mask_chosen`
    - `input_ids_rejected`
    - `attention_mask_rejected`

    Optionally, you can also pass a `margin` entry to the dataset. This entry should contain the margin used to modulate the
    loss of the reward model as outlined in https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/.
    If you don't pass a margin, no margin will be used.
    """

    def __init__(
        self,
        model: Union[PreTrainedModel, nn.Module] = None,
        args: Optional[RewardConfig] = None,
        data_collator: Optional[DataCollator] = None,
        train_dataset: Optional[Dataset] = None,
        eval_dataset: Optional[Union[Dataset, Dict[str, Dataset]]] = None,
        tokenizer: Optional[PreTrainedTokenizerBase] = None,
        model_init: Optional[Callable[[], PreTrainedModel]] = None,
        compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None,
        callbacks: Optional[List[TrainerCallback]] = None,
        optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (
            None,
            None,
        ),
        preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None,
        max_length: Optional[int] = None,
        peft_config: Optional[Dict] = None,
    ):
        """
        Initialize RewardTrainer.

        Args:
            model (`transformers.PreTrainedModel`):
                The model to train, preferably an `AutoModelForSequenceClassification`.
            args (`RewardConfig`):
                The arguments to use for training.
            data_collator (`transformers.DataCollator`):
                The data collator to use for training. If None is specified, the default data collator (`RewardDataCollatorWithPadding`) will be used
                which will pad the sequences to the maximum length of the sequences in the batch, given a dataset of paired sequences.
            train_dataset (`datasets.Dataset`):
                The dataset to use for training.
            eval_dataset (`datasets.Dataset`):
                The dataset to use for evaluation.
            tokenizer (`transformers.PreTrainedTokenizerBase`):
                The tokenizer to use for training. This argument is required if you want to use the default data collator.
            model_init (`Callable[[], transformers.PreTrainedModel]`):
                The model initializer to use for training. If None is specified, the default model initializer will be used.
            compute_metrics (`Callable[[transformers.EvalPrediction], Dict]`, *optional* defaults to `compute_accuracy`):
                The metrics to use for evaluation. If no metrics are specified, the default metric (`compute_accuracy`) will be used.
            callbacks (`List[transformers.TrainerCallback]`):
                The callbacks to use for training.
            optimizers (`Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`):
                The optimizer and scheduler to use for training.
            preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`):
                The function to use to preprocess the logits before computing the metrics.
            max_length (`int`, defaults to `None`):
                The maximum length of the sequences in the batch. This argument is required if you want to use the default data collator.
            peft_config (`Dict`, defaults to `None`):
                The PEFT configuration to use for training. If you pass a PEFT configuration, the model will be wrapped in a PEFT model.
        """
        # Exact-type check on purpose (`is`, not `isinstance`): `RewardConfig`
        # subclasses `TrainingArguments`, and only a *plain* `TrainingArguments`
        # is deprecated here.
        if type(args) is TrainingArguments:
            warnings.warn(
                "Using `transformers.TrainingArguments` for `args` is deprecated and will be removed in a future version. Please use `RewardConfig` instead.",
                FutureWarning,
            )
            if max_length is not None:
                warnings.warn(
                    "The `max_length` argument is deprecated and will be removed in a future version. Please use the `RewardConfig` to set `max_length` instead.",
                    FutureWarning,
                )
        else:
            # `args` is a `RewardConfig`: `max_length` should come from the config,
            # not the (deprecated) keyword argument. Reject conflicting settings.
            if max_length is not None and args.max_length is not None:
                raise ValueError("You cannot specify both `max_length` and `args.max_length`. Please use the `RewardConfig` to set `max_length` once.")
            if max_length is not None and args.max_length is None:
                warnings.warn(
                    "The `max_length` argument is deprecated and will be removed in a future version. Please use the `RewardConfig` to set `max_length` instead.",
                    FutureWarning,
                )

        if not is_peft_available() and peft_config is not None:
            raise ValueError("PEFT is not installed and you passed a `peft_config` in the trainer's kwargs, please install it to use the PEFT models")
        elif is_peft_available() and peft_config is not None:
            # Wrap a plain model in a PEFT adapter; an already-wrapped PeftModel
            # is left untouched.
            if not isinstance(model, PeftModel):
                if getattr(model, "is_loaded_in_8bit", False) or getattr(model, "is_quantized", False):
                    # Older peft versions do not accept `gradient_checkpointing_kwargs`;
                    # inspect the signature to stay compatible with both.
                    _supports_gc_kwargs = "gradient_checkpointing_kwargs" in list(inspect.signature(prepare_model_for_kbit_training).parameters)

                    prepare_model_kwargs = {"use_gradient_checkpointing": args.gradient_checkpointing}

                    if not _supports_gc_kwargs and args.gradient_checkpointing_kwargs is not None:
                        warnings.warn("You passed `gradient_checkpointing_kwargs` in the trainer's kwargs, but your peft version does not support it. " "please update to the latest version of peft to use `gradient_checkpointing_kwargs`.")
                    elif _supports_gc_kwargs and args.gradient_checkpointing_kwargs is not None:
                        prepare_model_kwargs["gradient_checkpointing_kwargs"] = args.gradient_checkpointing_kwargs

                    model = prepare_model_for_kbit_training(model, **prepare_model_kwargs)

                model = get_peft_model(model, peft_config)

        if compute_metrics is None:
            compute_metrics = compute_accuracy

        if data_collator is None:
            # Fall back to the pairwise padding collator, which needs a tokenizer
            # and a maximum sequence length.
            if tokenizer is None:
                raise ValueError("A tokenizer must be specified when using the default RewardDataCollatorWithPadding")
            if type(args) is TrainingArguments:
                if max_length is None:
                    warnings.warn(
                        "When using RewardDataCollatorWithPadding, you should set `max_length` in RewardConfig." " It will be set to `512` by default, but you should do it yourself in the future.",
                        UserWarning,
                    )
                    max_length = 512
            else:
                if max_length is None and args.max_length is None:
                    warnings.warn(
                        "When using RewardDataCollatorWithPadding, you should set `max_length` in RewardConfig." " It will be set to `512` by default, but you should do it yourself in the future.",
                        UserWarning,
                    )
                    max_length = 512
                if max_length is None and args.max_length is not None:
                    max_length = args.max_length

            data_collator = RewardDataCollatorWithPadding(tokenizer, max_length=max_length)

            if args.remove_unused_columns:
                # The chosen/rejected columns are not model-forward arguments, so the
                # Trainer must not drop them. `RewardConfig` may be a frozen dataclass,
                # in which case we rebuild it with `replace` instead of mutating.
                try:
                    args.remove_unused_columns = False
                except FrozenInstanceError:
                    args = replace(args, remove_unused_columns=False)

                warnings.warn(
                    "When using RewardDataCollatorWithPadding, you should set `remove_unused_columns=False` in your RewardConfig" " we have set it for you, but you should do it yourself in the future.",
                    UserWarning,
                )

            self.use_reward_data_collator = True
        else:
            self.use_reward_data_collator = False
        super().__init__(
            model,
            args,
            data_collator,
            train_dataset,
            eval_dataset,
            tokenizer,
            model_init,
            compute_metrics,
            callbacks,
            optimizers,
            preprocess_logits_for_metrics,
        )

    def compute_loss(
        self,
        model: Union[PreTrainedModel, nn.Module],
        inputs: Dict[str, Union[torch.Tensor, Any]],
        return_outputs=False,
    ) -> Union[torch.Tensor, Tuple[torch.Tensor, Dict[str, torch.Tensor]]]:
        """
        Compute the pairwise preference loss for a batch of chosen/rejected pairs.

        The loss is `-log(sigmoid(r_chosen - r_rejected))`, optionally shifted by a
        per-example `margin` as in the Llama 2 paper. Expects the batch layout
        produced by `RewardDataCollatorWithPadding`.
        """
        if not self.use_reward_data_collator:
            warnings.warn("The current compute_loss is implemented for RewardDataCollatorWithPadding," " if you are using a custom data collator make sure you know what you are doing or" " implement your own compute_loss method.")
        rewards_chosen = model(
            input_ids=inputs["input_ids_chosen"],
            attention_mask=inputs["attention_mask_chosen"],
            return_dict=True,
        )["logits"]
        rewards_rejected = model(
            input_ids=inputs["input_ids_rejected"],
            attention_mask=inputs["attention_mask_rejected"],
            return_dict=True,
        )["logits"]

        # A positive margin requires the chosen reward to beat the rejected one
        # by at least that amount before the loss saturates.
        if "margin" in inputs:
            loss = -nn.functional.logsigmoid(rewards_chosen - rewards_rejected - inputs["margin"]).mean()
        else:
            loss = -nn.functional.logsigmoid(rewards_chosen - rewards_rejected).mean()

        if return_outputs:
            return loss, {
                "rewards_chosen": rewards_chosen,
                "rewards_rejected": rewards_rejected,
            }
        return loss

    def prediction_step(
        self,
        model: Union[PreTrainedModel, nn.Module],
        inputs: Dict[str, Union[torch.Tensor, Any]],
        prediction_loss_only: bool,
        ignore_keys: Optional[List[str]] = None,
    ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]:
        """
        Run an evaluation step and return `(loss, logits, labels)`.

        The logits are the per-pair softmax over (chosen, rejected) rewards, shaped
        (batch, 2); labels are all zeros because index 0 (chosen) is always the
        preferred completion, which is what `compute_accuracy` scores against.
        """
        inputs = self._prepare_inputs(inputs)
        if ignore_keys is None:
            if hasattr(self.model, "config"):
                ignore_keys = getattr(self.model.config, "keys_to_ignore_at_inference", [])
            else:
                ignore_keys = []

        with torch.no_grad():
            loss, logits_dict = self.compute_loss(model, inputs, return_outputs=True)

        if prediction_loss_only:
            return (loss, None, None)

        loss = loss.detach()
        logits = tuple(v for k, v in logits_dict.items() if k not in ignore_keys)
        logits = nested_detach(logits)

        # Stack (chosen, rejected) rewards, average over the logit dimension, and
        # softmax across the pair so each row is a probability over the two
        # completions; transpose to (batch, 2).
        logits = torch.stack(logits).mean(dim=2).softmax(dim=0).T

        # Index 0 (the chosen completion) is always the correct "class".
        labels = torch.zeros(logits.shape[0])
        labels = self._prepare_inputs(labels)

        return loss, logits, labels
|
|
|