""" 2026.5.4 2026.5.8 4.56.2 0.22.2 __UNSLOTH_VERSIONING__ """ # Unsloth auto generated code # Copyright 2023-present Daniel Han-Chen, Michael Han-Chen & the Unsloth team. All rights reserved. # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU Lesser General Public License # along with this program. If not, see . from torch import Tensor import torch import torch.nn as nn from torch.nn import functional as F from unsloth_zoo.temporary_patches.common import torch_compile from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable from trl.trainer.reward_trainer import (Any, BaseImageProcessor, Callable, DataCollator, Dataset, EvalPrediction, FeatureExtractionMixin, FrozenInstanceError, Optional, PartialState, Path, PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin, RewardConfig, RewardDataCollatorWithPadding, RewardTrainer, Trainer, TrainerCallback, Union, _tokenize, compute_accuracy, decode_and_strip_padding, defaultdict, disable_dropout_in_model, gather_object, generate_model_card, get_comet_experiment_url, is_rich_available, is_wandb_available, log_table_to_comet_experiment, logger, logging, maybe_apply_chat_template, nested_detach, nn, os, pd, print_rich_table, replace, torch, wandb, BaseImageProcessor, Callable, DataCollator, Dataset, EvalPrediction, FeatureExtractionMixin, FrozenInstanceError, Optional, PartialState, PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin, RewardConfig, RewardDataCollatorWithPadding, RewardTrainer, Trainer, TrainerCallback, Union, compute_accuracy, disable_dropout_in_model, logger, maybe_apply_chat_template, nn, os, replace, torch, Optional, PreTrainedModel, Trainer, logger, os, torch) import os import math import logging from typing import * from dataclasses import dataclass, field from packaging.version import Version import torch import numpy as np from contextlib import nullcontext from torch.nn import functional as F import inspect from transformers import DataCollatorForSeq2Seq, DataCollatorForLanguageModeling as TransformersDataCollatorForLanguageModeling from transformers.training_args import ParallelMode from unsloth_zoo.device_type import DEVICE_TYPE, device_synchronize # Wrap trainer with padding to right and enable training mode import functools from types import MethodType try: from unsloth_zoo.gradient_checkpointing import reset_unsloth_gradient_checkpointing_buffers except: def reset_unsloth_gradient_checkpointing_buffers(): pass def prepare_for_training_mode(f): @functools.wraps(f) def wrapper(self, *args, **kwargs): # Finish the previous W&B run if this is a subsequent train() call. # We do this at the START of train() (not the end) so that # evaluate() / log() still work after train() completes. # HF's WandbCallback.setup() will call wandb.init() for the new run. # See: https://github.com/unslothai/unsloth/issues/3954 if getattr(self, '_unsloth_training_completed', False): try: import wandb if wandb.run is not None: wandb.finish() # Reset HF's WandbCallback so it calls wandb.init() for the new run for cb in self.callback_handler.callbacks: if type(cb).__name__ == 'WandbCallback': cb._initialized = False break except: pass # Enable training mode _was_training = None # Get gradient checkpointing setting from training arguments use_gc = getattr(self.args, 'gradient_checkpointing', True) if hasattr(self, 'model') and hasattr(self.model, "training"): _was_training = self.model.training if hasattr(self, 'model') and hasattr(self.model, "for_training"): self.model.for_training(use_gradient_checkpointing=use_gc) output = f(self, *args, **kwargs) # Restore previous mode when possible if hasattr(self, 'model') and hasattr(self.model, "for_inference"): if _was_training is False: self.model.for_inference() elif _was_training is True and hasattr(self.model, "for_training"): self.model.for_training(use_gradient_checkpointing=use_gc) # Reset gradient checkpointing buffers to free memory while staying ready for next run try: reset_unsloth_gradient_checkpointing_buffers() except: pass # Mark that training completed so the next train() call can # finish this W&B run before starting a new one self._unsloth_training_completed = True return output return wrapper pass torch_compile_options = { "epilogue_fusion" : True, "max_autotune" : False, "shape_padding" : True, "trace.enabled" : False, "triton.cudagraphs" : False, } @torch.compile(dynamic = True, fullgraph = True, options = torch_compile_options,) def chunked_hidden_states_selective_log_softmax( hidden_states: torch.Tensor, lm_head: torch.Tensor, index: torch.Tensor, chunks: int = 4, logit_scale_multiply: float = 0.0, logit_scale_divide: float = 0.0, logit_softcapping: float = 0.0, temperature: float = 1.0, ) -> torch.Tensor: # All Unsloth Zoo code licensed under AGPL3 flat_hidden_states = hidden_states.reshape(-1, hidden_states.shape[-1]) flat_index = index.reshape(-1) chunked_hidden_states = torch.chunk(flat_hidden_states, chunks=chunks, dim=0) chunked_index = torch.chunk(flat_index, chunks=chunks, dim=0) all_per_token_logps = [] for chunk_hidden_states, chunk_index in zip(chunked_hidden_states, chunked_index): chunk_logits = chunk_hidden_states.to(lm_head.dtype) @ lm_head.t() if logit_scale_multiply != 0.0: chunk_logits = chunk_logits * logit_scale_multiply if logit_scale_divide != 0.0: chunk_logits = chunk_logits / logit_scale_divide if logit_softcapping != 0.0: chunk_logits = logit_softcapping * torch.tanh(chunk_logits / logit_softcapping) chunk_logits = chunk_logits.to(torch.float32) if temperature != 1.0: chunk_logits = chunk_logits / temperature selected_logits = torch.gather(chunk_logits, dim=-1, index=chunk_index.unsqueeze(-1)).squeeze(-1) logsumexp_values = torch.logsumexp(chunk_logits, dim=-1) per_token_logps = selected_logits - logsumexp_values all_per_token_logps.append(per_token_logps) all_per_token_logps = torch.concat(all_per_token_logps) all_per_token_logps = all_per_token_logps.reshape((hidden_states.shape[0], hidden_states.shape[1])) return all_per_token_logps @torch.compile(dynamic = True, fullgraph = True, options = torch_compile_options,) def chunked_selective_log_softmax( logits, index, temperature: float = 1.0, chunks: int = 4, ): chunked_logits = torch.chunk(logits.reshape(-1, logits.shape[-1]), chunks = chunks, dim = 0) chunked_index = torch.chunk(index.reshape(-1), chunks = chunks, dim = 0) all_per_token_logps = [] # Below loop does the same as selective_log_softmax(chunk_logits, chunk_index) for chunk_logits, chunk_index in zip(chunked_logits, chunked_index): chunk_logits = chunk_logits.to(torch.float32) if temperature != 1.0: chunk_logits = chunk_logits / temperature selected_logits = torch.gather(chunk_logits, dim = -1, index = chunk_index.unsqueeze(-1)).squeeze(-1) logsumexp_values = torch.logsumexp(chunk_logits, dim = -1) per_token_logps = selected_logits - logsumexp_values all_per_token_logps.append(per_token_logps) pass all_per_token_logps = torch.concat(all_per_token_logps) all_per_token_logps = all_per_token_logps.reshape((logits.shape[0], logits.shape[1])) return all_per_token_logps def calculate_pad_tokens_in_prompt( input_ids: torch.Tensor, logits_to_keep: int, pad_token_id: int ) -> torch.Tensor: """ Given prompt tensor, it returns all the left padded tokens in that sequence. so [pad, pad, pad, cat] = 3 tokens """ if logits_to_keep >= input_ids.shape[1]: raise ValueError("logits_to_keep must be smaller than the sequence length.") prompt_section = input_ids[:, :-logits_to_keep] padding_mask = (prompt_section == pad_token_id) pad_token_counts = padding_mask.sum(dim=1) return pad_token_counts def create_completion_attention_mask( completion_input_ids: torch.Tensor, left_pad_tokens_per_prompt: torch.Tensor, max_left_pad: int, pad_token_id: int ) -> torch.Tensor: """ Given that we have a sequence, [p,p,p,c,c,c,pad,pad,pad] Where p are extra prompt tokens we got from slicing the torch tensor, c is completion tokens and pad are pad tokens, this function would make a completion mask that would 0 out the pad and p tokens. so in this example [0,0,0,1,1,1,0,0,0] """ batch_size, completion_len = completion_input_ids.shape device = completion_input_ids.device num_tokens_to_mask = max_left_pad - left_pad_tokens_per_prompt indices = torch.arange(completion_len, device=device).unsqueeze(0) shift_mask = indices >= num_tokens_to_mask.unsqueeze(1) non_padding_mask = (completion_input_ids != pad_token_id) final_mask = shift_mask & non_padding_mask return final_mask def left_pack_padding(tensor: torch.Tensor, pad_id: int) -> torch.Tensor: """ Moves all padding tokens in each sequence of a batch to the right. """ mask = (tensor != pad_id) # Must do stable=True since binary mark is unordered sorted_indices = torch.argsort(mask, dim=1, descending=True, stable=True) packed_tensor = torch.gather(tensor, 1, sorted_indices) return packed_tensor def align_logprobs_with_mask( logprob_tensor: torch.Tensor, attention_mask: torch.Tensor, pad_value: float = 0.0 ) -> torch.Tensor: """ Aligns a log probability tensor with a given attention mask. """ device = logprob_tensor.device batch_size, logprob_seq_len = logprob_tensor.shape mask_seq_len = attention_mask.shape[1] padded_logprobs = torch.full( attention_mask.shape, fill_value=pad_value, dtype=logprob_tensor.dtype, device=device ) left_pad_counts = torch.argmax(attention_mask, dim=1) cols = torch.arange(logprob_seq_len, device=device) dest_indices = left_pad_counts.unsqueeze(1) + cols # Create destination row indices # Shape: [batch_size, logprob_seq_len] row_indices = torch.arange(batch_size, device=device).unsqueeze(1).expand_as(dest_indices) # --- 4. Filter out-of-bounds indices and perform assignment --- # Create a mask to identify only the indices that are within the bounds # of the target tensor's sequence length. valid_mask = dest_indices < mask_seq_len # Use this mask to select only the valid row indices, column indices, # and the corresponding values from the logprob tensor. # This flattens the selected elements into 1D tensors. valid_rows = row_indices[valid_mask] valid_cols = dest_indices[valid_mask] valid_vals = logprob_tensor[valid_mask] # Place the valid values into their correct positions in the padded tensor # using a single, efficient advanced indexing operation. padded_logprobs[valid_rows, valid_cols] = valid_vals return padded_logprobs def autotune_batch_and_chunks( total_input_rows, seq_len, hidden_size, vocab_size, dtype_bytes=16, multiplier=None ): if multiplier is None: final_m = max(4, seq_len // 4096) else: final_m = multiplier if torch.cuda.is_available(): free_bytes, _ = torch.cuda.mem_get_info() limit_gb = (free_bytes / (1024**3))*.80 elif hasattr(torch, "xpu") and torch.xpu.is_available(): # For XPU: estimate free memory from total - reserved total_mem = torch.xpu.get_device_properties(0).total_memory reserved_mem = torch.xpu.memory_reserved() free_bytes = total_mem - reserved_mem limit_gb = (free_bytes / (1024**3)) * 0.80 else: # Fallback: assume 8GB available limit_gb = 8.0 bytes_to_gb = 1024**3 b_vals = torch.arange(total_input_rows, 0, -1, device='cpu', dtype=torch.float32) hidden_gb = (b_vals * seq_len * hidden_size * dtype_bytes) / bytes_to_gb base_logits = ((b_vals/total_input_rows) * b_vals * seq_len * vocab_size * dtype_bytes) / bytes_to_gb logits_gb = base_logits / final_m total_mem_gb = hidden_gb + logits_gb valid_mask = total_mem_gb <= limit_gb valid_indices = torch.nonzero(valid_mask, as_tuple=False) if valid_indices.shape[0] == 0: #This means your GPU will OOM return 4, final_m best_idx = valid_indices[0].item() final_b = int(b_vals[best_idx].item()) return final_b, final_m def sanitize_logprob(logprob): """Local port of trl.scripts.vllm_serve.sanitize_logprob. Filters NaN logprobs from vLLM outputs.""" value = logprob.logprob if math.isnan(value): logging.getLogger(__name__).warning( f"Generated NaN logprob, token logprob '{logprob}' will be ignored" ) return None return value @dataclass class UnslothRewardConfig(RewardConfig): """ Configuration class for the [`RewardTrainer`]. This class includes only the parameters that are specific to Reward training. For a full list of training arguments, please refer to the [`~transformers.TrainingArguments`] documentation. Note that default values in this class may differ from those in [`~transformers.TrainingArguments`]. Using [`~transformers.HfArgumentParser`] we can turn this class into [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the command line. Parameters: max_length (`int` or `None`, *optional*, defaults to `1024`): Maximum length of the sequences (prompt + completion) in the batch, filters out entries that exceed the limit. This argument is required if you want to use the default data collator. disable_dropout (`bool`, *optional*, defaults to `True`): Whether to disable dropout in the model. dataset_num_proc (`int`, *optional*, defaults to `None`): Number of processes to use for processing the dataset. center_rewards_coefficient (`float`, *optional*, defaults to `None`): Coefficient to incentivize the reward model to output mean-zero rewards (proposed by https://huggingface.co/papers/2312.09244, Eq. 2). Recommended value: `0.01`. remove_unused_columns (`bool`, *optional*, defaults to `False`): Whether to remove the columns that are not used by the model's forward pass. Can be `True` only if the dataset is pretokenized. """ vllm_sampling_params: Optional[Any] = field( default = None, metadata = {'help': 'vLLM SamplingParams'}, ) unsloth_num_chunks : Optional[int] = field( default = -1, metadata = {'help': 'Chunk size to reduce memory usage. -1 is most efficient.'}, ) unsloth_logit_chunk_multiplier : Optional[int] = field( default = None, metadata = {'help': 'Multiplier for chunked logit computations.'}, ) unsloth_grpo_mini_batch : Optional[int] = field( default = None, metadata = {'help': 'Mini batch size for GRPO hidden state accumulation. Default is None unless user defines it.'}, ) max_seq_length : Optional[int] = field( default = None, metadata = {'help': 'Maximum sequence length to truncate to.'}, ) def __init__( self, output_dir = None, overwrite_output_dir = None, do_train = False, do_eval = False, do_predict = False, eval_strategy = 'no', prediction_loss_only = False, per_device_train_batch_size = 4, per_device_eval_batch_size = 4, per_gpu_train_batch_size = None, per_gpu_eval_batch_size = None, gradient_accumulation_steps = 2, eval_accumulation_steps = 2, eval_delay = 0, torch_empty_cache_steps = 250, learning_rate = 5e-05, weight_decay = 0.001, adam_beta1 = 0.9, adam_beta2 = 0.999, adam_epsilon = 1e-08, max_grad_norm = 1.0, num_train_epochs = 3.0, max_steps = -1, lr_scheduler_type = 'linear', warmup_ratio = 0.1, warmup_steps = 0, log_level = 'passive', log_level_replica = 'warning', log_on_each_node = True, logging_dir = None, logging_strategy = 'steps', logging_first_step = False, logging_steps = 1, logging_nan_inf_filter = False, save_strategy = 'steps', save_steps = 500, save_total_limit = None, save_safetensors = True, save_on_each_node = False, save_only_model = False, restore_callback_states_from_checkpoint = False, no_cuda = False, use_cpu = False, use_mps_device = False, seed = 3407, data_seed = 3407, jit_mode_eval = False, use_ipex = False, bf16 = False, fp16 = False, fp16_opt_level = 'O1', half_precision_backend = 'auto', bf16_full_eval = False, fp16_full_eval = False, tf32 = None, local_rank = -1, ddp_backend = None, tpu_num_cores = None, tpu_metrics_debug = False, debug = '', dataloader_drop_last = False, eval_steps = None, dataloader_num_workers = 0, dataloader_prefetch_factor = None, past_index = -1, run_name = None, disable_tqdm = None, remove_unused_columns = False, label_names = None, load_best_model_at_end = False, metric_for_best_model = None, greater_is_better = None, ignore_data_skip = False, fsdp = '', fsdp_min_num_params = 0, fsdp_config = None, fsdp_transformer_layer_cls_to_wrap = None, accelerator_config = None, parallelism_config = None, deepspeed = None, label_smoothing_factor = 0.0, optim = 'adamw_8bit', optim_args = None, adafactor = False, group_by_length = False, length_column_name = 'length', report_to = 'none', ddp_find_unused_parameters = None, ddp_bucket_cap_mb = None, ddp_broadcast_buffers = None, dataloader_pin_memory = True, dataloader_persistent_workers = False, skip_memory_metrics = True, use_legacy_prediction_loop = False, push_to_hub = False, resume_from_checkpoint = None, hub_model_id = None, hub_strategy = 'every_save', hub_token = None, hub_private_repo = None, hub_always_push = False, hub_revision = None, gradient_checkpointing = True, gradient_checkpointing_kwargs = None, include_inputs_for_metrics = False, eval_do_concat_batches = True, fp16_backend = 'auto', push_to_hub_model_id = None, push_to_hub_organization = None, push_to_hub_token = None, mp_parameters = '', auto_find_batch_size = False, full_determinism = False, torchdynamo = None, ray_scope = 'last', ddp_timeout = 1800, torch_compile = False, torch_compile_backend = None, torch_compile_mode = None, include_tokens_per_second = False, include_num_input_tokens_seen = False, neftune_noise_alpha = None, optim_target_modules = None, batch_eval_metrics = False, eval_on_start = False, use_liger_kernel = False, liger_kernel_config = None, eval_use_gather_object = False, average_tokens_across_devices = True, max_length = 1024, disable_dropout = True, dataset_num_proc = None, center_rewards_coefficient = None, vllm_sampling_params = None, unsloth_num_chunks = -1, unsloth_logit_chunk_multiplier = None, unsloth_grpo_mini_batch = None, max_seq_length = None, **kwargs, ): if learning_rate < 1e-7: print(f'Unsloth: Your learning rate of `{learning_rate}` is too small and less than 1e-7! Consider increasing it, otherwise gradient updates will be close to 0!') if learning_rate > 1: print(f'Unsloth: Your learning rate of `{learning_rate}` is way too larger > 1! Consider decreasing it to 1e-1, otherwise gradient updates will explode!') if num_train_epochs is None: num_train_epochs = 3.0 # Default to 3 epochs if None, max_steps will override if output_dir is None and save_strategy == 'steps' and save_steps == 500: output_dir = 'unsloth_training_checkpoints' save_strategy = 'no' import multiprocessing as _mp if dataset_num_proc is None: if _mp.get_start_method() != 'fork': dataset_num_proc = None else: import psutil dataset_num_proc = min(max((psutil.cpu_count() or 1)+4, 2), 64) memory_gb_left = psutil.virtual_memory().available / (1024**3) if memory_gb_left <= 2: dataset_num_proc = 1 else: dataset_num_proc = min(dataset_num_proc, int(memory_gb_left)) super().__init__( output_dir = output_dir, overwrite_output_dir = overwrite_output_dir, do_train = do_train, do_eval = do_eval, do_predict = do_predict, eval_strategy = eval_strategy, prediction_loss_only = prediction_loss_only, per_device_train_batch_size = per_device_train_batch_size, per_device_eval_batch_size = per_device_eval_batch_size, per_gpu_train_batch_size = per_gpu_train_batch_size, per_gpu_eval_batch_size = per_gpu_eval_batch_size, gradient_accumulation_steps = gradient_accumulation_steps, eval_accumulation_steps = eval_accumulation_steps, eval_delay = eval_delay, torch_empty_cache_steps = torch_empty_cache_steps, learning_rate = learning_rate, weight_decay = weight_decay, adam_beta1 = adam_beta1, adam_beta2 = adam_beta2, adam_epsilon = adam_epsilon, max_grad_norm = max_grad_norm, num_train_epochs = num_train_epochs, max_steps = max_steps, lr_scheduler_type = lr_scheduler_type, warmup_ratio = warmup_ratio, warmup_steps = warmup_steps, log_level = log_level, log_level_replica = log_level_replica, log_on_each_node = log_on_each_node, logging_dir = logging_dir, logging_strategy = logging_strategy, logging_first_step = logging_first_step, logging_steps = logging_steps, logging_nan_inf_filter = logging_nan_inf_filter, save_strategy = save_strategy, save_steps = save_steps, save_total_limit = save_total_limit, save_safetensors = save_safetensors, save_on_each_node = save_on_each_node, save_only_model = save_only_model, restore_callback_states_from_checkpoint = restore_callback_states_from_checkpoint, no_cuda = no_cuda, use_cpu = use_cpu, use_mps_device = use_mps_device, seed = seed, data_seed = data_seed, jit_mode_eval = jit_mode_eval, use_ipex = use_ipex, bf16 = bf16, fp16 = fp16, fp16_opt_level = fp16_opt_level, half_precision_backend = half_precision_backend, bf16_full_eval = bf16_full_eval, fp16_full_eval = fp16_full_eval, tf32 = tf32, local_rank = local_rank, ddp_backend = ddp_backend, tpu_num_cores = tpu_num_cores, tpu_metrics_debug = tpu_metrics_debug, debug = debug, dataloader_drop_last = dataloader_drop_last, eval_steps = eval_steps, dataloader_num_workers = dataloader_num_workers, dataloader_prefetch_factor = dataloader_prefetch_factor, past_index = past_index, run_name = run_name, disable_tqdm = disable_tqdm, remove_unused_columns = remove_unused_columns, label_names = label_names, load_best_model_at_end = load_best_model_at_end, metric_for_best_model = metric_for_best_model, greater_is_better = greater_is_better, ignore_data_skip = ignore_data_skip, fsdp = fsdp, fsdp_min_num_params = fsdp_min_num_params, fsdp_config = fsdp_config, fsdp_transformer_layer_cls_to_wrap = fsdp_transformer_layer_cls_to_wrap, accelerator_config = accelerator_config, parallelism_config = parallelism_config, deepspeed = deepspeed, label_smoothing_factor = label_smoothing_factor, optim = optim, optim_args = optim_args, adafactor = adafactor, group_by_length = group_by_length, length_column_name = length_column_name, report_to = report_to, ddp_find_unused_parameters = ddp_find_unused_parameters, ddp_bucket_cap_mb = ddp_bucket_cap_mb, ddp_broadcast_buffers = ddp_broadcast_buffers, dataloader_pin_memory = dataloader_pin_memory, dataloader_persistent_workers = dataloader_persistent_workers, skip_memory_metrics = skip_memory_metrics, use_legacy_prediction_loop = use_legacy_prediction_loop, push_to_hub = push_to_hub, resume_from_checkpoint = resume_from_checkpoint, hub_model_id = hub_model_id, hub_strategy = hub_strategy, hub_token = hub_token, hub_private_repo = hub_private_repo, hub_always_push = hub_always_push, hub_revision = hub_revision, gradient_checkpointing = gradient_checkpointing, gradient_checkpointing_kwargs = gradient_checkpointing_kwargs, include_inputs_for_metrics = include_inputs_for_metrics, eval_do_concat_batches = eval_do_concat_batches, fp16_backend = fp16_backend, push_to_hub_model_id = push_to_hub_model_id, push_to_hub_organization = push_to_hub_organization, push_to_hub_token = push_to_hub_token, mp_parameters = mp_parameters, auto_find_batch_size = auto_find_batch_size, full_determinism = full_determinism, torchdynamo = torchdynamo, ray_scope = ray_scope, ddp_timeout = ddp_timeout, torch_compile = torch_compile, torch_compile_backend = torch_compile_backend, torch_compile_mode = torch_compile_mode, include_tokens_per_second = include_tokens_per_second, include_num_input_tokens_seen = include_num_input_tokens_seen, neftune_noise_alpha = neftune_noise_alpha, optim_target_modules = optim_target_modules, batch_eval_metrics = batch_eval_metrics, eval_on_start = eval_on_start, use_liger_kernel = use_liger_kernel, liger_kernel_config = liger_kernel_config, eval_use_gather_object = eval_use_gather_object, average_tokens_across_devices = average_tokens_across_devices, max_length = max_length, disable_dropout = disable_dropout, dataset_num_proc = dataset_num_proc, center_rewards_coefficient = center_rewards_coefficient,**kwargs) self.vllm_sampling_params = vllm_sampling_params self.unsloth_num_chunks = unsloth_num_chunks if unsloth_grpo_mini_batch is not None: if self.generation_batch_size >= unsloth_grpo_mini_batch: self.unsloth_grpo_mini_batch = unsloth_grpo_mini_batch else: raise ValueError( f"Unsloth GRPO mini batch size needs to be less than or equal to the effective generation batch size, " f"which is self.per_device_train_batch_size * gradient_accumulation_steps." ) self.unsloth_logit_chunk_multiplier = unsloth_logit_chunk_multiplier self.max_seq_length = max_seq_length pass class _UnslothRewardTrainer(Trainer): _tag_names = ["trl", "reward-trainer"] def __init__( self, model: Optional[Union[PreTrainedModel, nn.Module]] = None, args: Optional[RewardConfig] = None, data_collator: Optional[DataCollator] = None, train_dataset: Optional[Dataset] = None, eval_dataset: Optional[Union[Dataset, dict[str, Dataset]]] = None, processing_class: Optional[ Union[PreTrainedTokenizerBase, BaseImageProcessor, FeatureExtractionMixin, ProcessorMixin] ] = None, model_init: Optional[Callable[[], PreTrainedModel]] = None, compute_metrics: Optional[Callable[[EvalPrediction], dict]] = None, callbacks: Optional[list[TrainerCallback]] = None, optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = ( None, None, ), preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None, peft_config: Optional[dict] = None, ): """ Initialize RewardTrainer. Args: model (`transformers.PreTrainedModel`): The model to train, preferably an `AutoModelForSequenceClassification`. args (`RewardConfig`): The arguments to use for training. data_collator (`transformers.DataCollator`): The data collator to use for training. If None is specified, the default data collator (`RewardDataCollatorWithPadding`) will be used which will pad the sequences to the maximum length of the sequences in the batch, given a dataset of paired sequences. train_dataset (`datasets.Dataset`): The dataset to use for training. eval_dataset (`datasets.Dataset`): The dataset to use for evaluation. processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.BaseImageProcessor`], [`~transformers.FeatureExtractionMixin`] or [`~transformers.ProcessorMixin`], *optional*, defaults to `None`): Processing class used to process the data. If provided, will be used to automatically process the inputs for the model, and it will be saved along the model to make it easier to rerun an interrupted training or reuse the fine-tuned model. model_init (`Callable[[], transformers.PreTrainedModel]`): The model initializer to use for training. If None is specified, the default model initializer will be used. compute_metrics (`Callable[[transformers.EvalPrediction], dict]`, *optional* defaults to `compute_accuracy`): The metrics to use for evaluation. If no metrics are specified, the default metric (`compute_accuracy`) will be used. callbacks (`list[transformers.TrainerCallback]`): The callbacks to use for training. optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`): The optimizer and scheduler to use for training. preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`): The function to use to preprocess the logits before computing the metrics. peft_config (`dict`, defaults to `None`): The PEFT configuration to use for training. If you pass a PEFT configuration, the model will be wrapped in a PEFT model. """ if False: pass # Disable dropout in the model if args.disable_dropout: disable_dropout_in_model(model) if compute_metrics is None: compute_metrics = compute_accuracy if data_collator is None: if processing_class is None: raise ValueError( "A processing_class must be specified when using the default RewardDataCollatorWithPadding" ) max_length = args.max_length data_collator = RewardDataCollatorWithPadding(processing_class) if args.remove_unused_columns: try: # for bc before https://github.com/huggingface/transformers/pull/25435 args.remove_unused_columns = False except FrozenInstanceError: args = replace(args, remove_unused_columns=False) # warn users logger.warning( "When using RewardDataCollatorWithPadding, you should set `remove_unused_columns=False` in your RewardConfig" " we have set it for you, but you should do it yourself in the future.", ) self.use_reward_data_collator = True else: self.use_reward_data_collator = False # The trainer estimates the number of FLOPs [floating-point operations] using the number of elements in the # input tensor associated with the key "input_ids". However, in Reward, the sampled data does not include the # "input_ids" key. Instead, the available keys are "input_ids_chosen" and "input_ids_rejected". As a result, # the trainer issues the warning: "Could not estimate the number of tokens of the input, floating-point # operations will not be computed." To suppress this warning, we set the "estimate_tokens" key in the model's # "warnings_issued" dictionary to True. This acts as a flag to indicate that the warning has already been # issued. model.warnings_issued["estimate_tokens"] = True if "input_ids_chosen" not in train_dataset.column_names: with PartialState().main_process_first(): fn_kwargs = {"tokenizer": processing_class} train_dataset = train_dataset.map(maybe_apply_chat_template, fn_kwargs={"tokenizer": processing_class}) train_dataset = train_dataset.map( _tokenize, batched=True, fn_kwargs=fn_kwargs, num_proc=args.dataset_num_proc, ) # This filter is important because otherwise you get samples that exceed the model's context length and # get truncated => noisy signal the chosen/rejected label gets lost. The downside is that the # user might get surprised if N samples are missing from training. train_dataset = train_dataset.filter( lambda x: len(x["input_ids_chosen"]) <= max_length and len(x["input_ids_rejected"]) <= max_length, num_proc=args.dataset_num_proc, ) if eval_dataset is not None: eval_dataset = eval_dataset.map( maybe_apply_chat_template, fn_kwargs={"tokenizer": processing_class} ) eval_dataset = eval_dataset.map( _tokenize, fn_kwargs=fn_kwargs, batched=True, num_proc=args.dataset_num_proc, ) # This filter is important because otherwise you get samples that exceed the model's context length and # get truncated => noisy signal the chosen/rejected label gets lost. The downside is that the # user might get surprised if N samples are missing from training. eval_dataset = eval_dataset.filter( lambda x: len(x["input_ids_chosen"]) <= max_length and len(x["input_ids_rejected"]) <= max_length, num_proc=args.dataset_num_proc, ) super().__init__( model=model, args=args, data_collator=data_collator, train_dataset=train_dataset, eval_dataset=eval_dataset, processing_class=processing_class, model_init=model_init, compute_metrics=compute_metrics, callbacks=callbacks, optimizers=optimizers, preprocess_logits_for_metrics=preprocess_logits_for_metrics, ) # Add tags for models that have been loaded with the correct transformers version if hasattr(self.model, "add_model_tags"): self.model.add_model_tags(self._tag_names) def compute_loss( self, model: Union[PreTrainedModel, nn.Module], inputs: dict[str, Union[torch.Tensor, Any]], return_outputs=False, num_items_in_batch=None, ) -> Union[torch.Tensor, tuple[torch.Tensor, dict[str, torch.Tensor]]]: rewards_chosen = model( input_ids=inputs["input_ids_chosen"], attention_mask=inputs["attention_mask_chosen"], return_dict=True, )["logits"] rewards_rejected = model( input_ids=inputs["input_ids_rejected"], attention_mask=inputs["attention_mask_rejected"], return_dict=True, )["logits"] # calculate loss, optionally modulate with margin if "margin" in inputs: loss = -nn.functional.logsigmoid(rewards_chosen - rewards_rejected - inputs["margin"]).mean() else: loss = -nn.functional.logsigmoid(rewards_chosen - rewards_rejected).mean() if self.args.center_rewards_coefficient is not None: loss += self.args.center_rewards_coefficient * torch.mean((rewards_chosen + rewards_rejected) ** 2) if return_outputs: return loss, { "rewards_chosen": rewards_chosen, "rewards_rejected": rewards_rejected, } return loss def prediction_step( self, model: Union[PreTrainedModel, nn.Module], inputs: dict[str, Union[torch.Tensor, Any]], prediction_loss_only: bool, ignore_keys: Optional[list[str]] = None, ) -> tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]: inputs = self._prepare_inputs(inputs) if ignore_keys is None: if hasattr(self.model, "config"): ignore_keys = getattr(self.model.config, "keys_to_ignore_at_inference", []) else: ignore_keys = [] with torch.no_grad(): loss, logits_dict = self.compute_loss(model, inputs, return_outputs=True) if prediction_loss_only: return (loss, None, None) loss = loss.detach() logits = tuple(v for k, v in logits_dict.items() if k not in ignore_keys) logits = nested_detach(logits) # Stack accepted against rejected, mean over logits # and softmax to get preferences between accepted and rejected to sum to 1 logits = torch.stack(logits).mean(dim=2).softmax(dim=0).T labels = torch.zeros(logits.shape[0]) labels = self._prepare_inputs(labels) return loss, logits, labels def evaluate(self, *args, **kwargs): num_print_samples = kwargs.pop("num_print_samples", 4) self.visualize_samples(num_print_samples) return super().evaluate(*args, **kwargs) def visualize_samples(self, num_print_samples: int): """ Visualize the reward model logits prediction Args: num_print_samples (`int`, defaults to `4`): The number of samples to print. Set to `-1` to print all samples. """ eval_dataloader = self.get_eval_dataloader() table = defaultdict(list) for _, inputs in enumerate(eval_dataloader): _, logits, _ = self.prediction_step(self.model, inputs, prediction_loss_only=False) chosen_text = decode_and_strip_padding(inputs["input_ids_chosen"], self.processing_class) rejected_text = decode_and_strip_padding(inputs["input_ids_rejected"], self.processing_class) table["chosen_text"].extend(gather_object(chosen_text)) table["rejected_text"].extend(gather_object(rejected_text)) table["logits"].extend( gather_object([[round(inner_item, 4) for inner_item in item] for item in logits.tolist()]) ) if num_print_samples >= 0 and len(table["chosen_text"]) >= num_print_samples: break df = pd.DataFrame(table) if self.accelerator.process_index == 0: if is_rich_available(): print_rich_table(df[:num_print_samples]) if "wandb" in self.args.report_to: import wandb if wandb.run is not None: wandb.log({"completions": wandb.Table(dataframe=df)}) if "comet_ml" in self.args.report_to: log_table_to_comet_experiment( name="completions.csv", table=df, ) # Ensure the model card is saved along with the checkpoint def _save_checkpoint(self, model, trial): if self.args.hub_model_id is None: model_name = Path(self.args.output_dir).name else: model_name = self.args.hub_model_id.split("/")[-1] self.create_model_card(model_name=model_name) super()._save_checkpoint(model, trial) def create_model_card( self, model_name: Optional[str] = None, dataset_name: Optional[str] = None, tags: Union[str, list[str], None] = None, ): """ Creates a draft of a model card using the information available to the `Trainer`. Args: model_name (`str` or `None`, *optional*, defaults to `None`): Name of the model. dataset_name (`str` or `None`, *optional*, defaults to `None`): Name of the dataset used for training. tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`): Tags to be associated with the model card. """ if not self.is_world_process_zero(): return if hasattr(self.model.config, "_name_or_path") and not os.path.isdir(self.model.config._name_or_path): base_model = self.model.config._name_or_path else: base_model = None # normalize `tags` to a mutable set if tags is None: tags = set() elif isinstance(tags, str): tags = {tags} else: tags = set(tags) if hasattr(self.model.config, "unsloth_version"): tags.add("unsloth") if "JOB_ID" in os.environ: tags.add("hf_jobs") tags.update(self._tag_names) model_card = generate_model_card( base_model=base_model, model_name=model_name, hub_model_id=self.hub_model_id, dataset_name=dataset_name, tags=tags, wandb_url=wandb.run.url if is_wandb_available() and wandb.run is not None else None, comet_url=get_comet_experiment_url(), trainer_name="Reward", ) model_card.save(os.path.join(self.args.output_dir, "README.md")) class UnslothRewardTrainer(_UnslothRewardTrainer): """ """ def __init__( self, model = None, args = None, data_collator = None, train_dataset = None, eval_dataset = None, processing_class = None, model_init = None, compute_metrics = None, callbacks = None, preprocess_logits_for_metrics = None, peft_config = None, **kwargs ): if args is None: args = UnslothRewardConfig() use_bf16 = getattr(args, 'bf16', False) if type(use_bf16) is not bool: use_bf16 = False use_fp16 = getattr(args, 'fp16', False) if type(use_fp16) is not bool: use_fp16 = False force_float32 = False full_finetuning = os.environ.get('UNSLOTH_ENABLE_FULL_FINETUNING', '0') == '1' if not full_finetuning and (os.environ.get('UNSLOTH_FORCE_FLOAT32', '0') == '1'): print('Unsloth: Switching to float32 training since model cannot work with float16') force_float32 = True mixed_precision_dtype = os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32') dtype = getattr(model.config, 'dtype', None) or getattr(model.config, 'torch_dtype', None) if dtype is None: dtype = model.get_input_embeddings().weight.dtype from unsloth_zoo.utils import _get_dtype dtype = _get_dtype(dtype) float16 = dtype == torch.float16 if not force_float32 and (float16 and use_bf16): raise TypeError('Unsloth: Model is in float16 precision but you want to use bfloat16 precision. Set fp16 to `True` and bf16 to `False`') if not force_float32 and (not float16 and use_fp16): raise TypeError('Unsloth: Model is in bfloat16 precision but you want to use float16 precision. Set fp16 to `False` and bf16 to `True`') if force_float32: # Forced float32 training args.fp16 = False args.bf16 = False os.environ['ACCELERATE_MIXED_PRECISION'] = 'no' if hasattr(args, 'mixed_precision'): args.mixed_precision = 'no' # args.mixed_precision is a new argument which needs to be set now elif (not use_bf16 and not use_fp16) and mixed_precision_dtype == 'float32': # Mixed precision training args.fp16 = float16 args.bf16 = not float16 os.environ['ACCELERATE_MIXED_PRECISION'] = 'fp16' if float16 else 'bf16' if hasattr(args, 'mixed_precision'): args.mixed_precision = 'fp16' if float16 else 'bf16' # args.mixed_precision is a new argument which needs to be set now elif mixed_precision_dtype == 'bfloat16': # Both False since bfloat16 full finetuning doesn't do any autocasting. args.fp16 = False args.bf16 = False os.environ['ACCELERATE_MIXED_PRECISION'] = 'no' if hasattr(args, 'mixed_precision'): args.mixed_precision = 'no' # args.mixed_precision is a new argument which needs to be set now if getattr(args, 'eval_dataset', None) is not None and getattr(args, 'eval_strategy', 'no') == 'no': args.eval_strategy = 'steps' if getattr(args, 'eval_steps', None) is None: args.eval_steps = 0.1 ga_steps = getattr(args, 'gradient_accumulation_steps', None) if ga_steps is not None and ga_steps > 1: from transformers import __version__ as transformers_version if Version(transformers_version) <= Version('4.45.2'): print('**** Unsloth: Please use our fixed gradient_accumulation_steps by updating transformers, TRL and Unsloth!\n' '`pip install --upgrade --no-cache-dir --force-reinstall --no-deps unsloth transformers trl unsloth_zoo`') if getattr(args, 'eval_strategy', 'no') != 'no': eval_bsz = getattr(args, 'per_device_eval_batch_size', 8) if eval_bsz == 8 and args.per_device_train_batch_size < eval_bsz: args.per_device_eval_batch_size = args.per_device_train_batch_size if getattr(args, 'eval_accumulation_steps', None) is None and ga_steps is not None: args.eval_accumulation_steps = ga_steps fp16_full_eval = getattr(args, 'fp16_full_eval', False) if type(fp16_full_eval) is not bool: fp16_full_eval = False bf16_full_eval = getattr(args, 'bf16_full_eval', False) if type(bf16_full_eval) is not bool: bf16_full_eval = False if args.fp16 and bf16_full_eval: args.bf16_full_eval = False; args.fp16_full_eval = True if args.bf16 and fp16_full_eval: args.bf16_full_eval = True; args.fp16_full_eval = False if force_float32: args.bf16_full_eval = False args.fp16_full_eval = False elif os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32') == 'bfloat16': args.bf16_full_eval = True args.fp16_full_eval = False elif not bf16_full_eval and not fp16_full_eval: args.bf16_full_eval = args.bf16 args.fp16_full_eval = args.fp16 _output_logits = False if locals().get('compute_metrics', None) is not None: _output_logits = True if locals().get('preprocess_logits_for_metrics', None) is not None: _output_logits = True if _output_logits: os.environ['UNSLOTH_RETURN_LOGITS'] = '1' if model is not None: _warnings_issued = getattr(model, 'warnings_issued', None) if _warnings_issued is None: model.warnings_issued = {} elif not isinstance(_warnings_issued, dict): try: model.warnings_issued = dict(_warnings_issued) except Exception: model.warnings_issued = {} if 'max_seq_length' not in locals() and not hasattr(args, 'max_seq_length'): pass else: model_max_seq_length = getattr(model, 'max_seq_length', None) args_max_seq_length = getattr(args, 'max_seq_length', None) if args_max_seq_length is None and model_max_seq_length is not None: max_seq_length = model.max_seq_length if hasattr(args, 'max_seq_length'): args.max_seq_length = max_seq_length elif args_max_seq_length is not None and model_max_seq_length is not None: if args_max_seq_length > model_max_seq_length: print('Unsloth: You set `max_seq_length` as ' + str(args_max_seq_length) + ' but ' 'the maximum the model supports is ' + str(model_max_seq_length) + '. We shall reduce it.') args.max_seq_length = model_max_seq_length if model is not None and hasattr(model, 'for_training'): model.for_training(use_gradient_checkpointing=getattr(args, 'gradient_checkpointing', True)) if 'tokenizer' in locals() and hasattr(tokenizer, 'padding_side'): tokenizer.padding_side = 'right' if 'processing_class' in locals(): if hasattr(processing_class, 'padding_side'): processing_class.padding_side = 'right' if hasattr(processing_class, 'tokenizer') and hasattr(processing_class.tokenizer, 'padding_side'): processing_class.tokenizer.padding_side = 'right' __tokenizer = processing_class if 'processing_class' in locals() else tokenizer from unsloth_zoo.vision_utils import UnslothVisionDataCollator if not isinstance(data_collator, UnslothVisionDataCollator): if isinstance(data_collator, DataCollatorForSeq2Seq) and 'labels' not in train_dataset.column_names: data_collator = TransformersDataCollatorForLanguageModeling( __tokenizer, mlm = False, mlm_probability = 0.0, pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None), ) elif isinstance(data_collator, TransformersDataCollatorForLanguageModeling) and 'labels' in train_dataset.column_names: data_collator = DataCollatorForSeq2Seq( __tokenizer, pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None), ) else: if hasattr(args, 'remove_unused_columns'): args.remove_unused_columns = False if hasattr(args, 'dataset_text_field'): args.dataset_text_field = '' if hasattr(args, 'dataset_kwargs'): args.dataset_kwargs = {'skip_prepare_dataset': True} if not isinstance(data_collator, UnslothVisionDataCollator): if not hasattr(__tokenizer, 'pad') and hasattr(__tokenizer, 'tokenizer'): if isinstance(data_collator, DataCollatorForSeq2Seq): data_collator = DataCollatorForSeq2Seq( __tokenizer.tokenizer, pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None), ) elif isinstance(data_collator, TransformersDataCollatorForLanguageModeling): data_collator = TransformersDataCollatorForLanguageModeling( __tokenizer.tokenizer, mlm = False, mlm_probability = 0.0, pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None), ) other_metrics = [] from unsloth_zoo.logging_utils import PatchRLStatistics PatchRLStatistics('reward_trainer', other_metrics) # [TODO] Fix up DataParallel multiplying batch sizes # [TODO] DDP works, but DP seems to not work? [TODO] if getattr(args, "parallel_mode", None) == ParallelMode.NOT_DISTRIBUTED and args.n_gpu > 1: if getattr(args, "_n_gpu", 1) != 1: args._n_gpu = 1 if "model" in locals() and hasattr(model, "for_training"): model.for_training(use_gradient_checkpointing=getattr(args, 'gradient_checkpointing', True)) super().__init__( model = model, args = args, data_collator = data_collator, train_dataset = train_dataset, eval_dataset = eval_dataset, processing_class = processing_class, model_init = model_init, compute_metrics = compute_metrics, callbacks = callbacks, preprocess_logits_for_metrics = preprocess_logits_for_metrics, peft_config = peft_config,**kwargs) if "model" in locals() and hasattr(model, "for_inference"): model.for_inference() if hasattr(self, 'neftune_hook_handle'): self.neftune_hook_handle.remove() if hasattr(self, 'neftune_hook_handle'): del self.neftune_hook_handle if getattr(args, 'neftune_noise_alpha', None) is not None: model.get_input_embeddings().neftune_noise_alpha = self.neftune_noise_alpha pass if hasattr(self, 'accelerator'): scaler = self.accelerator.scaler current_model = model while hasattr(current_model, 'model'): current_model.accelerator_scaler = scaler current_model = current_model.model current_model.accelerator_scaler = scaler pass if hasattr(self, 'train'): self.train = MethodType(prepare_for_training_mode(self.__class__.train), self) pass if hasattr(self, 'llm') and self.llm is not None and hasattr(self.llm, 'get_tokenizer'): _vllm_tok = self.llm.get_tokenizer() _pc = getattr(self, 'processing_class', None) or getattr(self, 'tokenizer', None) if _vllm_tok is not None and _pc is not None and getattr(_pc, 'chat_template', None) is not None and getattr(_vllm_tok, 'chat_template', None) is None: _vllm_tok.chat_template = _pc.chat_template pass pass if hasattr(logger, "addFilter"): import logging class HideLoggingMessage(logging.Filter): def __init__(self, text): self.text = text def filter(self, x): return not (self.text in x.getMessage()) pass logger.addFilter(HideLoggingMessage("`use_cache=True`"))