| """ |
| 2026.2.1 |
| 2026.2.1 |
| 5.2.0 |
| 0.24.0 |
| __UNSLOTH_VERSIONING__ |
| """ |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| from torch import Tensor |
| import torch |
| import torch.nn as nn |
| from torch.nn import functional as F |
| from unsloth_zoo.temporary_patches.common import torch_compile |
| from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable |
| from trl.trainer.grpo_trainer import (Any, AutoConfig, AutoModelForSequenceClassification, AutoProcessor, AutoTokenizer, BaseTrainer, DataLoader, Dataset, FSDP, GRPOConfig, GRPOTrainer, GenerationConfig, GuidedDecodingParams, IterableDataset, LLM, Optional, Path, PeftConfig, PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin, RepeatSampler, RewardFunc, Sampler, SamplingParams, SyncRefModelCallback, TrainerCallback, Union, VLLMClient, _ForwardRedirection, apply_chat_template, broadcast_object_list, datasets, defaultdict, deque, disable_dropout_in_model, ensure_master_addr_port, gather, gather_object, identity, inspect, is_conversational, is_datasets_available, is_flash_attn_2_available, is_liger_kernel_available, is_peft_model, is_rich_available, is_vllm_available, logger, logging, maybe_apply_chat_template, nanmax, nanmin, nanstd, nn, nullcontext, os, pad, partial, prepare_deepspeed, prepare_fsdp, prepare_multimodal_messages, print_prompt_completions_sample, profiling_context, profiling_decorator, seed_worker, selective_log_softmax, set_seed, shuffle_sequence_dict, split_pixel_values_by_grid, split_tensor_dict, textwrap, torch, transformers, unsplit_pixel_values_by_grid, unwrap_model_for_generation, AutoConfig, AutoModelForSequenceClassification, AutoProcessor, AutoTokenizer, Dataset, GRPOConfig, GRPOTrainer, GenerationConfig, IterableDataset, LLM, Optional, PeftConfig, PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin, RewardFunc, SyncRefModelCallback, TrainerCallback, Union, VLLMClient, datasets, defaultdict, deque, disable_dropout_in_model, ensure_master_addr_port, identity, inspect, is_liger_kernel_available, is_peft_model, is_vllm_available, logger, nn, os, pad, prepare_deepspeed, prepare_fsdp, set_seed, torch, transformers, Any, LLM, Union, gather, gather_object, is_conversational, logging, nanmax, nanmin, nanstd, os, pad, torch, FSDP, GuidedDecodingParams, LLM, Optional, SamplingParams, apply_chat_template, broadcast_object_list, gather, 
gather_object, is_flash_attn_2_available, maybe_apply_chat_template, nullcontext, os, pad, prepare_multimodal_messages, profiling_context, torch, transformers, unwrap_model_for_generation, os, pad, selective_log_softmax, torch, transformers, Any, Union, profiling_decorator, shuffle_sequence_dict, split_pixel_values_by_grid, split_tensor_dict, torch, unsplit_pixel_values_by_grid, PreTrainedModel, logger, os, torch, FSDP, LLM, nn, os, FSDP, nn, torch, GRPOTrainer, gather, nanmax, nanmin, os, pad, torch) |
|
|
|
|
| import os |
| from typing import * |
| from dataclasses import dataclass, field |
| from packaging.version import Version |
| import torch |
| import numpy as np |
| from contextlib import nullcontext |
| from torch.nn import functional as F |
| import inspect |
| from transformers import DataCollatorForSeq2Seq, DataCollatorForLanguageModeling as TransformersDataCollatorForLanguageModeling |
| from transformers.training_args import ParallelMode |
| from unsloth_zoo.device_type import DEVICE_TYPE, device_synchronize |
|
|
| |
| |
| import functools |
| from types import MethodType |
# Prefer the real buffer-reset helper from unsloth_zoo. If importing it fails
# for any ordinary reason, install a no-op with the same name so callers in
# this module can invoke it unconditionally.
# `except Exception` (instead of a bare `except`) keeps the best-effort
# behaviour while letting KeyboardInterrupt / SystemExit propagate.
try:
    from unsloth_zoo.gradient_checkpointing import reset_unsloth_gradient_checkpointing_buffers
except Exception:
    def reset_unsloth_gradient_checkpointing_buffers():
        """No-op stand-in used when the unsloth_zoo helper cannot be imported."""
        return None
def prepare_for_training_mode(f):
    """Decorate a trainer method so its model is switched to training mode while it runs.

    The wrapper:
      1. reads ``self.args.gradient_checkpointing`` (defaulting to ``True``),
      2. remembers ``self.model.training`` when that attribute exists,
      3. calls ``self.model.for_training(use_gradient_checkpointing=...)`` when available,
      4. runs ``f`` and keeps its return value,
      5. restores the remembered mode (``for_inference()`` if the model was not
         training, ``for_training(...)`` again if it was),
      6. makes a best-effort attempt to reset the gradient checkpointing buffers
         and to call ``wandb.finish()``.

    Steps 5 and 6 only run when ``f`` returns normally; an exception raised by
    ``f`` propagates immediately.

    Args:
        f: The method to wrap; it is called as ``f(self, *args, **kwargs)``.

    Returns:
        The wrapped callable (metadata preserved via ``functools.wraps``).
    """
    @functools.wraps(f)
    def wrapper(self, *args, **kwargs):
        # None means "mode unknown": no restore is attempted in that case.
        _was_training = None

        use_gc = getattr(self.args, 'gradient_checkpointing', True)
        if hasattr(self, 'model') and hasattr(self.model, "training"):
            _was_training = self.model.training
        if hasattr(self, 'model') and hasattr(self.model, "for_training"):
            self.model.for_training(use_gradient_checkpointing=use_gc)
        output = f(self, *args, **kwargs)

        # Put the model back into whichever mode it was in before the call.
        if hasattr(self, 'model') and hasattr(self.model, "for_inference"):
            if _was_training is False:
                self.model.for_inference()
            elif _was_training is True and hasattr(self.model, "for_training"):
                self.model.for_training(use_gradient_checkpointing=use_gc)

        # Best-effort cleanup: failures here must not mask a successful run,
        # but KeyboardInterrupt / SystemExit are no longer swallowed.
        try:
            reset_unsloth_gradient_checkpointing_buffers()
        except Exception:
            pass

        # wandb is optional; a missing package or a failing finish() is ignored.
        try:
            import wandb
            wandb.finish()
        except Exception:
            pass
        return output
    return wrapper
|
|
# Options handed to torch.compile for the kernels defined in this module.
# Three of them are only switched on for CUDA devices whose compute capability
# major version is >= 9. The capability is queried once, and only when CUDA is
# actually available, so that importing this module on a CPU-only or XPU host
# does not raise from torch.cuda.get_device_capability().
_unsloth_cuda_capability_ge_9 = (
    torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 9
)
torch_compile_options = {
    "epilogue_fusion"   : True,
    "max_autotune"      : False,
    "shape_padding"     : True,
    "trace.enabled"     : False,
    "triton.enable_persistent_tma_matmul": _unsloth_cuda_capability_ge_9,
    "cuda.cutlass_epilogue_fusion_enabled": _unsloth_cuda_capability_ge_9,
    "cuda.cutlass_tma_only": _unsloth_cuda_capability_ge_9,
    "cuda.compile_opt_level"              : "-O2",
    "cuda.enable_cuda_lto"                : True,
}
|
|
@torch.compile(dynamic = True, fullgraph = True, options = torch_compile_options,)
def chunked_hidden_states_selective_log_softmax(
    hidden_states: torch.Tensor,
    lm_head: torch.Tensor,
    index: torch.Tensor,
    chunks: int = 4,
    logit_scale_multiply: float = 0.0,
    logit_scale_divide: float = 0.0,
    logit_softcapping: float = 0.0,
    temperature: float = 1.0,
) -> torch.Tensor:
    """Compute selected-token log-probabilities from hidden states, chunk by chunk.

    The hidden states are flattened to rows, split into ``chunks`` pieces, and for
    each piece the logits ``rows @ lm_head.T`` are formed, optionally rescaled /
    soft-capped, upcast to float32, divided by ``temperature`` and reduced to
    ``logit[index] - logsumexp(logits)``. Only one chunk of full logits exists at
    a time.

    Args:
        hidden_states: Tensor whose last dimension is the hidden size; the result
            is reshaped to ``(hidden_states.shape[0], hidden_states.shape[1])``.
        lm_head: Projection weight; logits are ``hidden @ lm_head.t()``.
        index: Token ids to select, one per flattened hidden-state row.
        chunks: Number of row chunks to split the work into.
        logit_scale_multiply: If non-zero, logits are multiplied by this value.
        logit_scale_divide: If non-zero, logits are divided by this value.
        logit_softcapping: If non-zero, logits are soft-capped as
            ``cap * tanh(logits / cap)``.
        temperature: If not 1.0, the float32 logits are divided by this value.

    Returns:
        Float32 tensor of per-token log-probabilities.
    """
    flat_hidden_states = hidden_states.reshape(-1, hidden_states.shape[-1])
    flat_index = index.reshape(-1)

    chunked_hidden_states = torch.chunk(flat_hidden_states, chunks=chunks, dim=0)
    chunked_index = torch.chunk(flat_index, chunks=chunks, dim=0)

    all_per_token_logps = []

    for chunk_hidden_states, chunk_index in zip(chunked_hidden_states, chunked_index):
        chunk_logits = chunk_hidden_states.to(lm_head.dtype) @ lm_head.t()

        if logit_scale_multiply != 0.0:
            chunk_logits = chunk_logits * logit_scale_multiply
        if logit_scale_divide != 0.0:
            chunk_logits = chunk_logits / logit_scale_divide
        if logit_softcapping != 0.0:
            # Soft cap bounds the logits to (-cap, cap): cap * tanh(x / cap).
            # The previous form multiplied by the logits themselves instead of
            # by the cap, which does not bound them.
            chunk_logits = logit_softcapping * torch.tanh(chunk_logits / logit_softcapping)

        # Upcast before the softmax reduction for numerical stability.
        chunk_logits = chunk_logits.to(torch.float32)

        if temperature != 1.0:
            chunk_logits = chunk_logits / temperature

        selected_logits = torch.gather(chunk_logits, dim=-1, index=chunk_index.unsqueeze(-1)).squeeze(-1)
        logsumexp_values = torch.logsumexp(chunk_logits, dim=-1)
        per_token_logps = selected_logits - logsumexp_values
        all_per_token_logps.append(per_token_logps)

    all_per_token_logps = torch.concat(all_per_token_logps)

    all_per_token_logps = all_per_token_logps.reshape((hidden_states.shape[0], hidden_states.shape[1]))
    return all_per_token_logps
|
|
@torch.compile(dynamic = True, fullgraph = True, options = torch_compile_options,)
def chunked_selective_log_softmax(logits, index):
    """Return ``log_softmax(logits)`` evaluated at ``index``, processed in 4 row chunks.

    ``logits`` is flattened to rows over its last dimension and ``index`` to a
    1-D vector. Each chunk is upcast to float32 before the reduction, and the
    result is reshaped to ``(logits.shape[0], logits.shape[1])``.
    """
    flat_rows = logits.reshape(-1, logits.shape[-1])
    flat_targets = index.reshape(-1)

    pieces = []
    for row_block, target_block in zip(
        torch.chunk(flat_rows, chunks = 4, dim = 0),
        torch.chunk(flat_targets, chunks = 4, dim = 0),
    ):
        row_block = row_block.to(torch.float32)
        picked = torch.gather(row_block, dim = -1, index = target_block.unsqueeze(-1)).squeeze(-1)
        # log p(target) = logit[target] - logsumexp(logits)
        pieces.append(picked - torch.logsumexp(row_block, dim = -1))

    return torch.concat(pieces).reshape((logits.shape[0], logits.shape[1]))
|
|
def calculate_pad_tokens_in_prompt(
    input_ids: torch.Tensor,
    logits_to_keep: int,
    pad_token_id: int
) -> torch.Tensor:
    """Count the pad tokens in the prompt part of every sequence.

    The prompt part is everything except the last ``logits_to_keep`` columns of
    ``input_ids``. For a left-padded prompt such as ``[pad, pad, pad, cat]`` the
    count is 3.

    Args:
        input_ids: 2-D tensor of token ids (rows are sequences).
        logits_to_keep: Number of trailing columns that are not part of the prompt.
        pad_token_id: Token id that marks padding.

    Returns:
        1-D tensor with one pad-token count per row.

    Raises:
        ValueError: If ``logits_to_keep`` is not smaller than the sequence length.
    """
    seq_len = input_ids.shape[1]
    if logits_to_keep >= seq_len:
        raise ValueError("logits_to_keep must be smaller than the sequence length.")

    # Slice by an explicit end position: `[:, :-logits_to_keep]` would collapse
    # to an empty slice when logits_to_keep == 0 (because -0 == 0).
    prompt_len = seq_len - logits_to_keep
    prompt_section = input_ids[:, :prompt_len]

    padding_mask = (prompt_section == pad_token_id)
    return padding_mask.sum(dim=1)
|
|
def create_completion_attention_mask(
    completion_input_ids: torch.Tensor,
    left_pad_tokens_per_prompt: torch.Tensor,
    max_left_pad: int,
    pad_token_id: int
) -> torch.Tensor:
    """Build a boolean mask that keeps only real completion tokens.

    For a row shaped like ``[p, p, p, c, c, c, pad, pad, pad]`` -- where ``p`` are
    leftover prompt tokens picked up by the slice, ``c`` are completion tokens and
    ``pad`` is padding -- the mask is ``[0, 0, 0, 1, 1, 1, 0, 0, 0]``.

    A position is kept when its column index is at least
    ``max_left_pad - left_pad_tokens_per_prompt[row]`` and its token is not
    ``pad_token_id``.
    """
    # Unpacking also enforces that the input is 2-D.
    _, width = completion_input_ids.shape

    column_ids = torch.arange(width, device=completion_input_ids.device).unsqueeze(0)
    leading_prompt_tokens = (max_left_pad - left_pad_tokens_per_prompt).unsqueeze(1)

    past_prompt = column_ids >= leading_prompt_tokens
    is_real_token = completion_input_ids != pad_token_id
    return past_prompt & is_real_token
|
|
def left_pack_padding(tensor: torch.Tensor, pad_id: int) -> torch.Tensor:
    """Pack the non-pad tokens of each row to the left, pushing padding to the right.

    The relative order of the non-pad tokens is preserved because the sort over
    the "is a real token" flags is stable.
    """
    is_token = tensor != pad_id
    # Descending stable sort of the flags: real tokens (True) first, pads last.
    gather_order = torch.argsort(is_token, dim=1, descending=True, stable=True)
    return torch.gather(tensor, 1, gather_order)
|
|
def align_logprobs_with_mask(
    logprob_tensor: torch.Tensor,
    attention_mask: torch.Tensor,
    pad_value: float = 0.0
) -> torch.Tensor:
    """Shift each row of log-probabilities so it starts where the mask first turns on.

    The output has the shape of ``attention_mask`` and is pre-filled with
    ``pad_value``. Row ``i`` of ``logprob_tensor`` is written starting at the
    column returned by ``argmax(attention_mask[i])``; values that would fall
    beyond the last column are dropped.
    """
    device = logprob_tensor.device
    n_rows, n_logprobs = logprob_tensor.shape
    mask_width = attention_mask.shape[1]

    aligned = torch.full(
        attention_mask.shape,
        fill_value=pad_value,
        dtype=logprob_tensor.dtype,
        device=device
    )

    # Column of the first maximal mask entry in every row = per-row offset.
    row_offsets = torch.argmax(attention_mask, dim=1)
    target_cols = row_offsets.unsqueeze(1) + torch.arange(n_logprobs, device=device)
    target_rows = torch.arange(n_rows, device=device).unsqueeze(1).expand_as(target_cols)

    # Only scatter values whose destination column exists.
    in_bounds = target_cols < mask_width
    aligned[target_rows[in_bounds], target_cols[in_bounds]] = logprob_tensor[in_bounds]
    return aligned
|
|
def autotune_batch_and_chunks(
    total_input_rows,
    seq_len,
    hidden_size,
    vocab_size,
    dtype_bytes=16,
    multiplier=None
):
    """Pick the largest row count whose estimated memory fits the device budget.

    Candidate row counts ``total_input_rows, ..., 1`` are scored with an estimate
    made of a hidden-state term and a logits term (the latter divided by the
    chunk multiplier). The first candidate not exceeding the budget wins.

    The budget is 80% of free CUDA memory, or 80% of (total - reserved) XPU
    memory, or a fixed 8.0 when neither backend is available.

    Args:
        total_input_rows: Largest candidate row count.
        seq_len: Sequence length used in the estimate and in the default multiplier.
        hidden_size: Hidden dimension used in the hidden-state term.
        vocab_size: Vocabulary size used in the logits term.
        dtype_bytes: Per-element size factor used in the estimate.
            NOTE(review): callers in this file pass 16 / 32, which looks like
            bits rather than bytes -- confirm.
        multiplier: Chunk multiplier; defaults to ``max(4, seq_len // 4096)``.

    Returns:
        ``(rows, multiplier)``; ``rows`` falls back to 4 when no candidate fits.
    """
    final_m = multiplier if multiplier is not None else max(4, seq_len // 4096)

    if torch.cuda.is_available():
        free_bytes, _ = torch.cuda.mem_get_info()
        limit_gb = (free_bytes / (1024**3))*.80
    elif hasattr(torch, "xpu") and torch.xpu.is_available():
        total_mem = torch.xpu.get_device_properties(0).total_memory
        free_bytes = total_mem - torch.xpu.memory_reserved()
        limit_gb = (free_bytes / (1024**3)) * 0.80
    else:
        limit_gb = 8.0

    bytes_to_gb = 1024**3

    # Candidates in descending order, so the first fit is also the largest fit.
    candidates = torch.arange(total_input_rows, 0, -1, device='cpu', dtype=torch.float32)

    hidden_gb = (candidates * seq_len * hidden_size * dtype_bytes) / bytes_to_gb
    logits_gb = (
        ((candidates/total_input_rows) * candidates * seq_len * vocab_size * dtype_bytes) / bytes_to_gb
    ) / final_m

    fits = (hidden_gb + logits_gb) <= limit_gb
    fitting_positions = torch.nonzero(fits, as_tuple=False)

    if fitting_positions.shape[0] == 0:
        return 4, final_m

    first_fit = fitting_positions[0].item()
    return int(candidates[first_fit].item()), final_m
def grpo_compute_loss(
    ref,
    new,
    old,
    sampling_per_token_logps,
    input_ids,
    mask,
    beta,
    advantages,
    **kwargs
):
    """Compute the GRPO-family policy loss and its logging metrics.

    Args:
        ref: Reference-policy per-token log-probs; only read when ``beta != 0``.
        new: Current-policy per-token log-probs (the differentiable input).
        old: Log-probs of the policy that generated the samples, or ``None``.
            When ``None``, ``new.detach()`` is used in its place everywhere
            (log-ratio, vLLM importance-sampling ratio and the delta metric).
        sampling_per_token_logps: Log-probs reported by the sampler, or ``None``;
            only used together with ``use_vllm=True``.
        input_ids: Token ids; unsqueezed here but not otherwise used by the loss.
        mask: Completion mask, same leading shape as ``new``.
        beta: KL coefficient; ``0.0`` disables the KL term.
        advantages: Per-sequence advantages; a 1-D tensor is unsqueezed to a column.
        **kwargs: Optional settings with their defaults --
            ``loss_type`` ("grpo"; also "bnpo", "dr_grpo", "dapo", "cispo", "sapo"),
            ``epsilon_low`` / ``epsilon_high`` (0.2), ``max_completion_length`` (8192),
            ``delta`` (None; upper clamp on the unclipped ratio),
            ``importance_sampling_level`` ("token" or "sequence"),
            ``num_items_in_batch`` (None), ``num_processes`` (1),
            ``current_gradient_accumulation_steps`` (1), ``use_vllm`` (False),
            ``vllm_importance_sampling_cap`` (2.0), ``get_sapo_token_loss`` (None),
            ``sapo_temperature_pos`` (1.0), ``sapo_temperature_neg`` (1.05),
            ``get_off_policy_mask`` (None), ``off_policy_mask_threshold`` (None).

    Returns:
        ``(loss, completion_length, mean_kl, delta, flat_is_ratio, coef_1)`` where
        ``delta`` / ``flat_is_ratio`` are empty tensors unless the vLLM
        importance-sampling path is active, and ``coef_1`` is the importance weight
        ``exp(log_importance_weights)``.

    Raises:
        ValueError: Unknown ``importance_sampling_level`` or ``loss_type``.
        Exception: ``loss_type == "sapo"`` without ``get_sapo_token_loss``.
    """
    loss_type = kwargs.get("loss_type", "grpo")
    epsilon_low = kwargs.get("epsilon_low", 0.2)
    epsilon_high = kwargs.get("epsilon_high", 0.2)
    max_completion_length = kwargs.get("max_completion_length", 8192)
    delta = kwargs.get("delta", None)
    importance_sampling_level = kwargs.get("importance_sampling_level", "token")
    num_items_in_batch = kwargs.get("num_items_in_batch", None)
    current_gradient_accumulation_steps = kwargs.get("current_gradient_accumulation_steps", 1)
    num_processes = kwargs.get("num_processes", 1)
    use_vllm = kwargs.get("use_vllm", False)
    vllm_importance_sampling_cap = kwargs.get("vllm_importance_sampling_cap", 2.0)
    get_sapo_token_loss = kwargs.get("get_sapo_token_loss", None)
    sapo_temperature_pos = kwargs.get("sapo_temperature_pos", 1.0)
    sapo_temperature_neg = kwargs.get("sapo_temperature_neg", 1.05)
    get_off_policy_mask = kwargs.get("get_off_policy_mask", None)
    off_policy_mask_threshold = kwargs.get("off_policy_mask_threshold", None)
    input_ids = input_ids.unsqueeze(-1)

    if advantages.dim() == 1:
        advantages = advantages.unsqueeze(1)

    if off_policy_mask_threshold is not None:
        off_policy_mask = get_off_policy_mask(
            advantages=advantages,
            per_token_logps=new,
            old_per_token_logps=old,
            mask=mask,
            off_policy_threshold=off_policy_mask_threshold,
        )

    # With no separate "old" policy the sampling policy equals the current one,
    # so the detached current log-probs stand in for it. Resolving this once
    # keeps the vLLM paths below from failing on `None` arithmetic.
    old_logps = old if old is not None else new.detach()

    use_is_correction = use_vllm and sampling_per_token_logps is not None

    with torch.no_grad():
        if use_is_correction:
            importance_sampling_ratio = torch.exp((old_logps * mask) - sampling_per_token_logps)
            importance_sampling_ratio = torch.clamp(
                importance_sampling_ratio, max=vllm_importance_sampling_cap
            )

    # Identical to `new - new.detach()` when `old` is None.
    log_ratio = new - old_logps

    if importance_sampling_level == "token":
        log_importance_weights = log_ratio
    elif importance_sampling_level == "sequence":
        log_importance_weights = (log_ratio * mask).sum(-1) / mask.sum(-1).clamp(min=1.0)
        log_importance_weights = log_importance_weights.unsqueeze(-1)
    else:
        raise ValueError(
            f"Unknown importance sampling level: {importance_sampling_level}. Possible values are 'token' "
            "and 'sequence'."
        )

    coef_1 = torch.exp(log_importance_weights)

    if beta != 0.0:
        # Per-token KL estimate: exp(ref - new) - (ref - new) - 1.
        kl_i = torch.exp(ref - new) - (ref - new) - 1.0
    elif importance_sampling_level == "sequence":
        # Zero KL with the per-sequence shape (batch, 1).
        kl_i = new.new_zeros(new.size(0), 1)
    else:
        kl_i = torch.zeros_like(new)

    if loss_type == "cispo":
        clamped_ratios = torch.clamp(coef_1, max=epsilon_high).detach()
        loss_i = -clamped_ratios * advantages * new
    elif loss_type in ["grpo", "bnpo", "dr_grpo", "dapo"]:
        coef_2 = torch.clamp(coef_1, 1 - epsilon_low, 1 + epsilon_high)

        if delta is not None:
            loss_1 = torch.clamp(coef_1, max=delta) * advantages
        else:
            loss_1 = coef_1 * advantages
        loss_2 = coef_2 * advantages
        loss_i = -torch.min(loss_1, loss_2)
    elif loss_type == "sapo":
        if get_sapo_token_loss is None:
            raise Exception(f"sapo is only available in TRL 0.26.0+")
        loss_i = torch.empty_like(coef_1)
        positive_advantages_mask = advantages.repeat([1, coef_1.shape[1]]) > 0

        if coef_1[positive_advantages_mask].numel() != 0:
            loss_i[positive_advantages_mask] = get_sapo_token_loss(
                coef_1[positive_advantages_mask], sapo_temperature_pos
            )
        if coef_1[~positive_advantages_mask].numel() != 0:
            loss_i[~positive_advantages_mask] = get_sapo_token_loss(
                coef_1[~positive_advantages_mask], sapo_temperature_neg
            )
        loss_i = -loss_i * advantages
    else:
        raise ValueError(f"Unknown loss type: {loss_type}")

    if off_policy_mask_threshold is not None:
        loss_i = loss_i * off_policy_mask

    if use_is_correction:
        loss_i = loss_i * importance_sampling_ratio

        # Metrics only: gap between old and sampler log-probs, and the masked ratio.
        with torch.no_grad():
            logp_delta = torch.abs(old_logps - sampling_per_token_logps)
            logp_delta = logp_delta * mask
            flat_is_ratio = importance_sampling_ratio * mask
    else:
        logp_delta = torch.tensor([]).detach()
        flat_is_ratio = torch.tensor([]).detach()
    if beta != 0.0:
        loss_i = loss_i + beta * kl_i

    mask = mask.to(torch.float32)
    n_mask_per_reward = mask.sum(1)

    if loss_type in ["grpo", "sapo"]:
        # Mean over tokens per sequence, then mean over sequences.
        loss = ((loss_i * mask).sum(-1) / mask.sum(-1).clamp(min=1.0)).mean()
        loss = loss / current_gradient_accumulation_steps
    elif loss_type == "bnpo":
        # Mean over all unmasked tokens in the batch.
        loss = (loss_i * mask).sum() / mask.sum().clamp(min=1.0)
        loss = loss / current_gradient_accumulation_steps
    elif loss_type == "dr_grpo":
        # Fixed normaliser: batch rows times the maximum completion length.
        loss = (loss_i * mask).sum() / (loss_i.size(0) * max_completion_length)
        loss = loss / current_gradient_accumulation_steps
    elif loss_type in ["cispo", "dapo"]:
        normalizer = num_items_in_batch/ num_processes
        loss = (loss_i * mask).sum() / normalizer
    else:
        raise ValueError(f"Unknown loss type: {loss_type}")

    def masked_batch_mean(x):
        """Return (mean completion length, masked mean of ``x``) without tracking grads."""
        with torch.inference_mode():
            completion_length = n_mask_per_reward.mean()
            if x.shape[1] == 1:
                # Per-sequence values: nothing to mask.
                return completion_length, x.mean()
            # Clamp like the loss path does, so a fully masked row contributes
            # 0 instead of turning the metric into NaN through 0 / 0.
            mean_kl_per_reward = (x * mask).sum(1) / n_mask_per_reward.clamp(min=1.0)
            return completion_length, mean_kl_per_reward.mean()

    completion_length, mean_kl = masked_batch_mean(kl_i)
    return loss, completion_length, mean_kl, logp_delta, flat_is_ratio, coef_1
|
|
class UnslothEfficientGRPO(torch.autograd.Function):
    """Autograd function that evaluates the GRPO loss chunk by chunk.

    ``forward`` splits the batch along dim 0 into ``n_chunks`` pieces, and for
    each piece obtains both the loss and its gradient with respect to the new
    log-probs via ``torch.func.grad_and_value``. The per-chunk gradients are
    written into one buffer that ``backward`` later returns as the gradient of
    the first input.
    """

    @staticmethod
    def forward(ctx, _new_logps, _old_logps, _ref_logps, _sampling_per_token_logps, lm_head, _input_ids, _mask, _advantages, beta, scaler = None, n_chunks = 1, extra_kwargs=None):
        """Compute chunk-averaged loss/metrics and stash d(loss)/d(new_logps).

        ``lm_head`` is accepted but not read in this method. ``scaler``, when
        given, supplies the factor (``scaler.get_scale()``) the differentiated
        loss is multiplied by; the reported loss stays unscaled.

        Returns:
            ``(loss, completion_length, mean_kl, delta, flat_is_ratio, coef_1)``;
            ``delta`` and ``flat_is_ratio`` are ``None`` when
            ``_sampling_per_token_logps`` is ``None``.
        """
        loss_kwargs = {} if extra_kwargs is None else extra_kwargs

        def compute_loss(new_logps, old_logps, ref_logps, sampling_per_token_logps, input_ids, mask, advantages, scaling):
            # Differentiated value is the scaled loss; everything else is aux output.
            loss, completion_length, mean_kl, delta, flat_is_ratio, coef_1 = grpo_compute_loss(
                ref_logps,
                new_logps,
                old_logps,
                sampling_per_token_logps,
                input_ids,
                mask,
                beta,
                advantages,
                **loss_kwargs,
            )
            aux = (loss.detach(), completion_length, mean_kl, delta, flat_is_ratio, coef_1)
            return loss * scaling, aux

        device = _new_logps.device
        grad_inputs = torch.empty_like(_new_logps)
        accumulated_loss = torch.zeros(1, device = device)
        accumulated_completion_length = torch.zeros(1, device = device)
        accumulated_mean_kl = torch.zeros(1, device = device)
        accumulated_delta = []
        accumulated_flat_is_ratio = []
        accumulated_coef_1 = []

        def accumulate_chunk(
            new_logps_j,
            old_logps_j,
            ref_logps_j,
            sampling_per_token_logps_j,
            input_ids_j,
            mask_j,
            advantages_j,
            scaling,
            grad_inputs_j,
        ):
            grad_and_value = torch.func.grad_and_value(
                compute_loss,
                argnums = (0,),
                has_aux = True,
            )
            (chunk_grad_input,), (chunk_loss, chunk_aux) = grad_and_value(
                new_logps_j, old_logps_j, ref_logps_j, sampling_per_token_logps_j,
                input_ids_j, mask_j, advantages_j, scaling,
            )
            (unscaled_loss, chunk_completion_length, chunk_mean_kl,
             chunk_delta, chunk_flat_is_ratio, chunk_coef_1) = chunk_aux

            accumulated_loss.add_(unscaled_loss)
            accumulated_completion_length.add_(chunk_completion_length)
            accumulated_mean_kl.add_(chunk_mean_kl)
            accumulated_delta.append(chunk_delta)
            accumulated_flat_is_ratio.append(chunk_flat_is_ratio)
            accumulated_coef_1.append(chunk_coef_1)
            # Write into the view of the shared gradient buffer for this chunk.
            grad_inputs_j[:] = chunk_grad_input

        accumulate_chunk = torch.compile(
            accumulate_chunk,
            fullgraph = True,
            dynamic = True,
            options = torch_compile_options,
        )

        def split_rows(tensor):
            # Optional inputs become a list of Nones so they can be zipped too.
            if tensor is None:
                return [None] * n_chunks
            return torch.chunk(tensor, chunks = n_chunks, dim = 0)

        grad_inputs_chunks = torch.chunk(grad_inputs, chunks = n_chunks, dim = 0)
        new_logps = torch.chunk(_new_logps, chunks = n_chunks, dim = 0)
        old_logps = split_rows(_old_logps)
        ref_logps = split_rows(_ref_logps)
        sampling_per_token_logps = split_rows(_sampling_per_token_logps)
        input_ids = torch.chunk(_input_ids, chunks = n_chunks, dim = 0)
        mask = torch.chunk(_mask, chunks = n_chunks, dim = 0)
        advantages = torch.chunk(_advantages, chunks = n_chunks, dim = 0)

        scaling = 1.0 if scaler is None else scaler.get_scale()

        chunk_iter = zip(
            grad_inputs_chunks, new_logps, old_logps, ref_logps,
            sampling_per_token_logps, input_ids, mask, advantages,
        )
        for (grad_inputs_j, new_logps_j, old_logps_j, ref_logps_j,
             sampling_per_token_logps_j, input_ids_j, mask_j, advantages_j) in chunk_iter:
            accumulate_chunk(
                new_logps_j,
                old_logps_j,
                ref_logps_j,
                sampling_per_token_logps_j,
                input_ids_j,
                mask_j,
                advantages_j,
                scaling,
                grad_inputs_j,
            )

        # Sums over chunks become averages.
        grad_inputs.div_(n_chunks)
        accumulated_loss.div_(n_chunks)
        accumulated_completion_length.div_(n_chunks)
        accumulated_mean_kl.div_(n_chunks)

        if _sampling_per_token_logps is None:
            accumulated_delta = None
            accumulated_flat_is_ratio = None
        else:
            accumulated_delta = torch.cat(accumulated_delta, dim=0)
            accumulated_flat_is_ratio = torch.cat(accumulated_flat_is_ratio, dim=0)
        accumulated_coef_1 = torch.cat(accumulated_coef_1, dim=0)

        ctx.save_for_backward(grad_inputs)
        return (
            accumulated_loss,
            accumulated_completion_length,
            accumulated_mean_kl,
            accumulated_delta,
            accumulated_flat_is_ratio,
            accumulated_coef_1
        )

    @staticmethod
    def backward(ctx, grad_output, dcompletion_length, dmean_kl, ddelta, ddflat_is_ratio, dcoef_1):
        """Return the gradient stored by ``forward`` for the first input only.

        The incoming gradients are not used; the eleven remaining forward inputs
        receive ``None``.
        """
        (grad_input,) = ctx.saved_tensors
        return (grad_input, None, None, None, None, None, None, None, None, None, None, None)
|
|
| def grpo_accumulated_loss( |
| trainer, |
| input_ids, |
| attention_mask, |
| logits_to_keep, |
| completion_mask, |
| advantages, |
| old_logps, |
| ref_logps, |
| n_chunks = -1, |
| **kwargs, |
| ): |
| |
| bsz, qlen = input_ids.shape |
|
|
| pixel_values = kwargs.get('pixel_values',None) |
| image_grid_thw = kwargs.get('image_grid_thw',None) |
| pixel_attention_mask = kwargs.get('pixel_attention_mask',None) |
| image_sizes = kwargs.get('image_sizes',None) |
| sampling_per_token_logps = kwargs.get("sampling_per_token_logps", None) if getattr(trainer, "vllm_importance_sampling_correction", False) else None |
| temperature = kwargs.get("temperature", 1.0) |
| logit_scale_multiply = kwargs.get("logit_scale_multiply", 0.0) |
| logit_scale_divide = kwargs.get("logit_scale_divide", 0.0) |
| logit_softcapping = kwargs.get("logit_softcapping", 0.0) |
| prev_max_left_pad = kwargs.get("max_left_pad", 0) |
|
|
| |
| _ = kwargs.pop("sampling_per_token_logps", None) |
| kwargs["vllm_importance_sampling_cap"] = trainer.vllm_importance_sampling_cap if sampling_per_token_logps is not None else None |
| kwargs["get_sapo_token_loss"] = trainer.get_sapo_token_loss if hasattr(trainer, "get_sapo_token_loss") else None |
| kwargs["sapo_temperature_pos"] = trainer.args.sapo_temperature_pos if hasattr(trainer.args, "sapo_temperature_pos") else None |
| kwargs["sapo_temperature_neg"] = trainer.args.sapo_temperature_neg if hasattr(trainer.args, "sapo_temperature_neg") else None |
| kwargs["get_off_policy_mask"] = trainer.get_off_policy_mask if hasattr(trainer, "get_off_policy_mask") else None |
| kwargs["off_policy_mask_threshold"] = trainer.args.off_policy_mask_threshold if hasattr(trainer.args, "off_policy_mask_threshold") else None |
| kwargs["use_vllm"] = trainer.use_vllm |
| |
| factors = [i for i in range(1, bsz + 1) if bsz % i == 0] |
| if n_chunks == -1: n_chunks = bsz |
| n_chunks = factors[min(np.searchsorted(factors, n_chunks), len(factors)-1)] |
|
|
| if not hasattr(trainer, '_autocast_dtype'): |
| trainer._autocast_dtype = torch.float16 if os.environ.get('ACCELERATE_MIXED_PRECISION', 'fp16') == 'fp16' else torch.bfloat16 |
| if os.environ.get('UNSLOTH_FORCE_FLOAT32', '0') == '1': trainer._autocast_dtype = None |
| pass |
| os.environ["UNSLOTH_RETURN_HIDDEN_STATES"] = "1" |
|
|
| lm_head = trainer.model.get_output_embeddings().weight |
| dtype_bytes = 16 if trainer._autocast_dtype in [torch.float16, torch.bfloat16] else 32 |
|
|
| total_rows = input_ids.shape[0] |
| seq_len = input_ids.shape[1] |
| hidden_dim = lm_head.shape[1] |
| vocab_dim = lm_head.shape[0] |
|
|
| if trainer.args.unsloth_grpo_mini_batch is None: |
| if not hasattr(trainer, "_has_autotuned"): |
| trainer._has_autotuned = True |
| B, multiplier = autotune_batch_and_chunks( |
| total_rows, seq_len, hidden_dim, vocab_dim, dtype_bytes, trainer.args.unsloth_logit_chunk_multiplier |
| ) |
| trainer.args.unsloth_grpo_mini_batch = total_rows//B |
| trainer.args.unsloth_logit_chunk_multiplier = multiplier |
| B = trainer.args.unsloth_grpo_mini_batch |
| multiplier = trainer.args.unsloth_logit_chunk_multiplier |
| elif trainer._step % trainer.current_gradient_accumulation_steps == 0: |
| B = trainer.args.unsloth_grpo_mini_batch |
| multiplier = trainer.args.unsloth_logit_chunk_multiplier |
| del trainer._has_autotuned |
| del trainer.args.unsloth_grpo_mini_batch |
| del trainer.args.unsloth_logit_chunk_multiplier |
| else: |
| B = trainer.unsloth_grpo_mini_batch |
| multiplier = trainer.args.unsloth_logit_chunk_multiplier |
| else: |
| if trainer.args.unsloth_grpo_mini_batch > total_rows: |
| B = total_rows |
| else: |
| B = trainer.args.unsloth_grpo_mini_batch |
|
|
| if trainer.args.unsloth_logit_chunk_multiplier is None: |
| multiplier = max(4, seq_len // 4096) |
| else: |
| multiplier = trainer.args.unsloth_logit_chunk_multiplier |
|
|
| if pixel_values is None: |
| left_pad_tokens_per_prompt = calculate_pad_tokens_in_prompt(input_ids, logits_to_keep, trainer.processing_class.pad_token_id) |
|
|
| |
| if old_logps is not None: |
| max_left_pad = old_logps.shape[1] - logits_to_keep |
| elif ref_logps is not None: |
| max_left_pad = ref_logps.shape[1] - logits_to_keep |
| else: |
| max_left_pad = torch.max(left_pad_tokens_per_prompt).item() |
|
|
| input_ids = left_pack_padding(input_ids, trainer.processing_class.pad_token_id) |
|
|
| completion_input_ids = input_ids[:, -(logits_to_keep +max_left_pad):] |
|
|
| completion_mask = create_completion_attention_mask(completion_input_ids, left_pad_tokens_per_prompt, max_left_pad, trainer.processing_class.pad_token_id).to(attention_mask.dtype) |
|
|
| if trainer.use_vllm and sampling_per_token_logps is not None and getattr(trainer, "vllm_importance_sampling_correction", False): |
| sampling_per_token_logps = align_logprobs_with_mask(sampling_per_token_logps, completion_mask) |
| else: |
| sampling_per_token_logps = None |
| attention_mask = input_ids != trainer.processing_class.pad_token_id |
| attention_mask = attention_mask.to(attention_mask.dtype) |
| else: |
| completion_input_ids = input_ids[:, -logits_to_keep:] |
|
|
| unwrapped_model = trainer.accelerator.unwrap_model(trainer.model, keep_fp32_wrapper = False) |
|
|
| for module in unwrapped_model.modules(): |
| if hasattr(module, "_hf_hook") and hasattr(module._hf_hook, "io_same_decice"): |
| module._hf_hook.io_same_decice = False |
| pass |
|
|
| all_logprobs_list = [] |
|
|
| attention_mask_chunks = torch.chunk(attention_mask, chunks=B, dim=0) |
| completion_ids_chunks = torch.chunk(completion_input_ids, chunks=B, dim=0) |
|
|
| def chunk_optional(tensor, chunks): |
| if tensor is None: |
| return [None] * chunks |
| return torch.chunk(tensor, chunks=chunks, dim=0) |
|
|
| import math |
| total_samples = input_ids.shape[0] |
| batch_size = math.ceil(total_samples / B) |
|
|
| input_ids_chunks = [] |
| attention_mask_chunks = [] |
| pixel_values_chunks = [] |
| image_grid_thw_chunks = [] |
| pixel_attention_mask_chunks = [] |
|
|
| current_pixel_idx = 0 |
| |
| for start in range(0, total_samples, batch_size): |
| end = start + batch_size |
|
|
| input_ids_chunks.append(input_ids[start:end]) |
| attention_mask_chunks.append(attention_mask[start:end]) |
|
|
| if image_grid_thw is not None and pixel_values is not None: |
|
|
| grid_slice = image_grid_thw[start:end] |
| image_grid_thw_chunks.append(grid_slice) |
| batch_pixel_count = grid_slice.prod(dim=-1).sum().item() |
|
|
| start_pixel_idx = current_pixel_idx |
| end_pixel_idx = current_pixel_idx + batch_pixel_count |
|
|
| pixel_values_chunks.append(pixel_values[start_pixel_idx:end_pixel_idx]) |
|
|
| if pixel_attention_mask is not None: |
| pixel_attention_mask_chunks.append( |
| pixel_attention_mask[start_pixel_idx:end_pixel_idx] |
| ) |
| else: |
| pixel_attention_mask_chunks.append(None) |
|
|
| current_pixel_idx = end_pixel_idx |
|
|
| else: |
| pixel_values_chunks.append(None) |
| image_grid_thw_chunks.append(None) |
| pixel_attention_mask_chunks.append(None) |
|
|
| if image_sizes is not None and not isinstance(image_sizes, torch.Tensor): |
| image_sizes_chunks = [[size] for size in image_sizes] |
| else: |
| image_sizes_chunks = chunk_optional(image_sizes, B) |
|
|
| zipped_inputs = zip( |
| input_ids_chunks, |
| attention_mask_chunks, |
| pixel_values_chunks, |
| image_grid_thw_chunks, |
| pixel_attention_mask_chunks, |
| image_sizes_chunks, |
| completion_ids_chunks |
| ) |
|
|
| if trainer._autocast_dtype is None: |
| autocaster = nullcontext() |
| else: |
| autocaster = torch.amp.autocast(device_type = trainer.model.device.type, dtype = trainer._autocast_dtype) |
|
|
| def to_device(tensor, device, non_blocking=True): |
| if tensor is None: return None |
| return tensor.to(device, non_blocking=non_blocking) |
|
|
| class Unsloth_Offloaded_Log_Softmax(torch.autograd.Function): |
| """ |
| Manual Gradient Checkpointing/CPU Offloading for Log Softmax. |
| """ |
| @staticmethod |
| def forward(ctx, hidden_states, lm_head, index, chunks, |
| logit_scale_multiply, logit_scale_divide, |
| logit_softcapping, temperature): |
|
|
| ctx.saved_hidden_states = to_device(hidden_states, "cpu", non_blocking=True) |
| ctx.device = hidden_states.device |
| ctx.dtype = hidden_states.dtype |
|
|
| ctx.lm_head = lm_head |
| ctx.lm_head_requires_grad = lm_head.requires_grad |
| ctx.index = index |
| ctx.args = (chunks, logit_scale_multiply, logit_scale_divide, logit_softcapping, temperature) |
|
|
| with torch.no_grad(): |
| output = chunked_hidden_states_selective_log_softmax( |
| hidden_states, lm_head, index, *ctx.args |
| ) |
|
|
| return output |
|
|
| @staticmethod |
| def backward(ctx, grad_output): |
| hidden_states = to_device(ctx.saved_hidden_states, ctx.device) |
| hidden_states = hidden_states.to(ctx.dtype) |
| hidden_states.requires_grad_(True) |
|
|
| lm_head = ctx.lm_head |
| |
| |
| |
| |
| |
|
|
| index = ctx.index |
|
|
| with torch.enable_grad(): |
| output = chunked_hidden_states_selective_log_softmax( |
| hidden_states, lm_head, index, *ctx.args |
| ) |
|
|
| torch.autograd.backward(output, grad_output) |
|
|
| return ( |
| hidden_states.grad, |
| lm_head.grad if ctx.lm_head_requires_grad else None, |
| None, |
| None, |
| None, |
| None, |
| None, |
| None, |
| ) |
|
|
def efficient_log_softmax(hidden_states, lm_head, index, chunks=32,
                          logit_scale_multiply=0.0, logit_scale_divide=0.0,
                          logit_softcapping=0.0, temperature=1, batch_size=8):
    """
    Dispatch between the direct chunked log-softmax and the offloaded one.

    The direct `chunked_hidden_states_selective_log_softmax` call is used when
    `index.shape[1] <= 1024` and `batch_size <= 8`, or whenever
    `batch_size == 1`. Every other case goes through
    `Unsloth_Offloaded_Log_Softmax.apply`. Both paths receive the same
    arguments; `batch_size` only drives the dispatch decision.
    """
    fits_direct_path = index.shape[1] <= 1024 and batch_size <= 8
    use_direct_path = fits_direct_path or batch_size == 1

    if not use_direct_path:
        return Unsloth_Offloaded_Log_Softmax.apply(
            hidden_states, lm_head, index, chunks,
            logit_scale_multiply, logit_scale_divide,
            logit_softcapping, temperature
        )

    return chunked_hidden_states_selective_log_softmax(
        hidden_states,
        lm_head,
        index,
        chunks,
        logit_scale_multiply,
        logit_scale_divide,
        logit_softcapping,
        temperature
    )
| for ( |
| input_ids_chunk, |
| attention_mask_chunk, |
| pixel_values_chunk, |
| image_grid_thw_chunk, |
| pixel_attention_mask_chunk, |
| image_sizes_chunk, |
| completion_ids |
| ) in zipped_inputs: |
| with autocaster: |
| if pixel_values is None: |
| new_hidden_states_chunk = unwrapped_model( |
| input_ids = input_ids_chunk, |
| attention_mask = attention_mask_chunk, |
| pixel_values = pixel_values_chunk, |
| image_grid_thw = image_grid_thw_chunk, |
| pixel_attention_mask = pixel_attention_mask_chunk, |
| image_sizes = image_sizes_chunk, |
| ).logits |
|
|
| new_hidden_states_chunk = new_hidden_states_chunk[:, -(logits_to_keep + max_left_pad + 1): , :] |
| new_hidden_states_chunk = new_hidden_states_chunk[:, :-1, :] |
| else: |
| new_hidden_states_chunk = unwrapped_model( |
| input_ids = input_ids_chunk, |
| attention_mask = attention_mask_chunk, |
| pixel_values = pixel_values_chunk, |
| image_grid_thw = image_grid_thw_chunk, |
| pixel_attention_mask = pixel_attention_mask_chunk, |
| image_sizes = image_sizes_chunk, |
| logits_to_keep = logits_to_keep + 1, |
| ).logits |
|
|
| new_hidden_states_chunk = new_hidden_states_chunk[:, :-1, :] |
|
|
| logprobs_chunk = efficient_log_softmax( |
| new_hidden_states_chunk, |
| lm_head, |
| completion_ids, |
| chunks=input_ids_chunk.shape[0]*multiplier, |
| logit_scale_multiply=logit_scale_multiply, |
| logit_scale_divide=logit_scale_divide, |
| logit_softcapping=logit_softcapping, |
| temperature=temperature, |
| batch_size = B |
| ) |
| |
| |
| device_synchronize() |
| all_logprobs_list.append(logprobs_chunk) |
|
|
| new_logprobs = torch.cat(all_logprobs_list, dim=0) |
|
|
| with autocaster: |
| loss, completion_length, mean_kl, delta, flat_is_ratio, coef_1 = UnslothEfficientGRPO.apply( |
| new_logprobs, |
| old_logps, |
| ref_logps, |
| sampling_per_token_logps, |
| lm_head, |
| completion_input_ids, |
| completion_mask, |
| advantages, |
| trainer.beta, |
| trainer.accelerator.scaler, |
| 1, |
| kwargs |
| ) |
|
|
| |
| os.environ["UNSLOTH_RETURN_HIDDEN_STATES"] = "0" |
|
|
| return loss, completion_length, mean_kl, delta, flat_is_ratio, coef_1 |
| |
| new_logits = torch.matmul(new_hidden_states, lm_head.t()) |
| new_logits = new_logits[:, :-1, :] |
| old_logits = torch.matmul(old_hidden_states, lm_head.t()) |
| old_logits = old_logits[:, :-1, :] |
| loss, completion_length, mean_kl = grpo_compute_loss( |
| old_logits, |
| new_logits, |
| completion_input_ids, |
| completion_mask, |
| trainer.beta, |
| advantages, |
| ) |
| return loss, completion_length, mean_kl |
| pass |
|
|
@torch.compile(dynamic = True, fullgraph = True, options = torch_compile_options)
def grpo_compute_loss_slow(
    ref,
    new,
    old,
    sampling_per_token_logps,
    input_ids,
    mask,
    beta,
    advantages,
    **kwargs
):
    """
    Compute a GRPO-family loss from per-token log-probabilities.

    Args:
        ref: Reference log-probs; only read when `beta != 0.0` (KL term).
        new: Current-policy log-probs; the gradient flows through this.
        old: Old-policy log-probs, or `None` (then `new.detach()` is used for
            the ratio). Must be a tensor when the vLLM importance-sampling
            path is active, because that path reads `old` directly.
        sampling_per_token_logps: Sampler log-probs, or `None`.
        input_ids: Not used by the computation; kept for interface
            compatibility.
        mask: Completion mask; reduced over dim 1 / -1
            (presumably (batch, completion_len) - TODO confirm).
        beta: KL coefficient; `0.0` disables the KL term.
        advantages: Per-sequence advantages; a 1-D tensor is unsqueezed to a
            column.
        **kwargs: Optional settings read with defaults: `loss_type`,
            `epsilon_low`, `epsilon_high`, `max_completion_length`, `delta`,
            `importance_sampling_level`, `num_items_in_batch`,
            `current_gradient_accumulation_steps`, `num_processes`,
            `use_vllm`, `vllm_importance_sampling_cap`, `get_sapo_token_loss`,
            `sapo_temperature_pos`, `sapo_temperature_neg`,
            `get_off_policy_mask`, `off_policy_mask_threshold`.

    Returns:
        `(loss, completion_length, mean_kl, delta, flat_is_ratio, coef_1)`.
        `delta` and `flat_is_ratio` are empty tensors unless `use_vllm` is set
        and `sampling_per_token_logps` is given.

    Raises:
        ValueError: Unknown `importance_sampling_level` or `loss_type`.
        Exception: `loss_type == "sapo"` without `get_sapo_token_loss`.
    """
    loss_type = kwargs.get("loss_type", "grpo")
    epsilon_low = kwargs.get("epsilon_low", 0.2)
    epsilon_high = kwargs.get("epsilon_high", 0.2)
    max_completion_length = kwargs.get("max_completion_length", 8192)
    delta = kwargs.get("delta", None)
    importance_sampling_level = kwargs.get("importance_sampling_level", "token")
    num_items_in_batch = kwargs.get("num_items_in_batch", None)
    current_gradient_accumulation_steps = kwargs.get("current_gradient_accumulation_steps", 1)
    num_processes = kwargs.get("num_processes", 1)
    use_vllm = kwargs.get("use_vllm", False)
    vllm_importance_sampling_cap = kwargs.get("vllm_importance_sampling_cap", 2.0)
    get_sapo_token_loss = kwargs.get("get_sapo_token_loss", None)
    sapo_temperature_pos = kwargs.get("sapo_temperature_pos", 1.0)
    sapo_temperature_neg = kwargs.get("sapo_temperature_neg", 1.05)
    get_off_policy_mask = kwargs.get("get_off_policy_mask", None)
    off_policy_mask_threshold = kwargs.get("off_policy_mask_threshold", None)

    # Advantages are per sequence; make them broadcast over the token axis.
    if advantages.dim() == 1:
        advantages = advantages.unsqueeze(1)

    if off_policy_mask_threshold is not None:
        off_policy_mask = get_off_policy_mask(
            advantages=advantages,
            per_token_logps=new,
            old_per_token_logps=old,
            mask=mask,
            off_policy_threshold=off_policy_mask_threshold,
        )

    # Capped importance ratio between the old policy and the sampler.
    with torch.no_grad():
        if use_vllm and sampling_per_token_logps is not None:
            importance_sampling_ratio = torch.exp((old * mask) - sampling_per_token_logps)
            importance_sampling_ratio = torch.clamp(
                importance_sampling_ratio, max=vllm_importance_sampling_cap
            )

    # Without old log-probs the ratio is 1 in value but still carries gradient.
    if old is not None:
        log_ratio = new - old
    else:
        log_ratio = new - new.detach()

    if importance_sampling_level == "token":
        log_importance_weights = log_ratio
    elif importance_sampling_level == "sequence":
        log_importance_weights = (log_ratio * mask).sum(-1) / mask.sum(-1).clamp(min=1.0)
        log_importance_weights = log_importance_weights.unsqueeze(-1)
    else:
        raise ValueError(
            f"Unknown importance sampling level: {importance_sampling_level}. Possible values are 'token' "
            "and 'sequence'."
        )

    coef_1 = torch.exp(log_importance_weights)

    if beta != 0.0:
        kl_i = torch.exp(ref - new) - (ref - new) - 1.0
    else:
        # No KL term: a zero placeholder so the KL metric below is still defined.
        if importance_sampling_level == "sequence":
            kl_i = new.new_zeros(new.size(0), 1)
        else:
            kl_i = torch.zeros_like(new)

    if loss_type == "cispo":
        clamped_ratios = torch.clamp(coef_1, max=epsilon_high).detach()
        loss_i = -clamped_ratios * advantages * new
    elif loss_type in ["grpo", "bnpo", "dr_grpo", "dapo"]:
        coef_2 = torch.clamp(coef_1, 1 - epsilon_low, 1 + epsilon_high)

        if delta is not None:
            loss_1 = torch.clamp(coef_1, max=delta) * advantages
        else:
            loss_1 = coef_1 * advantages
        loss_2 = coef_2 * advantages
        loss_i = -torch.min(loss_1, loss_2)
    elif loss_type == "sapo":
        if get_sapo_token_loss is None:
            raise Exception("sapo is only available in TRL 0.26.0+")
        loss_i = torch.empty_like(coef_1)
        positive_advantages_mask = advantages.repeat([1, coef_1.shape[1]]) > 0

        if coef_1[positive_advantages_mask].numel() != 0:
            loss_i[positive_advantages_mask] = get_sapo_token_loss(
                coef_1[positive_advantages_mask], sapo_temperature_pos
            )
        if coef_1[~positive_advantages_mask].numel() != 0:
            loss_i[~positive_advantages_mask] = get_sapo_token_loss(
                coef_1[~positive_advantages_mask], sapo_temperature_neg
            )
        loss_i = -loss_i * advantages
    else:
        raise ValueError(f"Unknown loss type: {loss_type}")

    if off_policy_mask_threshold is not None:
        loss_i = loss_i * off_policy_mask

    # `logp_delta` is the diagnostic returned in the `delta` slot; it is kept
    # separate from the `delta` clip bound read from kwargs above.
    if use_vllm and sampling_per_token_logps is not None:
        loss_i = loss_i * importance_sampling_ratio

        with torch.no_grad():
            logp_delta = torch.abs(old - sampling_per_token_logps)
            logp_delta = logp_delta * mask
            flat_is_ratio = importance_sampling_ratio * mask
    else:
        logp_delta = torch.tensor([]).detach()
        flat_is_ratio = torch.tensor([]).detach()

    if beta != 0.0:
        loss_i = loss_i + beta * kl_i

    mask = mask.to(torch.float32)
    n_mask_per_reward = mask.sum(1)

    if loss_type in ["grpo", "sapo"]:
        loss = ((loss_i * mask).sum(-1) / mask.sum(-1).clamp(min=1.0)).mean()
        loss = loss / current_gradient_accumulation_steps
    elif loss_type == "bnpo":
        loss = (loss_i * mask).sum() / mask.sum().clamp(min=1.0)
        loss = loss / current_gradient_accumulation_steps
    elif loss_type == "dr_grpo":
        loss = (loss_i * mask).sum() / (loss_i.size(0) * max_completion_length)
        loss = loss / current_gradient_accumulation_steps
    elif loss_type in ["cispo", "dapo"]:
        normalizer = num_items_in_batch/ num_processes
        loss = (loss_i * mask).sum() / normalizer
    else:
        raise ValueError(f"Unknown loss type: {loss_type}")

    def masked_batch_mean(x):
        # Logging-only statistics; no gradient is needed.
        with torch.inference_mode():
            completion_length = n_mask_per_reward.mean()
            if x.shape[1] == 1:
                return completion_length, x.mean()
            else:
                # Clamp so a fully masked row contributes 0 instead of 0/0 = NaN,
                # matching the clamp used for the per-sequence loss reduction.
                mean_kl_per_reward = (x * mask).sum(1) / n_mask_per_reward.clamp(min=1.0)
                mean_kl = mean_kl_per_reward.mean()
                return completion_length, mean_kl

    completion_length, mean_kl = masked_batch_mean(kl_i)
    return loss, completion_length, mean_kl, logp_delta, flat_is_ratio, coef_1
|
|
def grpo_update_SamplingParams(SamplingParams, generation_kwargs, vllm_sampling_params = None):
    """
    Build keyword arguments acceptable to `SamplingParams`.

    Entries of `generation_kwargs` whose key is not a parameter in the
    signature of `SamplingParams` are dropped; the input dict is not mutated.
    When `vllm_sampling_params` is given, each signature parameter it exposes
    as an attribute overrides the filtered value, but only if that attribute
    is a non-empty `list` or `tuple` (exact type match). `None`, scalars and
    empty sequences are ignored.

    Returns:
        A new dict of filtered and overridden keyword arguments.
    """
    accepted_names = inspect.signature(SamplingParams).parameters.keys()

    # Keep only the keys that SamplingParams can actually accept.
    filtered = {
        name: value
        for name, value in generation_kwargs.items()
        if name in accepted_names
    }

    if vllm_sampling_params is None:
        return filtered

    for name in accepted_names:
        if not hasattr(vllm_sampling_params, name):
            continue
        candidate = getattr(vllm_sampling_params, name)
        if candidate is None:
            continue
        if type(candidate) not in (list, tuple,):
            continue
        if len(candidate) == 0:
            continue
        filtered[name] = candidate
    return filtered
|
|
| def _get_inference_mode_context_manager(model: torch.nn.Module): |
| """ |
| If the state dict was quantized using torchao, we will run into |
| the following error when calling ops like aten.t() in inference mode. |
| This is a bug in PyTorch that affects all tensor subclasses. |
| |
| Cannot set version_counter for inference tensor |
| |
| For now, we work around this issue by using `torch.no_grad()` in this case. |
| See https://github.com/pytorch/pytorch/issues/164872 for more details. |
| Otherwise, just return `torch.inference_mode()`. |
| """ |
| torchao_config = getattr(model, "torchao_config", None) |
| if torchao_config is not None and torchao_config.qat_scheme is None: |
| return torch.no_grad() |
| else: |
| return torch.inference_mode() |
|
|
def vLLMSamplingParams(**kwargs):
    """
    Create a vLLM `SamplingParams` and remember the kwargs it was built from.

    `vllm` is imported lazily inside the function. The keyword arguments are
    forwarded unchanged to `SamplingParams` and the same dict is stored on the
    returned object as `_set_kwargs`.
    """
    from vllm import SamplingParams

    params = SamplingParams(**kwargs)
    # Record exactly which options the caller passed.
    params._set_kwargs = kwargs
    return params
| @dataclass |
| class UnslothGRPOConfig(GRPOConfig): |
| """ |
| |
| Configuration class for the [`GRPOTrainer`]. |
| |
| This class includes only the parameters that are specific to GRPO training. For a full list of training arguments, |
| please refer to the [`~transformers.TrainingArguments`] documentation. Note that default values in this class may |
| differ from those in [`~transformers.TrainingArguments`]. |
| |
| Using [`~transformers.HfArgumentParser`] we can turn this class into |
| [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the |
| command line. |
| |
| Parameters: |
| > Parameters that control the model and reference model |
| |
| model_init_kwargs (`str`, `dict[str, Any]`, *optional*): |
| Keyword arguments for [`~transformers.AutoModelForCausalLM.from_pretrained`], used when the `model` |
| argument of the [`GRPOTrainer`] is provided as a string. |
| disable_dropout (`bool`, *optional*, defaults to `False`): |
| Whether to disable dropout in the model. This is useful for training with a reference model, as it prevents |
| the model from generating different logprobs for the same input. |
| |
| > Parameters that control the data preprocessing |
| |
| remove_unused_columns (`bool`, *optional*, defaults to `False`): |
| Whether to only keep the column `"prompt"` in the dataset. If you use a custom reward function that |
| requires any column other than `"prompts"` and `"completions"`, you should keep this to `False`. |
| max_prompt_length (`int` or `None`, *optional*, defaults to `512`): |
| Maximum length of the prompt. If the prompt is longer than this value, it will be truncated left. |
| num_generations (`int` or `None`, *optional*, defaults to `8`): |
| Number of generations per prompt to sample. The effective batch size (num_processes * per_device_batch_size |
| * gradient_accumulation_steps) must be evenly divisible by this value. |
| max_completion_length (`int` or `None`, *optional*, defaults to `256`): |
| Maximum length of the generated completion. |
| ds3_gather_for_generation (`bool`, *optional*, defaults to `True`): |
| This setting applies to DeepSpeed ZeRO-3. If enabled, the policy model weights are gathered for generation, |
| improving generation speed. However, disabling this option allows training models that exceed the VRAM |
| capacity of a single GPU, albeit at the cost of slower generation. Disabling this option is not compatible |
| with vLLM generation. |
| shuffle_dataset (`bool`, *optional*, defaults to `True`): |
| Whether to shuffle the training dataset. |
| |
| > Parameters that control generation |
| |
| generation_batch_size: (`int`, *optional*): |
| Batch size to use for generation. If `None`, it defaults to the effective training batch size: |
| `per_device_train_batch_size * num_processes * steps_per_generation`. In other words, there is one |
| generation batch processed per optimization step. Mutually exclusive with `steps_per_generation`. |
| steps_per_generation: (`int`, *optional*): |
| Number of steps per generation. If `None`, it defaults to `gradient_accumulation_steps`. Mutually exclusive |
| with `generation_batch_size`. |
| temperature (`float`, defaults to `1.0`): |
| Temperature for sampling. The higher the temperature, the more random the completions. |
| top_p (`float`, *optional*, defaults to `1.0`): |
| Float that controls the cumulative probability of the top tokens to consider. Must be in (0, 1]. Set to |
| `1.0` to consider all tokens. |
| top_k (`int`, *optional*): |
| Number of highest probability vocabulary tokens to keep for top-k-filtering. If `None`, top-k-filtering is |
| disabled and all tokens are considered. |
| min_p (`float`, *optional*): |
| Minimum token probability, which will be scaled by the probability of the most likely token. It must be a |
| value between `0.0` and `1.0`. Typical values are in the `0.01-0.2` range. |
| repetition_penalty (`float`, *optional*, defaults to `1.0`): |
| Float that penalizes new tokens based on whether they appear in the prompt and the generated text so far. |
| Values > `1.0` encourage the model to use new tokens, while values < `1.0` encourage the model to repeat |
| tokens. |
| use_transformers_paged (`bool`, *optional*, defaults to `False`): |
| Whether to use the `transformers` paged implementation for generation. If set to `True`, the `transformers` |
| paged implementation will be used for generation instead of the default padded implementation. This |
| parameter is only effective when `use_vllm` is set to `False`. |
| cache_implementation (`str`, *optional*): |
| Implementation of the cache method for faster generation when `use_vllm` is set to `False`. |
| generation_kwargs (`dict[str, Any]`, *optional*): |
| Additional keyword arguments to pass to [`~transformers.GenerationConfig`] (if using transformers) or |
| `SamplingParams` (if using vLLM) when sampling completions. This can be used to further customize the |
| generation behavior, such as setting `suppress_tokens`, `num_beams`, etc. If it contains keys that conflict |
| with the other generation parameters (like `min_p`, `top_p`, etc.), they will override them. |
| |
| > Parameters that control generation acceleration powered by vLLM |
| |
| use_vllm (`bool`, *optional*, defaults to `False`): |
| Whether to use vLLM for generating completions. If set to `True`, the trainer will use vLLM for generation |
| instead of the default model.generate(). Requires `vllm` to be installed. |
| vllm_mode (`str`, *optional*, defaults to `"server"`): |
| Mode to use for vLLM integration when `use_vllm` is set to `True`. Must be one of `"server"` or |
| `"colocate"`. |
| |
| - `"server"`: The trainer will send generation requests to a separate vLLM server. Make sure a TRL vLLM |
| server is running (start with `trl vllm-serve`). |
| - `"colocate"`: vLLM will run in the same process and share the training GPUs. This avoids the need for a |
| separate server but may cause resource contention with training. |
| vllm_model_impl (`str`, *optional*, defaults to `"vllm"`): |
| Model implementation to use for vLLM. Must be one of `"transformers"` or `"vllm"`. `"transformers"`: Use |
| the `transformers` backend for model implementation. `"vllm"`: Use the `vllm` library for model |
| implementation. |
| vllm_guided_decoding_regex (`str`, *optional*): |
| Regex for vLLM guided decoding. If `None` (default), guided decoding is disabled. |
| |
| > Parameters that control the vLLM server (only used when `vllm_mode` is `"server"`) |
| |
| vllm_server_base_url (`str`, *optional*): |
| Base URL for the vLLM server (e.g., `"http://localhost:8000"`). If provided, `vllm_server_host` and |
| `vllm_server_port` are ignored. |
| vllm_server_host (`str`, *optional*, defaults to `"0.0.0.0"`): |
| Host of the vLLM server to connect to. Ignored if `vllm_server_base_url` is provided. |
| vllm_server_port (`int`, *optional*, defaults to `8000`): |
| Port of the vLLM server to connect to. Ignored if `vllm_server_base_url` is provided. |
| vllm_server_timeout (`float`, *optional*, defaults to `240.0`): |
| Total timeout duration in seconds to wait for the vLLM server to be up. If the server is not up after the |
| timeout, a `ConnectionError` is raised. |
| |
| > Parameters that control colocated vLLM execution (only used when `vllm_mode` is `"colocate"`) |
| |
| vllm_gpu_memory_utilization (`float`, *optional*, defaults to `0.3`): |
| Control the GPU memory utilization for vLLM. This setting only applies when `vllm_mode` is set to |
| `"colocate"`. If you are using `vllm_mode="server"`, this parameter must be passed separately when |
| launching the vLLM server via the `--vllm_gpu_memory_utilization` flag. |
| vllm_tensor_parallel_size (`int`, *optional*, defaults to `1`): |
| Control the tensor parallel size for vLLM. This setting only applies when `vllm_mode` is set to |
| `"colocate"`. If you are using `vllm_mode="server"`, this parameter must be passed separately when |
| launching the vLLM server via the `--vllm_tensor_parallel_size` flag. |
| vllm_enable_sleep_mode (`bool`, *optional*, defaults to `False`): |
| Whether to enable sleep mode for vLLM. If `True`, vLLM will sleep during the optimization step and woken |
| for weight sync and generation. |
| |
| > Parameters that control the training |
| |
| beta (`float`, *optional*, defaults to `0.0`): |
| KL coefficient. If `0.0` (default), the reference model is not loaded, reducing memory usage and improving |
| training speed. |
| num_iterations (`int`, *optional*, defaults to `1`): |
| Number of iterations per batch (denoted as μ in the algorithm). |
| epsilon (`float`, *optional*, defaults to `0.2`): |
| Epsilon value for clipping. |
| delta (`float`, *optional*): |
| Enables the upper clipping bound in two-sided GRPO loss when set to a float. If `None` (default), standard |
| GRPO clipping is used. Recommended to be greater than `1 + ε` when enabled. This method is introduced in |
| the [INTELLECT-2 tech report](https://huggingface.co/papers/2505.07291). |
| epsilon_high (`float`, *optional*): |
| Upper-bound epsilon value for clipping. If not specified, it defaults to the same value as the lower-bound |
| specified in argument `epsilon`. Paper [DAPO](https://huggingface.co/papers/2503.14476) recommends `0.28`. |
| importance_sampling_level (`str`, *optional*, defaults to `"token"`): |
| Controls whether importance sampling ratios are computed at the `"token"` or `"sequence"` level. `"token"` |
| keeps the raw per-token log-probability ratios (one weight per token). `"sequence"` averages the |
| log-probability ratios across valid tokens to produce a single ratio per sequence. The [GSPO |
| paper](https://huggingface.co/papers/2507.18071) shows that sequence-level sampling often yields more |
| stable training and better alignment with sequence-level rewards. |
| reward_weights (`list[float]`, *optional*): |
| Weights for each reward function. Must match the number of reward functions. If `None`, all rewards are |
| weighted equally with weight `1.0`. |
| scale_rewards (`str` or `bool`, *optional*, defaults to `"group"`): |
| Specifies the scaling strategy for rewards. Supported values are: |
| |
| - `True` or `"group"` (default): rewards are scaled by the standard deviation within each group, ensuring |
| unit variance within a group. |
| - `"batch"`: rewards are scaled by the standard deviation across the entire batch, as recommended in the |
| [PPO Lite paper](https://huggingface.co/papers/2508.08221). |
| - `False` or `"none"`: no scaling is applied. The [Dr. GRPO |
| paper](https://huggingface.co/papers/2503.20783) recommends not scaling rewards, as scaling by the |
| standard deviation introduces a question-level difficulty bias. |
| loss_type (`str`, *optional*, defaults to `"dapo"`): |
| Specifies the loss formulation to use. Supported values are: |
| |
| - `"grpo"`: Aggregates token-level losses by normalizing over sequence length. Not recommended due to |
| length bias—this approach tends to prefer shorter completions with positive advantages and longer ones |
| with negative advantages. |
| - `"dr_grpo"`: Aggregates token-level losses by normalizing with a global constant. This method was |
| introduced in the [Dr. GRPO paper](https://huggingface.co/papers/2503.20783) to eliminate length bias. |
| The value of the constant corresponds to `max_completion_length`. |
| - `"dapo"` (default): Aggregates token-level losses by normalizing with the number of active token in the |
| global accumulated batch. This method was introduced in the [DAPO |
| paper](https://huggingface.co/papers/2503.14476) to eliminate length bias. |
| - `"bnpo"`: Aggregates token-level losses by normalizing with the number of active token in the local |
| batch. Note that normalization is performed over the local batch only, so results may slightly vary |
| depending on the local batch size, despite a constant effective batch size. When using |
| `per_device_train_batch_size==1`, the loss is equivalent to the GRPO loss. |
| mask_truncated_completions (`bool`, *optional*, defaults to `False`): |
| When enabled, truncated completions are excluded from the loss calculation, preventing them from being |
| incorrectly penalized and introducing noise during training. According to the |
| [DAPO](https://huggingface.co/papers/2503.14476) paper, this is a good practice for training stability. |
| sync_ref_model (`bool`, *optional*, defaults to `False`): |
| Whether to synchronize the reference model with the active model every `ref_model_sync_steps` steps, using |
| the `ref_model_mixup_alpha` parameter. This synchronization originates from the |
| [TR-DPO](https://huggingface.co/papers/2404.09656) paper. |
| ref_model_mixup_alpha (`float`, *optional*, defaults to `0.6`): |
| α parameter from the [TR-DPO](https://huggingface.co/papers/2404.09656) paper, which controls the mix |
| between the current policy and the previous reference policy during updates. The reference policy is |
| updated according to the equation: `π_ref = α * π_θ + (1 - α) * π_ref_prev`. To use this parameter, you |
| must set `sync_ref_model=True`. |
| ref_model_sync_steps (`int`, *optional*, defaults to `512`): |
| τ parameter from the [TR-DPO](https://huggingface.co/papers/2404.09656) paper, which determines how |
| frequently the current policy is synchronized with the reference policy. To use this parameter, you must |
| set `sync_ref_model=True`. |
| top_entropy_quantile (`float`, *optional*, defaults to `1.0`): |
| ρ parameter from [Beyond the 80/20 Rule](https://huggingface.co/papers/2506.01939). Keeps in the policy |
| loss term only the top-ρ quantile of tokens by entropy of the probability distribution at each sequence |
| position, improving results. Range: `[0.0-1.0]`. A value of `0.0` masks all but the highest entropy token; |
| `1.0` keeps all tokens. The paper recommends a value of `0.2`. If used with |
| `mask_truncated_completions=True`, only tokens from non-truncated completions are considered. |
| use_liger_loss (`bool`, *optional*, defaults to `False`): |
| Whether to use the Liger GRPO loss. |
| vllm_importance_sampling_correction (`bool`, *optional*, defaults to `True`): |
| Whether to apply Truncated Importance Sampling (TIS) between vLLM completion logprobs and recomputed |
| logprobs. [Your Efficient RL Framework Secretly Brings You Off-Policy RL |
| Training](https://fengyao.notion.site/off-policy-rl) highlights that using a separate generation framework |
| (such as vLLM) can introduce off-policy effects due to subtle implementation differences between generation |
| and training backends. TIS is proposed as a remedy for this issue. |
| vllm_importance_sampling_cap (`float`, *optional*, defaults to `2.0`): |
| Truncation parameter C for Truncated Importance Sampling (TIS). This sets an upper bound on the importance |
| sampling ratio, improving training stability. |
| |
| > Parameters that control the logging |
| |
| log_completions (`bool`, *optional*, defaults to `False`): |
| Whether to log a sample of (prompt, completion) pairs every `logging_steps` steps. If `rich` is installed, |
| it prints the sample. If `wandb` logging is enabled, it logs it to `wandb`. |
| num_completions_to_print (`int`, *optional*): |
| Number of completions to print with `rich`. If `None`, all completions are logged. |
| wandb_log_unique_prompts (`bool`, *optional*, defaults to `False`): |
| Whether to log unique prompts in wandb. If `True`, only unique prompts are logged. If `False`, all prompts |
| are logged. |
| |
| """ |
| vllm_sampling_params: Optional[Any] = field( |
| default = None, |
| metadata = {'help': 'vLLM SamplingParams'}, |
| ) |
| unsloth_num_chunks : Optional[int] = field( |
| default = -1, |
| metadata = {'help': 'Chunk size to reduce memory usage. -1 is most efficient.'}, |
| ) |
| unsloth_logit_chunk_multiplier : Optional[int] = field( |
| default = None, |
| metadata = {'help': 'Multiplier for chunked logit computations.'}, |
| ) |
| unsloth_grpo_mini_batch : Optional[int] = field( |
| default = None, |
| metadata = {'help': 'Mini batch size for GRPO hidden state accumulation. Default is None unless user defines it.'}, |
| ) |
| |
| def __init__( |
| self, |
| output_dir = None, |
| per_device_train_batch_size = 4, |
| num_train_epochs = 3.0, |
| max_steps = -1, |
| learning_rate = 5e-05, |
| lr_scheduler_type = 'linear', |
| lr_scheduler_kwargs = None, |
| warmup_steps = 0.1, |
| optim = 'adamw_8bit', |
| optim_args = None, |
| weight_decay = 0.01, |
| adam_beta1 = 0.9, |
| adam_beta2 = 0.999, |
| adam_epsilon = 1e-08, |
| optim_target_modules = None, |
| gradient_accumulation_steps = 2, |
| average_tokens_across_devices = True, |
| max_grad_norm = 1.0, |
| label_smoothing_factor = 0.0, |
| bf16 = False, |
| fp16 = False, |
| bf16_full_eval = False, |
| fp16_full_eval = False, |
| tf32 = None, |
| gradient_checkpointing = True, |
| gradient_checkpointing_kwargs = None, |
| torch_compile = False, |
| torch_compile_backend = None, |
| torch_compile_mode = None, |
| use_liger_kernel = False, |
| liger_kernel_config = None, |
| use_cache = False, |
| neftune_noise_alpha = None, |
| torch_empty_cache_steps = 250, |
| auto_find_batch_size = False, |
| logging_strategy = 'steps', |
| logging_steps = 1, |
| logging_first_step = False, |
| log_on_each_node = True, |
| logging_nan_inf_filter = False, |
| include_num_input_tokens_seen = False, |
| log_level = 'passive', |
| log_level_replica = 'warning', |
| disable_tqdm = None, |
| report_to = 'none', |
| run_name = None, |
| project = 'huggingface', |
| trackio_space_id = 'trackio', |
| eval_strategy = 'no', |
| eval_steps = None, |
| eval_delay = 0, |
| per_device_eval_batch_size = 4, |
| prediction_loss_only = False, |
| eval_on_start = False, |
| eval_do_concat_batches = True, |
| eval_use_gather_object = False, |
| eval_accumulation_steps = 2, |
| batch_eval_metrics = False, |
| save_only_model = False, |
| save_strategy = 'steps', |
| save_steps = 500, |
| save_on_each_node = False, |
| save_total_limit = None, |
| enable_jit_checkpoint = False, |
| push_to_hub = False, |
| hub_token = None, |
| hub_private_repo = None, |
| hub_model_id = None, |
| hub_strategy = 'every_save', |
| hub_always_push = False, |
| hub_revision = None, |
| load_best_model_at_end = False, |
| metric_for_best_model = None, |
| greater_is_better = None, |
| ignore_data_skip = False, |
| restore_callback_states_from_checkpoint = False, |
| full_determinism = False, |
| seed = 3407, |
| data_seed = 3407, |
| use_cpu = False, |
| accelerator_config = None, |
| parallelism_config = None, |
| dataloader_drop_last = False, |
| dataloader_num_workers = 0, |
| dataloader_pin_memory = True, |
| dataloader_persistent_workers = False, |
| dataloader_prefetch_factor = None, |
| remove_unused_columns = False, |
| label_names = None, |
| train_sampling_strategy = 'random', |
| length_column_name = 'length', |
| ddp_find_unused_parameters = None, |
| ddp_bucket_cap_mb = None, |
| ddp_broadcast_buffers = None, |
| ddp_backend = None, |
| ddp_timeout = 1800, |
| fsdp = None, |
| fsdp_config = None, |
| deepspeed = None, |
| debug = '', |
| skip_memory_metrics = True, |
| do_train = False, |
| do_eval = False, |
| do_predict = False, |
| resume_from_checkpoint = None, |
| warmup_ratio = None, |
| logging_dir = None, |
| local_rank = -1, |
| model_init_kwargs = None, |
| disable_dropout = False, |
| max_prompt_length = 512, |
| num_generations = 8, |
| max_completion_length = 256, |
| ds3_gather_for_generation = True, |
| shuffle_dataset = True, |
| generation_batch_size = None, |
| steps_per_generation = None, |
| temperature = 1.0, |
| top_p = 1.0, |
| top_k = None, |
| min_p = None, |
| generation_kwargs = {}, |
| repetition_penalty = 1.0, |
| use_transformers_paged = False, |
| cache_implementation = None, |
| use_vllm = False, |
| vllm_mode = 'colocate', |
| vllm_model_impl = 'vllm', |
| vllm_enable_sleep_mode = False, |
| vllm_guided_decoding_regex = None, |
| vllm_server_base_url = None, |
| vllm_server_host = '0.0.0.0', |
| vllm_server_port = 8000, |
| vllm_server_timeout = 240.0, |
| vllm_gpu_memory_utilization = 0.3, |
| vllm_tensor_parallel_size = 1, |
| beta = 0.001, |
| num_iterations = 1, |
| epsilon = 0.2, |
| delta = None, |
| epsilon_high = None, |
| importance_sampling_level = 'token', |
| reward_weights = None, |
| scale_rewards = 'group', |
| loss_type = 'bnpo', |
| mask_truncated_completions = False, |
| sync_ref_model = False, |
| ref_model_mixup_alpha = 0.6, |
| ref_model_sync_steps = 512, |
| top_entropy_quantile = 1.0, |
| use_liger_loss = False, |
| vllm_importance_sampling_correction = False, |
| vllm_importance_sampling_cap = 2.0, |
| log_completions = False, |
| num_completions_to_print = None, |
| wandb_log_unique_prompts = False, |
| vllm_sampling_params = None, |
| unsloth_num_chunks = -1, |
| unsloth_logit_chunk_multiplier = None, |
| unsloth_grpo_mini_batch = None, |
| |
| **kwargs, |
| ): |
| if learning_rate < 1e-7: print(f'Unsloth: Your learning rate of `{learning_rate}` is too small and less than 1e-7! Consider increasing it, otherwise gradient updates will be close to 0!') |
| if learning_rate > 1: print(f'Unsloth: Your learning rate of `{learning_rate}` is way too larger > 1! Consider decreasing it to 1e-1, otherwise gradient updates will explode!') |
| if num_train_epochs is None: |
| num_train_epochs = 3.0 |
| if output_dir is None and save_strategy == 'steps' and save_steps == 500: |
| output_dir = 'unsloth_training_checkpoints' |
| save_strategy = 'no' |
| if loss_type.lower() == 'dr_grpo': |
| loss_type = 'dr_grpo' |
| elif loss_type.lower() == 'dapo': |
| loss_type = 'dapo' |
| if loss_type.lower() == 'dr_grpo': |
| if scale_rewards == None: |
| scale_rewards = True |
| elif scale_rewards == True: |
| print('Unsloth: The Dr GRPO paper recommends setting `scale_rewards` to False! Will override. Set it to `None` to force False.') |
| scale_rewards = False |
| elif loss_type.lower() == 'dapo': |
| if mask_truncated_completions != True: |
| print('Unsloth: The DAPO paper recommends `mask_truncated_completions = True` - we will set it.') |
| if epsilon_high != 0.28: |
| print('Unsloth: The DAPO paper recommends `epsilon_high = 0.28` - we will set it.') |
| if beta != 0.0: |
| print(f'[WARNING] Unsloth: The DAPO paper recommends setting `beta = 0.0` to remove the KL term - You have set it to {beta}.') |
| mask_truncated_completions = True |
| epsilon_high = 0.28 |
| |
| if steps_per_generation is None and generation_batch_size is None: |
| ga = gradient_accumulation_steps |
| world_size = int(os.environ.get('WORLD_SIZE', '1')) |
| if (ga * world_size * per_device_train_batch_size) % num_generations != 0: |
| print('Unsloth: We now expect `per_device_train_batch_size` * `gradient_accumulation_steps` * `world_size` to be a multiple of `num_generations`.\nWe will change the batch size of ' + str(per_device_train_batch_size) + ' to the `num_generations` of ' + str(num_generations)) |
| per_device_train_batch_size = num_generations |
| |
| if temperature <= 0: |
| raise ValueError('Unsloth: Please set a positive non-zero temperature since your results will be wrong.') |
| elif temperature >= 10: |
| raise ValueError('Unsloth: Please set a positive non-zero temperature less than 10, since sampling will be quite erratic.') |
| |
| if use_vllm and (top_k is None or top_k == 0): top_k = -1 |
| |
| super().__init__( |
| output_dir = output_dir, |
| per_device_train_batch_size = per_device_train_batch_size, |
| num_train_epochs = num_train_epochs, |
| max_steps = max_steps, |
| learning_rate = learning_rate, |
| lr_scheduler_type = lr_scheduler_type, |
| lr_scheduler_kwargs = lr_scheduler_kwargs, |
| warmup_steps = warmup_steps, |
| optim = optim, |
| optim_args = optim_args, |
| weight_decay = weight_decay, |
| adam_beta1 = adam_beta1, |
| adam_beta2 = adam_beta2, |
| adam_epsilon = adam_epsilon, |
| optim_target_modules = optim_target_modules, |
| gradient_accumulation_steps = gradient_accumulation_steps, |
| average_tokens_across_devices = average_tokens_across_devices, |
| max_grad_norm = max_grad_norm, |
| label_smoothing_factor = label_smoothing_factor, |
| bf16 = bf16, |
| fp16 = fp16, |
| bf16_full_eval = bf16_full_eval, |
| fp16_full_eval = fp16_full_eval, |
| tf32 = tf32, |
| gradient_checkpointing = gradient_checkpointing, |
| gradient_checkpointing_kwargs = gradient_checkpointing_kwargs, |
| torch_compile = torch_compile, |
| torch_compile_backend = torch_compile_backend, |
| torch_compile_mode = torch_compile_mode, |
| use_liger_kernel = use_liger_kernel, |
| liger_kernel_config = liger_kernel_config, |
| use_cache = use_cache, |
| neftune_noise_alpha = neftune_noise_alpha, |
| torch_empty_cache_steps = torch_empty_cache_steps, |
| auto_find_batch_size = auto_find_batch_size, |
| logging_strategy = logging_strategy, |
| logging_steps = logging_steps, |
| logging_first_step = logging_first_step, |
| log_on_each_node = log_on_each_node, |
| logging_nan_inf_filter = logging_nan_inf_filter, |
| include_num_input_tokens_seen = include_num_input_tokens_seen, |
| log_level = log_level, |
| log_level_replica = log_level_replica, |
| disable_tqdm = disable_tqdm, |
| report_to = report_to, |
| run_name = run_name, |
| project = project, |
| trackio_space_id = trackio_space_id, |
| eval_strategy = eval_strategy, |
| eval_steps = eval_steps, |
| eval_delay = eval_delay, |
| per_device_eval_batch_size = per_device_eval_batch_size, |
| prediction_loss_only = prediction_loss_only, |
| eval_on_start = eval_on_start, |
| eval_do_concat_batches = eval_do_concat_batches, |
| eval_use_gather_object = eval_use_gather_object, |
| eval_accumulation_steps = eval_accumulation_steps, |
| batch_eval_metrics = batch_eval_metrics, |
| save_only_model = save_only_model, |
| save_strategy = save_strategy, |
| save_steps = save_steps, |
| save_on_each_node = save_on_each_node, |
| save_total_limit = save_total_limit, |
| enable_jit_checkpoint = enable_jit_checkpoint, |
| push_to_hub = push_to_hub, |
| hub_token = hub_token, |
| hub_private_repo = hub_private_repo, |
| hub_model_id = hub_model_id, |
| hub_strategy = hub_strategy, |
| hub_always_push = hub_always_push, |
| hub_revision = hub_revision, |
| load_best_model_at_end = load_best_model_at_end, |
| metric_for_best_model = metric_for_best_model, |
| greater_is_better = greater_is_better, |
| ignore_data_skip = ignore_data_skip, |
| restore_callback_states_from_checkpoint = restore_callback_states_from_checkpoint, |
| full_determinism = full_determinism, |
| seed = seed, |
| data_seed = data_seed, |
| use_cpu = use_cpu, |
| accelerator_config = accelerator_config, |
| parallelism_config = parallelism_config, |
| dataloader_drop_last = dataloader_drop_last, |
| dataloader_num_workers = dataloader_num_workers, |
| dataloader_pin_memory = dataloader_pin_memory, |
| dataloader_persistent_workers = dataloader_persistent_workers, |
| dataloader_prefetch_factor = dataloader_prefetch_factor, |
| remove_unused_columns = remove_unused_columns, |
| label_names = label_names, |
| train_sampling_strategy = train_sampling_strategy, |
| length_column_name = length_column_name, |
| ddp_find_unused_parameters = ddp_find_unused_parameters, |
| ddp_bucket_cap_mb = ddp_bucket_cap_mb, |
| ddp_broadcast_buffers = ddp_broadcast_buffers, |
| ddp_backend = ddp_backend, |
| ddp_timeout = ddp_timeout, |
| fsdp = fsdp, |
| fsdp_config = fsdp_config, |
| deepspeed = deepspeed, |
| debug = debug, |
| skip_memory_metrics = skip_memory_metrics, |
| do_train = do_train, |
| do_eval = do_eval, |
| do_predict = do_predict, |
| resume_from_checkpoint = resume_from_checkpoint, |
| warmup_ratio = warmup_ratio, |
| logging_dir = logging_dir, |
| local_rank = local_rank, |
| model_init_kwargs = model_init_kwargs, |
| disable_dropout = disable_dropout, |
| max_prompt_length = max_prompt_length, |
| num_generations = num_generations, |
| max_completion_length = max_completion_length, |
| ds3_gather_for_generation = ds3_gather_for_generation, |
| shuffle_dataset = shuffle_dataset, |
| generation_batch_size = generation_batch_size, |
| steps_per_generation = steps_per_generation, |
| temperature = temperature, |
| top_p = top_p, |
| top_k = top_k, |
| min_p = min_p, |
| generation_kwargs = generation_kwargs, |
| repetition_penalty = repetition_penalty, |
| use_transformers_paged = use_transformers_paged, |
| cache_implementation = cache_implementation, |
| use_vllm = use_vllm, |
| vllm_mode = vllm_mode, |
| vllm_model_impl = vllm_model_impl, |
| vllm_enable_sleep_mode = vllm_enable_sleep_mode, |
| vllm_guided_decoding_regex = vllm_guided_decoding_regex, |
| vllm_server_base_url = vllm_server_base_url, |
| vllm_server_host = vllm_server_host, |
| vllm_server_port = vllm_server_port, |
| vllm_server_timeout = vllm_server_timeout, |
| vllm_gpu_memory_utilization = vllm_gpu_memory_utilization, |
| vllm_tensor_parallel_size = vllm_tensor_parallel_size, |
| beta = beta, |
| num_iterations = num_iterations, |
| epsilon = epsilon, |
| delta = delta, |
| epsilon_high = epsilon_high, |
| importance_sampling_level = importance_sampling_level, |
| reward_weights = reward_weights, |
| scale_rewards = scale_rewards, |
| loss_type = loss_type, |
| mask_truncated_completions = mask_truncated_completions, |
| sync_ref_model = sync_ref_model, |
| ref_model_mixup_alpha = ref_model_mixup_alpha, |
| ref_model_sync_steps = ref_model_sync_steps, |
| top_entropy_quantile = top_entropy_quantile, |
| use_liger_loss = use_liger_loss, |
| vllm_importance_sampling_correction = vllm_importance_sampling_correction, |
| vllm_importance_sampling_cap = vllm_importance_sampling_cap, |
| log_completions = log_completions, |
| num_completions_to_print = num_completions_to_print, |
| wandb_log_unique_prompts = wandb_log_unique_prompts,**kwargs) |
| self.vllm_sampling_params = vllm_sampling_params |
| self.unsloth_num_chunks = unsloth_num_chunks |
| if unsloth_grpo_mini_batch is not None: |
| if self.generation_batch_size >= unsloth_grpo_mini_batch: |
| self.unsloth_grpo_mini_batch = unsloth_grpo_mini_batch |
| else: |
| raise ValueError( |
| f"Unsloth GRPO mini batch size needs to be less than or equal to the effective generation batch size, " |
| f"which is self.per_device_train_batch_size * gradient_accumulation_steps." |
| ) |
| self.unsloth_logit_chunk_multiplier = unsloth_logit_chunk_multiplier |
| |
|
|
| pass |
|
|
| class _UnslothGRPOTrainer(BaseTrainer): |
| """""" |
|
|
| _tag_names = ["trl", "grpo"] |
| _name = "GRPO" |
| _paper = { |
| "title": "DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models", |
| "id": "2402.03300", |
| |
| "citation": textwrap.dedent("""\ |
| @article{shao2024deepseekmath, |
| title = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}}, |
| author = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. Wu and Daya Guo}, |
| year = 2024, |
| eprint = {arXiv:2402.03300}, |
| } |
| """), |
| } |
|
|
def __init__(
    self,
    model: Union[str, PreTrainedModel],
    reward_funcs: Union[RewardFunc, list[RewardFunc]],
    args: Optional[GRPOConfig] = None,
    train_dataset: Optional[Union[Dataset, IterableDataset]] = None,
    eval_dataset: Optional[Union[Dataset, IterableDataset, dict[str, Union[Dataset, IterableDataset]]]] = None,
    processing_class: Optional[Union[PreTrainedTokenizerBase, ProcessorMixin]] = None,
    reward_processing_classes: Optional[Union[PreTrainedTokenizerBase, list[PreTrainedTokenizerBase]]] = None,
    callbacks: Optional[list[TrainerCallback]] = None,
    optimizers: tuple[Optional[torch.optim.Optimizer], Optional[torch.optim.lr_scheduler.LambdaLR]] = (None, None),
    peft_config: Optional["PeftConfig"] = None,
):
    """Set up the GRPO trainer: policy model, reward functions, generation backend and logging state.

    Args:
        model: Either a model identifier string (loaded through `AutoConfig` and the first
            architecture the config lists) or an already instantiated model.
        reward_funcs: One reward function or a list of them. String entries are loaded with
            `AutoModelForSequenceClassification` (`num_labels=1`); other entries are used as given.
        args: Training configuration. When `None`, a `GRPOConfig` named `"<model name>-GRPO"` is created.
        train_dataset: Forwarded to the parent trainer. Iterable datasets are rejected.
        eval_dataset: Forwarded to the parent trainer. Iterable datasets (also inside a dict) are rejected.
        processing_class: Tokenizer or processor. When `None`, it is loaded with `AutoProcessor` from
            `model.config._name_or_path` using left truncation.
        reward_processing_classes: One entry per reward function (a single object is wrapped in a list).
            For `PreTrainedModel` rewards, a missing entry is loaded with `AutoTokenizer`.
        callbacks: Forwarded to the parent trainer.
        optimizers: Forwarded to the parent trainer.
        peft_config: Kept in the signature for interface compatibility; this method does not read it.

    Raises:
        ValueError: Unsupported `dtype` in `model_init_kwargs`, a reward-weight or reward-processing-class
            count that does not match the reward functions, a `vllm_tensor_parallel_size` that does not
            divide the number of processes, or an unknown `vllm_mode`.
        TypeError: `processing_class` is neither a tokenizer nor a processor.
        NotImplementedError: Iterable datasets, or Liger loss combined with entropy masking or
            non-token importance sampling.
        ImportError: Liger loss or vLLM requested while the package is unavailable.
    """
    # Unsloth: a model that already carries a vLLM engine generates through it in colocate mode.
    # `hasattr(None, ...)` is False, so this is skipped when `args` was not supplied.
    if hasattr(model, 'vllm_engine') and hasattr(args, 'use_vllm'):
        if (getattr(args, 'use_vllm', False) == False):
            args.use_vllm = True
        args.vllm_mode='colocate'
        if os.environ.get('UNSLOTH_VLLM_STANDBY', '0') == '1':
            args.vllm_enable_sleep_mode=True

    # Default configuration, named after the last path component of the model name.
    if args is None:
        model_name = model if isinstance(model, str) else model.config._name_or_path
        model_name = model_name.split("/")[-1]
        args = GRPOConfig(f"{model_name}-GRPO")

    # Policy model: load it from its identifier, or take the instance as given.
    model_init_kwargs = args.model_init_kwargs or {}
    if isinstance(model, str):
        model_id = model
        dtype = model_init_kwargs.get("dtype")
        if isinstance(dtype, torch.dtype) or dtype == "auto" or dtype is None:
            pass  # usable as-is
        elif isinstance(dtype, str):
            # A dtype name such as "float32" is resolved to the matching `torch` attribute.
            dtype = getattr(torch, dtype)
            model_init_kwargs["dtype"] = dtype
        else:
            raise ValueError(
                "Invalid `dtype` passed to `GRPOConfig`. Expected either 'auto' or a string representing "
                f"a `torch.dtype` (e.g., 'float32'), but got {dtype}."
            )

        config = AutoConfig.from_pretrained(model_id)
        architecture = getattr(transformers, config.architectures[0])
        model = architecture.from_pretrained(model_id, **model_init_kwargs)
    else:
        model_id = model.config._name_or_path
        if args.model_init_kwargs is not None:
            logger.warning(
                "You passed `model_init_kwargs` to the `GRPOConfig`, but your model is already instantiated. "
                "The `model_init_kwargs` will be ignored."
            )

    # Remember which keyword arguments the (base) model's `forward` accepts; other methods
    # consult this before passing optional arguments such as `logits_to_keep`.
    self.model_kwarg_keys = (
        inspect.signature(model.forward).parameters.keys()
        if not hasattr(model, "get_base_model")
        else inspect.signature(model.get_base_model().forward).parameters.keys()
    )

    # Processing class
    if processing_class is None:
        processing_class = AutoProcessor.from_pretrained(model.config._name_or_path, truncation_side="left")

    # Locate the underlying tokenizer, whether a processor or a bare tokenizer was given.
    if isinstance(processing_class, ProcessorMixin):
        tokenizer = processing_class.tokenizer
    elif isinstance(processing_class, PreTrainedTokenizerBase):
        tokenizer = processing_class
    else:
        raise TypeError("The `processing_class` must be either a `PreTrainedTokenizerBase` or a `ProcessorMixin`")

    # Fall back to the EOS token when the tokenizer defines no pad token.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    self.pad_token = tokenizer.pad_token
    self.pad_token_id = tokenizer.pad_token_id
    self.eos_token_id = tokenizer.eos_token_id

    # Reward functions: load string entries as sequence classifiers and record a display name for each.
    if not isinstance(reward_funcs, list):
        reward_funcs = [reward_funcs]
    self.reward_func_names = []
    for i, reward_func in enumerate(reward_funcs):
        if isinstance(reward_func, str):
            reward_funcs[i] = AutoModelForSequenceClassification.from_pretrained(
                reward_func, num_labels=1, **model_init_kwargs
            )
        if isinstance(reward_funcs[i], nn.Module):
            self.reward_func_names.append(reward_funcs[i].config._name_or_path.split("/")[-1])
        else:
            self.reward_func_names.append(reward_funcs[i].__name__)
    self.reward_funcs = reward_funcs

    # Reward weights: one per reward function, defaulting to all ones.
    if args.reward_weights is not None:
        if len(args.reward_weights) != len(reward_funcs):
            raise ValueError(
                f"Number of reward weights ({len(args.reward_weights)}) must match number of reward "
                f"functions ({len(reward_funcs)})"
            )
        self.reward_weights = torch.tensor(args.reward_weights, dtype=torch.float32)
    else:
        self.reward_weights = torch.ones(len(reward_funcs), dtype=torch.float32)

    # Reward processing classes: normalise to a list aligned with `reward_funcs`.
    if reward_processing_classes is None:
        reward_processing_classes = [None] * len(reward_funcs)
    elif not isinstance(reward_processing_classes, list):
        reward_processing_classes = [reward_processing_classes]
    if len(reward_processing_classes) != len(reward_funcs):
        raise ValueError(
            f"The number of reward processing classes ({len(reward_processing_classes)}) must match the number of "
            f"reward functions ({len(reward_funcs)})."
        )

    for i, (reward_processing_class, reward_func) in enumerate(zip(reward_processing_classes, reward_funcs)):
        if isinstance(reward_func, PreTrainedModel):
            if reward_processing_class is None:
                reward_processing_class = AutoTokenizer.from_pretrained(reward_func.config._name_or_path)
            if reward_processing_class.pad_token_id is None:
                reward_processing_class.pad_token = reward_processing_class.eos_token
            # Keep the reward model's pad token id in sync with its tokenizer.
            reward_func.config.pad_token_id = reward_processing_class.pad_token_id
            reward_processing_classes[i] = reward_processing_class

    self.reward_processing_classes = reward_processing_classes

    # Copy the training arguments this trainer reads frequently.
    self.max_prompt_length = args.max_prompt_length
    self.max_completion_length = args.max_completion_length
    self.num_generations = args.num_generations
    self.temperature = args.temperature
    self.top_p = args.top_p
    self.top_k = args.top_k
    self.min_p = args.min_p
    self.repetition_penalty = args.repetition_penalty
    self.use_transformers_paged = args.use_transformers_paged
    self.use_vllm = args.use_vllm
    self.vllm_mode = args.vllm_mode
    self.vllm_gpu_memory_utilization = args.vllm_gpu_memory_utilization
    self.vllm_tensor_parallel_size = args.vllm_tensor_parallel_size
    self.vllm_importance_sampling_correction = args.vllm_importance_sampling_correction
    self.vllm_importance_sampling_cap = args.vllm_importance_sampling_cap
    self.use_liger_loss = args.use_liger_loss
    self.loss_type = args.loss_type
    self.scale_rewards = args.scale_rewards
    self.importance_sampling_level = args.importance_sampling_level
    self.mask_truncated_completions = args.mask_truncated_completions
    self.top_entropy_quantile = args.top_entropy_quantile
    if self.use_liger_loss and self.top_entropy_quantile < 1.0:
        raise NotImplementedError(
            "Liger Kernels don't currently support masking token positions based on entropy."
        )
    if self.use_liger_loss and not self.importance_sampling_level == "token":
        # The first literal ends with a space so the two sentences do not run together.
        raise NotImplementedError(
            "Liger Kernels currently only support token-level importance sampling. Please set "
            "`importance_sampling_level` to 'token'."
        )

    # Datasets
    self.shuffle_dataset = args.shuffle_dataset

    if (
        isinstance(train_dataset, IterableDataset)
        or isinstance(eval_dataset, IterableDataset)
        or (
            isinstance(eval_dataset, dict) and any(isinstance(ds, IterableDataset) for ds in eval_dataset.values())
        )
    ):
        raise NotImplementedError(
            "Iterable datasets are not yet supported in GRPOTrainer. Please use a standard dataset instead."
        )

    # Multi-step settings; `epsilon_high` falls back to `epsilon` when unset.
    self.num_iterations = args.num_iterations
    self.epsilon_low = args.epsilon
    self.epsilon_high = args.epsilon_high if args.epsilon_high is not None else args.epsilon
    # Step counter and batch buffer, both starting empty.
    self._step = 0
    self._buffered_inputs = None

    # Flag the "estimate_tokens" warning as already issued on the model
    # (presumably to keep the parent trainer from emitting it; confirm against the parent).
    model.warnings_issued["estimate_tokens"] = True

    super().__init__(
        model=model,
        args=args,
        data_collator=identity,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        processing_class=processing_class,
        callbacks=callbacks,
        optimizers=optimizers,
        # A placeholder, non-None value: as the string itself says, it is there to disable
        # the parent's loss scaling, not to be called.
        compute_loss_func="non-None value to disable scaling",
    )

    # Reference model: only built when the KL term is active and the model is not a PEFT model.
    self.beta = args.beta
    if self.beta == 0.0:
        self.ref_model = None
    elif is_peft_model(model):
        self.ref_model = None
    else:
        config = AutoConfig.from_pretrained(model_id)
        architecture = getattr(transformers, config.architectures[0])
        self.ref_model = architecture.from_pretrained(model_id, **model_init_kwargs)

    # Disable dropout in the policy and, when present, the reference model.
    if args.disable_dropout:
        disable_dropout_in_model(model)
        if self.ref_model is not None:
            disable_dropout_in_model(self.ref_model)

    # Liger loss
    if self.use_liger_loss:
        if not is_liger_kernel_available():
            raise ImportError(
                "Liger is required to use `liger_loss` as the GRPO loss. Run `pip install liger-kernel`."
            )
        self._forward_redirection = _ForwardRedirection()

        self.liger_grpo_loss = LigerFusedLinearGRPOLoss(
            beta=self.beta,
            epsilon_low=self.epsilon_low,
            epsilon_high=self.epsilon_high,
            temperature=self.temperature,
            use_ref_model=self.beta != 0.0,
            loss_type=self.loss_type,
            max_completion_length=self.max_completion_length,
        )

    # Metrics and logging state.
    self._metrics = {"train": defaultdict(list), "eval": defaultdict(list)}
    self._total_train_tokens = 0
    self.log_completions = args.log_completions
    self.wandb_log_unique_prompts = args.wandb_log_unique_prompts
    self.num_completions_to_print = args.num_completions_to_print
    # Every log buffer holds at most one generation batch of entries.
    self._logs = {
        "images": deque(maxlen=args.generation_batch_size),
        "prompt": deque(maxlen=args.generation_batch_size),
        "completion": deque(maxlen=args.generation_batch_size),
        "rewards": defaultdict(lambda: deque(maxlen=args.generation_batch_size)),
        "advantages": deque(maxlen=args.generation_batch_size),
    }

    set_seed(args.seed, device_specific=True)

    if self.use_vllm:
        if not is_vllm_available():
            raise ImportError(
                "vLLM is not available and `use_vllm` is set to True. Please install vLLM with "
                "`pip install trl[vllm]` to use it."
            )

        if self.vllm_mode == "server":
            # Only the main process creates the client for the external vLLM server.
            if self.accelerator.is_main_process:
                if args.vllm_server_base_url is not None:
                    base_url = args.vllm_server_base_url
                else:
                    base_url = f"http://{args.vllm_server_host}:{args.vllm_server_port}"
                self.vllm_client = VLLMClient(base_url=base_url, connection_timeout=args.vllm_server_timeout)
                self.vllm_client.init_communicator(device=torch.cuda.current_device())

        elif self.vllm_mode == "colocate":
            if not self.accelerator.num_processes % self.vllm_tensor_parallel_size == 0:
                raise ValueError(
                    f"vllm_tensor_parallel_size ({self.vllm_tensor_parallel_size}) must divide world size "
                    f"({self.accelerator.num_processes}) evenly."
                )

            if self.vllm_tensor_parallel_size > 1:
                # One process group per run of `vllm_tensor_parallel_size` consecutive ranks.
                self.tp_group, _ = torch.distributed.new_subgroups_by_enumeration(
                    [
                        list(range(i * self.vllm_tensor_parallel_size, (i + 1) * self.vllm_tensor_parallel_size))
                        for i in range(self.accelerator.num_processes // self.vllm_tensor_parallel_size)
                    ]
                )
            # Export the distributed coordinates through the environment.
            os.environ["RANK"] = str(self.accelerator.process_index)
            os.environ["LOCAL_RANK"] = str(self.accelerator.local_process_index)
            os.environ["WORLD_SIZE"] = str(self.accelerator.num_processes)
            ensure_master_addr_port()

            # Unsloth: reuse the engine attached to the model rather than building a new one.
            self.llm = model.vllm_engine
            if self.args.vllm_enable_sleep_mode:
                self.llm.sleep(level=1)
        else:
            raise ValueError(f"vllm_mode must be either 'server' or 'colocate', got '{self.vllm_mode}'.")
        self.guided_decoding_regex = args.vllm_guided_decoding_regex

        self._last_loaded_step = -1
        # Synchronise all processes once the vLLM setup is done.
        self.accelerator.wait_for_everyone()
    else:
        # Without vLLM, sampling goes through a `GenerationConfig`; user-supplied
        # `generation_kwargs` override the defaults assembled here.
        generation_kwargs = {
            "max_new_tokens": self.max_completion_length,
            "do_sample": True,
            "pad_token_id": tokenizer.pad_token_id,
            "bos_token_id": tokenizer.bos_token_id,
            "eos_token_id": tokenizer.eos_token_id,
            "temperature": self.temperature,
            "top_p": self.top_p,
            "top_k": self.top_k,
            "min_p": self.min_p,
            "repetition_penalty": self.repetition_penalty,
            "cache_implementation": args.cache_implementation,
        }
        if args.generation_kwargs is not None:
            generation_kwargs.update(args.generation_kwargs)
        self.generation_config = GenerationConfig(**generation_kwargs)

    self.model_accepts_loss_kwargs = False

    # Tag the model with this trainer's tag names.
    self.model.add_model_tags(self._tag_names)

    # Prepare the reference model with the wrapper that matches the active backend.
    if self.ref_model is not None:
        if self.is_deepspeed_enabled:
            self.ref_model = prepare_deepspeed(self.ref_model, self.accelerator)
        elif self.is_fsdp_enabled:
            self.ref_model = prepare_fsdp(self.ref_model, self.accelerator)
        else:
            self.ref_model = self.accelerator.prepare_model(self.ref_model, evaluation_mode=True)

    if args.sync_ref_model:
        self.add_callback(SyncRefModelCallback(ref_model=self.ref_model, accelerator=self.accelerator))

    # Prepare model-based reward functions; callables are left untouched.
    for i, reward_func in enumerate(self.reward_funcs):
        if isinstance(reward_func, PreTrainedModel):
            if self.is_deepspeed_enabled:
                self.reward_funcs[i] = prepare_deepspeed(reward_func, self.accelerator)
            else:
                self.reward_funcs[i] = self.accelerator.prepare_model(
                    reward_func, evaluation_mode=True, device_placement=True
                )
|
|
def _set_signature_columns_if_needed(self):
    """Initialise `self._signature_columns` on first use.

    An already populated list is left untouched; otherwise the columns are set to
    `"prompt"`, `"image"` and `"images"`.
    """
    if self._signature_columns is not None:
        return
    self._signature_columns = ["prompt", "image", "images"]
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
def get_train_dataloader(self):
    """Build the training `DataLoader` and hand it to the accelerator.

    The batch size is `self._train_batch_size * self.args.steps_per_generation`, so one
    loader batch spans a whole generation step.

    Returns:
        The result of `self.accelerator.prepare(...)` applied to the new `DataLoader`.

    Raises:
        ValueError: If `self.train_dataset` is `None`.
    """
    dataset = self.train_dataset
    if dataset is None:
        raise ValueError("Trainer: training requires a train_dataset.")

    collate_fn = self.data_collator
    if is_datasets_available() and isinstance(dataset, datasets.Dataset):
        # `datasets.Dataset`: unused columns are removed from the dataset itself.
        dataset = self._remove_unused_columns(dataset, description="training")
    else:
        # Anything else: the collator is wrapped to drop them instead.
        collate_fn = self._get_collator_with_removed_columns(collate_fn, description="training")

    cfg = self.args
    loader_kwargs = dict(
        batch_size=self._train_batch_size * cfg.steps_per_generation,
        collate_fn=collate_fn,
        num_workers=cfg.dataloader_num_workers,
        pin_memory=cfg.dataloader_pin_memory,
        persistent_workers=cfg.dataloader_persistent_workers,
    )

    # Sampler-related options only apply to map-style datasets.
    if not isinstance(dataset, torch.utils.data.IterableDataset):
        loader_kwargs.update(
            sampler=self._get_train_sampler(),
            drop_last=cfg.dataloader_drop_last,
            worker_init_fn=partial(
                seed_worker, num_workers=cfg.dataloader_num_workers, rank=cfg.process_index
            ),
            prefetch_factor=cfg.dataloader_prefetch_factor,
        )

    return self.accelerator.prepare(DataLoader(dataset, **loader_kwargs))
|
|
def _get_train_sampler(self, dataset: Optional[Dataset] = None) -> Sampler:
    """Create the `RepeatSampler` used for training.

    Args:
        dataset: Data source to sample from; `self.train_dataset` is used when `None`.

    Returns:
        A `RepeatSampler` configured with:
        - `mini_repeat_count` = `self.num_generations`
        - `batch_size` = `generation_batch_size // num_generations`
        - `repeat_count` = `num_iterations * steps_per_generation`
        - the trainer's shuffle flag and the configured seed.
    """
    source = self.train_dataset if dataset is None else dataset
    cfg = self.args
    prompts_per_batch = cfg.generation_batch_size // self.num_generations
    batch_repeats = self.num_iterations * cfg.steps_per_generation
    return RepeatSampler(
        data_source=source,
        mini_repeat_count=self.num_generations,
        batch_size=prompts_per_batch,
        repeat_count=batch_repeats,
        shuffle=self.shuffle_dataset,
        seed=cfg.seed,
    )
|
|
def _get_eval_sampler(self, eval_dataset) -> Sampler:
    """Create the `RepeatSampler` used for evaluation.

    Only the data source, `mini_repeat_count` (`self.num_generations`) and the seed are
    passed; every other sampler option keeps its default.
    """
    sampler_kwargs = {
        "data_source": eval_dataset,
        "mini_repeat_count": self.num_generations,
        "seed": self.args.seed,
    }
    return RepeatSampler(**sampler_kwargs)
|
|
@profiling_decorator
def _get_last_hidden_state(
    self,
    unwrapped_model,
    input_ids,
    attention_mask,
    logits_to_keep,
    pixel_values=None,
    image_grid_thw=None,
    pixel_attention_mask=None,
    image_sizes=None,
):
    """Run the model's inner `.model` and return the hidden states of the last `logits_to_keep` positions.

    The final position's output is dropped before the trailing `logits_to_keep` positions
    are selected. For PEFT models the wrapped `base_model.model` is used.

    Args:
        unwrapped_model: Model (optionally a PEFT wrapper) exposing a `.model` submodule.
        input_ids: Token ids passed to the model.
        attention_mask: Attention mask passed to the model.
        logits_to_keep: Number of trailing positions to return.
        pixel_values: Optional; forwarded when not `None`.
        image_grid_thw: Optional; forwarded only when `pixel_values` is also given.
        pixel_attention_mask: Optional; forwarded when not `None`.
        image_sizes: Optional; forwarded when not `None`.

    Returns:
        The sliced `last_hidden_state` tensor.
    """
    target = unwrapped_model.base_model.model if is_peft_model(unwrapped_model) else unwrapped_model

    forward_kwargs = {"input_ids": input_ids, "attention_mask": attention_mask}

    # Optional multimodal inputs; `image_grid_thw` only counts together with `pixel_values`.
    optional_inputs = {
        "image_grid_thw": image_grid_thw if pixel_values is not None else None,
        "pixel_values": pixel_values,
        "pixel_attention_mask": pixel_attention_mask,
        "image_sizes": image_sizes,
    }
    forward_kwargs.update({key: value for key, value in optional_inputs.items() if value is not None})

    # Only pass `logits_to_keep` to models whose forward accepts it. One extra position
    # is requested because the final position is discarded below.
    if "logits_to_keep" in self.model_kwarg_keys:
        forward_kwargs["logits_to_keep"] = logits_to_keep + 1

    forward_kwargs["use_cache"] = False

    hidden = target.model(**forward_kwargs).last_hidden_state
    # Drop the final position, then keep the trailing `logits_to_keep` ones.
    without_final = hidden[:, :-1, :]
    return without_final[:, -logits_to_keep:, :]
|
|
def get_high_entropy_mask(self, entropies: torch.Tensor, mask: torch.Tensor, threshold: float) -> torch.Tensor:
    """
    Flag the valid tokens whose entropy reaches a quantile computed over all processes.

    Args:
        entropies (`torch.Tensor`):
            Per-token entropy values.
        mask (`torch.Tensor`):
            Binary mask with the same shape as `entropies`; non-zero marks a valid token.
        threshold (`float`):
            Quantile in `[0.0, 1.0]` passed to `torch.quantile`.

    Returns:
        `torch.Tensor`:
            Boolean tensor shaped like `entropies`. `True` marks valid tokens whose entropy is
            at least the quantile value; all `False` when no process contributes a valid token.
    """
    valid = mask.bool()
    local_values = entropies[valid].float()

    # Sentinel used to pad per-process value lists to a common length before gathering;
    # it is filtered out again afterwards.
    sentinel = -1e9

    padded = self.accelerator.pad_across_processes(local_values, dim=0, pad_index=sentinel)
    pooled = self.accelerator.gather(padded)
    pooled = pooled[pooled != sentinel]

    if pooled.numel() == 0:
        return torch.zeros_like(entropies, dtype=torch.bool)

    cutoff = torch.quantile(pooled, threshold)
    # Masked positions are zeroed before the comparison and removed again by the final AND.
    reaches_cutoff = (entropies * mask.float()) >= cutoff
    return reaches_cutoff & valid
|
|
| def _get_per_token_logps_and_entropies( |
| self, |
| model, |
| input_ids, |
| attention_mask, |
| logits_to_keep, |
| batch_size = None, |
| compute_entropy = False, |
| compute_efficient = False, |
| *args, |
| **kwargs, |
| ): |
| |
| |
| |
| if compute_efficient: |
| return None, None |
| else: |
| if not hasattr(self, "_autocast_dtype"): |
| self._autocast_dtype = ( |
| torch.float16 |
| if os.environ.get("ACCELERATE_MIXED_PRECISION", "fp16") == "fp16" |
| else torch.bfloat16 |
| ) |
| if os.environ.get("UNSLOTH_FORCE_FLOAT32", "0") == "1": |
| self._autocast_dtype = torch.float16 |
|
|
| pixel_values, image_grid_thw = ( |
| kwargs.get("pixel_values", None), |
| kwargs.get("image_grid_thw", None), |
| ) |
| pixel_attention_mask, image_sizes = ( |
| kwargs.get("pixel_attention_mask", None), |
| kwargs.get("image_sizes", None), |
| ) |
|
|
| unwrapped_model = self.accelerator.unwrap_model( |
| model, keep_fp32_wrapper = False |
| ) |
|
|
| lm_head = self.model.get_output_embeddings().weight |
|
|
| dtype_bytes = ( |
| 16 if self._autocast_dtype in [torch.float16, torch.bfloat16] else 32 |
| ) |
| total_rows = input_ids.shape[0] |
| seq_len = input_ids.shape[1] |
| hidden_dim = lm_head.shape[1] |
| vocab_dim = lm_head.shape[0] |
|
|
| if self.args.unsloth_grpo_mini_batch is None: |
| B, multiplier = autotune_batch_and_chunks( |
| total_rows, |
| seq_len, |
| hidden_dim, |
| vocab_dim, |
| dtype_bytes, |
| self.args.unsloth_logit_chunk_multiplier, |
| ) |
| B = total_rows // B |
| else: |
| B = self.args.unsloth_grpo_mini_batch |
|
|
| if self.args.unsloth_logit_chunk_multiplier is None: |
| multiplier = max(4, seq_len // 4096) |
| else: |
| multiplier = self.args.unsloth_logit_chunk_multiplier |
|
|
| all_logprobs_list = [] |
| if pixel_values is None: |
| left_pad_tokens_per_prompt = calculate_pad_tokens_in_prompt( |
| input_ids, logits_to_keep, self.processing_class.pad_token_id |
| ) |
| max_left_pad = torch.max(left_pad_tokens_per_prompt).item() |
| input_ids = left_pack_padding( |
| input_ids, self.processing_class.pad_token_id |
| ) |
| attention_mask = input_ids != self.processing_class.pad_token_id |
| attention_mask = attention_mask.to(attention_mask.dtype) |
| else: |
| max_left_pad = 0 |
|
|
| |
| attention_mask_chunks = torch.chunk(attention_mask, chunks = B, dim = 0) |
|
|
| def chunk_optional(tensor, chunks): |
| if tensor is None: |
| return [None] * chunks |
| return torch.chunk(tensor, chunks = chunks, dim = 0) |
|
|
| import math |
|
|
| total_samples = input_ids.shape[0] |
| batch_size = math.ceil(total_samples / B) |
|
|
| input_ids_chunks = [] |
| attention_mask_chunks = [] |
| pixel_values_chunks = [] |
| image_grid_thw_chunks = [] |
| pixel_attention_mask_chunks = [] |
|
|
| current_pixel_idx = 0 |
| |
| for start in range(0, total_samples, batch_size): |
| end = start + batch_size |
|
|
| input_ids_chunks.append(input_ids[start:end]) |
| attention_mask_chunks.append(attention_mask[start:end]) |
|
|
| if image_grid_thw is not None and pixel_values is not None: |
| grid_slice = image_grid_thw[start:end] |
| image_grid_thw_chunks.append(grid_slice) |
|
|
| batch_pixel_count = grid_slice.prod(dim = -1).sum().item() |
|
|
| start_pixel_idx = current_pixel_idx |
| end_pixel_idx = current_pixel_idx + batch_pixel_count |
|
|
| pixel_values_chunks.append( |
| pixel_values[start_pixel_idx:end_pixel_idx] |
| ) |
|
|
| if pixel_attention_mask is not None: |
| pixel_attention_mask_chunks.append( |
| pixel_attention_mask[start_pixel_idx:end_pixel_idx] |
| ) |
| else: |
| pixel_attention_mask_chunks.append(None) |
|
|
| current_pixel_idx = end_pixel_idx |
|
|
| else: |
| pixel_values_chunks.append(None) |
| image_grid_thw_chunks.append(None) |
| pixel_attention_mask_chunks.append(None) |
|
|
| if image_sizes is not None and not isinstance(image_sizes, torch.Tensor): |
| image_sizes_chunks = [[size] for size in image_sizes] |
| else: |
| image_sizes_chunks = chunk_optional(image_sizes, B) |
|
|
| temperature = self.temperature |
| logit_softcapping = getattr(model.config, "final_logit_softcapping", 0) |
| if logit_softcapping is None: |
| logit_softcapping = 0 |
| logit_scale_multiply = getattr(model.config, "logit_scale", 0) |
| if logit_scale_multiply is None: |
| logit_scale_multiply = 0 |
| logit_scale_divide = getattr(model.config, "logits_scaling", 0) |
| if logit_scale_divide is None: |
| logit_scale_divide = 0 |
|
|
| zipped_inputs = zip( |
| input_ids_chunks, |
| attention_mask_chunks, |
| pixel_values_chunks, |
| image_grid_thw_chunks, |
| pixel_attention_mask_chunks, |
| image_sizes_chunks, |
| ) |
| os.environ["UNSLOTH_RETURN_HIDDEN_STATES"] = "1" |
|
|
| with _get_inference_mode_context_manager(model): |
| for ( |
| input_ids_chunk, |
| attention_mask_chunk, |
| pixel_values_chunk, |
| image_grid_thw_chunk, |
| pixel_attention_mask_chunk, |
| image_sizes_chunk, |
| ) in zipped_inputs: |
| with torch.amp.autocast( |
| device_type = "cuda", dtype = self._autocast_dtype |
| ): |
| if pixel_values is None: |
| logits_chunk = unwrapped_model( |
| input_ids = input_ids_chunk, |
| attention_mask = attention_mask_chunk, |
| pixel_values = pixel_values_chunk, |
| image_grid_thw = image_grid_thw_chunk, |
| pixel_attention_mask = pixel_attention_mask_chunk, |
| image_sizes = image_sizes_chunk, |
| ).logits |
|
|
| completion_input_ids_chunk = input_ids_chunk[ |
| :, -(logits_to_keep + max_left_pad) : |
| ] |
| logits_chunk = logits_chunk[ |
| :, -(logits_to_keep + max_left_pad + 1) :, : |
| ] |
| logits_chunk = logits_chunk[:, :-1, :] |
| else: |
| |
| |
| logits_chunk = unwrapped_model( |
| input_ids = input_ids_chunk, |
| attention_mask = attention_mask_chunk, |
| pixel_values = pixel_values_chunk, |
| image_grid_thw = image_grid_thw_chunk, |
| pixel_attention_mask = pixel_attention_mask_chunk, |
| image_sizes = image_sizes_chunk, |
| logits_to_keep = logits_to_keep + 1, |
| ).logits |
|
|
| logits_chunk = logits_chunk[:, :-1, :] |
| completion_input_ids_chunk = input_ids_chunk[ |
| :, -logits_to_keep: |
| ] |
|
|
| logprobs_chunk = chunked_hidden_states_selective_log_softmax( |
| logits_chunk, |
| lm_head, |
| completion_input_ids_chunk, |
| chunks = input_ids_chunk.shape[0] * multiplier, |
| logit_scale_multiply = logit_scale_multiply, |
| logit_scale_divide = logit_scale_divide, |
| logit_softcapping = logit_softcapping, |
| temperature = temperature, |
| ) |
| |
| |
| device_synchronize() |
| all_logprobs_list.append(logprobs_chunk) |
| logprobs = torch.cat(all_logprobs_list, dim = 0) |
| entropies = None |
|
|
| os.environ["UNSLOTH_RETURN_HIDDEN_STATES"] = "0" |
|
|
| return logprobs.detach(), entropies |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
|
|
| |
| |
| |
| |
| |
|
|
| def _fix_param_name_to_vllm(self, name, extra_prefixes: Optional[list[str]] = None): |
| extra_prefixes = extra_prefixes or [] |
| prefixes = ["_checkpoint_wrapped_module."] + extra_prefixes |
| for prefix in prefixes: |
| name = name.replace(prefix, "") |
| return name |
|
|
| def _sync_fsdp1_params_to_vllm(self, module: nn.Module, prefix: str = "", visited=None): |
| """Memory-efficient post-order traversal of FSDP modules to extract full parameters and sync with vLLM.""" |
| |
| if visited is None: |
| visited = set() |
| for child_name, child_module in module.named_children(): |
| child_prefix = f"{prefix}.{child_name}" if prefix else child_name |
| self._sync_fsdp1_params_to_vllm( |
| child_module, prefix=child_prefix, visited=visited |
| ) |
|
|
| if isinstance(module, FSDP): |
| with FSDP.summon_full_params(module, recurse=False, writeback=False): |
| for param_name, param in module.named_parameters(): |
| full_name = f"{prefix}.{param_name}" if prefix else param_name |
| full_name = self._fix_param_name_to_vllm(full_name, extra_prefixes=["_fsdp_wrapped_module."]) |
|
|
| if full_name in visited: |
| continue |
| visited.add(full_name) |
|
|
| if self.vllm_mode == "server" and self.accelerator.is_main_process: |
| self.vllm_client.update_named_param(full_name, param.data) |
| elif self.vllm_mode == "colocate": |
|
|
| pass |
|
|
| pass |
|
|
| def _sync_fsdp2_params_to_vllm(self, module: nn.Module): |
| |
| for name, param in module.items(): |
| if param.is_cpu: |
| param = param.to(torch.device("cuda")) |
| param = param.full_tensor() |
|
|
| if self.vllm_mode == "server" and self.accelerator.is_main_process: |
| self.vllm_client.update_named_param(name, param) |
| elif self.vllm_mode == "colocate": |
|
|
| pass |
|
|
| pass |
|
|
| def _move_model_to_vllm(self, *args, **kwargs): |
| return None |
|
|
@profiling_decorator
def _prepare_inputs(
    self, generation_batch: dict[str, Union[torch.Tensor, Any]]
) -> dict[str, Union[torch.Tensor, Any]]:
    """Return the inputs for the current step, generating completions when needed.

    In training mode a fresh generation batch is produced every
    ``steps_per_generation * num_iterations`` steps (or whenever nothing is
    buffered yet). The batch is shuffled on a best-effort basis, split into
    ``steps_per_generation`` slices and buffered; each call then returns the
    slice for ``self._step`` and increments the step counter.

    In evaluation mode completions are generated and scored on every call and
    nothing is buffered.

    Args:
        generation_batch: Raw batch handed over by the data loader.

    Returns:
        The scored batch (or the buffered slice of it) to compute the loss on.
    """
    mode = "train" if self.model.training else "eval"
    if mode == "train":
        generate_every = self.args.steps_per_generation * self.num_iterations
        if self._step % generate_every == 0 or self._buffered_inputs is None:
            generation_batch = self._generate_and_score_completions(generation_batch)
            generation_batch = split_pixel_values_by_grid(generation_batch)

            # Shuffling is deliberately best-effort: if it fails the batch is
            # used in its original order. Catch `Exception` rather than using
            # a bare `except`, so KeyboardInterrupt / SystemExit still
            # propagate and the run can be interrupted here.
            try:
                generation_batch = shuffle_sequence_dict(generation_batch)
            except Exception:
                pass

            generation_batches = split_tensor_dict(generation_batch, self.args.steps_per_generation)
            self._buffered_inputs = [unsplit_pixel_values_by_grid(batch) for batch in generation_batches]
        inputs = self._buffered_inputs[self._step % self.args.steps_per_generation]
        self._step += 1
    else:
        # Evaluation: no buffering, generate and score for every batch.
        inputs = self._generate_and_score_completions(generation_batch)
    return inputs
|
|
@profiling_decorator
def _calculate_rewards(self, inputs, prompts, completions, completion_ids_list):
    """Score every completion with every reward function and gather the result.

    Builds a ``(len(prompts), len(self.reward_funcs))`` tensor. Reward
    functions that are ``nn.Module`` instances are fed tokenized
    prompt+completion text and contribute ``logits[:, 0]``; any other reward
    function is called with the prompts, completions, completion ids and all
    remaining input columns, and ``None`` results become NaN. A warning is
    logged for the first row in which every reward is NaN.

    Returns:
        The per-function rewards after ``gather`` across processes.
    """
    device = self.accelerator.device
    rewards_per_func = torch.zeros(len(prompts), len(self.reward_funcs), device=device)

    # Forward every extra dataset column to the custom reward functions.
    reserved = ["prompt", "completion", "completion_ids"]
    reward_kwargs = {
        column: [example[column] for example in inputs]
        for column in inputs[0]
        if column not in reserved
    }

    # Also expose the trainer state to the reward functions.
    reward_kwargs["trainer_state"] = self.state

    scorers = zip(self.reward_funcs, self.reward_processing_classes, self.reward_func_names)
    for col, (reward_func, reward_processing_class, reward_func_name) in enumerate(scorers):
        with profiling_context(self, reward_func_name):
            if not isinstance(reward_func, nn.Module):
                # Plain callable: call it directly and map None -> NaN.
                raw_scores = reward_func(
                    prompts=prompts, completions=completions, completion_ids=completion_ids_list, **reward_kwargs
                )
                scores = [torch.nan if score is None else score for score in raw_scores]
                rewards_per_func[:, col] = torch.tensor(scores, dtype=torch.float32, device=device)
                continue

            # Reward model: build the text it should score.
            if is_conversational(inputs[0]):
                texts = [
                    apply_chat_template({"messages": p + c}, reward_processing_class)["text"]
                    for p, c in zip(prompts, completions)
                ]
            else:
                texts = [p + c for p, c in zip(prompts, completions)]

            reward_inputs = reward_processing_class(
                text=texts, return_tensors="pt", padding=True, padding_side="right", add_special_tokens=False
            )
            reward_inputs = super()._prepare_inputs(reward_inputs)
            with torch.inference_mode():
                rewards_per_func[:, col] = reward_func(**reward_inputs).logits[:, 0]

    # Warn (once, for the first offending row) when no reward function scored a row.
    rows_all_nan = torch.isnan(rewards_per_func).all(dim=1)
    if rows_all_nan.any():
        nan_row_idx = rows_all_nan.nonzero(as_tuple=True)[0][0]
        row_reward_kwargs = {
            key: value[nan_row_idx] for key, value in reward_kwargs.items() if key != "trainer_state"
        }
        row_reward_kwargs["prompt"] = prompts[nan_row_idx]
        row_reward_kwargs["completion"] = completions[nan_row_idx]
        logger.warning(
            f"All reward functions returned None for the following kwargs:\n{row_reward_kwargs}\n"
            "Please ensure that at least one reward function returns a valid reward."
        )

    # Collect the rewards from every process.
    return gather(rewards_per_func)
|
|
def _generate_single_turn(self, prompts: list[str], images: Optional[list]):
    """Generate one completion per prompt with the configured backend.

    The prompts are rendered to text (applying the chat template where
    applicable) and then completed through one of four paths: vLLM server
    mode, vLLM colocate mode, transformers paged generation
    (``generate_batch``) or regular ``generate``.

    Args:
        prompts: One prompt per sample; an entry may be a string, a list of
            messages, or a dict.
        images: Optional per-prompt image lists, or ``None``.

    Returns:
        A tuple ``(prompt_ids, completion_ids, logprobs, forward_kwargs)``.
        ``logprobs`` is ``None`` on the two transformers paths.
        ``forward_kwargs`` holds the processor outputs other than
        ``input_ids`` / ``attention_mask`` when images are given, else ``{}``.
    """
    device = self.accelerator.device

    # When images are present, list-style (conversational) prompts are passed
    # to prepare_multimodal_messages together with their image count.
    kwargs = {}
    if images is not None:
        kwargs = {"images": images}
        for prompt, image_list in zip(prompts, images):
            if isinstance(prompt, list):
                prepare_multimodal_messages(prompt, num_images=len(image_list))

    # Collect extra chat-template kwargs: only string-valued keys whose name
    # appears in the chat template text are forwarded.
    _chat_template_ = getattr(self.processing_class, "chat_template", None)
    if _chat_template_ is None: _chat_template_ = ""
    _supported_keys_ = set(("prompt", "chosen", "rejected", "completion", "messages", "label"))
    # Per-sample kwargs stashed on self by _generate_and_score_completions (may be absent).
    _batch_chat_kwargs_ = getattr(self, "_unsloth_batch_chat_kwargs", None)

    prompts_text = []
    for _idx_, _example_ in enumerate(prompts):
        _tokenizer_kwargs_ = {}
        if type(_example_) is not dict:
            _example_ = {"prompt": _example_}
        _left_keys_ = _example_.keys() - _supported_keys_
        for k in _left_keys_:
            if k in _chat_template_:
                v = _example_[k]
                if type(v) is str:
                    _tokenizer_kwargs_[k] = v
        # Batch-level kwargs only fill in keys the example did not provide itself.
        if _batch_chat_kwargs_ is not None and _idx_ < len(_batch_chat_kwargs_):
            for _bk_, _bv_ in _batch_chat_kwargs_[_idx_].items():
                if _bk_ not in _tokenizer_kwargs_:
                    _tokenizer_kwargs_[_bk_] = _bv_
        _x_ = maybe_apply_chat_template(_example_, self.processing_class, **_tokenizer_kwargs_)["prompt"]
        prompts_text.append(_x_)
    if images is not None:
        # Run the processor once to obtain the non-text model inputs.
        prompt_inputs = self.processing_class(text=prompts_text, padding=True, return_tensors="pt", **kwargs)
        prompt_inputs = super()._prepare_inputs(prompt_inputs)
        forward_kwargs = {k: v for k, v in prompt_inputs.items() if k not in ["input_ids", "attention_mask"]}
    else:
        forward_kwargs = {}

    # Generate completions using either vLLM or transformers generation.
    if self.use_vllm:
        if self.vllm_mode == "colocate" and self.args.vllm_enable_sleep_mode:
            # Wake the colocated vLLM engine; the CUDA cache is emptied first.
            torch.cuda.empty_cache()
            self.llm.wake_up()

        # Refresh the vLLM weights at most once per global step.
        if self.state.global_step != self._last_loaded_step:
            self._move_model_to_vllm()
            self._last_loaded_step = self.state.global_step

        # Server mode: gather all prompts, generate on the main process only,
        # then broadcast the result to every process.
        if self.vllm_mode == "server":
            all_prompts_text = gather_object(prompts_text)
            if images is not None:
                all_images = gather_object(images)

            if self.accelerator.is_main_process:
                # Take every num_generations-th prompt and ask the server for
                # n=num_generations completions each.
                # NOTE(review): this assumes each prompt is repeated
                # num_generations times consecutively - confirm with the sampler.
                ordered_set_of_prompts = all_prompts_text[:: self.num_generations]

                if images is not None:
                    ordered_set_of_images = all_images[:: self.num_generations]
                else:
                    ordered_set_of_images = None

                with profiling_context(self, "vLLM.generate"):
                    output = self.vllm_client.generate(
                        prompts=ordered_set_of_prompts,
                        images=ordered_set_of_images,
                        n=self.num_generations,
                        repetition_penalty=self.repetition_penalty,
                        temperature=self.temperature,
                        top_p=self.top_p,
                        top_k=-1 if self.top_k is None else self.top_k,
                        min_p=0.0 if self.min_p is None else self.min_p,
                        max_tokens=self.max_completion_length,
                        truncate_prompt_tokens=self.max_prompt_length,
                        guided_decoding_regex=self.guided_decoding_regex,
                        generation_kwargs=self.args.generation_kwargs,
                    )
                    payload = (output["prompt_ids"], output["completion_ids"], output["logprobs"])
            else:
                payload = None

            # Broadcast the main-process payload to all processes.
            obj_list = [payload]
            broadcast_object_list(obj_list, from_process=0)
            all_prompt_ids, all_completion_ids, all_logprobs = obj_list[0]

            # Prompt ids come back once per unique prompt; repeat each
            # num_generations times to line up with the completions.
            all_prompt_ids = [ids for ids in all_prompt_ids for _ in range(self.num_generations)]

            # Keep only the slice belonging to this process.
            process_slice = slice(
                self.accelerator.process_index * len(prompts),
                (self.accelerator.process_index + 1) * len(prompts),
            )
            prompt_ids = all_prompt_ids[process_slice]
            completion_ids = all_completion_ids[process_slice]
            logprobs = all_logprobs[process_slice]

        # Colocate mode: generate with the in-process vLLM engine (self.llm).
        elif self.vllm_mode == "colocate":
            if self.guided_decoding_regex:
                guided_decoding = GuidedDecodingParams(regex=self.guided_decoding_regex)
            else:
                guided_decoding = None

            generation_kwargs = {
                "n": 1,
                "repetition_penalty": self.repetition_penalty,
                "temperature": self.temperature,
                "top_p": self.top_p,
                "top_k": -1 if self.top_k is None else self.top_k,
                "min_p": 0.0 if self.min_p is None else self.min_p,
                "max_tokens": self.max_completion_length,
                "truncate_prompt_tokens": self.max_prompt_length,
                "guided_decoding": guided_decoding,
                "logprobs": 0,
            }
            # User-supplied generation kwargs override the defaults above.
            if self.args.generation_kwargs is not None:
                generation_kwargs.update(self.args.generation_kwargs)
            sampling_params = SamplingParams(**grpo_update_SamplingParams(SamplingParams, generation_kwargs, getattr(self.args, 'vllm_sampling_params', None)))

            if self.vllm_tensor_parallel_size > 1:
                # Gather the prompts of every rank in the TP group and flatten
                # them; orig_size is remembered to slice the outputs afterwards.
                orig_size = len(prompts_text)
                gathered_prompts = [None for _ in range(self.vllm_tensor_parallel_size)]
                torch.distributed.all_gather_object(gathered_prompts, prompts_text, group=self.tp_group)
                all_prompts_text = [p for sublist in gathered_prompts for p in sublist]

                if images is not None:
                    gathered_images = [None for _ in range(self.vllm_tensor_parallel_size)]
                    torch.distributed.all_gather_object(gathered_images, images, group=self.tp_group)
                    all_images = [img for sublist in gathered_images for img in sublist]
                else:
                    all_images = None
            else:
                all_prompts_text = prompts_text
                all_images = images

            # With images, each vLLM input is a dict carrying the prompt and its images.
            if images is not None and all_images:
                vllm_inputs = []
                for prompt, image_list in zip(all_prompts_text, all_images):
                    vllm_inputs.append({"prompt": prompt, "multi_modal_data": {"image": image_list}})

            else:
                vllm_inputs = all_prompts_text

            # The LoRA request is loaded under a name suffixed with
            # CUDA_VISIBLE_DEVICES (commas removed, default '0').
            with profiling_context(self, "vLLM.generate"):
                all_outputs = self.llm.generate(vllm_inputs, sampling_params=sampling_params, use_tqdm=False, lora_request = self.model.load_lora('grpo_trainer_lora_model_' + (os.environ.get('CUDA_VISIBLE_DEVICES', '0').replace(',','')), load_tensors = True))

            all_prompt_ids = [output.prompt_token_ids for output in all_outputs]
            all_completion_ids = [output.token_ids for outputs in all_outputs for output in outputs.outputs]
            # For every generated token, take the logprob of the first entry in its logprob dict.
            all_logprobs = [
                [next(iter(lp.values())).logprob for lp in output.logprobs]
                for outputs in all_outputs
                for output in outputs.outputs
            ]

            if self.vllm_tensor_parallel_size > 1:
                # Keep only this rank's share of the group-wide outputs.
                local_rank_in_group = torch.distributed.get_rank(group=self.tp_group)
                tp_slice = slice(local_rank_in_group * orig_size, (local_rank_in_group + 1) * orig_size)
                prompt_ids = all_prompt_ids[tp_slice]
                completion_ids = all_completion_ids[tp_slice]
                logprobs = all_logprobs[tp_slice]
            else:
                prompt_ids = all_prompt_ids
                completion_ids = all_completion_ids
                logprobs = all_logprobs

            # Put the engine back to sleep once generation is done.
            if self.args.vllm_enable_sleep_mode:
                self.llm.sleep(level=1)

    elif self.use_transformers_paged:
        # Paged transformers generation: tokenize without padding/tensors and
        # temporarily switch the attention implementation.
        paged_prompt_inputs = self.processing_class(text=prompts_text, **kwargs)
        previous_attn = self.model_wrapped.config._attn_implementation

        if is_flash_attn_2_available():
            self.model_wrapped.config._attn_implementation = "paged_attention"
        else:
            self.model_wrapped.config._attn_implementation = "sdpa_paged"
        with (
            profiling_context(self, "transformers.generate_batch"),
            unwrap_model_for_generation(
                self.model_wrapped, self.accelerator, gather_deepspeed3_params=self.args.ds3_gather_for_generation
            ) as unwrapped_model,
            torch.no_grad(),
            FSDP.summon_full_params(self.model_wrapped, recurse=False) if self.is_fsdp_enabled else nullcontext(),
        ):
            # Cast the model to the dtype selected by the training arguments.
            if self.args.bf16:
                unwrapped_model.to(torch.bfloat16)
            elif self.args.fp16:
                unwrapped_model.to(torch.float16)
            with torch.inference_mode():
                all_outputs = unwrapped_model.generate_batch(
                    paged_prompt_inputs.input_ids, generation_config=self.generation_config, progress_bar=False
                )
                # Switch back to training mode (presumably generate_batch leaves the model in eval mode).
                unwrapped_model.train()
        completion_ids = [output.generated_tokens for output in all_outputs.values()]
        prompt_ids = paged_prompt_inputs.input_ids
        # Restore the attention implementation saved above.
        self.model_wrapped.config._attn_implementation = previous_attn
        logprobs = None

    else:
        # Regular transformers generation with left-padded prompts.
        generate_inputs = self.processing_class(
            text=prompts_text,
            return_tensors="pt",
            padding=True,
            padding_side="left",
            **kwargs,
        )
        generate_inputs = super()._prepare_inputs(generate_inputs)

        with (
            profiling_context(self, "transformers.generate"),
            unwrap_model_for_generation(
                self.model_wrapped, self.accelerator, gather_deepspeed3_params=self.args.ds3_gather_for_generation
            ) as unwrapped_model,
            torch.no_grad(),
            FSDP.summon_full_params(self.model_wrapped, recurse=False) if self.is_fsdp_enabled else nullcontext(),
        ):
            prompt_completion_ids = unwrapped_model.generate(
                **generate_inputs, generation_config=self.generation_config, disable_compile=True
            )
        # Split the generated sequence into prompt and completion at the padded prompt length.
        prompt_ids, prompt_mask = generate_inputs["input_ids"], generate_inputs["attention_mask"]
        prompt_length = prompt_ids.size(1)
        completion_ids = prompt_completion_ids[:, prompt_length:]

        # Build a mask that keeps tokens up to and including the first EOS;
        # rows without EOS keep every position (eos_idx defaults to the length).
        is_eos = completion_ids == self.eos_token_id
        eos_idx = torch.full((is_eos.size(0),), is_eos.size(1), dtype=torch.long, device=device)
        eos_idx[is_eos.any(dim=1)] = is_eos.int().argmax(dim=1)[is_eos.any(dim=1)]
        sequence_indices = torch.arange(is_eos.size(1), device=device).expand(is_eos.size(0), -1)
        completion_mask = (sequence_indices <= eos_idx.unsqueeze(1)).int()
        # Convert to per-sample Python lists with padded / masked positions removed.
        prompt_ids = [p[m].tolist() for p, m in zip(prompt_ids, prompt_mask.bool())]
        completion_ids = [c[m].tolist() for c, m in zip(completion_ids, completion_mask.bool())]
        logprobs = None

    return prompt_ids, completion_ids, logprobs, forward_kwargs
|
|
| def _generate(self, prompts: list[str], images: Optional[list]): |
| device = self.accelerator.device |
| mode = "train" if self.model.training else "eval" |
|
|
| prompt_ids, completion_ids, logprobs, forward_kwargs = self._generate_single_turn(prompts, images) |
|
|
| |
| prompt_lengths = torch.tensor([len(ids) for ids in prompt_ids], device=device) |
| completion_lengths = torch.tensor([len(ids) for ids in completion_ids], device=device) |
| agg_prompt_lengths = self.accelerator.gather(prompt_lengths) |
| agg_completion_lengths = self.accelerator.gather(completion_lengths) |
| total_prompt_tokens = agg_prompt_lengths.sum() |
| total_completion_tokens = agg_completion_lengths.sum() |
|
|
| |
| if mode == "train": |
| self.state.num_input_tokens_seen += (total_prompt_tokens + total_completion_tokens).item() |
| self._metrics[mode]["num_tokens"] = [self.state.num_input_tokens_seen] |
|
|
| |
| self._metrics[mode]["completions/mean_length"].append(agg_completion_lengths.float().mean().item()) |
| self._metrics[mode]["completions/min_length"].append(agg_completion_lengths.float().min().item()) |
| self._metrics[mode]["completions/max_length"].append(agg_completion_lengths.float().max().item()) |
|
|
| |
| eos_and_pad = [self.eos_token_id, self.pad_token_id] |
| is_truncated = torch.tensor([ids[-1] not in eos_and_pad for ids in completion_ids], device=device) |
| agg_is_truncated = self.accelerator.gather(is_truncated) |
| self._metrics[mode]["completions/clipped_ratio"].append(agg_is_truncated.float().mean().item()) |
| term_completion_lengths = agg_completion_lengths[~agg_is_truncated] |
| if len(term_completion_lengths) == 0: |
| term_completion_lengths = torch.zeros(1, device=device) |
| self._metrics[mode]["completions/mean_terminated_length"].append(term_completion_lengths.float().mean().item()) |
| self._metrics[mode]["completions/min_terminated_length"].append(term_completion_lengths.float().min().item()) |
| self._metrics[mode]["completions/max_terminated_length"].append(term_completion_lengths.float().max().item()) |
|
|
| return prompt_ids, completion_ids, total_completion_tokens, logprobs, forward_kwargs |
|
|
def _generate_and_score_completions(
    self, inputs: list[dict[str, Union[torch.Tensor, Any]]]
) -> dict[str, Union[torch.Tensor, Any]]:
    """Generate completions for ``inputs``, score them and compute advantages.

    Steps, in order: collect prompts / images, generate via ``_generate``, pad
    ids and masks into tensors, optionally compute old and reference per-token
    log-probs, decode and score completions with ``_calculate_rewards``,
    normalise rewards into advantages, log metrics, and assemble the output
    dict consumed by the loss.

    Args:
        inputs: List of per-sample dicts; each must have a ``"prompt"`` key
            and may carry ``"images"`` / ``"image"`` plus extra columns.

    Returns:
        Dict with ``prompt_ids``, ``prompt_mask``, ``completion_ids``,
        ``completion_mask``, ``advantages``, ``num_items_in_batch`` and,
        depending on configuration, log-probs, vision inputs,
        ``token_type_ids``, ``num_images`` and ``max_left_pad``.

    Raises:
        ValueError: If ``self.scale_rewards`` is not ``"batch"``, ``"group"``
            or ``"none"``.
    """
    device = self.accelerator.device
    mode = "train" if self.model.training else "eval"

    prompts = [x["prompt"] for x in inputs]
    # Stash per-sample chat-template kwargs on self for _generate_single_turn:
    # string-valued extra columns whose key appears in the chat template text.
    _ct_ = getattr(self.processing_class, 'chat_template', None) or ''
    _sk_ = {'prompt', 'chosen', 'rejected', 'completion', 'messages', 'label',
            'images', 'image', 'videos', 'video', 'audios', 'audio'}
    self._unsloth_batch_chat_kwargs = []
    for _inp_ in inputs:
        _kw_ = {}
        if isinstance(_inp_, dict):
            for _k_ in _inp_.keys() - _sk_:
                if _k_ in _ct_ and isinstance(_inp_[_k_], str):
                    _kw_[_k_] = _inp_[_k_]
        self._unsloth_batch_chat_kwargs.append(_kw_)
    # Collect images; a single "image" is wrapped into a one-element list.
    if "images" in inputs[0]:
        images = [example.get("images") for example in inputs]
    elif "image" in inputs[0]:
        images = [[example.get("image")] if example.get("image") is not None else None for example in inputs]
    else:
        images = None
    # Treat a batch in which every image list is empty as having no images.
    if images is not None and all(img_list == [] for img_list in images):
        images = None

    (
        prompt_ids_list,
        completion_ids_list,
        num_items_in_batch,
        sampling_per_token_logps_list,
        forward_kwargs,
    ) = self._generate(prompts, images)

    # Convert the id lists into padded tensors: prompts are left-padded,
    # completions (and sampling log-probs) are right-padded.
    prompt_ids = [torch.tensor(ids, device=device) for ids in prompt_ids_list]
    prompt_mask = [torch.ones_like(ids, dtype=torch.long) for ids in prompt_ids]
    prompt_ids = pad(prompt_ids, padding_value=self.pad_token_id, padding_side="left")
    prompt_mask = pad(prompt_mask, padding_value=0, padding_side="left")
    completion_ids = [torch.tensor(ids, device=device) for ids in completion_ids_list]
    completion_mask = [torch.ones_like(ids, dtype=torch.long) for ids in completion_ids]
    completion_ids = pad(completion_ids, padding_value=self.pad_token_id, padding_side="right")
    completion_mask = pad(completion_mask, padding_value=0, padding_side="right")
    if sampling_per_token_logps_list is not None:
        sampling_per_token_logps = [torch.tensor(logps, device=device) for logps in sampling_per_token_logps_list]
        sampling_per_token_logps = pad(sampling_per_token_logps, padding_value=0.0, padding_side="right")
    else:
        sampling_per_token_logps = None

    # Optionally zero the mask of completions whose last token is neither EOS nor PAD.
    if self.mask_truncated_completions:
        eos_and_pad = [self.eos_token_id, self.pad_token_id]
        is_truncated = torch.tensor([ids[-1] not in eos_and_pad for ids in completion_ids_list], device=device)
        completion_mask = completion_mask * (~is_truncated).unsqueeze(1).int()

    # Concatenate prompt and completion for the log-prob forward passes.
    prompt_completion_ids = torch.cat([prompt_ids, completion_ids], dim=1)
    attention_mask = torch.cat([prompt_mask, completion_mask], dim=1)
    # Extend token_type_ids with zeros covering the completion part.
    if "token_type_ids" in forward_kwargs:
        token_type_ids = forward_kwargs["token_type_ids"]
        forward_kwargs["token_type_ids"] = torch.cat(
            [token_type_ids, token_type_ids.new_zeros(completion_ids.shape)], dim=1
        )

    logits_to_keep = completion_ids.size(1)
    # max_left_pad is only computed for text-only batches (see below).
    max_left_pad = None
    batch_size = self.args.per_device_train_batch_size if mode == "train" else self.args.per_device_eval_batch_size
    # NOTE(review): `has_images` is not assigned anywhere in this function.
    # Unless it exists as a module-level name, the `try` body raises NameError
    # and the bare `except` falls back to the `images is None` check. The bare
    # `except` would also hide any other error raised in the `try` body.
    try:

        if not has_images:

            left_pad_tokens_per_prompt = calculate_pad_tokens_in_prompt(prompt_completion_ids, logits_to_keep, self.processing_class.pad_token_id)
            max_left_pad = torch.max(left_pad_tokens_per_prompt).item()
    except:

        if images is None:

            left_pad_tokens_per_prompt = calculate_pad_tokens_in_prompt(prompt_completion_ids, logits_to_keep, self.processing_class.pad_token_id)
            max_left_pad = torch.max(left_pad_tokens_per_prompt).item()
    self.model.for_training()

    num_images = [len(img_list) for img_list in images] if images is not None else None

    with torch.no_grad():
        # Old log-probs are computed when gradient_accumulation_steps is not a
        # multiple of generate_every, or whenever vLLM is used; otherwise they
        # are left as None.
        generate_every = self.args.steps_per_generation * self.num_iterations

        if self.args.gradient_accumulation_steps % generate_every != 0 or (
            self.use_vllm
        ):
            old_per_token_logps, _ = self._get_per_token_logps_and_entropies(
                self.model,
                prompt_completion_ids,
                attention_mask,
                logits_to_keep,
                batch_size,
                num_images=num_images,
                **forward_kwargs,
            )
        else:
            old_per_token_logps = None

        # Disabled: `False and ...` short-circuits, so this branch never runs.
        if False and self.use_vllm and self.vllm_importance_sampling_correction:
            importance_sampling_ratio = torch.exp(old_per_token_logps - sampling_per_token_logps)
            importance_sampling_ratio = torch.clamp(
                importance_sampling_ratio, max=self.vllm_importance_sampling_cap
            )

        # Reference log-probs are only needed when beta is non-zero. Without
        # a separate ref_model, the policy is used with its adapter disabled.
        if self.beta != 0.0:
            if self.ref_model is not None:
                ref_per_token_logps, _ = self._get_per_token_logps_and_entropies(
                    self.ref_model,
                    prompt_completion_ids,
                    attention_mask,
                    logits_to_keep,
                    batch_size=batch_size,
                    num_images=num_images,
                    **forward_kwargs,
                )
            else:
                with self.accelerator.unwrap_model(self.model).disable_adapter():
                    ref_per_token_logps, _ = self._get_per_token_logps_and_entropies(
                        self.model,
                        prompt_completion_ids,
                        attention_mask,
                        logits_to_keep,
                        batch_size=batch_size,
                        num_images=num_images,
                        **forward_kwargs,
                    )
        else:
            ref_per_token_logps = None

    # Decode prompts and completions back to text.
    prompts_text = self.processing_class.batch_decode(prompt_ids, skip_special_tokens=True)
    completions_text = self.processing_class.batch_decode(completion_ids, skip_special_tokens=True)
    if is_conversational(inputs[0]):
        completions = []
        for prompt, completion in zip(prompts, completions_text):
            # If the prompt ends with an assistant message, that message is
            # popped off the prompt (mutating it) and prepended to the completion.
            bootstrap = prompt.pop()["content"] if prompt[-1]["role"] == "assistant" else ""
            completions.append([{"role": "assistant", "content": bootstrap + completion}])
    else:
        completions = completions_text

    # With images, reward functions receive the decoded text forms; otherwise
    # they receive the original prompts and the structured completions.
    if images is not None:
        rewards_per_func = self._calculate_rewards(inputs, prompts_text, completions_text, completion_ids_list)
    else:
        rewards_per_func = self._calculate_rewards(inputs, prompts, completions, completion_ids_list)

    # Weighted sum over reward functions; NaN entries are ignored by nansum.
    rewards = (rewards_per_func * self.reward_weights.to(device).unsqueeze(0)).nansum(dim=1)

    # Mean reward per group of num_generations consecutive samples.
    mean_grouped_rewards = rewards.view(-1, self.num_generations).mean(dim=1)

    # Broadcast the group mean back to every sample and centre the rewards.
    mean_grouped_rewards = mean_grouped_rewards.repeat_interleave(self.num_generations, dim=0)
    advantages = rewards - mean_grouped_rewards

    if self.scale_rewards in ["group", "none"]:
        # Per-group std ("none" still computes it, for logging below).
        std_rewards = rewards.view(-1, self.num_generations).std(dim=1)
        std_rewards = std_rewards.repeat_interleave(self.num_generations, dim=0)
    elif self.scale_rewards == "batch":
        # One std over the whole batch.
        std_rewards = rewards.std().expand_as(rewards)
    else:
        raise ValueError(
            f"Invalid value for scale_rewards: {self.scale_rewards}. Must be one of 'batch', 'group', or 'none'."
        )

    is_std_zero = torch.isclose(std_rewards, torch.zeros_like(std_rewards))
    if self.scale_rewards != "none":
        # 1e-4 keeps the division finite when the std is zero.
        advantages = advantages / (std_rewards + 1e-4)

    # Keep only this process's slice; the full vector is kept for logging.
    process_slice = slice(
        self.accelerator.process_index * len(prompts),
        (self.accelerator.process_index + 1) * len(prompts),
    )
    all_process_advantages = advantages.clone()
    advantages = advantages[process_slice]

    # Per-reward-function mean / std, ignoring NaN entries.
    for i, reward_func_name in enumerate(self.reward_func_names):
        mean_rewards = torch.nanmean(rewards_per_func[:, i]).item()
        self._metrics[mode][f"rewards/{reward_func_name}/mean"].append(mean_rewards)
        std_func_rewards = nanstd(rewards_per_func[:, i]).item()
        self._metrics[mode][f"rewards/{reward_func_name}/std"].append(std_func_rewards)
    self._metrics[mode]["reward"].append(mean_grouped_rewards.mean().item())
    self._metrics[mode]["reward_std"].append(std_rewards.mean().item())
    self._metrics[mode]["frac_reward_zero_std"].append(is_std_zero.float().mean().item())

    # Record texts, rewards and advantages in the sample log.
    self._logs["prompt"].extend(gather_object(prompts_text))
    self._logs["completion"].extend(gather_object(completions_text))
    for i, name in enumerate(self.reward_func_names):
        self._logs["rewards"][name].extend(rewards_per_func[:, i].tolist())
    self._logs["advantages"].extend(all_process_advantages.tolist())

    if images is not None:
        self._logs["images"].extend(gather_object(images))

    # Disabled: `False and ...` short-circuits, so none of these sampling
    # metrics are recorded.
    if False and self.use_vllm and self.vllm_importance_sampling_correction:
        delta = torch.abs(old_per_token_logps - sampling_per_token_logps)
        delta = delta[completion_mask.bool()]
        mean_delta = torch.mean(delta) if delta.numel() > 0 else torch.tensor(0.0, device=device)
        max_delta = torch.max(delta) if delta.numel() > 0 else torch.tensor(0.0, device=device)
        self._metrics[mode]["sampling/sampling_logp_difference/mean"].append(
            self.accelerator.gather(mean_delta).mean().item()
        )
        self._metrics[mode]["sampling/sampling_logp_difference/max"].append(
            self.accelerator.gather(max_delta).max().item()
        )

        flat_is_ratio = importance_sampling_ratio[completion_mask.bool()]
        min_importance_sampling_ratio = (
            torch.min(flat_is_ratio) if flat_is_ratio.numel() > 0 else torch.tensor(0.0, device=device)
        )
        mean_importance_sampling_ratio = (
            torch.mean(flat_is_ratio) if flat_is_ratio.numel() > 0 else torch.tensor(0.0, device=device)
        )
        max_importance_sampling_ratio = (
            torch.max(flat_is_ratio) if flat_is_ratio.numel() > 0 else torch.tensor(0.0, device=device)
        )
        self._metrics[mode]["sampling/importance_sampling_ratio/min"].append(
            nanmin(self.accelerator.gather(min_importance_sampling_ratio)).item()
        )
        self._metrics[mode]["sampling/importance_sampling_ratio/mean"].append(
            self.accelerator.gather(mean_importance_sampling_ratio).nanmean().item()
        )
        self._metrics[mode]["sampling/importance_sampling_ratio/max"].append(
            nanmax(self.accelerator.gather(max_importance_sampling_ratio)).item()
        )

    # Assemble the batch handed to the loss; optional entries are added only
    # when they were produced above.
    output = {
        "prompt_ids": prompt_ids,
        "prompt_mask": prompt_mask,
        "completion_ids": completion_ids,
        "completion_mask": completion_mask,
        "advantages": advantages,
        "num_items_in_batch": num_items_in_batch,
    }
    if old_per_token_logps is not None:
        output["old_per_token_logps"] = old_per_token_logps
    # Disabled branch (see the `False and` notes above).
    if False and self.use_vllm and self.vllm_importance_sampling_correction:
        output["importance_sampling_ratio"] = importance_sampling_ratio
    if ref_per_token_logps is not None:
        output["ref_per_token_logps"] = ref_per_token_logps
    if "pixel_values" in forward_kwargs:
        output["pixel_values"] = forward_kwargs["pixel_values"]
    if "image_grid_thw" in forward_kwargs:
        output["image_grid_thw"] = forward_kwargs["image_grid_thw"]
    if "pixel_attention_mask" in forward_kwargs:
        output["pixel_attention_mask"] = forward_kwargs["pixel_attention_mask"]
    if "image_sizes" in forward_kwargs:
        output["image_sizes"] = forward_kwargs["image_sizes"]
    if "token_type_ids" in forward_kwargs:
        output["token_type_ids"] = forward_kwargs["token_type_ids"]
    if images is not None:
        output["num_images"] = num_images
    if max_left_pad is not None:
        # One copy of max_left_pad per batch row, as a column tensor.
        output["max_left_pad"] = torch.tensor(prompt_ids.shape[0] * [max_left_pad]).unsqueeze(-1)
    # Pass the sampling log-probs through when vLLM importance-sampling
    # correction is enabled; the NameError guard covers a missing variable.
    try:
        if self.use_vllm and getattr(self, "vllm_importance_sampling_correction", False):
            output["sampling_per_token_logps"] = sampling_per_token_logps
    except NameError:
        output["sampling_per_token_logps"] = None
    return output
|
|
def compute_liger_loss(self, unwrapped_model, inputs):
    """Compute the GRPO loss through ``self.liger_grpo_loss`` and log its metrics.

    The last hidden state for the completion tokens is obtained from
    ``_get_last_hidden_state`` and passed, together with the LM head weights,
    to the Liger loss. The last returned metric is logged as ``clip_ratio``;
    when ``beta`` is non-zero the first one is logged as ``kl``.

    Returns:
        The loss divided by ``self.current_gradient_accumulation_steps``.
    """
    prompt_ids = inputs["prompt_ids"]
    prompt_mask = inputs["prompt_mask"]
    completion_ids = inputs["completion_ids"]
    completion_mask = inputs["completion_mask"]

    # Optional vision inputs, forwarded positionally in this fixed order.
    vision_inputs = [
        inputs.get(key)
        for key in ("pixel_values", "image_grid_thw", "pixel_attention_mask", "image_sizes")
    ]

    # Only the completion positions are needed, hence completion_ids.size(1).
    last_hidden_state = self._get_last_hidden_state(
        unwrapped_model,
        torch.cat([prompt_ids, completion_ids], dim=1),
        torch.cat([prompt_mask, completion_mask], dim=1),
        completion_ids.size(1),
        *vision_inputs,
    )

    lm_head = unwrapped_model.lm_head
    loss, metrics = self.liger_grpo_loss(
        _input=last_hidden_state,
        lin_weight=lm_head.weight,
        selected_token_ids=completion_ids,
        attention_mask=completion_mask,
        advantages=inputs["advantages"],
        bias=lm_head.bias,
        old_per_token_logps=inputs.get("old_per_token_logps"),
        ref_per_token_logps=inputs.get("ref_per_token_logps"),
    )

    # metrics[-1] is logged as the clip ratio; metrics[0] as the KL term,
    # which is only read when beta is non-zero.
    with_kl = self.beta != 0.0
    clip_ratio = metrics[-1]

    mode = "eval" if not self.model.training else "train"
    log = self._metrics[mode]
    if with_kl:
        log["kl"].append(self.accelerator.gather(metrics[0]).mean().item())
    log["clip_ratio"].append(self.accelerator.gather(clip_ratio).mean().item())
    return loss / self.current_gradient_accumulation_steps
|
|
def compute_loss(
    self, model, inputs, return_outputs = False, num_items_in_batch = None
):
    """Compute the GRPO loss for one batch and record the associated metrics.

    Per-token log-probabilities are requested from ``self._get_per_token_logps``
    (or ``self._get_per_token_logps_and_entropies`` when the former attribute
    is absent). If a value is returned, the loss comes from
    ``grpo_compute_loss_slow``; if ``None`` is returned, the computation is
    delegated to ``grpo_accumulated_loss``.

    Args:
        model: Model being trained. ``model.config`` is inspected for
            ``final_logit_softcapping``, ``logit_scale`` and ``logits_scaling``.
        inputs: Mapping with the required keys ``prompt_ids``, ``prompt_mask``,
            ``completion_ids``, ``completion_mask`` and ``advantages``, plus the
            optional keys ``pixel_values``, ``image_grid_thw``,
            ``ref_per_token_logps``, ``old_per_token_logps``,
            ``sampling_per_token_logps``, ``num_items_in_batch`` and
            ``max_left_pad``.
        return_outputs: Must be falsy; returning outputs is not supported.
        num_items_in_batch: Ignored. The value is always re-read from
            ``inputs["num_items_in_batch"]``.

    Returns:
        The loss produced by the selected loss helper.

    Raises:
        ValueError: If ``return_outputs`` is truthy.
    """
    if return_outputs:
        raise ValueError("The GRPOTrainer does not support returning outputs")

    prompt_ids, prompt_mask = inputs["prompt_ids"], inputs["prompt_mask"]
    completion_ids, completion_mask = (
        inputs["completion_ids"],
        inputs["completion_mask"],
    )
    pixel_values = inputs.get("pixel_values", None)
    image_grid_thw = inputs.get("image_grid_thw", None)
    # The value stored in the batch always replaces the call argument.
    num_items_in_batch = inputs.get("num_items_in_batch", None)
    sampling_per_token_logps = inputs.get("sampling_per_token_logps", None)
    current_gradient_accumulation_steps = self.current_gradient_accumulation_steps
    num_processes = self.accelerator.num_processes

    input_ids = torch.cat([prompt_ids, completion_ids], dim = 1)
    attention_mask = torch.cat([prompt_mask, completion_mask], dim = 1)
    logits_to_keep = completion_ids.size(1)
    # Keep the full prompt + completion ids: `input_ids` is truncated below,
    # but the accumulated-loss path is given the untruncated sequence.
    _input_ids = input_ids

    if hasattr(self, "_get_per_token_logps"):
        # Last positional value is the `compute_efficient` flag.
        per_token_logps = self._get_per_token_logps(
            model, input_ids, attention_mask, logits_to_keep, True
        )
    else:
        # Positional tail: batch_size = None, compute_entropy = False,
        # compute_efficient = True. Only the first returned item is used.
        per_token_logps = self._get_per_token_logps_and_entropies(
            model, input_ids, attention_mask, logits_to_keep, None, False, True
        )[0]

    ref_logps = inputs.get("ref_per_token_logps", None)
    advantages = inputs["advantages"]
    old_logps = inputs.get("old_per_token_logps", None)

    # From here on `input_ids` only holds the last `logits_to_keep` columns.
    input_ids = input_ids[:, -logits_to_keep:]

    def config_value(name):
        # A missing attribute and an explicit `None` both fall back to 0.
        value = getattr(model.config, name, 0)
        return 0 if value is None else value

    logit_softcapping = config_value("final_logit_softcapping")
    logit_scale_multiply = config_value("logit_scale")
    logit_scale_divide = config_value("logits_scaling")

    max_left_pad = inputs.get("max_left_pad", 0)

    use_slow_path = per_token_logps is not None
    if use_slow_path or hasattr(self.args, "loss_type"):
        # Keyword arguments accepted by both loss helpers.
        shared_loss_kwargs = dict(
            pixel_values = pixel_values,
            image_grid_thw = image_grid_thw,
            loss_type = self.args.loss_type,
            importance_sampling_level = self.importance_sampling_level,
            epsilon_low = self.epsilon_low,
            epsilon_high = self.epsilon_high,
            max_completion_length = self.args.max_completion_length,
            delta = self.args.delta,
            temperature = self.args.temperature,
            max_left_pad = max_left_pad,
            logit_softcapping = logit_softcapping,
            logit_scale_multiply = logit_scale_multiply,
            logit_scale_divide = logit_scale_divide,
            num_items_in_batch = num_items_in_batch,
            current_gradient_accumulation_steps = current_gradient_accumulation_steps,
            num_processes = num_processes,
            sampling_per_token_logps = sampling_per_token_logps,
        )
        if use_slow_path:
            loss_outputs = grpo_compute_loss_slow(
                ref_logps,
                per_token_logps,
                old_logps,
                input_ids,
                completion_mask,
                self.beta,
                advantages,
                **shared_loss_kwargs,
            )
        else:
            loss_outputs = grpo_accumulated_loss(
                trainer = self,
                input_ids = _input_ids,
                logits_to_keep = logits_to_keep,
                completion_mask = completion_mask,
                advantages = advantages,
                old_logps = old_logps,
                ref_logps = ref_logps,
                n_chunks = self.args.unsloth_num_chunks,
                attention_mask = attention_mask,
                **shared_loss_kwargs,
            )
        loss, completion_length, mean_kl, delta, flat_is_ratio, coef_1 = loss_outputs
    else:
        # `args` has no `loss_type`: call the helper with the reduced keyword
        # set, which yields only four values.
        loss, completion_length, mean_kl, coef_1 = grpo_accumulated_loss(
            trainer = self,
            input_ids = _input_ids,
            logits_to_keep = logits_to_keep,
            completion_mask = completion_mask,
            advantages = advantages,
            old_logps = old_logps,
            ref_logps = ref_logps,
            n_chunks = self.args.unsloth_num_chunks,
            temperature = self.args.temperature,
            logit_softcapping = logit_softcapping,
            logit_scale_multiply = logit_scale_multiply,
            logit_scale_divide = logit_scale_divide,
            attention_mask = attention_mask,
        )
        # Bind the names this branch does not produce so the sampling
        # diagnostics below are skipped instead of raising UnboundLocalError.
        delta = None
        flat_is_ratio = None

    # `self._metrics` is either keyed by mode ("train"/"eval") or a flat
    # mapping; pick the mapping once so every later append works for both.
    if "train" in self._metrics:
        mode = "eval" if self.control.should_evaluate else "train"
        metrics = self._metrics[mode]
    else:
        metrics = self._metrics
    metrics["completion_length"].append(completion_length.item())
    metrics["kl"].append(mean_kl.item())

    gather = self.accelerator.gather

    if (
        self.use_vllm
        and delta is not None
        and getattr(self, "vllm_importance_sampling_correction", False)
    ):

        def reduce_or_zero(reduce_fn, values):
            # An empty tensor cannot be reduced; report 0.0 instead.
            if values.numel() > 0:
                return reduce_fn(values)
            return torch.tensor(0.0, device = self.model.device)

        mean_delta = reduce_or_zero(torch.mean, delta)
        max_delta = reduce_or_zero(torch.max, delta)
        metrics["sampling/sampling_logp_difference/mean"].append(
            gather(mean_delta).mean().item()
        )
        metrics["sampling/sampling_logp_difference/max"].append(
            gather(max_delta).max().item()
        )

        min_importance_sampling_ratio = reduce_or_zero(torch.min, flat_is_ratio)
        mean_importance_sampling_ratio = reduce_or_zero(torch.mean, flat_is_ratio)
        max_importance_sampling_ratio = reduce_or_zero(torch.max, flat_is_ratio)
        # NaNs are replaced by +inf / -inf so they cannot win the min / max.
        metrics["sampling/importance_sampling_ratio/min"].append(
            gather(min_importance_sampling_ratio)
            .nan_to_num(nan = float("inf"))
            .min()
            .item()
        )
        metrics["sampling/importance_sampling_ratio/mean"].append(
            gather(mean_importance_sampling_ratio).nanmean().item()
        )
        metrics["sampling/importance_sampling_ratio/max"].append(
            gather(max_importance_sampling_ratio)
            .nan_to_num(nan = float("-inf"))
            .max()
            .item()
        )

    completion_token_count = completion_mask.sum().clamp(min = 1.0)

    def masked_batch_mean(x):
        # A width-1 tensor is averaged directly; anything else is averaged
        # over the positions selected by `completion_mask`.
        if x.shape[1] == 1:
            return x.mean()
        return (x * completion_mask).sum() / completion_token_count

    if advantages.dim() == 1:
        advantages = advantages.unsqueeze(1)

    # Prefer the trainer attribute and fall back to `args`; when neither
    # exists no clip-ratio metric is recorded.
    loss_type = getattr(self, "loss_type", None)
    if loss_type is None:
        loss_type = getattr(self.args, "loss_type", None)

    if loss_type in ["grpo", "bnpo", "dr_grpo", "dapo"]:
        is_low_clipped = (coef_1 < 1 - self.epsilon_low) & (advantages < 0)
        is_high_clipped = (coef_1 > 1 + self.epsilon_high) & (advantages > 0)
        is_region_clipped = is_low_clipped | is_high_clipped

        low_clip = masked_batch_mean(is_low_clipped.float())
        high_clip = masked_batch_mean(is_high_clipped.float())
        clip_ratio = masked_batch_mean(is_region_clipped.float())

        gathered_low_clip = gather(low_clip)
        metrics["clip_ratio/low_mean"].append(gathered_low_clip.nanmean().item())
        metrics["clip_ratio/low_min"].append(nanmin(gathered_low_clip).item())
        gathered_high_clip = gather(high_clip)
        metrics["clip_ratio/high_mean"].append(gathered_high_clip.nanmean().item())
        metrics["clip_ratio/high_max"].append(nanmax(gathered_high_clip).item())
        gathered_clip_ratio = gather(clip_ratio)
        metrics["clip_ratio/region_mean"].append(gathered_clip_ratio.nanmean().item())
    elif loss_type == "cispo":
        is_cispo_clipped = (coef_1 > self.epsilon_high) & (advantages > 0)
        cispo_clip_ratio = masked_batch_mean(is_cispo_clipped.float())
        gathered_cispo_clip_ratio = gather(cispo_clip_ratio)
        metrics["cispo_clip_ratio"].append(gathered_cispo_clip_ratio.nanmean().item())

    return loss
|
|
def _compute_loss(self, model, inputs):
    """Compute the clipped importance-weighted loss for a batch and log metrics.

    Log-probabilities and entropies for the completion tokens come from
    ``self._get_per_token_logps_and_entropies``. The per-token loss is the
    negated minimum of the unclipped and clipped importance-weighted
    advantages, optionally masked by a high-entropy mask, scaled by
    ``inputs["importance_sampling_ratio"]`` and augmented with a KL term
    weighted by ``self.beta``. The reduction depends on ``self.loss_type``.

    Args:
        model: Model forwarded to ``self._get_per_token_logps_and_entropies``.
        inputs: Mapping with ``prompt_ids``, ``prompt_mask``,
            ``completion_ids``, ``completion_mask`` and ``advantages``.
            ``ref_per_token_logps`` is required when ``self.beta != 0.0``,
            ``importance_sampling_ratio`` when vLLM importance-sampling
            correction is active, and ``num_items_in_batch`` for the
            ``"dapo"`` loss type. Further optional multimodal keys are
            forwarded as-is.

    Returns:
        The reduced loss tensor.

    Raises:
        ValueError: If ``self.importance_sampling_level`` is neither
            ``"token"`` nor ``"sequence"``, or ``self.loss_type`` is not one
            of ``"grpo"``, ``"bnpo"``, ``"dr_grpo"``, ``"dapo"``.
    """
    prompt_ids = inputs["prompt_ids"]
    prompt_mask = inputs["prompt_mask"]
    completion_ids = inputs["completion_ids"]
    completion_mask = inputs["completion_mask"]
    full_ids = torch.cat([prompt_ids, completion_ids], dim=1)
    full_mask = torch.cat([prompt_mask, completion_mask], dim=1)
    num_completion_tokens = completion_ids.size(1)

    optional_keys = (
        "pixel_values",
        "image_grid_thw",
        "num_images",
        "pixel_attention_mask",
        "image_sizes",
        "token_type_ids",
    )
    per_token_logps, entropies = self._get_per_token_logps_and_entropies(
        model,
        full_ids,
        full_mask,
        num_completion_tokens,
        compute_entropy=True,
        **{key: inputs.get(key) for key in optional_keys},
    )

    # The entropy mask is only built when a quantile below 1.0 is configured.
    entropy_mask = None
    if self.top_entropy_quantile < 1.0:
        entropy_mask = self.get_high_entropy_mask(entropies, completion_mask, 1 - self.top_entropy_quantile)

    kl_enabled = self.beta != 0.0
    if kl_enabled:
        ref_per_token_logps = inputs["ref_per_token_logps"]
        per_token_kl = (
            torch.exp(ref_per_token_logps - per_token_logps) - (ref_per_token_logps - per_token_logps) - 1
        )

    advantages = inputs["advantages"]

    # Without stored old log-probs, the detached current ones are used.
    old_per_token_logps = inputs.get("old_per_token_logps")
    if old_per_token_logps is None:
        old_per_token_logps = per_token_logps.detach()

    log_ratio = per_token_logps - old_per_token_logps
    sampling_level = self.importance_sampling_level
    if sampling_level == "token":
        log_importance_weights = log_ratio
    elif sampling_level == "sequence":
        # One weight per sequence: masked mean of the token log-ratios.
        sequence_log_ratio = (log_ratio * completion_mask).sum(-1) / completion_mask.sum(-1).clamp(min=1.0)
        log_importance_weights = sequence_log_ratio.unsqueeze(-1)
    else:
        raise ValueError(
            f"Unknown importance sampling level: {self.importance_sampling_level}. Possible values are 'token' "
            "and 'sequence'."
        )

    coef_1 = torch.exp(log_importance_weights)
    coef_2 = torch.clamp(coef_1, 1 - self.epsilon_low, 1 + self.epsilon_high)

    # Optional upper bound on the unclipped coefficient (applied after the
    # clipped copy above has been taken).
    if self.args.delta is not None:
        coef_1 = torch.clamp(coef_1, max=self.args.delta)

    column_advantages = advantages.unsqueeze(1)
    unclipped_term = coef_1 * column_advantages
    clipped_term = coef_2 * column_advantages
    per_token_loss = -torch.min(unclipped_term, clipped_term)
    if entropy_mask is not None:
        per_token_loss = per_token_loss * entropy_mask

    if self.use_vllm and self.vllm_importance_sampling_correction:
        per_token_loss = per_token_loss * inputs["importance_sampling_ratio"]

    if kl_enabled:
        per_token_loss = per_token_loss + self.beta * per_token_kl

    loss_type = self.loss_type
    if loss_type == "grpo":
        per_sequence_loss = (per_token_loss * completion_mask).sum(-1) / completion_mask.sum(-1).clamp(min=1.0)
        loss = per_sequence_loss.mean() / self.current_gradient_accumulation_steps
    elif loss_type == "bnpo":
        batch_loss = (per_token_loss * completion_mask).sum() / completion_mask.sum().clamp(min=1.0)
        loss = batch_loss / self.current_gradient_accumulation_steps
    elif loss_type == "dr_grpo":
        batch_loss = (per_token_loss * completion_mask).sum() / (per_token_loss.size(0) * self.max_completion_length)
        loss = batch_loss / self.current_gradient_accumulation_steps
    elif loss_type == "dapo":
        # This branch is not divided by the gradient-accumulation steps.
        normalizer = inputs["num_items_in_batch"] / self.accelerator.num_processes
        loss = (per_token_loss * completion_mask).sum() / normalizer
    else:
        raise ValueError(f"Unknown loss type: {self.loss_type}")

    # ---- metrics -----------------------------------------------------
    mode = "train" if self.model.training else "eval"
    mode_metrics = self._metrics[mode]
    gather = self.accelerator.gather

    completion_token_count = completion_mask.sum().clamp(min=1.0)

    def masked_batch_mean(x):
        # Width-1 tensors are averaged directly; others over masked tokens.
        if x.shape[1] != 1:
            return (x * completion_mask).sum() / completion_token_count
        return x.mean()

    if kl_enabled:
        mean_kl = masked_batch_mean(per_token_kl)
        mode_metrics["kl"].append(gather(mean_kl).nanmean().item())

    mean_entropy = masked_batch_mean(entropies)
    mode_metrics["entropy"].append(gather(mean_entropy).nanmean().item())

    # Clip statistics use `coef_1` as it stands after the optional delta clamp.
    below_range = (coef_1 < 1 - self.epsilon_low) & (column_advantages < 0)
    above_range = (coef_1 > 1 + self.epsilon_high) & (column_advantages > 0)
    outside_range = below_range | above_range

    low_clip = masked_batch_mean(below_range.float())
    high_clip = masked_batch_mean(above_range.float())
    clip_ratio = masked_batch_mean(outside_range.float())

    gathered_low = gather(low_clip)
    mode_metrics["clip_ratio/low_mean"].append(gathered_low.nanmean().item())
    mode_metrics["clip_ratio/low_min"].append(nanmin(gathered_low).item())
    gathered_high = gather(high_clip)
    mode_metrics["clip_ratio/high_mean"].append(gathered_high.nanmean().item())
    mode_metrics["clip_ratio/high_max"].append(nanmax(gathered_high).item())
    gathered_region = gather(clip_ratio)
    mode_metrics["clip_ratio/region_mean"].append(gathered_region.nanmean().item())
    return loss
|
|
def prediction_step(self, model, inputs, prediction_loss_only, ignore_keys: Optional[list[str]] = None):
    """Run one evaluation step and return only the loss.

    ``inputs`` is first passed through ``self._prepare_inputs``. The loss is
    then computed with gradients disabled, inside
    ``self.compute_loss_context_manager()``.

    Args:
        model: Model forwarded to ``self.compute_loss``.
        inputs: Raw batch handed to ``self._prepare_inputs``.
        prediction_loss_only: Not used by this implementation.
        ignore_keys: Not used by this implementation.

    Returns:
        A 3-tuple ``(loss, None, None)`` where ``loss`` is the detached mean
        of the value returned by ``self.compute_loss``.
    """
    prepared_inputs = self._prepare_inputs(inputs)
    with torch.no_grad():
        with self.compute_loss_context_manager():
            raw_loss = self.compute_loss(model, prepared_inputs)
        # Reduced outside the loss context manager but still without gradients.
        reduced_loss = raw_loss.mean().detach()
    return reduced_loss, None, None
|
|
def log(self, logs: dict[str, float], start_time: Optional[float] = None) -> None:
    """Merge the averaged metrics of the current mode into ``logs`` and report them.

    Every list stored in ``self._metrics[mode]`` is averaged; in eval mode the
    resulting keys are prefixed with ``"eval_"``. The merged dictionary is
    passed to the parent ``log`` and the per-mode metric store is cleared.
    On the main process, when ``self.log_completions`` is set, the logged
    prompts/completions are additionally printed (if rich is available) and
    uploaded as a wandb table (if wandb reporting is active).

    Args:
        logs: Values to log; entries from the averaged metrics override
            entries with the same key.
        start_time: Forwarded unchanged to the parent ``log``.
    """
    mode = "train" if self.model.training else "eval"
    averaged = {name: sum(values) / len(values) for name, values in self._metrics[mode].items()}
    if mode == "eval":
        # Eval metrics carry the "eval_" prefix.
        averaged = {f"eval_{name}": value for name, value in averaged.items()}

    super().log({**logs, **averaged}, start_time)
    self._metrics[mode].clear()

    # Completion reporting only happens on the main process and on request.
    if not (self.accelerator.is_main_process and self.log_completions):
        return

    samples = self._logs
    if is_rich_available():
        print_prompt_completions_sample(
            samples["prompt"],
            samples["completion"],
            samples["rewards"],
            samples["advantages"],
            self.state.global_step,
            self.num_completions_to_print,
        )

    report_to = self.args.report_to
    if not (report_to and "wandb" in report_to and wandb.run is not None):
        return

    import pandas as pd

    prompts = samples["prompt"]
    table = {
        "step": [str(self.state.global_step)] * len(prompts),
        "prompt": prompts,
        "completion": samples["completion"],
        **samples["rewards"],
        "advantage": samples["advantages"],
    }

    logged_images = samples["images"]
    if logged_images:
        # One list of wandb.Image objects per logged image list.
        table["images"] = [[wandb.Image(image) for image in image_list] for image_list in logged_images]

    frame = pd.DataFrame(table)
    if self.wandb_log_unique_prompts:
        frame = frame.drop_duplicates(subset=["prompt"])
    wandb.log({"completions": wandb.Table(dataframe=frame)})
|
|
| |
def _save_checkpoint(self, model, trial):
    """Create the model card, then delegate checkpoint saving to the parent class.

    The model name is the last path component of ``self.args.output_dir``
    when ``self.args.hub_model_id`` is ``None``; otherwise it is the part of
    ``hub_model_id`` after the final ``"/"``.

    Args:
        model: Forwarded unchanged to the parent ``_save_checkpoint``.
        trial: Forwarded unchanged to the parent ``_save_checkpoint``.
    """
    hub_model_id = self.args.hub_model_id
    model_name = (
        Path(self.args.output_dir).name
        if hub_model_id is None
        else hub_model_id.split("/")[-1]
    )
    self.create_model_card(model_name=model_name)
    super()._save_checkpoint(model, trial)
class UnslothGRPOTrainer(_UnslothGRPOTrainer):
    """

    Trainer for the Group Relative Policy Optimization (GRPO) method. This algorithm was initially proposed in the
    paper [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language
    Models](https://huggingface.co/papers/2402.03300).

    Example:

    ```python
    from datasets import load_dataset
    from trl import GRPOTrainer

    dataset = load_dataset("trl-lib/tldr", split="train")


    def reward_func(completions, **kwargs):
        # Dummy reward function that rewards completions with more unique letters.
        return [float(len(set(completion))) for completion in completions]


    trainer = GRPOTrainer(
        model="Qwen/Qwen2-0.5B-Instruct",
        reward_funcs=reward_func,
        train_dataset=dataset,
    )

    trainer.train()
    ```

    Args:
        model (`Union[str, PreTrainedModel]`):
            Model to be trained. Can be either:

            - A string, being the *model id* of a pretrained model hosted inside a model repo on huggingface.co, or a
              path to a *directory* containing model weights saved using
              [`~transformers.PreTrainedModel.save_pretrained`], e.g., `'./my_model_directory/'`. The model is loaded
              using [`~transformers.AutoModelForCausalLM.from_pretrained`] with the keyword arguments in
              `args.model_init_kwargs`.
            - A [`~transformers.PreTrainedModel`] object. Only causal language models are supported.
        reward_funcs (`Union[RewardFunc, list[RewardFunc]]`):
            Reward functions to be used for computing the rewards. To compute the rewards, we call all the reward
            functions with the prompts and completions and sum the rewards. Can be either:

            - A single reward function, such as:
                - A string: The *model ID* of a pretrained model hosted inside a model repo on huggingface.co, or a
                  path to a *directory* containing model weights saved using
                  [`~transformers.PreTrainedModel.save_pretrained`], e.g., `'./my_model_directory/'`. The model is loaded
                  using [`~transformers.AutoModelForSequenceClassification.from_pretrained`] with `num_labels=1` and the
                  keyword arguments in `args.model_init_kwargs`.
                - A [`~transformers.PreTrainedModel`] object: Only sequence classification models are supported.
                - A custom reward function: The function is provided with the prompts and the generated completions,
                  plus any additional columns in the dataset. It should return a list of rewards. Custom reward
                  functions can also return `None` when the reward is not applicable to those samples. This is useful
                  for multi-task training where different reward functions apply to different types of samples. When a
                  reward function returns `None` for a sample, that reward function is excluded from the reward
                  calculation for that sample. For more details, see [Using a custom reward
                  function](#using-a-custom-reward-function).

                  The trainer's state is also passed to the reward function. The trainer's state is an instance of
                  [`~transformers.TrainerState`] and can be accessed by accessing the `trainer_state` argument to the
                  reward function's signature.
            - A list of reward functions, where each item can independently be any of the above types. Mixing different
              types within the list (e.g., a string model ID and a custom reward function) is allowed.
        args ([`GRPOConfig`], *optional*):
            Configuration for this trainer. If `None`, a default configuration is used.
        train_dataset ([`~datasets.Dataset`] or [`~datasets.IterableDataset`]):
            Dataset to use for training. It must include a column `"prompt"`. Any additional columns in the dataset are
            ignored. The format of the samples can be either:

            - [Standard](dataset_formats#standard): Each sample contains plain text.
            - [Conversational](dataset_formats#conversational): Each sample contains structured messages (e.g., role
              and content).
        eval_dataset ([`~datasets.Dataset`], [`~datasets.IterableDataset`] or `dict[str, Union[Dataset, IterableDataset]]`):
            Dataset to use for evaluation. It must meet the same requirements as `train_dataset`.
        processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.ProcessorMixin`], *optional*):
            Processing class used to process the data. The padding side must be set to "left". If `None`, the
            processing class is loaded from the model's name with [`~transformers.AutoProcessor.from_pretrained`]. A
            padding token, `tokenizer.pad_token`, must be set. If the processing class has not set a padding token,
            `tokenizer.eos_token` will be used as the default.
        reward_processing_classes ([`~transformers.PreTrainedTokenizerBase`] or `list[PreTrainedTokenizerBase]`, *optional*):
            Processing classes corresponding to the reward functions specified in `reward_funcs`. Can be either:

            - A single processing class: Used when `reward_funcs` contains only one reward function.
            - A list of processing classes: Must match the order and length of the reward functions in `reward_funcs`.
            If set to `None`, or if an element of the list corresponding to a [`~transformers.PreTrainedModel`] is
            `None`, the tokenizer for the model is automatically loaded using
            [`~transformers.AutoTokenizer.from_pretrained`]. For elements in `reward_funcs` that are custom reward
            functions (not [`~transformers.PreTrainedModel`]), the corresponding entries in `reward_processing_classes`
            are ignored.
        callbacks (list of [`~transformers.TrainerCallback`], *optional*):
            List of callbacks to customize the training loop. Will add those to the list of default callbacks detailed
            in [here](https://huggingface.co/docs/transformers/main_classes/callback).

            If you want to remove one of the default callbacks used, use the [`~transformers.Trainer.remove_callback`]
            method.
        optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`, *optional*, defaults to `(None, None)`):
            A tuple containing the optimizer and the scheduler to use. Will default to an instance of [`AdamW`] on your
            model and a scheduler given by [`get_linear_schedule_with_warmup`] controlled by `args`.
        peft_config ([`~peft.PeftConfig`], *optional*):
            PEFT configuration used to wrap the model. If `None`, the model is not wrapped.

    """
    def __init__(
        self,
        model,
        reward_funcs,
        args = None,
        train_dataset = None,
        eval_dataset = None,
        processing_class = None,
        reward_processing_classes = None,
        callbacks = None,
        peft_config = None,
        **kwargs
    ):
        """Normalise ``args`` for the given model, then build the parent trainer.

        Before the parent constructor runs, this method reconciles the
        fp16/bf16 flags on ``args`` with the model dtype and the
        ``UNSLOTH_*`` environment variables, adjusts evaluation settings and
        ``max_seq_length``, sets the padding side of ``processing_class`` and
        registers the reward metric names. Afterwards it removes the NEFTune
        hook handle (if any), propagates the accelerator scaler down the
        ``.model`` chain, wraps ``train`` and copies a missing chat template
        onto the vLLM tokenizer. Extra keyword arguments are forwarded to the
        parent constructor unchanged.

        Raises:
            TypeError: If the requested fp16/bf16 flag contradicts the model
                dtype and float32 training is not forced.
        """
        if args is None: args = UnslothGRPOConfig()

        # ---- training precision ----------------------------------------
        use_bf16 = getattr(args, 'bf16', False)
        if not isinstance(use_bf16, bool): use_bf16 = False
        use_fp16 = getattr(args, 'fp16', False)
        if not isinstance(use_fp16, bool): use_fp16 = False
        full_finetuning = os.environ.get('UNSLOTH_ENABLE_FULL_FINETUNING', '0') == '1'
        force_float32 = (not full_finetuning) and os.environ.get('UNSLOTH_FORCE_FLOAT32', '0') == '1'
        if force_float32:
            print('Unsloth: Switching to float32 training since model cannot work with float16')
        mixed_precision_dtype = os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32')
        dtype = getattr(model.config, 'dtype', None) or getattr(model.config, 'torch_dtype', None)
        if dtype is None: dtype = model.get_input_embeddings().weight.dtype
        from unsloth_zoo.utils import _get_dtype
        dtype = _get_dtype(dtype)
        float16 = dtype == torch.float16
        if not force_float32 and (float16 and use_bf16): raise TypeError('Unsloth: Model is in float16 precision but you want to use bfloat16 precision. Set fp16 to `True` and bf16 to `False`')
        if not force_float32 and (not float16 and use_fp16): raise TypeError('Unsloth: Model is in bfloat16 precision but you want to use float16 precision. Set fp16 to `False` and bf16 to `True`')

        def set_mixed_precision(fp16, bf16, mode):
            # Keep `args`, the Accelerate env var and (if present)
            # `args.mixed_precision` in agreement.
            args.fp16 = fp16
            args.bf16 = bf16
            os.environ['ACCELERATE_MIXED_PRECISION'] = mode
            if hasattr(args, 'mixed_precision'): args.mixed_precision = mode

        if force_float32:
            set_mixed_precision(False, False, 'no')
        elif (not use_bf16 and not use_fp16) and mixed_precision_dtype == 'float32':
            # Neither flag requested: follow the model dtype.
            set_mixed_precision(float16, not float16, 'fp16' if float16 else 'bf16')
        elif mixed_precision_dtype == 'bfloat16':
            set_mixed_precision(False, False, 'no')

        # ---- evaluation settings ---------------------------------------
        # NOTE(review): this inspects `args.eval_dataset`, not the
        # `eval_dataset` argument of this constructor - confirm which one is
        # intended before changing it.
        if getattr(args, 'eval_dataset', None) is not None and getattr(args, 'eval_strategy', 'no') == 'no':
            args.eval_strategy = 'steps'
            if getattr(args, 'eval_steps', None) is None: args.eval_steps = 0.1
        ga_steps = getattr(args, 'gradient_accumulation_steps', None)
        if ga_steps is not None and ga_steps > 1:
            from transformers import __version__ as transformers_version
            if Version(transformers_version) <= Version('4.45.2'):
                print('**** Unsloth: Please use our fixed gradient_accumulation_steps by updating transformers, TRL and Unsloth!\n'
                      '`pip install --upgrade --no-cache-dir --force-reinstall --no-deps unsloth transformers trl unsloth_zoo`')
        if getattr(args, 'eval_strategy', 'no') != 'no':
            eval_bsz = getattr(args, 'per_device_eval_batch_size', 8)
            # Only an eval batch size of exactly 8 is lowered to the train batch size.
            if eval_bsz == 8 and args.per_device_train_batch_size < eval_bsz: args.per_device_eval_batch_size = args.per_device_train_batch_size
            if getattr(args, 'eval_accumulation_steps', None) is None and ga_steps is not None: args.eval_accumulation_steps = ga_steps
        fp16_full_eval = getattr(args, 'fp16_full_eval', False)
        if not isinstance(fp16_full_eval, bool): fp16_full_eval = False
        bf16_full_eval = getattr(args, 'bf16_full_eval', False)
        if not isinstance(bf16_full_eval, bool): bf16_full_eval = False
        # A full-eval dtype that contradicts the training dtype is flipped.
        if args.fp16 and bf16_full_eval:
            args.bf16_full_eval = False
            args.fp16_full_eval = True
        if args.bf16 and fp16_full_eval:
            args.bf16_full_eval = True
            args.fp16_full_eval = False
        if force_float32:
            args.bf16_full_eval = False
            args.fp16_full_eval = False
        elif mixed_precision_dtype == 'bfloat16':
            args.bf16_full_eval = True
            args.fp16_full_eval = False
        elif not bf16_full_eval and not fp16_full_eval:
            args.bf16_full_eval = args.bf16
            args.fp16_full_eval = args.fp16

        # ---- sequence length -------------------------------------------
        if hasattr(args, 'max_seq_length'):
            model_max_seq_length = getattr(model, 'max_seq_length', None)
            args_max_seq_length = args.max_seq_length
            if model_max_seq_length is not None:
                if args_max_seq_length is None:
                    args.max_seq_length = model_max_seq_length
                elif args_max_seq_length > model_max_seq_length:
                    print('Unsloth: You set `max_seq_length` as ' + str(args_max_seq_length) + ' but '
                          'the maximum the model supports is ' + str(model_max_seq_length) + '. We shall reduce it.')
                    args.max_seq_length = model_max_seq_length

        # ---- model / tokenizer preparation -----------------------------
        if model is not None and hasattr(model, 'for_training'):
            model.for_training(use_gradient_checkpointing=getattr(args, 'gradient_checkpointing', True))
        # NOTE(review): the class docstring says the padding side must be
        # "left"; confirm that forcing 'right' here is intended.
        if hasattr(processing_class, 'padding_side'): processing_class.padding_side = 'right'
        if hasattr(processing_class, 'tokenizer') and hasattr(processing_class.tokenizer, 'padding_side'): processing_class.tokenizer.padding_side = 'right'

        # ---- reward metric names ---------------------------------------
        other_metrics = []
        _reward_funcs = reward_funcs if isinstance(reward_funcs, list) else [reward_funcs]
        for reward_func in _reward_funcs:
            try:
                reward_func_name = reward_func.__name__
            except AttributeError:
                # Entries without a `__name__` (e.g. a model id string)
                # contribute no metric names.
                continue
            other_metrics.append(f'rewards/{reward_func_name}/mean')
            other_metrics.append(f'rewards/{reward_func_name}/std')

        from unsloth_zoo.logging_utils import PatchRLStatistics
        PatchRLStatistics('grpo_trainer', other_metrics)

        # Not distributed but several GPUs visible: pin `args._n_gpu` to 1.
        if getattr(args, "parallel_mode", None) == ParallelMode.NOT_DISTRIBUTED and args.n_gpu > 1:
            if getattr(args, "_n_gpu", 1) != 1:
                args._n_gpu = 1
        # Training mode is requested once more right before the parent constructor.
        if hasattr(model, "for_training"):
            model.for_training(use_gradient_checkpointing=getattr(args, 'gradient_checkpointing', True))
        super().__init__(
            model = model,
            reward_funcs = reward_funcs,
            args = args,
            train_dataset = train_dataset,
            eval_dataset = eval_dataset,
            processing_class = processing_class,
            reward_processing_classes = reward_processing_classes,
            callbacks = callbacks,
            peft_config = peft_config,
            **kwargs
        )
        if hasattr(model, "for_inference"):
            model.for_inference()

        # ---- post-construction patches ---------------------------------
        if hasattr(self, 'neftune_hook_handle'):
            self.neftune_hook_handle.remove()
            del self.neftune_hook_handle
        if getattr(args, 'neftune_noise_alpha', None) is not None:
            model.get_input_embeddings().neftune_noise_alpha = self.neftune_noise_alpha

        if hasattr(self, 'accelerator'):
            # Attach the scaler to every object along the `.model` chain,
            # including the innermost one.
            scaler = self.accelerator.scaler
            current_model = model
            while hasattr(current_model, 'model'):
                current_model.accelerator_scaler = scaler
                current_model = current_model.model
            current_model.accelerator_scaler = scaler

        if hasattr(self, 'train'):
            self.train = MethodType(prepare_for_training_mode(self.__class__.train), self)

        if hasattr(self, 'llm') and self.llm is not None and hasattr(self.llm, 'get_tokenizer'):
            _vllm_tok = self.llm.get_tokenizer()
            _pc = getattr(self, 'processing_class', None) or getattr(self, 'tokenizer', None)
            # Copy the chat template only when the vLLM tokenizer has none.
            if _vllm_tok is not None and _pc is not None and getattr(_pc, 'chat_template', None) is not None and getattr(_vllm_tok, 'chat_template', None) is None:
                _vllm_tok.chat_template = _pc.chat_template
|
|
|
|
if hasattr(logger, "addFilter"):
    import logging

    class HideLoggingMessage(logging.Filter):
        """Logging filter that drops every record whose message contains ``text``."""

        def __init__(self, text):
            # Initialise the base filter so its own attributes exist as well.
            super().__init__()
            self.text = text

        def filter(self, x):
            # A falsy return value makes the logging machinery discard the record.
            return self.text not in x.getMessage()

    # Silence the messages that mention `use_cache=True`.
    logger.addFilter(HideLoggingMessage("`use_cache=True`"))
|
|
|
|