| """ |
| 2026.2.1 |
| 2026.2.1 |
| 5.2.0 |
| 0.24.0 |
| __UNSLOTH_VERSIONING__ |
| """ |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| from torch import Tensor |
| import torch |
| import torch.nn as nn |
| from torch.nn import functional as F |
| from unsloth_zoo.temporary_patches.common import torch_compile |
| from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable |
from trl.trainer.rloo_trainer import (Any, AutoConfig, AutoModelForSequenceClassification, AutoProcessor, AutoTokenizer, BaseTrainer, DataLoader, Dataset, FSDP, GenerationConfig, GuidedDecodingParams, IterableDataset, LLM, Optional, Path, PeftConfig, PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin, RLOOConfig, RLOOTrainer, RepeatSampler, RewardFunc, Sampler, SamplingParams, SyncRefModelCallback, TrainerCallback, Union, VLLMClient, apply_chat_template, broadcast_object_list, datasets, defaultdict, deque, disable_dropout_in_model, ensure_master_addr_port, entropy_from_logits, gather, gather_object, identity, inspect, is_conversational, is_datasets_available, is_flash_attn_2_available, is_peft_model, is_rich_available, is_vllm_available, logger, logging, maybe_apply_chat_template, nanmax, nanmin, nanstd, nn, nullcontext, os, pad, partial, prepare_deepspeed, prepare_fsdp, prepare_multimodal_messages, print_prompt_completions_sample, profiling_context, profiling_decorator, seed_worker, selective_log_softmax, set_seed, shuffle_sequence_dict, split_pixel_values_by_grid, split_tensor_dict, textwrap, torch, transformers, unsplit_pixel_values_by_grid, unwrap_model_for_generation, warnings)
|
|
|
|
| import os |
| from typing import * |
| from dataclasses import dataclass, field |
| from packaging.version import Version |
| import torch |
| import numpy as np |
| from contextlib import nullcontext |
| from torch.nn import functional as F |
| import inspect |
| from transformers import DataCollatorForSeq2Seq, DataCollatorForLanguageModeling as TransformersDataCollatorForLanguageModeling |
| from transformers.training_args import ParallelMode |
| from unsloth_zoo.device_type import DEVICE_TYPE, device_synchronize |
|
|
| |
| |
| import functools |
| from types import MethodType |
try:
    from unsloth_zoo.gradient_checkpointing import reset_unsloth_gradient_checkpointing_buffers
except ImportError:
    def reset_unsloth_gradient_checkpointing_buffers(): pass
def prepare_for_training_mode(f):
    """Wraps a trainer entrypoint so the model is switched into training mode
    (with gradient checkpointing) before the call and restored afterwards."""
    @functools.wraps(f)
    def wrapper(self, *args, **kwargs):
        # Remember whether the model started in training or inference mode
        _was_training = None
        use_gc = getattr(self.args, 'gradient_checkpointing', True)
        if hasattr(self, 'model') and hasattr(self.model, "training"):
            _was_training = self.model.training
        if hasattr(self, 'model') and hasattr(self.model, "for_training"):
            self.model.for_training(use_gradient_checkpointing=use_gc)
        output = f(self, *args, **kwargs)
        # Restore the model to its pre-call mode
        if hasattr(self, 'model') and hasattr(self.model, "for_inference"):
            if _was_training is False:
                self.model.for_inference()
            elif _was_training is True and hasattr(self.model, "for_training"):
                self.model.for_training(use_gradient_checkpointing=use_gc)
        # Release Unsloth's gradient checkpointing buffers
        try:
            reset_unsloth_gradient_checkpointing_buffers()
        except Exception:
            pass
        # Finish any active wandb run (no-op if wandb is absent or idle)
        try:
            import wandb
            wandb.finish()
        except Exception:
            pass
        return output
    return wrapper
pass
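# An illustrative sketch (hypothetical trainer subclass, not used anywhere in
# this module): `prepare_for_training_mode` is meant to wrap trainer
# entrypoints such as `train`, e.g.
#
#     class MyRLOOTrainer(RLOOTrainer):
#         @prepare_for_training_mode
#         def train(self, *args, **kwargs):
#             return super().train(*args, **kwargs)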
|
|
| torch_compile_options = { |
| "epilogue_fusion" : True, |
| "max_autotune" : False, |
| "shape_padding" : True, |
| "trace.enabled" : False, |
| "triton.cudagraphs" : False, |
| } |
|
|
| @torch.compile(dynamic = True, fullgraph = True, options = torch_compile_options,) |
| def chunked_hidden_states_selective_log_softmax( |
| hidden_states: torch.Tensor, |
| lm_head: torch.Tensor, |
| index: torch.Tensor, |
| chunks: int = 4, |
| logit_scale_multiply: float = 0.0, |
| logit_scale_divide: float = 0.0, |
| logit_softcapping: float = 0.0, |
| temperature: float = 1.0, |
| ) -> torch.Tensor: |
| |
    """
    Computes per-token log-probabilities log p(index | hidden_states) without
    materializing the full (batch * seq, vocab) logits at once: rows are split
    into `chunks` pieces, and each chunk's logits are built, adjusted
    (scaling / softcapping / temperature) and reduced before the next chunk.
    """
    # Flatten (batch, seq, hidden) -> (batch * seq, hidden) so rows can be chunked
    flat_hidden_states = hidden_states.reshape(-1, hidden_states.shape[-1])
    flat_index = index.reshape(-1)
|
|
| chunked_hidden_states = torch.chunk(flat_hidden_states, chunks=chunks, dim=0) |
| chunked_index = torch.chunk(flat_index, chunks=chunks, dim=0) |
|
|
| all_per_token_logps = [] |
|
|
| for chunk_hidden_states, chunk_index in zip(chunked_hidden_states, chunked_index): |
| chunk_logits = chunk_hidden_states.to(lm_head.dtype) @ lm_head.t() |
|
|
| if logit_scale_multiply != 0.0: |
| chunk_logits = chunk_logits * logit_scale_multiply |
| if logit_scale_divide != 0.0: |
| chunk_logits = chunk_logits / logit_scale_divide |
| if logit_softcapping != 0.0: |
| chunk_logits = chunk_logits * torch.tanh(chunk_logits / logit_softcapping) |
|
|
| chunk_logits = chunk_logits.to(torch.float32) |
|
|
| if temperature != 1.0: |
| chunk_logits = chunk_logits / temperature |
|
|
| selected_logits = torch.gather(chunk_logits, dim=-1, index=chunk_index.unsqueeze(-1)).squeeze(-1) |
| logsumexp_values = torch.logsumexp(chunk_logits, dim=-1) |
| per_token_logps = selected_logits - logsumexp_values |
| all_per_token_logps.append(per_token_logps) |
|
|
| all_per_token_logps = torch.concat(all_per_token_logps) |
|
|
| all_per_token_logps = all_per_token_logps.reshape((hidden_states.shape[0], hidden_states.shape[1])) |
| return all_per_token_logps |
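# A minimal shape sketch (illustrative values; defined for documentation and
# never called by this module):
def _example_chunked_hidden_states_logps():  # pragma: no cover
    """Illustrative only: shows the expected input and output shapes."""
    hidden_states = torch.randn(2, 8, 64)          # (batch, seq, hidden)
    lm_head       = torch.randn(100, 64)           # (vocab, hidden)
    index         = torch.randint(0, 100, (2, 8))  # token ids to score
    logps = chunked_hidden_states_selective_log_softmax(hidden_states, lm_head, index)
    return logps  # shape (2, 8): one log-probability per token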
|
|
| @torch.compile(dynamic = True, fullgraph = True, options = torch_compile_options,) |
| def chunked_selective_log_softmax(logits, index): |
| |
    """
    Same as `chunked_hidden_states_selective_log_softmax`, but starting from
    precomputed logits: per-token log p(index) is computed in 4 row chunks to
    cap peak memory.
    """
    chunked_logits = torch.chunk(logits.reshape(-1, logits.shape[-1]), chunks = 4, dim = 0)
    chunked_index = torch.chunk(index.reshape(-1), chunks = 4, dim = 0)
| all_per_token_logps = [] |
| |
| for chunk_logits, chunk_index in zip(chunked_logits, chunked_index): |
| chunk_logits = chunk_logits.to(torch.float32) |
| selected_logits = torch.gather(chunk_logits, dim = -1, index = chunk_index.unsqueeze(-1)).squeeze(-1) |
| logsumexp_values = torch.logsumexp(chunk_logits, dim = -1) |
| per_token_logps = selected_logits - logsumexp_values |
| all_per_token_logps.append(per_token_logps) |
| pass |
| all_per_token_logps = torch.concat(all_per_token_logps) |
| all_per_token_logps = all_per_token_logps.reshape((logits.shape[0], logits.shape[1])) |
| return all_per_token_logps |
|
|
| def calculate_pad_tokens_in_prompt( |
| input_ids: torch.Tensor, |
| logits_to_keep: int, |
| pad_token_id: int |
| ) -> torch.Tensor: |
| """ |
| Given prompt tensor, it returns all the left padded tokens in that sequence. so [pad, pad, pad, cat] = 3 tokens |
| """ |
| if logits_to_keep >= input_ids.shape[1]: |
| raise ValueError("logits_to_keep must be smaller than the sequence length.") |
|
|
| prompt_section = input_ids[:, :-logits_to_keep] |
|
|
| padding_mask = (prompt_section == pad_token_id) |
|
|
| pad_token_counts = padding_mask.sum(dim=1) |
|
|
| return pad_token_counts |
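# A minimal worked example (illustrative ids; defined for documentation and
# never called by this module):
def _example_calculate_pad_tokens():  # pragma: no cover
    """Illustrative only: pad_token_id = 0, logits_to_keep = 2."""
    input_ids = torch.tensor([[0, 0, 7, 9, 5],
                              [0, 3, 4, 9, 5]])
    # The prompt section is the first 3 columns -> 2 pads in row 0, 1 in row 1
    return calculate_pad_tokens_in_prompt(input_ids, logits_to_keep = 2, pad_token_id = 0)  # tensor([2, 1])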
|
|
| def create_completion_attention_mask( |
| completion_input_ids: torch.Tensor, |
| left_pad_tokens_per_prompt: torch.Tensor, |
| max_left_pad: int, |
| pad_token_id: int |
| ) -> torch.Tensor: |
| """ |
| Given that we have a sequence, [p,p,p,c,c,c,pad,pad,pad] |
| |
| Where p are extra prompt tokens we got from slicing the torch tensor, c is completion tokens |
| and pad are pad tokens, this function would make a completion mask that would 0 out the pad |
| and p tokens. so in this example [0,0,0,1,1,1,0,0,0] |
| """ |
| batch_size, completion_len = completion_input_ids.shape |
| device = completion_input_ids.device |
|
|
| num_tokens_to_mask = max_left_pad - left_pad_tokens_per_prompt |
|
|
| indices = torch.arange(completion_len, device=device).unsqueeze(0) |
| shift_mask = indices >= num_tokens_to_mask.unsqueeze(1) |
|
|
| non_padding_mask = (completion_input_ids != pad_token_id) |
|
|
| final_mask = shift_mask & non_padding_mask |
|
|
| return final_mask |
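# A minimal worked example (illustrative ids; defined for documentation and
# never called by this module):
def _example_completion_attention_mask():  # pragma: no cover
    """Illustrative only: pad_token_id = 0, max_left_pad = 2."""
    completion_ids = torch.tensor([[9, 9, 5, 5, 0, 0]])
    left_pads = torch.tensor([0])  # this prompt had no left padding
    # The first 2 positions are leftover prompt tokens; trailing zeros are pads
    return create_completion_attention_mask(completion_ids, left_pads, max_left_pad = 2, pad_token_id = 0)
    # -> tensor([[False, False, True, True, False, False]])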
|
|
def left_pack_padding(tensor: torch.Tensor, pad_id: int) -> torch.Tensor:
    """
    Packs the non-padding tokens of each sequence to the left, moving all
    padding tokens to the right, e.g. [pad, 7, pad, 9] -> [7, 9, pad, pad].
    """
| mask = (tensor != pad_id) |
| |
| sorted_indices = torch.argsort(mask, dim=1, descending=True, stable=True) |
| packed_tensor = torch.gather(tensor, 1, sorted_indices) |
| return packed_tensor |
|
|
| def align_logprobs_with_mask( |
| logprob_tensor: torch.Tensor, |
| attention_mask: torch.Tensor, |
| pad_value: float = 0.0 |
| ) -> torch.Tensor: |
| """ |
| Aligns a log probability tensor with a given attention mask. |
| """ |
|
|
| device = logprob_tensor.device |
| batch_size, logprob_seq_len = logprob_tensor.shape |
| mask_seq_len = attention_mask.shape[1] |
|
|
| padded_logprobs = torch.full( |
| attention_mask.shape, |
| fill_value=pad_value, |
| dtype=logprob_tensor.dtype, |
| device=device |
| ) |
|
|
    # argmax over a 0/1 mask returns the first attended position, i.e. the
    # number of left-padding tokens in each row
    left_pad_counts = torch.argmax(attention_mask, dim=1)
|
|
| cols = torch.arange(logprob_seq_len, device=device) |
| dest_indices = left_pad_counts.unsqueeze(1) + cols |
|
|
| |
| |
| row_indices = torch.arange(batch_size, device=device).unsqueeze(1).expand_as(dest_indices) |
|
|
| |
| |
| |
| valid_mask = dest_indices < mask_seq_len |
|
|
| |
| |
| |
| valid_rows = row_indices[valid_mask] |
| valid_cols = dest_indices[valid_mask] |
| valid_vals = logprob_tensor[valid_mask] |
|
|
| |
| |
| padded_logprobs[valid_rows, valid_cols] = valid_vals |
|
|
| return padded_logprobs |
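# A minimal worked example (illustrative values; defined for documentation and
# never called by this module):
def _example_align_logprobs():  # pragma: no cover
    """Illustrative only: realigns packed logprobs to an attention mask."""
    logprobs = torch.tensor([[-0.5, -1.0]])
    attention_mask = torch.tensor([[0, 0, 1, 1]])
    # The first attended position is index 2, so values are scattered there
    return align_logprobs_with_mask(logprobs, attention_mask)  # [[0.0, 0.0, -0.5, -1.0]]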
|
|
| def autotune_batch_and_chunks( |
| total_input_rows, |
| seq_len, |
| hidden_size, |
| vocab_size, |
| dtype_bytes=16, |
| multiplier=None |
| ): |
    """
    Chooses the largest row batch `final_b` (and a logit chunk multiplier
    `final_m`) whose estimated hidden-state plus chunked-logit memory fits
    within roughly 80% of the currently free accelerator memory.
    """
    if multiplier is None:
        # Use more logit chunks for longer sequences, with a floor of 4
        final_m = max(4, seq_len // 4096)
    else:
        final_m = multiplier
|
|
    if torch.cuda.is_available():
        free_bytes, _ = torch.cuda.mem_get_info()
        limit_gb = (free_bytes / (1024**3)) * 0.80
    elif hasattr(torch, "xpu") and torch.xpu.is_available():
        # Estimate free XPU memory as the device total minus reserved memory
        total_mem = torch.xpu.get_device_properties(0).total_memory
        reserved_mem = torch.xpu.memory_reserved()
        free_bytes = total_mem - reserved_mem
        limit_gb = (free_bytes / (1024**3)) * 0.80
    else:
        # No accelerator detected; assume a conservative 8 GB budget
        limit_gb = 8.0
|
|
| bytes_to_gb = 1024**3 |
|
|
| b_vals = torch.arange(total_input_rows, 0, -1, device='cpu', dtype=torch.float32) |
|
|
| hidden_gb = (b_vals * seq_len * hidden_size * dtype_bytes) / bytes_to_gb |
|
|
| base_logits = ((b_vals/total_input_rows) * b_vals * seq_len * vocab_size * dtype_bytes) / bytes_to_gb |
| logits_gb = base_logits / final_m |
|
|
| total_mem_gb = hidden_gb + logits_gb |
|
|
| valid_mask = total_mem_gb <= limit_gb |
| valid_indices = torch.nonzero(valid_mask, as_tuple=False) |
|
|
    if valid_indices.shape[0] == 0:
        # Nothing fits within the budget; fall back to a small batch of 4 rows
        return 4, final_m
|
|
| best_idx = valid_indices[0].item() |
| final_b = int(b_vals[best_idx].item()) |
|
|
| return final_b, final_m |
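# A worked example of the estimate above (illustrative numbers): with
# total_input_rows = 8, seq_len = 4096, hidden_size = 4096, vocab_size = 128256
# and the default dtype_bytes = 16, the first candidate b = 8 costs
# 8 * 4096 * 4096 * 16 / 2**30 = 2.0 GB of hidden states plus
# (8/8) * 8 * 4096 * 128256 * 16 / 2**30 ~= 62.6 GB of logits, divided across
# final_m = 4 chunks -> ~15.7 GB; b is walked downwards from there until the
# total fits the ~80% free-memory budget.
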
def vLLMSamplingParams(**kwargs):
    """
    Builds a vLLM `SamplingParams` object and stores the raw kwargs on it as
    `_set_kwargs`, so they can be recovered and merged later.
    """
    from vllm import SamplingParams

    sampling_params = SamplingParams(**kwargs)
    sampling_params._set_kwargs = kwargs
    return sampling_params
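# A minimal usage sketch (illustrative values): the result is passed to the
# config class below via `vllm_sampling_params`, e.g.
#
#     sampling_params = vLLMSamplingParams(min_p = 0.1, seed = 3407)
#     # UnslothRLOOConfig(..., vllm_sampling_params = sampling_params)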
| @dataclass |
| class UnslothRLOOConfig(RLOOConfig): |
| """ |
| |
| Configuration class for the [`RLOOTrainer`]. |
| |
| This class includes only the parameters that are specific to RLOO training. For a full list of training arguments, |
| please refer to the [`~transformers.TrainingArguments`] documentation. Note that default values in this class may |
| differ from those in [`~transformers.TrainingArguments`]. |
| |
| Using [`~transformers.HfArgumentParser`] we can turn this class into |
| [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the |
| command line. |
| |
| Parameters: |
| > Parameters that control the model and reference model |
| |
| model_init_kwargs (`str`, `dict[str, Any]`, *optional*): |
| Keyword arguments for [`~transformers.AutoModelForCausalLM.from_pretrained`], used when the `model` |
| argument of the [`RLOOTrainer`] is provided as a string. |
| disable_dropout (`bool`, *optional*, defaults to `False`): |
| Whether to disable dropout in the model. This is useful for training with a reference model, as it prevents |
| the model from generating different logprobs for the same input. |
| |
| > Parameters that control the data preprocessing |
| |
| remove_unused_columns (`bool`, *optional*, defaults to `False`): |
| Whether to only keep the column `"prompt"` in the dataset. If you use a custom reward function that |
| requires any column other than `"prompts"` and `"completions"`, you should keep this to `False`. |
        max_prompt_length (`int` or `None`, *optional*, defaults to `512`):
            Maximum length of the prompt. If the prompt is longer than this value, it will be truncated from the left.
        num_generations (`int` or `None`, *optional*, defaults to `8`):
            Number of generations per prompt to sample. The effective batch size (num_processes * per_device_batch_size
            * gradient_accumulation_steps) must be evenly divisible by this value.
| max_completion_length (`int` or `None`, *optional*, defaults to `256`): |
| Maximum length of the generated completion. |
| ds3_gather_for_generation (`bool`, *optional*, defaults to `True`): |
| This setting applies to DeepSpeed ZeRO-3. If enabled, the policy model weights are gathered for generation, |
| improving generation speed. However, disabling this option allows training models that exceed the VRAM |
| capacity of a single GPU, albeit at the cost of slower generation. Disabling this option is not compatible |
| with vLLM generation. |
| shuffle_dataset (`bool`, *optional*, defaults to `True`): |
| Whether to shuffle the training dataset. |
| |
| > Parameters that control generation |
| |
| generation_batch_size: (`int`, *optional*): |
| Batch size to use for generation. If `None`, it defaults to the effective training batch size: |
| `per_device_train_batch_size * num_processes * steps_per_generation`. In other words, there is one |
| generation batch processed per optimization step. Mutually exclusive with `steps_per_generation`. |
| steps_per_generation: (`int`, *optional*): |
| Number of steps per generation. If `None`, it defaults to `gradient_accumulation_steps`. Mutually exclusive |
| with `generation_batch_size`. |
| temperature (`float`, defaults to `1.0`): |
| Temperature for sampling. The higher the temperature, the more random the completions. |
| top_p (`float`, *optional*, defaults to `1.0`): |
| Float that controls the cumulative probability of the top tokens to consider. Must be in (0, 1]. Set to |
| `1.0` to consider all tokens. |
| top_k (`int`, *optional*): |
| Number of highest probability vocabulary tokens to keep for top-k-filtering. If `None`, top-k-filtering is |
| disabled and all tokens are considered. |
| min_p (`float`, *optional*): |
| Minimum token probability, which will be scaled by the probability of the most likely token. It must be a |
| value between `0.0` and `1.0`. Typical values are in the `0.01-0.2` range. |
| repetition_penalty (`float`, *optional*, defaults to `1.0`): |
| Float that penalizes new tokens based on whether they appear in the prompt and the generated text so far. |
| Values > `1.0` encourage the model to use new tokens, while values < `1.0` encourage the model to repeat |
| tokens. |
| use_transformers_paged (`bool`, *optional*, defaults to `False`): |
| Whether to use the `transformers` paged implementation for generation. If set to `True`, the `transformers` |
| paged implementation will be used for generation instead of the default padded implementation. This |
| parameter is only effective when `use_vllm` is set to `False`. |
| cache_implementation (`str`, *optional*): |
| Implementation of the cache method for faster generation when `use_vllm` is set to `False`. |
| generation_kwargs (`dict[str, Any]`, *optional*): |
| Additional keyword arguments to pass to [`~transformers.GenerationConfig`] (if using transformers) or |
| `SamplingParams` (if using vLLM) when sampling completions. This can be used to further customize the |
| generation behavior, such as setting `suppress_tokens`, `num_beams`, etc. If it contains keys that conflict |
| with the other generation parameters (like `min_p`, `top_p`, etc.), they will override them. |
| |
| > Parameters that control generation acceleration powered by vLLM |
| |
| use_vllm (`bool`, *optional*, defaults to `False`): |
| Whether to use vLLM for generating completions. If set to `True`, the trainer will use vLLM for generation |
| instead of the default model.generate(). Requires `vllm` to be installed. |
        vllm_mode (`str`, *optional*, defaults to `"colocate"`):
            Mode to use for vLLM integration when `use_vllm` is set to `True`. Must be one of `"server"` or
            `"colocate"`.
| |
| - `"server"`: The trainer will send generation requests to a separate vLLM server. Make sure a TRL vLLM |
| server is running (start with `trl vllm-serve`). |
| - `"colocate"`: vLLM will run in the same process and share the training GPUs. This avoids the need for a |
| separate server but may cause resource contention with training. |
| vllm_model_impl (`str`, *optional*, defaults to `"vllm"`): |
| Model implementation to use for vLLM. Must be one of `"transformers"` or `"vllm"`. `"transformers"`: Use |
| the `transformers` backend for model implementation. `"vllm"`: Use the `vllm` library for model |
| implementation. |
| vllm_guided_decoding_regex (`str`, *optional*): |
| Regex for vLLM guided decoding. If `None` (default), guided decoding is disabled. |
| |
| > Parameters that control the vLLM server (only used when `vllm_mode` is `"server"`) |
| |
| vllm_server_base_url (`str`, *optional*): |
| Base URL for the vLLM server (e.g., `"http://localhost:8000"`). If provided, `vllm_server_host` and |
| `vllm_server_port` are ignored. |
| vllm_server_host (`str`, *optional*, defaults to `"0.0.0.0"`): |
| Host of the vLLM server to connect to. Ignored if `vllm_server_base_url` is provided. |
| vllm_server_port (`int`, *optional*, defaults to `8000`): |
| Port of the vLLM server to connect to. Ignored if `vllm_server_base_url` is provided. |
| vllm_server_timeout (`float`, *optional*, defaults to `240.0`): |
| Total timeout duration in seconds to wait for the vLLM server to be up. If the server is not up after the |
| timeout, a `ConnectionError` is raised. |
| |
| > Parameters that control colocated vLLM execution (only used when `vllm_mode` is `"colocate"`) |
| |
| vllm_gpu_memory_utilization (`float`, *optional*, defaults to `0.3`): |
| Control the GPU memory utilization for vLLM. This setting only applies when `vllm_mode` is set to |
| `"colocate"`. If you are using `vllm_mode="server"`, this parameter must be passed separately when |
| launching the vLLM server via the `--vllm_gpu_memory_utilization` flag. |
| vllm_tensor_parallel_size (`int`, *optional*, defaults to `1`): |
| Control the tensor parallel size for vLLM. This setting only applies when `vllm_mode` is set to |
| `"colocate"`. If you are using `vllm_mode="server"`, this parameter must be passed separately when |
| launching the vLLM server via the `--vllm_tensor_parallel_size` flag. |
        vllm_enable_sleep_mode (`bool`, *optional*, defaults to `False`):
            Whether to enable sleep mode for vLLM. If `True`, vLLM sleeps during the optimization step and is woken
            up for weight sync and generation.
| |
| > Parameters that control the training |
| |
| beta (`float`, *optional*, defaults to `0.05`): |
| KL coefficient. If `0.0`, the reference model is not loaded, reducing memory usage and improving training |
| speed. |
| num_iterations (`int`, *optional*, defaults to `1`): |
| Number of iterations per batch (denoted as μ in the algorithm). |
| epsilon (`float`, *optional*, defaults to `0.2`): |
| Epsilon value for clipping. |
| epsilon_high (`float`, *optional*): |
| Upper-bound epsilon value for clipping. If not specified, it defaults to the same value as the lower-bound |
| specified in argument `epsilon`. Paper [DAPO](https://huggingface.co/papers/2503.14476) recommends `0.28`. |
| reward_weights (`list[float]`, *optional*): |
| Weights for each reward function. Must match the number of reward functions. If `None`, all rewards are |
| weighted equally with weight `1.0`. |
| normalize_advantages (`bool`, *optional*, defaults to `False`): |
| Whether to normalize advantages. Normalization is done per generation batch to have mean `0.0` and standard |
| deviation of `1.0`. |
| reward_clip_range (`tuple[float, float]`, *optional*): |
| Clip range for rewards as (min, max). If `None`, no clipping is applied. |
| mask_truncated_completions (`bool`, *optional*, defaults to `False`): |
| When enabled, truncated completions are excluded from the loss calculation, preventing them from being |
| incorrectly penalized and introducing noise during training. According to the |
| [DAPO](https://huggingface.co/papers/2503.14476) paper, this is a good practice for training stability. |
| sync_ref_model (`bool`, *optional*, defaults to `False`): |
| Whether to synchronize the reference model with the active model every `ref_model_sync_steps` steps, using |
| the `ref_model_mixup_alpha` parameter. This synchronization originates from the |
| [TR-DPO](https://huggingface.co/papers/2404.09656) paper. |
| ref_model_mixup_alpha (`float`, *optional*, defaults to `0.6`): |
| α parameter from the [TR-DPO](https://huggingface.co/papers/2404.09656) paper, which controls the mix |
| between the current policy and the previous reference policy during updates. The reference policy is |
| updated according to the equation: `π_ref = α * π_θ + (1 - α) * π_ref_prev`. To use this parameter, you |
| must set `sync_ref_model=True`. |
| ref_model_sync_steps (`int`, *optional*, defaults to `512`): |
| τ parameter from the [TR-DPO](https://huggingface.co/papers/2404.09656) paper, which determines how |
| frequently the current policy is synchronized with the reference policy. To use this parameter, you must |
| set `sync_ref_model=True`. |
| |
| > Parameters that control the logging |
| |
| log_completions (`bool`, *optional*, defaults to `False`): |
| Whether to log a sample of (prompt, completion) pairs every `logging_steps` steps. If `rich` is installed, |
| it prints the sample. If `wandb` logging is enabled, it logs it to `wandb`. |
| num_completions_to_print (`int`, *optional*): |
| Number of completions to print with `rich`. If `None`, all completions are logged. |
| wandb_log_unique_prompts (`bool`, *optional*, defaults to `False`): |
| Whether to log unique prompts in wandb. If `True`, only unique prompts are logged. If `False`, all prompts |
| are logged. |
| |
| > Deprecated parameters |
| |
| rloo_k: |
| |
| <Deprecated version="0.22.0"> |
| |
| This parameter is deprecated and will be removed in version 0.25.0. Use `num_generations` instead. |
| |
| </Deprecated> |
| |
| cliprange: |
| |
| <Deprecated version="0.22.0"> |
| |
| This parameter is deprecated and will be removed in version 0.25.0. Use `epsilon` instead. |
| |
| </Deprecated> |
| |
| kl_coef: |
| |
| <Deprecated version="0.22.0"> |
| |
| This parameter is deprecated and will be removed in version 0.25.0. Use `beta` instead. |
| |
| </Deprecated> |
| |
| exp_name: |
| |
| <Deprecated version="0.22.0"> |
| |
| This parameter is deprecated and will be removed in version 0.25.0. Use `run_name` instead. |
| |
| </Deprecated> |
| |
| normalize_reward: |
| |
| <Deprecated version="0.22.0"> |
| |
| This parameter is deprecated and will be removed in version 0.25.0. Use `normalize_advantages` instead. |
| |
| </Deprecated> |
| |
| num_ppo_epochs: |
| |
| <Deprecated version="0.22.0"> |
| |
| This parameter is deprecated and will be removed in version 0.25.0. Use `num_iterations` instead. |
| |
| </Deprecated> |
| |
| num_mini_batches: |
| |
| <Deprecated version="0.22.0"> |
| |
| This parameter is deprecated and will be removed in version 0.25.0. Use `steps_per_generation` instead. |
| |
| </Deprecated> |
| |
| total_episodes: |
| |
| <Deprecated version="0.22.0"> |
| |
| This parameter is deprecated and will be removed in version 0.25.0. Use `max_steps` instead. |
| |
| </Deprecated> |
| |
| response_length: |
| |
| <Deprecated version="0.22.0"> |
| |
| This parameter is deprecated and will be removed in version 0.25.0. Use `max_completion_length` instead. |
| |
| </Deprecated> |
| |
| token_level_kl: |
| |
| <Deprecated version="0.22.0"> |
| |
| This parameter is deprecated and will be removed in version 0.25.0. KL is now computed only at the sequence |
| level. |
| |
| </Deprecated> |
| |
| dataset_num_proc: |
| |
| <Deprecated version="0.22.0"> |
| |
| This parameter is deprecated and will be removed in version 0.25.0. This parameter was unused, you can |
| safely remove it from your scripts. |
| |
| </Deprecated> |
| |
| local_rollout_forward_batch_size: |
| |
| <Deprecated version="0.22.0"> |
| |
| This parameter is deprecated and will be removed in version 0.25.0. Now it is automatically set to |
| `per_device_train_batch_size` (or `per_device_eval_batch_size` during evaluation). |
| |
| </Deprecated> |
| |
| num_sample_generations: |
| |
| <Deprecated version="0.22.0"> |
| |
| This parameter is deprecated and will be removed in version 0.25.0. Use `logging_steps` to control |
| generation logging frequency. |
| |
| </Deprecated> |
| |
| stop_token: |
| |
| <Deprecated version="0.22.0"> |
| |
| This parameter is deprecated and will be removed in version 0.25.0. |
| |
| </Deprecated> |
| |
| stop_token_id: |
| |
| <Deprecated version="0.22.0"> |
| |
| This parameter is deprecated and will be removed in version 0.25.0. Use `processing_class.eos_token_id` |
| instead. |
| |
| </Deprecated> |
| |
| missing_eos_penalty: |
| |
| <Deprecated version="0.22.0"> |
| |
| This parameter is deprecated and will be removed in version 0.25.0. Replicate with a custom reward function |
| checking if `eos_token_id` is in `completion_ids`. |
| |
| </Deprecated> |
| |
| """ |
| vllm_sampling_params: Optional[Any] = field( |
| default = None, |
| metadata = {'help': 'vLLM SamplingParams'}, |
| ) |
| unsloth_num_chunks : Optional[int] = field( |
| default = -1, |
| metadata = {'help': 'Chunk size to reduce memory usage. -1 is most efficient.'}, |
| ) |
| unsloth_logit_chunk_multiplier : Optional[int] = field( |
| default = None, |
| metadata = {'help': 'Multiplier for chunked logit computations.'}, |
| ) |
| unsloth_grpo_mini_batch : Optional[int] = field( |
| default = None, |
| metadata = {'help': 'Mini batch size for GRPO hidden state accumulation. Default is None unless user defines it.'}, |
| ) |
| |
| def __init__( |
| self, |
| output_dir = None, |
| per_device_train_batch_size = 4, |
| num_train_epochs = 3.0, |
| max_steps = -1, |
| learning_rate = 5e-05, |
| lr_scheduler_type = 'linear', |
| lr_scheduler_kwargs = None, |
| warmup_steps = 0.1, |
| optim = 'adamw_8bit', |
| optim_args = None, |
| weight_decay = 0.01, |
| adam_beta1 = 0.9, |
| adam_beta2 = 0.999, |
| adam_epsilon = 1e-08, |
| optim_target_modules = None, |
| gradient_accumulation_steps = 2, |
| average_tokens_across_devices = True, |
| max_grad_norm = 1.0, |
| label_smoothing_factor = 0.0, |
| bf16 = False, |
| fp16 = False, |
| bf16_full_eval = False, |
| fp16_full_eval = False, |
| tf32 = None, |
| gradient_checkpointing = True, |
| gradient_checkpointing_kwargs = None, |
| torch_compile = False, |
| torch_compile_backend = None, |
| torch_compile_mode = None, |
| use_liger_kernel = False, |
| liger_kernel_config = None, |
| use_cache = False, |
| neftune_noise_alpha = None, |
| torch_empty_cache_steps = 250, |
| auto_find_batch_size = False, |
| logging_strategy = 'steps', |
| logging_steps = 1, |
| logging_first_step = False, |
| log_on_each_node = True, |
| logging_nan_inf_filter = False, |
| include_num_input_tokens_seen = False, |
| log_level = 'passive', |
| log_level_replica = 'warning', |
| disable_tqdm = None, |
| report_to = 'none', |
| run_name = None, |
| project = 'huggingface', |
| trackio_space_id = 'trackio', |
| eval_strategy = 'no', |
| eval_steps = None, |
| eval_delay = 0, |
| per_device_eval_batch_size = 4, |
| prediction_loss_only = False, |
| eval_on_start = False, |
| eval_do_concat_batches = True, |
| eval_use_gather_object = False, |
| eval_accumulation_steps = 2, |
| batch_eval_metrics = False, |
| save_only_model = False, |
| save_strategy = 'steps', |
| save_steps = 500, |
| save_on_each_node = False, |
| save_total_limit = None, |
| enable_jit_checkpoint = False, |
| push_to_hub = False, |
| hub_token = None, |
| hub_private_repo = None, |
| hub_model_id = None, |
| hub_strategy = 'every_save', |
| hub_always_push = False, |
| hub_revision = None, |
| load_best_model_at_end = False, |
| metric_for_best_model = None, |
| greater_is_better = None, |
| ignore_data_skip = False, |
| restore_callback_states_from_checkpoint = False, |
| full_determinism = False, |
| seed = 3407, |
| data_seed = 3407, |
| use_cpu = False, |
| accelerator_config = None, |
| parallelism_config = None, |
| dataloader_drop_last = False, |
| dataloader_num_workers = 0, |
| dataloader_pin_memory = True, |
| dataloader_persistent_workers = False, |
| dataloader_prefetch_factor = None, |
| remove_unused_columns = False, |
| label_names = None, |
| train_sampling_strategy = 'random', |
| length_column_name = 'length', |
| ddp_find_unused_parameters = None, |
| ddp_bucket_cap_mb = None, |
| ddp_broadcast_buffers = None, |
| ddp_backend = None, |
| ddp_timeout = 1800, |
| fsdp = None, |
| fsdp_config = None, |
| deepspeed = None, |
| debug = '', |
| skip_memory_metrics = True, |
| do_train = False, |
| do_eval = False, |
| do_predict = False, |
| resume_from_checkpoint = None, |
| warmup_ratio = None, |
| logging_dir = None, |
| local_rank = -1, |
| model_init_kwargs = None, |
| disable_dropout = False, |
| max_prompt_length = 512, |
| num_generations = 8, |
| max_completion_length = 256, |
| ds3_gather_for_generation = True, |
| shuffle_dataset = True, |
| generation_batch_size = None, |
| steps_per_generation = None, |
| temperature = 1.0, |
| top_p = 1.0, |
| top_k = None, |
| min_p = None, |
        generation_kwargs = None,
| repetition_penalty = 1.0, |
| use_transformers_paged = False, |
| cache_implementation = None, |
| use_vllm = False, |
| vllm_mode = 'colocate', |
| vllm_model_impl = 'vllm', |
| vllm_enable_sleep_mode = False, |
| vllm_guided_decoding_regex = None, |
| vllm_server_base_url = None, |
| vllm_server_host = '0.0.0.0', |
| vllm_server_port = 8000, |
| vllm_server_timeout = 240.0, |
| vllm_gpu_memory_utilization = 0.3, |
| vllm_tensor_parallel_size = 1, |
| beta = 0.05, |
| num_iterations = 1, |
| epsilon = 0.2, |
| epsilon_high = None, |
| reward_weights = None, |
| normalize_advantages = False, |
| reward_clip_range = None, |
| mask_truncated_completions = False, |
| sync_ref_model = False, |
| ref_model_mixup_alpha = 0.6, |
| ref_model_sync_steps = 512, |
| log_completions = False, |
| num_completions_to_print = None, |
| wandb_log_unique_prompts = False, |
| rloo_k = None, |
| cliprange = None, |
| kl_coef = None, |
| exp_name = None, |
| normalize_reward = None, |
| num_ppo_epochs = None, |
| num_mini_batches = None, |
| total_episodes = None, |
| response_length = None, |
| token_level_kl = None, |
| dataset_num_proc = None, |
| local_rollout_forward_batch_size = None, |
| num_sample_generations = None, |
| stop_token = None, |
| stop_token_id = None, |
| missing_eos_penalty = None, |
| vllm_sampling_params = None, |
| unsloth_num_chunks = -1, |
| unsloth_logit_chunk_multiplier = None, |
| unsloth_grpo_mini_batch = None, |
| |
| **kwargs, |
| ): |
        if learning_rate < 1e-7: print(f'Unsloth: Your learning rate of `{learning_rate}` is too small (less than 1e-7)! Consider increasing it, otherwise gradient updates will be close to 0!')
        if learning_rate > 1: print(f'Unsloth: Your learning rate of `{learning_rate}` is way too large (greater than 1)! Consider decreasing it to 1e-1 or below, otherwise gradient updates will explode!')
| if num_train_epochs is None: |
| num_train_epochs = 3.0 |
| if output_dir is None and save_strategy == 'steps' and save_steps == 500: |
| output_dir = 'unsloth_training_checkpoints' |
| save_strategy = 'no' |
| import multiprocessing as _mp |
| if _mp.get_start_method() != 'fork': |
| dataset_num_proc = None |
| elif dataset_num_proc is None: |
| import psutil |
| dataset_num_proc = min(max((psutil.cpu_count() or 1)+4, 2), 64) |
| memory_gb_left = psutil.virtual_memory().available / (1024**3) |
| if memory_gb_left <= 2: dataset_num_proc = 1 |
| else: dataset_num_proc = min(dataset_num_proc, int(memory_gb_left)) |
| if steps_per_generation is None and generation_batch_size is None: |
| ga = gradient_accumulation_steps |
| world_size = int(os.environ.get('WORLD_SIZE', '1')) |
| if (ga * world_size * per_device_train_batch_size) % num_generations != 0: |
| print('Unsloth: We now expect `per_device_train_batch_size` * `gradient_accumulation_steps` * `world_size` to be a multiple of `num_generations`.\nWe will change the batch size of ' + str(per_device_train_batch_size) + ' to the `num_generations` of ' + str(num_generations)) |
| per_device_train_batch_size = num_generations |
| |
        if temperature <= 0:
            raise ValueError('Unsloth: Please set a positive non-zero temperature since your results will be wrong.')
        elif temperature >= 10:
            raise ValueError('Unsloth: Please set a positive non-zero temperature less than 10, since sampling will be quite erratic.')
        # Avoid sharing one mutable default dict across config instances
        if generation_kwargs is None: generation_kwargs = {}
| |
| |
| super().__init__( |
| output_dir = output_dir, |
| per_device_train_batch_size = per_device_train_batch_size, |
| num_train_epochs = num_train_epochs, |
| max_steps = max_steps, |
| learning_rate = learning_rate, |
| lr_scheduler_type = lr_scheduler_type, |
| lr_scheduler_kwargs = lr_scheduler_kwargs, |
| warmup_steps = warmup_steps, |
| optim = optim, |
| optim_args = optim_args, |
| weight_decay = weight_decay, |
| adam_beta1 = adam_beta1, |
| adam_beta2 = adam_beta2, |
| adam_epsilon = adam_epsilon, |
| optim_target_modules = optim_target_modules, |
| gradient_accumulation_steps = gradient_accumulation_steps, |
| average_tokens_across_devices = average_tokens_across_devices, |
| max_grad_norm = max_grad_norm, |
| label_smoothing_factor = label_smoothing_factor, |
| bf16 = bf16, |
| fp16 = fp16, |
| bf16_full_eval = bf16_full_eval, |
| fp16_full_eval = fp16_full_eval, |
| tf32 = tf32, |
| gradient_checkpointing = gradient_checkpointing, |
| gradient_checkpointing_kwargs = gradient_checkpointing_kwargs, |
| torch_compile = torch_compile, |
| torch_compile_backend = torch_compile_backend, |
| torch_compile_mode = torch_compile_mode, |
| use_liger_kernel = use_liger_kernel, |
| liger_kernel_config = liger_kernel_config, |
| use_cache = use_cache, |
| neftune_noise_alpha = neftune_noise_alpha, |
| torch_empty_cache_steps = torch_empty_cache_steps, |
| auto_find_batch_size = auto_find_batch_size, |
| logging_strategy = logging_strategy, |
| logging_steps = logging_steps, |
| logging_first_step = logging_first_step, |
| log_on_each_node = log_on_each_node, |
| logging_nan_inf_filter = logging_nan_inf_filter, |
| include_num_input_tokens_seen = include_num_input_tokens_seen, |
| log_level = log_level, |
| log_level_replica = log_level_replica, |
| disable_tqdm = disable_tqdm, |
| report_to = report_to, |
| run_name = run_name, |
| project = project, |
| trackio_space_id = trackio_space_id, |
| eval_strategy = eval_strategy, |
| eval_steps = eval_steps, |
| eval_delay = eval_delay, |
| per_device_eval_batch_size = per_device_eval_batch_size, |
| prediction_loss_only = prediction_loss_only, |
| eval_on_start = eval_on_start, |
| eval_do_concat_batches = eval_do_concat_batches, |
| eval_use_gather_object = eval_use_gather_object, |
| eval_accumulation_steps = eval_accumulation_steps, |
| batch_eval_metrics = batch_eval_metrics, |
| save_only_model = save_only_model, |
| save_strategy = save_strategy, |
| save_steps = save_steps, |
| save_on_each_node = save_on_each_node, |
| save_total_limit = save_total_limit, |
| enable_jit_checkpoint = enable_jit_checkpoint, |
| push_to_hub = push_to_hub, |
| hub_token = hub_token, |
| hub_private_repo = hub_private_repo, |
| hub_model_id = hub_model_id, |
| hub_strategy = hub_strategy, |
| hub_always_push = hub_always_push, |
| hub_revision = hub_revision, |
| load_best_model_at_end = load_best_model_at_end, |
| metric_for_best_model = metric_for_best_model, |
| greater_is_better = greater_is_better, |
| ignore_data_skip = ignore_data_skip, |
| restore_callback_states_from_checkpoint = restore_callback_states_from_checkpoint, |
| full_determinism = full_determinism, |
| seed = seed, |
| data_seed = data_seed, |
| use_cpu = use_cpu, |
| accelerator_config = accelerator_config, |
| parallelism_config = parallelism_config, |
| dataloader_drop_last = dataloader_drop_last, |
| dataloader_num_workers = dataloader_num_workers, |
| dataloader_pin_memory = dataloader_pin_memory, |
| dataloader_persistent_workers = dataloader_persistent_workers, |
| dataloader_prefetch_factor = dataloader_prefetch_factor, |
| remove_unused_columns = remove_unused_columns, |
| label_names = label_names, |
| train_sampling_strategy = train_sampling_strategy, |
| length_column_name = length_column_name, |
| ddp_find_unused_parameters = ddp_find_unused_parameters, |
| ddp_bucket_cap_mb = ddp_bucket_cap_mb, |
| ddp_broadcast_buffers = ddp_broadcast_buffers, |
| ddp_backend = ddp_backend, |
| ddp_timeout = ddp_timeout, |
| fsdp = fsdp, |
| fsdp_config = fsdp_config, |
| deepspeed = deepspeed, |
| debug = debug, |
| skip_memory_metrics = skip_memory_metrics, |
| do_train = do_train, |
| do_eval = do_eval, |
| do_predict = do_predict, |
| resume_from_checkpoint = resume_from_checkpoint, |
| warmup_ratio = warmup_ratio, |
| logging_dir = logging_dir, |
| local_rank = local_rank, |
| model_init_kwargs = model_init_kwargs, |
| disable_dropout = disable_dropout, |
| max_prompt_length = max_prompt_length, |
| num_generations = num_generations, |
| max_completion_length = max_completion_length, |
| ds3_gather_for_generation = ds3_gather_for_generation, |
| shuffle_dataset = shuffle_dataset, |
| generation_batch_size = generation_batch_size, |
| steps_per_generation = steps_per_generation, |
| temperature = temperature, |
| top_p = top_p, |
| top_k = top_k, |
| min_p = min_p, |
| generation_kwargs = generation_kwargs, |
| repetition_penalty = repetition_penalty, |
| use_transformers_paged = use_transformers_paged, |
| cache_implementation = cache_implementation, |
| use_vllm = use_vllm, |
| vllm_mode = vllm_mode, |
| vllm_model_impl = vllm_model_impl, |
| vllm_enable_sleep_mode = vllm_enable_sleep_mode, |
| vllm_guided_decoding_regex = vllm_guided_decoding_regex, |
| vllm_server_base_url = vllm_server_base_url, |
| vllm_server_host = vllm_server_host, |
| vllm_server_port = vllm_server_port, |
| vllm_server_timeout = vllm_server_timeout, |
| vllm_gpu_memory_utilization = vllm_gpu_memory_utilization, |
| vllm_tensor_parallel_size = vllm_tensor_parallel_size, |
| beta = beta, |
| num_iterations = num_iterations, |
| epsilon = epsilon, |
| epsilon_high = epsilon_high, |
| reward_weights = reward_weights, |
| normalize_advantages = normalize_advantages, |
| reward_clip_range = reward_clip_range, |
| mask_truncated_completions = mask_truncated_completions, |
| sync_ref_model = sync_ref_model, |
| ref_model_mixup_alpha = ref_model_mixup_alpha, |
| ref_model_sync_steps = ref_model_sync_steps, |
| log_completions = log_completions, |
| num_completions_to_print = num_completions_to_print, |
| wandb_log_unique_prompts = wandb_log_unique_prompts, |
| rloo_k = rloo_k, |
| cliprange = cliprange, |
| kl_coef = kl_coef, |
| exp_name = exp_name, |
| normalize_reward = normalize_reward, |
| num_ppo_epochs = num_ppo_epochs, |
| num_mini_batches = num_mini_batches, |
| total_episodes = total_episodes, |
| response_length = response_length, |
| token_level_kl = token_level_kl, |
| dataset_num_proc = dataset_num_proc, |
| local_rollout_forward_batch_size = local_rollout_forward_batch_size, |
| num_sample_generations = num_sample_generations, |
| stop_token = stop_token, |
| stop_token_id = stop_token_id, |
| missing_eos_penalty = missing_eos_penalty,**kwargs) |
| self.vllm_sampling_params = vllm_sampling_params |
| self.unsloth_num_chunks = unsloth_num_chunks |
| if unsloth_grpo_mini_batch is not None: |
| if self.generation_batch_size >= unsloth_grpo_mini_batch: |
| self.unsloth_grpo_mini_batch = unsloth_grpo_mini_batch |
| else: |
                raise ValueError(
                    "Unsloth GRPO mini batch size must be less than or equal to the effective generation batch "
                    "size, which defaults to `per_device_train_batch_size` * `num_processes` * `steps_per_generation`."
                )
| self.unsloth_logit_chunk_multiplier = unsloth_logit_chunk_multiplier |
| |
|
|
| pass |
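# A minimal usage sketch (illustrative hyperparameters, not the defaults):
#
#     training_args = UnslothRLOOConfig(
#         output_dir = "outputs",
#         per_device_train_batch_size = 8,
#         gradient_accumulation_steps = 2,
#         num_generations = 8,
#         max_prompt_length = 512,
#         max_completion_length = 256,
#         learning_rate = 5e-6,
#         report_to = "none",
#     )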
|
|
| class _UnslothRLOOTrainer(BaseTrainer): |
| """""" |
|
|
| _tag_names = ["trl", "rloo"] |
| _name = "RLOO" |
| _paper = { |
| "title": "Back to Basics: Revisiting REINFORCE-Style Optimization for Learning from Human Feedback in LLMs", |
| "id": "2402.14740", |
| |
| "citation": textwrap.dedent("""\ |
| @inproceedings{ahmadian2024back, |
| title = {{Back to Basics: Revisiting REINFORCE-Style Optimization for Learning from Human Feedback in LLMs}}, |
            author = {Arash Ahmadian and Chris Cremer and Matthias Gall{\\'{e}} and Marzieh Fadaee and Julia Kreutzer and Olivier Pietquin and Ahmet {\\"{U}}st{\\"{u}}n and Sara Hooker},
| year = 2024, |
| booktitle = {Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), {ACL} 2024, Bangkok, Thailand, August 11-16, 2024}, |
| pages = {12248--12267}, |
| publisher = {Association for Computational Linguistics}, |
| editor = {Lun{-}Wei Ku and Andre Martins and Vivek Srikumar}, |
| }"""), |
| } |
|
|
| def __init__( |
| self, |
| |
| model: Union[str, PreTrainedModel] = None, |
| reward_funcs: Union[RewardFunc, list[RewardFunc]] = None, |
| args: Optional[RLOOConfig] = None, |
| train_dataset: Optional[Union[Dataset, IterableDataset]] = None, |
| eval_dataset: Optional[Union[Dataset, IterableDataset, dict[str, Union[Dataset, IterableDataset]]]] = None, |
| processing_class: Optional[Union[PreTrainedTokenizerBase, ProcessorMixin]] = None, |
| reward_processing_classes: Optional[Union[PreTrainedTokenizerBase, list[PreTrainedTokenizerBase]]] = None, |
| callbacks: Optional[list[TrainerCallback]] = None, |
| optimizers: tuple[Optional[torch.optim.Optimizer], Optional[torch.optim.lr_scheduler.LambdaLR]] = (None, None), |
| peft_config: Optional["PeftConfig"] = None, |
| |
| config=None, |
| reward_model=None, |
| policy=None, |
| ref_policy=None, |
| data_collator=None, |
| ): |
|
|
        if hasattr(model, 'vllm_engine') and hasattr(args, 'use_vllm'):
            if not getattr(args, 'use_vllm', False):
                args.use_vllm = True
| if not os.environ.get("TRL_EXPERIMENTAL_SILENCE"): |
| warnings.warn( |
| "This trainer will soon be moved to trl.experimental and is a candidate for removal. If you rely on " |
| "it and want it to remain, please share your comments here: " |
| "https://github.com/huggingface/trl/issues/4223. Silence this warning by setting environment variable " |
| "TRL_EXPERIMENTAL_SILENCE=1." |
| ) |
| |
| if config is not None: |
| warnings.warn( |
| "Parameter 'config' is deprecated and will be removed in version 0.25.0. Please use 'args' instead. " |
| "We are setting args=config" |
| ) |
| if args is None: |
| args = config |
| else: |
| raise ValueError("Cannot specify both 'config' (deprecated) and 'args'. Please use 'args' only.") |
|
|
| if reward_model is not None: |
| warnings.warn( |
| "Parameter 'reward_model' is deprecated and will be removed in version 0.25.0. Please use " |
| "'reward_funcs' instead. We are setting reward_funcs=reward_model" |
| ) |
| if reward_funcs is None: |
| reward_funcs = reward_model |
| else: |
| raise ValueError( |
| "Cannot specify both 'reward_model' (deprecated) and 'reward_funcs'. Please use 'reward_funcs' " |
| "only." |
| ) |
| if policy is not None: |
| warnings.warn( |
| "Parameter 'policy' is deprecated and will be removed in version 0.25.0. Please use 'model' instead. " |
| "We are setting model=policy" |
| ) |
| if model is None: |
| model = policy |
| else: |
| raise ValueError("Cannot specify both 'policy' (deprecated) and 'model'. Please use 'model' only.") |
| if ref_policy is not None: |
| warnings.warn( |
| "Parameter 'ref_policy' is deprecated and will be removed in version 0.25.0. To use the initial model " |
| "as the reference model, simply omit this parameter. The parameter is ignored." |
| ) |
| if data_collator is not None: |
| warnings.warn( |
| "Parameter 'data_collator' is deprecated and will be removed in version 0.25.0. The RLOOTrainer does " |
| "not use a data collator, so this parameter is ignored." |
| ) |
| if "input_ids" in train_dataset.column_names: |
| warnings.warn( |
| "The training dataset contains a column named 'input_ids', indicating that it is pre-tokenized. " |
| "Support for pre-tokenized datasets is deprecated and will be removed in version 0.25. Please provide " |
| "the raw dataset (conversational or standard) with a 'prompt' column instead." |
| ) |
|
|
| def decode(example, tokenizer): |
| return {"prompt": tokenizer.decode(example["input_ids"])} |
|
|
| train_dataset = train_dataset.map(decode, fn_kwargs={"tokenizer": processing_class}) |
| if eval_dataset is not None and "input_ids" in eval_dataset.column_names: |
| warnings.warn( |
| "The evaluation dataset contains a column named 'input_ids', indicating that it is pre-tokenized. " |
| "Support for pre-tokenized datasets is deprecated and will be removed in version 0.25. Please provide " |
| "the raw dataset (conversational or standard) with a 'prompt' column instead." |
| ) |
|
|
| def decode(example, tokenizer): |
| return {"prompt": tokenizer.decode(example["input_ids"])} |
|
|
| eval_dataset = eval_dataset.map(decode, fn_kwargs={"tokenizer": processing_class}) |
|
|
| |
| if args is None: |
| model_name = model if isinstance(model, str) else model.config._name_or_path |
| model_name = model_name.split("/")[-1] |
| args = RLOOConfig(f"{model_name}-RLOO") |
|
|
| |
| |
| model_init_kwargs = args.model_init_kwargs or {} |
| if isinstance(model, str): |
| model_id = model |
| dtype = model_init_kwargs.get("dtype") |
| if isinstance(dtype, torch.dtype) or dtype == "auto" or dtype is None: |
| pass |
| elif isinstance(dtype, str): |
| dtype = getattr(torch, dtype) |
| model_init_kwargs["dtype"] = dtype |
| else: |
| raise ValueError( |
| "Invalid `dtype` passed to `RLOOConfig`. Expected either 'auto' or a string representing " |
| f"a `torch.dtype` (e.g., 'float32'), but got {dtype}." |
| ) |
| |
| config = AutoConfig.from_pretrained(model_id) |
| architecture = getattr(transformers, config.architectures[0]) |
| model = architecture.from_pretrained(model_id, **model_init_kwargs) |
| else: |
| model_id = model.config._name_or_path |
| if args.model_init_kwargs is not None: |
| logger.warning( |
| "You passed `model_init_kwargs` to the `RLOOConfig`, but your model is already instantiated. " |
| "The `model_init_kwargs` will be ignored." |
| ) |
|
|
| |
| |
| self.model_kwarg_keys = ( |
| inspect.signature(model.forward).parameters.keys() |
| if not hasattr(model, "get_base_model") |
| else inspect.signature(model.get_base_model().forward).parameters.keys() |
| ) |
|
|
|
|
| |
| if processing_class is None: |
| processing_class = AutoProcessor.from_pretrained(model.config._name_or_path, truncation_side="left") |
|
|
| |
| if isinstance(processing_class, ProcessorMixin): |
| tokenizer = processing_class.tokenizer |
| elif isinstance(processing_class, PreTrainedTokenizerBase): |
| tokenizer = processing_class |
| else: |
| raise TypeError("The `processing_class` must be either a `PreTrainedTokenizerBase` or a `ProcessorMixin`") |
|
|
| if tokenizer.pad_token is None: |
| tokenizer.pad_token = tokenizer.eos_token |
|
|
| self.pad_token = tokenizer.pad_token |
| self.pad_token_id = tokenizer.pad_token_id |
| self.eos_token_id = tokenizer.eos_token_id |
|
|
| |
| if not isinstance(reward_funcs, list): |
| reward_funcs = [reward_funcs] |
| self.reward_func_names = [] |
| for i, reward_func in enumerate(reward_funcs): |
| if isinstance(reward_func, str): |
| reward_funcs[i] = AutoModelForSequenceClassification.from_pretrained( |
| reward_func, num_labels=1, **model_init_kwargs |
| ) |
| if isinstance(reward_funcs[i], nn.Module): |
| self.reward_func_names.append(reward_funcs[i].config._name_or_path.split("/")[-1]) |
| else: |
| self.reward_func_names.append(reward_funcs[i].__name__) |
| self.reward_funcs = reward_funcs |
|
|
| |
| if args.reward_weights is not None: |
| if len(args.reward_weights) != len(reward_funcs): |
| raise ValueError( |
| f"Number of reward weights ({len(args.reward_weights)}) must match number of reward " |
| f"functions ({len(reward_funcs)})" |
| ) |
| self.reward_weights = torch.tensor(args.reward_weights, dtype=torch.float32) |
| else: |
| self.reward_weights = torch.ones(len(reward_funcs), dtype=torch.float32) |
|
|
| |
| if reward_processing_classes is None: |
| reward_processing_classes = [None] * len(reward_funcs) |
| elif not isinstance(reward_processing_classes, list): |
| reward_processing_classes = [reward_processing_classes] |
| if len(reward_processing_classes) != len(reward_funcs): |
| raise ValueError( |
| f"The number of reward processing classes ({len(reward_processing_classes)}) must match the number of " |
| f"reward functions ({len(reward_funcs)})." |
| ) |
|
|
| for i, (reward_processing_class, reward_func) in enumerate(zip(reward_processing_classes, reward_funcs)): |
| if isinstance(reward_func, PreTrainedModel): |
| if reward_processing_class is None: |
| reward_processing_class = AutoTokenizer.from_pretrained(reward_func.config._name_or_path) |
| if reward_processing_class.pad_token_id is None: |
| reward_processing_class.pad_token = reward_processing_class.eos_token |
| |
| |
| reward_func.config.pad_token_id = reward_processing_class.pad_token_id |
| reward_processing_classes[i] = reward_processing_class |
|
|
| self.reward_processing_classes = reward_processing_classes |
|
|
| |
| self.max_prompt_length = args.max_prompt_length |
| self.max_completion_length = args.max_completion_length |
| self.num_generations = args.num_generations |
| self.temperature = args.temperature |
| self.top_p = args.top_p |
| self.top_k = args.top_k |
| self.min_p = args.min_p |
| self.repetition_penalty = args.repetition_penalty |
| self.use_transformers_paged = args.use_transformers_paged |
| self.use_vllm = args.use_vllm |
| self.vllm_mode = args.vllm_mode |
| self.vllm_gpu_memory_utilization = args.vllm_gpu_memory_utilization |
| self.vllm_tensor_parallel_size = args.vllm_tensor_parallel_size |
| self.normalize_advantages = args.normalize_advantages |
| self.mask_truncated_completions = args.mask_truncated_completions |
| self.reward_clip_range = args.reward_clip_range |
|
|
| |
| self.shuffle_dataset = args.shuffle_dataset |
|
|
| if ( |
| isinstance(train_dataset, IterableDataset) |
| or isinstance(eval_dataset, IterableDataset) |
| or ( |
| isinstance(eval_dataset, dict) and any(isinstance(ds, IterableDataset) for ds in eval_dataset.values()) |
| ) |
| ): |
| |
| raise NotImplementedError( |
| "Iterable datasets are not yet supported in RLOOTrainer. Please use a standard dataset instead." |
| ) |
|
|
| |
| self.num_iterations = args.num_iterations |
| self.epsilon_low = args.epsilon |
| self.epsilon_high = args.epsilon_high if args.epsilon_high is not None else args.epsilon |
| |
| self._step = 0 |
| |
| |
| self._buffered_inputs = None |
|
|
| |
| |
| |
| |
| |
| |
| model.warnings_issued["estimate_tokens"] = True |
|
|
| super().__init__( |
| model=model, |
| args=args, |
| data_collator=identity, |
| train_dataset=train_dataset, |
| eval_dataset=eval_dataset, |
| processing_class=processing_class, |
| callbacks=callbacks, |
| optimizers=optimizers, |
| ) |
|
|
| |
| self.beta = args.beta |
| if self.beta == 0.0: |
| |
| self.ref_model = None |
| elif is_peft_model(model): |
| |
| |
| self.ref_model = None |
| else: |
| |
| config = AutoConfig.from_pretrained(model_id) |
| architecture = getattr(transformers, config.architectures[0]) |
| self.ref_model = architecture.from_pretrained(model_id, **model_init_kwargs) |
|
|
| |
| if args.disable_dropout: |
| disable_dropout_in_model(model) |
| if self.ref_model is not None: |
| disable_dropout_in_model(self.ref_model) |
|
|
| |
| self._metrics = {"train": defaultdict(list), "eval": defaultdict(list)} |
| self._total_train_tokens = 0 |
| self.log_completions = args.log_completions |
| self.wandb_log_unique_prompts = args.wandb_log_unique_prompts |
| self.num_completions_to_print = args.num_completions_to_print |
| |
| self._logs = { |
| "images": deque(maxlen=args.generation_batch_size), |
| "prompt": deque(maxlen=args.generation_batch_size), |
| "completion": deque(maxlen=args.generation_batch_size), |
| "rewards": defaultdict(lambda: deque(maxlen=args.generation_batch_size)), |
| "advantages": deque(maxlen=args.generation_batch_size), |
| } |
|
|
| |
| |
| |
| set_seed(args.seed, device_specific=True) |
|
|
| if self.use_vllm: |
| if not is_vllm_available(): |
| raise ImportError( |
| "vLLM is not available and `use_vllm` is set to True. Please install vLLM with " |
| "`pip install trl[vllm]` to use it." |
| ) |
|
|
| if self.vllm_mode == "server": |
| if self.accelerator.is_main_process: |
| if args.vllm_server_base_url is not None: |
| base_url = args.vllm_server_base_url |
| else: |
| base_url = f"http://{args.vllm_server_host}:{args.vllm_server_port}" |
| self.vllm_client = VLLMClient(base_url=base_url, connection_timeout=args.vllm_server_timeout) |
| self.vllm_client.init_communicator(device=torch.cuda.current_device()) |
|
|
| elif self.vllm_mode == "colocate": |
            if self.accelerator.num_processes % self.vllm_tensor_parallel_size != 0:
| raise ValueError( |
| f"vllm_tensor_parallel_size ({self.vllm_tensor_parallel_size}) must divide world size " |
| f"({self.accelerator.num_processes}) evenly." |
| ) |
|
|
| if self.vllm_tensor_parallel_size > 1: |
| self.tp_group, _ = torch.distributed.new_subgroups_by_enumeration( |
| [ |
| list(range(i * self.vllm_tensor_parallel_size, (i + 1) * self.vllm_tensor_parallel_size)) |
| for i in range(self.accelerator.num_processes // self.vllm_tensor_parallel_size) |
| ] |
| ) |
| os.environ["RANK"] = str(self.accelerator.process_index) |
| os.environ["LOCAL_RANK"] = str(self.accelerator.local_process_index) |
| os.environ["WORLD_SIZE"] = str(self.accelerator.num_processes) |
| ensure_master_addr_port() |
|
|
| if self.max_prompt_length is not None and self.max_completion_length is not None: |
| max_model_len = self.max_prompt_length + self.max_completion_length |
| else: |
| max_model_len = None |
| self.llm = model.vllm_engine |
| if self.args.vllm_enable_sleep_mode: |
| self.llm.sleep(level=1) |
| else: |
| raise ValueError(f"vllm_mode must be either 'server' or 'colocate', got '{self.vllm_mode}'.") |
| self.guided_decoding_regex = args.vllm_guided_decoding_regex |
|
|
| self._last_loaded_step = -1 |
| self.accelerator.wait_for_everyone() |
| else: |
| generation_kwargs = { |
| "max_new_tokens": self.max_completion_length, |
| "do_sample": True, |
| "pad_token_id": tokenizer.pad_token_id, |
| "bos_token_id": tokenizer.bos_token_id, |
| "eos_token_id": tokenizer.eos_token_id, |
| "temperature": self.temperature, |
| "top_p": self.top_p, |
| "top_k": self.top_k, |
| "min_p": self.min_p, |
| "repetition_penalty": self.repetition_penalty, |
| "cache_implementation": args.cache_implementation, |
| } |
| if args.generation_kwargs is not None: |
| generation_kwargs.update(args.generation_kwargs) |
| self.generation_config = GenerationConfig(**generation_kwargs) |
|
|
| |
| |
| |
| self.model_accepts_loss_kwargs = False |
|
|
| |
| self.model.add_model_tags(self._tag_names) |
|
|
| if self.ref_model is not None: |
| if self.is_deepspeed_enabled: |
| self.ref_model = prepare_deepspeed(self.ref_model, self.accelerator) |
| elif self.is_fsdp_enabled: |
| self.ref_model = prepare_fsdp(self.ref_model, self.accelerator) |
| else: |
| self.ref_model = self.accelerator.prepare_model(self.ref_model, evaluation_mode=True) |
|
|
| if args.sync_ref_model: |
| self.add_callback(SyncRefModelCallback(ref_model=self.ref_model, accelerator=self.accelerator)) |
|
|
| for i, reward_func in enumerate(self.reward_funcs): |
| if isinstance(reward_func, PreTrainedModel): |
| if self.is_deepspeed_enabled: |
| self.reward_funcs[i] = prepare_deepspeed(reward_func, self.accelerator) |
| else: |
| |
| self.reward_funcs[i] = self.accelerator.prepare_model( |
| reward_func, evaluation_mode=True, device_placement=True |
| ) |
|
|
| def _set_signature_columns_if_needed(self): |
| |
| |
| |
| |
| if self._signature_columns is None: |
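            # The model-signature inspection used by the base Trainer does not apply
            # here: the training step consumes raw prompts (and images), not tokenized
            # model inputs.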
| self._signature_columns = ["prompt", "image", "images"] |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| def get_train_dataloader(self): |
| if self.train_dataset is None: |
| raise ValueError("Trainer: training requires a train_dataset.") |
|
|
| train_dataset = self.train_dataset |
| data_collator = self.data_collator |
| if is_datasets_available() and isinstance(train_dataset, datasets.Dataset): |
| train_dataset = self._remove_unused_columns(train_dataset, description="training") |
| else: |
| data_collator = self._get_collator_with_removed_columns(data_collator, description="training") |
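        # One dataloader batch covers a full generation cycle:
        # per-device train batch size x steps_per_generation.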
|
|
| dataloader_params = { |
| "batch_size": self._train_batch_size * self.args.steps_per_generation, |
| "collate_fn": data_collator, |
| "num_workers": self.args.dataloader_num_workers, |
| "pin_memory": self.args.dataloader_pin_memory, |
| "persistent_workers": self.args.dataloader_persistent_workers, |
| } |
|
|
| if not isinstance(train_dataset, torch.utils.data.IterableDataset): |
| dataloader_params["sampler"] = self._get_train_sampler() |
| dataloader_params["drop_last"] = self.args.dataloader_drop_last |
| dataloader_params["worker_init_fn"] = partial( |
| seed_worker, num_workers=self.args.dataloader_num_workers, rank=self.args.process_index |
| ) |
|
|
| dataloader_params["prefetch_factor"] = self.args.dataloader_prefetch_factor |
|
|
| return self.accelerator.prepare(DataLoader(train_dataset, **dataloader_params)) |
|
|
| def _get_train_sampler(self, dataset: Optional[Dataset] = None) -> Sampler: |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| if dataset is None: |
| dataset = self.train_dataset |
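        # Per TRL's RepeatSampler: each sampled index is emitted `mini_repeat_count`
        # times in a row (one per completion), indices are drawn in chunks of
        # `batch_size` unique prompts, and every chunk is re-emitted `repeat_count`
        # times so buffered generations can be replayed across optimization steps.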
| return RepeatSampler( |
| data_source=dataset, |
| mini_repeat_count=self.num_generations, |
| batch_size=self.args.generation_batch_size // self.num_generations, |
| repeat_count=self.num_iterations * self.args.steps_per_generation, |
| shuffle=self.shuffle_dataset, |
| seed=self.args.seed, |
| ) |
|
|
| def _get_eval_sampler(self, eval_dataset) -> Sampler: |
| |
| return RepeatSampler( |
| data_source=eval_dataset, |
| mini_repeat_count=self.num_generations, |
| seed=self.args.seed, |
| ) |
|
|
| @profiling_decorator |
| def _get_per_token_logps_and_entropies( |
| self, |
| model, |
| input_ids, |
| attention_mask, |
| logits_to_keep, |
| batch_size=None, |
| compute_entropy=False, |
| pixel_values=None, |
| image_grid_thw=None, |
| num_images=None, |
| pixel_attention_mask=None, |
| image_sizes=None, |
| token_type_ids=None, |
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
| """Compute log-probs and (optionally) entropies for each token.""" |
| batch_size = batch_size or input_ids.size(0) |
| all_logps = [] |
| all_entropies = [] |
| for start in range(0, input_ids.size(0), batch_size): |
| input_ids_batch = input_ids[start : start + batch_size] |
| attention_mask_batch = attention_mask[start : start + batch_size] |
|
|
| |
| model_inputs = {"input_ids": input_ids_batch, "attention_mask": attention_mask_batch} |
|
|
| if image_grid_thw is not None and pixel_values is not None: |
| rows_per_image = image_grid_thw.prod(dim=-1) |
| rows_per_sample = torch.split(rows_per_image, num_images) |
| rows_per_sample = torch.stack([s.sum() for s in rows_per_sample]) |
| cum_rows = torch.cat([torch.tensor([0], device=rows_per_sample.device), rows_per_sample.cumsum(0)]) |
| row_start, row_end = cum_rows[start].item(), cum_rows[start + batch_size].item() |
| model_inputs["pixel_values"] = pixel_values[row_start:row_end] |
| cum_imgs = torch.tensor([0] + num_images).cumsum(0) |
| img_start, img_end = cum_imgs[start], cum_imgs[start + batch_size] |
| model_inputs["image_grid_thw"] = image_grid_thw[img_start:img_end] |
| elif pixel_values is not None: |
| model_inputs["pixel_values"] = pixel_values[start : start + batch_size] |
| if pixel_attention_mask is not None: |
| model_inputs["pixel_attention_mask"] = pixel_attention_mask[start : start + batch_size] |
| if image_sizes is not None: |
| model_inputs["image_sizes"] = image_sizes[start : start + batch_size] |
| if token_type_ids is not None: |
| model_inputs["token_type_ids"] = token_type_ids[start : start + batch_size] |
|
|
| |
| if "logits_to_keep" in self.model_kwarg_keys: |
| |
| model_inputs["logits_to_keep"] = logits_to_keep + 1 |
|
|
| model_inputs["use_cache"] = False |
|
|
| logits = model(**model_inputs).logits |
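            # The logit at position t predicts token t + 1, so drop the final position
            # (it has no target) and keep only the last `logits_to_keep` positions,
            # which line up with the completion tokens.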
| |
| logits = logits[:, :-1, :] |
| |
| logits = logits[:, -logits_to_keep:, :] |
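            # Scale by the sampling temperature so the log-probs below describe the same
            # distribution the completions were actually sampled from.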
| |
| |
| logits = logits / self.temperature |
|
|
| completion_ids = input_ids_batch[:, -logits_to_keep:] |
| logps = selective_log_softmax(logits, completion_ids) |
| all_logps.append(logps) |
|
|
| if compute_entropy: |
| with torch.no_grad(): |
| entropies = entropy_from_logits(logits) |
| all_entropies.append(entropies) |
|
|
| logps = torch.cat(all_logps, dim=0) |
| entropies = torch.cat(all_entropies, dim=0) if compute_entropy else None |
| return logps, entropies |
|
|
| def _fix_param_name_to_vllm(self, name, extra_prefixes: Optional[list[str]] = None): |
| extra_prefixes = extra_prefixes or [] |
| prefixes = ["_checkpoint_wrapped_module."] + extra_prefixes |
| for prefix in prefixes: |
| name = name.replace(prefix, "") |
| return name |
|
|
| def _sync_fsdp1_params_to_vllm(self, module: nn.Module, prefix: str = "", visited=None): |
| """Memory-efficient post-order traversal of FSDP modules to extract full parameters and sync with vLLM.""" |
| |
| if visited is None: |
| visited = set() |
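        # Recurse into children first (post-order) so inner FSDP wrappers are summoned
        # and released before their parents, keeping peak memory low.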
| for child_name, child_module in module.named_children(): |
| child_prefix = f"{prefix}.{child_name}" if prefix else child_name |
| self._sync_fsdp1_params_to_vllm( |
| child_module, prefix=child_prefix, visited=visited |
| ) |
|
|
| if isinstance(module, FSDP): |
| with FSDP.summon_full_params(module, recurse=False, writeback=False): |
| for param_name, param in module.named_parameters(): |
| full_name = f"{prefix}.{param_name}" if prefix else param_name |
| full_name = self._fix_param_name_to_vllm(full_name, extra_prefixes=["_fsdp_wrapped_module."]) |
|
|
| if full_name in visited: |
| continue |
| visited.add(full_name) |
|
|
| if self.vllm_mode == "server" and self.accelerator.is_main_process: |
| self.vllm_client.update_named_param(full_name, param.data) |
| elif self.vllm_mode == "colocate": |
|
|
| pass |
|
|
| pass |
|
|
| def _sync_fsdp2_params_to_vllm(self, module: nn.Module): |
| |
        for name, param in module.state_dict().items():
| if param.is_cpu: |
| param = param.to(torch.device("cuda")) |
| param = param.full_tensor() |
|
|
| if self.vllm_mode == "server" and self.accelerator.is_main_process: |
| self.vllm_client.update_named_param(name, param) |
| elif self.vllm_mode == "colocate": |
|
|
| pass |
|
|
| pass |
|
|
| @profiling_decorator |
| def _move_model_to_vllm(self): |
| |
| deepspeed_plugin = self.accelerator.state.deepspeed_plugin |
| zero_stage_3 = deepspeed_plugin is not None and deepspeed_plugin.zero_stage == 3 |
| if zero_stage_3: |
| import deepspeed |
|
|
| gather_if_zero3 = deepspeed.zero.GatheredParameters |
| else: |
| gather_if_zero3 = nullcontext |
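        # Under ZeRO-3 each rank holds only parameter shards, so full weights must be
        # materialized (deepspeed.zero.GatheredParameters) before copying them to vLLM.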
|
|
| if is_peft_model(self.model): |
| |
| |
| |
| with gather_if_zero3(list(self.model.parameters())): |
| self.model.merge_adapter() |
|
|
| |
| if self.is_fsdp_enabled: |
| |
| |
| fsdp_plugin = getattr(self.accelerator.state, "fsdp_plugin", None) |
| fsdp_version = getattr(fsdp_plugin, "fsdp_version", 1) if fsdp_plugin else 1 |
| if fsdp_version == 1: |
| self._sync_fsdp1_params_to_vllm( |
| self.model |
| ) |
| elif fsdp_version == 2: |
| self._sync_fsdp2_params_to_vllm(self.model) |
| else: |
| |
| for name, param in self.model.named_parameters(): |
| |
| name = name.removeprefix("base_model.model.").replace(".base_layer", "") |
| if self.model.prefix in name: |
| continue |
| |
| if "original_module" in name: |
| continue |
| name = self._fix_param_name_to_vllm(name, extra_prefixes=["modules_to_save.default."]) |
|
|
| if self.vllm_mode == "server" and self.accelerator.is_main_process: |
| self.vllm_client.update_named_param(name, param.data) |
| elif self.vllm_mode == "colocate": |
|
|
| pass |
|
|
| pass |
| |
| self.model.unmerge_adapter() |
| |
| else: |
| |
| if self.is_fsdp_enabled: |
| fsdp_plugin = getattr(self.accelerator.state, "fsdp_plugin", None) |
| fsdp_version = getattr(fsdp_plugin, "fsdp_version", 1) if fsdp_plugin else 1 |
| if fsdp_version == 1: |
| self._sync_fsdp1_params_to_vllm(self.model) |
| elif fsdp_version == 2: |
| self._sync_fsdp2_params_to_vllm(self.model) |
| else: |
| for name, param in self.model.named_parameters(): |
| name = self._fix_param_name_to_vllm(name) |
| with gather_if_zero3([param]): |
| if self.vllm_mode == "server" and self.accelerator.is_main_process: |
| self.vllm_client.update_named_param(name, param.data) |
| elif self.vllm_mode == "colocate": |
|
|
| pass |
|
|
| pass |
|
|
| |
| if self.vllm_mode == "server" and self.accelerator.is_main_process: |
| self.vllm_client.reset_prefix_cache() |
| elif self.vllm_mode == "colocate": |
| self.llm.reset_prefix_cache() |
|
|
| @profiling_decorator |
| def _prepare_inputs( |
| self, generation_batch: dict[str, Union[torch.Tensor, Any]] |
| ) -> dict[str, Union[torch.Tensor, Any]]: |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| mode = "train" if self.model.training else "eval" |
| if mode == "train": |
| generate_every = self.args.steps_per_generation * self.num_iterations |
| if self._step % generate_every == 0 or self._buffered_inputs is None: |
| |
| generation_batch = self._generate_and_score_completions(generation_batch) |
| generation_batch = split_pixel_values_by_grid(generation_batch) |
|
|
                try:
                    generation_batch = shuffle_sequence_dict(generation_batch)
                except Exception:
                    pass
| generation_batches = split_tensor_dict(generation_batch, self.args.steps_per_generation) |
| self._buffered_inputs = [unsplit_pixel_values_by_grid(batch) for batch in generation_batches] |
| inputs = self._buffered_inputs[self._step % self.args.steps_per_generation] |
| self._step += 1 |
| else: |
| |
| |
| inputs = self._generate_and_score_completions(generation_batch) |
| return inputs |
|
|
| @profiling_decorator |
| def _calculate_rewards(self, inputs, prompts, completions, completion_ids_list): |
| device = self.accelerator.device |
| rewards_per_func = torch.zeros(len(prompts), len(self.reward_funcs), device=device) |
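        # Every extra dataset column (anything besides prompt/completion/completion_ids)
        # is forwarded to the reward functions as keyword arguments.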
|
|
| |
| keys = [key for key in inputs[0] if key not in ["prompt", "completion", "completion_ids"]] |
| reward_kwargs = {key: [example[key] for example in inputs] for key in keys} |
|
|
| |
| reward_kwargs["trainer_state"] = self.state |
|
|
| for i, (reward_func, reward_processing_class, reward_func_name) in enumerate( |
| zip(self.reward_funcs, self.reward_processing_classes, self.reward_func_names) |
| ): |
| with profiling_context(self, reward_func_name): |
| if isinstance(reward_func, nn.Module): |
| if is_conversational(inputs[0]): |
| messages = [{"messages": p + c} for p, c in zip(prompts, completions)] |
| texts = [apply_chat_template(x, reward_processing_class)["text"] for x in messages] |
| else: |
| texts = [p + c for p, c in zip(prompts, completions)] |
| reward_inputs = reward_processing_class( |
| text=texts, return_tensors="pt", padding=True, padding_side="right", add_special_tokens=False |
| ) |
| reward_inputs = super()._prepare_inputs(reward_inputs) |
| with torch.inference_mode(): |
| rewards_per_func[:, i] = reward_func(**reward_inputs).logits[:, 0] |
| else: |
| output_reward_func = reward_func( |
| prompts=prompts, completions=completions, completion_ids=completion_ids_list, **reward_kwargs |
| ) |
| |
| output_reward_func = [reward if reward is not None else torch.nan for reward in output_reward_func] |
|
|
| rewards_per_func[:, i] = torch.tensor(output_reward_func, dtype=torch.float32, device=device) |
|
|
| |
| if torch.isnan(rewards_per_func).all(dim=1).any(): |
| nan_row_idx = torch.isnan(rewards_per_func).all(dim=1).nonzero(as_tuple=True)[0][0] |
| row_reward_kwargs = { |
| key: value[nan_row_idx] for key, value in reward_kwargs.items() if key != "trainer_state" |
| } |
| row_reward_kwargs["prompt"] = prompts[nan_row_idx] |
| row_reward_kwargs["completion"] = completions[nan_row_idx] |
| logger.warning( |
| f"All reward functions returned None for the following kwargs:\n{row_reward_kwargs}\n" |
| "Please ensure that at least one reward function returns a valid reward." |
| ) |
|
|
| |
| |
| rewards_per_func = gather(rewards_per_func) |
| return rewards_per_func |
|
|
| def _generate_single_turn(self, prompts: list[str], images: Optional[list]): |
| device = self.accelerator.device |
|
|
| |
| |
| |
| kwargs = {} |
| if images is not None: |
| kwargs = {"images": images} |
| for prompt, image_list in zip(prompts, images): |
| if isinstance(prompt, list): |
| prepare_multimodal_messages(prompt, num_images=len(image_list)) |
|
|
| prompts_text = [ |
| maybe_apply_chat_template({"prompt": prompt}, self.processing_class)["prompt"] for prompt in prompts |
| ] |
|
|
| if images is not None: |
| prompt_inputs = self.processing_class(text=prompts_text, padding=True, return_tensors="pt", **kwargs) |
| prompt_inputs = super()._prepare_inputs(prompt_inputs) |
| forward_kwargs = {k: v for k, v in prompt_inputs.items() if k not in ["input_ids", "attention_mask"]} |
| else: |
| forward_kwargs = {} |
|
|
| |
| if self.use_vllm: |
| if self.vllm_mode == "colocate" and self.args.vllm_enable_sleep_mode: |
| |
| torch.cuda.empty_cache() |
| self.llm.wake_up() |
|
|
| |
| if self.state.global_step != self._last_loaded_step: |
| self._move_model_to_vllm() |
| self._last_loaded_step = self.state.global_step |
|
|
| |
| if self.vllm_mode == "server": |
| all_prompts_text = gather_object(prompts_text) |
| if images is not None: |
| all_images = gather_object(images) |
|
|
| if self.accelerator.is_main_process: |
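                    # Prompts are duplicated `num_generations` times in the gathered batch
                    # (one copy per completion), so send each distinct prompt once and
                    # request n = num_generations completions from the server.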
| |
| |
| |
| ordered_set_of_prompts = all_prompts_text[:: self.num_generations] |
|
|
| if images is not None: |
| ordered_set_of_images = all_images[:: self.num_generations] |
| else: |
| ordered_set_of_images = None |
|
|
| with profiling_context(self, "vLLM.generate"): |
| output = self.vllm_client.generate( |
| prompts=ordered_set_of_prompts, |
| images=ordered_set_of_images, |
| n=self.num_generations, |
| repetition_penalty=self.repetition_penalty, |
| temperature=self.temperature, |
| top_p=self.top_p, |
| top_k=-1 if self.top_k is None else self.top_k, |
| min_p=0.0 if self.min_p is None else self.min_p, |
| max_tokens=self.max_completion_length, |
| truncate_prompt_tokens=self.max_prompt_length, |
| guided_decoding_regex=self.guided_decoding_regex, |
| generation_kwargs=self.args.generation_kwargs, |
| ) |
| payload = (output["prompt_ids"], output["completion_ids"], output["logprobs"]) |
| else: |
| payload = None |
|
|
| |
| obj_list = [payload] |
| broadcast_object_list(obj_list, from_process=0) |
| all_prompt_ids, all_completion_ids, _ = obj_list[0] |
|
|
| |
| all_prompt_ids = [ids for ids in all_prompt_ids for _ in range(self.num_generations)] |
|
|
| process_slice = slice( |
| self.accelerator.process_index * len(prompts), |
| (self.accelerator.process_index + 1) * len(prompts), |
| ) |
| prompt_ids = all_prompt_ids[process_slice] |
| completion_ids = all_completion_ids[process_slice] |
|
|
| |
| elif self.vllm_mode == "colocate": |
| if self.guided_decoding_regex: |
| guided_decoding = GuidedDecodingParams(regex=self.guided_decoding_regex) |
| else: |
| guided_decoding = None |
|
|
| generation_kwargs = { |
| "n": 1, |
| "repetition_penalty": self.repetition_penalty, |
| "temperature": self.temperature, |
| "top_p": self.top_p, |
| "top_k": -1 if self.top_k is None else self.top_k, |
| "min_p": 0.0 if self.min_p is None else self.min_p, |
| "max_tokens": self.max_completion_length, |
| "truncate_prompt_tokens": self.max_prompt_length, |
| "guided_decoding": guided_decoding, |
| } |
| if self.args.generation_kwargs is not None: |
| generation_kwargs.update(self.args.generation_kwargs) |
| sampling_params = SamplingParams(**grpo_update_SamplingParams(SamplingParams, generation_kwargs, getattr(self.args, 'vllm_sampling_params', None))) |
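                # With tensor parallelism, every rank in a TP group must call generate()
                # on the identical full batch; prompts (and images) are gathered across
                # the group below, and this rank's share is sliced back out afterwards.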
|
|
| if self.vllm_tensor_parallel_size > 1: |
| |
| |
| orig_size = len(prompts_text) |
| gathered_prompts = [None for _ in range(self.vllm_tensor_parallel_size)] |
| torch.distributed.all_gather_object(gathered_prompts, prompts_text, group=self.tp_group) |
| all_prompts_text = [p for sublist in gathered_prompts for p in sublist] |
|
|
| if images is not None: |
| gathered_images = [None for _ in range(self.vllm_tensor_parallel_size)] |
| torch.distributed.all_gather_object(gathered_images, images, group=self.tp_group) |
| all_images = [img for sublist in gathered_images for img in sublist] |
| else: |
| all_images = None |
| else: |
| all_prompts_text = prompts_text |
| all_images = images |
|
|
| if images is not None and all_images: |
| vllm_inputs = [] |
| for prompt, image_list in zip(all_prompts_text, all_images): |
| vllm_inputs.append({"prompt": prompt, "multi_modal_data": {"image": image_list}}) |
|
|
| else: |
| vllm_inputs = all_prompts_text |
|
|
| with profiling_context(self, "vLLM.generate"): |
                    all_outputs = self.llm.generate(
                        vllm_inputs,
                        sampling_params = sampling_params,
                        use_tqdm = False,
                        lora_request = self.model.load_lora(
                            'rloo_trainer_lora_model_' + os.environ.get('CUDA_VISIBLE_DEVICES', '0').replace(',', ''),
                            load_tensors = True,
                        ),
                    )
|
|
| all_prompt_ids = [output.prompt_token_ids for output in all_outputs] |
| all_completion_ids = [output.token_ids for outputs in all_outputs for output in outputs.outputs] |
|
|
| if self.vllm_tensor_parallel_size > 1: |
| |
| |
| local_rank_in_group = torch.distributed.get_rank(group=self.tp_group) |
| tp_slice = slice(local_rank_in_group * orig_size, (local_rank_in_group + 1) * orig_size) |
| prompt_ids = all_prompt_ids[tp_slice] |
| completion_ids = all_completion_ids[tp_slice] |
| else: |
| prompt_ids = all_prompt_ids |
| completion_ids = all_completion_ids |
|
|
| if self.args.vllm_enable_sleep_mode: |
| self.llm.sleep(level=1) |
|
|
| elif self.use_transformers_paged: |
| |
| |
| paged_prompt_inputs = self.processing_class(text=prompts_text, **kwargs) |
| previous_attn = self.model_wrapped.config._attn_implementation |
|
|
| if is_flash_attn_2_available(): |
| self.model_wrapped.config._attn_implementation = "paged_attention" |
| else: |
| self.model_wrapped.config._attn_implementation = "sdpa_paged" |
| with ( |
| profiling_context(self, "transformers.generate_batch"), |
| unwrap_model_for_generation( |
| self.model_wrapped, self.accelerator, gather_deepspeed3_params=self.args.ds3_gather_for_generation |
| ) as unwrapped_model, |
| torch.no_grad(), |
| FSDP.summon_full_params(self.model_wrapped, recurse=False) if self.is_fsdp_enabled else nullcontext(), |
| ): |
| |
| if self.args.bf16: |
| unwrapped_model.to(torch.bfloat16) |
| elif self.args.fp16: |
| unwrapped_model.to(torch.float16) |
| with torch.inference_mode(): |
| all_outputs = unwrapped_model.generate_batch( |
| paged_prompt_inputs.input_ids, generation_config=self.generation_config, progress_bar=False |
| ) |
| unwrapped_model.train() |
| completion_ids = [output.generated_tokens for output in all_outputs.values()] |
| prompt_ids = paged_prompt_inputs.input_ids |
| |
| self.model_wrapped.config._attn_implementation = previous_attn |
|
|
| else: |
| |
| generate_inputs = self.processing_class( |
| text=prompts_text, |
| return_tensors="pt", |
| padding=True, |
| padding_side="left", |
| max_length=self.max_prompt_length, |
| truncation=True, |
| add_special_tokens=False, |
| **kwargs, |
| ) |
| generate_inputs = super()._prepare_inputs(generate_inputs) |
|
|
| with ( |
| profiling_context(self, "transformers.generate"), |
| unwrap_model_for_generation( |
| self.model_wrapped, self.accelerator, gather_deepspeed3_params=self.args.ds3_gather_for_generation |
| ) as unwrapped_model, |
| torch.no_grad(), |
| FSDP.summon_full_params(self.model_wrapped, recurse=False) if self.is_fsdp_enabled else nullcontext(), |
| ): |
| prompt_completion_ids = unwrapped_model.generate( |
| **generate_inputs, generation_config=self.generation_config, disable_compile=True |
| ) |
| |
| prompt_ids, prompt_mask = generate_inputs["input_ids"], generate_inputs["attention_mask"] |
| prompt_length = prompt_ids.size(1) |
| completion_ids = prompt_completion_ids[:, prompt_length:] |
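            # Keep tokens up to and including the first EOS of each completion; rows with
            # no EOS keep their full length. The masks then strip padding before the ids
            # are returned as plain Python lists.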
|
|
| |
| is_eos = completion_ids == self.eos_token_id |
| eos_idx = torch.full((is_eos.size(0),), is_eos.size(1), dtype=torch.long, device=device) |
| eos_idx[is_eos.any(dim=1)] = is_eos.int().argmax(dim=1)[is_eos.any(dim=1)] |
| sequence_indices = torch.arange(is_eos.size(1), device=device).expand(is_eos.size(0), -1) |
| completion_mask = (sequence_indices <= eos_idx.unsqueeze(1)).int() |
| prompt_ids = [p[m].tolist() for p, m in zip(prompt_ids, prompt_mask.bool())] |
| completion_ids = [c[m].tolist() for c, m in zip(completion_ids, completion_mask.bool())] |
|
|
| return prompt_ids, completion_ids, forward_kwargs |
|
|
| def _generate(self, prompts: list[str], images: Optional[list]): |
| device = self.accelerator.device |
| mode = "train" if self.model.training else "eval" |
|
|
| prompt_ids, completion_ids, forward_kwargs = self._generate_single_turn(prompts, images) |
|
|
| |
| prompt_lengths = torch.tensor([len(ids) for ids in prompt_ids], device=device) |
| completion_lengths = torch.tensor([len(ids) for ids in completion_ids], device=device) |
| agg_prompt_lengths = self.accelerator.gather(prompt_lengths) |
| agg_completion_lengths = self.accelerator.gather(completion_lengths) |
| total_prompt_tokens = agg_prompt_lengths.sum() |
| total_completion_tokens = agg_completion_lengths.sum() |
|
|
| |
| if mode == "train": |
| self.state.num_input_tokens_seen += (total_prompt_tokens + total_completion_tokens).item() |
| self._metrics[mode]["num_tokens"] = [self.state.num_input_tokens_seen] |
|
|
| |
| self._metrics[mode]["completions/mean_length"].append(agg_completion_lengths.float().mean().item()) |
| self._metrics[mode]["completions/min_length"].append(agg_completion_lengths.float().min().item()) |
| self._metrics[mode]["completions/max_length"].append(agg_completion_lengths.float().max().item()) |
|
|
| |
| eos_and_pad = [self.eos_token_id, self.pad_token_id] |
| is_truncated = torch.tensor([ids[-1] not in eos_and_pad for ids in completion_ids], device=device) |
| agg_is_truncated = self.accelerator.gather(is_truncated) |
| self._metrics[mode]["completions/clipped_ratio"].append(agg_is_truncated.float().mean().item()) |
| term_completion_lengths = agg_completion_lengths[~agg_is_truncated] |
| if len(term_completion_lengths) == 0: |
| term_completion_lengths = torch.zeros(1, device=device) |
| self._metrics[mode]["completions/mean_terminated_length"].append(term_completion_lengths.float().mean().item()) |
| self._metrics[mode]["completions/min_terminated_length"].append(term_completion_lengths.float().min().item()) |
| self._metrics[mode]["completions/max_terminated_length"].append(term_completion_lengths.float().max().item()) |
|
|
| return prompt_ids, completion_ids, forward_kwargs |
|
|
| def _generate_and_score_completions( |
| self, inputs: list[dict[str, Union[torch.Tensor, Any]]] |
| ) -> dict[str, Union[torch.Tensor, Any]]: |
| device = self.accelerator.device |
| mode = "train" if self.model.training else "eval" |
|
|
| prompts = [x["prompt"] for x in inputs] |
|
|
| if "images" in inputs[0]: |
| images = [example.get("images") for example in inputs] |
| elif "image" in inputs[0]: |
| images = [[example.get("image")] if example.get("image") is not None else None for example in inputs] |
| else: |
| images = None |
| |
| if images is not None and all(img_list == [] for img_list in images): |
| images = None |
|
|
| prompt_ids_list, completion_ids_list, forward_kwargs = self._generate(prompts, images) |
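        # Left-pad prompts and right-pad completions so that, after concatenation, every
        # row's prompt/completion boundary lines up at the same column.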
|
|
| |
| prompt_ids = [torch.tensor(ids, device=device) for ids in prompt_ids_list] |
| prompt_mask = [torch.ones_like(ids, dtype=torch.long) for ids in prompt_ids] |
| prompt_ids = pad(prompt_ids, padding_value=self.pad_token_id, padding_side="left") |
| prompt_mask = pad(prompt_mask, padding_value=0, padding_side="left") |
| completion_ids = [torch.tensor(ids, device=device) for ids in completion_ids_list] |
| completion_mask = [torch.ones_like(ids, dtype=torch.long) for ids in completion_ids] |
| completion_ids = pad(completion_ids, padding_value=self.pad_token_id, padding_side="right") |
| completion_mask = pad(completion_mask, padding_value=0, padding_side="right") |
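        # A completion whose last token is neither EOS nor PAD hit the length limit;
        # `mask_truncated_completions` zeroes such rows so they contribute no gradient.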
|
|
| |
| if self.mask_truncated_completions: |
| eos_and_pad = [self.eos_token_id, self.pad_token_id] |
| is_truncated = torch.tensor([ids[-1] not in eos_and_pad for ids in completion_ids_list], device=device) |
| completion_mask = completion_mask * (~is_truncated).unsqueeze(1).int() |
|
|
| |
| prompt_completion_ids = torch.cat([prompt_ids, completion_ids], dim=1) |
| attention_mask = torch.cat([prompt_mask, completion_mask], dim=1) |
| |
| if "token_type_ids" in forward_kwargs: |
| token_type_ids = forward_kwargs["token_type_ids"] |
| forward_kwargs["token_type_ids"] = torch.cat( |
| [token_type_ids, token_type_ids.new_zeros(completion_ids.shape)], dim=1 |
| ) |
|
|
| logits_to_keep = completion_ids.size(1) |
| batch_size = self.args.per_device_train_batch_size if mode == "train" else self.args.per_device_eval_batch_size |
|
|
| num_images = [len(img_list) for img_list in images] if images is not None else None |
|
|
| with torch.no_grad(): |
| |
| old_per_token_logps, _ = self._get_per_token_logps_and_entropies( |
| self.model, |
| prompt_completion_ids, |
| attention_mask, |
| logits_to_keep, |
| batch_size, |
| num_images=num_images, |
| **forward_kwargs, |
| ) |
| old_logps = (old_per_token_logps * completion_mask).sum(1) |
|
|
| |
| if self.beta != 0.0: |
| if self.ref_model is not None: |
| ref_per_token_logps, _ = self._get_per_token_logps_and_entropies( |
| self.ref_model, |
| prompt_completion_ids, |
| attention_mask, |
| logits_to_keep, |
| batch_size=batch_size, |
| num_images=num_images, |
| **forward_kwargs, |
| ) |
| else: |
| with self.accelerator.unwrap_model(self.model).disable_adapter(): |
| ref_per_token_logps, _ = self._get_per_token_logps_and_entropies( |
| self.model, |
| prompt_completion_ids, |
| attention_mask, |
| logits_to_keep, |
| batch_size=batch_size, |
| num_images=num_images, |
| **forward_kwargs, |
| ) |
| else: |
| ref_per_token_logps = None |
|
|
| |
| prompts_text = self.processing_class.batch_decode(prompt_ids, skip_special_tokens=True) |
| completions_text = self.processing_class.batch_decode(completion_ids, skip_special_tokens=True) |
| if is_conversational(inputs[0]): |
| completions = [] |
| for prompt, completion in zip(prompts, completions_text): |
| bootstrap = prompt.pop()["content"] if prompt[-1]["role"] == "assistant" else "" |
| completions.append([{"role": "assistant", "content": bootstrap + completion}]) |
| else: |
| completions = completions_text |
|
|
| |
| |
| |
| rewards_per_func = self._calculate_rewards(inputs, prompts, completions, completion_ids_list) |
|
|
| |
| rewards = (rewards_per_func * self.reward_weights.to(device).unsqueeze(0)).nansum(dim=1) |
|
|
| |
| if self.reward_clip_range: |
| rewards = rewards.clamp(min=self.reward_clip_range[0], max=self.reward_clip_range[1]) |
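        # RLOO folds the KL penalty into the reward rather than the loss: the per-token
        # log-ratio to the reference policy is summed over completion tokens (a
        # sequence-level KL estimate) and subtracted from the reward, scaled by beta.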
|
|
| |
| if self.beta != 0.0: |
| per_token_kl = old_per_token_logps - ref_per_token_logps |
| |
| kl = (per_token_kl * completion_mask).sum(-1) |
| kl = gather(kl) |
| rewards = rewards - self.beta * kl |
|
|
| grouped_rewards = rewards.view(-1, self.num_generations) |
| mean_grouped_rewards = grouped_rewards.mean(dim=1) |
| std_rewards = grouped_rewards.std(dim=1) |
| is_std_zero = torch.isclose(std_rewards, torch.zeros_like(std_rewards)) |
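        # Leave-one-out baseline: with k = num_generations completions per prompt, each
        # completion is baselined against the mean reward of the other k - 1, i.e.
        # (group_sum - r_i) / (k - 1). E.g. group rewards [1.0, 2.0, 3.0] give baselines
        # [2.5, 2.0, 1.5] and hence advantages [-1.5, 0.0, 1.5].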
|
|
| |
| grouped_sum = grouped_rewards.sum(dim=1, keepdim=True) |
| baselines = (grouped_sum - grouped_rewards) / (self.num_generations - 1) |
| baselines = baselines.view(-1) |
| advantages = rewards - baselines |
|
|
| |
| if self.normalize_advantages: |
| advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-4) |
|
|
| |
| process_slice = slice( |
| self.accelerator.process_index * len(prompts), |
| (self.accelerator.process_index + 1) * len(prompts), |
| ) |
| all_process_advantages = advantages.clone() |
| advantages = advantages[process_slice] |
|
|
| |
| if self.beta != 0.0: |
| mean_kl = (per_token_kl * completion_mask).sum() / completion_mask.sum().clamp(min=1.0) |
| self._metrics[mode]["kl"].append(self.accelerator.gather(mean_kl).nanmean().item()) |
|
|
| |
| for i, reward_func_name in enumerate(self.reward_func_names): |
| mean_rewards = torch.nanmean(rewards_per_func[:, i]).item() |
| self._metrics[mode][f"rewards/{reward_func_name}/mean"].append(mean_rewards) |
| std_func_rewards = nanstd(rewards_per_func[:, i]).item() |
| self._metrics[mode][f"rewards/{reward_func_name}/std"].append(std_func_rewards) |
| self._metrics[mode]["reward"].append(mean_grouped_rewards.mean().item()) |
| self._metrics[mode]["reward_std"].append(std_rewards.mean().item()) |
| self._metrics[mode]["frac_reward_zero_std"].append(is_std_zero.float().mean().item()) |
|
|
| |
| self._logs["prompt"].extend(gather_object(prompts_text)) |
| self._logs["completion"].extend(gather_object(completions_text)) |
| for i, name in enumerate(self.reward_func_names): |
| self._logs["rewards"][name].extend(rewards_per_func[:, i].tolist()) |
| self._logs["advantages"].extend(all_process_advantages.tolist()) |
|
|
| if images is not None: |
| self._logs["images"].extend(gather_object(images)) |
|
|
| output = { |
| "prompt_ids": prompt_ids, |
| "prompt_mask": prompt_mask, |
| "completion_ids": completion_ids, |
| "completion_mask": completion_mask, |
| "old_logps": old_logps, |
| "advantages": advantages, |
| } |
| if "pixel_values" in forward_kwargs: |
| output["pixel_values"] = forward_kwargs["pixel_values"] |
| if "image_grid_thw" in forward_kwargs: |
| output["image_grid_thw"] = forward_kwargs["image_grid_thw"] |
| if "pixel_attention_mask" in forward_kwargs: |
| output["pixel_attention_mask"] = forward_kwargs["pixel_attention_mask"] |
| if "image_sizes" in forward_kwargs: |
| output["image_sizes"] = forward_kwargs["image_sizes"] |
| if "token_type_ids" in forward_kwargs: |
| output["token_type_ids"] = forward_kwargs["token_type_ids"] |
| if images is not None: |
| output["num_images"] = num_images |
| return output |
|
|
| @profiling_decorator |
| def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None): |
| if return_outputs: |
| raise ValueError("The RLOOTrainer does not support returning outputs") |
| return self._compute_loss(model, inputs) |
|
|
| def _compute_loss(self, model, inputs): |
| |
| prompt_ids, prompt_mask = inputs["prompt_ids"], inputs["prompt_mask"] |
| completion_ids, completion_mask = inputs["completion_ids"], inputs["completion_mask"] |
| input_ids = torch.cat([prompt_ids, completion_ids], dim=1) |
| attention_mask = torch.cat([prompt_mask, completion_mask], dim=1) |
| logits_to_keep = completion_ids.size(1) |
|
|
| |
| per_token_logps, entropies = self._get_per_token_logps_and_entropies( |
| model, |
| input_ids, |
| attention_mask, |
| logits_to_keep, |
| compute_entropy=True, |
| pixel_values=inputs.get("pixel_values"), |
| image_grid_thw=inputs.get("image_grid_thw"), |
| num_images=inputs.get("num_images"), |
| pixel_attention_mask=inputs.get("pixel_attention_mask"), |
| image_sizes=inputs.get("image_sizes"), |
| token_type_ids=inputs.get("token_type_ids"), |
| ) |
|
|
| logps = (per_token_logps * completion_mask).sum(1) |
| old_logps = inputs["old_logps"] |
| log_ratio = logps - old_logps |
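        # PPO-style clipped surrogate applied at the *sequence* level: the importance
        # ratio is exp(summed new log-probs - summed old log-probs), and taking
        # -min(unclipped, clipped) removes the gradient incentive once the ratio leaves
        # [1 - epsilon_low, 1 + epsilon_high] in the direction the advantage favors.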
|
|
| |
| advantages = inputs["advantages"] |
| coef_1 = torch.exp(log_ratio) |
| coef_2 = torch.clamp(coef_1, 1 - self.epsilon_low, 1 + self.epsilon_high) |
| per_sequence_loss1 = coef_1 * advantages |
| per_sequence_loss2 = coef_2 * advantages |
| per_sequence_loss = -torch.min(per_sequence_loss1, per_sequence_loss2) |
| loss = per_sequence_loss.mean() |
|
|
| |
| mode = "train" if self.model.training else "eval" |
|
|
| |
| mean_entropy = (entropies * completion_mask).sum() / completion_mask.sum().clamp(min=1.0) |
| self._metrics[mode]["entropy"].append(self.accelerator.gather(mean_entropy).nanmean().item()) |
|
|
| |
| is_low_clipped = (coef_1 < 1 - self.epsilon_low) & (advantages < 0) |
| is_high_clipped = (coef_1 > 1 + self.epsilon_high) & (advantages > 0) |
| is_region_clipped = is_low_clipped | is_high_clipped |
| gathered_low_clip = self.accelerator.gather(is_low_clipped.float().mean()) |
| self._metrics[mode]["clip_ratio/low_mean"].append(gathered_low_clip.nanmean().item()) |
| self._metrics[mode]["clip_ratio/low_min"].append(nanmin(gathered_low_clip).item()) |
| gathered_high_clip = self.accelerator.gather(is_high_clipped.float().mean()) |
| self._metrics[mode]["clip_ratio/high_mean"].append(gathered_high_clip.nanmean().item()) |
| self._metrics[mode]["clip_ratio/high_max"].append(nanmax(gathered_high_clip).item()) |
| gathered_clip_ratio = self.accelerator.gather(is_region_clipped.float().mean()) |
| self._metrics[mode]["clip_ratio/region_mean"].append(gathered_clip_ratio.nanmean().item()) |
| return loss |
|
|
| def prediction_step(self, model, inputs, prediction_loss_only, ignore_keys: Optional[list[str]] = None): |
| inputs = self._prepare_inputs(inputs) |
| with torch.no_grad(): |
| with self.compute_loss_context_manager(): |
| loss = self.compute_loss(model, inputs) |
| loss = loss.mean().detach() |
| return loss, None, None |
|
|
| def log(self, logs: dict[str, float], start_time: Optional[float] = None) -> None: |
| mode = "train" if self.model.training else "eval" |
| metrics = {key: sum(val) / len(val) for key, val in self._metrics[mode].items()} |
|
|
| |
| |
| if mode == "eval": |
| metrics = {f"eval_{key}": val for key, val in metrics.items()} |
|
|
| logs = {**logs, **metrics} |
| super().log(logs, start_time) |
| self._metrics[mode].clear() |
|
|
| if self.accelerator.is_main_process and self.log_completions: |
| if is_rich_available(): |
| print_prompt_completions_sample( |
| self._logs["prompt"], |
| self._logs["completion"], |
| self._logs["rewards"], |
| self._logs["advantages"], |
| self.state.global_step, |
| self.num_completions_to_print, |
| ) |
|
|
            if self.args.report_to and "wandb" in self.args.report_to:
                import wandb  # local import so wandb stays an optional dependency
                if wandb.run is None:
                    return
| import pandas as pd |
|
|
| table = { |
| "step": [str(self.state.global_step)] * len(self._logs["prompt"]), |
| "prompt": self._logs["prompt"], |
| "completion": self._logs["completion"], |
| **self._logs["rewards"], |
| "advantage": self._logs["advantages"], |
| } |
|
|
| if self._logs["images"]: |
| table["images"] = [] |
| for image_list in self._logs["images"]: |
| |
| table["images"].append([wandb.Image(image) for image in image_list]) |
|
|
| df = pd.DataFrame(table) |
| if self.wandb_log_unique_prompts: |
| df = df.drop_duplicates(subset=["prompt"]) |
| wandb.log({"completions": wandb.Table(dataframe=df)}) |
|
|
| |
| def _save_checkpoint(self, model, trial): |
| if self.args.hub_model_id is None: |
| model_name = Path(self.args.output_dir).name |
| else: |
| model_name = self.args.hub_model_id.split("/")[-1] |
| self.create_model_card(model_name=model_name) |
| super()._save_checkpoint(model, trial) |
| class UnslothRLOOTrainer(_UnslothRLOOTrainer): |
| """ |
| |
    Trainer for the REINFORCE Leave-One-Out (RLOO) method. This algorithm was initially proposed in the paper [Back to
    Basics: Revisiting REINFORCE Style Optimization for Learning from Human Feedback in
    LLMs](https://huggingface.co/papers/2402.14740).
| |
| Example: |
| |
| ```python |
| from datasets import load_dataset |
| from trl import RLOOTrainer |
| |
| dataset = load_dataset("trl-lib/tldr", split="train") |
| def reward_func(completions, **kwargs): |
| # Dummy reward function that rewards completions with more unique letters. |
| return [float(len(set(completion))) for completion in completions] |
| trainer = RLOOTrainer( |
| model="Qwen/Qwen2-0.5B-Instruct", |
| reward_funcs=reward_func, |
| train_dataset=dataset, |
| ) |
| |
| trainer.train() |
| ``` |
| |
| Args: |
| model (`Union[str, PreTrainedModel]`): |
| Model to be trained. Can be either: |
| |
| - A string, being the *model id* of a pretrained model hosted inside a model repo on huggingface.co, or a |
| path to a *directory* containing model weights saved using |
| [`~transformers.PreTrainedModel.save_pretrained`], e.g., `'./my_model_directory/'`. The model is loaded |
| using [`~transformers.AutoModelForCausalLM.from_pretrained`] with the keyword arguments in |
| `args.model_init_kwargs`. |
| - A [`~transformers.PreTrainedModel`] object. Only causal language models are supported. |
| reward_funcs (`Union[RewardFunc, list[RewardFunc]]`): |
| Reward functions to be used for computing the rewards. To compute the rewards, we call all the reward |
| functions with the prompts and completions and sum the rewards. Can be either: |
| |
| - A single reward function, such as: |
| - A string: The *model ID* of a pretrained model hosted inside a model repo on huggingface.co, or a |
| path to a *directory* containing model weights saved using |
| [`~transformers.PreTrainedModel.save_pretrained`], e.g., `'./my_model_directory/'`. The model is loaded |
| using [`~transformers.AutoModelForSequenceClassification.from_pretrained`] with `num_labels=1` and the |
| keyword arguments in `args.model_init_kwargs`. |
| - A [`~transformers.PreTrainedModel`] object: Only sequence classification models are supported. |
| - A custom reward function: The function is provided with the prompts and the generated completions, |
| plus any additional columns in the dataset. It should return a list of rewards. Custom reward |
| functions can also return `None` when the reward is not applicable to those samples. This is useful |
| for multi-task training where different reward functions apply to different types of samples. When a |
| reward function returns `None` for a sample, that reward function is excluded from the reward |
| calculation for that sample. For more details, see [Using a custom reward |
| function](#using-a-custom-reward-function). |
| |
              The trainer's state is also passed to the reward function. It is an instance of
              [`~transformers.TrainerState`] and can be accessed via the `trainer_state` argument in the reward
              function's signature.
| - A list of reward functions, where each item can independently be any of the above types. Mixing different |
| types within the list (e.g., a string model ID and a custom reward function) is allowed. |
| args ([`RLOOConfig`], *optional*): |
| Configuration for this trainer. If `None`, a default configuration is used. |
| train_dataset ([`~datasets.Dataset`] or [`~datasets.IterableDataset`]): |
            Dataset to use for training. It must include a column `"prompt"`. Any additional columns in the dataset are
| ignored. The format of the samples can be either: |
| |
| - [Standard](dataset_formats#standard): Each sample contains plain text. |
| - [Conversational](dataset_formats#conversational): Each sample contains structured messages (e.g., role |
| and content). |
| eval_dataset ([`~datasets.Dataset`], [`~datasets.IterableDataset`] or `dict[str, Union[Dataset, IterableDataset]]`): |
| Dataset to use for evaluation. It must meet the same requirements as `train_dataset`. |
| processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.ProcessorMixin`], *optional*): |
| Processing class used to process the data. The padding side must be set to "left". If `None`, the |
| processing class is loaded from the model's name with [`~transformers.AutoProcessor.from_pretrained`]. A |
| padding token, `tokenizer.pad_token`, must be set. If the processing class has not set a padding token, |
| `tokenizer.eos_token` will be used as the default. |
| reward_processing_classes ([`~transformers.PreTrainedTokenizerBase`] or `list[PreTrainedTokenizerBase]`, *optional*): |
| Processing classes corresponding to the reward functions specified in `reward_funcs`. Can be either: |
| |
| - A single processing class: Used when `reward_funcs` contains only one reward function. |
| - A list of processing classes: Must match the order and length of the reward functions in `reward_funcs`. |
| If set to `None`, or if an element of the list corresponding to a [`~transformers.PreTrainedModel`] is |
| `None`, the tokenizer for the model is automatically loaded using |
| [`~transformers.AutoTokenizer.from_pretrained`]. For elements in `reward_funcs` that are custom reward |
| functions (not [`~transformers.PreTrainedModel`]), the corresponding entries in `reward_processing_classes` |
| are ignored. |
| callbacks (list of [`~transformers.TrainerCallback`], *optional*): |
            List of callbacks to customize the training loop. These are added to the list of default callbacks
            detailed [here](https://huggingface.co/docs/transformers/main_classes/callback).
| |
| If you want to remove one of the default callbacks used, use the [`~transformers.Trainer.remove_callback`] |
| method. |
| optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`, *optional*, defaults to `(None, None)`): |
| A tuple containing the optimizer and the scheduler to use. Will default to an instance of [`AdamW`] on your |
| model and a scheduler given by [`get_linear_schedule_with_warmup`] controlled by `args`. |
| peft_config ([`~peft.PeftConfig`], *optional*): |
| PEFT configuration used to wrap the model. If `None`, the model is not wrapped. |
| |
| config: |
| |
| <Deprecated version="0.22.0"> |
| |
| This parameter is deprecated and will be removed in version 0.25.0. Use `args` instead. |
| |
| </Deprecated> |
| |
| reward_model: |
| <Deprecated version="0.22.0"> |
| |
| This parameter is deprecated and will be removed in version 0.25.0. Use `reward_funcs` instead. |
| |
| </Deprecated> |
| |
| policy: |
| |
| <Deprecated version="0.22.0"> |
| |
| This parameter is deprecated and will be removed in version 0.25.0. Use `model` instead. |
| |
| </Deprecated> |
| |
| ref_policy: |
| |
| <Deprecated version="0.22.0"> |
| |
| This parameter is deprecated and will be removed in version 0.25.0. To use the initial model as the |
| reference model, simply omit this parameter. The parameter is ignored. |
| |
| </Deprecated> |
| |
| data_collator: |
| |
| <Deprecated version="0.22.0"> |
| |
| This parameter is deprecated and will be removed in version 0.25.0. The RLOOTrainer does not use a data |
| collator, so this parameter is ignored. |
| |
| </Deprecated> |
| |
| """ |
| def __init__( |
| self, |
| model = None, |
| reward_funcs = None, |
| args = None, |
| train_dataset = None, |
| eval_dataset = None, |
| processing_class = None, |
| reward_processing_classes = None, |
| callbacks = None, |
| peft_config = None, |
| config = None, |
| reward_model = None, |
| policy = None, |
| ref_policy = None, |
| data_collator = None, |
| **kwargs |
| ): |
| if args is None: args = UnslothRLOOConfig() |
| use_bf16 = getattr(args, 'bf16', False) |
| if type(use_bf16) is not bool: use_bf16 = False |
| use_fp16 = getattr(args, 'fp16', False) |
| if type(use_fp16) is not bool: use_fp16 = False |
| force_float32 = False |
| full_finetuning = os.environ.get('UNSLOTH_ENABLE_FULL_FINETUNING', '0') == '1' |
| if not full_finetuning and (os.environ.get('UNSLOTH_FORCE_FLOAT32', '0') == '1'): |
| print('Unsloth: Switching to float32 training since model cannot work with float16') |
| force_float32 = True |
| mixed_precision_dtype = os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32') |
| dtype = getattr(model.config, 'dtype', None) or getattr(model.config, 'torch_dtype', None) |
| if dtype is None: dtype = model.get_input_embeddings().weight.dtype |
| from unsloth_zoo.utils import _get_dtype |
| dtype = _get_dtype(dtype) |
| float16 = dtype == torch.float16 |
| if not force_float32 and (float16 and use_bf16): raise TypeError('Unsloth: Model is in float16 precision but you want to use bfloat16 precision. Set fp16 to `True` and bf16 to `False`') |
| if not force_float32 and (not float16 and use_fp16): raise TypeError('Unsloth: Model is in bfloat16 precision but you want to use float16 precision. Set fp16 to `False` and bf16 to `True`') |
| if force_float32: |
| |
| args.fp16 = False |
| args.bf16 = False |
| os.environ['ACCELERATE_MIXED_PRECISION'] = 'no' |
| if hasattr(args, 'mixed_precision'): args.mixed_precision = 'no' |
| |
| elif (not use_bf16 and not use_fp16) and mixed_precision_dtype == 'float32': |
| |
| args.fp16 = float16 |
| args.bf16 = not float16 |
| os.environ['ACCELERATE_MIXED_PRECISION'] = 'fp16' if float16 else 'bf16' |
| if hasattr(args, 'mixed_precision'): args.mixed_precision = 'fp16' if float16 else 'bf16' |
| |
| elif mixed_precision_dtype == 'bfloat16': |
| |
| args.fp16 = False |
| args.bf16 = False |
| os.environ['ACCELERATE_MIXED_PRECISION'] = 'no' |
| if hasattr(args, 'mixed_precision'): args.mixed_precision = 'no' |
| |
| |
| if getattr(args, 'eval_dataset', None) is not None and getattr(args, 'eval_strategy', 'no') == 'no': |
| args.eval_strategy = 'steps' |
| if getattr(args, 'eval_steps', None) is None: args.eval_steps = 0.1 |
| ga_steps = getattr(args, 'gradient_accumulation_steps', None) |
| if ga_steps is not None and ga_steps > 1: |
| from transformers import __version__ as transformers_version |
| if Version(transformers_version) <= Version('4.45.2'): |
| print('**** Unsloth: Please use our fixed gradient_accumulation_steps by updating transformers, TRL and Unsloth!\n' |
| '`pip install --upgrade --no-cache-dir --force-reinstall --no-deps unsloth transformers trl unsloth_zoo`') |
| if getattr(args, 'eval_strategy', 'no') != 'no': |
| eval_bsz = getattr(args, 'per_device_eval_batch_size', 8) |
| if eval_bsz == 8 and args.per_device_train_batch_size < eval_bsz: args.per_device_eval_batch_size = args.per_device_train_batch_size |
| if getattr(args, 'eval_accumulation_steps', None) is None and ga_steps is not None: args.eval_accumulation_steps = ga_steps |
| fp16_full_eval = getattr(args, 'fp16_full_eval', False) |
| if type(fp16_full_eval) is not bool: fp16_full_eval = False |
| bf16_full_eval = getattr(args, 'bf16_full_eval', False) |
| if type(bf16_full_eval) is not bool: bf16_full_eval = False |
| if args.fp16 and bf16_full_eval: args.bf16_full_eval = False; args.fp16_full_eval = True |
| if args.bf16 and fp16_full_eval: args.bf16_full_eval = True; args.fp16_full_eval = False |
| if force_float32: |
| args.bf16_full_eval = False |
| args.fp16_full_eval = False |
| elif os.environ.get('UNSLOTH_MIXED_PRECISION', 'float32') == 'bfloat16': |
| args.bf16_full_eval = True |
| args.fp16_full_eval = False |
| elif not bf16_full_eval and not fp16_full_eval: |
| args.bf16_full_eval = args.bf16 |
| args.fp16_full_eval = args.fp16 |
| _output_logits = False |
| if locals().get('compute_metrics', None) is not None: _output_logits = True |
| if locals().get('preprocess_logits_for_metrics', None) is not None: _output_logits = True |
| if _output_logits: |
| os.environ['UNSLOTH_RETURN_LOGITS'] = '1' |
| if 'max_seq_length' not in locals() and not hasattr(args, 'max_seq_length'): |
| pass |
| else: |
| model_max_seq_length = getattr(model, 'max_seq_length', None) |
| args_max_seq_length = getattr(args, 'max_seq_length', None) |
| if args_max_seq_length is None and model_max_seq_length is not None: |
| max_seq_length = model.max_seq_length |
| if hasattr(args, 'max_seq_length'): args.max_seq_length = max_seq_length |
| elif args_max_seq_length is not None and model_max_seq_length is not None: |
| if args_max_seq_length > model_max_seq_length: |
| print('Unsloth: You set `max_seq_length` as ' + str(args_max_seq_length) + ' but ' |
| 'the maximum the model supports is ' + str(model_max_seq_length) + '. We shall reduce it.') |
| args.max_seq_length = model_max_seq_length |
| if model is not None and hasattr(model, 'for_training'): |
| model.for_training(use_gradient_checkpointing=getattr(args, 'gradient_checkpointing', True)) |
| if 'tokenizer' in locals() and hasattr(tokenizer, 'padding_side'): tokenizer.padding_side = 'right' |
| if 'processing_class' in locals(): |
| if hasattr(processing_class, 'padding_side'): processing_class.padding_side = 'right' |
| if hasattr(processing_class, 'tokenizer') and hasattr(processing_class.tokenizer, 'padding_side'): processing_class.tokenizer.padding_side = 'right' |
| __tokenizer = processing_class if 'processing_class' in locals() else tokenizer |
| from unsloth_zoo.vision_utils import UnslothVisionDataCollator |
| if not isinstance(data_collator, UnslothVisionDataCollator): |
| if isinstance(data_collator, DataCollatorForSeq2Seq) and 'labels' not in train_dataset.column_names: |
| data_collator = TransformersDataCollatorForLanguageModeling( |
| __tokenizer, |
| mlm = False, |
| mlm_probability = 0.0, |
| pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None), |
| ) |
| elif isinstance(data_collator, TransformersDataCollatorForLanguageModeling) and 'labels' in train_dataset.column_names: |
| data_collator = DataCollatorForSeq2Seq( |
| __tokenizer, |
| pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None), |
| ) |
| else: |
| if hasattr(args, 'remove_unused_columns'): args.remove_unused_columns = False |
| if hasattr(args, 'dataset_text_field'): args.dataset_text_field = '' |
| if hasattr(args, 'dataset_kwargs'): args.dataset_kwargs = {'skip_prepare_dataset': True} |
| if not isinstance(data_collator, UnslothVisionDataCollator): |
| if not hasattr(__tokenizer, 'pad') and hasattr(__tokenizer, 'tokenizer'): |
| if isinstance(data_collator, DataCollatorForSeq2Seq): |
| data_collator = DataCollatorForSeq2Seq( |
| __tokenizer.tokenizer, |
| pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None), |
| ) |
| else: |
| data_collator = TransformersDataCollatorForLanguageModeling( |
| __tokenizer.tokenizer, |
| mlm = False, |
| mlm_probability = 0.0, |
| pad_to_multiple_of = getattr(args, 'pad_to_multiple_of', None), |
| ) |
| other_metrics = [] |
| |
| from unsloth_zoo.logging_utils import PatchRLStatistics |
| PatchRLStatistics('rloo_trainer', other_metrics) |
| |
| |
| |
| if getattr(args, "parallel_mode", None) == ParallelMode.NOT_DISTRIBUTED and args.n_gpu > 1: |
| if getattr(args, "_n_gpu", 1) != 1: |
| args._n_gpu = 1 |
| if "model" in locals() and hasattr(model, "for_training"): |
| model.for_training(use_gradient_checkpointing=getattr(args, 'gradient_checkpointing', True)) |
| super().__init__( |
| model = model, |
| reward_funcs = reward_funcs, |
| args = args, |
| train_dataset = train_dataset, |
| eval_dataset = eval_dataset, |
| processing_class = processing_class, |
| reward_processing_classes = reward_processing_classes, |
| callbacks = callbacks, |
| peft_config = peft_config, |
| config = config, |
| reward_model = reward_model, |
| policy = policy, |
| ref_policy = ref_policy, |
| data_collator = data_collator,**kwargs) |
| if "model" in locals() and hasattr(model, "for_inference"): |
| model.for_inference() |
| if hasattr(self, 'neftune_hook_handle'): |
| self.neftune_hook_handle.remove() |
| if hasattr(self, 'neftune_hook_handle'): del self.neftune_hook_handle |
| if getattr(args, 'neftune_noise_alpha', None) is not None: |
| model.get_input_embeddings().neftune_noise_alpha = self.neftune_noise_alpha |
| pass |
| if hasattr(self, 'accelerator'): |
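            # Propagate the accelerator's grad scaler down the chain of nested `.model`
            # wrappers (presumably so Unsloth's patched training step can reach it).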
| scaler = self.accelerator.scaler |
| current_model = model |
| while hasattr(current_model, 'model'): |
| current_model.accelerator_scaler = scaler |
| current_model = current_model.model |
| current_model.accelerator_scaler = scaler |
| pass |
| if hasattr(self, 'train'): |
| self.train = MethodType(prepare_for_training_mode(self.__class__.train), self) |
| pass |
| if hasattr(self, 'llm') and self.llm is not None and hasattr(self.llm, 'get_tokenizer'): |
| _vllm_tok = self.llm.get_tokenizer() |
| _pc = getattr(self, 'processing_class', None) or getattr(self, 'tokenizer', None) |
| if _vllm_tok is not None and _pc is not None and getattr(_pc, 'chat_template', None) is not None and getattr(_vllm_tok, 'chat_template', None) is None: |
| _vllm_tok.chat_template = _pc.chat_template |
| pass |
| |
| pass |
|
|
|
|
| if hasattr(logger, "addFilter"): |
| import logging |
    class HideLoggingMessage(logging.Filter):
        def __init__(self, text): super().__init__(); self.text = text
        def filter(self, x): return self.text not in x.getMessage()
| pass |
| logger.addFilter(HideLoggingMessage("`use_cache=True`")) |
|
|
|
|