# Copyright 2020-2026 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import asyncio
import hashlib
import importlib.resources as pkg_resources
import os
import random
import socket
import threading
import types
from collections.abc import Mapping, Sequence, Sized
from contextlib import contextmanager
from importlib.metadata import version
from itertools import accumulate
from typing import TypeVar

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import transformers
from accelerate import PartialState, logging
from huggingface_hub import ModelCard, ModelCardData
from torch.utils.data import Sampler
from transformers import (
    AutoConfig,
    BitsAndBytesConfig,
    PretrainedConfig,
    PreTrainedModel,
    is_comet_available,
    is_trackio_available,
)
from transformers.models.auto.auto_factory import _BaseAutoModelClass
from transformers.utils import (
    is_peft_available,
    is_rich_available,
    is_torch_xpu_available,
)

from ..trainer.model_config import ModelConfig


if is_rich_available():
    from rich.console import Console
    from rich.panel import Panel
    from rich.table import Table
    from rich.text import Text

if is_comet_available():
    import comet_ml

if is_peft_available():
    from peft import LoraConfig, PeftConfig, PeftModel


logger = logging.get_logger(__name__)


def _is_port_free(port: int, host: str = "127.0.0.1") -> bool:
    """
    Check whether a TCP port can currently be bound on `host`.

    Args:
        port (`int`):
            TCP port number to probe.
        host (`str`, *optional*, defaults to `"127.0.0.1"`):
            IPv4 address the probe socket is bound to.

    Returns:
        `bool`:
            `True` if a temporary socket could be bound to `(host, port)`, `False` if the attempt raised `OSError`.
    """
    address = (host, port)
    try:
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
            # SO_REUSEADDR is set before binding; the probe socket is closed as soon as the block exits.
            probe.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
            probe.bind(address)
    except OSError:
        return False
    return True


def _find_free_port() -> int:
    """
    Pick a TCP port that is free at the time of the call.

    A short list of preferred ports is tried in order; if none of them can be bound, the operating system is asked
    for an ephemeral port by binding a temporary socket to port `0`.

    Returns:
        `int`:
            The selected port number. The probing socket is closed before returning, so the port is not reserved.
    """
    preferred_ports = (29500, 23456, 12355, 12345)
    first_available = next((candidate for candidate in preferred_ports if _is_port_free(candidate)), None)
    if first_available is not None:
        return first_available

    # None of the preferred ports is usable: let the OS choose one.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as ephemeral:
        ephemeral.bind(("", 0))
        bound_address = ephemeral.getsockname()
        return bound_address[1]


def ensure_master_addr_port(addr: str | None = None, port: int | None = None) -> None:
    """
    Ensure `MASTER_ADDR`/`MASTER_PORT` are set safely.

    - Respects existing environment variables.
    - Defaults `MASTER_ADDR` to localhost if unset.
    - Chooses a free TCP port if `MASTER_PORT` is unset to avoid collisions.
    - If `MASTER_PORT` is set to `"0"` or `"auto"`, it is resolved to a free port.
    """
    os.environ["MASTER_ADDR"] = os.environ.get("MASTER_ADDR") or addr or "localhost"

    env_port = os.environ.get("MASTER_PORT", "").strip().lower()
    if port is None and env_port not in {"", "0", "auto"}:
        try:
            port = int(env_port)
        except ValueError:
            pass

    os.environ["MASTER_PORT"] = str(_find_free_port() if port in (None, 0) else port)


def pad(
    tensors: list[torch.Tensor],
    padding_value: int = 0,
    padding_side: str = "right",
    pad_to_multiple_of: int | None = None,
) -> torch.Tensor:
    """
    Pads a list of tensors to the same shape along the first dimension.

    Args:
        tensors (`list[torch.Tensor]`):
            List of input tensors to pad.
        padding_value (`int`):
            Value to use for padding. Default is 0.
        padding_side (`str`):
            Side on which to add padding. Must be 'left' or 'right'. Default is 'right'.
        pad_to_multiple_of (`int`, *optional*):
            If set will pad the sequence to a multiple of the provided value.

    Returns:
        `torch.Tensor`:
            A single tensor containing the padded tensors.

    Examples:
    ```python
    >>> import torch

    >>> pad([torch.tensor([1, 2, 3]), torch.tensor([4, 5])])
    tensor([[1, 2, 3],
            [4, 5, 0]])

    >>> pad([torch.tensor([[1, 2], [3, 4]]), torch.tensor([[5, 6]])])
    tensor([[[1, 2],
            [3, 4]],
            [[5, 6],
            [0, 0]]])
    ```
    """
    # Determine the maximum shape for each dimension
    output_shape = np.max([t.shape for t in tensors], 0).tolist()

    # Apply pad_to_multiple_of to the first (sequence) dimension
    if pad_to_multiple_of is not None:
        remainder = output_shape[0] % pad_to_multiple_of
        if remainder != 0:
            output_shape[0] += pad_to_multiple_of - remainder

    # Create an output tensor filled with the padding value
    output = torch.full((len(tensors), *output_shape), padding_value, dtype=tensors[0].dtype, device=tensors[0].device)

    for i, t in enumerate(tensors):
        if padding_side == "left":
            seq_start = output_shape[0] - t.shape[0]
        elif padding_side == "right":
            seq_start = 0
        else:
            raise ValueError("padding_side must be 'left' or 'right'")

        # Define the slices
        seq_slice = slice(seq_start, seq_start + t.shape[0])
        slices = (seq_slice,) + tuple(slice(0, s) for s in t.shape[1:])
        output[i][slices] = t

    return output


def disable_dropout_in_model(model: torch.nn.Module) -> None:
    """
    Set the dropout probability of every `torch.nn.Dropout` submodule of `model` to zero, in place.

    Only modules that are instances of `torch.nn.Dropout` are modified.

    Args:
        model (`torch.nn.Module`):
            Model whose dropout layers are disabled.
    """
    dropout_layers = (layer for layer in model.modules() if isinstance(layer, torch.nn.Dropout))
    for layer in dropout_layers:
        layer.p = 0


def get_quantization_config(model_args: ModelConfig) -> BitsAndBytesConfig | None:
    """
    Build the bitsandbytes quantization config requested by `model_args`.

    Args:
        model_args ([`ModelConfig`]):
            Model arguments. `load_in_4bit` takes precedence over `load_in_8bit`.

    Returns:
        [`~transformers.BitsAndBytesConfig`] or `None`:
            A 4-bit config if `load_in_4bit` is set, an 8-bit config if `load_in_8bit` is set, otherwise `None`.
    """
    if model_args.load_in_4bit:
        return BitsAndBytesConfig(
            load_in_4bit=True,
            # The compute dtype follows `dtype` so that it stays consistent with the model weights
            bnb_4bit_compute_dtype=model_args.dtype,
            bnb_4bit_quant_type=model_args.bnb_4bit_quant_type,
            bnb_4bit_use_double_quant=model_args.use_bnb_nested_quant,
            bnb_4bit_quant_storage=model_args.bnb_4bit_quant_storage,
        )

    if model_args.load_in_8bit:
        return BitsAndBytesConfig(load_in_8bit=True)

    return None


def get_kbit_device_map() -> dict[str, int] | None:
    """
    Return a device map that places the whole model on the current process's local device.

    Returns:
        `dict[str, int]` or `None`:
            `{"": local_process_index}` when CUDA or XPU is available, otherwise `None`.
    """
    accelerator_present = torch.cuda.is_available() or is_torch_xpu_available()
    if not accelerator_present:
        return None
    # The empty-string key maps the entire model to a single device index
    return {"": PartialState().local_process_index}


def get_peft_config(model_args: ModelConfig) -> "PeftConfig | None":
    """
    Build a LoRA configuration from `model_args`.

    Args:
        model_args ([`ModelConfig`]):
            Model arguments holding the `use_peft` switch and the `lora_*` / `use_rslora` / `use_dora` settings.

    Returns:
        [`~peft.PeftConfig`] or `None`:
            A `LoraConfig` built from `model_args`, or `None` when `model_args.use_peft` is `False`.

    Raises:
        `ValueError`: If PEFT is requested but the `peft` library is not installed.
    """
    # Identity check on purpose: only an explicit `False` disables PEFT
    if model_args.use_peft is False:
        return None

    if not is_peft_available():
        raise ValueError(
            "You need to have PEFT library installed in your environment, make sure to install `peft`. "
            "Make sure to run `pip install -U peft`."
        )

    lora_kwargs = {
        "task_type": model_args.lora_task_type,
        "r": model_args.lora_r,
        "target_modules": model_args.lora_target_modules,
        "target_parameters": model_args.lora_target_parameters,
        "lora_alpha": model_args.lora_alpha,
        "lora_dropout": model_args.lora_dropout,
        "bias": "none",
        "use_rslora": model_args.use_rslora,
        "use_dora": model_args.use_dora,
        "modules_to_save": model_args.lora_modules_to_save,
    }
    return LoraConfig(**lora_kwargs)


def generate_model_card(
    base_model: str | None,
    model_name: str,
    hub_model_id: str,
    dataset_name: str | None,
    tags: list[str],
    wandb_url: str | None,
    trackio_url: str | None,
    trainer_name: str,
    trainer_citation: str | None = None,
    template_file: str | None = None,
    paper_title: str | None = None,
    paper_id: str | None = None,
    comet_url: str | None = None,
) -> ModelCard:
    """
    Render a [`~huggingface_hub.ModelCard`] from one of the templates shipped in the `trl` package.

    Args:
        base_model (`str` or `None`):
            Base model name.
        model_name (`str`):
            Model name.
        hub_model_id (`str`):
            Hub model ID as `username/model_id`.
        dataset_name (`str` or `None`):
            Dataset name.
        tags (`list[str]`):
            Tags. `"generated_from_trainer"` is always prepended.
        wandb_url (`str` or `None`):
            Weights & Biases run URL.
        trackio_url (`str` or `None`):
            Trackio Space URL.
        trainer_name (`str`):
            Trainer name.
        trainer_citation (`str` or `None`, defaults to `None`):
            Trainer citation as a BibTeX entry.
        template_file (`str`, *optional*):
            Template file name located in the `trl/templates` directory. Defaults to `lm_model_card.md`.
        paper_title (`str` or `None`, defaults to `None`):
            Paper title.
        paper_id (`str` or `None`, defaults to `None`):
            ArXiv paper ID as `YYMM.NNNNN`.
        comet_url (`str` or `None`, defaults to `None`):
            Comet experiment URL.

    Returns:
        [`~huggingface_hub.ModelCard`]:
            The rendered model card. The installed versions of `trl`, `transformers`, `torch`, `datasets` and
            `tokenizers` are passed to the template.
    """
    metadata = ModelCardData(
        base_model=base_model,
        datasets=dataset_name,
        library_name="transformers",
        # NOTE(review): the keyword is spelled `licence` (not `license`) and its value is the literal "license".
        # Presumably this is forwarded as an extra metadata field rather than the card's license - confirm intent.
        licence="license",
        model_name=model_name,
        tags=["generated_from_trainer", *tags],
    )

    template_name = template_file or "lm_model_card.md"
    template_location = pkg_resources.files("trl").joinpath(f"templates/{template_name}")

    template_variables = {
        "base_model": base_model,
        "model_name": model_name,
        "hub_model_id": hub_model_id,
        "dataset_name": dataset_name,
        "wandb_url": wandb_url,
        "trackio_url": trackio_url,
        "comet_url": comet_url,
        "trainer_name": trainer_name,
        "trainer_citation": trainer_citation,
        "paper_title": paper_title,
        "paper_id": paper_id,
        "trl_version": version("trl"),
        "transformers_version": version("transformers"),
        "pytorch_version": version("torch"),
        "datasets_version": version("datasets"),
        "tokenizers_version": version("tokenizers"),
    }
    return ModelCard.from_template(metadata, template_path=str(template_location), **template_variables)


def get_comet_experiment_url() -> str | None:
    """
    Return the URL of the currently running Comet experiment, if any.

    Returns:
        `str` or `None`:
            The experiment URL, or `None` when Comet is not available or no experiment is running.
    """
    if not is_comet_available():
        return None

    # Look the experiment up once: a second lookup could return `None` if the experiment ended in between.
    experiment = comet_ml.get_running_experiment()
    if experiment is None:
        return None
    return experiment.url


def get_trackio_space_url() -> str | None:
    """
    Return the URL of the Trackio Space for the current run, if any.

    The project and run name are percent-encoded before being placed in the query string, so names containing
    spaces or reserved characters such as `&` or `#` still produce a valid URL.

    Returns:
        `str` or `None`:
            The Space URL, or `None` when Trackio is not available, no run is active, or the run has no Space.
    """
    if not is_trackio_available():
        return None

    from urllib.parse import quote

    from trackio import context_vars

    run = context_vars.current_run.get()
    if run is None:
        return None
    space_id = run._space_id
    if space_id is None:
        return None

    subdomain = space_id.replace("/", "-")
    # `safe=""` also encodes "/" so the values cannot leak into the URL structure
    project = quote(str(run.project), safe="")
    name = quote(str(run.name), safe="")
    return f"https://{subdomain}.hf.space?project={project}&runs={name}&sidebar=collapsed"


def log_table_to_comet_experiment(name: str, table: pd.DataFrame) -> None:
    """
    Log a table to the running Comet experiment. Nothing is logged when no experiment is running.

    Args:
        name (`str`):
            Table name, passed to Comet as the file name.
        table (`pandas.DataFrame`):
            The Pandas DataFrame containing the table to log.

    Raises:
        `ModuleNotFoundError`: If `comet-ml` is not installed.
    """
    if is_comet_available():
        running_experiment = comet_ml.get_running_experiment()
        if running_experiment is None:
            return
        running_experiment.log_table(tabular_data=table, filename=name)
    else:
        raise ModuleNotFoundError("The comet-ml is not installed. Please install it first: pip install comet-ml")


def flush_left(mask: torch.Tensor, *tensors: torch.Tensor) -> torch.Tensor | tuple[torch.Tensor, ...]:
    """
    Shift non-zero elements in the mask and corresponding tensors to the left.

    This function operates on a binary mask and any number of additional tensors with the same dimensions as the mask.
    For each row, non-zero values are shifted to the leftmost positions. Then, columns that contain only zeros across
    all rows are truncated from the mask and tensors. Visually, this operation can be represented as follows:

    ```
    [[0, 0, x, x, x, x],  ->  [[x, x, x, x],
     [0, x, x, x, 0, 0]]       [x, x, x, 0]]
    ```

    Args:
        mask (`torch.Tensor`):
            2D tensor (binary mask) with shape `(N, M)`.
        *tensors (`torch.Tensor`):
            One or more 2D tensors with the same shape as `mask`. These tensors will be processed alongside `mask`,
            with non-zero values shifted and excess zero columns truncated in the same manner.

    Returns:
        `torch.Tensor`:
            Updated binary mask with non-zero values flushed to the left and trailing zero columns removed.
        `*torch.Tensor`
            Updated tensors, processed in the same way as the mask.

    Example:
    ```python
    >>> mask = torch.tensor([[0, 0, 1, 1, 1], [0, 1, 1, 0, 0]])
    >>> tensor = torch.tensor([[9, 9, 2, 3, 4], [9, 5, 6, 9, 9]])
    >>> new_mask, new_tensor = flush_left(mask, tensor)
    >>> print(new_mask)
    tensor([[1, 1, 1],
            [1, 1, 0]])

    >>> print(new_tensor)
    tensor([[2, 3, 4],
            [5, 6, 0]])
    ```
    """
    _, M = mask.shape

    # Create copy of mask and tensors
    mask_copy = mask.clone()
    tensors = [t.clone() for t in tensors]

    # Shift non-zero values to the left
    first_non_zero = mask_copy.argmax(dim=1)
    pos = torch.arange(M, device=mask_copy.device).unsqueeze(0)
    idx_roll = (pos + first_non_zero.unsqueeze(1)) % M
    mask_roll = mask_copy.gather(1, idx_roll)
    rolled_tensors = [t.gather(1, idx_roll) for t in tensors]

    # Truncate trailing columns that are all zeros in mask_roll
    col_sums = mask_roll.sum(dim=0)
    empty_cols = col_sums == 0
    first_empty_col = int(empty_cols.to(torch.int8).argmax()) if empty_cols.any() else M
    flushed_mask = mask_roll[:, :first_empty_col]
    flushed_tensors = [t[:, :first_empty_col] for t in rolled_tensors]

    if not flushed_tensors:
        return flushed_mask
    return flushed_mask, *flushed_tensors


def selective_log_softmax(logits, index) -> torch.Tensor:
    """
    A memory-efficient implementation of the common `log_softmax -> gather` operation.

    This function is equivalent to the following naive implementation:
    ```python
    # for index with shape (...):
    logps = torch.gather(logits.log_softmax(-1), dim=-1, index=index.unsqueeze(-1)).squeeze(-1)
    # for index with shape (..., K):
    logps = torch.gather(logits.log_softmax(-1), dim=-1, index=index)
    ```

    Args:
        logits (`torch.Tensor`):
            Logits tensor of shape `(..., num_classes)`.
        index (`torch.Tensor`):
            Index tensor of shape `(..., K)` or `(...)`, specifying the positions to gather from the log-softmax
            output. When the last case is used, `K` log-probabilities are gathered per position (e.g. for top-K)

    Returns:
        `torch.Tensor`:
            Gathered log probabilities with the same shape as `index`.
    """
    squeeze = index.ndim == logits.ndim - 1
    if squeeze:
        index = index.unsqueeze(-1)

    if logits.dtype in [torch.float32, torch.float64]:
        selected_logits = torch.gather(logits, dim=-1, index=index)
        # loop to reduce peak mem consumption
        logsumexp_values = torch.stack([torch.logsumexp(lg, dim=-1) for lg in logits])
        per_token_logps = selected_logits - logsumexp_values.unsqueeze(-1)  # log_softmax(x_i) = x_i - logsumexp(x)
    else:
        # logsumexp approach is unstable with bfloat16, fall back to slightly less efficient approach
        per_token_logps = []
        for row_logits, row_labels in zip(logits, index, strict=True):  # loop to reduce peak mem consumption
            row_logps = F.log_softmax(row_logits, dim=-1)
            row_per_token_logps = row_logps.gather(dim=-1, index=row_labels)
            per_token_logps.append(row_per_token_logps)
        per_token_logps = torch.stack(per_token_logps)

    if squeeze:
        per_token_logps = per_token_logps.squeeze(-1)

    return per_token_logps


def entropy_from_logits(logits: torch.Tensor, chunk_size: int = 128) -> torch.Tensor:
    """
    Compute the Shannon entropy (in nats) over the last axis of *logits*, processing rows in chunks.

    The logits are flattened to shape `(N, num_classes)`, where `N` is the product of all leading dimensions. The
    softmax is then only materialized for `chunk_size` rows at a time, which lowers peak memory usage. The result is
    reshaped back to the leading dimensions of the input.

    Args:
        logits (`torch.Tensor`):
            Logits tensor of shape `(..., num_classes)`. Entropy is taken along the last axis; all leading dimensions
            are preserved in the output.
        chunk_size (`int`, *optional*, defaults to `128`):
            Number of flattened rows processed per iteration. Smaller values reduce memory usage at the cost of more
            iterations.

    Returns:
        `torch.Tensor`:
            Entropy values with shape `logits.shape[:-1]`.
    """
    leading_shape = logits.shape[:-1]
    rows = logits.reshape(-1, logits.shape[-1])

    def _rowwise_entropy(block: torch.Tensor) -> torch.Tensor:
        # H = -sum(p * log p), with p recovered from the log-probabilities
        log_probs = F.log_softmax(block, dim=-1)
        return -(torch.exp(log_probs) * log_probs).sum(-1)

    per_chunk = [_rowwise_entropy(block) for block in rows.split(chunk_size, dim=0)]
    return torch.cat(per_chunk, dim=0).reshape(leading_shape)


def print_prompt_completions_sample(
    prompts: list,
    completions: list,
    rewards: dict[str, list[float]],
    advantages: list[float],
    step: int,
    num_samples: int | None = None,
    extra: dict[str, list] | None = None,
) -> None:
    """
    Print prompt-completion pairs with their rewards and advantages as a `rich` table on the console.

    Each row shows one prompt, its completion, one column per reward function, the advantage, and any extra columns.
    Entries that are lists of message dicts are rendered message by message. Requires the `rich` library.

    Args:
        prompts (`list`):
            List of prompts. Can be either strings or lists of messages.
        completions (`list`):
            List of completions corresponding to the prompts. Can be either strings or lists of messages.
        rewards (`dict[str, list[float]]`):
            Dictionary where keys are reward names and values are lists of rewards.
        advantages (`list[float]`):
            List of advantages corresponding to the prompts and completions.
        step (`int`):
            Current training step number, used in the panel title.
        num_samples (`int`, *optional*):
            Number of random samples to display. If `None` (default) or not smaller than the number of prompts, all
            items are displayed. If zero or negative (and smaller than the number of prompts), nothing is printed.
        extra (`dict[str, list]`, *optional*):
            Additional columns to display after the advantage column. Keys are column names and values are lists of
            per-completion data (strings or any value convertible to string). Typically populated via `log_extra` in
            reward functions. If `None` (default), no extra columns are shown.

    Raises:
        `ImportError`: If `rich` is not installed.

    Example:
    ```python
    >>> from trl.trainer.utils import print_prompt_completions_sample

    >>> prompts = ["The sky is", "The sun is"]
    >>> completions = [" blue.", " in the sky."]
    >>> rewards = {"Correctness": [0.123, 0.456], "Format": [0.789, 0.101]}
    >>> advantages = [0.987, 0.654]
    >>> extra = {"source": ["dataset_A", "dataset_B"]}
    >>> print_prompt_completions_sample(prompts, completions, rewards, advantages, 42, extra=extra)
    ╭────────────────────────────────── Step 42 ───────────────────────────────────╮
    │ ┏━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━┓ │
    │ ┃ Prompt     ┃ Completion   ┃ Correctness ┃ Format ┃ Advantage ┃ source    ┃ │
    │ ┡━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━┩ │
    │ │ The sky is │  blue.       │        0.12 │   0.79 │      0.99 │ dataset_A │ │
    │ ├────────────┼──────────────┼─────────────┼────────┼───────────┼───────────┤ │
    │ │ The sun is │  in the sky. │        0.46 │   0.10 │      0.65 │ dataset_B │ │
    │ └────────────┴──────────────┴─────────────┴────────┴───────────┴───────────┘ │
    ╰──────────────────────────────────────────────────────────────────────────────╯
    ```
    """
    if not is_rich_available():
        raise ImportError(
            "The function `print_prompt_completions_sample` requires the `rich` library. Please install it with "
            "`pip install rich`."
        )

    console = Console()
    table = Table(show_header=True, header_style="bold white", expand=True)
    extra = extra or {}

    # Column layout: prompt, completion, one column per reward, advantage, then the extra columns
    table.add_column("Prompt", style="bright_yellow")
    table.add_column("Completion", style="bright_green")
    for reward_name in rewards:
        table.add_column(reward_name, style="bold cyan", justify="right")
    table.add_column("Advantage", style="bold magenta", justify="right")
    for extra_name in extra:
        table.add_column(extra_name, style="bright_white")

    def append_message(cell: Text, message: dict) -> None:
        # Render a single message dict into `cell`
        role = message.get("role", "")
        looks_like_chat = any(key in message for key in ("content", "reasoning_content", "thinking"))
        if looks_like_chat:
            # Chat message: role header, optional reasoning, then the content
            cell.append(f"{role.upper()}\n", style="bold red")
            reasoning = message.get("reasoning_content") or message.get("thinking")
            if reasoning:
                cell.append(reasoning, style="italic dim white")
                cell.append("\n")
            if "content" in message:
                cell.append(message["content"])
        elif "name" in message and "args" in message:
            # Tool call
            cell.append(f"{role.upper()}\n", style="bold red")
            cell.append(f"{message['name']}({message['args']})")
        else:
            # Fallback
            cell.append(str(message))

    def format_entry(entry) -> Text:
        cell = Text()
        is_conversation = isinstance(entry, list) and all(isinstance(m, dict) for m in entry)
        if not is_conversation:
            cell.append(str(entry))
            return cell
        for position, message in enumerate(entry):
            if position > 0:
                cell.append("\n\n")  # blank line between consecutive messages
            append_message(cell, message)
        return cell

    # Subsample only when fewer samples than available rows are requested
    if num_samples is not None and num_samples < len(prompts):
        if num_samples <= 0:
            return
        chosen = random.sample(range(len(prompts)), num_samples)
        prompts = [prompts[i] for i in chosen]
        completions = [completions[i] for i in chosen]
        rewards = {key: [values[i] for i in chosen] for key, values in rewards.items()}
        advantages = [advantages[i] for i in chosen]
        extra = {key: [values[i] for i in chosen] for key, values in extra.items()}

    for row in range(len(prompts)):
        reward_cells = [f"{values[row]:.2f}" for values in rewards.values()]  # 2 decimals
        extra_cells = [format_entry(values[row]) for values in extra.values()]
        table.add_row(
            format_entry(prompts[row]),
            format_entry(completions[row]),
            *reward_cells,
            f"{advantages[row]:.2f}",
            *extra_cells,
        )
        table.add_section()  # Adds a separator between rows

    console.print(Panel(table, expand=False, title=f"Step {step}", border_style="bold white"))


class RepeatSampler(Sampler):
    """
    Sampler that repeats the indices of a dataset in a structured manner.

    Args:
        data_source (`Sized`):
            Dataset to sample from.
        mini_repeat_count (`int`):
            Number of times to repeat each index per batch.
        batch_size (`int`, *optional*, defaults to `1`):
            Number of unique indices per batch.
        repeat_count (`int`, *optional*, defaults to `1`):
            Number of times to repeat the full sampling process.
        shuffle (`bool`, *optional*, defaults to `True`):
            Whether to shuffle the dataset.
        seed (`int`, *optional*):
            Random seed for reproducibility (only affects this sampler).

    Example:
    ```python
    >>> sampler = RepeatSampler(["a", "b", "c", "d", "e", "f", "g"], mini_repeat_count=2, batch_size=3, repeat_count=4)
    >>> list(sampler)
    [4, 4, 3, 3, 0, 0,
     4, 4, 3, 3, 0, 0,
     4, 4, 3, 3, 0, 0,
     4, 4, 3, 3, 0, 0,
     1, 1, 2, 2, 6, 6,
     1, 1, 2, 2, 6, 6,
     1, 1, 2, 2, 6, 6,
     1, 1, 2, 2, 6, 6]
    ```

    ```txt
    mini_repeat_count = 3
          -   -   -
         [0,  0,  0,  1,  1,  1,  2,  2,  2,  3,  3,  3,      |
          4,  4,  4,  5,  5,  5,  6,  6,  6,  7,  7,  7,      |
          8,  8,  8,  9,  9,  9, 10, 10, 10, 11, 11, 11,      |
                                                                repeat_count = 2
          0,  0,  0,  1,  1,  1,  2,  2,  2,  3,  3,  3,      |
          4,  4,  4,  5,  5,  5,  6,  6,  6,  7,  7,  7,      |
          8,  8,  8,  9,  9,  9, 10, 10, 10, 11, 11, 11, ...] |
          ---------   ---------   ---------   ---------
           ---------   ---------   ---------   ---------
            ---------   ---------   ---------   ---------
                         batch_size = 12
    ```
    """

    def __init__(
        self,
        data_source: Sized,
        mini_repeat_count: int,
        batch_size: int = 1,
        repeat_count: int = 1,
        shuffle: bool = True,
        seed: int | None = None,
    ):
        self.data_source = data_source
        self.mini_repeat_count = mini_repeat_count
        self.batch_size = batch_size
        self.repeat_count = repeat_count
        self.num_samples = len(data_source)
        self.shuffle = shuffle
        self.seed = seed

        if shuffle:
            self.generator = torch.Generator()  # Create a local random generator
            if seed is not None:
                self.generator.manual_seed(seed)

    def __iter__(self):
        if self.shuffle:
            # E.g., [2, 4, 3, 1, 0, 6, 5] (num_samples = 7)
            indexes = torch.randperm(self.num_samples, generator=self.generator).tolist()
        else:
            indexes = list(range(self.num_samples))

        #    [2, 4, 3, 1, 0, 6, 5]
        # -> [[2, 4, 3], [1, 0, 6], [5]]  (batch_size = 3)
        indexes = [indexes[i : i + self.batch_size] for i in range(0, len(indexes), self.batch_size)]

        #    [[2, 4, 3], [1, 0, 6], [5]]
        # -> [[2, 4, 3], [1, 0, 6]]
        indexes = [chunk for chunk in indexes if len(chunk) == self.batch_size]

        for chunk in indexes:
            for _ in range(self.repeat_count):
                for index in chunk:
                    for _ in range(self.mini_repeat_count):
                        yield index

    def __len__(self) -> int:
        return (self.num_samples // self.batch_size) * self.batch_size * self.mini_repeat_count * self.repeat_count


# torch.nanstd doesn't exist, so we define it here
def nanstd(tensor: torch.Tensor, dim: int | tuple[int, ...] | None = None, keepdim: bool = False) -> torch.Tensor:
    """
    Compute the standard deviation of a tensor, ignoring NaNs.

    Args:
        tensor (`torch.Tensor`):
            Input tensor.
        dim (`int` or `tuple[int, ...]`, *optional*):
            Dimension(s) to reduce. Defaults to all dimensions.
        keepdim (`bool`, *optional*, defaults to `False`):
            Whether to keep reduced dimensions.

    Returns:
        `torch.Tensor`:
            Standard deviation of the tensor, ignoring NaNs.
    """
    # Compute variance ignoring NaNs
    mean = torch.nanmean(tensor, dim=dim, keepdim=True)
    variance = torch.nanmean((tensor - mean) ** 2, dim=dim, keepdim=True)
    count = torch.sum(~torch.isnan(tensor), dim=dim, keepdim=True)  # count of non-NaN values
    correction = count / (count - 1)
    correction = torch.where(count > 1, correction, torch.full_like(correction, float("nan")))
    variance *= correction  # Bessel's correction
    std = torch.sqrt(variance)
    if keepdim:
        return std
    if dim is None:
        return std.squeeze()
    if isinstance(dim, int):
        return std.squeeze(dim)
    dims = [(d if d >= 0 else d + std.ndim) for d in dim]
    for d in sorted(dims, reverse=True):
        std = std.squeeze(d)
    return std


def split_tensor_dict(
    tensor_dict: dict[str, torch.Tensor | None], num_chunks: int
) -> list[dict[str, torch.Tensor | None]]:
    """
    Splits a dictionary of tensors along the first dimension into `num_chunks` equal parts.

    Example:
    ```python
    >>> x = torch.arange(12).reshape(6, 2)
    >>> y = torch.arange(6).reshape(6, 1)
    >>> tensor_dict = {"x": x, "y": y}
    >>> split_tensor_dict(tensor_dict, 3)
    [
        {"x": tensor([[0, 1], [2, 3]]), "y": tensor([[0], [1]])},
        {"x": tensor([[4, 5], [6, 7]]), "y": tensor([[2], [3]])},
        {"x": tensor([[ 8,  9], [10, 11]]), "y": tensor([[4], [5]])}
    ]
    ```
    """
    first_tensor = next(tensor for tensor in tensor_dict.values() if tensor is not None)
    chunk_size = first_tensor.shape[0] // num_chunks
    chunks = []
    for i in range(num_chunks):
        chunk_dict = {}
        for key, tensor in tensor_dict.items():
            if tensor is not None and (isinstance(tensor, list) or tensor.ndim > 0):
                chunk_dict[key] = tensor[i * chunk_size : (i + 1) * chunk_size]
            elif tensor is not None and tensor.ndim == 0:
                chunk_dict[key] = tensor
            else:
                chunk_dict[key] = None
        chunks.append(chunk_dict)
    return chunks


def shuffle_sequence_dict(seq_dict: dict[str, Sequence | None]) -> dict[str, Sequence | None]:
    """
    Shuffles all sequence-like values in a dictionary along the first dimension in unison.

    Example:
    ```python
    >>> x = torch.arange(6).reshape(3, 2)
    >>> y = ["a", "b", "c"]
    >>> seq_dict = {"x": x, "y": y}
    >>> shuffle_sequence_dict(seq_dict)
    {'x': tensor([[2, 3],
                  [0, 1],
                  [4, 5]]),
     'y': ['b', 'a', 'c']}
    ```
    """
    # Determine batch size from the first non-None sequence
    batch_size = len(next(v for v in seq_dict.values() if v is not None))
    permutation = torch.randperm(batch_size)

    def permute(v: Sequence | None) -> Sequence | None:
        if v is None:
            return None
        if isinstance(v, torch.Tensor) and v.ndim == 0:
            return v
        if isinstance(v, torch.Tensor) and v.ndim >= 1:
            return v[permutation]
        return [v[i] for i in permutation]

    return {key: permute(val) for key, val in seq_dict.items()}


def nanmin(tensor: torch.Tensor) -> torch.Tensor:
    """
    Return the smallest non-NaN value of a 1D tensor.

    Args:
        tensor (`torch.Tensor`): Input tensor of shape `(N,)`.

    Returns:
        `torch.Tensor`: Scalar tensor holding the minimum over the non-NaN entries. When every entry is NaN, a NaN
        scalar with the input's dtype and device is returned instead.
    """
    nan_mask = torch.isnan(tensor)
    if not nan_mask.all():
        # At least one finite entry: reduce over the entries that are not NaN
        return tensor[~nan_mask].min()
    return torch.tensor(float("nan"), dtype=tensor.dtype, device=tensor.device)


def nanmax(tensor: torch.Tensor) -> torch.Tensor:
    """
    Return the largest non-NaN value of a 1D tensor.

    Args:
        tensor (`torch.Tensor`): Input tensor of shape `(N,)`.

    Returns:
        `torch.Tensor`: Scalar tensor holding the maximum over the non-NaN entries. When every entry is NaN, a NaN
        scalar with the input's dtype and device is returned instead.
    """
    nan_mask = torch.isnan(tensor)
    if not nan_mask.all():
        # At least one finite entry: reduce over the entries that are not NaN
        return tensor[~nan_mask].max()
    return torch.tensor(float("nan"), dtype=tensor.dtype, device=tensor.device)


def identity(x):
    """Return the argument `x` unchanged (no-op function)."""
    return x


def split_pixel_values_by_grid(batch: dict[str, torch.Tensor]) -> dict[str, torch.Tensor | list[torch.Tensor]]:
    """
    Splits `batch["pixel_values"]` into a list of tensors, one per sample, based on `batch["num_images"]`.

    For models with `image_grid_thw` (e.g. Qwen), the grid dimensions determine how many rows of `pixel_values` belong
    to each image. For models with `image_position_ids` instead (e.g. Gemma), `pixel_values` is indexed directly by
    image count.
    """
    if "pixel_values" not in batch or "num_images" not in batch:
        return batch

    num_images = batch["num_images"]
    pixel_values = batch["pixel_values"]  # [total, feature_dim]

    if "image_grid_thw" in batch:
        lengths = batch["image_grid_thw"].prod(-1).tolist()  # [num_images]
        if sum(lengths) != pixel_values.size(0):
            raise ValueError(
                f"Mismatch: sum(lengths) = {sum(lengths)} != pixel_values.size(0) = {pixel_values.size(0)}"
            )

        boundaries = [0, *accumulate(num_images)]
        image_grid_thw = batch["image_grid_thw"]  # [total, 3]
        sections = [sum(lengths[boundaries[i] : boundaries[i + 1]]) for i in range(len(num_images))]
        split_pixel_values = list(torch.split(pixel_values, sections, dim=0))
        split_image_grid_thw = list(torch.split(image_grid_thw, num_images, dim=0))
        return {**batch, "pixel_values": split_pixel_values, "image_grid_thw": split_image_grid_thw}

    if "image_position_ids" in batch:
        image_position_ids = batch["image_position_ids"]  # [total]
        split_pixel_values = list(torch.split(pixel_values, num_images, dim=0))
        split_image_position_ids = list(torch.split(image_position_ids, num_images, dim=0))
        return {**batch, "pixel_values": split_pixel_values, "image_position_ids": split_image_position_ids}

    return batch


def unsplit_pixel_values_by_grid(batch: dict[str, torch.Tensor | list[torch.Tensor]]) -> dict[str, torch.Tensor]:
    """
    Opposite of `split_pixel_values_by_grid`. Merges a list of tensors in `batch["pixel_values"]` back into a single
    tensor along the first dimension.
    """
    pixel_values = batch.get("pixel_values")
    if isinstance(pixel_values, list):
        merged = torch.cat(pixel_values, dim=0)
        batch = {**batch, "pixel_values": merged}

    image_grid_thw = batch.get("image_grid_thw")
    if isinstance(image_grid_thw, list):
        merged = torch.cat(image_grid_thw, dim=0)
        batch = {**batch, "image_grid_thw": merged}

    image_position_ids = batch.get("image_position_ids")
    if isinstance(image_position_ids, list):
        merged = torch.cat(image_position_ids, dim=0)
        batch = {**batch, "image_position_ids": merged}

    return batch


TListOrMapping = TypeVar("TListOrMapping", list, Mapping)


# This function is intentionally not used internally. It is provided as a utility for users whose datasets contain
# `None` values inserted by tabular backends (e.g., Arrow/Parquet) for missing keys in nested structures. This
# situation arises when loading datasets created before `datasets` v4.7.0 (which introduced the Json dtype), or when
# datasets created after that version were saved without using the Json feature. In both cases, users can apply this
# function via `dataset = dataset.with_transform(remove_none_values)` before training to strip the spurious `None`
# values. See the migration guide for more details.
def remove_none_values(example: TListOrMapping) -> TListOrMapping:
    """
    Recursively removes mapping entries whose value is `None` from a nested structure (list or dictionary).

    Only mapping entries are dropped: `None` items that sit directly inside a list are kept, so list lengths and
    positions are preserved. Nested lists and mappings are processed recursively; every mapping in the result is a
    plain `dict` and every list is a new `list`.

    Args:
        example (`list` or `Mapping`):
            Input nested structure (list or dictionary) from which to remove `None`.

    Returns:
        `list` or `dict`:
            Copy of the input structure without the `None`-valued mapping entries.

    Raises:
        `TypeError`: If `example` is neither a list nor a mapping.

    Examples:
    ```python
    >>> dataset = dataset.with_transform(remove_none_values)
    ```
    ```python
    >>> example = [
    ...     {
    ...         "a": {"aa": None, "ab": 1},
    ...         "b": "my_string",
    ...     }
    ... ]
    >>> remove_none_values(example)
    [{'a': {'ab': 1}, 'b': 'my_string'}]
    ```
    """
    # Recurse into any `Mapping` (not only `dict`) so nested containers are treated like the top-level one.
    if isinstance(example, list):
        return [remove_none_values(value) if isinstance(value, (Mapping, list)) else value for value in example]
    elif isinstance(example, Mapping):
        return {
            key: remove_none_values(value) if isinstance(value, (Mapping, list)) else value
            for key, value in example.items()
            if value is not None
        }
    else:
        raise TypeError("Input must be a list or a dictionary.")


def create_model_from_path(
    model_id: str, architecture: _BaseAutoModelClass | None = None, **kwargs
) -> PreTrainedModel:
    """
    Instantiate a pretrained model from a local path or Hub identifier.

    Args:
        model_id (`str`):
            Path to the model. Can be either a local directory or a model identifier from the Hugging Face Hub.
        architecture (`_BaseAutoModelClass` or `None`, *optional*):
            Class whose `from_pretrained` method is used to build the model. If `None`, the class is looked up in
            `transformers` by the first entry of the `architectures` field of the model's configuration.
        kwargs (`dict`):
            Keyword arguments forwarded to `from_pretrained`. `'dtype'` may be a `torch.dtype`, `None`, or one of the
            strings `'bfloat16'`, `'float16'`, `'float32'`, `'auto'`; string names other than `'auto'` are converted
            to the matching `torch.dtype`. If not explicitly set, `dtype` defaults to `'float32'`. `'device_map'`
            defaults to `'auto'` when not provided.

    Returns:
        [`~transformers.PreTrainedModel`]:
            The instantiated model.

    Raises:
        `ValueError`: If `dtype` is none of the accepted values.
    """
    dtype_arg = kwargs.get("dtype", "float32")
    # `torch.dtype`, "auto" and `None` are forwarded untouched; only dtype names need converting
    passthrough = isinstance(dtype_arg, torch.dtype) or dtype_arg == "auto" or dtype_arg is None
    if not passthrough:
        if not (isinstance(dtype_arg, str) and dtype_arg in ["bfloat16", "float16", "float32"]):
            raise ValueError(
                "Invalid `dtype` passed to the config. Expected either 'auto' or a string representing "
                f"a valid `torch.dtype` (e.g., 'float32'), but got {dtype_arg}."
            )
        kwargs["dtype"] = getattr(torch, dtype_arg)

    kwargs.setdefault("device_map", "auto")

    if architecture is None:
        # Resolve the model class from the architecture name recorded in the checkpoint's config
        model_config = AutoConfig.from_pretrained(model_id)
        architecture = getattr(transformers, model_config.architectures[0])
    return architecture.from_pretrained(model_id, **kwargs)


def hash_module(module: torch.nn.Module) -> str:
    h = hashlib.sha256()
    for _, tensor in sorted(module.state_dict().items()):
        tensor = tensor.cpu()
        h.update(str(tensor.dtype).encode())
        if tensor.dtype in [torch.bfloat16, torch.float8_e4m3fn, torch.float8_e5m2]:
            tensor = tensor.to(torch.float32)
        h.update(tensor.numpy().tobytes())
    return h.hexdigest()


def get_config_model_id(config: PretrainedConfig) -> str:
    """
    Return the model identifier stored on a model configuration.

    Args:
        config ([`~transformers.PreTrainedConfig`]):
            Configuration from which to extract the model identifier.

    Returns:
        `str`:
            Value of the configuration's `_name_or_path` attribute, or an empty string when the attribute is
            missing.
    """
    try:
        return config._name_or_path
    except AttributeError:
        return ""


@contextmanager
def use_adapter(model: "PeftModel", adapter_name: str | None):
    """
    Context manager that temporarily switches the active adapter of a PEFT model.

    Inside the `with` block, `adapter_name` is the active adapter. On exit (including when the block raises) the
    adapter that was active on entry is set again. Passing `None` instead runs the block inside
    `model.disable_adapter()`.

    Args:
        model ([`~peft.PeftModel`]):
            PEFT model to manage.
        adapter_name (`str` or `None`):
            Name of the adapter to set as active. If `None`, the context manager will disable all adapters.

    Raises:
        `ImportError`: If PEFT is not installed.

    Example:
    ```python
    >>> from trl.trainer.utils import use_adapter
    >>> from peft import AutoPeftModelForCausalLM
    >>> import torch

    >>> model = AutoPeftModelForCausalLM.from_pretrained("path/to/model")
    >>> input_ids = torch.tensor([[1, 2, 3]])
    >>> with use_adapter(model, "adapter_name"):
    ...     outputs = model(input_ids)
    ```
    """

    if not is_peft_available():
        raise ImportError(
            "You're trying to use a PEFT adapter but PEFT is not installed. Please install it with `pip install peft`."
        )

    if adapter_name is not None:
        # Remember what was active so it can be put back no matter how the block exits
        adapter_on_entry = model.active_adapter
        model.set_adapter(adapter_name)
        try:
            yield
        finally:
            model.set_adapter(adapter_on_entry)
        return

    with model.disable_adapter():
        yield


def start_event_loop_in_daemon(
    name: str | None = None,
) -> tuple[threading.Thread, asyncio.AbstractEventLoop, threading.Event]:
    """
    This function creates a new daemon thread that runs the provided event loop.

    Args:
        name (`str`, *optional*):
            Name of the thread. If `None`, the default thread naming will be used.

    Returns:
        `threading.Thread`:
            The thread running the event loop.
        `asyncio.AbstractEventLoop`:
            The event loop being run in the thread.
        `threading.Event`:
            An event that is set when the loop is ready.
    """
    loop = asyncio.new_event_loop()
    loop_ready_event = threading.Event()

    def run_loop():
        asyncio.set_event_loop(loop)
        loop_ready_event.set()
        loop.run_forever()

    thread = threading.Thread(target=run_loop, name=name, daemon=True)
    thread.start()
    return thread, loop, loop_ready_event


def shutdown_event_loop_in_daemon(
    thread: threading.Thread | None,
    loop: asyncio.AbstractEventLoop | None,
) -> None:
    """
    Shutdown an asyncio event loop running in a separate thread.

    This function stops the event loop and waits for the associated thread to finish execution.

    Args:
        thread (`threading.Thread`):
            The thread running the event loop.
        loop (`asyncio.AbstractEventLoop`):
            The asyncio event loop to shut down.
    """
    if loop is None or thread is None:
        return
    loop.call_soon_threadsafe(loop.stop)
    thread.join(timeout=5)


class _ChunkedLogProbFunction(torch.autograd.Function):
    """Compute per-token log-probs and entropy without materializing [N, V] logits.

    Processes the lm_head in chunks of `chunk_size` vocabulary rows and uses an online logsumexp, so that only an
    `[N, chunk_size]` slice of the logits exists at any time. Logits are `(hidden @ weight.T) * logit_scale /
    temperature` and all reductions are accumulated in float32.

    Only the log-prob output is differentiable: the entropy output is marked non-differentiable because `backward`
    does not propagate a gradient through it.
    """

    @staticmethod
    def forward(
        ctx,
        last_hidden: torch.Tensor,  # [N, H]
        weight: torch.Tensor,  # [V, H]
        targets: torch.Tensor,  # [N]
        temperature: float,
        chunk_size: int,
        logit_scale: float = 1.0,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """Return `(logprobs, entropy)`, both float32 tensors of shape `[N]`.

        `logprobs[i]` is the log-probability of `targets[i]`; a target index outside `[0, V)` matches no chunk and
        contributes a target logit of 0.
        """
        device = last_hidden.device
        N, _ = last_hidden.shape
        vocab, _ = weight.shape
        inv_t = logit_scale / temperature

        # NOTE(@aminediro): always acc in fp32 for stability
        max_old = torch.full((N,), float("-inf"), device=device, dtype=torch.float32)
        sum_exp = torch.zeros((N,), device=device, dtype=torch.float32)
        x_sum_exp = torch.zeros((N,), device=device, dtype=torch.float32)
        target_logit = torch.zeros((N,), device=device, dtype=torch.float32)

        # Pre-allocate reusable buffers to avoid per-chunk allocation
        mm_buf = torch.empty((N, chunk_size), device=device, dtype=last_hidden.dtype)
        logits_buf = torch.empty((N, chunk_size), device=device, dtype=torch.float32)
        row_idx = torch.arange(N, device=device)

        for start in range(0, vocab, chunk_size):
            end = min(start + chunk_size, vocab)
            C = end - start
            # using fp16=True, the model's hidden states get cast to float16 by autocast, but the mm_buf is allocated
            # with last_hidden.dtype (float16) while w_chunk (the lm_head weights) is not auto casted
            w_chunk = weight[start:end].to(last_hidden.dtype)  # [C, H]
            torch.mm(last_hidden, w_chunk.t(), out=mm_buf[:, :C])
            logits_chunk = logits_buf[:, :C]
            logits_chunk.copy_(mm_buf[:, :C])
            logits_chunk.mul_(inv_t)  # [N, C]

            # Online logsumexp update
            chunk_max = logits_chunk.amax(dim=-1)  # [N]
            max_new = torch.maximum(max_old, chunk_max)
            rescale = torch.exp(max_old - max_new)
            chunk_exp = torch.exp(logits_chunk - max_new.unsqueeze(-1))  # [N, C]

            sum_exp = sum_exp * rescale + chunk_exp.sum(dim=-1)
            x_sum_exp = x_sum_exp * rescale + (chunk_exp * logits_chunk).sum(dim=-1)
            max_old = max_new

            # Gather target logits for labels in this chunk
            in_chunk_cond = (targets >= start) & (targets < end)
            local_idx = torch.clamp(targets - start, 0, end - start - 1)
            # take the new logit if target_idx is in this chunk bounds else 0
            target_logit += logits_chunk[row_idx, local_idx] * in_chunk_cond

        log_z = max_old + torch.log(sum_exp)
        logprobs = target_logit - log_z
        # H = log Z - E_p[logit]
        entropy = log_z - x_sum_exp / sum_exp

        ctx.save_for_backward(last_hidden, weight, targets, log_z)
        ctx.temperature = temperature
        ctx.chunk_size = chunk_size
        ctx.logit_scale = logit_scale
        # `backward` ignores the incoming entropy gradient, so make that explicit to autograd instead of silently
        # contributing a zero gradient for a supposedly differentiable output.
        ctx.mark_non_differentiable(entropy)

        return logprobs, entropy

    @staticmethod
    def backward(ctx, grad_logprobs: torch.Tensor, grad_entropy: torch.Tensor):  # type: ignore
        """Return gradients w.r.t. `last_hidden` and `weight` (and `None` for the non-tensor inputs).

        Logits are recomputed chunk by chunk exactly as in `forward`; `grad_entropy` is unused.
        """
        hidden, weight, labels, log_z = ctx.saved_tensors
        temperature: float = ctx.temperature
        chunk_size: int = ctx.chunk_size
        logit_scale: float = ctx.logit_scale
        inv_t = logit_scale / temperature

        N, _ = hidden.shape
        vocab = weight.shape[0]

        # Skip gradients autograd did not ask for (e.g. a frozen lm_head), which avoids a [V, H] fp32 allocation
        need_hidden_grad = ctx.needs_input_grad[0]
        need_weight_grad = ctx.needs_input_grad[1]

        # NOTE(@aminediro): always acc in fp32 even if input is not
        grad_hidden = (
            torch.zeros(hidden.shape, device=hidden.device, dtype=torch.float32) if need_hidden_grad else None
        )
        grad_weight = (
            torch.zeros(weight.shape, device=weight.device, dtype=torch.float32) if need_weight_grad else None
        )
        hidden_f32 = hidden.float() if need_weight_grad else None

        # Pre-allocate reusable buffers to avoid per-chunk allocation
        mm_buf = torch.empty((N, chunk_size), device=hidden.device, dtype=hidden.dtype)
        logits_buf = torch.empty((N, chunk_size), device=hidden.device, dtype=torch.float32)

        g = grad_logprobs.to(torch.float32)  # [N]
        row_idx = torch.arange(N, device=hidden.device)

        for start in range(0, vocab, chunk_size):
            end = min(start + chunk_size, vocab)
            C = end - start
            w_chunk = weight[start:end]  # [C, H]

            # Same cast as in forward: `torch.mm(..., out=)` needs matching dtypes, and the recomputed logits must
            # match the ones `log_z` was derived from.
            torch.mm(hidden, w_chunk.to(hidden.dtype).t(), out=mm_buf[:, :C])
            logits_chunk = logits_buf[:, :C]
            logits_chunk.copy_(mm_buf[:, :C])
            logits_chunk.mul_(inv_t)  # [N, C]
            probs = torch.exp(logits_chunk - log_z.unsqueeze(-1))  # [N, C]

            # dL/d(logits) = g * (1_[label] - p)
            grad_logits = (-g).unsqueeze(-1) * probs  # [N, C]

            in_chunk_cond = (labels >= start) & (labels < end)
            local_idx = torch.clamp(labels - start, 0, end - start - 1)
            # If label in chunk add g to grad else it stays the same
            grad_logits[row_idx, local_idx] += g * in_chunk_cond
            grad_logits = grad_logits * inv_t

            if grad_hidden is not None:
                grad_hidden.add_(grad_logits @ w_chunk.float())
            if grad_weight is not None:
                grad_weight[start:end].add_(grad_logits.t() @ hidden_f32)

        return (
            grad_hidden.to(hidden.dtype) if grad_hidden is not None else None,
            grad_weight.to(weight.dtype) if grad_weight is not None else None,
            None,
            None,
            None,
            None,
        )


def patch_chunked_lm_head(model: torch.nn.Module, chunk_size: int, temperature: float) -> None:
    """
    Replace `model.forward` with a forward pass that returns per-token log-probs and entropies computed through
    `_ChunkedLogProbFunction`, i.e. without materializing the full logits tensor.

    The patched forward runs `model.model(...)`, reads `last_hidden_state`, and applies `model.lm_head.weight` in
    vocabulary chunks. It returns a dictionary with keys `"log_probs"` and `"entropy"`, each of shape `[B, S-1]`
    (position `t` scores the token at position `t + 1`). When a `completion_mask` is given, only positions where the
    shifted mask is non-zero are computed; all other positions are filled with zeros.

    Args:
        model (`torch.nn.Module`):
            Model to patch in place. It is accessed through its `config`, `model` and `lm_head` attributes.
        chunk_size (`int`):
            Number of vocabulary rows of the lm_head processed at a time.
        temperature (`float`):
            Temperature by which the logits are divided.

    Raises:
        `NotImplementedError`: If the model config defines a non-`None` `final_logit_softcapping`.
    """
    if getattr(model.config, "final_logit_softcapping", None) is not None:
        raise NotImplementedError(
            "The model uses `final_logit_softcapping` which is not yet supported. Please open an issue if you "
            "want your model to be supported."
        )

    def _chunked_forward(
        self: torch.nn.Module,
        input_ids: torch.Tensor | None = None,
        attention_mask: torch.Tensor | None = None,
        labels: torch.Tensor | None = None,
        completion_mask: torch.Tensor | None = None,
        use_cache: bool = False,
        **kwargs,
    ) -> dict[str, torch.Tensor]:
        # Explicit check rather than `assert`, which is stripped when Python runs with `-O`
        if labels is None:
            raise ValueError("requires labels to not be None for logprob computation")

        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, use_cache=use_cache, **kwargs)
        # NOTE(@aminediro): supporting Cohere2 models
        logit_scale = getattr(self.config, "logit_scale", 1.0)
        hidden_states = outputs.last_hidden_state  # [B, S, H]

        # Shift: predict next token
        hidden_states = hidden_states[:, :-1, :]  # [B, S-1, H]
        labels = labels[:, 1:]  # [B, S-1]

        b, s, h = hidden_states.shape
        hidden_flat = hidden_states.reshape(b * s, h).contiguous()
        targets_flat = labels.reshape(b * s).contiguous()

        # Filter to completion tokens only to avoid expensive matmuls on prompt tokens and tool results
        valid_mask = None
        if completion_mask is not None:
            completion_mask = completion_mask[:, 1:]  # same shift as labels
            valid_mask = completion_mask.bool().reshape(b * s)
            hidden_flat = hidden_flat[valid_mask]  # [N_valid, H]
            targets_flat = targets_flat[valid_mask]  # [N_valid]

        logprobs_valid, entropy_valid = _ChunkedLogProbFunction.apply(
            hidden_flat, self.lm_head.weight, targets_flat, temperature, chunk_size, logit_scale
        )

        if valid_mask is not None:
            # Scatter the computed values back to their flat positions; filtered-out positions stay at zero
            logprobs = torch.zeros(b * s, device=logprobs_valid.device, dtype=logprobs_valid.dtype)
            entropy = torch.zeros(b * s, device=entropy_valid.device, dtype=entropy_valid.dtype)
            logprobs[valid_mask] = logprobs_valid
            entropy[valid_mask] = entropy_valid
        else:
            logprobs = logprobs_valid
            entropy = entropy_valid

        return {
            "log_probs": logprobs.reshape(b, s),
            "entropy": entropy.reshape(b, s),
        }

    # Bind as a method so `self` inside `_chunked_forward` is this model instance
    model.forward = types.MethodType(_chunked_forward, model)