"""
Utility functions for training the Dream model.

References: https://github.com/zhijie-group/Discrete-Diffusion-Forcing/blob/main/D2F-train/utils/loss.py
"""

import functools
import math

import numpy as np
import torch
import torch.nn.functional as F
from torch import Tensor


@torch.no_grad()
def get_prompt_lengths_from_labels(
    labels: Tensor,
    attention_mask: Tensor | None = None,
    ignore_index: int = -100,
) -> Tensor:
    """
    labels: (B, T) int64, with ignore_index where we ignore loss (prompt/user/system/pad)
    attention_mask: optional (B, T) 1/0; if given, will treat masked-out (0) as non-real tokens

    Returns: (B,) int64 prompt lengths = index of first (labels != ignore_index) per sample.
             If a sample has no supervised tokens, length = number of real tokens.
    """
    B, T = labels.shape
    device = labels.device

    supervised = labels.ne(ignore_index)
    if attention_mask is not None:
        supervised = supervised & attention_mask.bool()

    idx_grid = torch.arange(T, device=device).expand(B, T)
    first_idx = torch.where(supervised, idx_grid, torch.full_like(idx_grid, T)).min(dim=1).values

    if attention_mask is None:
        return first_idx

    real_len = attention_mask.sum(dim=1).to(torch.long)
    return torch.where((labels.ne(ignore_index) & attention_mask.bool()).any(dim=1), first_idx, real_len)


@torch.no_grad()
def simple_uniform_mask(
    input_ids: Tensor,  # (B, L), int64
    prompt_lengths: Tensor,  # (B,), int64 — number of tokens to keep unmasked at the left
    mask_id: int,  # token id to write where masked
    p: float | None = None,  # fixed mask rate; if None, sample per-sample in [p_min, p_max]
    p_min: float = 0.0,
    p_max: float = 1.0,
    protect_eos_id: int | None = None,  # treated as EOS id for the tail rule
    pad_id: int | None = None,
    ensure_at_least_one: bool = True,
    eps: float = 1e-6,  # tiny floor for probabilities
) -> tuple[Tensor, Tensor, Tensor]:
    """
    Returns:
      noisy: (B, L) int64 — input_ids with some tail tokens replaced by mask_id
      masked: (B, L) bool — True where we replaced a token (incurs loss)
      p_samples: (B,) float32 — per-sample mask probabilities used
    """
    B, L = input_ids.shape
    device = input_ids.device

    noisy = input_ids.clone()
    masked = torch.zeros_like(input_ids, dtype=torch.bool)
    p_mask_tensor = torch.zeros((B, L), device=device, dtype=torch.float32)

    # choose per-sample p
    if p is None:
        p_samples = torch.rand(B, device=device) * (p_max - p_min) + p_min
    else:
        p = float(p)
        p_samples = torch.full((B,), p, device=device)

    for i in range(B):
        pl = int(prompt_lengths[i].item())
        if pl >= L:
            continue  # nothing to mask

        # ---- Eligible region: [pl, L). Exclude PAD only here. Do NOT exclude EOS now. ----
        tail_tokens = input_ids[i, pl:L]
        elig = torch.ones_like(tail_tokens, dtype=torch.bool)
        if pad_id is not None:
            elig &= tail_tokens != pad_id
        if not elig.any():
            continue

        # i.i.d. Bernoulli with per-sample prob
        pi = float(torch.clamp(p_samples[i], eps, 1.0 - eps).item())
        randv = torch.rand(elig.shape, device=device)
        tail_mask = (randv < pi) & elig

        # optionally guarantee at least one masked token per sample
        if ensure_at_least_one and not tail_mask.any():
            # pick a random eligible index to force-mask
            idxs = torch.nonzero(elig, as_tuple=False).squeeze(1)
            force_idx = idxs[torch.randint(0, len(idxs), (1,), device=device)]
            tail_mask[force_idx] = True

        # provisional write-back BEFORE EOS rule
        noisy[i, pl:L] = torch.where(
            tail_mask,
            torch.tensor(mask_id, device=device, dtype=noisy.dtype),
            tail_tokens,
        )
        masked[i, pl:L] = tail_mask
        p_mask_tensor[i, pl:L] = torch.where(elig, torch.tensor(pi, device=device), torch.tensor(0.0, device=device))

        # ---- EOS tail rule (apply only if EOS is distinct from PAD) ----
        if protect_eos_id is not None and (pad_id is None or protect_eos_id != pad_id):
            # Find first EOS at/after prompt
            eos_positions = input_ids[i, :] == protect_eos_id
            # First EOS index in the entire sequence
            if eos_positions.any():
                first_eos_idx = int(torch.argmax(eos_positions.to(torch.uint8)).item())
            else:
                first_eos_idx = L  # no EOS

            # Tail exists only if EOS is not the last token
            if first_eos_idx < L - 1:
                # Check whether that first EOS was masked
                was_first_eos_masked = False
                if first_eos_idx >= pl:
                    was_first_eos_masked = bool(masked[i, first_eos_idx].item())
                else:
                    # EOS lies inside the prompt region; it couldn't be masked by the sampling
                    was_first_eos_masked = False

                # Build tail slice [first_eos_idx, L)
                tail_slice = slice(first_eos_idx, L)

                if was_first_eos_masked:
                    # Case A: mask entire EOS tail; loss applies there
                    noisy[i, tail_slice] = torch.tensor(mask_id, device=device, dtype=noisy.dtype)
                    masked[i, tail_slice] = True
                    # For consistency, set per-token prob on the tail to pi where we forced masking
                    p_mask_tensor[i, tail_slice] = pi
                else:
                    # Case B: force EOS on the tail; no loss there
                    noisy[i, tail_slice] = torch.tensor(protect_eos_id, device=device, dtype=noisy.dtype)
                    masked[i, tail_slice] = False
                    p_mask_tensor[i, tail_slice] = 0.0

    return noisy, masked, p_samples


def _shift_logits(logits: Tensor) -> Tensor:
    """
    Shift logits one step to the right along dim 1.

    Output position j (j >= 1) holds the logits from input position j - 1; output
    position 0 is filled with the constant 1.0 across the last dim. The input's last
    position is dropped.

    https://github.com/zhijie-group/Discrete-Diffusion-Forcing/blob/eed9750ab081cdc302daa9d8305478988f3f5a17/D2F-train/utils/util.py#L145C1-L150C26
    """
    out = torch.zeros_like(logits)
    # First position has no predecessor: use a constant (uniform) logit vector.
    out[:, 0, :].fill_(1.0)
    # Every later position takes the logits of the position before it.
    out[:, 1:, :].copy_(logits[:, :-1, :])
    return out


def _context_adaptive_reweight(seq_len: int, distribution: str = "symmetric-geometric", **kwargs) -> Tensor:
    """
    Create context-adaptive reweighting matrix W of shape (seq_len, seq_len)
    https://github.com/DreamLM/Dream/blob/fd91b8f1d47c5cbe4a8a1674fd9b98045e79d9db/src/trainer/fsdp_sft_trainer.py#L93

    Entry W[i, j] depends only on the distance d = |i - j|:
      - d == 0: 0
      - d >= 1: 0.5 * cart_p * (1 - cart_p) ** (d - 1)   ("symmetric-geometric")

    seq_len: number of positions.
    distribution: only "symmetric-geometric" is supported.
    **kwargs: forwarded to the distribution; "cart_p" (default 0.8) is the geometric
              success probability and must satisfy 0 < cart_p <= 1. Other keys are ignored.

    Returns: CPU tensor of shape (seq_len, seq_len) in torch's default float dtype.
    Raises: ValueError for an unknown distribution or an out-of-range cart_p.
    """
    position_ids_l = np.arange(seq_len).reshape(-1, 1)
    position_ids_r = np.arange(seq_len).reshape(1, -1)
    distance = torch.from_numpy(position_ids_l - position_ids_r)  # signed i - j

    def geometric_distribution(k, cart_p=0.8, **_):
        if not 0 < cart_p <= 1:
            raise ValueError("p must be between 0 and 1")
        abs_k = k.abs()
        if cart_p == 1:
            # Degenerate case: log(1 - p) is undefined; all mass sits at distance 1.
            return (abs_k == 1).to(torch.get_default_dtype()) * 0.5
        # Evaluate p * (1 - p)^(|k| - 1) in log space; 0.5 splits the mass over both sides.
        res = (math.log(cart_p) + (abs_k - 1) * math.log(1 - cart_p)).exp() * 0.5
        res.masked_fill_(k == 0, 0)  # a position gets no weight from itself
        return res

    if distribution == "symmetric-geometric":
        matrix = geometric_distribution(distance, **kwargs)
    else:
        raise ValueError(f"Unknown distribution {distribution}")
    return matrix


@functools.lru_cache(maxsize=64)
def _cached_cart_matrix(seq_len: int, cart_p: float, distribution: str) -> Tensor:
    """
    Memoized builder for the (seq_len, seq_len) context-adaptive reweighting matrix.

    Results are cached per (seq_len, cart_p, distribution), keeping up to 64 entries.
    The same tensor object is handed back on a cache hit, so callers should not
    modify it in place.
    """
    # The matrix is a CPU float tensor; callers move/cast it with .to(device, dtype).
    return _context_adaptive_reweight(seq_len, cart_p=cart_p, distribution=distribution)


def loss_function(
    logits: Tensor,  # (B, L, V)
    labels: Tensor,  # (B, L)
    masked: Tensor,  # (B, L) bool or float; True/1.0 => include in loss
    vocab_size: int,
    *,
    t: Tensor | None = None,  # (B,) in [0,1], per-sample time
    time_weighting: str = "cart",  # "none" | "original" | "linear" | "cart"
    cart_p: float = 0.5,  # for cart time weighting
    cart_distribution: str = "symmetric-geometric",  # for cart time weighting
    token_reweighting: bool = False,  # optional difficulty weighting
    alpha: float = 1.0,
    gamma: float = 0.0,
    ignore_index: int = -100,
    eps: float = 1e-6,
) -> Tensor:
    """
    Cross-entropy on masked positions with optional time- and token-reweighting.

    Logits are first shifted one step to the right, so the logits produced at
    position j - 1 are scored against labels[:, j].

    time_weighting:
      - "none":     w_t = 1
      - "original": w_t = 1 / t
      - "linear":   w_t = 1 - t
      - "cart":     w_t from context-adaptive reweighting matrix
    "original" and "linear" need `t`; when `t` is None they fall back to w_t = 1.
    "cart" derives its weights from the mask pattern alone and does not read `t`,
    so it is applied whether or not `t` is given.

    token_reweighting: when True, the already time-weighted per-token value x becomes
    alpha * x (gamma == 0) or alpha * (1 - exp(-x)) ** gamma * x (gamma != 0).

    We normalize by the sum of (masked * w_t), floored at 1.0, so the scale stays consistent.

    Returns: scalar loss tensor.
    Raises: ValueError for an unknown time_weighting (checked only when `t` is given).
    """
    _, L, _ = logits.shape
    shifted_logits = _shift_logits(logits)  # (B, L, V)

    # per-token CE without reduction; reshape tolerates non-contiguous inputs
    per_tok = F.cross_entropy(
        shifted_logits.reshape(-1, vocab_size),
        labels.reshape(-1),
        ignore_index=ignore_index,
        reduction="none",
    ).view_as(labels)  # (B, L)

    # base mask: include only selected tokens and not ignore_index
    base_mask = masked.to(per_tok.dtype)  # (B, L)
    if ignore_index is not None:
        base_mask = base_mask * (labels.ne(ignore_index)).to(per_tok.dtype)

    # time weights
    if time_weighting == "cart":
        # Position-wise weights: each position accumulates mass from the *other*
        # included positions according to their distance. Independent of t.
        W = _cached_cart_matrix(L, float(cart_p), str(cart_distribution)).to(
            per_tok.device, dtype=per_tok.dtype
        )  # (L, L)
        w_pos = base_mask @ W.T  # (B, L) @ (L, L) -> (B, L)
        # normalize so mean weight over included tokens is ~1 (stable scale);
        # mass is floored at 1 so rows with nothing included divide safely
        mass = base_mask.sum(dim=1, keepdim=True).clamp_min(1.0)  # (B, 1)
        mean_w = (w_pos * base_mask).sum(dim=1, keepdim=True) / mass  # (B, 1)
        time_w = w_pos / (mean_w + eps)  # (B, L)
    elif t is None or time_weighting == "none":
        time_w = torch.ones_like(per_tok)
    else:
        # per-sample weight, broadcast to every token of the sample
        t = t.to(per_tok.device, dtype=per_tok.dtype)
        if time_weighting == "original":
            w_t = 1.0 / t.clamp_min(eps)  # upweight small t
        elif time_weighting == "linear":
            w_t = (1.0 - t).clamp_min(0.0)  # downweight large t
        else:
            raise ValueError(f"Unknown time_weighting: {time_weighting}")
        time_w = w_t.view(-1, 1).expand_as(per_tok)  # (B, L)

    weighted = per_tok * base_mask * time_w

    # optional difficulty-based token reweighting (like alpha*(1-exp(-loss))**gamma * loss)
    if token_reweighting and gamma != 0.0:
        weighted = alpha * (1.0 - torch.exp(-weighted)).pow(gamma) * weighted
    elif token_reweighting:
        weighted = alpha * weighted

    # normalize by effective weight mass (masked * time_w), not just masked count
    denom = (base_mask * time_w).sum().clamp_min(1.0)
    return weighted.sum() / denom