#!/usr/bin/env python3
"""
inference_sad.py – Block-wise hierarchical diffusion sampling from a trained
SADModel.

Generation proceeds block by block left-to-right. Within each block, a small
random subset of non-leaf positions is advanced each round to some strictly
finer level in the hierarchy
    mask (level K+1)  >  ancestors (K, …, 1)  >  leaf (level 0)
A transition may jump any number of levels (e.g. mask → leaf directly, or
ancestor l → ancestor l' with l' < l, or ancestor → leaf) as long as the new
level is strictly finer than the current one — never stay, never revert.
Rounds repeat until every position in the block is leaf; then the next block
begins.

Each denoising round:
  1. One forward pass on the current block (K/V cache holds earlier blocks).
  2. Softmax the leaf logits and project through the fixed LUT
     (`AncestorTable.projection_matrix`) into every strictly-finer ancestor
     level; max over each distribution gives per-level confidence (used to
     rank candidate levels). For ancestor levels the conf is multiplied by
     a per-level scalar λ_l ∈ [0, 1] before the cross-level comparison
     (smaller λ_l biases the schedule away from that ancestor level —
     λ_l = 0 disables it; the default λ = 1 reproduces the original
     behavior). Leaf (l=0) is never scaled. The target id is then produced
     per-level:
       - leaf level (l=0):     multinomial sampling from the temperature-scaled leaf dist.
       - ancestor level (l≥1): multinomial sampling from the cluster dist. (stochastic)
     Ancestor confidences and samples are always computed from the original
     (temperature=1) softmax; the leaf confidence and sample use the
     temperature-scaled softmax (identical when leaf_temperature = 1).
  3. Randomly pick `positions_per_step` non-leaf positions per sample and
     transition each to its best strictly-finer level.

Finalized blocks' K/V are cached so forwards only recompute the current block.

Usage:
    python scripts/inference_sad.py \\
        --config configs/sad_owt.yaml \\
        --checkpoint outputs/sad/latest.pt \\
        --num_samples 4
"""

from __future__ import annotations

import argparse
import sys
from pathlib import Path

ROOT = Path(__file__).resolve().parents[1]  # sad/
from typing import Optional

import torch
import torch.nn.functional as F
import yaml

sys.path.insert(0, str(ROOT))

from src.models.sad_model import SADModel
from src.models.dit_components import apply_rotary_pos_emb, modulate_fused
from src.diffusion.ancestor_table import AncestorTable
from src.data import build_owt_dataloader
from einops import rearrange


# ─────────────────────────────────────────────────────────────────────────────
# Sampler
# ─────────────────────────────────────────────────────────────────────────────

class BlockDiffusionSampler:
    """
    Block-wise hierarchical diffusion sampler for SADModel.

    State per position is (level, value):
      level = 0          → leaf token; value = token id
      level ∈ [1, K]     → ancestor at level l; value = cluster id in K_l
      level = K + 1      → mask

    Per-block denoising loop (random position selection, strict-descent schedule):
      Until every position in the block is leaf:
        1. Forward pass on the current block (cache holds earlier blocks).
        2. Vectorized over all block positions, project the leaf softmax
           through the LUT:
             leaf target     (l=0): prob = softmax(logits)
             ancestor target (l≥1): prob = softmax(logits) @ W_l   [V, K_l]
           Each candidate level contributes (conf, id): conf is the max-prob
           (used only to compare levels). The id is argmax if the level is
           leaf (l=0) and a multinomial draw if the level is ancestor (l≥1)
           — so only the final landing in the leaf layer is deterministic,
           while intermediate ancestor steps are stochastic. Only levels
           strictly finer than the position's current level are eligible —
           so mask → leaf (skipping every ancestor) is a legal transition,
           as is any multi-level jump. The eligible level with the highest
           confidence wins.
        3. Randomly pick `positions_per_step` non-leaf positions per sample
           and apply the selected transition at those positions only.
    """

    def __init__(
        self,
        model: SADModel,
        ancestor_table: AncestorTable,
        tokenizer,
        device: torch.device,
        dtype: torch.dtype = torch.bfloat16,
        level_lambdas: Optional[list] = None,
        leaf_temperature: float = 1.0,
    ):
        """
        level_lambdas: length-K list of floats in [0, 1]. λ_l (for ancestor
            level l = 1..K) multiplies that level's max-prob conf before the
            cross-level argmax that picks the winning target. Leaf (l=0) is
            never scaled. None → all ones (original behavior).
        leaf_temperature: temperature applied to leaf logits before softmax.
            Values < 1.0 sharpen the leaf distribution (higher confidence),
            which is then used for both leaf sampling and ancestor projection.
            Default 1.0 (no temperature scaling).
        """
        self.model = model
        self.ancestor_table = ancestor_table
        self.tokenizer = tokenizer
        self.device = device
        self.dtype = dtype
        self.leaf_temperature = float(leaf_temperature)

        self.block_size: int = model.block_size
        self.max_seq_len: int = model.max_seq_len
        self.vocab_size: int = model.vocab_size
        self.mask_id: int = tokenizer.mask_token_id
        assert self.mask_id is not None, "tokenizer must have mask_token_id"

        self.K: int = ancestor_table.num_levels      # number of ancestor levels
        self.mask_level: int = self.K + 1

        if level_lambdas is None:
            level_lambdas = [1.0] * self.K
        assert len(level_lambdas) == self.K, (
            f"level_lambdas must have length K={self.K}, got {len(level_lambdas)}"
        )
        for x in level_lambdas:
            assert 0.0 <= float(x) <= 1.0, f"each λ must be in [0, 1], got {x}"
        # 1-indexed: self.level_lambdas[l] is λ_l for ancestor level l ∈ [1, K]
        self.level_lambdas = [None] + [float(x) for x in level_lambdas]

        # Leaf embedding table (tied with output head — read-only view).
        self.leaf_emb = model.get_leaf_embeddings().to(device=device, dtype=dtype).detach()
        self.mask_emb = self.leaf_emb[self.mask_id]  # [d]

        # Ancestor embeddings per level: fed into the model, so keep them in
        # self.dtype to match model weights.
        self.anc_embs = [None] + [
            ancestor_table.ancestor_embeddings(l).to(device=device, dtype=dtype).detach()
            for l in range(1, self.K + 1)
        ]
        # LUT projection matrices W_l: used only on the scoring side (fp32).
        # Fixed buffers, no grad, so fp32 storage is cheap.
        self.W = [None] + [
            ancestor_table.projection_matrix(l).to(device=device, dtype=torch.float32).detach()
            for l in range(1, self.K + 1)
        ]

    # ───────────────────────────────────────────────────────────────────────
    def _build_mixed_embeddings(
        self, level_ids: torch.Tensor, value_ids: torch.Tensor,
    ) -> torch.Tensor:
        """
        Build [B, S, d] input embeddings from per-position (level, value).

        Mirrors NoisyStateBuilder.build_noisy_embeddings so inference-time
        inputs match the training distribution.
        """
        B, S = level_ids.shape
        d = self.leaf_emb.shape[-1]
        embs = torch.empty(B, S, d, device=self.device, dtype=self.dtype)

        # leaf (level 0) — leaf_emb[value]
        m0 = (level_ids == 0)
        if m0.any():
            embs[m0] = self.leaf_emb[value_ids[m0]]

        # mask (level K+1) — leaf_emb[mask_id]
        mM = (level_ids == self.mask_level)
        if mM.any():
            embs[mM] = self.mask_emb

        # ancestor levels 1..K — anc_embs[l][value]
        for l in range(1, self.K + 1):
            ml = (level_ids == l)
            if ml.any():
                embs[ml] = self.anc_embs[l][value_ids[ml]]

        return embs

    # ───────────────────────────────────────────────────────────────────────
    # KV-cache–aware forward. The key observation: under the block-causal mask,
    # the K/V produced at positions in finalized (leaf) earlier blocks are
    # deterministic and never change. So we compute them once per block and
    # reuse them across all denoising rounds of the current block.
    #
    # This method inlines DDiTBlockWithMask.forward so we can (a) accept a K/V
    # prefix cache, (b) avoid recomputing Q/K/V for earlier blocks. When
    # k_prefix is None it also serves as an uncached single-block pass (used
    # for prompt blocks and the final K/V capture).
    # ───────────────────────────────────────────────────────────────────────
    def _run_layer_cached(
        self,
        layer_idx: int,
        x: torch.Tensor,
        rotary_cos_sin,
        c: torch.Tensor,
        k_prefix: Optional[torch.Tensor] = None,
        v_prefix: Optional[torch.Tensor] = None,
    ):
        """
        Run one DiT block on `x` (current block positions only) with an
        optional cached K/V prefix.

        Args:
            layer_idx:         index into self.model.blocks
            x:                 [B, bs, d] current block hidden state
            rotary_cos_sin:    rotary cos/sin for positions block_start..block_end-1
            c:                 [B, cond_dim] conditioning
            k_prefix, v_prefix: [B, H, S_prefix, d_head] post-rotary cached K/V
                                (from earlier blocks). None means no prefix.

        Returns:
            x_out:  [B, bs, d]
            k_new:  [B, H, bs, d_head] post-rotary K for current block
            v_new:  [B, H, bs, d_head] post-rotary V for current block
        """
        layer = self.model.blocks[layer_idx]
        B = x.shape[0]
        H = layer.n_heads
        dropout = layer.dropout
        bds_fn = layer._bias_dropout_scale_fn()

        (shift_msa, scale_msa, gate_msa,
         shift_mlp, scale_mlp, gate_mlp) = layer.adaLN_modulation(c)[:, None].chunk(6, dim=2)

        x_skip = x
        x_normed = modulate_fused(layer.norm1(x), shift_msa, scale_msa)
        qkv = layer.attn_qkv(x_normed)
        qkv = rearrange(qkv, "b s (three h d) -> b s three h d", three=3, h=H)
        cos, sin = rotary_cos_sin
        qkv = apply_rotary_pos_emb(qkv, cos.to(qkv.dtype), sin.to(qkv.dtype))

        q = qkv[:, :, 0].transpose(1, 2)      # [B, H, bs, d_h]
        k_new = qkv[:, :, 1].transpose(1, 2)  # [B, H, bs, d_h]
        v_new = qkv[:, :, 2].transpose(1, 2)

        if k_prefix is not None:
            k = torch.cat([k_prefix, k_new], dim=2)
            v = torch.cat([v_prefix, v_new], dim=2)
        else:
            k = k_new
            v = v_new

        # No mask: current block may attend to all prefix (block-causal lookback)
        # and to itself (bidirectional within block).
        attn_out = F.scaled_dot_product_attention(q, k, v)
        attn_out = rearrange(attn_out, "b h s d -> b s (h d)", b=B)

        x = bds_fn(layer.attn_out(attn_out), None, gate_msa, x_skip, dropout)
        x = bds_fn(
            layer.mlp(modulate_fused(layer.norm2(x), shift_mlp, scale_mlp)),
            None, gate_mlp, x, dropout,
        )
        return x, k_new, v_new

    def _forward_block_cached(
        self,
        level_ids_cur: torch.Tensor,
        value_ids_cur: torch.Tensor,
        block_idx: int,
        kv_cache: list,
        is_clean: bool = False,
    ):
        """
        Forward pass over a single block using cached prefix K/V.

        Args:
            level_ids_cur, value_ids_cur: [B, bs] current block state
            block_idx: int, absolute block index (for pos/rotary)
            kv_cache: list[(k_prefix, v_prefix) or (None, None)] per layer
            is_clean: if True, use segment_embed(1) (clean half) to match
                      training's clean context. Used when capturing K/V for
                      finalized blocks and prompt warm-up.

        Returns:
            logits_cur: [B, bs, V] (mask column already set to -inf)
            new_kv:     list[(k_cur, v_cur)] per layer — caller appends to cache
        """
        model = self.model
        B, bs = level_ids_cur.shape
        block_start = block_idx * self.block_size
        block_end = block_start + bs
        device = self.device

        embs = self._build_mixed_embeddings(level_ids_cur, value_ids_cur)  # [B, bs, d]

        # Input projection (weights are self.dtype; embs already self.dtype).
        x = model.input_proj(embs)

        # Position embeddings for this block only.
        block_idx_t = torch.full(
            (bs,), block_idx, dtype=torch.long, device=device,
        )
        intra_pos = torch.arange(self.block_size, device=device)
        # segment=0 for noisy (denoising rounds), segment=1 for clean (cache capture)
        seg_val = 1 if is_clean else 0
        seg_id = torch.full((bs,), seg_val, dtype=torch.long, device=device)
        pos_emb = (
            model.block_idx_embed(block_idx_t)
            + model.intra_pos_embed(intra_pos)
            + model.segment_embed(seg_id)
        ).unsqueeze(0).to(x.dtype)
        x = x + pos_emb

        c = model.cond_bias.unsqueeze(0).expand(B, -1).to(x.dtype)

        # Rotary for absolute positions of this block.
        position_ids = torch.arange(block_start, block_end, device=device)
        rotary_cos_sin = model.rotary_emb(x, position_ids=position_ids)

        new_kv = []
        autocast_device = "cuda" if device.type == "cuda" else "cpu"
        with torch.autocast(device_type=autocast_device, dtype=self.dtype):
            for layer_idx in range(len(model.blocks)):
                k_prefix, v_prefix = kv_cache[layer_idx]
                x, k_cur, v_cur = self._run_layer_cached(
                    layer_idx, x, rotary_cos_sin, c,
                    k_prefix=k_prefix, v_prefix=v_prefix,
                )
                new_kv.append((k_cur, v_cur))
            logits = model.output_layer(x, c)  # [B, bs, rounded_leaf]

        logits = logits[..., :self.vocab_size]
        logits[..., self.mask_id] = float("-inf")
        return logits, new_kv

    @staticmethod
    def _append_kv(kv_cache: list, new_kv: list) -> list:
        """Append per-layer new_kv to kv_cache along the sequence dim."""
        out = []
        for (kp, vp), (kn, vn) in zip(kv_cache, new_kv):
            if kp is None:
                out.append((kn, vn))
            else:
                out.append((torch.cat([kp, kn], dim=2),
                            torch.cat([vp, vn], dim=2)))
        return out

    # ───────────────────────────────────────────────────────────────────────
    @torch.no_grad()
    def generate(
        self,
        batch_size: Optional[int] = None,
        prompt_ids: Optional[torch.Tensor] = None,
        positions_per_step: int = 1,
        return_intermediate: bool = False,
        stop_on_eos: bool = True,
    ) -> dict:
        """
        Block-by-block generation with KV cache and random per-round position
        selection.

        Within each block, rounds repeat until every position is leaf. Each
        round runs one forward, computes the best strictly-finer target
        (level, id) for every non-leaf position, then picks
        `positions_per_step` random non-leaf positions per sample and applies
        their transitions. The strict-descent schedule (pick the finest level
        whose LUT-projected max-prob is highest) is unchanged.

        Unconditional: pass `batch_size` (and leave `prompt_ids=None`); starts
        from an all-mask sequence of length `self.max_seq_len`.

        Conditional: pass `prompt_ids` with shape [B, P] where P is a multiple
        of `block_size`; the first P positions are fixed as leaf tokens, the
        remaining positions are generated block by block.
        """
        block_size = self.block_size
        device = self.device

        total_len = self.max_seq_len
        assert total_len % block_size == 0, (
            f"max_seq_len ({total_len}) must be divisible by block_size "
            f"({block_size})"
        )

        if prompt_ids is not None:
            prompt_ids = prompt_ids.to(device=device, dtype=torch.long)
            B, P = prompt_ids.shape
            assert P % block_size == 0, (
                f"prompt length P={P} must be a multiple of block_size={block_size}"
            )
            assert P < total_len, (
                f"prompt length P={P} must be < total_len={total_len}"
            )
            start_block = P // block_size
        else:
            assert batch_size is not None, (
                "Either batch_size (unconditional) or prompt_ids (conditional) "
                "must be provided."
            )
            B = batch_size
            P = 0
            start_block = 0

        # ── Initialize state: every position is mask; prompt positions set as leaf.
        level_ids = torch.full(
            (B, total_len), self.mask_level, dtype=torch.long, device=device,
        )
        value_ids = torch.zeros((B, total_len), dtype=torch.long, device=device)
        if P > 0:
            level_ids[:, :P] = 0
            value_ids[:, :P] = prompt_ids

        num_blocks = total_len // block_size

        intermediate = [] if return_intermediate else None
        finished = torch.zeros(B, dtype=torch.bool, device=device)
        eos_id = getattr(self.tokenizer, "eos_token_id", None)

        # ── KV cache: per-layer (k_prefix, v_prefix) for finalized blocks.
        # Starts empty; we append block b's K/V after b is fully resolved,
        # so when block b+1 starts the cache covers blocks 0..b.
        num_layers = len(self.model.blocks)
        kv_cache = [(None, None) for _ in range(num_layers)]

        # ── Warm up KV cache over prompt blocks (all leaf, deterministic).
        # Use is_clean=True: prompt blocks act as clean context for later blocks,
        # matching training's clean half (segment=1).
        for b in range(start_block):
            bs0 = b * block_size
            be0 = (b + 1) * block_size
            _, new_kv = self._forward_block_cached(
                level_ids[:, bs0:be0], value_ids[:, bs0:be0], b, kv_cache,
                is_clean=True,
            )
            kv_cache = self._append_kv(kv_cache, new_kv)

        # ── Block loop (skips prompt blocks). ──────────────────────────────
        # Each round advances up to `positions_per_step` non-leaf positions by
        # ≥1 level each (strict descent). Worst case every position needs K+1
        # transitions → cap at block_size * (K+1) rounds, which is slack.
        rounds_cap_per_block = block_size * (self.K + 1)

        total_steps = 0  # total denoising rounds across all generated blocks

        for b in range(start_block, num_blocks):
            block_start = b * block_size
            block_end = (b + 1) * block_size

            for _ in range(rounds_cap_per_block):
                cur_level_block = level_ids[:, block_start:block_end]        # [B, bs]
                non_leaf_block = (cur_level_block > 0)                       # [B, bs]
                if not non_leaf_block.any():
                    break

                # 1) Forward pass on current block (cache holds blocks 0..b-1).
                block_logits, _ = self._forward_block_cached(
                    level_ids[:, block_start:block_end],
                    value_ids[:, block_start:block_end],
                    b, kv_cache,
                )                                                            # [B, bs, V]
                # Compute raw (temperature=1) and temperature-sharpened leaf probs.
                # p_leaf_raw / p_ancestor_raw are used for sampling; conf uses
                # temp for leaf and raw+lambda for ancestor.
                leaf_logits_fp = block_logits.float()
                leaf_prob_raw = F.softmax(leaf_logits_fp, dim=-1)            # [B, bs, V]
                if self.leaf_temperature != 1.0:
                    leaf_prob_temp = F.softmax(
                        leaf_logits_fp / self.leaf_temperature, dim=-1,
                    )                                                        # [B, bs, V]
                else:
                    leaf_prob_temp = leaf_prob_raw

                # 2) Best strictly-finer target for every block position.
                best_conf = torch.full(
                    (B, block_size), float("-inf"),
                    device=device, dtype=torch.float32,
                )
                best_level = torch.full(
                    (B, block_size), -1, device=device, dtype=torch.long,
                )
                best_id = torch.zeros(
                    (B, block_size), device=device, dtype=torch.long,
                )

                # Leaf target (l = 0): conf from temp-sharpened dist, sample
                # from temp-sharpened dist.
                leaf_conf = leaf_prob_temp.max(dim=-1).values                # [B, bs]
                leaf_id = torch.multinomial(
                    leaf_prob_temp.reshape(-1, leaf_prob_temp.shape[-1]),
                    num_samples=1,
                ).squeeze(-1).reshape(B, block_size)                         # [B, bs]
                elig = cur_level_block > 0
                upd = elig & (leaf_conf > best_conf)
                best_conf = torch.where(upd, leaf_conf, best_conf)
                best_level = torch.where(upd, torch.zeros_like(best_level), best_level)
                best_id = torch.where(upd, leaf_id, best_id)

                # Ancestor targets l = 1..K.
                # Conf is max-prob over RAW cluster probs times λ_l.
                # Sample is drawn from RAW cluster probs.
                for l in range(1, self.K + 1):
                    V_anc = self.W[l].shape[0]
                    cluster_prob_raw = leaf_prob_raw[..., :V_anc] @ self.W[l]  # [B, bs, K_l]
                    conf_l = cluster_prob_raw.max(dim=-1).values               # [B, bs]
                    conf_l = conf_l * self.level_lambdas[l]
                    id_l = torch.multinomial(
                        cluster_prob_raw.reshape(-1, cluster_prob_raw.shape[-1]),
                        num_samples=1,
                    ).squeeze(-1).reshape(B, block_size)                       # [B, bs]
                    elig_l = cur_level_block > l
                    upd = elig_l & (conf_l > best_conf)
                    best_conf = torch.where(upd, conf_l, best_conf)
                    best_level = torch.where(
                        upd, torch.full_like(best_level, l), best_level,
                    )
                    best_id = torch.where(upd, id_l, best_id)

                # 3) Randomly pick `positions_per_step` non-leaf positions per
                # sample. Leaf positions get score = -inf so they never win a
                # top-k slot; samples with fewer than k non-leaf positions
                # drop the extra slots via the explicit non_leaf_block mask.
                k = min(positions_per_step, block_size)
                scores = torch.rand(B, block_size, device=device)
                scores = torch.where(
                    non_leaf_block, scores, torch.full_like(scores, -1.0),
                )
                _, topk_idx = scores.topk(k, dim=-1)                         # [B, k]
                selected = torch.zeros_like(non_leaf_block)
                selected.scatter_(1, topk_idx, True)
                apply_mask = selected & non_leaf_block                       # [B, bs]

                level_ids[:, block_start:block_end] = torch.where(
                    apply_mask, best_level, cur_level_block,
                )
                value_ids[:, block_start:block_end] = torch.where(
                    apply_mask, best_id, value_ids[:, block_start:block_end],
                )

                if return_intermediate:
                    intermediate.append(
                        (level_ids.clone().cpu(), value_ids.clone().cpu())
                    )

                total_steps += 1

            # Safety net: force any lingering non-leaf positions to leaf.
            # Use the same temperature-sharpened distribution for consistency.
            block_level = level_ids[:, block_start:block_end]
            non_leaf = (block_level > 0)
            if non_leaf.any():
                block_logits, _ = self._forward_block_cached(
                    level_ids[:, block_start:block_end],
                    value_ids[:, block_start:block_end],
                    b, kv_cache,
                )
                leaf_logits_fp = block_logits.float()
                if self.leaf_temperature != 1.0:
                    leaf_logits_fp = leaf_logits_fp / self.leaf_temperature
                leaf_prob_fallback = F.softmax(leaf_logits_fp, dim=-1)
                leaf_id_fallback = torch.multinomial(
                    leaf_prob_fallback.reshape(-1, leaf_prob_fallback.shape[-1]),
                    num_samples=1,
                ).squeeze(-1).reshape(B, block_size)
                level_ids[:, block_start:block_end] = torch.where(
                    non_leaf, torch.zeros_like(block_level), block_level,
                )
                value_ids[:, block_start:block_end] = torch.where(
                    non_leaf, leaf_id_fallback, value_ids[:, block_start:block_end],
                )

            # ── Finalize block b in the KV cache ───────────────────────────
            # Run one more forward on the block's final (all-leaf) state to
            # grab K/V that are consistent with the resolved tokens, then
            # append to the cache so block b+1 can see block b.
            # Use is_clean=True: finalized blocks serve as clean context for
            # later blocks, matching training's clean half (segment=1).
            _, new_kv = self._forward_block_cached(
                level_ids[:, block_start:block_end],
                value_ids[:, block_start:block_end],
                b, kv_cache,
                is_clean=True,
            )
            kv_cache = self._append_kv(kv_cache, new_kv)

            if stop_on_eos and eos_id is not None:
                block_vals = value_ids[:, block_start:block_end]
                block_lvls = level_ids[:, block_start:block_end]
                has_eos = ((block_lvls == 0) & (block_vals == eos_id)).any(dim=-1)
                finished = finished | has_eos
                if finished.all():
                    break

        # ── Package output ──────────────────────────────────────────────────
        # Every position is now leaf (level 0), so value_ids holds token ids.
        result = {
            "tokens": value_ids.cpu(),
            "prompt_len": P,
            "num_steps": total_steps,
        }
        if return_intermediate:
            result["intermediate"] = intermediate
        return result


# ─────────────────────────────────────────────────────────────────────────────
# Checkpoint / model plumbing
# ─────────────────────────────────────────────────────────────────────────────

def _unwrap(model):
    """Strip wrapper layers until the bare model is reached.

    At each step a `_orig_mod` attribute (torch.compile wrapper) is followed
    in preference to a `module` attribute (DDP wrapper); the first object
    exposing neither is returned.
    """
    inner = model
    while True:
        for wrapper_attr in ("_orig_mod", "module"):
            if hasattr(inner, wrapper_attr):
                inner = getattr(inner, wrapper_attr)
                break
        else:
            # Neither wrapper attribute present: this is the real model.
            return inner


def load_config(path: str) -> dict:
    """Read the YAML file at `path` and return its parsed content.

    The file is decoded as UTF-8 explicitly rather than with the platform's
    locale encoding, so configs containing non-ASCII text load identically
    on every machine. Parsing uses `yaml.safe_load`.
    """
    with open(path, encoding="utf-8") as f:
        return yaml.safe_load(f)


def build_tokenizer(config: dict):
    """Load the local GPT-2 tokenizer and sync the vocabulary size into `config`.

    Missing special tokens are filled in: EOS is added as "<|endoftext|>",
    BOS and PAD fall back to EOS, and a "[MASK]" token is added when the
    tokenizer has no mask token. `config["model"]["vocab_size"]` (and the
    first entry of `config["model"]["level_sizes"]`, when that key exists)
    is overwritten in place with the final tokenizer length.

    Returns the tokenizer.
    """
    from transformers import AutoTokenizer

    tokenizer_dir = ROOT / "tokenizers" / "gpt2"
    tok = AutoTokenizer.from_pretrained(tokenizer_dir, local_files_only=True)

    if tok.eos_token is None:
        tok.add_special_tokens({"eos_token": "<|endoftext|>"})
    # BOS and PAD reuse EOS when the tokenizer does not define them.
    for fallback_attr in ("bos_token", "pad_token"):
        if getattr(tok, fallback_attr) is None:
            setattr(tok, fallback_attr, tok.eos_token)
    if tok.mask_token_id is None:
        tok.add_special_tokens({"mask_token": "[MASK]"})

    # Propagate the (possibly grown) vocabulary size to the model config.
    model_cfg = config["model"]
    n_tokens = len(tok)
    model_cfg["vocab_size"] = n_tokens
    if "level_sizes" in model_cfg:
        model_cfg["level_sizes"][0] = n_tokens
    return tok


def build_ancestor_table(config: dict, device, embed_dim: int) -> AncestorTable:
    """Construct the AncestorTable described by `config["ancestor"]`.

    When `lut_path` is configured, the table is loaded from that file (plus
    the optional `proto_path`); relative paths are resolved against ROOT.
    Otherwise a single-level random LUT is generated for debugging, seeded
    from `config["training"]["seed"]` (default 42). In both cases the table
    is moved to `device` before being returned, so it has the right shape
    for a later state_dict load.
    """
    anc_cfg = config.get("ancestor", {})
    base_dir = ROOT

    def _anchored(raw):
        # Absolute paths pass through; relative ones hang off the repo root.
        as_path = Path(raw)
        return as_path if as_path.is_absolute() else base_dir / raw

    lut_file = anc_cfg.get("lut_path", None)
    if lut_file is not None:
        lut_file = _anchored(lut_file)
        proto_file = anc_cfg.get("proto_path", None)
        if proto_file is not None:
            proto_file = _anchored(proto_file)
        loaded = AncestorTable.from_files(
            lut_path=lut_file, proto_path=proto_file,
            embed_dim=embed_dim, device=device,
        )
        return loaded.to(device)

    # Debug path: random LUT. The generator is seeded from the training seed
    # and the three draws below happen in a fixed order (indices, weights,
    # embeddings) so the result is reproducible for a given seed.
    n_vocab = config["model"]["vocab_size"]
    n_clusters = anc_cfg.get("num_clusters", 64)
    fanout = anc_cfg.get("top_k", 3)
    rng_seed = config.get("training", {}).get("seed", 42)
    gen = torch.Generator().manual_seed(rng_seed)

    cluster_idx = torch.randint(0, n_clusters, (n_vocab, fanout), generator=gen)
    weights = torch.rand(n_vocab, fanout, generator=gen)
    cluster_probs = weights / weights.sum(dim=-1, keepdim=True)
    start_emb = torch.randn(n_clusters, embed_dim, generator=gen) * 0.02

    random_table = AncestorTable(
        lut_indices=[cluster_idx],
        lut_probs=[cluster_probs],
        init_embeddings=[start_emb],
    )
    return random_table.to(device)


def build_model(config: dict, device: torch.device) -> SADModel:
    """Instantiate a SADModel from `config["model"]` and move it to `device`.

    Six keys are mandatory (a missing one raises KeyError); the remaining
    hyper-parameters fall back to the defaults listed below.
    """
    cfg = config["model"]

    mandatory = {
        key: cfg[key]
        for key in (
            "vocab_size", "hidden_size", "n_blocks",
            "n_heads", "cond_dim", "max_seq_len",
        )
    }
    with_defaults = {
        "block_size": cfg.get("block_size", 8),
        "dropout": cfg.get("dropout", 0.0),
        "num_levels": cfg.get("num_levels", 2),
        "level_sizes": cfg.get("level_sizes"),
        "tie_weights": cfg.get("tie_weights", False),
    }
    return SADModel(**mandatory, **with_defaults).to(device)


# ─────────────────────────────────────────────────────────────────────────────
# CLI
# ─────────────────────────────────────────────────────────────────────────────

def parse_args():
    """Parse the command-line options of the inference script.

    Returns the populated `argparse.Namespace`. `--checkpoint` is the only
    required option. EOS early-stopping is on by default and can be turned
    off with `--no_stop_on_eos` (the bare `--stop_on_eos` flag is kept for
    backward compatibility and simply re-affirms the default).
    """
    p = argparse.ArgumentParser()
    p.add_argument("--checkpoint", type=str, required=True)
    p.add_argument("--config", type=str, default="configs/sad_owt.yaml")
    p.add_argument("--num_samples", type=int, default=1)
    p.add_argument("--seed", type=int, default=42)
    p.add_argument("--device", type=str,
                   default="cuda" if torch.cuda.is_available() else "cpu")
    p.add_argument("--dtype", type=str, default="bf16", choices=["bf16", "fp16", "fp32"])
    # A lone store_true flag whose default is already True can never be
    # switched off, so pair it with a store_false twin sharing the same dest.
    p.add_argument("--stop_on_eos", dest="stop_on_eos", action="store_true",
                   default=True,
                   help="Stop once every sample has produced EOS (default).")
    p.add_argument("--no_stop_on_eos", dest="stop_on_eos", action="store_false",
                   help="Keep generating all blocks even after EOS.")
    p.add_argument("--mode", type=str, default="unconditional",
                   choices=["unconditional", "conditional"],
                   help="unconditional: start from all-mask. "
                        "conditional: take a block from the training set as the first block(s).")
    p.add_argument("--prompt_blocks", type=int, default=1,
                   help="(conditional) number of leading blocks taken from the training data.")
    p.add_argument("--data_seed", type=int, default=0,
                   help="(conditional) seed for shuffling the training split when picking a sample.")
    p.add_argument("--positions_per_step", type=int, default=1,
                   help="Number of random non-leaf positions to advance per "
                        "denoising round within a block.")
    p.add_argument("--level_lambdas", type=str, default=None,
                   help="Comma-separated K floats in [0, 1], one per ancestor "
                        "level l = 1..K (e.g. '1.0,0.8,0.5'). Multiplies the "
                        "level's max-prob conf before the cross-level argmax. "
                        "λ_l < 1 biases the schedule away from level l; "
                        "λ_l = 0 disables it. Default: all 1.0 (no change).")
    p.add_argument("--leaf_temperature", type=float, default=1.0,
                   help="Temperature applied to leaf logits before softmax. "
                        "Values < 1.0 sharpen p_leaf, which is used for the "
                        "leaf confidence and leaf multinomial sampling; the "
                        "ancestor projection always uses the temperature-1 "
                        "softmax. Default 1.0 (no sharpening).")
    return p.parse_args()


def resolve_dtype(name: str) -> torch.dtype:
    """Translate a precision tag into the corresponding torch dtype.

    Args:
        name: One of ``"bf16"``, ``"fp16"`` or ``"fp32"``.

    Returns:
        ``torch.bfloat16``, ``torch.float16`` or ``torch.float32`` respectively.

    Raises:
        KeyError: If ``name`` is not one of the three supported tags.
    """
    dtype_by_tag = dict(
        bf16=torch.bfloat16,
        fp16=torch.float16,
        fp32=torch.float32,
    )
    return dtype_by_tag[name]


def main():
    """Command-line entry point: load a trained model and print samples.

    Steps, in order:
      1. Parse CLI arguments and seed torch.
      2. Build the tokenizer, model and ancestor table from the config and
         restore their weights from the checkpoint.
      3. In ``--mode conditional``, take the first ``prompt_blocks`` blocks of
         one training batch as the prompt.
      4. Run ``BlockDiffusionSampler.generate`` and print the decoded text of
         every sample (prompt and continuation separately when a prompt was
         used).

    Raises:
        RuntimeError: If the checkpoint has no ``"ancestor_table"`` entry.
        ValueError: If the requested prompt length is negative or is not
            strictly smaller than ``max_seq_len``.
    """
    args = parse_args()
    torch.manual_seed(args.seed)

    device = torch.device(args.device)
    dtype = resolve_dtype(args.dtype)

    config = load_config(args.config)
    tokenizer = build_tokenizer(config)

    # ── Build + load model ─────────────────────────────────────────────────
    model = build_model(config, device).to(dtype)
    ckpt = torch.load(args.checkpoint, map_location=device)
    # Accept both a full training checkpoint ({"model": ...}) and a bare
    # state dict.
    raw_state = ckpt.get("model", ckpt)
    # strict=False tolerates key mismatches, so report them: a silently
    # half-loaded model would otherwise just produce garbage samples.
    load_result = _unwrap(model).load_state_dict(raw_state, strict=False)
    for label, keys in (("missing", load_result.missing_keys),
                        ("unexpected", load_result.unexpected_keys)):
        if keys:
            preview = ", ".join(keys[:5])
            more = ", ..." if len(keys) > 5 else ""
            print(f"WARNING: {len(keys)} {label} key(s) when loading model "
                  f"state_dict: {preview}{more}")
    model.eval()
    print(f"Loaded checkpoint: {args.checkpoint}  (step={ckpt.get('step', '?')})")

    # ── Build + load ancestor table ────────────────────────────────────────
    # Fixed LUT comes from config (same file as training); learnable ancestor
    # embeddings come from the checkpoint. load_state_dict overwrites both
    # buffers (LUT, W_l) and parameters (ancestor_embeddings) to match training
    # exactly.
    ancestor_table = build_ancestor_table(
        config, device, embed_dim=config["model"]["hidden_size"],
    )
    # Explicit raise instead of `assert`, so the check survives `python -O`.
    if "ancestor_table" not in ckpt:
        raise RuntimeError(
            "Checkpoint has no 'ancestor_table' entry — cannot run hierarchical "
            "inference. Re-train with train_sad.py or use an older inference "
            "script that ignores ancestors."
        )
    ancestor_table.load_state_dict(ckpt["ancestor_table"])
    ancestor_table.to(device=device, dtype=dtype).eval()
    print(f"Loaded ancestor table: {ancestor_table.num_levels} ancestor level(s)")

    # None lets the sampler fall back to its own default lambdas.
    level_lambdas = None
    if args.level_lambdas:
        level_lambdas = [float(x) for x in args.level_lambdas.split(",")]

    sampler = BlockDiffusionSampler(
        model=_unwrap(model),
        ancestor_table=ancestor_table,
        tokenizer=tokenizer,
        device=device,
        dtype=dtype,
        level_lambdas=level_lambdas,
        leaf_temperature=args.leaf_temperature,
    )
    print(f"level_lambdas (per ancestor level l=1..K) = "
          f"{sampler.level_lambdas[1:]}")
    print(f"leaf_temperature = {sampler.leaf_temperature}")

    # ── Optionally load a prompt from the training data ────────────────────
    prompt_ids = None
    if args.mode == "conditional":
        data_cfg = config.get("data", {})
        seq_len = config["model"]["max_seq_len"]
        block_size = config["model"]["block_size"]
        prompt_len = args.prompt_blocks * block_size
        # A negative length would pass a plain `< seq_len` test and then
        # silently slice `[:, :-k]`, so the lower bound is checked as well.
        if not 0 <= prompt_len < seq_len:
            raise ValueError(
                f"prompt_blocks * block_size = {prompt_len} must be >= 0 and "
                f"< max_seq_len = {seq_len}"
            )
        # Resolve relative cache_dir against the sad/ repo root (scripts/..), so
        # the script works regardless of cwd (training ran from sad/).
        cache_dir = data_cfg.get("cache_dir", None)
        if cache_dir is not None and not Path(cache_dir).is_absolute():
            candidate = ROOT / cache_dir
            if candidate.exists():
                cache_dir = str(candidate)
        loader = build_owt_dataloader(
            tokenizer,
            split="train[:-100000]",
            seq_len=seq_len,
            batch_size=args.num_samples,
            num_workers=0,
            cache_dir=cache_dir,
            seed=args.data_seed,
            mode=data_cfg.get("mode", "subsample"),
            shard_across_ranks=False,
        )
        # Only the first batch is needed: one row per requested sample.
        batch = next(iter(loader))
        prompt_ids = batch["input_ids"][:args.num_samples, :prompt_len].to(device)
        print(f"Loaded conditional prompt from training data: "
              f"shape={tuple(prompt_ids.shape)} (prompt_blocks={args.prompt_blocks})")

    print(f"Sampling {args.num_samples} sequences ({args.mode}) "
          f"length={config['model']['max_seq_len']}, "
          f"random positions_per_step={args.positions_per_step}")

    # batch_size is passed only when there is no prompt; with a prompt it is
    # left as None.
    out = sampler.generate(
        batch_size=args.num_samples if prompt_ids is None else None,
        prompt_ids=prompt_ids,
        positions_per_step=args.positions_per_step,
        stop_on_eos=args.stop_on_eos,
    )

    # ── Decode & print ─────────────────────────────────────────────────────
    P = out.get("prompt_len", 0)
    print("\n" + "=" * 72)
    for i, ids in enumerate(out["tokens"]):
        ids_list = ids.tolist()
        print(f"[Sample {i + 1}]")
        if P > 0:
            # Show the prompt and the generated continuation separately.
            prompt_text = tokenizer.decode(ids_list[:P], skip_special_tokens=True)
            gen_text = tokenizer.decode(ids_list[P:], skip_special_tokens=True)
            print(f"<prompt ({P} tok)> {prompt_text}")
            print(f"<generated> {gen_text}")
        else:
            print(tokenizer.decode(ids_list, skip_special_tokens=True))
        print()


# Run sampling only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()