#!/usr/bin/env python
# -*- coding: utf-8 -*-

import argparse
import glob
import os
import time
from dataclasses import dataclass
from typing import Dict, List, Optional

import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer

# ----------------------------------------------------------------------------
# mamba-ssm dependency
# ----------------------------------------------------------------------------
try:
    from mamba_ssm import Mamba
    from mamba_ssm.utils.generation import InferenceParams
    _HAS_MAMBA = True
except ImportError:
    _HAS_MAMBA = False
    InferenceParams = None
    print("=" * 80)
    print("[WARNING] mamba-ssm not installed. Mamba layers will not function.")
    print("Install with: pip install mamba-ssm")
    print("=" * 80)

    class Mamba(nn.Module):
        def __init__(self, *args, **kwargs):
            super().__init__()
            print("ERROR: Mamba placeholder. mamba-ssm not installed.")
        def forward(self, x, *args, **kwargs):
            print("ERROR: mamba-ssm not installed. Cannot run MambaBlock.")
            return x

# ----------------------------------------------------------------------------
# Model
# ----------------------------------------------------------------------------

@dataclass
class AdaptiveRiverConfig:
    vocab_size: int = 50257
    d_model: int = 1024
    n_layers: int = 24
    d_ff: int = 4096
    dropout: float = 0.0
    rope_theta: float = 10000.0
    rotary_pct: float = 1.0
    layer_norm_eps: float = 1e-5
    rope_scaling_type: str | None = None
    rope_scaling_factor: float = 1.0
    experts_per_layer: int = 4
    top_k_ffn: int = 1
    moe_dropout: float = 0.0
    attn_n_experts: int = 6
    attn_top_k: int = 6
    attn_n_orig_heads: int = 16
    mamba_d_state: int = 16
    mamba_d_conv: int = 4
    mamba_expand: int = 2
    entropy_weight: float = 1e-4
    head_entropy_weight: float = 1e-4
    default_budget_ratio: float = 1.0
    init_std: float = 0.02
    tie_word_embeddings: bool = False  # untied head (matches training)
    load_balance_weight: float = 0.01
    router_z_weight: float = 0.001
    gate_temperature: float = 0.7
    checkpoint_attn_thresh: float = 0.35
    checkpoint_ffn_thresh: float = 0.35
    soak_dtype: str = "fp32"

def _init_weights(module: nn.Module, std: float):
    if isinstance(module, nn.Linear):
        nn.init.normal_(module.weight, mean=0.0, std=std)
        if module.bias is not None:
            nn.init.zeros_(module.bias)

def topk_mask_ste(scores: torch.Tensor, k: int) -> torch.Tensor:
    s = scores.float()
    if k >= s.size(-1):
        return torch.ones_like(s)
    topk = torch.topk(s, k=k, dim=-1).indices
    one_hot = torch.zeros_like(s)
    one_hot.scatter_(dim=-1, index=topk, value=1.0)
    probs = F.softmax(s, dim=-1)
    return one_hot + probs - probs.detach()
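
def _demo_topk_mask_ste():
    # Illustrative sketch only (not used by the model): shows that the
    # straight-through mask above is exactly one-hot in value, while
    # gradients flow through the softmax term.
    scores = torch.randn(2, 4, requires_grad=True)
    mask = topk_mask_ste(scores, k=2)
    # probs - probs.detach() contributes exactly 0 to the forward value,
    # so the mask is a hard 0/1 indicator.
    assert torch.equal(mask.detach(), mask.detach().round())
    # The backward path exists despite the hard top-k.
    (mask * torch.arange(4.0)).sum().backward()
    assert scores.grad is not None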

class RotaryEmbedding(nn.Module):
    def __init__(self, dim, base=10000.0, scaling_type: str | None = None, scaling_factor: float = 1.0):
        super().__init__()
        self.dim = dim
        self.base = float(base)
        self.scaling_type = scaling_type
        self.scaling_factor = float(scaling_factor)
        base = self._effective_base()
        inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2, dtype=torch.float32) / self.dim))
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self._cos_sin_cache = None
        self._cos_sin_cache_device = None
        self._cos_sin_cache_dtype = None
        self._cos_sin_max_seq_len = -1
    def _effective_base(self) -> float:
        if not self.scaling_type or self.scaling_factor == 1.0:
            return self.base
        if self.scaling_type in ("ntk", "linear", "yarn"):
            return self.base * self.scaling_factor
        return self.base
    def _get_cos_sin_cache(self, seq_len: int, device: torch.device, dtype: torch.dtype):
        if (seq_len > self._cos_sin_max_seq_len or self._cos_sin_cache is None
            or self._cos_sin_cache_device != device or self._cos_sin_cache_dtype != dtype):
            # Grow with headroom so stateless decode does not rebuild the
            # cache on every new token as the sequence lengthens.
            self._cos_sin_max_seq_len = max(2 * seq_len, 2048)
            t = torch.arange(self._cos_sin_max_seq_len, device=device, dtype=self.inv_freq.dtype)
            freqs = torch.einsum("i,j->ij", t, self.inv_freq)
            emb = torch.cat((freqs, freqs), dim=-1)
            cos = emb.cos().to(dtype)
            sin = emb.sin().to(dtype)
            self._cos_sin_cache = (cos, sin)
            self._cos_sin_cache_device = device
            self._cos_sin_cache_dtype = dtype
        return self._cos_sin_cache
    def forward(self, x, seq_len: int, offset: int | torch.Tensor = 0):
        device, dtype = x.device, x.dtype
        if isinstance(offset, torch.Tensor):
            if offset.numel() > 1:
                # Per-sequence offsets are not applied on this path; positions
                # start at 0. Callers in this file reduce offset to a scalar
                # before calling, so this branch is a best-effort fallback.
                t = torch.arange(seq_len, device=device, dtype=self.inv_freq.dtype).float()
                freqs = torch.einsum("i,j->ij", t, self.inv_freq)
                emb = torch.cat((freqs, freqs), dim=-1)
                cos_val = emb.cos()[None, None, :, :].to(dtype)
                sin_val = emb.sin()[None, None, :, :].to(dtype)
                return cos_val, sin_val
            offset = int(offset.item())
        # Resolve the cache only once offset is a plain int; int() on a
        # multi-element tensor would raise before the numel() check above.
        cos, sin = self._get_cos_sin_cache(seq_len + offset, device, dtype)
        cos = cos[offset:offset+seq_len].unsqueeze(0).unsqueeze(0)
        sin = sin[offset:offset+seq_len].unsqueeze(0).unsqueeze(0)
        return cos, sin

def apply_rotary(x, cos, sin):
    # Interleaved-pair rotation. The cos/sin layout must match whatever
    # convention the checkpoint was trained with (here RotaryEmbedding builds
    # them via torch.cat((freqs, freqs))).
    x1, x2 = x[..., ::2], x[..., 1::2]
    x_rot = torch.stack((-x2, x1), dim=-1).flatten(-2)
    return x * cos + x_rot * sin

class PTLayerNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-5):
        super().__init__()
        self.ln = nn.LayerNorm(hidden_size, eps=eps)
    def forward(self, x):
        return self.ln(x)

class GlobalSDPAHead(nn.Module):
    def __init__(self, d_model, head_dim, dropout, rope_theta, rotary_pct, cfg):
        super().__init__()
        self.q_proj = nn.Linear(d_model, head_dim, bias=False)
        self.k_proj = nn.Linear(d_model, head_dim, bias=False)
        self.v_proj = nn.Linear(d_model, head_dim, bias=False)
        self.rotary_dim = int(head_dim * rotary_pct)
        self.dropout_p = dropout
        self.rope = None
        if self.rotary_dim > 0:
            self.rope = RotaryEmbedding(
                self.rotary_dim, base=rope_theta,
                scaling_type=cfg.rope_scaling_type,
                scaling_factor=cfg.rope_scaling_factor,
            )
    def forward(self, x, position_offset):
        if isinstance(position_offset, torch.Tensor):
            position_offset = int(position_offset.view(-1)[0].item())
        else:
            position_offset = int(position_offset)
        B, T, C = x.shape
        q, k, v = self.q_proj(x), self.k_proj(x), self.v_proj(x)
        if self.rotary_dim > 0:
            cos, sin = self.rope(q, seq_len=T, offset=position_offset)
            cos = cos.squeeze(1); sin = sin.squeeze(1)
            q_rot = apply_rotary(q[..., :self.rotary_dim], cos, sin)
            k_rot = apply_rotary(k[..., :self.rotary_dim], cos, sin)
            q = torch.cat([q_rot, q[..., self.rotary_dim:]], dim=-1)
            k = torch.cat([k_rot, k[..., self.rotary_dim:]], dim=-1)
        q, k, v = [t.unsqueeze(1) for t in (q, k, v)]
        dropout_p = self.dropout_p if self.training else 0.0
        out = F.scaled_dot_product_attention(q, k, v, is_causal=True, dropout_p=dropout_p)
        return out.squeeze(1)

class AttentionMoERouter(nn.Module):
    def __init__(self, d_model, num_experts, top_k):
        super().__init__()
        self.top_k = top_k
        self.num_experts = num_experts
        self.gate_proj = nn.Linear(d_model, num_experts, bias=False)
        nn.init.normal_(self.gate_proj.weight, mean=0.0, std=0.01)
    def forward(self, x, budget_ratio, temperature):
        seq_embed = x.mean(dim=1)
        logits = self.gate_proj(seq_embed) / max(1e-6, float(temperature))
        logits = logits.clamp(min=-10.0, max=10.0)
        k_target = max(1, int(round(self.top_k * (0.25 + 0.75 * budget_ratio))))
        k_target = min(k_target, logits.size(-1))
        vals, idx = torch.topk(logits, k_target, dim=-1)
        weights = F.softmax(vals.to(torch.float32), dim=-1).to(x.dtype)
        mask = torch.zeros_like(logits, dtype=torch.bool)
        mask.scatter_(1, idx, True)
        with torch.no_grad():
            p = F.softmax(logits, dim=-1)
            entropy = -(p * (p.clamp_min(1e-12)).log()).sum(dim=-1).mean()
        return mask, weights, idx, entropy, logits

class MoEAttention(nn.Module):
    def __init__(self, cfg: AdaptiveRiverConfig):
        super().__init__()
        self.d_model = cfg.d_model
        self.n_experts = cfg.attn_n_experts
        self.cfg = cfg
        self.head_dim = cfg.d_model // cfg.attn_n_orig_heads
        self.rotary_dim = int(self.head_dim * cfg.rotary_pct)
        self.router = AttentionMoERouter(cfg.d_model, cfg.attn_n_experts, cfg.attn_top_k)
        self.q_proj = nn.Linear(cfg.d_model, self.n_experts * self.head_dim, bias=False)
        self.k_proj = nn.Linear(cfg.d_model, self.n_experts * self.head_dim, bias=False)
        self.v_proj = nn.Linear(cfg.d_model, self.n_experts * self.head_dim, bias=False)
        self.rope = None
        if self.rotary_dim > 0:
            self.rope = RotaryEmbedding(
                self.rotary_dim, base=cfg.rope_theta,
                scaling_type=cfg.rope_scaling_type,
                scaling_factor=cfg.rope_scaling_factor,
            )
        self.o_proj = nn.Linear(cfg.attn_n_experts * self.head_dim, cfg.d_model, bias=False)
    def forward(self, x, position_offset, budget_ratio, temperature):
        B, T, C = x.shape
        E, H = self.n_experts, self.head_dim
        sel_mask, gate_w, gate_idx, entropy, gate_logits = self.router(x, budget_ratio, temperature)
        q = self.q_proj(x).view(B, T, E, H).permute(0, 2, 1, 3)
        k = self.k_proj(x).view(B, T, E, H).permute(0, 2, 1, 3)
        v = self.v_proj(x).view(B, T, E, H).permute(0, 2, 1, 3)
        if self.rope:
            if isinstance(position_offset, torch.Tensor):
                position_offset = int(position_offset.view(-1)[0].item())
            else:
                position_offset = int(position_offset)
            cos, sin = self.rope(q, seq_len=T, offset=position_offset)
            cos = cos.squeeze(1); sin = sin.squeeze(1)
            q_rot = apply_rotary(q[..., :self.rotary_dim], cos, sin)
            k_rot = apply_rotary(k[..., :self.rotary_dim], cos, sin)
            q = torch.cat([q_rot, q[..., self.rotary_dim:]], dim=-1)
            k = torch.cat([k_rot, k[..., self.rotary_dim:]], dim=-1)
        q_b = q.reshape(B * E, T, H)
        k_b = k.reshape(B * E, T, H)
        v_b = v.reshape(B * E, T, H)
        dropout_p = self.cfg.dropout if self.training else 0.0
        out_b = F.scaled_dot_product_attention(q_b, k_b, v_b, is_causal=True, dropout_p=dropout_p)
        out = out_b.view(B, E, T, H).permute(0, 2, 1, 3)
        W = torch.zeros(B, E, device=x.device, dtype=out.dtype)
        W.scatter_(1, gate_idx, gate_w.to(out.dtype))
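        # Equivalent to out * W[:, None, :, None]: scale each expert's output
        # by its gate weight (zero for experts the router did not select).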
        weighted_out = torch.einsum('b t e h, b e -> b t e h', out, W)
        y = weighted_out.reshape(B, T, E * H).to(self.o_proj.weight.dtype)
        y = self.o_proj(y)
        with torch.no_grad():
            usage = sel_mask.float().mean(dim=0)
            expected = sel_mask.float().sum(dim=-1).mean()
            den = torch.clamp(expected, min=1e-6)
            usage_norm = usage / den
            uniform = 1.0 / self.n_experts
            attn_lb = ((usage_norm - uniform) ** 2).sum()
            attn_rz = (gate_logits ** 2).mean()
            head_keep = sel_mask.float().mean()
        return y, {
            "head_entropy": entropy,
            "head_keep_frac": head_keep,
            "attn_load_balance_loss": attn_lb,
            "attn_router_z_loss": attn_rz,
        }

class ExpertFFN(nn.Module):
    def __init__(self, d_model: int, d_ff: int, dropout: float):
        super().__init__()
        self.w1 = nn.Linear(d_model, d_ff, bias=False)
        self.w2 = nn.Linear(d_ff, d_model, bias=False)
        self.dropout_p = dropout
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.w1(x)
        x = F.gelu(x, approximate="tanh")
        x = F.dropout(x, p=self.dropout_p, training=self.training)
        x = self.w2(x)
        return x

class MoEFFN(nn.Module):
    def __init__(self, d_model: int, d_ff: int, n_experts: int, top_k: int, dropout: float, cfg: AdaptiveRiverConfig):
        super().__init__()
        self.n_experts = n_experts
        self.base_top_k = top_k
        self.cfg = cfg
        self.router = nn.Linear(d_model, n_experts, bias=False)
        self.w1_stacked = nn.Parameter(torch.empty(n_experts, d_ff, d_model))
        self.w2_stacked = nn.Parameter(torch.empty(n_experts, d_model, d_ff))
        std = cfg.init_std
        nn.init.normal_(self.router.weight, mean=0.0, std=std)
        nn.init.normal_(self.w1_stacked, mean=0.0, std=std)
        nn.init.normal_(self.w2_stacked, mean=0.0, std=std)
    def forward(self, x: torch.Tensor, budget_ratio: float):
        B, T, C = x.shape
        N = B * T
        X = x.reshape(N, C)
        k_target = max(1, int(round(self.base_top_k * (0.5 + budget_ratio / 2.0))))
        k_target = min(k_target, self.n_experts)
        scores = self.router(X).to(torch.float32).clamp(min=-10.0, max=10.0)
        probs = F.softmax(scores, dim=-1).to(X.dtype)
        mask = topk_mask_ste(scores, k=k_target).to(X.dtype)
        gate = (mask * probs)
        gate = gate / gate.sum(dim=-1, keepdim=True).clamp_min(1e-6)
        x_ff = torch.einsum('n c, e d c -> n e d', X, self.w1_stacked)
        x_act = F.gelu(x_ff, approximate="tanh")
        y_experts = torch.einsum('n e d, e c d -> n e c', x_act, self.w2_stacked)
        y = torch.einsum('n e, n e c -> n c', gate, y_experts).view(B, T, C).to(x.dtype)
        with torch.no_grad():
            entropy = (-probs * probs.clamp_min(1e-12).log()).sum(dim=-1).mean()
            router_z = (scores ** 2).mean().clamp(max=10.0)
            frac = mask.mean(dim=0)
            uniform = 1.0 / self.n_experts
            lb = ((frac - uniform) ** 2).sum()
        return y, {
            "router_entropy": entropy,
            "ffn_expert_usage": frac.detach(),
            "ffn_load_balance_loss": lb,
            "ffn_router_z_loss": router_z,
        }
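
def _demo_budget_to_k():
    # Illustrative sketch only (not called anywhere): how budget_ratio maps to
    # the number of active experts under the two routing formulas above, for
    # the default attention top_k=6 and FFN top_k=1.
    for budget in (0.0, 0.5, 1.0):
        attn_k = max(1, int(round(6 * (0.25 + 0.75 * budget))))  # 2, 4, 6
        ffn_k = max(1, int(round(1 * (0.5 + budget / 2.0))))     # 1, 1, 1
        print(f"budget={budget}: attn experts={attn_k}, ffn experts={ffn_k}")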

class MambaBlock(nn.Module):
    def __init__(self, cfg: AdaptiveRiverConfig, enhanced: bool = False, layer_idx: int | None = None):
        super().__init__()
        if not _HAS_MAMBA:
            print(f"MambaBlock Layer {layer_idx} disabled: mamba-ssm not installed.")
            self.mamba = None
            return
        self.cfg = cfg
        self.ln1 = PTLayerNorm(cfg.d_model, eps=cfg.layer_norm_eps)
        self.mamba = Mamba(
            d_model=cfg.d_model,
            d_state=cfg.mamba_d_state,
            d_conv=cfg.mamba_d_conv,
            expand=cfg.mamba_expand * (2 if enhanced else 1),
            layer_idx=layer_idx,
        )
        self.ln2 = PTLayerNorm(cfg.d_model, eps=cfg.layer_norm_eps)
        self.ffn = nn.Sequential(
            nn.Linear(cfg.d_model, cfg.d_ff * (2 if enhanced else 1), bias=False),
            nn.GELU(approximate="tanh"),
            nn.Linear(cfg.d_ff * (2 if enhanced else 1), cfg.d_model, bias=False),
        )
    def forward(
        self,
        x,
        attn_mask=None,
        position_offset: int | torch.Tensor = 0,
        past_kv=None,
        budget_ratio: float = 1.0,
        use_cache: bool = False,
        mamba_state: Optional[InferenceParams] = None,
    ):
        if not _HAS_MAMBA or self.mamba is None:
            stats = {"head_entropy": torch.tensor(0.0, device=x.device),
                     "head_keep_frac": torch.tensor(1.0, device=x.device),
                     "mamba_out_l2": torch.tensor(0.0, device=x.device)}
            return x, stats, (None, None)
        h = self.ln1(x)
        x_m = self.mamba(h)  # stateless path
        m_out_l2 = x_m.float().pow(2).mean()
        x = x + x_m
        h2 = self.ln2(x)
        x = x + self.ffn(h2)
        stats = {
            "head_entropy": torch.tensor(0.0, device=x.device),
            "head_keep_frac": torch.tensor(1.0, device=x.device),
            "mamba_out_l2": m_out_l2.detach(),
        }
        return x, stats, (None, None)

class RoutedBlock(nn.Module):
    def __init__(self, cfg: AdaptiveRiverConfig):
        super().__init__()
        self.cfg = cfg
        self.ln1 = PTLayerNorm(cfg.d_model, eps=cfg.layer_norm_eps)
        self.ln2 = PTLayerNorm(cfg.d_model, eps=cfg.layer_norm_eps)
        self.attn = MoEAttention(cfg)
        self.ffn = MoEFFN(cfg.d_model, cfg.d_ff, cfg.experts_per_layer, cfg.top_k_ffn, cfg.moe_dropout, cfg)
    def _attn_forward(self, h: torch.Tensor, position_offset: int, budget_ratio: float):
        if isinstance(position_offset, torch.Tensor):
            position_offset = int(position_offset.view(-1)[0].item())
        else:
            position_offset = int(position_offset)
        return self.attn(h, position_offset, budget_ratio, self.cfg.gate_temperature)
    def forward(
        self,
        x,
        attn_mask=None,
        position_offset: int | torch.Tensor = 0,
        past_kv=None,
        budget_ratio: float = 1.0,
        use_cache: bool = False,
        mamba_state: Optional[InferenceParams] = None,
    ):
        h = self.ln1(x)
        attn_out, attn_stats = self._attn_forward(h, position_offset, budget_ratio)
        x = x + attn_out
        h2 = self.ln2(x)
        ffn_out, moe_stats = self.ffn(h2, budget_ratio=budget_ratio)
        x = x + ffn_out
        stats = {**attn_stats, **moe_stats}
        return x, stats, (None, None)

class AdaptiveRiverLM(nn.Module):
    def __init__(self, cfg: AdaptiveRiverConfig):
        super().__init__()
        self.cfg = cfg
        self.embed = nn.Embedding(cfg.vocab_size, cfg.d_model)
        self.blocks = nn.ModuleList()
        mamba_layer_counter = 0
        for i in range(cfg.n_layers):
            if i < 2:
                print(f"[model] Layer {i}: Mamba")
                self.blocks.append(MambaBlock(cfg, enhanced=False, layer_idx=mamba_layer_counter)); mamba_layer_counter += 1
            elif i >= (cfg.n_layers - 2):
                print(f"[model] Layer {i}: Mamba (enhanced)")
                self.blocks.append(MambaBlock(cfg, enhanced=True, layer_idx=mamba_layer_counter)); mamba_layer_counter += 1
            else:
                if i == 2:
                    print(f"[model] Layers {i}-{cfg.n_layers-3}: MoE Attention + MoE FFN")
                self.blocks.append(RoutedBlock(cfg))
        self.ln_f = PTLayerNorm(cfg.d_model, eps=cfg.layer_norm_eps)
        self.lm_head = nn.Linear(cfg.d_model, cfg.vocab_size, bias=False)
        if cfg.tie_word_embeddings:
            self.lm_head.weight = self.embed.weight
        self.apply(lambda m: _init_weights(m, cfg.init_std))
    def forward(
        self,
        input_ids: torch.Tensor,
        budget_ratio: Optional[float] = None,
        mamba_states: Optional[List] = None,
        past_kvs: Optional[List] = None,
        position_offset: int | torch.Tensor = 0,
        return_expert_stats: bool = False,
        use_cache: bool = False,
    ):
        x = self.embed(input_ids)
        b = float(self.cfg.default_budget_ratio if budget_ratio is None else budget_ratio)
        all_stats: Dict[str, List[torch.Tensor]] = {}
        for block in self.blocks:
            x, stats, _ = block(
                x,
                position_offset=position_offset,
                past_kv=None,
                budget_ratio=b,
                use_cache=False,
                mamba_state=None,
            )
            for k, v in stats.items():
                all_stats.setdefault(k, []).append(torch.as_tensor(v.detach() if isinstance(v, torch.Tensor) else v))
        agg_stats = {k: torch.stack(v).mean() for k, v in all_stats.items() if len(v) > 0}
        x = self.ln_f(x)
        logits = self.lm_head(x)
        return logits, agg_stats

def estimate_1b_config() -> AdaptiveRiverConfig:
    return AdaptiveRiverConfig(
        vocab_size=50257,
        d_model=1024,
        n_layers=24,
        d_ff=4096,
        experts_per_layer=4,
        top_k_ffn=1,
        default_budget_ratio=1.0,
        attn_n_experts=6,
        attn_top_k=6,
        attn_n_orig_heads=16,
        mamba_d_state=16,
        mamba_d_conv=4,
        mamba_expand=2,
        gate_temperature=0.7,
        head_entropy_weight=1e-4,
        checkpoint_attn_thresh=0.35,
        checkpoint_ffn_thresh=0.35,
        load_balance_weight=0.01,
        router_z_weight=0.001,
        tie_word_embeddings=False,
    )
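
def _smoke_test_forward():
    # Illustrative sketch only (not called anywhere): a tiny CPU forward pass
    # to sanity-check shapes. The config values here are made up for the demo;
    # the Mamba layers reduce to identity when mamba-ssm is unavailable, and
    # the real mamba-ssm kernels may require CUDA.
    cfg = AdaptiveRiverConfig(
        vocab_size=128, d_model=64, n_layers=6, d_ff=128,
        experts_per_layer=2, top_k_ffn=1,
        attn_n_experts=4, attn_top_k=2, attn_n_orig_heads=4,
    )
    model = AdaptiveRiverLM(cfg).eval()
    with torch.no_grad():
        logits, stats = model(torch.randint(0, cfg.vocab_size, (2, 16)))
    assert logits.shape == (2, 16, cfg.vocab_size)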

# ----------------------------------------------------------------------------
# Inference (stateless) with proper end-of-turn handling
# ----------------------------------------------------------------------------

class FastInferenceTester:
    def __init__(self, model, tokenizer, device, im_start_id, im_end_id, eos_id, pad_id):
        self.model = model
        self.tokenizer = tokenizer
        self.device = device
        self.im_start_id = im_start_id
        self.im_end_id = im_end_id
        self.eos_id = eos_id
        self.pad_id = pad_id

        self.model.eval()
        torch.set_grad_enabled(False)
        print("Using model's native precision")

        if _HAS_MAMBA:
            print("Skipping torch.compile due to mamba-ssm kernels.")
        elif hasattr(torch, "compile"):
            try:
                print("Compiling model with torch.compile...")
                self.model = torch.compile(self.model, mode="reduce-overhead")
                print("Model compiled successfully")
            except Exception as e:
                print(f"Could not compile model: {e}")
                print("Running without compilation")

    def _format_to_training_chat(self, prompt: str) -> torch.Tensor:
        messages = [{"role": "user", "content": prompt}]
        formatted = self.tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        input_ids = self.tokenizer.encode(
            formatted, add_special_tokens=False, return_tensors="pt"
        ).to(self.device)
        return input_ids
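
    # The template is expected to yield a ChatML-style transcript, e.g.
    #   <|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n
    # (an assumption: the tokenizer's chat template must match training).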

    def _postprocess_like_training(self, text: str) -> str:
        if "<|im_start|>assistant" in text:
            return text.split("<|im_start|>assistant")[-1].split("<|im_end|>")[0].strip()
        if "assistant\n" in text:
            return text.split("assistant\n")[-1].split("<|im_end|>")[0].strip()
        return text.split("<|im_end|>")[0].strip()

    def _reset_mamba_states(self):
        if not _HAS_MAMBA:
            return
        for block in self.model.blocks:
            if isinstance(block, MambaBlock) and hasattr(block, "mamba"):
                for attr in ("inference_params", "conv_state", "ssm_state"):
                    if hasattr(block.mamba, attr):
                        setattr(block.mamba, attr, None)

    def generate_once(
        self,
        prompt: str,
        max_tokens: int = 2000,
        temperature: float = 0.8,
        top_p: float = 1.0,
        top_k: int = 0,
        budget_ratio: float = 1.0,
        show_tokens: bool = False,
        min_new_tokens: int = 3,
    ) -> Dict:
        self._reset_mamba_states()

        print(f"\n{'='*80}")
        print("FAST GENERATION (no cache)")
        print(f"{'='*80}")
        print(f"Prompt: {prompt}")
        print("─" * 80)

        input_ids = self._format_to_training_chat(prompt)

        generated_tokens: List[int] = []
        token_times: List[float] = []
        stop_ids = set(t for t in [self.im_end_id, self.eos_id] if t is not None)
        ban_initial_ids = set(t for t in [self.im_end_id, self.eos_id, self.im_start_id, self.pad_id] if t is not None)

        start_time = time.time()

        with torch.inference_mode():
            # Prefill over full prompt
            logits, _ = self.model(
                input_ids,
                budget_ratio=budget_ratio,
                position_offset=0,
                use_cache=False
            )
            next_token_logits = logits[:, -1, :]            # [1, vocab]

            print("Generating...", end=" ", flush=True)
            is_cuda = torch.cuda.is_available()
            buffer = []  # small output buffer for streaming

            for _ in range(max_tokens):
                if is_cuda:
                    torch.cuda.synchronize()
                t0 = time.time()

                # 1D view for sampling/masking
                logits_for_sampling = next_token_logits.squeeze(0).clone() / max(1e-6, temperature)
                vocab_size = logits_for_sampling.size(0)

                # Ban structural tokens at the very start
                if len(generated_tokens) < min_new_tokens and min_new_tokens > 0:
                    for tid in ban_initial_ids:
                        if tid is not None and 0 <= tid < vocab_size:
                            logits_for_sampling[tid] = float("-inf")

                # Top-k
                if top_k and top_k > 0:
                    kth = torch.topk(logits_for_sampling, top_k)[0][-1]
                    logits_for_sampling[logits_for_sampling < kth] = float("-inf")

                # Top-p
                if top_p < 1.0:
                    sorted_logits, sorted_indices = torch.sort(logits_for_sampling, descending=True)
                    cumulative_probs = torch.softmax(sorted_logits, dim=-1).cumsum(dim=-1)
                    sorted_indices_to_remove = cumulative_probs > top_p
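                    # Shift the removal mask right so the first token that
                    # crosses the top_p mass is still kept, and the single
                    # most probable token is never removed.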
                    sorted_indices_to_remove[1:] = sorted_indices_to_remove[:-1].clone()
                    sorted_indices_to_remove[0] = False
                    remove_idx = sorted_indices[sorted_indices_to_remove]
                    logits_for_sampling[remove_idx] = float("-inf")

                # Sample
                probs = F.softmax(logits_for_sampling, dim=-1)
                next_token_id = torch.multinomial(probs, num_samples=1).item()

                generated_tokens.append(next_token_id)

                # Decode + buffered print
                if show_tokens:
                    tok_text = self.tokenizer.decode([next_token_id], skip_special_tokens=False)
                    buffer.append(tok_text)
                    if len(buffer) >= 16:
                        print("".join(buffer), end="", flush=True)
                        buffer.clear()

                # Stop on EOT/EOS after min_new_tokens
                if (next_token_id in stop_ids) and (len(generated_tokens) >= max(1, min_new_tokens)):
                    if buffer:
                        print("".join(buffer), end="", flush=True)
                        buffer.clear()
                    if show_tokens:
                        print(" [EOT]", flush=True)
                    break

                # Stateless decode: append token and re-run forward
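                # (no KV cache: every step re-runs the full prefix, so per-step
                # cost grows with sequence length; acceptable for testing only)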
                input_ids = torch.cat(
                    [input_ids, torch.tensor([[next_token_id]], device=self.device)],
                    dim=1
                )
                logits, _ = self.model(
                    input_ids,
                    budget_ratio=budget_ratio,
                    position_offset=0,
                    use_cache=False
                )
                next_token_logits = logits[:, -1, :]

                if is_cuda:
                    torch.cuda.synchronize()
                token_times.append(time.time() - t0)

            # Flush any remaining buffered tokens
            if buffer:
                print("".join(buffer), end="", flush=True)
                buffer.clear()

        total_time = time.time() - start_time
        text = self.tokenizer.decode(generated_tokens, skip_special_tokens=False)
        text = self._postprocess_like_training(text)

        if show_tokens and (not generated_tokens or (generated_tokens[-1] not in stop_ids)):
            print()

        num_gen = len(generated_tokens)
        if num_gen == 0:
            print("\nNo tokens generated.")
            return {'output': '', 'tokens_per_sec': 0, 'decode_tps': 0, 'total_time': total_time, 'num_tokens': 0}

        decode_time = sum(token_times)
        toks_per_sec = num_gen / total_time if total_time > 0 else 0
        decode_tps = num_gen / decode_time if decode_time > 0 else 0

        print("\n" + "─" * 80)
        print("STATISTICS")
        print("─" * 80)
        print(f"Tokens:        {num_gen}")
        print(f"Total time:    {total_time:.2f}s")
        print(f"Overall speed: {toks_per_sec:.1f} tok/s (includes prompt)")
        print(f"Decode speed:  {decode_tps:.1f} tok/s (generation only)")
        print(f"Time/token:    {(decode_time/num_gen)*1000:.1f}ms")
        print("─" * 80)
        print(f"Output: {text[:100]}{'...' if len(text) > 100 else ''}")
        print("=" * 80 + "\n")

        self._reset_mamba_states()

        return {
            'output': text,
            'tokens_per_sec': toks_per_sec,
            'decode_tps': decode_tps,
            'total_time': total_time,
            'num_tokens': num_gen,
        }

    def interactive_mode(self):
        print("\n" + "=" * 80)
        print("INTERACTIVE MODE (no cache, stateless)")
        print("Type 'quit' or your prompt")
        print("=" * 80 + "\n")
        while True:
            try:
                prompt = input("\nYou: ")
            except (EOFError, KeyboardInterrupt):
                print("\nBye.")
                break
            if prompt.lower() in ["quit", "exit", "q"]:
                break
            if not prompt.strip():
                continue
            print("\nAssistant: ", end="", flush=True)
            self.generate_once(prompt, max_tokens=2000, temperature=0.8, show_tokens=True)

def _cast_layernorm_fp32(module: nn.Module):
    for m in module.modules():
        if isinstance(m, nn.LayerNorm):
            m.float()

def load_model_and_tokenizer(model_dir: str):
    """
    Load AdaptiveRiverLM model and tokenizer from a folder layout like:

        model_dir/
            checkpoint.pt  (or any .pt file)
            tokenizer/
                tokenizer.json
                special_tokens_map.json
                ...

    Automatically finds the .pt file if not explicitly named.
    """
    print(f"Searching for model checkpoint in: {model_dir}")
    ckpts = glob.glob(os.path.join(model_dir, "*.pt"))
    if not ckpts:
        raise FileNotFoundError(f"No .pt checkpoint found in {model_dir}")
    if len(ckpts) > 1:
        print(f"[Warning] Multiple .pt files found, using: {ckpts[0]}")
    checkpoint_path = ckpts[0]

    tokenizer_path = os.path.join(model_dir, "tokenizer")
    if not os.path.isdir(tokenizer_path):
        raise FileNotFoundError(f"Missing tokenizer directory: {tokenizer_path}")

    print(f"Loading tokenizer from: {tokenizer_path}")
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, use_fast=True, trust_remote_code=True)
    if tokenizer.pad_token is None:
        print("Tokenizer missing pad_token. Assigning eos_token as pad_token.")
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.pad_token_id = tokenizer.eos_token_id

    print("Building model (AdaptiveRiverLM)...")
    cfg = estimate_1b_config()
    cfg.vocab_size = len(tokenizer)
    cfg.tie_word_embeddings = False

    model = AdaptiveRiverLM(cfg)

    print(f"Loading checkpoint: {checkpoint_path}")
    state = torch.load(checkpoint_path, map_location="cpu")
    # Unwrap common checkpoint containers if present (e.g. {"model": ...}).
    if isinstance(state, dict):
        for container_key in ("model", "state_dict", "model_state_dict"):
            if container_key in state and isinstance(state[container_key], dict):
                state = state[container_key]
                break
    model_state_dict = model.state_dict()
    converted_state = {}

    # Keep only tensors whose name and shape match the freshly built model.
    for k, param in model_state_dict.items():
        if k in state and isinstance(state[k], torch.Tensor) and state[k].shape == param.shape:
            converted_state[k] = state[k]

    print("Loading weights...")
    load_result = model.load_state_dict(converted_state, strict=False)

    if load_result.missing_keys:
        print("\n--- Missing Keys ---")
        for k in load_result.missing_keys:
            print(" ", k)
    if load_result.unexpected_keys:
        print("\n--- Unexpected Keys ---")
        for k in load_result.unexpected_keys:
            print(" ", k)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = model.to(device)

    if device == "cuda" and torch.cuda.is_bf16_supported():
        _cast_layernorm_fp32(model)
        model = model.to(torch.bfloat16)
    else:
        model = model.to(torch.float32)

    model.eval()
    print(f"Model and tokenizer loaded successfully from {model_dir} on {device}")
    return model, tokenizer, device
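
# Example usage (hypothetical path; the directory must hold a .pt checkpoint
# and a tokenizer/ subfolder):
#   model, tokenizer, device = load_model_and_tokenizer("runs/river-1b")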


def main():
    parser = argparse.ArgumentParser(description="Stateless inference for AdaptiveRiverLM (no KV cache), proper EOT handling")
    parser.add_argument("--model_dir", type=str, required=True, help="Path to model folder (with checkpoint.pt and tokenizer/)")
    parser.add_argument("--prompt", type=str, default="Hello, my name is")
    parser.add_argument("--max_tokens", type=int, default=2000)
    parser.add_argument("--temperature", type=float, default=0.8)
    parser.add_argument("--top_p", type=float, default=1.0)
    parser.add_argument("--top_k", type=int, default=0)
    parser.add_argument("--min_new_tokens", type=int, default=3)
    parser.add_argument("--interactive", action="store_true", help="Interactive mode (stateless)")
    args = parser.parse_args()

    model, tokenizer, device = load_model_and_tokenizer(args.model_dir)

    # Resolve special token IDs for end-of-turn handling
    im_end_id   = tokenizer.convert_tokens_to_ids("<|im_end|>")
    im_start_id = tokenizer.convert_tokens_to_ids("<|im_start|>")
    eos_id      = tokenizer.eos_token_id
    pad_id      = tokenizer.pad_token_id

    tester = FastInferenceTester(model, tokenizer, device, im_start_id, im_end_id, eos_id, pad_id)

    if args.interactive:
        tester.interactive_mode()
    else:
        tester.generate_once(
            args.prompt,
            max_tokens=args.max_tokens,
            temperature=args.temperature,
            top_p=args.top_p,
            top_k=args.top_k,
            show_tokens=True,
            min_new_tokens=args.min_new_tokens,
        )

if __name__ == "__main__":
    main()