#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
LULUV2 optimized local inference engine.

Goals:
- load LULU2/LULUV2 checkpoints through the existing LULUV2 model file
- no AutoModelForCausalLM.from_pretrained and no external model weights
- vectorized prompt prefill into explicit KV caches
- persistent session KV cache across turns when prompt tokens extend prior prompt
- modes: fast(pass1/base), vwm(pass1+pass2), deep(pass1+pass2 long context)
- safe fallback to slow full-prefix forward if cached path fails

This is intentionally Python-first and debuggable.  It is a bridge toward
kernel/CUDA-graph optimization, not the final kernel path.
"""
from __future__ import annotations

import importlib.util
import json
import math
import os
import platform
import time
import traceback
from contextlib import nullcontext
from dataclasses import dataclass, asdict
from pathlib import Path
from types import SimpleNamespace
from typing import Any, Dict, Generator, List, Optional, Tuple

import torch
import torch.nn.functional as F

try:
    import psutil
except Exception:
    psutil = None

try:
    import pynvml
except Exception:
    pynvml = None

# Chat-template markers and role labels treated as stop strings by
# clean_text(): the text is cut at the earliest marker found past index 0,
# and every remaining occurrence is then deleted.
STOP_STRINGS = [
    "<|im_start|>", "<|im_end|>", "<|user|>", "<|system|>", "<|assistant|>",
    "User:", "Assistant:", "\nuser:", "\nassistant:",
]


def setup_torch() -> None:
    """Apply best-effort global PyTorch performance switches.

    On CUDA machines this turns on TF32 for matmul/cuDNN and selects the
    flash and memory-efficient SDPA backends while disabling the math one.
    Independently of CUDA, float32 matmul precision is set to "high" when the
    installed torch exposes that setter.  Every group of switches is guarded
    separately, so a failure in one group never prevents the next from
    running and never propagates to the caller.
    """
    if torch.cuda.is_available():
        # Legacy TF32 flags: still accepted by current wheels, any warning
        # they emit is harmless.
        try:
            torch.backends.cuda.matmul.allow_tf32 = True
            torch.backends.cudnn.allow_tf32 = True
        except Exception:
            pass

        # SDPA backend selection; attributes are looked up one at a time so
        # a missing later toggle does not undo the earlier ones.
        sdp_toggles = (
            ("enable_flash_sdp", True),
            ("enable_mem_efficient_sdp", True),
            ("enable_math_sdp", False),
        )
        try:
            for toggle_name, enabled in sdp_toggles:
                getattr(torch.backends.cuda, toggle_name)(enabled)
        except Exception:
            pass

    if not hasattr(torch, "set_float32_matmul_precision"):
        return
    try:
        torch.set_float32_matmul_precision("high")
    except Exception:
        pass


def human_bytes(num: float) -> str:
    """Format a byte count with a binary-scaled unit and two decimals.

    The value is divided by 1024 until its magnitude drops below 1024 or the
    units B..TB are exhausted, in which case the remainder is reported in PB.
    """
    size = float(num)
    units = ("B", "KB", "MB", "GB", "TB")
    index = 0
    while index < len(units):
        if abs(size) < 1024.0:
            return f"{size:.2f} {units[index]}"
        size /= 1024.0
        index += 1
    return f"{size:.2f} PB"


def value_to_text(value: Any) -> str:
    """Flatten an arbitrary message payload into plain text.

    Rules, in order:
    - ``None`` becomes the empty string and ``str`` is returned unchanged.
    - A dict that contains one of the keys ``"text"``, ``"content"`` or
      ``"value"`` (checked in that order) is represented by that entry alone.
    - Any other dict, and any list/tuple, is flattened by converting each
      element and joining the non-empty results with newlines.
    - Everything else goes through ``str()``.

    Each child is converted exactly once.  Converting it once for the filter
    and again for the joined value doubles the work at every nesting level,
    which is exponential in the depth of the structure.
    """
    if value is None:
        return ""
    if isinstance(value, str):
        return value
    if isinstance(value, dict):
        for key in ("text", "content", "value"):
            if key in value:
                return value_to_text(value.get(key))
        children = value.values()
    elif isinstance(value, (list, tuple)):
        children = value
    else:
        return str(value)
    rendered = (value_to_text(child) for child in children)
    return "\n".join(piece for piece in rendered if piece)


def clean_text(text: Any) -> str:
    """Normalise model or user text for display and for prompt history.

    Steps: flatten the payload to text, turn literal backslash-n sequences
    into real newlines, cut at the earliest stop string that occurs past
    index 0, delete any stop strings still present, strip a leading speaker
    label, right-strip every line, and allow at most two consecutive blank
    lines.
    """
    cleaned = value_to_text(text).replace("\\n", "\n")

    # str.find returns -1 when absent, so "> 0" filters both missing markers
    # and markers sitting at the very start of the text.
    hits = [pos for pos in (cleaned.find(marker) for marker in STOP_STRINGS) if pos > 0]
    if hits:
        cleaned = cleaned[: min(hits)]
    for marker in STOP_STRINGS:
        cleaned = cleaned.replace(marker, "")
    cleaned = cleaned.strip()

    for label in ("Assistant:", "assistant:", "Lulu:", "lulu:"):
        if cleaned.startswith(label):
            cleaned = cleaned[len(label):].strip()

    # Collapse long runs of blank lines while leaving other lines alone, so
    # code blocks are not mangled more than necessary.
    kept: List[str] = []
    blank_run = 0
    for raw_line in cleaned.splitlines():
        line = raw_line.rstrip()
        if line.strip():
            blank_run = 0
            kept.append(line)
            continue
        blank_run += 1
        if blank_run <= 2:
            kept.append("")
    return "\n".join(kept).strip()


def normalize_history(history: Any) -> List[Dict[str, str]]:
    """Convert chat history into a flat list of role/content dicts.

    Two item shapes are understood: dicts with ``role``/``content`` keys
    (only the roles ``user`` and ``assistant`` are kept) and
    ``(user, assistant)`` pairs given as a list or tuple.  Content is passed
    through ``clean_text`` and entries that end up empty are dropped.  Items
    of any other shape are ignored.
    """
    turns: List[Dict[str, str]] = []
    for entry in history or ():
        if isinstance(entry, dict):
            role = entry.get("role", "")
            body = clean_text(entry.get("content", ""))
            if role in {"user", "assistant"} and body:
                turns.append({"role": role, "content": body})
            continue
        if isinstance(entry, (tuple, list)) and len(entry) >= 2:
            user_text = clean_text(entry[0])
            assistant_text = clean_text(entry[1])
            for role, body in (("user", user_text), ("assistant", assistant_text)):
                if body:
                    turns.append({"role": role, "content": body})
    return turns


def resolve_model_py(model_py: Optional[str]) -> str:
    """Return the resolved path of the LULUV2 model source file.

    The explicit ``model_py`` (when given) is tried first, then the default
    file name relative to the current working directory.

    Raises:
        FileNotFoundError: when none of the candidates exists.
    """
    search_order: List[str] = [model_py] if model_py else []
    search_order.append("luluv2_inference_runtime.py")
    for candidate in map(Path, search_order):
        if candidate.exists():
            return str(candidate.resolve())
    raise FileNotFoundError("Could not find LULUV2 model file. Pass --model-py.")


def import_model_py(model_py: Optional[str]):
    """Load the model source file as a module and return ``(module, path)``.

    The file is located with ``resolve_model_py`` and executed under the
    module name ``luluv2_runtime_module``.

    Raises:
        FileNotFoundError: propagated from ``resolve_model_py``.
        RuntimeError: when no import spec or loader can be created.
    """
    resolved = resolve_model_py(model_py)
    spec = importlib.util.spec_from_file_location("luluv2_runtime_module", resolved)
    loader = None if spec is None else spec.loader
    if loader is None:
        raise RuntimeError(f"Could not import model file: {resolved}")
    module = importlib.util.module_from_spec(spec)
    loader.exec_module(module)
    return module, resolved


@dataclass
class GenerationConfig:
    """Per-request generation settings.

    Only some of these fields are read in the part of the file reviewed
    here; the rest are presumably consumed by the sampling/generation loop
    further down -- confirm there before relying on their exact semantics.
    """
    max_new_tokens: int = 512
    # _sample_next treats temperature <= 0 the same as greedy=True.
    temperature: float = 0.65
    top_k: int = 40
    top_p: float = 0.90
    min_p: float = 0.03
    # Read by _apply_penalties: a value of 1.0 disables the repetition penalty.
    repetition_penalty: float = 1.10
    # Read by _apply_penalties: subtracted once per prior occurrence; 0 disables.
    frequency_penalty: float = 0.02
    greedy: bool = False
    # Read by _apply_penalties: n-gram size to ban; values <= 1 disable the ban.
    no_repeat_ngram: int = 4
    stream_every: int = 1
    # Read by _prepare_cached_context as the prompt/KV-cache length cap.
    max_context_tokens: int = 4096
    # Aliases are canonicalised by _effective_mode; unknown values map to "vwm".
    mode: str = "vwm"  # fast, vwm, deep, slow
    return_pass_metrics: bool = True
    use_cache: bool = True
    # False forces the token-by-token cache builder in _prepare_cached_context.
    vectorized_prefill: bool = True
    # False disables reuse of the previous call's KV cache.
    persistent_cache: bool = True
    compile_step: bool = False


@dataclass
class GenerationStats:
    """Metrics record for one generation call.

    Every field has a neutral default so an instance can be created before
    any work is done (the engine constructor stores one in ``last_stats``).
    The code that fills these fields is not in the part of the file reviewed
    here -- the per-field meaning is inferred from the names; confirm against
    the generation loop.
    """
    prompt_tokens: int = 0
    prompt_total_tokens: int = 0
    prompt_kept_tokens: int = 0
    prompt_dropped_tokens: int = 0
    generated_tokens: int = 0
    elapsed_sec: float = 0.0
    tokens_per_sec: float = 0.0
    prefill_sec: float = 0.0
    prefill_tps: float = 0.0
    cache_hit: bool = False
    cache_reused_tokens: int = 0
    cache_new_prefill_tokens: int = 0
    mode: str = "vwm"
    backend: str = "none"
    last_token: str = ""
    last_token_id: int = -1
    last_token_prob: float = 0.0
    last_entropy: float = 0.0
    finish_reason: str = "none"
    # None until pass-1/pass-2 logits have been compared (see
    # pass_metrics_from_logits, which also returns None on failure).
    pass1_pass2_kl: Optional[float] = None
    pass1_pass2_logit_cosine: Optional[float] = None

class KVLayerCache:
    """Key/value tensors cached for a single decoder layer.

    Tensors are indexed as [B, H, T, Dh]; the time axis (dim 2) is capped to
    the most recent ``max_len`` entries on every write.
    """

    def __init__(self):
        # None means "nothing cached yet".
        self.k: Optional[torch.Tensor] = None  # [B, H, T, Dh]
        self.v: Optional[torch.Tensor] = None

    @property
    def length(self) -> int:
        """Number of cached time steps (0 when empty)."""
        return 0 if self.k is None else int(self.k.shape[2])

    def set(self, k: torch.Tensor, v: torch.Tensor, max_len: int) -> None:
        """Replace the cache with detached, contiguous copies of ``k``/``v``.

        Only the trailing ``max_len`` time steps are kept when the inputs
        are longer than that.
        """
        too_long = k.shape[2] > max_len
        if too_long:
            k, v = k[:, :, -max_len:, :], v[:, :, -max_len:, :]
        self.k = k.detach().contiguous()
        self.v = v.detach().contiguous()

    def append(self, k: torch.Tensor, v: torch.Tensor, max_len: int) -> None:
        """Concatenate new steps on the time axis, keeping the last ``max_len``."""
        if self.k is None:
            self.set(k, v, max_len)
            return
        self.k = torch.cat([self.k, k.detach()], dim=2)
        self.v = torch.cat([self.v, v.detach()], dim=2)
        if self.k.shape[2] <= max_len:
            return
        # Slide the window: drop the oldest steps and re-pack the storage.
        self.k = self.k[:, :, -max_len:, :].contiguous()
        self.v = self.v[:, :, -max_len:, :].contiguous()


class DecoderKVCache:
    """One ``KVLayerCache`` per decoder layer, addressed by layer index."""

    def __init__(self, n_layers: int):
        layer_count = int(n_layers)
        self.layers = [KVLayerCache() for _ in range(layer_count)]

    def clear(self):
        """Drop the cached tensors of every layer."""
        for entry in self.layers:
            entry.k = None
            entry.v = None

    @property
    def length(self) -> int:
        """Cached time steps, as reported by the first layer (0 if no layers)."""
        return self.layers[0].length if self.layers else 0


class LULUV2OptimizedEngine:
    def __init__(
        self,
        ckpt_path: str,
        model_py: Optional[str] = None,
        tokenizer_dir: Optional[str] = None,
        device: Optional[str] = None,
        dtype: str = "bf16",
        local_files_only: bool = True,
        no_config_download: bool = True,
        force_base_only: bool = False,
    ):
        """Load the checkpoint, tokenizer and optional pass-2 wrapper.

        Args:
            ckpt_path: Checkpoint file handed to the model module's loader.
            model_py: Path of the LULUV2 model source file; when omitted the
                default file name is looked up (see ``resolve_model_py``).
            tokenizer_dir: Tokenizer location; when omitted a ``tokenizer``
                directory next to the checkpoint is used if it exists.
            device: Torch device string; defaults to CUDA when available,
                otherwise CPU.
            dtype: Name of the compute dtype (see ``_dtype_from_name``).
            local_files_only: Forwarded to the model module via ``self.args``.
            no_config_download: Forwarded to the model module via ``self.args``.
            force_base_only: Skip the pass-2 wrapper even if the checkpoint
                contains ``pass2_state``.

        Raises:
            FileNotFoundError / RuntimeError: when the model source file
                cannot be found or imported.
        """
        setup_torch()
        self.ckpt_path = str(ckpt_path)
        self.ckpt_dir = Path(self.ckpt_path).resolve().parent
        self.device = self._select_device(device)
        self.dtype = self._dtype_from_name(dtype)
        self.local_files_only = bool(local_files_only)
        self.no_config_download = bool(no_config_download)
        self.force_base_only = bool(force_base_only)
        self.last_stats = GenerationStats()
        self.recent_tokens: List[Dict[str, Any]] = []
        # Prompt accounting written by encode().
        self.last_prompt_total_tokens: int = 0
        self.last_prompt_kept_tokens: int = 0
        self.last_prompt_dropped_tokens: int = 0
        # Persistent KV-cache state shared across calls; managed by
        # _prepare_cached_context() and reset by encode() on truncation.
        self.cache_ids: Optional[torch.Tensor] = None
        self.cache_mode: str = ""
        self.cache_max_context: int = 0
        self.pass1_cache: Optional[DecoderKVCache] = None
        self.pass2_cache: Optional[DecoderKVCache] = None
        self.cached_logits: Optional[torch.Tensor] = None
        self.cached_pass1_logits: Optional[torch.Tensor] = None
        self.cached_pass2_logits: Optional[torch.Tensor] = None
        self.cache_backend: str = "cold"

        # The model module is imported from a file path; self.args mimics the
        # argument namespace its loader functions read.
        self.goku, self.model_py_path = import_model_py(model_py)
        self.args = SimpleNamespace(
            checkpoint=self.ckpt_path,
            tokenizer=tokenizer_dir or "",
            model_id="",
            no_config_download=self.no_config_download,
            local_files_only=self.local_files_only,
        )

        print("[guard] LULUV2 cockpit: no AutoModelForCausalLM.from_pretrained call and no external model weights loaded.")
        print(f"[load] checkpoint={self.ckpt_path}")
        self.base_ckpt, base = self.goku.load_lulu2_base(self.args, self.device, self.dtype)
        # The tokenizer loader needs self.base_ckpt, so it must run after the
        # base checkpoint has been loaded.
        self.tokenizer = self._load_tokenizer(tokenizer_dir)
        self.model, self.has_pass2 = self._maybe_wrap_pass2(base)
        # With pass 2 active, self.model is the wrapper and self.base is the
        # wrapped base model; otherwise both names refer to the same object.
        self.base = self.model.base if self.has_pass2 else self.model
        self.n_layers = int(self.base.config.num_hidden_layers)
        self.model.eval()
        self.base.eval()
        self.model_info = self._build_model_info()
        self._compiled = False

    def _select_device(self, device: Optional[str]):
        if device:
            return torch.device(device)
        if torch.cuda.is_available():
            return torch.device("cuda")
        return torch.device("cpu")

    def _dtype_from_name(self, name: str):
        name = (name or "bf16").lower()
        if name in {"bf16", "bfloat16"}:
            return torch.bfloat16
        if name in {"fp16", "float16", "half"}:
            return torch.float16
        return torch.float32

    def _load_tokenizer(self, tokenizer_dir: Optional[str]):
        if tokenizer_dir:
            self.args.tokenizer = tokenizer_dir
        else:
            sibling = self.ckpt_dir / "tokenizer"
            if sibling.is_dir():
                self.args.tokenizer = str(sibling)
        tok = self.goku.load_tokenizer(self.args, self.base_ckpt)
        if getattr(tok, "pad_token_id", None) is None and getattr(tok, "eos_token_id", None) is not None:
            try:
                tok.pad_token = tok.eos_token
            except Exception:
                pass
        # Long-prompt safety: for chat/RAG prompts, the latest user turn and final
        # instruction are normally at the end. Right-side truncation silently drops
        # exactly the part the model must answer, so force left truncation where the
        # tokenizer supports it. encode() below also performs manual left truncation
        # and records how many tokens were dropped.
        try:
            tok.truncation_side = "left"
        except Exception:
            pass
        try:
            tok.model_max_length = 10**9
        except Exception:
            pass
        return tok

    def _maybe_wrap_pass2(self, base):
        ckpt = self.base_ckpt
        if self.force_base_only or "pass2_state" not in ckpt:
            print("[pass2] no pass2_state loaded; running base LULUV2 forward")
            return base.to(self.device).eval(), False
        cfg_dict = dict(ckpt.get("pass2_config") or {})
        Pass2Config = self.goku.Pass2Config
        fields = getattr(Pass2Config, "__dataclass_fields__", {})
        pass2_cfg = Pass2Config(**{k: v for k, v in cfg_dict.items() if k in fields})
        model = self.goku.Lulu2TwoPassForCausalLM(base, pass2_cfg)
        missing, unexpected = model.load_state_dict(ckpt["pass2_state"], strict=False)
        print(f"[pass2] loaded pass2_state missing={len(missing)} unexpected={len(unexpected)}")
        model.to(device=self.device, dtype=self.dtype).eval()
        return model, True

    def _build_model_info(self) -> Dict[str, Any]:
        """Collect a summary of the loaded model for display.

        Includes parameter counts (total and for parameters whose name ends
        in ``.c``), mean sigmoid of the pass-2 layer and adapter gates when
        pass 2 is active, the checkpoint size on disk, and a few architecture
        fields read from the base model's config (``None`` when absent).
        """
        param_total = sum(p.numel() for p in self.model.parameters())
        c_sizes = [p.numel() for name, p in self.model.named_parameters() if name.endswith(".c")]

        layer_gate_avg = None
        adapter_gate_avg = None
        if self.has_pass2:
            with torch.no_grad():
                layer_gate_avg = float(torch.sigmoid(self.model.layer_gates.float()).mean().item())
                adapter_vals = [float(torch.sigmoid(ad.gate.float()).item()) for ad in self.model.adapters]
                adapter_gate_avg = sum(adapter_vals) / max(1, len(adapter_vals))

        ckpt_file = Path(self.ckpt_path)
        size_bytes = ckpt_file.stat().st_size if ckpt_file.exists() else 0

        info: Dict[str, Any] = {
            "checkpoint": self.ckpt_path,
            "checkpoint_size": human_bytes(size_bytes),
            "model_py": self.model_py_path,
            "device": str(self.device),
            "dtype": str(self.dtype).replace("torch.", ""),
            "has_pass2": self.has_pass2,
            "total_params": param_total,
            "vwm_c_modules": len(c_sizes),
            "vwm_c_params": sum(c_sizes),
            "pass2_layer_gate_mean": layer_gate_avg,
            "pass2_adapter_gate_mean": adapter_gate_avg,
        }
        # Architecture fields: report key -> attribute name on the config.
        cfg = getattr(self.base, "config", None)
        config_fields = (
            ("hidden_size", "hidden_size"),
            ("layers", "num_hidden_layers"),
            ("heads", "num_attention_heads"),
            ("kv_heads", "num_key_value_heads"),
            ("max_position_embeddings", "max_position_embeddings"),
        )
        for report_key, attr_name in config_fields:
            info[report_key] = getattr(cfg, attr_name, None)
        return info

    def amp_context(self):
        if self.device.type == "cuda" and self.dtype in (torch.bfloat16, torch.float16):
            return torch.autocast("cuda", dtype=self.dtype)
        return nullcontext()

    def build_chat_prompt(
        self,
        message: str,
        history: Any,
        system_prompt: str,
        memory_notes: str = "",
        history_turns: int = 4,
        extra_context: str = "",
    ) -> str:
        """Render system text, recent history and the new message as a prompt.

        Args:
            message: The new user message (cleaned with ``clean_text``).
            history: Prior conversation in any shape ``normalize_history``
                accepts.
            system_prompt: Base system instruction; may be empty.
            memory_notes: Optional notes appended to the system text.
            history_turns: Number of most recent turns to keep, counted as
                two messages per turn.  Zero, ``None`` or a negative value
                keeps no history.
            extra_context: Optional context appended to the system text.

        Returns:
            The tokenizer's chat-template rendering with a generation prompt,
            or a hand-built ``<|im_start|>``/``<|im_end|>`` prompt when the
            tokenizer cannot apply a chat template.
        """
        turns = normalize_history(history)
        # Clamp before slicing: a slice start of -0 is 0, which would select
        # the entire history for negative or fractional-below-one values
        # instead of none of it.
        keep_turns = max(0, int(history_turns or 0))
        recent = turns[-2 * keep_turns:] if keep_turns > 0 else []

        labelled_sections = (
            ("", system_prompt),
            ("Useful memory notes:\n", memory_notes),
            ("Relevant local context:\n", extra_context),
        )
        system_chunks: List[str] = []
        for label, raw in labelled_sections:
            body = (raw or "").strip()  # tolerate None for any text argument
            if body:
                system_chunks.append(label + body)
        system = "\n\n".join(system_chunks)

        user_text = clean_text(message)
        messages: List[Dict[str, str]] = []
        if system:
            messages.append({"role": "system", "content": system})
        messages.extend(recent)
        messages.append({"role": "user", "content": user_text})
        try:
            return self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        except Exception:
            # Deliberate best-effort fallback: tokenizers without a usable
            # chat template get a ChatML-style prompt instead.
            parts: List[str] = []
            if system:
                parts.append(f"<|im_start|>system\n{system}<|im_end|>")
            for item in recent:
                parts.append(f"<|im_start|>{item['role']}\n{item['content']}<|im_end|>")
            parts.append(f"<|im_start|>user\n{user_text}<|im_end|>")
            parts.append("<|im_start|>assistant\n")
            return "\n".join(parts)

    def encode(self, text: str, max_context_tokens: int) -> torch.Tensor:
        """Encode prompt with explicit left-truncation and accounting.

        This avoids a common long-context failure mode: many tokenizers default to
        right-side truncation, which keeps the beginning of a huge prompt and drops
        the final user instruction. For chat, we almost always want the opposite.
        """
        max_context = max(1, int(max_context_tokens))
        try:
            self.tokenizer.truncation_side = "left"
        except Exception:
            pass

        # Tokenize without tokenizer-side truncation so we know exactly whether the
        # prompt was clipped. The prompt already contains chat special tokens.
        try:
            enc = self.tokenizer(
                text,
                return_tensors="pt",
                truncation=False,
                add_special_tokens=False,
            )
        except TypeError:
            enc = self.tokenizer(text, return_tensors="pt", truncation=False)

        ids = enc.input_ids
        total = int(ids.shape[1])
        dropped = max(0, total - max_context)
        if dropped > 0:
            ids = ids[:, -max_context:].contiguous()
            # Do not reuse an older conversation cache after a hard context trim;
            # the logical prefix changed and reuse can make long prompts feel like
            # they are "forgetting" pieces.
            self.pass1_cache = None
            self.pass2_cache = None
            self.cache_ids = None
            self.cached_logits = None
            self.cached_pass1_logits = None
            self.cached_pass2_logits = None
            self.cache_backend = "truncated-rebuild"

        self.last_prompt_total_tokens = total
        self.last_prompt_kept_tokens = int(ids.shape[1])
        self.last_prompt_dropped_tokens = dropped
        return ids.to(self.device)

    def _position_ids(self, T: int, offset: int = 0) -> torch.Tensor:
        return torch.arange(offset, offset + T, device=self.device, dtype=torch.long).unsqueeze(0)

    def _attn_prefill(self, attn, hidden_states: torch.Tensor, position_ids: torch.Tensor, cache: KVLayerCache, max_context: int) -> torch.Tensor:
        """Run causal self-attention over a whole prompt chunk and seed the layer cache.

        The cache is overwritten (not extended) with this chunk's keys and
        values, so this is only valid for a prefill that starts from an
        empty context.

        Args:
            attn: Attention module providing q/k/v/o projections, rotary
                embedding, head counts, ``scaling`` and ``hidden_size``.
            hidden_states: Layer input of shape (batch, q_len, hidden).
            position_ids: Positions passed to the rotary embedding.
            cache: Per-layer cache to overwrite.
            max_context: Maximum number of time steps the cache may keep.

        Returns:
            The attention output after the output projection.
        """
        bsz, q_len, _ = hidden_states.size()
        query_states = attn.q_proj(hidden_states)
        key_states = attn.k_proj(hidden_states)
        value_states = attn.v_proj(hidden_states)
        # Split heads: (bsz, q_len, heads, head_dim) -> (bsz, heads, q_len, head_dim).
        query_states = query_states.view(bsz, q_len, attn.num_heads, attn.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, attn.num_key_value_heads, attn.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, attn.num_key_value_heads, attn.head_dim).transpose(1, 2)
        # NOTE(review): value_states is presumably only used by rotary_emb for
        # dtype/device -- confirm against the model module.
        cos, sin = attn.rotary_emb(value_states, position_ids)
        query_states, key_states = self.goku.apply_rotary_pos_emb(query_states, key_states, cos, sin)
        # NOTE(review): repeat_kv presumably expands KV heads to match the
        # query head count -- confirm.  The cache therefore stores keys after
        # rotary embedding and after this expansion.
        key_states = self.goku.repeat_kv(key_states, attn.num_key_value_groups)
        value_states = self.goku.repeat_kv(value_states, attn.num_key_value_groups)
        cache.set(key_states, value_states, max_context)
        # Attention uses this chunk's full keys/values (not the possibly
        # trimmed cache) with a causal mask.
        attn_output = F.scaled_dot_product_attention(
            query_states, key_states, value_states, attn_mask=None, dropout_p=0.0, is_causal=True, scale=attn.scaling
        )
        # Merge heads back: (bsz, heads, q_len, head_dim) -> (bsz, q_len, hidden).
        attn_output = attn_output.transpose(1, 2).contiguous().reshape(bsz, q_len, attn.hidden_size)
        return attn.o_proj(attn_output)

    def _attn_step(self, attn, hidden_states: torch.Tensor, pos: int, cache: KVLayerCache, max_context: int) -> torch.Tensor:
        """Run self-attention for one new token against the layer's KV cache.

        The token's key/value are appended to ``cache`` first, so the query
        attends to everything cached so far plus itself.

        Args:
            attn: Attention module (same interface as in ``_attn_prefill``).
            hidden_states: Layer input of shape (batch, 1, hidden).
            pos: Absolute position of the token, used for the rotary embedding.
            cache: Per-layer cache to extend.
            max_context: Maximum number of time steps the cache may keep.

        Raises:
            AssertionError: if more than one token is passed.
            RuntimeError: if the cache is still empty after appending.
        """
        bsz, q_len, _ = hidden_states.size()
        assert q_len == 1
        query_states = attn.q_proj(hidden_states)
        key_states = attn.k_proj(hidden_states)
        value_states = attn.v_proj(hidden_states)
        # Split heads: (bsz, 1, heads, head_dim) -> (bsz, heads, 1, head_dim).
        query_states = query_states.view(bsz, q_len, attn.num_heads, attn.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, attn.num_key_value_heads, attn.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, attn.num_key_value_heads, attn.head_dim).transpose(1, 2)
        position_ids = self._position_ids(1, pos)
        cos, sin = attn.rotary_emb(value_states, position_ids)
        query_states, key_states = self.goku.apply_rotary_pos_emb(query_states, key_states, cos, sin)
        # NOTE(review): repeat_kv presumably expands KV heads to match the
        # query head count -- confirm against the model module.
        key_states = self.goku.repeat_kv(key_states, attn.num_key_value_groups)
        value_states = self.goku.repeat_kv(value_states, attn.num_key_value_groups)
        cache.append(key_states, value_states, max_context)
        if cache.k is None or cache.v is None:
            raise RuntimeError("KV cache append failed")
        # No causal mask is needed: the single query is the newest position
        # and every cached entry is at or before it.
        attn_output = F.scaled_dot_product_attention(
            query_states, cache.k, cache.v, attn_mask=None, dropout_p=0.0, is_causal=False, scale=attn.scaling
        )
        attn_output = attn_output.transpose(1, 2).contiguous().reshape(bsz, q_len, attn.hidden_size)
        return attn.o_proj(attn_output)

    def _layer_prefill(self, layer, hidden_states: torch.Tensor, position_ids: torch.Tensor, cache: KVLayerCache, max_context: int) -> torch.Tensor:
        residual = hidden_states
        x = layer.input_layernorm(hidden_states)
        x = self._attn_prefill(layer.self_attn, x, position_ids, cache, max_context)
        hidden_states = residual + x
        residual = hidden_states
        x = layer.post_attention_layernorm(hidden_states)
        x = layer.mlp(x)
        return residual + x

    def _layer_step(self, layer, hidden_states: torch.Tensor, pos: int, cache: KVLayerCache, max_context: int) -> torch.Tensor:
        residual = hidden_states
        x = layer.input_layernorm(hidden_states)
        x = self._attn_step(layer.self_attn, x, pos, cache, max_context)
        hidden_states = residual + x
        residual = hidden_states
        x = layer.post_attention_layernorm(hidden_states)
        x = layer.mlp(x)
        return residual + x

    @torch.no_grad()
    def _prefill_pass1(self, input_ids: torch.Tensor, max_context: int, use_pass_embed: bool) -> Tuple[torch.Tensor, List[torch.Tensor], torch.Tensor, torch.Tensor]:
        """Run pass 1 over the whole prompt at once and build a fresh KV cache.

        Returns ``(final_hidden, per_layer_hidden, position_ids, logits)``.
        ``final_hidden`` is the last layer's output before the final norm.
        ``self.pass1_cache`` is replaced only after the full forward
        succeeded.
        """
        seq_len = int(input_ids.shape[1])
        position_ids = self._position_ids(seq_len, 0)
        fresh_cache = DecoderKVCache(self.n_layers)

        hidden = self.base.model.embed_tokens(input_ids)
        if use_pass_embed and self.has_pass2:
            pass_vec = self.model.pass_embed[0].to(dtype=hidden.dtype, device=hidden.device)
            hidden = hidden + pass_vec.view(1, 1, -1)

        per_layer: List[torch.Tensor] = []
        for idx, layer in enumerate(self.base.model.layers):
            hidden = self._layer_prefill(layer, hidden, position_ids, fresh_cache.layers[idx], max_context)
            per_layer.append(hidden)

        logits = self.base.lm_head(self.base.model.norm(hidden))
        self.pass1_cache = fresh_cache
        return hidden, per_layer, position_ids, logits

    @torch.no_grad()
    def _prefill_pass2(self, h1_resid: torch.Tensor, pass1_states: List[torch.Tensor], position_ids: torch.Tensor, max_context: int) -> torch.Tensor:
        """Run pass 2 over the whole prompt and build a fresh pass-2 KV cache.

        Pass 2 starts from the pass-1 final hidden state plus the second pass
        embedding.  Per layer, the layer's own change is scaled by a sigmoid
        gate and an adapter term computed from the layer input and the
        matching pass-1 state is added.  ``self.pass2_cache`` is replaced
        only after the full forward succeeded.

        Raises:
            RuntimeError: when the checkpoint has no pass-2 weights.
        """
        if not self.has_pass2:
            raise RuntimeError("pass2 requested but checkpoint has no pass2_state")
        fresh_cache = DecoderKVCache(self.n_layers)
        pass_vec = self.model.pass_embed[1].to(dtype=h1_resid.dtype, device=h1_resid.device)
        hidden = h1_resid + pass_vec.view(1, 1, -1)
        for idx, layer in enumerate(self.base.model.layers):
            layer_in = hidden
            layer_out = self._layer_prefill(layer, layer_in, position_ids, fresh_cache.layers[idx], max_context)
            own_delta = layer_out - layer_in
            gate = torch.sigmoid(self.model.layer_gates[idx]).to(dtype=layer_in.dtype, device=layer_in.device)
            adapter_delta = self.model.adapters[idx](layer_in, pass1_states[idx])
            hidden = layer_in + gate * own_delta + adapter_delta
        logits = self.base.lm_head(self.base.model.norm(hidden))
        self.pass2_cache = fresh_cache
        return logits

    @torch.no_grad()
    def _step_pass1(self, token_id: torch.Tensor, pos: int, max_context: int, use_pass_embed: bool) -> Tuple[torch.Tensor, List[torch.Tensor], torch.Tensor]:
        """Run pass 1 for one token at position ``pos``, extending the pass-1 cache.

        An empty cache is created on demand.  Returns
        ``(final_hidden, per_layer_hidden, logits)``.
        """
        if self.pass1_cache is None:
            self.pass1_cache = DecoderKVCache(self.n_layers)

        hidden = self.base.model.embed_tokens(token_id)
        if use_pass_embed and self.has_pass2:
            pass_vec = self.model.pass_embed[0].to(dtype=hidden.dtype, device=hidden.device)
            hidden = hidden + pass_vec.view(1, 1, -1)

        per_layer: List[torch.Tensor] = []
        for idx, layer in enumerate(self.base.model.layers):
            hidden = self._layer_step(layer, hidden, pos, self.pass1_cache.layers[idx], max_context)
            per_layer.append(hidden)

        normed = self.base.model.norm(hidden)
        return hidden, per_layer, self.base.lm_head(normed)

    @torch.no_grad()
    def _step_pass2(self, h1_resid: torch.Tensor, pass1_states: List[torch.Tensor], pos: int, max_context: int) -> torch.Tensor:
        """Run pass 2 for one token at position ``pos``, extending the pass-2 cache.

        Uses the same gated-layer-plus-adapter update as the prefill variant
        and returns the pass-2 logits.  An empty cache is created on demand.

        Raises:
            RuntimeError: when the checkpoint has no pass-2 weights.
        """
        if not self.has_pass2:
            raise RuntimeError("pass2 step requested but unavailable")
        if self.pass2_cache is None:
            self.pass2_cache = DecoderKVCache(self.n_layers)

        pass_vec = self.model.pass_embed[1].to(dtype=h1_resid.dtype, device=h1_resid.device)
        hidden = h1_resid + pass_vec.view(1, 1, -1)
        for idx, layer in enumerate(self.base.model.layers):
            layer_in = hidden
            layer_out = self._layer_step(layer, layer_in, pos, self.pass2_cache.layers[idx], max_context)
            own_delta = layer_out - layer_in
            gate = torch.sigmoid(self.model.layer_gates[idx]).to(dtype=layer_in.dtype, device=layer_in.device)
            adapter_delta = self.model.adapters[idx](layer_in, pass1_states[idx])
            hidden = layer_in + gate * own_delta + adapter_delta
        normed = self.base.model.norm(hidden)
        return self.base.lm_head(normed)

    def _ids_prefix_len(self, old: torch.Tensor, new: torch.Tensor) -> int:
        if old is None or old.numel() == 0 or new.numel() == 0:
            return 0
        old1 = old[0]
        new1 = new[0]
        max_n = min(int(old1.numel()), int(new1.numel()))
        if max_n == 0:
            return 0
        # Fast path: old is exact prefix of new.
        if int(old1.numel()) <= int(new1.numel()) and torch.equal(old1, new1[: old1.numel()]):
            return int(old1.numel())
        # Conservative fallback, scan from max down; prompts are usually exact-prefix or reset.
        for n in range(max_n, 0, -1):
            if torch.equal(old1[:n], new1[:n]):
                return n
        return 0


    @torch.no_grad()
    def _token_prefill_context(self, input_ids: torch.Tensor, cfg: GenerationConfig, use_pass2: bool, use_pass_embed: bool, max_context: int) -> None:
        """
        Conservative cache builder.

        Rebuilds the pass1/pass2 KV caches from scratch by feeding the prompt
        one token at a time through the step functions.  Slower than the
        vectorized prefill, but it exercises exactly the same code path as
        decoding, which makes it safer across checkpoint/runtime variants.
        After it returns, the cached logits hold the outputs for the last
        prompt token (or stay ``None`` for an empty prompt).
        """
        self.pass1_cache = DecoderKVCache(self.n_layers)
        self.pass2_cache = DecoderKVCache(self.n_layers) if use_pass2 else None
        self.cached_logits = None
        self.cached_pass1_logits = None
        self.cached_pass2_logits = None

        prompt_len = int(input_ids.shape[1])
        for pos in range(prompt_len):
            token = input_ids[:, pos:pos + 1]
            hidden1, states1, logits1 = self._step_pass1(token, pos, max_context, use_pass_embed=use_pass_embed)
            logits2 = self._step_pass2(hidden1, states1, pos, max_context) if use_pass2 else None
            # The logits used for sampling come from pass 2 when it runs.
            self.cached_logits = logits2 if use_pass2 else logits1
            self.cached_pass1_logits = logits1
            self.cached_pass2_logits = logits2

    @torch.no_grad()
    def _prepare_cached_context(self, input_ids: torch.Tensor, cfg: GenerationConfig) -> Tuple[torch.Tensor, bool, int, int, str]:
        """Bring the KV caches and cached logits in line with ``input_ids``.

        Two outcomes are possible:

        - Cache hit: the previously cached ids are an exact, non-empty prefix
          of the new prompt (same mode and context limit).  Only the new
          suffix tokens are stepped through the model.
        - Rebuild: caches are discarded and the prompt is prefilled, either
          vectorized (falling back to token-by-token on any exception) or
          token-by-token when ``cfg.vectorized_prefill`` is false.

        Returns:
            ``(input_ids, cache_hit, reused_tokens, newly_processed_tokens,
            backend)`` where ``input_ids`` may have been clipped to the last
            ``cfg.max_context_tokens`` tokens.
        """
        mode = self._effective_mode(cfg.mode)
        max_context = int(cfg.max_context_tokens)
        use_pass2 = mode in {"vwm", "deep"} and self.has_pass2
        use_pass_embed = bool(use_pass2)
        T = int(input_ids.shape[1])
        if T > max_context:
            input_ids = input_ids[:, -max_context:]
            T = max_context

        # If mode/context changed, persistent cache is invalid.
        cache_ok = (
            cfg.persistent_cache
            and self.cache_ids is not None
            and self.cache_mode == mode
            and self.cache_max_context == max_context
            and self.pass1_cache is not None
        )
        prefix = self._ids_prefix_len(self.cache_ids, input_ids) if cache_ok else 0
        # A partial prefix match is not reused: the whole cached id sequence
        # must be a prefix of the new prompt.
        # NOTE(review): this assumes the KV caches hold exactly the tokens in
        # self.cache_ids -- whoever appends generated tokens to the caches
        # must keep cache_ids in sync; that code is not visible here.
        cache_hit = bool(cache_ok and prefix == int(self.cache_ids.shape[1]) and prefix <= T and prefix > 0)

        # NOTE(review): t0 is not read anywhere in this method.
        t0 = time.time()
        if cache_hit:
            # Process only suffix between prior cached prompt and new prompt.
            suffix = input_ids[:, prefix:]
            for j in range(int(suffix.shape[1])):
                tok = suffix[:, j : j + 1]
                pos = prefix + j
                h1, states, logits1 = self._step_pass1(tok, pos, max_context, use_pass_embed=use_pass_embed)
                if use_pass2:
                    logits2 = self._step_pass2(h1, states, pos, max_context)
                    self.cached_logits = logits2
                    self.cached_pass1_logits = logits1
                    self.cached_pass2_logits = logits2
                else:
                    self.cached_logits = logits1
                    self.cached_pass1_logits = logits1
                    self.cached_pass2_logits = None
            # With an empty suffix the cached logits from the previous call
            # are left untouched.
            self.cache_ids = input_ids.detach().clone()
            self.cache_backend = "persistent-kv-suffix" if suffix.numel() else "persistent-kv-hit"
            return input_ids, True, prefix, int(suffix.shape[1]), self.cache_backend

        # Reset and prefill. Prefer vectorized prefill, but fall back to conservative
        # token prefill if the runtime variant does not support our vectorized cache path.
        self.pass1_cache = None
        self.pass2_cache = None
        backend = "vectorized-prefill"
        if bool(cfg.vectorized_prefill):
            try:
                h1, states, pos_ids, logits1 = self._prefill_pass1(input_ids, max_context, use_pass_embed=use_pass_embed)
                if use_pass2:
                    logits2 = self._prefill_pass2(h1, states, pos_ids, max_context)
                    self.cached_logits = logits2
                    self.cached_pass1_logits = logits1
                    self.cached_pass2_logits = logits2
                else:
                    self.cached_logits = logits1
                    self.cached_pass1_logits = logits1
                    self.cached_pass2_logits = None
            except Exception as exc:
                # The failure is reported only when LULUV2_CACHE_DEBUG is
                # enabled; otherwise the fallback is silent.  The token
                # prefill below resets both caches itself, so any partially
                # built vectorized state is discarded.
                if os.getenv("LULUV2_CACHE_DEBUG", "0").strip().lower() in {"1", "true", "yes", "on"}:
                    print("[cache] vectorized prefill failed; using token-prefill cache.")
                    traceback.print_exc()
                self._token_prefill_context(input_ids, cfg, use_pass2=use_pass2, use_pass_embed=use_pass_embed, max_context=max_context)
                backend = "token-prefill-cache"
        else:
            self._token_prefill_context(input_ids, cfg, use_pass2=use_pass2, use_pass_embed=use_pass_embed, max_context=max_context)
            backend = "token-prefill-cache"

        # Record what the caches now represent so the next call can test for
        # a prefix hit.
        self.cache_ids = input_ids.detach().clone()
        self.cache_mode = mode
        self.cache_max_context = max_context
        self.cache_backend = backend
        return input_ids, False, 0, T, self.cache_backend

    def _effective_mode(self, mode: str) -> str:
        mode = (mode or "vwm").lower()
        if mode in {"fast", "base", "pass1"}:
            return "fast"
        if mode in {"deep", "32k", "long"}:
            return "deep"
        if mode in {"slow", "full"}:
            return "slow"
        return "vwm"

    @torch.no_grad()
    def pass_metrics_from_logits(self, logits1: Optional[torch.Tensor], logits2: Optional[torch.Tensor]) -> Tuple[Optional[float], Optional[float]]:
        """Compare the pass-1 and pass-2 next-token distributions.

        Both 3-D logits (the last position along dim 1 is used) and 2-D logits
        (used as-is) are accepted, matching what the decode loop in
        ``generate`` accepts for ``cached_logits``.

        Args:
            logits1: Pass-1 logits, or ``None`` when unavailable.
            logits2: Pass-2 logits, or ``None`` when pass 2 did not run.

        Returns:
            ``(kl, cosine)`` as Python floats, where ``kl`` is
            ``KL(softmax(logits1) || softmax(logits2))`` with ``batchmean``
            reduction and ``cosine`` is the mean cosine similarity of the raw
            logits.  ``(None, None)`` if either input is ``None``, has an
            unsupported rank, or the computation fails; this is a best-effort
            diagnostic and never raises.
        """
        if logits1 is None or logits2 is None:
            return None, None

        def last_position(logits: torch.Tensor) -> torch.Tensor:
            # Reduce to one row of vocabulary scores per batch element.
            if logits.dim() == 3:
                return logits[:, -1, :]
            if logits.dim() == 2:
                return logits
            raise ValueError(f"unsupported logits rank: {logits.dim()}")

        try:
            l1 = last_position(logits1).float()
            l2 = last_position(logits2).float()
            # F.kl_div expects log-probabilities as input and probabilities as target.
            kl = F.kl_div(F.log_softmax(l2, dim=-1), F.softmax(l1, dim=-1), reduction="batchmean")
            cos = F.cosine_similarity(l1, l2, dim=-1).mean()
            return float(kl.item()), float(cos.item())
        except Exception:
            # Metrics are optional diagnostics; never let them break generation.
            return None, None

    def _apply_penalties(self, logits: torch.Tensor, generated: torch.Tensor, cfg: GenerationConfig) -> torch.Tensor:
        if generated.numel() == 0:
            return logits
        out = logits.clone()
        uniq, counts = torch.unique(generated.view(-1), return_counts=True)
        if cfg.repetition_penalty != 1.0:
            selected = out[:, uniq]
            selected = torch.where(selected > 0, selected / float(cfg.repetition_penalty), selected * float(cfg.repetition_penalty))
            out[:, uniq] = selected
        if cfg.frequency_penalty:
            out[:, uniq] -= float(cfg.frequency_penalty) * counts.to(out.dtype).unsqueeze(0)
        n = int(cfg.no_repeat_ngram)
        if n > 1 and generated.size(1) >= n - 1:
            seq = generated[0].tolist()
            prefix = tuple(seq[-(n - 1):])
            banned = []
            for i in range(len(seq) - n + 1):
                if tuple(seq[i:i + n - 1]) == prefix:
                    banned.append(seq[i + n - 1])
            if banned:
                out[:, list(set(banned))] = -float("inf")
        return out

    @torch.no_grad()
    def _sample_next(self, logits: torch.Tensor, generated: torch.Tensor, cfg: GenerationConfig) -> Tuple[torch.Tensor, Dict[str, float]]:
        work = self._apply_penalties(logits.float(), generated, cfg)
        if cfg.greedy or cfg.temperature <= 0:
            probs = torch.softmax(work, dim=-1)
            next_id = torch.argmax(work, dim=-1, keepdim=True)
        else:
            work = work / max(float(cfg.temperature), 1e-6)
            if cfg.top_k > 0:
                k = min(int(cfg.top_k), work.size(-1))
                thresh = torch.topk(work, k, dim=-1).values[..., -1, None]
                work = torch.where(work >= thresh, work, torch.full_like(work, -float("inf")))
            if 0.0 < cfg.top_p < 1.0:
                sorted_logits, sorted_idx = torch.sort(work, descending=True, dim=-1)
                sorted_probs = torch.softmax(sorted_logits, dim=-1)
                cumprobs = torch.cumsum(sorted_probs, dim=-1)
                remove = cumprobs > float(cfg.top_p)
                shifted = remove.clone()
                shifted[..., 1:] = remove[..., :-1]
                shifted[..., 0] = False
                sorted_logits = sorted_logits.masked_fill(shifted, -float("inf"))
                work = torch.full_like(work, -float("inf")).scatter(1, sorted_idx, sorted_logits)
            if 0.0 < cfg.min_p < 1.0:
                probs_for_minp = torch.softmax(work, dim=-1)
                max_prob = probs_for_minp.max(dim=-1, keepdim=True).values
                keep = probs_for_minp >= float(cfg.min_p) * max_prob
                work = work.masked_fill(~keep, -float("inf"))
            probs = torch.softmax(work, dim=-1)
            if torch.isnan(probs).any() or not torch.isfinite(probs.sum()) or float(probs.sum()) <= 0:
                next_id = torch.argmax(logits, dim=-1, keepdim=True)
                probs = torch.softmax(logits.float(), dim=-1)
            else:
                next_id = torch.multinomial(probs, 1)
        prob = float(probs.gather(1, next_id).item()) if probs.numel() else 0.0
        entropy = float((-(probs * torch.log(probs.clamp_min(1e-12))).sum(dim=-1)).mean().item()) if probs.numel() else 0.0
        return next_id, {"prob": prob, "entropy": entropy}

    @torch.no_grad()
    def _slow_generate(self, ids: torch.Tensor, prompt_len: int, cfg: GenerationConfig) -> Generator[str, None, None]:
        """Compatibility decode path: re-run the model on the clipped prefix for every token.

        No KV cache is used.  ``self.base`` is called when the effective mode
        is ``"fast"``; ``self.model`` otherwise.  Each sampled token is
        recorded via ``_record_token``.

        Args:
            ids: Token ids of the prompt plus anything generated so far,
                indexed as ``[row, position]``.
            prompt_len: Number of leading positions in ``ids`` that belong to
                the prompt; everything after it counts as generated text.
            cfg: Generation settings (``max_new_tokens``,
                ``max_context_tokens``, ``stream_every``, ``mode`` and the
                sampling fields).

        Yields:
            The cleaned text generated so far (cumulative, not a delta) on the
            first step and then every ``stream_every`` steps, followed by one
            last yield of the final cleaned text when it is non-empty.

        Generation stops on the EOS token, when a stop string shows up in the
        decoded text (checked only on streaming steps), or after
        ``max_new_tokens`` steps.
        """
        # Compatibility path: full prefix recompute every token.
        eos_id = getattr(self.tokenizer, "eos_token_id", None)
        # Loop invariants, resolved once.
        max_context = int(cfg.max_context_tokens)
        use_base = self._effective_mode(cfg.mode) == "fast"
        # stream_every == 0 would make the modulo below raise ZeroDivisionError;
        # treat it as "stream after every token".
        stream_every = int(cfg.stream_every) or 1
        last_text = ""
        t0 = time.time()
        for step in range(int(cfg.max_new_tokens)):
            ctx = ids[:, -max_context:]
            with self.amp_context():
                out = self.base(ctx) if use_base else self.model(ctx)
                logits = out.logits[:, -1, :].float()
            generated = ids[:, prompt_len:]
            next_id, tok_stats = self._sample_next(logits, generated, cfg)
            ids = torch.cat([ids, next_id.to(ids.device)], dim=-1)
            token_id = int(next_id.item())
            token_text = self.tokenizer.decode([token_id], skip_special_tokens=False)
            self._record_token(step + 1, token_id, token_text, tok_stats)
            if eos_id is not None and token_id == int(eos_id):
                break
            if (step + 1) % stream_every == 0 or step == 0:
                raw = self.tokenizer.decode(ids[0, prompt_len:], skip_special_tokens=True)
                if any(s in raw for s in STOP_STRINGS):
                    break
                text = clean_text(raw)
                if text and text != last_text:
                    elapsed = time.time() - t0
                    gen = int(ids.shape[1]) - prompt_len
                    self.last_stats = GenerationStats(
                        prompt_tokens=prompt_len,
                        prompt_total_tokens=self.last_prompt_total_tokens,
                        prompt_kept_tokens=self.last_prompt_kept_tokens,
                        prompt_dropped_tokens=self.last_prompt_dropped_tokens,
                        generated_tokens=gen,
                        elapsed_sec=elapsed,
                        tokens_per_sec=gen / max(elapsed, 1e-9),
                        mode=cfg.mode,
                        backend="slow-full-prefix",
                        last_token=token_text,
                        last_token_id=token_id,
                        last_token_prob=tok_stats["prob"],
                        last_entropy=tok_stats["entropy"],
                        finish_reason="streaming",
                    )
                    last_text = text
                    yield text
        final = clean_text(self.tokenizer.decode(ids[0, prompt_len:], skip_special_tokens=True))
        if final:
            yield final

    def _record_token(self, i: int, token_id: int, token_text: str, tok_stats: Dict[str, float]) -> None:
        self.recent_tokens.append({"i": i, "id": token_id, "text": token_text, "prob": tok_stats.get("prob", 0.0), "entropy": tok_stats.get("entropy", 0.0)})
        self.recent_tokens = self.recent_tokens[-64:]

    @torch.no_grad()
    def generate(self, prompt: str, cfg: GenerationConfig) -> Generator[str, None, None]:
        """Stream a completion for ``prompt``, using the session KV cache when possible.

        The prompt is encoded and prefilled through ``_prepare_cached_context``.
        Tokens are then decoded one at a time via ``_step_pass1`` (plus
        ``_step_pass2`` in ``vwm``/``deep`` mode when pass 2 is available).
        ``self.last_stats`` is refreshed on every streamed chunk and once more
        at the end with the real finish reason.

        Fallbacks to ``_slow_generate`` (full-prefix recompute):
          * ``cfg.use_cache`` is false or the effective mode is ``"slow"``;
          * prefill raises or leaves no cached logits to sample from;
          * a cached decode step raises or returns no logits.
        The slow path runs its own ``max_new_tokens`` loop from the ids it is
        handed and writes its own ``last_stats``.

        Args:
            prompt: Text to complete.
            cfg: Generation settings.  NOTE: in ``deep`` mode
                ``cfg.max_context_tokens`` is raised to at least 16384 *on the
                caller's object*.

        Yields:
            The cleaned text generated so far (cumulative, not a delta) on the
            first step and every ``stream_every`` steps, then the final
            cleaned text once more if it is non-empty.
        """
        self.model.eval()
        self.base.eval()
        self.recent_tokens = []
        mode = self._effective_mode(cfg.mode)
        if mode == "deep":
            # NOTE: the raised limit is written back onto the caller's cfg.
            cfg.max_context_tokens = max(int(cfg.max_context_tokens), 16384)
        max_context = int(cfg.max_context_tokens)
        ids = self.encode(prompt, max_context_tokens=max_context)
        prompt_len = int(ids.shape[1])
        if self.last_prompt_dropped_tokens > 0:
            print(f"[context] prompt clipped: kept={self.last_prompt_kept_tokens} total={self.last_prompt_total_tokens} dropped={self.last_prompt_dropped_tokens}")
        t_start = time.time()
        prefill_sec = 0.0
        cache_hit = False
        reused = 0
        new_prefill = prompt_len
        backend = ""
        pass_kl = None
        pass_cos = None

        if (not cfg.use_cache) or mode == "slow":
            yield from self._slow_generate(ids, prompt_len, cfg)
            return

        try:
            with self.amp_context():
                t_pref = time.time()
                ids, cache_hit, reused, new_prefill, backend = self._prepare_cached_context(ids, cfg)
                prefill_sec = time.time() - t_pref
                if self.cached_logits is None:
                    # Without logits there is nothing to sample from; treat it like
                    # any other cache failure instead of crashing in the decode loop.
                    raise RuntimeError("prefill left no cached logits to sample from")
                pass_kl, pass_cos = self.pass_metrics_from_logits(self.cached_pass1_logits, self.cached_pass2_logits) if cfg.return_pass_metrics else (None, None)
        except Exception as exc:
            print(f"[cache] cached path failed; falling back to slow full-prefix: {type(exc).__name__}: {exc}")
            if os.getenv("LULUV2_CACHE_DEBUG", "0").strip().lower() in {"1", "true", "yes", "on"}:
                traceback.print_exc()
            self.pass1_cache = None
            self.pass2_cache = None
            self.cache_ids = None
            yield from self._slow_generate(ids, prompt_len, cfg)
            return

        eos_id = getattr(self.tokenizer, "eos_token_id", None)
        last_text = ""
        finish_reason = "length"
        use_pass2 = mode in {"vwm", "deep"} and self.has_pass2
        use_pass_embed = bool(use_pass2)
        # stream_every == 0 would make the modulo below raise ZeroDivisionError;
        # treat it as "stream after every token".
        stream_every = int(cfg.stream_every) or 1

        def build_stats(finish, tok_text, tok_id, tok_prob, tok_entropy):
            # Single place that assembles GenerationStats for the cached path.
            # Decode speed excludes prefill time; prefill throughput counts only
            # the tokens that were newly prefilled.
            elapsed = time.time() - t_start
            gen = int(ids.shape[1]) - prompt_len
            return GenerationStats(
                prompt_tokens=prompt_len,
                prompt_total_tokens=self.last_prompt_total_tokens,
                prompt_kept_tokens=self.last_prompt_kept_tokens,
                prompt_dropped_tokens=self.last_prompt_dropped_tokens,
                generated_tokens=gen,
                elapsed_sec=elapsed,
                tokens_per_sec=gen / max(elapsed - prefill_sec, 1e-9),
                prefill_sec=prefill_sec,
                prefill_tps=(new_prefill / max(prefill_sec, 1e-9)),
                cache_hit=cache_hit,
                cache_reused_tokens=reused,
                cache_new_prefill_tokens=new_prefill,
                mode=mode,
                backend=backend,
                last_token=tok_text,
                last_token_id=tok_id,
                last_token_prob=tok_prob,
                last_entropy=tok_entropy,
                finish_reason=finish,
                pass1_pass2_kl=pass_kl,
                pass1_pass2_logit_cosine=pass_cos,
            )

        for step in range(int(cfg.max_new_tokens)):
            # Cached logits may be 3-D (take the last position) or already 2-D.
            logits = self.cached_logits
            if logits.dim() == 3:
                logits = logits[:, -1, :]
            logits = logits.float()
            generated = ids[:, prompt_len:]
            next_id, tok_stats = self._sample_next(logits, generated, cfg)
            token_id = int(next_id.item())
            token_text = self.tokenizer.decode([token_id], skip_special_tokens=False)
            self._record_token(step + 1, token_id, token_text, tok_stats)
            ids = torch.cat([ids, next_id.to(ids.device)], dim=-1)

            if eos_id is not None and token_id == int(eos_id):
                finish_reason = "eos"
                break

            pos = int(ids.shape[1]) - 1
            try:
                with self.amp_context():
                    h1, states, logits1 = self._step_pass1(next_id.to(self.device), pos, max_context, use_pass_embed=use_pass_embed)
                    if use_pass2:
                        logits2 = self._step_pass2(h1, states, pos, max_context)
                        self.cached_logits = logits2
                        self.cached_pass1_logits = logits1
                        self.cached_pass2_logits = logits2
                    else:
                        self.cached_logits = logits1
                        self.cached_pass1_logits = logits1
                        self.cached_pass2_logits = None
                    if self.cached_logits is None:
                        raise RuntimeError("decode step returned no logits")
                if self.cache_ids is not None:
                    # Keep the id record in step with the KV cache, clipped to the context window.
                    self.cache_ids = torch.cat([self.cache_ids, next_id.detach().to(self.cache_ids.device)], dim=-1)
                    if self.cache_ids.shape[1] > max_context:
                        self.cache_ids = self.cache_ids[:, -max_context:]
            except Exception as exc:
                print(f"[decode-cache] step failed; falling back for this request: {type(exc).__name__}: {exc}")
                # Finish with slow path from current ids; do not pretend cache is valid.
                self.cache_ids = None
                yield from self._slow_generate(ids, prompt_len, cfg)
                return

            if (step + 1) % stream_every == 0 or step == 0:
                raw = self.tokenizer.decode(ids[0, prompt_len:], skip_special_tokens=True)
                if any(s in raw for s in STOP_STRINGS):
                    finish_reason = "stop_string"
                    break
                text = clean_text(raw)
                if text and text != last_text:
                    self.last_stats = build_stats("streaming", token_text, token_id, tok_stats["prob"], tok_stats["entropy"])
                    last_text = text
                    yield text

        raw = self.tokenizer.decode(ids[0, prompt_len:], skip_special_tokens=True)
        final = clean_text(raw)
        last = self.recent_tokens[-1] if self.recent_tokens else None
        self.last_stats = build_stats(
            finish_reason,
            last["text"] if last is not None else "",
            last["id"] if last is not None else -1,
            last["prob"] if last is not None else 0.0,
            last["entropy"] if last is not None else 0.0,
        )
        if final:
            yield final

    def clear_session_cache(self) -> None:
        self.pass1_cache = None
        self.pass2_cache = None
        self.cache_ids = None
        self.cached_logits = None
        self.cached_pass1_logits = None
        self.cached_pass2_logits = None
        self.cache_backend = "cleared"

    def stats_dict(self) -> Dict[str, Any]:
        """Bundle the last generation stats, model info and a system snapshot into one dict.

        Returns:
            A dict with keys ``"generation"`` (``asdict`` of ``self.last_stats``),
            ``"model"`` (``self.model_info``, not copied) and ``"system"``
            (a fresh ``system_snapshot``).
        """
        generation = asdict(self.last_stats)
        model = self.model_info
        system = system_snapshot(self)
        return {"generation": generation, "model": model, "system": system}

    def stats_text(self) -> str:
        s = self.last_stats
        lines = [
            f"Mode: {s.mode} | backend={s.backend}",
            f"Prompt tokens: {s.prompt_tokens} kept / {getattr(s, 'prompt_total_tokens', s.prompt_tokens)} total / {getattr(s, 'prompt_dropped_tokens', 0)} dropped",
            f"Generated tokens: {s.generated_tokens}",
            f"Elapsed: {s.elapsed_sec:.2f}s | prefill={s.prefill_sec:.2f}s ({s.prefill_tps:.1f} tok/s)",
            f"Decode speed: {s.tokens_per_sec:.2f} tok/s",
            f"Cache: hit={s.cache_hit} reused={s.cache_reused_tokens} new_prefill={s.cache_new_prefill_tokens}",
            f"Finish reason: {s.finish_reason}",
            f"Last token: {s.last_token!r} id={s.last_token_id} p={s.last_token_prob:.4f} H={s.last_entropy:.2f}",
        ]
        if s.pass1_pass2_kl is not None:
            lines.append(f"Pass1→Pass2 KL: {s.pass1_pass2_kl:.6f}")
        if s.pass1_pass2_logit_cosine is not None:
            lines.append(f"Pass1/Pass2 cosine: {s.pass1_pass2_logit_cosine:.6f}")
        lines.extend([
            "",
            f"Checkpoint: {self.model_info['checkpoint']}",
            f"Checkpoint size: {self.model_info['checkpoint_size']}",
            f"Device: {self.model_info['device']} dtype={self.model_info['dtype']}",
            f"Pass2 active: {self.model_info['has_pass2']}",
            f"Params: {self.model_info['total_params']:,}",
            f"VWM c modules: {self.model_info['vwm_c_modules']} ({self.model_info['vwm_c_params']:,} c params)",
        ])
        return "\n".join(lines)

    def token_trace_text(self) -> str:
        if not self.recent_tokens:
            return "No tokens generated yet."
        rows = []
        for t in self.recent_tokens[-48:]:
            safe = repr(t["text"])[1:-1]
            rows.append(f"{t['i']:04d}  id={t['id']:<7}  p={t['prob']:.4f}  H={t['entropy']:.2f}  {safe}")
        return "\n".join(rows)


def system_snapshot(engine: Optional[LULUV2OptimizedEngine] = None) -> Dict[str, Any]:
    """Collect a best-effort snapshot of host RAM/CPU and GPU memory usage.

    Every probe is optional: when ``psutil``, CUDA or ``pynvml`` is missing or
    a query fails, the affected fields keep their placeholder values and no
    exception escapes.  When NVML is available its figures replace the
    torch-derived ``vram_used`` / ``vram_total`` / ``vram_percent``.

    Args:
        engine: Accepted for call-site symmetry; not read by this function.

    Returns:
        A dict with RAM, CPU, GPU name, VRAM and optional utilisation /
        temperature fields.
    """
    # Placeholders reported when a probe is unavailable.
    snap: Dict[str, Any] = {
        "python_ram": "n/a",
        "system_ram": "n/a",
        "system_ram_percent": 0.0,
        "cpu_percent": 0.0,
        "gpu_name": "CUDA unavailable",
        "vram_allocated": "n/a",
        "vram_reserved": "n/a",
        "vram_used": "n/a",
        "vram_total": "n/a",
        "vram_percent": 0.0,
        "gpu_util_percent": None,
        "gpu_temp_c": None,
    }

    # Host-side figures (process RSS, system RAM, CPU load).
    if psutil is not None:
        try:
            process = psutil.Process(os.getpid())
            memory = psutil.virtual_memory()
            host_fields = {
                "python_ram": human_bytes(process.memory_info().rss),
                "system_ram": f"{human_bytes(memory.used)} / {human_bytes(memory.total)}",
                "system_ram_percent": float(memory.percent),
                "cpu_percent": float(psutil.cpu_percent(interval=0.0)),
            }
            snap.update(host_fields)
        except Exception:
            pass

    if not torch.cuda.is_available():
        return snap

    try:
        # Torch's view of the current device: allocator statistics only.
        device_index = torch.cuda.current_device()
        props = torch.cuda.get_device_properties(device_index)
        bytes_allocated = int(torch.cuda.memory_allocated(device_index))
        bytes_reserved = int(torch.cuda.memory_reserved(device_index))
        bytes_total = int(props.total_memory)
        torch_fields = {
            "gpu_name": props.name,
            "vram_allocated": human_bytes(bytes_allocated),
            "vram_reserved": human_bytes(bytes_reserved),
            "vram_used": human_bytes(bytes_allocated),
            "vram_total": human_bytes(bytes_total),
            "vram_percent": 100.0 * bytes_allocated / max(bytes_total, 1),
        }
        snap.update(torch_fields)
        if pynvml is not None:
            try:
                # NVML figures, when obtainable, override the torch-derived usage.
                pynvml.nvmlInit()
                handle = pynvml.nvmlDeviceGetHandleByIndex(device_index)
                utilisation = pynvml.nvmlDeviceGetUtilizationRates(handle)
                nvml_memory = pynvml.nvmlDeviceGetMemoryInfo(handle)
                temperature = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
                nvml_fields = {
                    "gpu_util_percent": int(utilisation.gpu),
                    "vram_used": human_bytes(int(nvml_memory.used)),
                    "vram_total": human_bytes(int(nvml_memory.total)),
                    "vram_percent": 100.0 * float(nvml_memory.used) / max(float(nvml_memory.total), 1.0),
                    "gpu_temp_c": int(temperature),
                }
                snap.update(nvml_fields)
            except Exception:
                pass
    except Exception:
        pass
    return snap


def system_usage(engine: Optional[LULUV2OptimizedEngine] = None) -> str:
    """Render a human-readable system report, optionally followed by the engine's stats.

    Args:
        engine: When given, ``engine.stats_text()`` is appended after a blank line.

    Returns:
        The report as a single newline-joined string.
    """
    snap = system_snapshot(engine)

    host_section = [
        f"OS: {platform.system()} {platform.release()}",
        f"Python RAM: {snap['python_ram']}",
        f"System RAM: {snap['system_ram']} ({snap['system_ram_percent']:.1f}%)",
        f"CPU: {snap['cpu_percent']:.1f}%",
    ]
    gpu_section = [
        f"GPU: {snap['gpu_name']}",
        f"VRAM used: {snap['vram_used']} / {snap['vram_total']} ({snap['vram_percent']:.1f}%)",
        f"VRAM allocated: {snap['vram_allocated']}",
        f"VRAM reserved: {snap['vram_reserved']}",
    ]

    # Utilisation and temperature are only present when NVML answered.
    gpu_util = snap.get("gpu_util_percent")
    if gpu_util is not None:
        gpu_section.append(f"GPU util: {gpu_util}%")
    gpu_temp = snap.get("gpu_temp_c")
    if gpu_temp is not None:
        gpu_section.append(f"GPU temp: {gpu_temp} C")

    report = host_section + [""] + gpu_section
    if engine is not None:
        report += ["", engine.stats_text()]
    return "\n".join(report)