Upload 13 files

Browse files

Files changed (13) hide show

.gitattributes +1 -32
.gitignore +11 -0
README.md +64 -1
app.py +523 -0
config.json +10 -0
generation_config.json +9 -0
luluv2_inference_runtime.py +842 -0
luluv2_live_inference.py +698 -0
luluv2_optimized_engine.py +1133 -0
requirements.txt +7 -0
run_chat.ps1 +2 -0
run_chat.sh +3 -0
run_inference.py +46 -0

.gitattributes CHANGED Viewed

@@ -1,35 +1,4 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
 *.pt filter=lfs diff=lfs merge=lfs -text
 *.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
 *.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text

 *.pt filter=lfs diff=lfs merge=lfs -text
 *.pth filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
 *.safetensors filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,11 @@

+# Local/private artifacts
+__pycache__/
+*.pyc
+.env
+*.log
+lulu_chats/
+luluv2_chats/
+*lulu_memory*.json
+private_artifacts/
+checkpoints/
+runs/

README.md CHANGED Viewed

@@ -1,3 +1,66 @@
 ---
-license: apache-2.0
 ---

 ---
+language:
+- en
+library_name: pytorch
+pipeline_tag: text-generation
+tags:
+- text-generation
+- bfloat16
+- inference-only
+- local-inference
 ---
+# LULUV2 native-bf16 local inference package
+This repository is prepared as an inference-only package for a native-bf16 LULUV2 checkpoint.
+It is designed so users can run the model directly in native bfloat16 without an extra conversion step.
+## What is included
+- `luluv2_inference_runtime.py` — stripped runtime loader and model architecture needed for inference only.
+- `luluv2_live_inference.py` — streaming inference engine.
+- `luluv2_optimized_engine.py` — optimized local inference engine with cache paths.
+- `app.py` — local Gradio chat UI.
+- `run_inference.py` — minimal command-line runner.
+- `tokenizer/` — local tokenizer files and chat template.
+## What is not included
+Private development tooling, data-preparation scripts, connector code, local chat logs, memory files, workspace artifacts, API keys, and secret tokens are not included.
+## Weights
+Place the native-bf16 checkpoint in the repository root as:
+```text
+LULUV2-bf16.pt
+```
+Weights are not included in this repository yet: the cleaned-up source this package was prepared from did not contain a `.pt` or `.safetensors` model file.
+If you publish weights on Hugging Face, keep them in native bfloat16. This package includes `.gitattributes` patterns for large weight files.
+## Install
+```bash
+pip install -r requirements.txt
+```
+## Run the local UI
+```bash
+python app.py --ckpt ./LULUV2-bf16.pt --model-py ./luluv2_inference_runtime.py --tokenizer-dir ./tokenizer --inbrowser
+```
+## Run from CLI
+```bash
+python run_inference.py --ckpt ./LULUV2-bf16.pt --prompt "Write a short introduction to LuluV2."
+```
+## Native bf16 note
+This package is intended for native bfloat16 inference and should run directly, without a conversion step. Hardware without bfloat16 support may require `--dtype fp16` or `--dtype fp32`, depending on the PyTorch version and device.
+## Safety and disclosure checklist before upload
+Before making the Hugging Face repository public, confirm that your base-model license permits redistribution of the final weights and that any legally required notices are present in the model card or repository files.

app.py ADDED Viewed

	@@ -0,0 +1,523 @@

+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+LULUV2 Pro local chat UI.
+A clean ChatGPT-style desktop UI for the fine-tuned LULUV2 checkpoint.
+It keeps the important local features only:
+- chat inference
+- live token streaming
+- new chat / save / load chats
+- persistent memory notes
+- live edge monitor: tok/s, RAM, VRAM, GPU, pass2 metrics
+- 32K context controls and test prompt helper
+Run:
+    python ./app.py --ckpt ./LULUV2-bf16.pt --model-py ./luluv2_inference_runtime.py --inbrowser
+"""
+from __future__ import annotations
+import argparse
+import json
+import os
+import re
+from datetime import datetime
+from pathlib import Path
+from typing import Any, Dict, List, Tuple
+import gradio as gr
+from luluv2_live_inference import (
+    GenerationConfig,
+    LULUV2LiveEngine,
+    clean_text,
+    normalize_history,
+    system_usage,
+)
+# Display name: used as the page title and recorded in saved chat / memory JSON.
+APP_NAME = "LuluV2"
+# Local storage locations; both can be overridden through environment variables.
+CHAT_DIR = Path(os.getenv("LULU_CHAT_DIR", "lulu_chats"))
+MEMORY_FILE = Path(os.getenv("LULU_MEMORY_FILE", "lulu_memory.json"))
+# Initial value of the system prompt box, and the fallback when that box is empty.
+DEFAULT_SYSTEM_PROMPT = """Your name is LuluV2.
+You are a local AI assistant made by Open Machine.
+You run offline from the LULUV2 VWM checkpoint.
+Answer directly and naturally.
+Use Markdown for structure.
+When writing code, use fenced code blocks with the correct language tag.
+Do not output role tags, hidden scratchpad text, JSON UI fragments, or {'type':'text'} blocks.
+"""
+# Named sampling presets; choosing one in the UI overwrites the matching sliders.
+PRESETS = {
+    "Balanced": dict(temperature=0.65, top_k=40, top_p=0.90, min_p=0.03, repetition_penalty=1.10, frequency_penalty=0.02, max_new_tokens=768),
+    "Precise": dict(temperature=0.35, top_k=30, top_p=0.84, min_p=0.04, repetition_penalty=1.14, frequency_penalty=0.03, max_new_tokens=512),
+    "Code": dict(temperature=0.42, top_k=40, top_p=0.88, min_p=0.03, repetition_penalty=1.10, frequency_penalty=0.02, max_new_tokens=1200),
+    "Long 32K": dict(temperature=0.55, top_k=50, top_p=0.90, min_p=0.025, repetition_penalty=1.08, frequency_penalty=0.02, max_new_tokens=1200),
+}
+def safe_int(value: Any, default: int, low: int | None = None, high: int | None = None) -> int:
+    try:
+        value = int(value)
+    except Exception:
+        value = default
+    if low is not None:
+        value = max(low, value)
+    if high is not None:
+        value = min(high, value)
+    return value
+def clamp(value: Any, low: float, high: float, default: float) -> float:
+    try:
+        value = float(value)
+    except Exception:
+        return default
+    return max(low, min(high, value))
+def esc(text: Any) -> str:
+    return str(text).replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;").replace('"', "&quot;")
+def status_html(title: str, detail: str = "", tone: str = "neutral") -> str:
+    tone = tone if tone in {"neutral", "good", "warn", "bad", "live"} else "neutral"
+    return f"""
+    <div class="status-pill status-{tone}">
+      <span class="pulse-dot"></span>
+      <div><b>{esc(title)}</b><small>{esc(detail)}</small></div>
+    </div>
+    """
+def read_memory() -> str:
+    if not MEMORY_FILE.exists():
+        return ""
+    try:
+        return str(json.loads(MEMORY_FILE.read_text(encoding="utf-8")).get("memory_notes", ""))
+    except Exception:
+        return ""
+def write_memory(memory_notes: str) -> Tuple[str, str]:
+    MEMORY_FILE.write_text(
+        json.dumps(
+            {"memory_notes": memory_notes or "", "saved_at": datetime.now().isoformat(timespec="seconds"), "app": APP_NAME},
+            indent=2,
+            ensure_ascii=False,
+        ),
+        encoding="utf-8",
+    )
+    return str(MEMORY_FILE), status_html("Memory saved", str(MEMORY_FILE), "good")
+def safe_chat_filename(chat_name: str, suffix: str) -> Path:
+    CHAT_DIR.mkdir(parents=True, exist_ok=True)
+    base = re.sub(r"[^a-zA-Z0-9_-]+", "_", chat_name or "chat").strip("_") or "chat"
+    return CHAT_DIR / f"{base}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.{suffix}"
+def list_saved_chats() -> List[str]:
+    CHAT_DIR.mkdir(parents=True, exist_ok=True)
+    return [str(p) for p in sorted(CHAT_DIR.glob("*.json"), key=lambda x: x.stat().st_mtime, reverse=True)]
+def save_chat(history: Any, chat_name: str, memory_notes: str) -> Tuple[str, str, List[str]]:
+    path = safe_chat_filename(chat_name or "Lulu chat", "json")
+    data = {
+        "chat_name": chat_name or "Lulu chat",
+        "history": normalize_history(history),
+        "memory_notes": memory_notes or "",
+        "saved_at": datetime.now().isoformat(timespec="seconds"),
+        "app": APP_NAME,
+    }
+    path.write_text(json.dumps(data, indent=2, ensure_ascii=False), encoding="utf-8")
+    return str(path), status_html("Chat saved", path.name, "good"), list_saved_chats()
+def load_chat(path: str) -> Tuple[List[Dict[str, str]], str, str, str]:
+    if not path:
+        return [], "New chat", read_memory(), status_html("No saved chat selected", "Pick a JSON file from the sidebar.", "warn")
+    try:
+        data = json.loads(Path(path).read_text(encoding="utf-8"))
+    except Exception as exc:
+        return [], "New chat", read_memory(), status_html("Load failed", f"{type(exc).__name__}: {exc}", "bad")
+    return (
+        normalize_history(data.get("history", [])),
+        str(data.get("chat_name") or Path(path).stem),
+        str(data.get("memory_notes", read_memory())),
+        status_html("Chat loaded", Path(path).name, "good"),
+    )
+def chat_to_markdown(history: Any, chat_name: str) -> str:
+    lines = [f"# {clean_text(chat_name) or 'LuluV2 chat'}", ""]
+    for item in normalize_history(history):
+        lines.append("## You" if item["role"] == "user" else "## LuluV2")
+        lines.append(item["content"])
+        lines.append("")
+    return "\n".join(lines).strip() + "\n"
+def export_markdown(history: Any, chat_name: str) -> Tuple[str, str]:
+    path = safe_chat_filename(chat_name or "Lulu chat", "md")
+    path.write_text(chat_to_markdown(history, chat_name), encoding="utf-8")
+    return str(path), status_html("Markdown exported", path.name, "good")
+def postprocess_answer(text: Any, final: bool = False) -> str:
+    """Clean model output for display.
+    Passes ``text`` through ``clean_text``, strips trailing UI artifacts,
+    limits runs of newlines, and returns the stripped result. When ``final``
+    is true, an unterminated code fence is closed.
+    """
+    text = clean_text(text)
+    # Remove common generated UI artifacts from older chat data.
+    # First pattern: a trailing ``[{'text' ... 'type': 'text'}]`` fragment at the very end.
+    text = re.sub(r"\n?\s*\[\s*\{\s*['\"]text['\"].*?['\"]type['\"]\s*:\s*['\"]text['\"]\s*\}\s*\]\s*$", "", text, flags=re.S)
+    # Second pattern: a dangling ``type: 'text'`` at the very end (case-insensitive).
+    text = re.sub(r"\n?\s*type\s*:\s*['\"]text['\"]\s*$", "", text, flags=re.I)
+    # Collapse four or more consecutive newlines down to three.
+    text = re.sub(r"\n{4,}", "\n\n\n", text)
+    # An odd number of ``` markers means a fence was left open; close it, but
+    # only on the final pass so partial streaming output is not altered.
+    if final and text.count("```") % 2 == 1:
+        text += "\n```"
+    return text.strip()
+def metric_cards(engine: LULUV2LiveEngine, max_context: int) -> str:
+    stats = engine.stats_dict()
+    sys = stats.get("system", {})
+    model = stats.get("model", {})
+    pass_kl = stats.get("pass1_pass2_kl")
+    pass_cos = stats.get("pass1_pass2_logit_cosine")
+    pass_text = "base"
+    if pass_kl is not None and pass_cos is not None:
+        pass_text = f"KL {pass_kl:.3f} / cos {pass_cos:.3f}"
+    gpu_util = sys.get("gpu_util_percent")
+    gpu_temp = sys.get("gpu_temp_c")
+    gpu_text = "n/a" if gpu_util is None else f"{gpu_util}%"
+    temp_text = "n/a" if gpu_temp is None else f"{gpu_temp}°C"
+    return f"""
+    <div class="monitor-bar">
+      <div class="mon-card hot"><b>{float(stats.get('tokens_per_sec', 0.0)):.1f}</b><span>tok/s</span></div>
+      <div class="mon-card"><b>{int(stats.get('generated_tokens', 0))}</b><span>tokens</span></div>
+      <div class="mon-card"><b>{sys.get('python_ram', 'n/a')}</b><span>Python RAM</span></div>
+      <div class="mon-card"><b>{sys.get('vram_used', 'n/a')}</b><span>VRAM / {sys.get('vram_total', 'n/a')}</span></div>
+      <div class="mon-card"><b>{gpu_text}</b><span>GPU · {temp_text}</span></div>
+      <div class="mon-card"><b>{max_context//1024}K</b><span>context</span></div>
+      <div class="mon-card"><b>{model.get('has_pass2')}</b><span>pass2</span></div>
+      <div class="mon-card wide"><b>{pass_text}</b><span>pass1 → pass2</span></div>
+    </div>
+    """
+def make_32k_prompt() -> str:
+    seed = (
+        "We are testing a 32K context window for LuluV2. "
+        "Remember these constraints: answer directly, keep code formatted, and summarize the relevant details. "
+        "The repeated context below is synthetic filler for a long-context stress test.\n\n"
+    )
+    block = (
+        "Section: VWM reconstruction. A model can use A/B atoms and c-code recipes to reconstruct behavior online. "
+        "Pass 1 builds a scaffold, pass 2 refines it, and the UI should keep live tokens/sec, RAM, VRAM, and pass metrics visible. "
+        "When asked at the end, explain the three key ideas and provide a tiny Python example.\n"
+    )
+    # Character length is approximate; token count depends on tokenizer. This usually lands around a long 20K-32K style prompt.
+    return seed + (block * 520) + "\nFinal question: What are the three key ideas above, and can you show a tiny Python class for tracking tokens per second?"
+def create_chatbot():
+    kwargs = dict(
+        value=[],
+        elem_id="chatbot",
+        height=760,
+        show_label=False,
+        avatar_images=(None, None),
+        bubble_full_width=False,
+    )
+    try:
+        return gr.Chatbot(type="messages", render_markdown=True, sanitize_html=True, **kwargs)
+    except TypeError:
+        try:
+            return gr.Chatbot(render_markdown=True, sanitize_html=True, **kwargs)
+        except TypeError:
+            return gr.Chatbot(**kwargs)
+def build_app(engine: LULUV2LiveEngine, default_context: int):
+    def respond(
+        message,
+        history,
+        chat_name,
+        system_prompt,
+        memory_notes,
+        preset,
+        history_turns,
+        max_context_tokens,
+        max_new_tokens,
+        temperature,
+        top_k,
+        top_p,
+        min_p,
+        repetition_penalty,
+        frequency_penalty,
+        greedy,
+        no_repeat_ngram,
+        stream_every,
+        show_pass_metrics,
+    ):
+        hist = normalize_history(history)
+        msg = clean_text(message)
+        max_context_tokens = safe_int(max_context_tokens, default_context, 128, 32768)
+        if not msg:
+            yield "", hist, status_html("Empty message", "Type something first.", "warn"), metric_cards(engine, max_context_tokens), engine.token_trace_text(), engine.stats_dict()
+            return
+        # Preset only affects initial slider defaults; live slider values are honored.
+        prompt = engine.build_chat_prompt(
+            message=msg,
+            history=hist,
+            system_prompt=system_prompt or DEFAULT_SYSTEM_PROMPT,
+            memory_notes=memory_notes or "",
+            history_turns=safe_int(history_turns, 4, 0, 32),
+        )
+        cfg = GenerationConfig(
+            max_new_tokens=safe_int(max_new_tokens, 768, 1, 8192),
+            temperature=clamp(temperature, 0.0, 2.0, 0.65),
+            top_k=safe_int(top_k, 40, 0, 500),
+            top_p=clamp(top_p, 0.01, 1.0, 0.90),
+            min_p=clamp(min_p, 0.0, 0.5, 0.03),
+            repetition_penalty=clamp(repetition_penalty, 1.0, 3.0, 1.10),
+            frequency_penalty=clamp(frequency_penalty, 0.0, 3.0, 0.02),
+            greedy=bool(greedy),
+            no_repeat_ngram=safe_int(no_repeat_ngram, 4, 0, 16),
+            stream_every=safe_int(stream_every, 1, 1, 64),
+            max_context_tokens=max_context_tokens,
+            return_pass_metrics=bool(show_pass_metrics),
+        )
+        hist.append({"role": "user", "content": msg})
+        hist.append({"role": "assistant", "content": "Thinking..."})
+        yield "", hist, status_html("Generating", "LuluV2 is reconstructing tokens live.", "live"), metric_cards(engine, max_context_tokens), engine.token_trace_text(), engine.stats_dict()
+        final = ""
+        try:
+            for partial in engine.generate(prompt, cfg):
+                final = postprocess_answer(partial, final=False)
+                hist[-1] = {"role": "assistant", "content": final or "..."}
+                yield "", hist, status_html("Generating", f"{engine.last_stats.generated_tokens} tokens · {engine.last_stats.tokens_per_sec:.1f} tok/s", "live"), metric_cards(engine, max_context_tokens), engine.token_trace_text(), engine.stats_dict()
+        except Exception as exc:
+            hist[-1] = {"role": "assistant", "content": f"Generation failed:\n\n```text\n{type(exc).__name__}: {exc}\n```"}
+            yield msg, hist, status_html("Generation failed", f"{type(exc).__name__}: {exc}", "bad"), metric_cards(engine, max_context_tokens), engine.token_trace_text(), engine.stats_dict()
+            return
+        final = postprocess_answer(final, final=True) or "I’m not sure how to answer that yet."
+        hist[-1] = {"role": "assistant", "content": final}
+        yield "", hist, status_html("Done", f"{engine.last_stats.generated_tokens} tokens · {engine.last_stats.tokens_per_sec:.1f} tok/s", "good"), metric_cards(engine, max_context_tokens), engine.token_trace_text(), engine.stats_dict()
+    def regenerate(
+        history,
+        chat_name,
+        system_prompt,
+        memory_notes,
+        preset,
+        history_turns,
+        max_context_tokens,
+        max_new_tokens,
+        temperature,
+        top_k,
+        top_p,
+        min_p,
+        repetition_penalty,
+        frequency_penalty,
+        greedy,
+        no_repeat_ngram,
+        stream_every,
+        show_pass_metrics,
+    ):
+        hist = normalize_history(history)
+        if not hist:
+            yield "", hist, status_html("Nothing to regenerate", "Send a message first.", "warn"), metric_cards(engine, safe_int(max_context_tokens, default_context)), engine.token_trace_text(), engine.stats_dict()
+            return
+        work = hist[:]
+        if work and work[-1]["role"] == "assistant":
+            work = work[:-1]
+        if not work or work[-1]["role"] != "user":
+            yield "", hist, status_html("Cannot regenerate", "Last turn is not a user message.", "warn"), metric_cards(engine, safe_int(max_context_tokens, default_context)), engine.token_trace_text(), engine.stats_dict()
+            return
+        last_msg = work[-1]["content"]
+        prev = work[:-1]
+        yield from respond(last_msg, prev, chat_name, system_prompt, memory_notes, preset, history_turns, max_context_tokens, max_new_tokens, temperature, top_k, top_p, min_p, repetition_penalty, frequency_penalty, greedy, no_repeat_ngram, stream_every, show_pass_metrics)
+    def new_chat():
+        return [], "New chat", status_html("New chat", "Fresh conversation. Memory notes are kept.", "good")
+    def forget_last(history):
+        hist = normalize_history(history)
+        if len(hist) >= 2:
+            return hist[:-2], status_html("Forgot last turn", "Removed the latest exchange.", "good")
+        return [], status_html("Nothing to forget", "No full turn to remove.", "warn")
+    def apply_preset(name):
+        p = PRESETS.get(name, PRESETS["Balanced"])
+        context = 32768 if name == "Long 32K" else default_context
+        return p["temperature"], p["top_k"], p["top_p"], p["min_p"], p["repetition_penalty"], p["frequency_penalty"], p["max_new_tokens"], context
+    css = """
+    :root{
+      --bg:#05060d;--panel:#0b1020;--panel2:#101827;--line:rgba(148,163,184,.16);
+      --text:#edf2ff;--muted:#94a3b8;--accent:#8b5cf6;--accent2:#22d3ee;--good:#22c55e;--bad:#ef4444;
+    }
+    html, body, .gradio-container{
+      background: radial-gradient(circle at top left, rgba(139,92,246,.23), transparent 34%),
+                  radial-gradient(circle at top right, rgba(34,211,238,.14), transparent 30%),
+                  linear-gradient(180deg,#05060d,#070a12 62%,#02030a)!important;
+      color:var(--text)!important;
+    }
+    .gradio-container{max-width:1680px!important;margin:auto!important;font-family:Inter,ui-sans-serif,system-ui,-apple-system,BlinkMacSystemFont,'Segoe UI',sans-serif!important;}
+    footer{display:none!important}.main-wrap{gap:18px!important}.sidebar{padding:16px;border:1px solid var(--line);border-radius:28px;background:rgba(9,14,28,.76);box-shadow:0 20px 70px rgba(0,0,0,.32)}
+    .brand{padding:10px 4px 18px}.brand h1{margin:0;font-size:32px;letter-spacing:-.06em;color:#fff}.brand p{margin:5px 0 0;color:var(--muted);font-size:13px}.brand .badge{display:inline-flex;margin-top:12px;padding:7px 10px;border-radius:999px;border:1px solid rgba(34,211,238,.28);background:rgba(8,145,178,.12);color:#cffafe;font-weight:800;font-size:12px}
+    .chat-shell{padding:16px;border:1px solid var(--line);border-radius:32px;background:rgba(5,8,18,.62);box-shadow:0 30px 110px rgba(0,0,0,.38)}
+    #chatbot{height:760px!important;border:0!important;background:transparent!important;overflow:hidden!important}.message{font-size:15.5px!important;line-height:1.62!important}.message-wrap{max-width:900px!important}.bot .message, .assistant .message{background:rgba(15,23,42,.72)!important;border:1px solid rgba(148,163,184,.13)!important;border-radius:22px!important}.user .message{background:linear-gradient(135deg,rgba(124,58,237,.70),rgba(59,130,246,.42))!important;border:1px solid rgba(167,139,250,.35)!important;border-radius:22px!important;color:white!important}
+    #chatbot pre{background:#101827!important;border:1px solid rgba(148,163,184,.22)!important;border-radius:18px!important;padding:16px!important;box-shadow:inset 0 1px 0 rgba(255,255,255,.04)!important}#chatbot code{font-family:'JetBrains Mono','Cascadia Code','SFMono-Regular',Consolas,monospace!important;font-size:14px!important}#chatbot p{margin:0 0 .7em!important}#chatbot ul,#chatbot ol{margin-top:.3em!important}
+    .composer-card{display:flex;gap:12px;align-items:end;padding:10px;border-radius:26px;border:1px solid rgba(139,92,246,.28);background:rgba(2,6,23,.80);box-shadow:0 20px 70px rgba(139,92,246,.12)}#composer textarea{min-height:72px!important;max-height:190px!important;background:transparent!important;border:0!important;color:#fff!important;font-size:16px!important;line-height:1.5!important;box-shadow:none!important}.input-container{border:0!important;background:transparent!important}.form{border:0!important;background:transparent!important}label{color:#cbd5e1!important;font-weight:700!important}
+    button{border-radius:16px!important;font-weight:850!important;border:1px solid rgba(148,163,184,.16)!important;box-shadow:0 10px 28px rgba(0,0,0,.22)!important}.send-btn{min-height:56px!important;background:linear-gradient(135deg,#8b5cf6,#06b6d4)!important;color:white!important}.side-btn button,.side-btn{width:100%!important}
+    .monitor-bar{display:grid;grid-template-columns:repeat(8,minmax(110px,1fr));gap:10px;margin:0 0 12px}.mon-card{padding:12px 13px;border:1px solid var(--line);border-radius:18px;background:rgba(15,23,42,.78);min-height:64px}.mon-card b{display:block;font-size:20px;color:#fff;white-space:nowrap}.mon-card span{display:block;color:var(--muted);font-size:11px;margin-top:3px}.mon-card.hot{background:linear-gradient(135deg,rgba(139,92,246,.30),rgba(34,211,238,.16));border-color:rgba(34,211,238,.30)}.mon-card.wide b{font-size:15px}.status-pill{display:flex;align-items:center;gap:10px;margin:0 0 12px;padding:10px 13px;border-radius:18px;border:1px solid var(--line);background:rgba(2,6,23,.72)}.status-pill b{display:block}.status-pill small{display:block;color:var(--muted);font-size:12px}.pulse-dot{width:10px;height:10px;border-radius:99px;background:var(--accent2);box-shadow:0 0 0 7px rgba(34,211,238,.10),0 0 25px rgba(34,211,238,.55)}.status-good .pulse-dot{background:var(--good);box-shadow:0 0 0 7px rgba(34,197,94,.12),0 0 25px rgba(34,197,94,.5)}.status-bad .pulse-dot{background:var(--bad)}.status-live .pulse-dot{animation:pulse 1.1s infinite}@keyframes pulse{0%{transform:scale(1)}50%{transform:scale(1.45)}100%{transform:scale(1)}}
+    .gr-box,.gr-panel,.block{background:transparent!important;border-color:var(--line)!important}.sidebar textarea,.sidebar input,.sidebar select,.sidebar .wrap{background:rgba(2,6,23,.62)!important;color:#e5e7eb!important;border-color:rgba(148,163,184,.16)!important;border-radius:14px!important}.small-note{color:#94a3b8;font-size:12px}.tokenbox textarea,.jsonbox textarea{font-family:'JetBrains Mono','Cascadia Code',Consolas,monospace!important;font-size:12px!important;background:#060914!important}
+    @media(max-width:1100px){.monitor-bar{grid-template-columns:repeat(2,1fr)}.sidebar{display:none}.chat-shell{padding:8px}}
+    """
+    theme = gr.themes.Base(primary_hue="violet", secondary_hue="cyan", neutral_hue="slate")
+    with gr.Blocks(title=APP_NAME, css=css, theme=theme) as demo:
+        with gr.Row(elem_classes=["main-wrap"]):
+            with gr.Column(scale=1, min_width=270, elem_classes=["sidebar"]):
+                gr.HTML("""
+                <div class="brand">
+                  <h1>LuluV2</h1>
+                  <p>Offline VWM local assistant.</p>
+                  <span class="badge">LOCAL EDGE MODE</span>
+                </div>
+                """)
+                new_btn = gr.Button("+ New chat", variant="primary", elem_classes=["side-btn"])
+                save_btn = gr.Button("Save chat", elem_classes=["side-btn"])
+                saved_path = gr.Textbox(label="Last saved path", interactive=False, visible=False)
+                saved_chats = gr.Dropdown(choices=list_saved_chats(), label="Saved chats", value=None, interactive=True)
+                with gr.Row():
+                    refresh_chats = gr.Button("Refresh")
+                    load_btn = gr.Button("Load")
+                export_btn = gr.Button("Export .md", elem_classes=["side-btn"])
+                export_path = gr.Textbox(label="Export path", interactive=False, visible=False)
+                with gr.Accordion("Memory", open=True):
+                    memory_notes = gr.Textbox(label="Persistent memory notes", value=read_memory(), lines=8, placeholder="Things Lulu should remember locally...")
+                    memory_path = gr.Textbox(label="Memory path", interactive=False, visible=False)
+                    save_mem_btn = gr.Button("Save memory")
+                with gr.Accordion("Live tokens", open=False):
+                    token_trace = gr.Textbox(label="Recent generated tokens", value="No tokens generated yet.", lines=14, elem_classes=["tokenbox"])
+                with gr.Accordion("Advanced", open=False):
+                    chat_name = gr.Textbox(label="Chat name", value="New chat")
+                    preset = gr.Dropdown(label="Preset", choices=list(PRESETS.keys()), value="Balanced")
+                    system_prompt = gr.Textbox(label="System prompt", value=DEFAULT_SYSTEM_PROMPT, lines=9)
+                    history_turns = gr.Slider(0, 24, value=4, step=1, label="History turns sent")
+                    max_context_tokens = gr.Slider(128, 32768, value=default_context, step=128, label="Max context tokens")
+                    max_new_tokens = gr.Slider(16, 8192, value=768, step=16, label="Max new tokens")
+                    temperature = gr.Slider(0.0, 2.0, value=0.65, step=0.01, label="Temperature")
+                    top_k = gr.Slider(0, 500, value=40, step=1, label="Top-k")
+                    top_p = gr.Slider(0.01, 1.0, value=0.90, step=0.01, label="Top-p")
+                    min_p = gr.Slider(0.0, 0.5, value=0.03, step=0.005, label="Min-p")
+                    repetition_penalty = gr.Slider(1.0, 3.0, value=1.10, step=0.01, label="Repetition penalty")
+                    frequency_penalty = gr.Slider(0.0, 3.0, value=0.02, step=0.01, label="Frequency penalty")
+                    greedy = gr.Checkbox(value=False, label="Greedy")
+                    no_repeat_ngram = gr.Slider(0, 16, value=4, step=1, label="No-repeat ngram")
+                    stream_every = gr.Slider(1, 64, value=1, step=1, label="Stream every N tokens")
+                    show_pass_metrics = gr.Checkbox(value=True, label="Measure pass1/pass2 before generation")
+                    insert_32k = gr.Button("Insert 32K stress prompt")
+            with gr.Column(scale=4, elem_classes=["chat-shell"]):
+                monitor = gr.HTML(metric_cards(engine, default_context))
+                status = gr.HTML(status_html("Ready", f"{engine.model_info.get('checkpoint_size')} checkpoint · {engine.model_info.get('device')}", "good"))
+                chatbot = create_chatbot()
+                with gr.Row(elem_classes=["composer-card"]):
+                    msg = gr.Textbox(show_label=False, placeholder="Message LuluV2...", lines=3, elem_id="composer", scale=12)
+                    send_btn = gr.Button("Send", variant="primary", elem_classes=["send-btn"], scale=2)
+                with gr.Row():
+                    stop_btn = gr.Button("Stop")
+                    regen_btn = gr.Button("Regenerate")
+                    forget_btn = gr.Button("Forget last turn")
+                    prompt_32k_btn = gr.Button("Try 32K prompt")
+                with gr.Accordion("Raw metrics", open=False):
+                    raw_metrics = gr.JSON(label="Raw metrics")
+                    usage_text = gr.Textbox(label="RAM / VRAM / model stats", value=system_usage(engine), lines=18, elem_classes=["jsonbox"])
+        # Components passed to respond(); the order must match its parameter list.
+        inputs = [
+            msg, chatbot, chat_name, system_prompt, memory_notes, preset,
+            history_turns, max_context_tokens, max_new_tokens, temperature, top_k, top_p,
+            min_p, repetition_penalty, frequency_penalty, greedy, no_repeat_ngram,
+            stream_every, show_pass_metrics,
+        ]
+        # Components updated by every tuple that respond()/regenerate() yield.
+        outputs = [msg, chatbot, status, monitor, token_trace, raw_metrics]
+        # Both the Send button and pressing Enter in the textbox start a generation.
+        send_event = send_btn.click(respond, inputs=inputs, outputs=outputs)
+        enter_event = msg.submit(respond, inputs=inputs, outputs=outputs)
+        # Stop runs no function of its own; it only cancels the listed events.
+        stop_btn.click(fn=None, inputs=None, outputs=None, cancels=[send_event, enter_event])
+        # Same as ``inputs`` minus the textbox: regenerate() takes the message from the history.
+        regen_inputs = [
+            chatbot, chat_name, system_prompt, memory_notes, preset,
+            history_turns, max_context_tokens, max_new_tokens, temperature, top_k, top_p,
+            min_p, repetition_penalty, frequency_penalty, greedy, no_repeat_ngram,
+            stream_every, show_pass_metrics,
+        ]
+        regen_event = regen_btn.click(regenerate, inputs=regen_inputs, outputs=outputs)
+        # Second Stop listener so that regeneration can be cancelled as well.
+        stop_btn.click(fn=None, inputs=None, outputs=None, cancels=[regen_event])
+        # Chat management, memory and export buttons.
+        new_btn.click(new_chat, outputs=[chatbot, chat_name, status])
+        forget_btn.click(forget_last, inputs=[chatbot], outputs=[chatbot, status])
+        save_btn.click(save_chat, inputs=[chatbot, chat_name, memory_notes], outputs=[saved_path, status, saved_chats])
+        refresh_chats.click(lambda: gr.update(choices=list_saved_chats()), outputs=[saved_chats])
+        load_btn.click(load_chat, inputs=[saved_chats], outputs=[chatbot, chat_name, memory_notes, status])
+        export_btn.click(export_markdown, inputs=[chatbot, chat_name], outputs=[export_path, status])
+        save_mem_btn.click(write_memory, inputs=[memory_notes], outputs=[memory_path, status])
+        # Changing the preset overwrites the sampling sliders and the context limit.
+        preset.change(apply_preset, inputs=[preset], outputs=[temperature, top_k, top_p, min_p, repetition_penalty, frequency_penalty, max_new_tokens, max_context_tokens])
+        # Both stress-prompt buttons only fill the message box; nothing is sent.
+        insert_32k.click(lambda: make_32k_prompt(), outputs=[msg])
+        prompt_32k_btn.click(lambda: make_32k_prompt(), outputs=[msg])
+    return demo
def parse_args():
    """Parse the command-line options for the local chat app.

    Returns:
        The ``argparse.Namespace`` built from ``sys.argv``. Dashes in option
        names become underscores (``--model-py`` -> ``model_py``).
    """
    parser = argparse.ArgumentParser()
    # (flag, add_argument keyword arguments), listed in registration order so
    # the generated --help text keeps the same option order.
    option_table = (
        ("--ckpt", {"default": "LULU2_instruct_ddp.pt"}),
        ("--model-py", {"default": "luluv2_inference_runtime.py"}),
        ("--tokenizer-dir", {"default": "tokenizer"}),
        ("--host", {"default": "127.0.0.1"}),
        ("--port", {"type": int, "default": 7862}),
        ("--device", {"default": "cuda"}),
        ("--dtype", {"default": "bf16"}),
        ("--max-context", {"type": int, "default": 32768}),
        ("--share", {"action": "store_true"}),
        ("--inbrowser", {"action": "store_true"}),
        ("--base-only", {"action": "store_true"}),
    )
    for flag, options in option_table:
        parser.add_argument(flag, **options)
    return parser.parse_args()
def main():
    """Load the inference engine and serve the chat UI.

    Reads the CLI options, defaults the Hugging Face offline environment
    variables to "1" (values already set by the caller are kept), builds the
    engine from local files only, and launches the app returned by
    ``build_app`` with a queue concurrency limit of one.
    """
    args = parse_args()
    for offline_flag in ("HF_HUB_OFFLINE", "TRANSFORMERS_OFFLINE"):
        os.environ.setdefault(offline_flag, "1")

    engine = LULUV2LiveEngine(
        ckpt_path=args.ckpt,
        model_py=args.model_py,
        tokenizer_dir=args.tokenizer_dir,
        device=args.device,
        dtype=args.dtype,
        local_files_only=True,
        no_config_download=True,
        force_base_only=bool(args.base_only),
    )

    # NOTE(review): safe_int is defined elsewhere; the arguments look like
    # (value, default, minimum, maximum) - confirm against its definition.
    context_limit = safe_int(args.max_context, 32768, 128, 32768)
    demo = build_app(engine, default_context=context_limit)

    queued_app = demo.queue(default_concurrency_limit=1)
    queued_app.launch(
        server_name=args.host,
        server_port=int(args.port),
        share=bool(args.share),
        inbrowser=bool(args.inbrowser),
        show_error=True,
    )


if __name__ == "__main__":
    main()

config.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "model_type": "luluv2",
+  "architectures": [
+    "Lulu2ForCausalLM"
+  ],
+  "torch_dtype": "bfloat16",
+  "tokenizer_class": "PreTrainedTokenizerFast",
+  "auto_map": {},
+  "inference_only_package": true
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "max_new_tokens": 512,
+  "temperature": 0.65,
+  "top_k": 40,
+  "top_p": 0.9,
+  "do_sample": true,
+  "eos_token_id": 151645,
+  "pad_token_id": 151643
+}

luluv2_inference_runtime.py ADDED Viewed

	@@ -0,0 +1,842 @@

+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+LULUV2 inference-only runtime.
+This file intentionally contains only the code needed to load and run a
+standalone native-bf16 LULUV2 checkpoint: the runtime loader, tokenizer
+bridge, decoder modules, and two-pass inference path needed for local
+generation.
+Runtime behavior:
+- loads a local checkpoint supplied by the user/repo;
+- uses local tokenizer files;
+- does not download or load any external model weights;
+- preserves the VWM/two-pass inference path when present in the checkpoint.
+"""
+from __future__ import annotations
+import json
+import math
+import os
+import time
+from dataclasses import dataclass
+from types import SimpleNamespace
+from typing import Dict, Optional, Tuple
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
# Optional dependency: transformers may be missing or broken. Record the
# failure instead of raising so the local tokenizer fallback can be used.
_TRANSFORMERS_IMPORT_ERROR = None
_HFAutoTokenizer = None
_HFAutoConfig = None
try:
    from transformers import AutoTokenizer as _HFAutoTokenizer
except Exception as _e:
    _TRANSFORMERS_IMPORT_ERROR = _e
else:
    # AutoConfig is only attempted once the tokenizer import has succeeded;
    # its absence alone is not recorded as an import error.
    try:
        from transformers import AutoConfig as _HFAutoConfig
    except Exception:
        _HFAutoConfig = None
class _TokenOutput(dict):
    """Dict of tokenizer outputs whose keys can also be read as attributes."""

    def __getattr__(self, name):
        # Only reached when normal attribute lookup fails, so real dict
        # attributes and methods take precedence over stored keys.
        try:
            value = self[name]
        except KeyError as missing:
            raise AttributeError(name) from missing
        return value

    def to(self, device):
        """Return a new ``_TokenOutput`` with every tensor value moved to *device*.

        Non-tensor values are carried over unchanged.
        """
        return _TokenOutput(
            (key, value.to(device) if torch.is_tensor(value) else value)
            for key, value in self.items()
        )
class _LocalTokenizer:
    """Minimal tokenizer used when ``transformers`` cannot be imported.

    Wraps a ``tokenizers.Tokenizer`` loaded from a local ``tokenizer.json``
    and exposes the subset of the Hugging Face tokenizer interface this
    runtime calls: ``__call__``, ``decode``, ``apply_chat_template``,
    ``__len__`` and the eos/pad token attributes.
    """

    def __init__(self, path: str, tokenizer_file: Optional[str] = None, **kwargs):
        """Load the tokenizer and its special-token metadata from disk.

        Args:
            path: Directory expected to contain ``tokenizer.json``.
            tokenizer_file: Explicit path to a tokenizer JSON file; when given
                it is used instead of ``<path>/tokenizer.json`` and its
                directory is searched for the metadata files.
            **kwargs: Accepted for call compatibility and ignored.

        Raises:
            RuntimeError: If the ``tokenizers`` package cannot be imported.
            FileNotFoundError: If the tokenizer JSON file does not exist.
        """
        import json as _json
        try:
            from tokenizers import Tokenizer as _TokenizerCore
        except Exception as exc:
            raise RuntimeError(
                "transformers import failed and tokenizers is unavailable. "
                "Install tokenizers or use a matching torch/transformers pair."
            ) from exc
        self.name_or_path = path or tokenizer_file or "<local-tokenizer>"
        if tokenizer_file:
            tok_file = tokenizer_file
            base_dir = os.path.dirname(os.path.abspath(tok_file))
        else:
            base_dir = os.path.abspath(path)
            tok_file = os.path.join(base_dir, "tokenizer.json")
        if not os.path.exists(tok_file):
            raise FileNotFoundError(f"Local tokenizer.json not found: {tok_file}")
        self._tok = _TokenizerCore.from_file(tok_file)
        self.vocab_size = int(self._tok.get_vocab_size())
        self.model_max_length = 10**9
        self.truncation_side = "left"
        self.chat_template = None
        self.eos_token = None
        self.pad_token = None

        # Metadata is best-effort: an unreadable or malformed file is ignored.
        # special_tokens_map.json is read second, so its eos/pad values win.
        cfg_path = os.path.join(base_dir, "tokenizer_config.json")
        sp_path = os.path.join(base_dir, "special_tokens_map.json")
        for meta_path in (cfg_path, sp_path):
            if not os.path.exists(meta_path):
                continue
            try:
                # Context manager so the handle is closed even if parsing fails.
                with open(meta_path, "r", encoding="utf-8") as fh:
                    data = _json.load(fh)
            except (OSError, ValueError):
                data = {}
            if not isinstance(data, dict):
                # Valid JSON that is not an object carries no usable fields.
                data = {}
            if self.chat_template is None and isinstance(data.get("chat_template"), str):
                self.chat_template = data.get("chat_template")
            for key in ("eos_token", "pad_token"):
                val = data.get(key)
                if isinstance(val, dict):
                    # Token entries may be objects with the text under "content".
                    val = val.get("content")
                if isinstance(val, str):
                    setattr(self, key, val)

        if self.eos_token is None:
            # No metadata: pick the first known end marker present in the vocab.
            for cand in ("<|im_end|>", "<|endoftext|>", "</s>"):
                if self._tok.token_to_id(cand) is not None:
                    self.eos_token = cand
                    break
        if self.pad_token is None:
            self.pad_token = self.eos_token
        self.eos_token_id = self._tok.token_to_id(self.eos_token) if self.eos_token else None
        self.pad_token_id = self._tok.token_to_id(self.pad_token) if self.pad_token else self.eos_token_id

    def __len__(self):
        return self.vocab_size

    def __call__(self, text, return_tensors=None, truncation=False, max_length=None, add_special_tokens=True, **kwargs):
        """Encode one string or a batch of strings into token ids.

        A list/tuple is encoded item by item and right-padded with the pad id
        (0 when no pad id is known) to the longest item. With
        ``return_tensors="pt"`` ``input_ids`` is a 2-D long tensor; otherwise
        it is a plain list (nested for a batch).
        """
        if isinstance(text, (list, tuple)):
            encoded = [self._encode_one(t, add_special_tokens, truncation, max_length) for t in text]
            maxlen = max((len(x) for x in encoded), default=0)
            pad = self.pad_token_id if self.pad_token_id is not None else 0
            arr = [x + [pad] * (maxlen - len(x)) for x in encoded]
            if return_tensors == "pt":
                return _TokenOutput(input_ids=torch.tensor(arr, dtype=torch.long))
            return _TokenOutput(input_ids=arr)
        ids = self._encode_one(str(text), add_special_tokens, truncation, max_length)
        if return_tensors == "pt":
            return _TokenOutput(input_ids=torch.tensor([ids], dtype=torch.long))
        return _TokenOutput(input_ids=ids)

    def _encode_one(self, text, add_special_tokens=True, truncation=False, max_length=None):
        """Encode a single string, optionally truncating on ``truncation_side``."""
        enc = self._tok.encode(text, add_special_tokens=bool(add_special_tokens))
        ids = list(enc.ids)
        if truncation and max_length is not None and len(ids) > int(max_length):
            if self.truncation_side == "left":
                # Left truncation keeps the most recent tokens.
                ids = ids[-int(max_length):]
            else:
                ids = ids[:int(max_length)]
        return ids

    def decode(self, ids, skip_special_tokens=True, **kwargs):
        """Decode token ids (tensor or list; a nested list uses its first row)."""
        if torch.is_tensor(ids):
            ids = ids.detach().cpu().tolist()
        if ids and isinstance(ids[0], list):
            ids = ids[0]
        return self._tok.decode([int(x) for x in ids], skip_special_tokens=bool(skip_special_tokens))

    def apply_chat_template(self, messages, tokenize=False, add_generation_prompt=False, **kwargs):
        """Render *messages* in the ``<|im_start|>role ... <|im_end|>`` layout.

        The stored ``chat_template`` string is not evaluated; this fixed
        layout is always used. Returns the text, or its token ids when
        ``tokenize`` is true.
        """
        chunks = []
        for m in messages:
            role = str(m.get("role", "user"))
            content = str(m.get("content", ""))
            chunks.append(f"<|im_start|>{role}\n{content}<|im_end|>")
        if add_generation_prompt:
            chunks.append("<|im_start|>assistant\n")
        text = "\n".join(chunks)
        if tokenize:
            return self(text, add_special_tokens=False).input_ids
        return text
class _AutoTokenizerShim:
    """Stand-in for ``transformers.AutoTokenizer`` that works without it."""

    @staticmethod
    def from_pretrained(path, *args, **kwargs):
        """Load a tokenizer via transformers when imported, else locally."""
        if _HFAutoTokenizer is None:
            # The local fallback only receives the path; extra positional and
            # keyword arguments are not forwarded.
            return _LocalTokenizer(path)
        return _HFAutoTokenizer.from_pretrained(path, *args, **kwargs)
class _AutoConfigShim:
    """Stand-in for ``transformers.AutoConfig``; there is no local fallback."""

    @staticmethod
    def from_pretrained(path, *args, **kwargs):
        """Delegate to transformers' AutoConfig.

        Raises:
            RuntimeError: If AutoConfig could not be imported.
        """
        if _HFAutoConfig is None:
            raise RuntimeError(
                "AutoConfig requested, but transformers failed to import. "
                "Use --no-config-download / embedded model_config for LULUV2."
            )
        return _HFAutoConfig.from_pretrained(path, *args, **kwargs)
# Names used by the rest of this module in place of the transformers classes.
AutoTokenizer, AutoConfig = _AutoTokenizerShim, _AutoConfigShim

# Allow TF32 for matmul and cuDNN kernels.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
if hasattr(torch, "set_float32_matmul_precision"):
    torch.set_float32_matmul_precision("high")

# Enable the flash and memory-efficient attention kernels and turn the math
# kernel off. Best-effort: any failure here is ignored.
try:
    if torch.cuda.is_available():
        torch.backends.cuda.enable_flash_sdp(True)
        torch.backends.cuda.enable_mem_efficient_sdp(True)
        torch.backends.cuda.enable_math_sdp(False)
except Exception:
    pass
def parse_dtype(name: str):
    """Translate a dtype name such as ``"bf16"`` into a ``torch.dtype``.

    Matching ignores case and surrounding whitespace.

    Raises:
        ValueError: If the name is not a recognised alias.
    """
    key = str(name).strip().lower()
    aliases = {
        "bf16": torch.bfloat16,
        "bfloat16": torch.bfloat16,
        "fp16": torch.float16,
        "float16": torch.float16,
        "half": torch.float16,
        "fp32": torch.float32,
        "float32": torch.float32,
    }
    dtype = aliases.get(key)
    if dtype is None:
        raise ValueError(f"Unknown dtype: {key}")
    return dtype
def human_bytes(n: float) -> str:
    """Format a byte count with a binary unit (B..TB) and two decimals.

    Values of 1024 TB or more stay in TB.
    """
    units = ("B", "KB", "MB", "GB", "TB")
    value = float(n)
    for unit in units[:-1]:
        # Stop at the first unit in which the value drops below 1024.
        if not value >= 1024.0:
            return f"{value:.2f} {unit}"
        value /= 1024.0
    return f"{value:.2f} {units[-1]}"
def safe_torch_load(path: str, map_location="cpu"):
    """Load a checkpoint with ``torch.load``, across PyTorch versions.

    The checkpoint stores Python metadata alongside tensors, so
    ``weights_only=False`` is requested on purpose. If ``torch.load`` raises
    ``TypeError`` the call is retried without that keyword.

    NOTE: full unpickling can execute code embedded in the file; only load
    checkpoints from a trusted source.
    """
    common = {"map_location": map_location}
    try:
        return torch.load(path, weights_only=False, **common)
    except TypeError:
        return torch.load(path, **common)
def module_has_vwm(sd: Dict[str, torch.Tensor], prefix: str) -> bool:
    """Return True when *sd* holds all three factor tensors A, B and c for *prefix*."""
    return all(f"{prefix}.{factor}" in sd for factor in ("A", "B", "c"))
def linear_shape_from_state(sd: Dict[str, torch.Tensor], prefix: str) -> Tuple[int, int, bool]:
    """Infer ``(in_features, out_features, has_bias)`` for the layer at *prefix*.

    Factorized layers read the sizes from the first dimension of ``B`` (input)
    and ``A`` (output); dense layers read them from ``weight``.

    Raises:
        KeyError: If neither the factor tensors nor ``weight`` are present.
    """
    has_bias = f"{prefix}.bias" in sd
    if module_has_vwm(sd, prefix):
        n_out = int(sd[f"{prefix}.A"].shape[0])
        n_in = int(sd[f"{prefix}.B"].shape[0])
        return n_in, n_out, has_bias
    weight_key = f"{prefix}.weight"
    if weight_key not in sd:
        raise KeyError(f"Cannot infer Linear shape for {prefix}; missing {weight_key} and VWM A/B/c")
    # Dense weights are stored as (out_features, in_features).
    n_out, n_in = sd[weight_key].shape
    return int(n_in), int(n_out), has_bias
def make_linear_from_state(sd: Dict[str, torch.Tensor], prefix: str) -> nn.Module:
    """Create an uninitialised linear layer shaped like the tensors at *prefix*.

    Returns a ``VWMFactorizedLinear`` when the A/B/c factors are present and
    a plain ``nn.Linear`` otherwise. No weights are copied here.
    """
    n_in, n_out, with_bias = linear_shape_from_state(sd, prefix)
    if not module_has_vwm(sd, prefix):
        return nn.Linear(n_in, n_out, bias=with_bias)
    # The rank is the length of the per-component scale vector c.
    rank = int(sd[f"{prefix}.c"].shape[0])
    return VWMFactorizedLinear(n_in, n_out, rank, bias=with_bias, name=prefix)
def module_has_vwm_embedding(sd: Dict[str, torch.Tensor], prefix: str) -> bool:
    """Return True when *sd* holds factor tensors A, B and c for the embedding at *prefix*."""
    required = (f"{prefix}.A", f"{prefix}.B", f"{prefix}.c")
    return all(key in sd for key in required)
def embedding_shape_from_state(sd: Dict[str, torch.Tensor], prefix: str) -> Tuple[int, int]:
    """Infer ``(vocab_size, hidden_size)`` for the embedding at *prefix*.

    Raises:
        KeyError: If neither the factor tensors nor a dense ``weight`` exist.
    """
    if module_has_vwm_embedding(sd, prefix):
        vocab = int(sd[f"{prefix}.A"].shape[0])
        hidden = int(sd[f"{prefix}.B"].shape[0])
        return vocab, hidden
    weight_key = f"{prefix}.weight"
    if weight_key not in sd:
        raise KeyError(f"Cannot infer embedding shape for {prefix}; missing dense or VWM embedding tensors")
    dense = sd[weight_key]
    return int(dense.shape[0]), int(dense.shape[1])
def make_embedding_from_state(sd: Dict[str, torch.Tensor], prefix: str) -> nn.Module:
    """Create an embedding module shaped like the tensors at *prefix*.

    Returns a ``VWMFactorizedEmbedding`` when the A/B/c factors are present
    and a plain ``nn.Embedding`` otherwise. No weights are copied here.
    """
    vocab, hidden = embedding_shape_from_state(sd, prefix)
    if not module_has_vwm_embedding(sd, prefix):
        return nn.Embedding(vocab, hidden)
    rank = int(sd[f"{prefix}.c"].shape[0])
    return VWMFactorizedEmbedding(vocab, hidden, rank, name=prefix)
def expand_shared_banks_into_state(ckpt: Dict, sd: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
    """Expand shared-bank storage into ordinary per-module A/B/c entries.

    Each bank under ``ckpt["shared_banks"]`` supplies one ``A`` and one ``B``
    that are referenced (not copied) by every module listed in the bank; each
    module contributes its own ``c`` and, when present and not None, ``bias``.

    Returns:
        *sd* itself when the checkpoint has no banks, otherwise a shallow copy
        of *sd* with the expanded keys added.
    """
    shared = ckpt.get("shared_banks")
    if not shared:
        return sd
    expanded = dict(sd)
    module_count = 0
    for bank in shared.values():
        factor_a, factor_b = bank["A"], bank["B"]
        for prefix, entry in bank.get("modules", {}).items():
            expanded[f"{prefix}.A"] = factor_a
            expanded[f"{prefix}.B"] = factor_b
            expanded[f"{prefix}.c"] = entry["c"]
            if "bias" in entry and entry["bias"] is not None:
                expanded[f"{prefix}.bias"] = entry["bias"]
            module_count += 1
    print(f"[shared-bank] expanded {len(shared)} banks into {module_count} VWM modules")
    return expanded
+# -----------------------------
+# VWM linear used by the exported checkpoint
+# -----------------------------
class VWMFactorizedLinear(nn.Module):
    """Low-rank linear layer stored as three factors.

    W ~= A diag(c) B^T
    y = ((x @ B) * c) @ A^T + bias

    State names are ``A``, ``B``, ``c`` and, optionally, ``bias``. The
    factor tensors are allocated uninitialised and are expected to be filled
    by loading a checkpoint.
    """

    def __init__(self, in_features: int, out_features: int, rank: int, bias: bool = True, name: str = ""):
        super().__init__()
        self.in_features = int(in_features)
        self.out_features = int(out_features)
        self.rank = int(rank)
        self.name = name

        def frozen_empty(*shape):
            # Inference only: parameters never receive gradients.
            return nn.Parameter(torch.empty(*shape), requires_grad=False)

        self.A = frozen_empty(out_features, rank)
        self.B = frozen_empty(in_features, rank)
        self.c = frozen_empty(rank)
        self.bias = nn.Parameter(torch.zeros(out_features), requires_grad=False) if bias else None

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the factorized projection in the dtype of *x*."""
        dtype = x.dtype
        projected = x @ self.B.to(dtype=dtype)
        scaled = projected * self.c.to(dtype=dtype)
        out = scaled @ self.A.to(dtype=dtype).t()
        if self.bias is None:
            return out
        return out + self.bias.to(dtype=dtype)
class VWMFactorizedEmbedding(nn.Module):
    """Embedding table stored as three factors: E ~= A diag(c) B^T."""

    def __init__(self, num_embeddings: int, embedding_dim: int, rank: int, name: str = "model.embed_tokens"):
        super().__init__()
        self.num_embeddings = int(num_embeddings)
        self.embedding_dim = int(embedding_dim)
        self.rank = int(rank)
        self.name = name

        def frozen_empty(*shape):
            # Allocated uninitialised; values come from the checkpoint.
            return nn.Parameter(torch.empty(*shape), requires_grad=False)

        self.A = frozen_empty(num_embeddings, rank)
        self.B = frozen_empty(embedding_dim, rank)
        self.c = frozen_empty(rank)

    @property
    def weight(self):
        """Dense ``(num_embeddings, embedding_dim)`` table, built on each access.

        Provided for compatibility/debugging; ``forward`` does not use it.
        """
        scaled_rows = self.A * self.c.unsqueeze(0)
        return torch.matmul(scaled_rows, self.B.t())

    def forward(self, input_ids: torch.LongTensor) -> torch.Tensor:
        """Look up rows of A, scale them by c, then project through B^T."""
        rows = F.embedding(input_ids, self.A)
        dtype = rows.dtype
        scaled = rows * self.c.to(dtype=dtype)
        return scaled @ self.B.to(dtype=dtype).t()
class TiedEmbeddingLMHead(nn.Module):
    """Output projection that reuses the input embedding, dense or factorized."""

    def __init__(self, embedding_module: nn.Module):
        super().__init__()
        self.embedding_module = embedding_module

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Return logits ``hidden_states @ E^T`` in the dtype of *hidden_states*."""
        emb = self.embedding_module
        dtype = hidden_states.dtype
        if not isinstance(emb, VWMFactorizedEmbedding):
            return F.linear(hidden_states, emb.weight.to(dtype=dtype))
        # logits = h @ E.T = ((h @ B) * c) @ A.T, without materialising E.
        low_rank = torch.matmul(hidden_states, emb.B.to(dtype=dtype))
        low_rank = low_rank * emb.c.to(dtype=dtype)
        return torch.matmul(low_rank, emb.A.to(dtype=dtype).transpose(0, 1))
+# -----------------------------
+# LULU2 decoder architecture
+# -----------------------------
class LuluRMSNorm(nn.Module):
    """Root-mean-square normalisation over the last dimension with a learned scale."""

    def __init__(self, hidden_size: int, eps: float = 1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size), requires_grad=False)
        self.variance_epsilon = float(eps)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Normalise in float32, then cast back to the input dtype and scale."""
        out_dtype = hidden_states.dtype
        as_f32 = hidden_states.float()
        mean_square = as_f32.pow(2).mean(-1, keepdim=True)
        normed = as_f32 * torch.rsqrt(mean_square + self.variance_epsilon)
        return self.weight.to(dtype=out_dtype) * normed.to(out_dtype)
class LuluRotaryEmbedding(nn.Module):
    """Produces the cos/sin tables for rotary position embeddings."""

    def __init__(self, dim: int, max_position_embeddings: int = 32768, base: float = 1000000.0):
        super().__init__()
        self.dim = int(dim)
        self.max_position_embeddings = int(max_position_embeddings)
        self.base = float(base)
        exponents = torch.arange(0, self.dim, 2).float() / self.dim
        # Non-persistent: rebuilt from the config, never stored in checkpoints.
        self.register_buffer("inv_freq", 1.0 / (self.base ** exponents), persistent=False)

    @torch.no_grad()
    def forward(self, x: torch.Tensor, position_ids: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return ``(cos, sin)`` for *position_ids*.

        *x* only supplies the target device and dtype. ``position_ids`` is
        ``[B, T]``; each output is ``[B, T, D]`` with the frequencies
        repeated twice along the last axis.
        """
        inv_freq = self.inv_freq.to(device=x.device).float()
        angles = torch.einsum("bt,d->btd", position_ids.float(), inv_freq)
        table = torch.cat((angles, angles), dim=-1)
        return table.cos().to(dtype=x.dtype), table.sin().to(dtype=x.dtype)
def rotate_half(x: torch.Tensor) -> torch.Tensor:
    """Swap the two halves of the last dimension, negating the second half."""
    half = x.shape[-1] // 2
    front, back = x[..., :half], x[..., half:]
    return torch.cat((-back, front), dim=-1)
def apply_rotary_pos_emb(q: torch.Tensor, k: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
    """Rotate queries and keys by the given position tables.

    q/k are ``[B, H, T, D]`` and cos/sin are ``[B, T, D]``; the tables gain a
    head axis so they broadcast over H.
    """
    cos_b, sin_b = cos.unsqueeze(1), sin.unsqueeze(1)

    def rotated(states: torch.Tensor) -> torch.Tensor:
        return (states * cos_b) + (rotate_half(states) * sin_b)

    return rotated(q), rotated(k)
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """Repeat each key/value head *n_rep* times along the head axis.

    Input is ``[batch, kv_heads, seq, head_dim]``; output has
    ``kv_heads * n_rep`` heads with the copies of a head adjacent. The input
    is returned as-is when ``n_rep == 1``.
    """
    batch, kv_heads, seq_len, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    expanded = hidden_states.unsqueeze(2).expand(batch, kv_heads, n_rep, seq_len, head_dim)
    return expanded.reshape(batch, kv_heads * n_rep, seq_len, head_dim)
class LuluVWMMLP(nn.Module):
    """Gated feed-forward block: ``down(silu(gate(x)) * up(x))``.

    Each projection is dense or factorized depending on what the state dict
    holds for this layer. *cfg* is accepted for a uniform constructor
    signature and is not read.
    """

    def __init__(self, cfg, sd: Dict[str, torch.Tensor], layer_idx: int):
        super().__init__()
        base = f"model.layers.{layer_idx}.mlp"
        self.gate_proj = make_linear_from_state(sd, f"{base}.gate_proj")
        self.up_proj = make_linear_from_state(sd, f"{base}.up_proj")
        self.down_proj = make_linear_from_state(sd, f"{base}.down_proj")

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        gated = F.silu(self.gate_proj(x))
        return self.down_proj(gated * self.up_proj(x))
class LuluVWMAttention(nn.Module):
    """Causal self-attention with rotary positions and grouped key/value heads.

    The q/k/v/o projections are dense or factorized depending on what the
    state dict holds for this layer. There is no key/value cache: every call
    attends over the whole sequence it is given.
    """

    def __init__(self, cfg, sd: Dict[str, torch.Tensor], layer_idx: int):
        super().__init__()
        self.layer_idx = int(layer_idx)
        self.hidden_size = int(cfg.hidden_size)
        self.num_heads = int(cfg.num_attention_heads)
        self.num_key_value_heads = int(getattr(cfg, "num_key_value_heads", self.num_heads))
        # Some configs carry head_dim=None; treat that like a missing attribute.
        configured_head_dim = getattr(cfg, "head_dim", None)
        self.head_dim = int(configured_head_dim or self.hidden_size // self.num_heads)
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.scaling = self.head_dim ** -0.5
        # Kept for config parity; forward always runs with dropout_p=0.0.
        self.attention_dropout = float(getattr(cfg, "attention_dropout", 0.0))
        p = f"model.layers.{layer_idx}.self_attn"
        self.q_proj = make_linear_from_state(sd, f"{p}.q_proj")
        self.k_proj = make_linear_from_state(sd, f"{p}.k_proj")
        self.v_proj = make_linear_from_state(sd, f"{p}.v_proj")
        self.o_proj = make_linear_from_state(sd, f"{p}.o_proj")
        rope_theta = float(getattr(cfg, "rope_theta", 1000000.0))
        max_pos = int(getattr(cfg, "max_position_embeddings", 32768))
        self.rotary_emb = LuluRotaryEmbedding(self.head_dim, max_position_embeddings=max_pos, base=rope_theta)

    def forward(self, hidden_states: torch.Tensor, position_ids: torch.Tensor) -> torch.Tensor:
        """Attend causally over *hidden_states* (``[B, T, hidden]``).

        Args:
            hidden_states: Layer input, ``[B, T, hidden_size]``.
            position_ids: Token positions, ``[B, T]``.

        Returns:
            The output of ``o_proj`` applied to the merged attention heads.
        """
        bsz, q_len, _ = hidden_states.size()
        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)
        # Split the projections into heads: [B, T, H*D] -> [B, H, T, D].
        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        # value_states only supplies the device/dtype for the cos/sin tables.
        cos, sin = self.rotary_emb(value_states, position_ids)
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
        # Duplicate key/value heads so every query head has a partner.
        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)
        # Full forward is causal. This generation script recomputes the full prefix each token.
        attn_output = F.scaled_dot_product_attention(
            query_states,
            key_states,
            value_states,
            attn_mask=None,
            dropout_p=0.0,
            is_causal=True,
            scale=self.scaling,
        )
        # Merge heads back to [B, T, H*D]. The width is num_heads * head_dim,
        # which only equals hidden_size when the config does not override
        # head_dim, so it must not be hard-coded to hidden_size.
        attn_output = attn_output.transpose(1, 2).contiguous().reshape(bsz, q_len, self.num_heads * self.head_dim)
        return self.o_proj(attn_output)
class LuluVWMDecoderLayer(nn.Module):
    """One pre-norm decoder layer: attention and MLP, each with a residual add."""

    def __init__(self, cfg, sd: Dict[str, torch.Tensor], layer_idx: int):
        super().__init__()
        norm_eps = getattr(cfg, "rms_norm_eps", 1e-6)
        self.self_attn = LuluVWMAttention(cfg, sd, layer_idx)
        self.mlp = LuluVWMMLP(cfg, sd, layer_idx)
        self.input_layernorm = LuluRMSNorm(cfg.hidden_size, eps=norm_eps)
        self.post_attention_layernorm = LuluRMSNorm(cfg.hidden_size, eps=norm_eps)

    def forward(self, hidden_states: torch.Tensor, position_ids: torch.Tensor) -> torch.Tensor:
        """Run the attention sub-block, then the MLP sub-block."""
        attn_out = self.self_attn(self.input_layernorm(hidden_states), position_ids=position_ids)
        after_attn = hidden_states + attn_out
        mlp_out = self.mlp(self.post_attention_layernorm(after_attn))
        return after_attn + mlp_out
class LuluVWMModel(nn.Module):
    """Decoder stack: token embedding, the decoder layers, and a final norm."""

    def __init__(self, cfg, sd: Dict[str, torch.Tensor]):
        super().__init__()
        self.config = cfg
        embed_prefix = "model.embed_tokens"
        _, hidden_size = embedding_shape_from_state(sd, embed_prefix)
        self.embed_tokens = make_embedding_from_state(sd, embed_prefix)
        layer_count = int(cfg.num_hidden_layers)
        self.layers = nn.ModuleList([LuluVWMDecoderLayer(cfg, sd, idx) for idx in range(layer_count)])
        self.norm = LuluRMSNorm(hidden_size, eps=getattr(cfg, "rms_norm_eps", 1e-6))

    def forward(self, input_ids: torch.LongTensor, position_ids: Optional[torch.LongTensor] = None) -> torch.Tensor:
        """Return the final normalised hidden states for *input_ids* (``[B, T]``).

        When *position_ids* is omitted, positions ``0..T-1`` are used for
        every row of the batch.
        """
        bsz, seq_len = input_ids.shape
        if position_ids is None:
            positions = torch.arange(seq_len, device=input_ids.device, dtype=torch.long)
            position_ids = positions.unsqueeze(0).expand(bsz, -1)
        hidden_states = self.embed_tokens(input_ids)
        for decoder_layer in self.layers:
            hidden_states = decoder_layer(hidden_states, position_ids=position_ids)
        return self.norm(hidden_states)
class LuluVWMForCausalLM(nn.Module):
    """Decoder stack plus a language-model head that produces logits.

    The head is built from ``lm_head`` tensors when the state dict has them
    (dense or factorized); otherwise it is tied to the input embedding and
    ``tie_word_embeddings`` is forced to True.
    """

    def __init__(self, cfg, sd: Dict[str, torch.Tensor]):
        super().__init__()
        self.config = cfg
        self.model = LuluVWMModel(cfg, sd)
        self.tie_word_embeddings = bool(getattr(cfg, "tie_word_embeddings", False))
        has_own_head = module_has_vwm(sd, "lm_head") or "lm_head.weight" in sd
        if has_own_head:
            self.lm_head = make_linear_from_state(sd, "lm_head")
        else:
            self.tie_word_embeddings = True
            self.lm_head = TiedEmbeddingLMHead(self.model.embed_tokens)

    def forward(self, input_ids: torch.LongTensor, position_ids: Optional[torch.LongTensor] = None):
        """Return a namespace whose ``logits`` attribute holds the head output."""
        features = self.model(input_ids=input_ids, position_ids=position_ids)
        return SimpleNamespace(logits=self.lm_head(features))
+# -----------------------------
+# config loading / inference
+# -----------------------------
def infer_minimal_config_from_state(sd: Dict[str, torch.Tensor], model_id: str = "") -> SimpleNamespace:
    """Build a best-effort model config from tensor shapes alone.

    Used when the checkpoint embeds no ``model_config``. Sizes that can be
    read from the state dict are read from it; everything else falls back to
    the LULU2 defaults hard-coded below.

    Args:
        sd: Model state dict with dense or factorized (A/B/c) tensors.
        model_id: Identifier copied into the returned config.

    Returns:
        A ``SimpleNamespace`` with the fields the decoder classes read.

    Raises:
        ValueError: If the embedding tensors are missing.
    """
    if "model.embed_tokens.weight" in sd:
        hidden_size = int(sd["model.embed_tokens.weight"].shape[1])
        vocab_size = int(sd["model.embed_tokens.weight"].shape[0])
    elif module_has_vwm_embedding(sd, "model.embed_tokens"):
        vocab_size = int(sd["model.embed_tokens.A"].shape[0])
        hidden_size = int(sd["model.embed_tokens.B"].shape[0])
    else:
        raise ValueError("Checkpoint is missing model.embed_tokens dense or VWM tensors. Use a full standalone checkpoint, not a delta checkpoint.")

    # Layer count = highest "model.layers.<i>." index + 1. Keys whose third
    # component is missing or not an integer are ignored.
    layer_ids = set()
    for key in sd:
        if not key.startswith("model.layers."):
            continue
        try:
            layer_ids.add(int(key.split(".")[2]))
        except (IndexError, ValueError):
            continue
    num_hidden_layers = max(layer_ids) + 1 if layer_ids else 0

    # The MLP width is the output size of layer 0's gate projection: dense
    # weight rows, or rows of factor A when the projection is factorized.
    gate = "model.layers.0.mlp.gate_proj"
    if f"{gate}.weight" in sd:
        intermediate_size = int(sd[f"{gate}.weight"].shape[0])
    elif all(f"{gate}.{factor}" in sd for factor in ("A", "B", "c")):
        intermediate_size = int(sd[f"{gate}.A"].shape[0])
    else:
        intermediate_size = 4864

    # Best known defaults for LULU2. If you export
    # model_config into the checkpoint, these assumptions are not used.
    num_attention_heads = 14
    num_key_value_heads = 2
    head_dim = hidden_size // num_attention_heads
    if head_dim * num_attention_heads != hidden_size:
        # Fallback if a different decoder variant is used and no config is present.
        # This requires explicit command-line override in practice.
        num_attention_heads = 1
        num_key_value_heads = 1
        head_dim = hidden_size
    return SimpleNamespace(
        model_type="luluv2",
        model_id=model_id,
        vocab_size=vocab_size,
        hidden_size=hidden_size,
        intermediate_size=intermediate_size,
        num_hidden_layers=num_hidden_layers,
        num_attention_heads=num_attention_heads,
        num_key_value_heads=num_key_value_heads,
        head_dim=head_dim,
        rms_norm_eps=1e-6,
        rope_theta=1000000.0,
        max_position_embeddings=32768,
        attention_dropout=0.0,
        tie_word_embeddings=False,
    )
def namespace_from_dict(d: Dict) -> SimpleNamespace:
    """Expose the entries of *d* as attributes of a new ``SimpleNamespace``."""
    namespace = SimpleNamespace(**d)
    return namespace
def load_runtime_config(ckpt: Dict, sd: Dict[str, torch.Tensor], args) -> SimpleNamespace:
    """Pick the model config, preferring the one embedded in the checkpoint.

    Order of precedence:
      1. ``ckpt["model_config"]`` when it is a dict;
      2. shape-inferred LULU2 defaults when ``args.no_config_download`` is set;
      3. ``AutoConfig.from_pretrained`` on the resolved model id.

    For the first two sources, a checkpoint-level ``tie_word_embeddings`` of
    exactly True overrides the config value.
    """
    force_tied = ckpt.get("tie_word_embeddings") is True

    embedded = ckpt.get("model_config")
    if isinstance(embedded, dict):
        print("[config] using model_config embedded in checkpoint")
        fields = dict(embedded)
        if force_tied:
            fields["tie_word_embeddings"] = True
        return namespace_from_dict(fields)

    model_id = args.model_id or ckpt.get("model_id") or ckpt.get("args", {}).get("model_id") or "LULU2"
    if not args.no_config_download:
        print(f"[config] loading config metadata only from {model_id}; no model weights are loaded")
        return AutoConfig.from_pretrained(model_id, trust_remote_code=True)

    print("[config] no embedded config and --no-config-download set; using LULU2 defaults")
    cfg = infer_minimal_config_from_state(sd, model_id=model_id)
    if force_tied:
        cfg.tie_word_embeddings = True
    return cfg
+# -----------------------------
+# generation
+# -----------------------------
def build_chat_prompt(tokenizer, user_prompt: str, system_prompt: str = "You are a helpful assistant. Answer directly and naturally.") -> str:
    """Render a single-turn chat prompt that ends ready for the assistant reply.

    The system message is omitted when *system_prompt* is empty. If the
    tokenizer's ``apply_chat_template`` fails for any reason, a plain
    ``system/user/assistant`` text layout is returned instead.
    """
    system_turn = [{"role": "system", "content": system_prompt}] if system_prompt else []
    messages = system_turn + [{"role": "user", "content": user_prompt}]
    try:
        return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    except Exception:
        # Best-effort fallback for tokenizers without a usable chat template.
        return f"system\n{system_prompt}\nuser\n{user_prompt}\nassistant\n"
@torch.no_grad()
def sample_next(logits: torch.Tensor, temperature: float = 0.0, top_k: int = 0, top_p: float = 1.0) -> torch.Tensor:
    """Choose the next token id from a row (or batch of rows) of logits.

    Args:
        logits: Scores over the vocabulary on the last axis; ``[vocab]`` or
            ``[batch, vocab]``.
        temperature: ``<= 0`` selects the arg-max; otherwise logits are
            divided by it before sampling.
        top_k: When positive, keep only the k highest-scoring tokens (ties at
            the threshold are kept too).
        top_p: When below 1.0, keep the smallest set of top tokens whose
            cumulative probability exceeds ``top_p``; the best token is
            always kept.

    Returns:
        The chosen ids with a trailing axis of size one.
    """
    if temperature <= 0.0:
        return torch.argmax(logits, dim=-1, keepdim=True)
    logits = logits / max(temperature, 1e-6)
    if top_k and top_k > 0:
        k = min(int(top_k), logits.size(-1))
        # The k-th largest value is the cut-off; everything below is masked.
        thresh = torch.topk(logits, k, dim=-1).values[..., -1, None]
        logits = torch.where(logits >= thresh, logits, torch.full_like(logits, -float("inf")))
    if top_p < 1.0:
        sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1)
        probs = torch.softmax(sorted_logits, dim=-1)
        cumulative_probs = torch.cumsum(probs, dim=-1)
        sorted_indices_to_remove = cumulative_probs > top_p
        # Shift right by one so the token that crosses the threshold is kept.
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., 0] = False
        sorted_logits = sorted_logits.masked_fill(sorted_indices_to_remove, -float("inf"))
        # Scatter along the last axis (not a fixed axis 1) so un-batched 1-D
        # logits work the same way as batched ones.
        logits = torch.full_like(logits, -float("inf")).scatter(-1, sorted_indices, sorted_logits)
    probs = torch.softmax(logits, dim=-1)
    return torch.multinomial(probs, num_samples=1)
@torch.no_grad()
def generate_text(model, tokenizer, prompt: str, device, max_new_tokens: int = 120, temperature: float = 0.0, top_k: int = 0, top_p: float = 1.0, max_context: int = 2048) -> Tuple[str, float]:
    """Generate a continuation of *prompt* one token at a time.

    There is no key/value cache: each step runs the model over the last
    ``max_context`` tokens again. Generation stops after ``max_new_tokens``
    steps or as soon as the tokenizer's EOS id is produced.

    Returns:
        ``(text, tokens_per_second)`` where *text* is the decoded prompt plus
        continuation with special tokens skipped. The token count used for
        the rate is at least one.
    """
    model.eval()
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
    eos_id = tokenizer.eos_token_id
    started = time.time()
    prompt_len = int(input_ids.shape[1])
    for _ in range(max_new_tokens):
        window = input_ids[:, -max_context:]
        step_logits = model(window).logits[:, -1, :].float()
        next_id = sample_next(step_logits, temperature=temperature, top_k=top_k, top_p=top_p)
        input_ids = torch.cat([input_ids, next_id.to(input_ids.device)], dim=-1)
        # .item() assumes a single sequence in the batch.
        if eos_id is not None and int(next_id.item()) == int(eos_id):
            break
    elapsed = time.time() - started
    generated = max(1, int(input_ids.shape[1]) - prompt_len)
    text = tokenizer.decode(input_ids[0], skip_special_tokens=True)
    return text, generated / max(elapsed, 1e-9)
def load_tokenizer(args, ckpt):
    """Resolve the tokenizer location and load it.

    The first non-empty value wins: ``args.tokenizer``, the checkpoint's
    ``tokenizer_dir``, the checkpoint's ``model_id`` (top level, then under
    ``args``), and finally ``args.model_id``. A relative value that names an
    existing directory next to the checkpoint is resolved to that directory
    so no hub lookup is needed. The pad token defaults to the EOS token when
    the tokenizer has none.

    Raises:
        ValueError: If no tokenizer location can be determined.
    """
    tok_path = (
        args.tokenizer
        or ckpt.get("tokenizer_dir")
        or ckpt.get("model_id")
        or ckpt.get("args", {}).get("model_id")
        or args.model_id
    )
    if not tok_path:
        raise ValueError("Tokenizer path/name is required. Pass --tokenizer <local-dir-or-model-id>.")

    ckpt_dir = os.path.dirname(os.path.abspath(args.checkpoint))
    if not os.path.isabs(tok_path):
        beside_ckpt = os.path.join(ckpt_dir, tok_path)
        if os.path.isdir(beside_ckpt):
            tok_path = beside_ckpt
    print(f"[tokenizer] {tok_path}")

    tok = AutoTokenizer.from_pretrained(tok_path, trust_remote_code=True, local_files_only=bool(args.local_files_only))
    missing_pad = tok.pad_token_id is None
    if missing_pad and tok.eos_token_id is not None:
        tok.pad_token = tok.eos_token
    return tok
+# -----------------------------
+# Public model aliases used by the UI/runtime.
+# Each Lulu2* name below is bound to the very same object as its Lulu*
+# counterpart (plain assignment, not a subclass), so either spelling can be
+# used interchangeably by callers.
+# -----------------------------
+Lulu2RMSNorm = LuluRMSNorm
+Lulu2RotaryEmbedding = LuluRotaryEmbedding
+Lulu2VWMMLP = LuluVWMMLP
+Lulu2VWMAttention = LuluVWMAttention
+Lulu2VWMDecoderLayer = LuluVWMDecoderLayer
+Lulu2VWMModel = LuluVWMModel
+Lulu2ForCausalLM = LuluVWMForCausalLM
class Pass2RefinementAdapter(nn.Module):
    """Low-rank, scalar-gated correction driven by the pass-1 layer state.

    The current hidden state and the conditioning state are normalised
    separately, concatenated, projected down to ``rank`` features, passed
    through SiLU and projected back up. The result is scaled by
    ``sigmoid(gate)``.
    """

    def __init__(self, hidden_size: int, rank: int, gate_init: float = -5.0):
        super().__init__()
        self.hidden_size = int(hidden_size)
        self.rank = int(rank)

        # Sub-modules are created in this exact order so parameter ordering
        # and random-initialisation draws stay unchanged.
        self.x_norm = LuluRMSNorm(hidden_size)
        self.cond_norm = LuluRMSNorm(hidden_size)
        self.down = nn.Linear(2 * hidden_size, rank, bias=False)
        self.up = nn.Linear(rank, hidden_size, bias=False)
        self.gate = nn.Parameter(torch.tensor(float(gate_init)))

        down_std = 0.02 / math.sqrt(max(1, hidden_size))
        nn.init.normal_(self.down.weight, mean=0.0, std=down_std)
        # A zeroed up-projection makes the adapter output exactly zero at
        # initialisation, so the two-pass model starts exactly as pass 1.
        nn.init.zeros_(self.up.weight)

    def forward(self, x: torch.Tensor, cond: torch.Tensor) -> torch.Tensor:
        """Return the gated refinement for ``x`` conditioned on ``cond``."""
        features = torch.cat((self.x_norm(x), self.cond_norm(cond)), dim=-1)
        refined = self.up(F.silu(self.down(features)))
        openness = torch.sigmoid(self.gate).to(dtype=x.dtype)
        return openness * refined
+@dataclass
+class Pass2Config:
+    """Hyper-parameters for the second-pass refinement wrapper."""
+    # Bottleneck width handed to every Pass2RefinementAdapter as ``rank``.
+    adapter_rank: int = 64
+    # Initial (pre-sigmoid) value of each adapter's scalar gate.
+    adapter_gate_init: float = -5.0
+    # Initial (pre-sigmoid) value of each per-layer gate in the wrapper.
+    layer_gate_init: float = -5.0
+    # Multiplier applied to the random initial pass embeddings; 0.0 yields zeros.
+    pass_embed_scale: float = 0.0
+    # Not read by any code visible in this chunk.
+    # NOTE(review): confirm where (or whether) this mode string is consumed.
+    mode: str = "refine_pass1_residual"
class Lulu2TwoPassForCausalLM(nn.Module):
    """
    Wraps a loaded LULU2 base model.
    Pass 1: normal LULU2 decoder forward, producing the pass-1 residual stream.
    Pass 2: starts from pass-1 residual stream and adds small gated refinements.
    h2_i = h2_i + sigmoid(layer_gate_i) * (BaseLayer_i(h2_i) - h2_i)
                 + Adapter_i(h2_i, pass1_state_i)
    With zero-initialized adapter up-projections and negative gates, the model
    starts extremely close to the loaded LULU2 checkpoint and learns refinements.
    """

    def __init__(self, base: Lulu2ForCausalLM, cfg: Pass2Config):
        """Attach pass embeddings, per-layer gates and adapters to ``base``.

        Args:
            base: Loaded base model; its ``config.hidden_size`` and
                ``config.num_hidden_layers`` size the new parameters.
            cfg: Second-pass hyper-parameters.
        """
        super().__init__()
        self.base = base
        self.pass2_config = cfg
        hidden = int(base.config.hidden_size)
        n_layers = int(base.config.num_hidden_layers)
        # Row 0 is added to the token embeddings in pass 1, row 1 to the
        # residual stream at the start of pass 2.
        self.pass_embed = nn.Parameter(torch.randn(2, hidden) * float(cfg.pass_embed_scale))
        self.layer_gates = nn.Parameter(torch.full((n_layers,), float(cfg.layer_gate_init)))
        self.adapters = nn.ModuleList([
            Pass2RefinementAdapter(hidden, int(cfg.adapter_rank), gate_init=float(cfg.adapter_gate_init))
            for _ in range(n_layers)
        ])

    def _position_ids(self, input_ids: torch.LongTensor, position_ids: Optional[torch.LongTensor] = None):
        """Return ``position_ids`` unchanged, or 0..seq_len-1 per batch row."""
        if position_ids is not None:
            return position_ids
        bsz, seq_len = input_ids.shape
        return torch.arange(seq_len, device=input_ids.device, dtype=torch.long).unsqueeze(0).expand(bsz, -1)

    def forward_pass1_features(self, input_ids: torch.LongTensor, position_ids: Optional[torch.LongTensor] = None):
        """Run the base decoder stack once.

        Returns:
            ``(h, layer_states, position_ids)``: the un-normalised output of
            the last layer, the output of every layer in order, and the
            position ids that were used.
        """
        position_ids = self._position_ids(input_ids, position_ids)
        h = self.base.model.embed_tokens(input_ids)
        h = h + self.pass_embed[0].to(dtype=h.dtype).view(1, 1, -1)
        layer_states = []
        for layer in self.base.model.layers:
            h = layer(h, position_ids=position_ids)
            layer_states.append(h)
        return h, layer_states, position_ids

    def forward(self, input_ids: torch.LongTensor, position_ids: Optional[torch.LongTensor] = None, return_pass1_logits: bool = False):
        """Compute refined (pass-2) logits, optionally with pass-1 logits.

        Args:
            input_ids: Token ids, indexed as ``(batch, seq)``.
            position_ids: Optional explicit positions; generated when omitted.
            return_pass1_logits: Also return the logits of the unrefined
                pass-1 stream (computed without gradient tracking).

        Returns:
            ``SimpleNamespace(logits=..., pass1_logits=...)`` where
            ``pass1_logits`` is ``None`` unless requested.
        """
        h1_resid, pass1_states, position_ids = self.forward_pass1_features(input_ids, position_ids=position_ids)

        # Pass 2 refines pass 1; it does not discard pass 1.
        h2 = h1_resid + self.pass_embed[1].to(dtype=h1_resid.dtype).view(1, 1, -1)
        for i, layer in enumerate(self.base.model.layers):
            before = h2
            layer_out = layer(h2, position_ids=position_ids)
            layer_delta = layer_out - before
            layer_gate = torch.sigmoid(self.layer_gates[i]).to(dtype=h2.dtype)
            adapter_delta = self.adapters[i](h2, pass1_states[i])
            h2 = before + layer_gate * layer_delta + adapter_delta
        h2 = self.base.model.norm(h2)
        logits2 = self.base.lm_head(h2)

        logits1 = None
        if return_pass1_logits:
            # The pass-1 final norm is only needed for these diagnostic
            # logits, so it is computed here rather than on every forward.
            with torch.no_grad():
                logits1 = self.base.lm_head(self.base.model.norm(h1_resid))
        return SimpleNamespace(logits=logits2, pass1_logits=logits1)
@torch.no_grad()
def load_lulu2_base(args, device, dtype):
    """Build the base model from ``args.checkpoint`` and move it to *device*.

    The checkpoint's ``"model"`` state dict is expanded with
    ``expand_shared_banks_into_state``, the runtime config is derived with
    ``load_runtime_config``, and the weights are loaded non-strictly; counts
    (and the first ten names) of missing/unexpected keys are printed.

    Returns:
        ``(ckpt, base)``: the raw checkpoint mapping and the model cast to
        ``dtype`` on ``device``.

    Raises:
        ValueError: If the checkpoint has no ``"model"`` entry.
    """
    print("[guard] LULUV2 VWM runtime: no AutoModelForCausalLM.from_pretrained call and no external-model weights loaded.")
    size_text = human_bytes(os.path.getsize(args.checkpoint))
    print(f"[load] {args.checkpoint} ({size_text})")

    ckpt = safe_torch_load(args.checkpoint, map_location="cpu")
    if "model" not in ckpt:
        raise ValueError("Checkpoint missing model state dict")

    state = expand_shared_banks_into_state(ckpt, ckpt["model"])
    cfg = load_runtime_config(ckpt, state, args)
    print(f"[config] hidden={cfg.hidden_size} layers={cfg.num_hidden_layers}")

    base = Lulu2ForCausalLM(cfg, state)
    missing, unexpected = base.load_state_dict(state, strict=False)
    print(f"[state:base] missing={len(missing)} unexpected={len(unexpected)}")
    for label, keys in (("missing", missing), ("unexpected", unexpected)):
        if keys:
            print(f"[state:base] first {label}:", keys[:10])

    base.to(device=device, dtype=dtype)
    return ckpt, base

luluv2_live_inference.py ADDED Viewed

	@@ -0,0 +1,698 @@

+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+LULUV2 live local inference engine.
+This is the runtime bridge for the LULUV2 fine-tuned checkpoint:
+    LULU2_instruct_ddp.pt / LULU2_base_ddp.pt / LULU2.pt
+It imports the actual LULUV2 architecture file, loads the checkpoint,
+restores pass2_state when present, uses the local tokenizer folder, and streams
+tokens with live metrics.
+No AutoModelForCausalLM.from_pretrained call is used here.
+No external model weights are loaded.
+"""
+from __future__ import annotations
+import importlib.util
+import json
+import math
+import os
+import platform
+import time
+from contextlib import nullcontext
+from dataclasses import dataclass, asdict
+from pathlib import Path
+from types import SimpleNamespace
+from typing import Any, Dict, Generator, List, Optional, Tuple
+import torch
+import torch.nn.functional as F
+try:
+    import psutil
+except Exception:
+    psutil = None
+try:
+    import pynvml
+except Exception:
+    pynvml = None
+# Turn-control markers and role labels. clean_text() cuts generated text at
+# the first of these found after position 0 and strips any that remain;
+# LULUV2LiveEngine.generate() stops decoding once one shows up in the
+# decoded continuation.
+STOP_STRINGS = [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|user|>",
+    "<|system|>",
+    "<|assistant|>",
+    "User:",
+    "Assistant:",
+    "\nuser:",
+    "\nassistant:",
+]
+@dataclass
+class GenerationConfig:
+    """Sampling and streaming options consumed by LULUV2LiveEngine."""
+    # Maximum number of decode steps per generate() call.
+    max_new_tokens: int = 512
+    # Logits are divided by this value; <= 0 selects greedy argmax decoding.
+    temperature: float = 0.65
+    # Keep only the k highest logits when > 0.
+    top_k: int = 40
+    # Nucleus sampling threshold; applied only when strictly between 0 and 1.
+    top_p: float = 0.90
+    # Drop tokens whose probability is below min_p * (max probability);
+    # applied only when strictly between 0 and 1.
+    min_p: float = 0.03
+    # For already generated tokens, positive logits are divided and other
+    # logits multiplied by this factor; 1.0 disables it.
+    repetition_penalty: float = 1.10
+    # Subtracted from a token's logit once per previous occurrence.
+    frequency_penalty: float = 0.02
+    # Force argmax decoding regardless of temperature.
+    greedy: bool = False
+    # Ban tokens that would repeat an n-gram of this size already present in
+    # the generated tokens; active when > 1.
+    no_repeat_ngram: int = 4
+    # Yield partial text every N decode steps (the first step always yields).
+    stream_every: int = 1
+    # Prompt truncation length and size of the context window fed to the model.
+    max_context_tokens: int = 4096
+    # Compute pass-1 vs pass-2 divergence metrics on the prompt before decoding.
+    return_pass_metrics: bool = True
+@dataclass
+class GenerationStats:
+    """Metrics describing the most recent (or in-flight) generation."""
+    # Number of prompt tokens after encoding.
+    prompt_tokens: int = 0
+    # Number of tokens appended to the prompt so far.
+    generated_tokens: int = 0
+    # Wall-clock seconds since generate() started timing.
+    elapsed_sec: float = 0.0
+    # generated_tokens divided by elapsed_sec.
+    tokens_per_sec: float = 0.0
+    # Decoded text, id, probability and distribution entropy of the latest token.
+    last_token: str = ""
+    last_token_id: int = -1
+    last_token_prob: float = 0.0
+    last_entropy: float = 0.0
+    # "none" before any run; generate() sets "streaming" while running and
+    # "length", "eos" or "stop_string" when it finishes.
+    finish_reason: str = "none"
+    # Pass-1 vs pass-2 metrics on the prompt's last position; None when the
+    # model has no second pass or the metrics were not computed.
+    pass1_pass2_kl: Optional[float] = None
+    pass1_pass2_logit_cosine: Optional[float] = None
def setup_torch():
    """Apply best-effort global PyTorch performance settings.

    With CUDA available, TF32 is allowed for matmul and cuDNN, and the
    flash / memory-efficient scaled-dot-product kernels are enabled while the
    math kernel is disabled. Independently of CUDA, float32 matmul precision
    is set to ``"high"`` when the running torch exposes that setter. Every
    step is optional: a failure in one group is ignored.
    """

    def _quietly(step):
        # Each group of settings is optional; older builds may lack them.
        try:
            step()
        except Exception:
            pass

    def _allow_tf32():
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True

    def _select_sdp_kernels():
        torch.backends.cuda.enable_flash_sdp(True)
        torch.backends.cuda.enable_mem_efficient_sdp(True)
        torch.backends.cuda.enable_math_sdp(False)

    if torch.cuda.is_available():
        _quietly(_allow_tf32)
        _quietly(_select_sdp_kernels)

    if hasattr(torch, "set_float32_matmul_precision"):
        _quietly(lambda: torch.set_float32_matmul_precision("high"))
def human_bytes(num: float) -> str:
    """Format a byte count with two decimals and a binary-scaled unit.

    The value is divided by 1024 until its magnitude drops below 1024 or the
    units B..TB are exhausted, in which case it is reported in PB.
    """
    units = ("B", "KB", "MB", "GB", "TB")
    value = num
    position = 0
    while position < len(units):
        if abs(value) < 1024.0:
            return f"{value:.2f} {units[position]}"
        value = value / 1024.0
        position += 1
    return f"{value:.2f} PB"
+def _value_to_text(value: Any) -> str:
+    """Coerce Gradio/Textbox/Multimodal values into plain text.
+    Some Gradio versions send messages as {"text": ..., "files": ...} or
+    content blocks like [{"type": "text", "text": ...}].  The local UI is
+    text-only, so we aggressively unwrap these before tokenization.
+    """
+    if value is None:
+        return ""
+    if isinstance(value, str):
+        return value
+    if isinstance(value, dict):
+        if "text" in value:
+            return _value_to_text(value.get("text"))
+        if "content" in value:
+            return _value_to_text(value.get("content"))
+        if "value" in value:
+            return _value_to_text(value.get("value"))
+        return "\n".join(_value_to_text(v) for v in value.values() if _value_to_text(v))
+    if isinstance(value, (list, tuple)):
+        return "\n".join(_value_to_text(v) for v in value if _value_to_text(v))
+    return str(value)
def clean_text(text: Any) -> str:
    """Normalise a message or model output into plain display text.

    Steps: unwrap the value to a string and turn literal backslash-n
    sequences into newlines; cut at the earliest stop string found after
    position 0; delete any remaining stop strings; drop a leading role label;
    remove ``{"type": "text"}`` style UI artefacts; right-strip every line and
    discard lines that start with ``type: 'text'``.
    """
    cleaned = _value_to_text(text).replace("\\n", "\n")

    # Cut only after obvious turn-control strings that appear in generated text.
    positions = [cleaned.find(marker) for marker in STOP_STRINGS]
    later_hits = [pos for pos in positions if pos > 0]
    if later_hits:
        cleaned = cleaned[: min(later_hits)]
    for marker in STOP_STRINGS:
        cleaned = cleaned.replace(marker, "")

    # Remove common role remnants and JSON-ish UI artifacts.
    for role_prefix in ("assistant\n", "Assistant:", "Lulu:", "assistant:"):
        left_trimmed = cleaned.lstrip()
        if left_trimmed.startswith(role_prefix):
            cleaned = left_trimmed[len(role_prefix):]
    cleaned = cleaned.replace("{'type': 'text'}", "")
    cleaned = cleaned.replace('{"type": "text"}', "")

    kept_lines = []
    for raw_line in cleaned.strip().splitlines():
        line = raw_line.rstrip()
        if line.strip().startswith("type: 'text'"):
            continue
        kept_lines.append(line)
    return "\n".join(kept_lines).strip()
def normalize_history(history: Any) -> List[Dict[str, str]]:
    """Convert chat history into a flat list of role/content dicts.

    Two input shapes are understood: message dicts with ``role`` and
    ``content`` keys (only ``user`` and ``assistant`` roles are kept), and
    ``(user, assistant)`` pairs given as tuples or lists. All content is
    passed through ``clean_text``; entries that end up empty are dropped.
    """
    turns: List[Dict[str, str]] = []
    for entry in history or ():
        if isinstance(entry, dict):
            role = entry.get("role")
            content = clean_text(entry.get("content", ""))
            if role in {"user", "assistant"} and content:
                turns.append({"role": role, "content": content})
            continue
        if isinstance(entry, (tuple, list)) and len(entry) >= 2:
            user_text = clean_text(entry[0])
            assistant_text = clean_text(entry[1])
            for role, content in (("user", user_text), ("assistant", assistant_text)):
                if content:
                    turns.append({"role": role, "content": content})
    return turns
def resolve_model_py(model_py: Optional[str] = None) -> str:
    """Return the absolute path of the LULUV2 architecture file.

    Candidates are tried in order: the explicit ``model_py`` path (with ``~``
    expanded), ``luluv2_inference_runtime.py`` in the current working
    directory, and the same file name in the directory containing this
    module. The last location makes the "next to this UI" advice in the error
    message hold even when the process is started from another directory.

    Raises:
        FileNotFoundError: If no candidate is an existing file.
    """
    default_name = "luluv2_inference_runtime.py"
    candidates = []
    if model_py:
        candidates.append(Path(model_py).expanduser())
    candidates.append(Path(default_name))
    # __file__ can be missing in some embedded/interactive contexts.
    this_file = globals().get("__file__")
    if this_file:
        candidates.append(Path(this_file).resolve().parent / default_name)

    for candidate in candidates:
        # is_file() rather than exists(): a directory cannot be imported as
        # the model file and would only fail later with a confusing error.
        if candidate.is_file():
            return str(candidate.resolve())
    raise FileNotFoundError(
        "Could not find the LULUV2 model file. Pass --model-py or put "
        "luluv2_inference_runtime.py next to this UI."
    )
def import_model_py(model_py: Optional[str] = None):
    """Import the LULUV2 architecture file as a module.

    The file located by ``resolve_model_py`` is executed under the module
    name ``luluv2_runtime_module``.

    Returns:
        ``(module, path)``: the imported module and the resolved file path.

    Raises:
        FileNotFoundError: If the model file cannot be located.
        RuntimeError: If no import spec/loader can be created for it.
    """
    import sys  # local import: this module does not import sys at file level

    module_name = "luluv2_runtime_module"
    path = resolve_model_py(model_py)
    spec = importlib.util.spec_from_file_location(module_name, path)
    if spec is None or spec.loader is None:
        raise RuntimeError(f"Could not import model file: {path}")
    mod = importlib.util.module_from_spec(spec)

    # Register the module before executing it, as the importlib recipe for
    # importing a source file does. Code that looks its own module up through
    # sys.modules while being defined (dataclasses with string annotations,
    # pickling of its classes) otherwise fails.
    previous = sys.modules.get(module_name)
    sys.modules[module_name] = mod
    try:
        spec.loader.exec_module(mod)
    except BaseException:
        # Do not leave a half-initialised module registered.
        if previous is None:
            sys.modules.pop(module_name, None)
        else:
            sys.modules[module_name] = previous
        raise
    return mod, path
class LULUV2LiveEngine:
    """Load a LULUV2 checkpoint and stream sampled text with live metrics.

    Construction imports the architecture file, loads the base model and the
    tokenizer, and wraps the base in the two-pass model when the checkpoint
    carries ``pass2_state``. ``generate`` then yields progressively longer
    cleaned text while updating ``last_stats`` and ``recent_tokens``.
    """

    def __init__(
        self,
        ckpt_path: str,
        model_py: Optional[str] = None,
        tokenizer_dir: Optional[str] = None,
        device: Optional[str] = None,
        dtype: str = "bf16",
        local_files_only: bool = True,
        no_config_download: bool = True,
        force_base_only: bool = False,
    ):
        """Load model and tokenizer.

        Args:
            ckpt_path: Path of the checkpoint file.
            model_py: Optional path of the architecture file.
            tokenizer_dir: Optional tokenizer directory or id.
            device: Torch device string; CUDA is chosen when available if omitted.
            dtype: ``bf16``/``fp16`` style name; anything else means float32.
            local_files_only: Forwarded to the tokenizer loader.
            no_config_download: Forwarded to the runtime helpers via ``args``.
            force_base_only: Ignore ``pass2_state`` even if present.
        """
        setup_torch()
        self.ckpt_path = str(ckpt_path)
        self.ckpt_dir = Path(self.ckpt_path).resolve().parent
        self.device = self._select_device(device)
        self.dtype = self._dtype_from_name(dtype)
        self.local_files_only = bool(local_files_only)
        self.no_config_download = bool(no_config_download)
        self.force_base_only = bool(force_base_only)
        self.last_stats = GenerationStats()
        self.recent_tokens: List[Dict[str, Any]] = []
        # ``goku`` is the imported architecture/runtime module.
        self.goku, self.model_py_path = import_model_py(model_py)
        # args object expected by the embedded LULUV2 runtime helpers.
        self.args = SimpleNamespace(
            checkpoint=self.ckpt_path,
            tokenizer=tokenizer_dir or "",
            model_id="",
            no_config_download=self.no_config_download,
            local_files_only=self.local_files_only,
        )
        print("[guard] LULUV2 local UI: no AutoModelForCausalLM.from_pretrained call and no external model weights loaded.")
        print(f"[load] checkpoint={self.ckpt_path}")
        self.base_ckpt, base = self.goku.load_lulu2_base(self.args, self.device, self.dtype)
        self.tokenizer = self._load_tokenizer(tokenizer_dir)
        self.model, self.has_pass2 = self._maybe_wrap_pass2(base)
        self.model.eval()
        self.model_info = self._build_model_info()

    def _select_device(self, device: Optional[str]):
        """Return the requested device, else CUDA when available, else CPU."""
        if device:
            return torch.device(device)
        if torch.cuda.is_available():
            return torch.device("cuda")
        return torch.device("cpu")

    def _dtype_from_name(self, name: str):
        """Map a dtype name to a torch dtype; unknown names mean float32."""
        name = (name or "bf16").lower()
        if name in {"bf16", "bfloat16"}:
            return torch.bfloat16
        if name in {"fp16", "float16", "half"}:
            return torch.float16
        return torch.float32

    def _load_tokenizer(self, tokenizer_dir: Optional[str]):
        """Load the tokenizer through the runtime module's helper."""
        # Prefer explicit path, then sibling tokenizer folder, then checkpoint metadata.
        if tokenizer_dir:
            self.args.tokenizer = tokenizer_dir
        else:
            sibling = self.ckpt_dir / "tokenizer"
            if sibling.is_dir():
                self.args.tokenizer = str(sibling)
        tok = self.goku.load_tokenizer(self.args, self.base_ckpt)
        if getattr(tok, "pad_token_id", None) is None and getattr(tok, "eos_token_id", None) is not None:
            # Best effort: some tokenizer objects may reject the assignment.
            try:
                tok.pad_token = tok.eos_token
            except Exception:
                pass
        return tok

    def _maybe_wrap_pass2(self, base):
        """Return ``(model, has_pass2)``, wrapping *base* when pass-2 weights exist."""
        ckpt = self.base_ckpt
        if self.force_base_only or "pass2_state" not in ckpt:
            print("[pass2] no pass2_state loaded; running base LULUV2 forward")
            return base.to(self.device).eval(), False
        cfg_dict = dict(ckpt.get("pass2_config") or {})
        Pass2Config = self.goku.Pass2Config
        # Unknown keys are dropped so older/newer checkpoints still load.
        pass2_cfg = Pass2Config(**{k: v for k, v in cfg_dict.items() if k in Pass2Config.__dataclass_fields__})
        model = self.goku.Lulu2TwoPassForCausalLM(base, pass2_cfg)
        missing, unexpected = model.load_state_dict(ckpt["pass2_state"], strict=False)
        print(f"[pass2] loaded pass2_state missing={len(missing)} unexpected={len(unexpected)}")
        model.to(device=self.device, dtype=self.dtype)
        model.eval()
        return model, True

    def _build_model_info(self) -> Dict[str, Any]:
        """Collect static facts about the loaded model for the UI."""
        total_params = sum(p.numel() for p in self.model.parameters())
        trainable_params = sum(p.numel() for p in self.model.parameters() if p.requires_grad)
        # Parameters whose name ends in ".c" are reported as "VWM c" parameters.
        c_codes = [(n, p.numel()) for n, p in self.model.named_parameters() if n.endswith(".c")]
        gate_mean = None
        adapter_gate_mean = None
        if self.has_pass2:
            with torch.no_grad():
                gate_mean = float(torch.sigmoid(self.model.layer_gates.float()).mean().item())
                vals = [float(torch.sigmoid(ad.gate.float()).item()) for ad in self.model.adapters]
                adapter_gate_mean = float(sum(vals) / max(1, len(vals)))
        ckpt_file = Path(self.ckpt_path)
        ckpt_size = ckpt_file.stat().st_size if ckpt_file.exists() else 0
        cfg = getattr(self.model.base if self.has_pass2 else self.model, "config", None)
        return {
            "checkpoint": self.ckpt_path,
            "checkpoint_size": human_bytes(ckpt_size),
            "model_py": self.model_py_path,
            "device": str(self.device),
            "dtype": str(self.dtype).replace("torch.", ""),
            "has_pass2": self.has_pass2,
            "total_params": total_params,
            "trainable_params": trainable_params,
            "vwm_c_modules": len(c_codes),
            "vwm_c_params": sum(n for _, n in c_codes),
            "pass2_layer_gate_mean": gate_mean,
            "pass2_adapter_gate_mean": adapter_gate_mean,
            "hidden_size": getattr(cfg, "hidden_size", None),
            "layers": getattr(cfg, "num_hidden_layers", None),
            "heads": getattr(cfg, "num_attention_heads", None),
            "kv_heads": getattr(cfg, "num_key_value_heads", None),
            "max_position_embeddings": getattr(cfg, "max_position_embeddings", None),
        }

    def amp_context(self):
        """Return CUDA autocast for half precision dtypes, else a no-op context."""
        if self.device.type == "cuda" and self.dtype in (torch.bfloat16, torch.float16):
            return torch.autocast("cuda", dtype=self.dtype)
        return nullcontext()

    def build_chat_prompt(
        self,
        message: str,
        history: Any,
        system_prompt: str,
        memory_notes: str = "",
        history_turns: int = 4,
        extra_context: str = "",
    ) -> str:
        """Render system text, recent history and *message* as a chat prompt.

        Args:
            message: Latest user message (cleaned before use).
            history: Prior turns in any shape ``normalize_history`` accepts.
            system_prompt: Base system instruction.
            memory_notes: Optional notes appended to the system text.
            history_turns: Number of most recent user/assistant exchanges to
                include; zero, negative or empty values include none.
            extra_context: Optional context appended to the system text.

        Returns:
            The tokenizer's chat-template rendering with a generation prompt,
            or a ChatML-style fallback when the template cannot be applied.
        """
        history = normalize_history(history)
        # Clamp first, then slice: with the negation applied before the
        # clamp, a negative or fractional value turned into ``history[-0:]``
        # and silently selected the whole history.
        turns = max(0, int(history_turns or 0))
        recent = history[-2 * turns:] if turns else []

        labelled_sections = (
            ("", system_prompt),
            ("Useful memory notes:\n", memory_notes),
            ("Relevant local context:\n", extra_context),
        )
        system_chunks = []
        for label, section in labelled_sections:
            section = (section or "").strip()
            if section:
                system_chunks.append(label + section)
        system = "\n\n".join(system_chunks)

        user_text = clean_text(message)
        messages = []
        if system:
            messages.append({"role": "system", "content": system})
        messages.extend(recent)
        messages.append({"role": "user", "content": user_text})
        try:
            return self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        except Exception:
            # Tokenizer has no usable chat template: fall back to ChatML.
            parts = []
            if system:
                parts.append(f"<|im_start|>system\n{system}<|im_end|>")
            for item in recent:
                parts.append(f"<|im_start|>{item['role']}\n{item['content']}<|im_end|>")
            parts.append(f"<|im_start|>user\n{user_text}<|im_end|>")
            parts.append("<|im_start|>assistant\n")
            return "\n".join(parts)

    def encode(self, text: str, max_context_tokens: int = 4096) -> torch.Tensor:
        """Tokenize *text*, keeping at most the last ``max_context_tokens`` ids.

        Over-long prompts are trimmed from the front. Tokenizer-side
        truncation cuts from the end by default, which would drop the newest
        user turn and the assistant generation prompt; ``generate`` uses the
        same trailing-window policy while decoding.
        """
        ids = self.tokenizer(text, return_tensors="pt").input_ids
        limit = int(max_context_tokens)
        if limit > 0 and ids.shape[1] > limit:
            ids = ids[:, -limit:]
        return ids.to(self.device)

    @torch.no_grad()
    def pass_metrics_for_ids(self, ids: torch.Tensor) -> Tuple[Optional[float], Optional[float]]:
        """Compare pass-1 and pass-2 logits at the last position of *ids*.

        Returns:
            ``(kl, cosine)`` as floats, or ``(None, None)`` when the model has
            no second pass or the computation fails (the failure is printed).
        """
        if not self.has_pass2:
            return None, None
        try:
            with self.amp_context():
                out = self.model(ids, return_pass1_logits=True)
                if out.pass1_logits is None:
                    return None, None
                l1 = out.pass1_logits[:, -1, :].float()
                l2 = out.logits[:, -1, :].float()
                # kl_div(input=log q, target=p) evaluates KL(p || q) with
                # p = pass-1 and q = pass-2 distributions.
                kl = F.kl_div(F.log_softmax(l2, dim=-1), F.softmax(l1, dim=-1), reduction="batchmean")
                cos = F.cosine_similarity(l1, l2, dim=-1).mean()
                return float(kl.item()), float(cos.item())
        except Exception as exc:
            print(f"[metrics] pass metrics failed: {type(exc).__name__}: {exc}")
            return None, None

    def _apply_penalties(self, logits: torch.Tensor, generated: torch.Tensor, cfg: GenerationConfig) -> torch.Tensor:
        """Return logits adjusted for repetition, frequency and n-gram bans.

        The input tensor is not modified; it is returned as-is when nothing
        has been generated yet.
        """
        if generated.numel() == 0:
            return logits
        out = logits.clone()
        uniq, counts = torch.unique(generated.view(-1), return_counts=True)
        if cfg.repetition_penalty != 1.0:
            selected = out[:, uniq]
            # Dividing positive and multiplying non-positive logits both
            # lower the score of an already used token.
            selected = torch.where(selected > 0, selected / float(cfg.repetition_penalty), selected * float(cfg.repetition_penalty))
            out[:, uniq] = selected
        if cfg.frequency_penalty:
            out[:, uniq] -= float(cfg.frequency_penalty) * counts.to(out.dtype).unsqueeze(0)
        n = int(cfg.no_repeat_ngram)
        if n > 1 and generated.size(1) >= n - 1:
            seq = generated[0].tolist()
            prefix = tuple(seq[-(n - 1):])
            # Every token that previously followed the current (n-1)-gram
            # would complete a repeated n-gram, so it is banned.
            banned = {
                seq[i + n - 1]
                for i in range(len(seq) - n + 1)
                if tuple(seq[i:i + n - 1]) == prefix
            }
            if banned:
                out[:, list(banned)] = -float("inf")
        return out

    @torch.no_grad()
    def _sample_next(self, logits: torch.Tensor, generated: torch.Tensor, cfg: GenerationConfig) -> Tuple[torch.Tensor, Dict[str, Any]]:
        """Choose the next token id from last-position *logits*.

        Penalties are applied first. Greedy mode (``cfg.greedy`` or
        temperature <= 0) takes the argmax; otherwise temperature, top-k,
        top-p and min-p filters are applied in that order before sampling.

        Returns:
            ``(next_id, {"prob": ..., "entropy": ...})`` where the metrics
            refer to the distribution the token was drawn from.
        """
        work = self._apply_penalties(logits.float(), generated, cfg)
        if cfg.greedy or cfg.temperature <= 0:
            probs = torch.softmax(work, dim=-1)
            next_id = torch.argmax(work, dim=-1, keepdim=True)
        else:
            work = work / max(float(cfg.temperature), 1e-6)
            if cfg.top_k > 0:
                k = min(int(cfg.top_k), work.size(-1))
                thresh = torch.topk(work, k, dim=-1).values[..., -1, None]
                work = torch.where(work >= thresh, work, torch.full_like(work, -float("inf")))
            if 0.0 < cfg.top_p < 1.0:
                sorted_logits, sorted_idx = torch.sort(work, descending=True, dim=-1)
                sorted_probs = torch.softmax(sorted_logits, dim=-1)
                cumprobs = torch.cumsum(sorted_probs, dim=-1)
                remove = cumprobs > float(cfg.top_p)
                # Shift right so the token that crosses the threshold is
                # kept and the top token can never be removed.
                shifted = remove.clone()
                shifted[..., 1:] = remove[..., :-1]
                shifted[..., 0] = False
                sorted_logits = sorted_logits.masked_fill(shifted, -float("inf"))
                work = torch.full_like(work, -float("inf")).scatter(1, sorted_idx, sorted_logits)
            if 0.0 < cfg.min_p < 1.0:
                probs_for_minp = torch.softmax(work, dim=-1)
                max_prob = probs_for_minp.max(dim=-1, keepdim=True).values
                keep = probs_for_minp >= float(cfg.min_p) * max_prob
                work = work.masked_fill(~keep, -float("inf"))
            probs = torch.softmax(work, dim=-1)
            if torch.isnan(probs).any() or not torch.isfinite(probs.sum()) or float(probs.sum()) <= 0:
                # Filtering left no valid distribution (for example every
                # candidate was banned): fall back to the raw argmax.
                next_id = torch.argmax(logits, dim=-1, keepdim=True)
                probs = torch.softmax(logits.float(), dim=-1)
            else:
                next_id = torch.multinomial(probs, 1)
        prob = float(probs.gather(1, next_id).item()) if probs.numel() else 0.0
        entropy = float((-(probs * torch.log(probs.clamp_min(1e-12))).sum(dim=-1)).mean().item()) if probs.numel() else 0.0
        return next_id, {"prob": prob, "entropy": entropy}

    @torch.no_grad()
    def generate(self, prompt: str, cfg: GenerationConfig) -> Generator[str, None, None]:
        """Stream the cleaned continuation of *prompt*.

        Yields the full cleaned text generated so far each time it changes at
        a streaming step, and once more at the end. ``last_stats`` and
        ``recent_tokens`` (last 32 tokens) are updated along the way.
        Decoding ends on EOS, on a stop string in the decoded continuation,
        or after ``cfg.max_new_tokens`` steps.
        """
        self.model.eval()
        self.recent_tokens = []
        max_context = int(cfg.max_context_tokens)
        # A value of 0 (or less) would otherwise raise ZeroDivisionError in
        # the modulo below; treat it as "stream every token".
        stream_every = max(1, int(cfg.stream_every))

        ids = self.encode(prompt, max_context_tokens=max_context)
        prompt_len = int(ids.shape[1])
        t0 = time.time()
        pass_kl, pass_cos = (None, None)
        if cfg.return_pass_metrics:
            pass_kl, pass_cos = self.pass_metrics_for_ids(ids)
        eos_id = getattr(self.tokenizer, "eos_token_id", None)
        last_text = ""
        finish_reason = "length"

        for step in range(int(cfg.max_new_tokens)):
            ctx = ids[:, -max_context:]
            with self.amp_context():
                out = self.model(ctx)
                logits = out.logits[:, -1, :].float()
            generated = ids[:, prompt_len:]
            next_id, tok_stats = self._sample_next(logits, generated, cfg)
            ids = torch.cat([ids, next_id.to(ids.device)], dim=-1)
            token_id = int(next_id.item())
            token_text = self.tokenizer.decode([token_id], skip_special_tokens=False)
            self.recent_tokens.append({
                "i": step + 1,
                "id": token_id,
                "text": token_text,
                "prob": tok_stats["prob"],
                "entropy": tok_stats["entropy"],
            })
            self.recent_tokens = self.recent_tokens[-32:]
            if eos_id is not None and token_id == int(eos_id):
                finish_reason = "eos"
                break
            if (step + 1) % stream_every == 0 or step == 0:
                raw = self.tokenizer.decode(ids[0, prompt_len:], skip_special_tokens=True)
                if any(s in raw for s in STOP_STRINGS):
                    finish_reason = "stop_string"
                    break
                text = clean_text(raw)
                if text and text != last_text:
                    elapsed = time.time() - t0
                    gen = int(ids.shape[1]) - prompt_len
                    self.last_stats = GenerationStats(
                        prompt_tokens=prompt_len,
                        generated_tokens=gen,
                        elapsed_sec=elapsed,
                        tokens_per_sec=gen / max(elapsed, 1e-9),
                        last_token=token_text,
                        last_token_id=token_id,
                        last_token_prob=tok_stats["prob"],
                        last_entropy=tok_stats["entropy"],
                        finish_reason="streaming",
                        pass1_pass2_kl=pass_kl,
                        pass1_pass2_logit_cosine=pass_cos,
                    )
                    last_text = text
                    yield text

        raw = self.tokenizer.decode(ids[0, prompt_len:], skip_special_tokens=True)
        final = clean_text(raw)
        elapsed = time.time() - t0
        gen = int(ids.shape[1]) - prompt_len
        last = self.recent_tokens[-1] if self.recent_tokens else None
        self.last_stats = GenerationStats(
            prompt_tokens=prompt_len,
            generated_tokens=gen,
            elapsed_sec=elapsed,
            tokens_per_sec=gen / max(elapsed, 1e-9),
            last_token=last["text"] if last else "",
            last_token_id=last["id"] if last else -1,
            last_token_prob=last["prob"] if last else 0.0,
            last_entropy=last["entropy"] if last else 0.0,
            finish_reason=finish_reason,
            pass1_pass2_kl=pass_kl,
            pass1_pass2_logit_cosine=pass_cos,
        )
        if final:
            yield final

    def stats_dict(self) -> Dict[str, Any]:
        """Return the last generation stats plus model info and a system snapshot."""
        d = asdict(self.last_stats)
        d["model"] = self.model_info
        d["system"] = system_snapshot(self)
        return d

    def stats_text(self) -> str:
        """Return the last generation stats and model info as display text."""
        s = self.last_stats
        lines = [
            f"Prompt tokens: {s.prompt_tokens}",
            f"Generated tokens: {s.generated_tokens}",
            f"Elapsed: {s.elapsed_sec:.2f}s",
            f"Decode speed: {s.tokens_per_sec:.2f} tok/s",
            f"Finish reason: {s.finish_reason}",
            f"Last token: {s.last_token!r} id={s.last_token_id} p={s.last_token_prob:.4f}",
            f"Last entropy: {s.last_entropy:.3f}",
        ]
        if s.pass1_pass2_kl is not None:
            lines.append(f"Pass1→Pass2 KL: {s.pass1_pass2_kl:.6f}")
        if s.pass1_pass2_logit_cosine is not None:
            lines.append(f"Pass1/Pass2 logit cosine: {s.pass1_pass2_logit_cosine:.6f}")
        lines.extend([
            "",
            f"Checkpoint: {self.model_info['checkpoint']}",
            f"Checkpoint size: {self.model_info['checkpoint_size']}",
            f"Device: {self.model_info['device']} dtype={self.model_info['dtype']}",
            f"Pass2 active: {self.model_info['has_pass2']}",
            f"Params: {self.model_info['total_params']:,}",
            f"VWM c modules: {self.model_info['vwm_c_modules']} ({self.model_info['vwm_c_params']:,} c params)",
            f"Layer gate mean: {self.model_info['pass2_layer_gate_mean']}",
            f"Adapter gate mean: {self.model_info['pass2_adapter_gate_mean']}",
        ])
        return "\n".join(lines)

    def token_trace_text(self) -> str:
        """Return one line per recent token (at most 24) for the trace panel."""
        if not self.recent_tokens:
            return "No tokens generated yet."
        rows = []
        for t in self.recent_tokens[-24:]:
            # repr() escapes control characters; [1:-1] drops its quotes.
            safe = repr(t["text"])[1:-1]
            rows.append(f"{t['i']:04d}  id={t['id']:<7}  p={t['prob']:.4f}  H={t['entropy']:.2f}  {safe}")
        return "\n".join(rows)
def system_snapshot(engine: Optional[LULUV2LiveEngine] = None) -> Dict[str, Any]:
    """Return compact live edge-device metrics for the UI cards.

    Values are safe for JSON/HTML display. NVML is used when available for
    whole-device VRAM/utilization; PyTorch counters are always included.

    Args:
        engine: Optional engine. When its ``device`` is a CUDA device with an
            explicit index, that GPU is reported instead of the process-wide
            current device.

    Returns:
        Dict of display values; entries keep their "n/a"/0/None defaults when
        the corresponding probe is unavailable or fails.
    """
    snap: Dict[str, Any] = {
        "python_ram": "n/a",
        "system_ram": "n/a",
        "system_ram_percent": 0.0,
        "cpu_percent": 0.0,
        "gpu_name": "CUDA unavailable",
        "vram_allocated": "n/a",
        "vram_reserved": "n/a",
        "vram_used": "n/a",
        "vram_total": "n/a",
        "vram_percent": 0.0,
        "gpu_util_percent": None,
        "gpu_temp_c": None,
    }
    if psutil is not None:
        # Metrics are best effort: a failing probe must never break the UI.
        try:
            proc = psutil.Process(os.getpid())
            vm = psutil.virtual_memory()
            snap.update({
                "python_ram": human_bytes(proc.memory_info().rss),
                "system_ram": f"{human_bytes(vm.used)} / {human_bytes(vm.total)}",
                "system_ram_percent": float(vm.percent),
                "cpu_percent": float(psutil.cpu_percent(interval=0.0)),
            })
        except Exception:
            pass
    if torch.cuda.is_available():
        try:
            idx = torch.cuda.current_device()
            # Report the GPU the engine actually runs on; the current device
            # is a different card when the engine was created on e.g. "cuda:1".
            engine_device = getattr(engine, "device", None)
            if getattr(engine_device, "type", None) == "cuda" and engine_device.index is not None:
                idx = int(engine_device.index)
            props = torch.cuda.get_device_properties(idx)
            allocated = int(torch.cuda.memory_allocated(idx))
            reserved = int(torch.cuda.memory_reserved(idx))
            total = int(props.total_memory)
            snap.update({
                "gpu_name": props.name,
                "vram_allocated": human_bytes(allocated),
                "vram_reserved": human_bytes(reserved),
                "vram_used": human_bytes(allocated),
                "vram_total": human_bytes(total),
                "vram_percent": (100.0 * allocated / max(total, 1)),
            })
            if pynvml is not None:
                # NVML numbers (whole device) replace the PyTorch-only ones.
                # NOTE(review): NVML indices may differ from CUDA indices
                # when CUDA_VISIBLE_DEVICES remaps devices - confirm.
                try:
                    pynvml.nvmlInit()
                    handle = pynvml.nvmlDeviceGetHandleByIndex(idx)
                    util = pynvml.nvmlDeviceGetUtilizationRates(handle)
                    mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
                    temp = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
                    snap.update({
                        "gpu_util_percent": int(util.gpu),
                        "vram_used": human_bytes(int(mem.used)),
                        "vram_total": human_bytes(int(mem.total)),
                        "vram_percent": (100.0 * float(mem.used) / max(float(mem.total), 1.0)),
                        "gpu_temp_c": int(temp),
                    })
                except Exception:
                    pass
        except Exception:
            pass
    return snap
+def system_usage(engine: Optional[LULUV2LiveEngine] = None) -> str:
+    lines = [f"OS: {platform.system()} {platform.release()}"]
+    if psutil is not None:
+        proc = psutil.Process(os.getpid())
+        vm = psutil.virtual_memory()
+        lines += [
+            f"Python RAM: {human_bytes(proc.memory_info().rss)}",
+            f"System RAM: {human_bytes(vm.used)} / {human_bytes(vm.total)} ({vm.percent:.1f}%)",
+            f"CPU: {psutil.cpu_percent(interval=0.0):.1f}%",
+        ]
+    else:
+        lines.append("psutil unavailable")
+    if torch.cuda.is_available():
+        idx = torch.cuda.current_device()
+        props = torch.cuda.get_device_properties(idx)
+        lines += [
+            "",
+            f"GPU: {props.name}",
+            f"VRAM allocated: {human_bytes(torch.cuda.memory_allocated(idx))}",
+            f"VRAM reserved: {human_bytes(torch.cuda.memory_reserved(idx))}",
+            f"VRAM total: {human_bytes(props.total_memory)}",
+        ]
+        if pynvml is not None:
+            try:
+                pynvml.nvmlInit()
+                handle = pynvml.nvmlDeviceGetHandleByIndex(idx)
+                util = pynvml.nvmlDeviceGetUtilizationRates(handle)
+                mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
+                temp = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
+                lines += [
+                    f"GPU util: {util.gpu}%",
+                    f"GPU memory: {human_bytes(mem.used)} / {human_bytes(mem.total)}",
+                    f"GPU temperature: {temp} C",
+                ]
+            except Exception as exc:
+                lines.append(f"NVML unavailable: {type(exc).__name__}: {exc}")
+    else:
+        lines += ["", "GPU: CUDA unavailable"]
+    if engine is not None:
+        lines += ["", engine.stats_text()]
+    return "\n".join(lines)

luluv2_optimized_engine.py ADDED Viewed

	@@ -0,0 +1,1133 @@

+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+LULUV2 optimized local inference engine.
+Goals:
+- load LULU2/LULUV2 checkpoints through the existing LULUV2 model file
+- no AutoModelForCausalLM.from_pretrained and no external model weights
+- vectorized prompt prefill into explicit KV caches
+- persistent session KV cache across turns when prompt tokens extend prior prompt
+- modes: fast(pass1/base), vwm(pass1+pass2), deep(pass1+pass2 long context)
+- safe fallback to slow full-prefix forward if cached path fails
+This is intentionally Python-first and debuggable.  It is a bridge toward
+kernel/CUDA-graph optimization, not the final kernel path.
+"""
+from __future__ import annotations
+import importlib.util
+import json
+import math
+import os
+import platform
+import time
+import traceback
+from contextlib import nullcontext
+from dataclasses import dataclass, asdict
+from pathlib import Path
+from types import SimpleNamespace
+from typing import Any, Dict, Generator, List, Optional, Tuple
+import torch
+import torch.nn.functional as F
+try:
+    import psutil
+except Exception:
+    psutil = None
+try:
+    import pynvml
+except Exception:
+    pynvml = None
+# Chat-template markers and role labels that clean_text() treats as the end of
+# a reply: text is cut at the earliest one found past index 0, and every
+# remaining occurrence is then deleted.
+STOP_STRINGS = [
+    "<|im_start|>", "<|im_end|>", "<|user|>", "<|system|>", "<|assistant|>",
+    "User:", "Assistant:", "\nuser:", "\nassistant:",
+]
+def setup_torch() -> None:
+    """Apply process-wide PyTorch performance switches, ignoring any that fail.
+    With CUDA available: enables TF32 for matmul and cuDNN, enables the flash
+    and memory-efficient scaled-dot-product-attention backends and disables the
+    math backend. Independently of CUDA, sets float32 matmul precision to
+    "high" when the running torch exposes that setter.
+    Each group sits in its own try block so an unsupported attribute on one
+    torch build does not prevent the remaining switches from being applied.
+    """
+    if torch.cuda.is_available():
+        try:
+            # Old API still works on current wheels; warnings are harmless.
+            torch.backends.cuda.matmul.allow_tf32 = True
+            torch.backends.cudnn.allow_tf32 = True
+        except Exception:
+            pass
+        try:
+            # NOTE(review): with the math backend off there is no generic
+            # fallback kernel -- confirm flash/mem-efficient cover every
+            # dtype/shape this engine feeds to scaled_dot_product_attention.
+            torch.backends.cuda.enable_flash_sdp(True)
+            torch.backends.cuda.enable_mem_efficient_sdp(True)
+            torch.backends.cuda.enable_math_sdp(False)
+        except Exception:
+            pass
+    if hasattr(torch, "set_float32_matmul_precision"):
+        try:
+            torch.set_float32_matmul_precision("high")
+        except Exception:
+            pass
+def human_bytes(num: float) -> str:
+    num = float(num)
+    for unit in ["B", "KB", "MB", "GB", "TB"]:
+        if abs(num) < 1024.0:
+            return f"{num:.2f} {unit}"
+        num /= 1024.0
+    return f"{num:.2f} PB"
+def value_to_text(value: Any) -> str:
+    if value is None:
+        return ""
+    if isinstance(value, str):
+        return value
+    if isinstance(value, dict):
+        for key in ("text", "content", "value"):
+            if key in value:
+                return value_to_text(value.get(key))
+        return "\n".join(value_to_text(v) for v in value.values() if value_to_text(v))
+    if isinstance(value, (list, tuple)):
+        return "\n".join(value_to_text(v) for v in value if value_to_text(v))
+    return str(value)
+def clean_text(text: Any) -> str:
+    text = value_to_text(text).replace("\\n", "\n")
+    cut_points = [text.find(s) for s in STOP_STRINGS if s in text and text.find(s) > 0]
+    if cut_points:
+        text = text[: min(cut_points)]
+    for s in STOP_STRINGS:
+        text = text.replace(s, "")
+    text = text.strip()
+    for prefix in ("Assistant:", "assistant:", "Lulu:", "lulu:"):
+        if text.startswith(prefix):
+            text = text[len(prefix):].strip()
+    lines = [ln.rstrip() for ln in text.splitlines()]
+    # collapse excessive vertical whitespace without destroying code blocks too much
+    out: List[str] = []
+    blank = 0
+    for ln in lines:
+        if not ln.strip():
+            blank += 1
+            if blank <= 2:
+                out.append("")
+        else:
+            blank = 0
+            out.append(ln)
+    return "\n".join(out).strip()
+def normalize_history(history: Any) -> List[Dict[str, str]]:
+    out: List[Dict[str, str]] = []
+    if not history:
+        return out
+    for item in history:
+        if isinstance(item, dict):
+            role = item.get("role", "")
+            content = clean_text(item.get("content", ""))
+            if role in {"user", "assistant"} and content:
+                out.append({"role": role, "content": content})
+        elif isinstance(item, (tuple, list)) and len(item) >= 2:
+            u = clean_text(item[0])
+            a = clean_text(item[1])
+            if u:
+                out.append({"role": "user", "content": u})
+            if a:
+                out.append({"role": "assistant", "content": a})
+    return out
+def resolve_model_py(model_py: Optional[str]) -> str:
+    candidates: List[str] = []
+    if model_py:
+        candidates.append(model_py)
+    candidates.extend(["luluv2_inference_runtime.py"])
+    for c in candidates:
+        p = Path(c)
+        if p.exists():
+            return str(p.resolve())
+    raise FileNotFoundError("Could not find LULUV2 model file. Pass --model-py.")
+def import_model_py(model_py: Optional[str]):
+    path = resolve_model_py(model_py)
+    spec = importlib.util.spec_from_file_location("luluv2_runtime_module", path)
+    if spec is None or spec.loader is None:
+        raise RuntimeError(f"Could not import model file: {path}")
+    mod = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(mod)
+    return mod, path
+@dataclass
+class GenerationConfig:
+    """Per-request generation settings.
+    In the code visible next to this class only ``mode``,
+    ``max_context_tokens`` and ``persistent_cache`` are read (by the cached
+    context preparation). The remaining fields are presumably consumed by the
+    sampling/streaming loop further down the file -- TODO confirm there.
+    """
+    max_new_tokens: int = 512
+    # Sampling knobs; their exact semantics live where they are consumed.
+    temperature: float = 0.65
+    top_k: int = 40
+    top_p: float = 0.90
+    min_p: float = 0.03
+    repetition_penalty: float = 1.10
+    frequency_penalty: float = 0.02
+    greedy: bool = False
+    no_repeat_ngram: int = 4
+    stream_every: int = 1
+    # Upper bound on prompt/KV length used when preparing the cached context.
+    max_context_tokens: int = 4096
+    mode: str = "vwm"  # fast, vwm, deep, slow
+    return_pass_metrics: bool = True
+    use_cache: bool = True
+    vectorized_prefill: bool = True
+    # When False the KV cache from the previous prompt is never reused.
+    persistent_cache: bool = True
+    compile_step: bool = False
+@dataclass
+class GenerationStats:
+    """Telemetry record for one generation call.
+    The engine constructor creates a default instance as ``last_stats``; the
+    fields are filled in outside the code visible here, so the group comments
+    below are inferred from the field names -- TODO confirm against the
+    generation loop.
+    """
+    # Prompt accounting (presumably mirrors the engine's last_prompt_* counters).
+    prompt_tokens: int = 0
+    prompt_total_tokens: int = 0
+    prompt_kept_tokens: int = 0
+    prompt_dropped_tokens: int = 0
+    # Throughput.
+    generated_tokens: int = 0
+    elapsed_sec: float = 0.0
+    tokens_per_sec: float = 0.0
+    prefill_sec: float = 0.0
+    prefill_tps: float = 0.0
+    # KV-cache reuse.
+    cache_hit: bool = False
+    cache_reused_tokens: int = 0
+    cache_new_prefill_tokens: int = 0
+    mode: str = "vwm"
+    backend: str = "none"
+    # Details of the most recently produced token.
+    last_token: str = ""
+    last_token_id: int = -1
+    last_token_prob: float = 0.0
+    last_entropy: float = 0.0
+    finish_reason: str = "none"
+    # None until a pass1-vs-pass2 comparison has been computed.
+    pass1_pass2_kl: Optional[float] = None
+    pass1_pass2_logit_cosine: Optional[float] = None
+class KVLayerCache:
+    def __init__(self):
+        self.k: Optional[torch.Tensor] = None  # [B, H, T, Dh]
+        self.v: Optional[torch.Tensor] = None
+    @property
+    def length(self) -> int:
+        if self.k is None:
+            return 0
+        return int(self.k.shape[2])
+    def set(self, k: torch.Tensor, v: torch.Tensor, max_len: int) -> None:
+        if k.shape[2] > max_len:
+            k = k[:, :, -max_len:, :]
+            v = v[:, :, -max_len:, :]
+        self.k = k.detach().contiguous()
+        self.v = v.detach().contiguous()
+    def append(self, k: torch.Tensor, v: torch.Tensor, max_len: int) -> None:
+        if self.k is None:
+            self.set(k, v, max_len)
+            return
+        self.k = torch.cat([self.k, k.detach()], dim=2)
+        self.v = torch.cat([self.v, v.detach()], dim=2)
+        if self.k.shape[2] > max_len:
+            self.k = self.k[:, :, -max_len:, :].contiguous()
+            self.v = self.v[:, :, -max_len:, :].contiguous()
+class DecoderKVCache:
+    def __init__(self, n_layers: int):
+        self.layers = [KVLayerCache() for _ in range(int(n_layers))]
+    def clear(self):
+        for layer in self.layers:
+            layer.k = None
+            layer.v = None
+    @property
+    def length(self) -> int:
+        if not self.layers:
+            return 0
+        return self.layers[0].length
+class LULUV2OptimizedEngine:
+    def __init__(
+        self,
+        ckpt_path: str,
+        model_py: Optional[str] = None,
+        tokenizer_dir: Optional[str] = None,
+        device: Optional[str] = None,
+        dtype: str = "bf16",
+        local_files_only: bool = True,
+        no_config_download: bool = True,
+        force_base_only: bool = False,
+    ):
+        """Load the checkpoint, tokenizer and optional pass2 wrapper.
+        Args:
+            ckpt_path: Checkpoint file; its parent directory is also searched
+                for a sibling ``tokenizer`` directory.
+            model_py: Path of the model source file, resolved by import_model_py().
+            tokenizer_dir: Explicit tokenizer directory (optional).
+            device: Device string; when empty, CUDA is used if available, else CPU.
+            dtype: Precision name mapped by _dtype_from_name().
+            local_files_only: Forwarded to the loader through ``self.args``.
+            no_config_download: Forwarded to the loader through ``self.args``.
+            force_base_only: Skip the pass2 wrapper even if the checkpoint has one.
+        """
+        setup_torch()
+        self.ckpt_path = str(ckpt_path)
+        self.ckpt_dir = Path(self.ckpt_path).resolve().parent
+        self.device = self._select_device(device)
+        self.dtype = self._dtype_from_name(dtype)
+        self.local_files_only = bool(local_files_only)
+        self.no_config_download = bool(no_config_download)
+        self.force_base_only = bool(force_base_only)
+        self.last_stats = GenerationStats()
+        self.recent_tokens: List[Dict[str, Any]] = []
+        # Prompt accounting written by encode().
+        self.last_prompt_total_tokens: int = 0
+        self.last_prompt_kept_tokens: int = 0
+        self.last_prompt_dropped_tokens: int = 0
+        # Persistent-cache state: the token ids the KV caches correspond to,
+        # the mode/context size they were built for, and the logits produced
+        # for the last cached position.
+        self.cache_ids: Optional[torch.Tensor] = None
+        self.cache_mode: str = ""
+        self.cache_max_context: int = 0
+        self.pass1_cache: Optional[DecoderKVCache] = None
+        self.pass2_cache: Optional[DecoderKVCache] = None
+        self.cached_logits: Optional[torch.Tensor] = None
+        self.cached_pass1_logits: Optional[torch.Tensor] = None
+        self.cached_pass2_logits: Optional[torch.Tensor] = None
+        self.cache_backend: str = "cold"
+        # `goku` is the dynamically imported model/runtime module.
+        self.goku, self.model_py_path = import_model_py(model_py)
+        # Argument namespace in the shape the runtime module's loaders expect.
+        self.args = SimpleNamespace(
+            checkpoint=self.ckpt_path,
+            tokenizer=tokenizer_dir or "",
+            model_id="",
+            no_config_download=self.no_config_download,
+            local_files_only=self.local_files_only,
+        )
+        print("[guard] LULUV2 cockpit: no AutoModelForCausalLM.from_pretrained call and no external model weights loaded.")
+        print(f"[load] checkpoint={self.ckpt_path}")
+        self.base_ckpt, base = self.goku.load_lulu2_base(self.args, self.device, self.dtype)
+        self.tokenizer = self._load_tokenizer(tokenizer_dir)
+        self.model, self.has_pass2 = self._maybe_wrap_pass2(base)
+        # `base` is always the plain decoder, whether or not it is wrapped.
+        self.base = self.model.base if self.has_pass2 else self.model
+        self.n_layers = int(self.base.config.num_hidden_layers)
+        self.model.eval()
+        self.base.eval()
+        self.model_info = self._build_model_info()
+        self._compiled = False
+    def _select_device(self, device: Optional[str]):
+        if device:
+            return torch.device(device)
+        if torch.cuda.is_available():
+            return torch.device("cuda")
+        return torch.device("cpu")
+    def _dtype_from_name(self, name: str):
+        name = (name or "bf16").lower()
+        if name in {"bf16", "bfloat16"}:
+            return torch.bfloat16
+        if name in {"fp16", "float16", "half"}:
+            return torch.float16
+        return torch.float32
+    def _load_tokenizer(self, tokenizer_dir: Optional[str]):
+        """Load the tokenizer through the runtime module and make it chat-safe.
+        The tokenizer location is ``tokenizer_dir`` when given, otherwise a
+        ``tokenizer`` directory next to the checkpoint if one exists; in both
+        cases it is passed on via ``self.args.tokenizer``. After loading, the
+        pad token falls back to EOS when unset, truncation is switched to the
+        left side and ``model_max_length`` is raised to 10**9. Each tweak is
+        best-effort and silently skipped if the tokenizer rejects it.
+        """
+        if tokenizer_dir:
+            self.args.tokenizer = tokenizer_dir
+        else:
+            sibling = self.ckpt_dir / "tokenizer"
+            if sibling.is_dir():
+                self.args.tokenizer = str(sibling)
+        tok = self.goku.load_tokenizer(self.args, self.base_ckpt)
+        if getattr(tok, "pad_token_id", None) is None and getattr(tok, "eos_token_id", None) is not None:
+            try:
+                tok.pad_token = tok.eos_token
+            except Exception:
+                pass
+        # Long-prompt safety: for chat/RAG prompts, the latest user turn and final
+        # instruction are normally at the end. Right-side truncation silently drops
+        # exactly the part the model must answer, so force left truncation where the
+        # tokenizer supports it. encode() below also performs manual left truncation
+        # and records how many tokens were dropped.
+        try:
+            tok.truncation_side = "left"
+        except Exception:
+            pass
+        # Lift the tokenizer's own length cap; encode() enforces the context limit.
+        try:
+            tok.model_max_length = 10**9
+        except Exception:
+            pass
+        return tok
+    def _maybe_wrap_pass2(self, base):
+        """Wrap ``base`` in the two-pass model when the checkpoint carries pass2 weights.
+        Returns:
+            ``(model, has_pass2)``. Without ``pass2_state`` in the checkpoint, or
+            with ``force_base_only`` set, the base model itself is returned with
+            ``False``.
+        """
+        ckpt = self.base_ckpt
+        if self.force_base_only or "pass2_state" not in ckpt:
+            print("[pass2] no pass2_state loaded; running base LULUV2 forward")
+            return base.to(self.device).eval(), False
+        cfg_dict = dict(ckpt.get("pass2_config") or {})
+        Pass2Config = self.goku.Pass2Config
+        # Keep only keys that are real dataclass fields, so a checkpoint saved
+        # with extra config keys still constructs a Pass2Config.
+        fields = getattr(Pass2Config, "__dataclass_fields__", {})
+        pass2_cfg = Pass2Config(**{k: v for k, v in cfg_dict.items() if k in fields})
+        model = self.goku.Lulu2TwoPassForCausalLM(base, pass2_cfg)
+        # strict=False: mismatched keys are only counted and logged, not fatal.
+        missing, unexpected = model.load_state_dict(ckpt["pass2_state"], strict=False)
+        print(f"[pass2] loaded pass2_state missing={len(missing)} unexpected={len(unexpected)}")
+        model.to(device=self.device, dtype=self.dtype).eval()
+        return model, True
+    def _build_model_info(self) -> Dict[str, Any]:
+        total_params = sum(p.numel() for p in self.model.parameters())
+        c_codes = [(n, p.numel()) for n, p in self.model.named_parameters() if n.endswith(".c")]
+        gate_mean = None
+        adapter_gate_mean = None
+        if self.has_pass2:
+            with torch.no_grad():
+                gate_mean = float(torch.sigmoid(self.model.layer_gates.float()).mean().item())
+                vals = [float(torch.sigmoid(ad.gate.float()).item()) for ad in self.model.adapters]
+                adapter_gate_mean = sum(vals) / max(1, len(vals))
+        ckpt_size = Path(self.ckpt_path).stat().st_size if Path(self.ckpt_path).exists() else 0
+        cfg = getattr(self.base, "config", None)
+        return {
+            "checkpoint": self.ckpt_path,
+            "checkpoint_size": human_bytes(ckpt_size),
+            "model_py": self.model_py_path,
+            "device": str(self.device),
+            "dtype": str(self.dtype).replace("torch.", ""),
+            "has_pass2": self.has_pass2,
+            "total_params": total_params,
+            "vwm_c_modules": len(c_codes),
+            "vwm_c_params": sum(n for _, n in c_codes),
+            "pass2_layer_gate_mean": gate_mean,
+            "pass2_adapter_gate_mean": adapter_gate_mean,
+            "hidden_size": getattr(cfg, "hidden_size", None),
+            "layers": getattr(cfg, "num_hidden_layers", None),
+            "heads": getattr(cfg, "num_attention_heads", None),
+            "kv_heads": getattr(cfg, "num_key_value_heads", None),
+            "max_position_embeddings": getattr(cfg, "max_position_embeddings", None),
+        }
+    def amp_context(self):
+        if self.device.type == "cuda" and self.dtype in (torch.bfloat16, torch.float16):
+            return torch.autocast("cuda", dtype=self.dtype)
+        return nullcontext()
+    def build_chat_prompt(
+        self,
+        message: str,
+        history: Any,
+        system_prompt: str,
+        memory_notes: str = "",
+        history_turns: int = 4,
+        extra_context: str = "",
+    ) -> str:
+        history = normalize_history(history)
+        recent = history[-max(0, int(history_turns)) * 2:] if history_turns else []
+        system_chunks: List[str] = []
+        if system_prompt.strip():
+            system_chunks.append(system_prompt.strip())
+        if memory_notes.strip():
+            system_chunks.append("Useful memory notes:\n" + memory_notes.strip())
+        if extra_context.strip():
+            system_chunks.append("Relevant local context:\n" + extra_context.strip())
+        system = "\n\n".join(system_chunks)
+        messages: List[Dict[str, str]] = []
+        if system:
+            messages.append({"role": "system", "content": system})
+        messages.extend(recent)
+        messages.append({"role": "user", "content": clean_text(message)})
+        try:
+            return self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        except Exception:
+            parts: List[str] = []
+            if system:
+                parts.append(f"<|im_start|>system\n{system}<|im_end|>")
+            for item in recent:
+                parts.append(f"<|im_start|>{item['role']}\n{item['content']}<|im_end|>")
+            parts.append(f"<|im_start|>user\n{clean_text(message)}<|im_end|>")
+            parts.append("<|im_start|>assistant\n")
+            return "\n".join(parts)
+    def encode(self, text: str, max_context_tokens: int) -> torch.Tensor:
+        """Encode prompt with explicit left-truncation and accounting.
+        This avoids a common long-context failure mode: many tokenizers default to
+        right-side truncation, which keeps the beginning of a huge prompt and drops
+        the final user instruction. For chat, we almost always want the opposite.
+        Args:
+            text: Fully rendered prompt string.
+            max_context_tokens: Maximum number of tokens to keep (clamped to >= 1).
+        Returns:
+            The token ids, moved to ``self.device``; at most ``max_context_tokens``
+            columns, taken from the end of the prompt.
+        Side effects:
+            Updates the ``last_prompt_*`` counters. When tokens had to be dropped,
+            also discards both KV caches, the cached ids and the cached logits,
+            and sets ``cache_backend`` to ``"truncated-rebuild"``.
+        """
+        max_context = max(1, int(max_context_tokens))
+        try:
+            self.tokenizer.truncation_side = "left"
+        except Exception:
+            pass
+        # Tokenize without tokenizer-side truncation so we know exactly whether the
+        # prompt was clipped. The prompt already contains chat special tokens.
+        try:
+            enc = self.tokenizer(
+                text,
+                return_tensors="pt",
+                truncation=False,
+                add_special_tokens=False,
+            )
+        except TypeError:
+            # Tokenizer call signature without add_special_tokens support.
+            enc = self.tokenizer(text, return_tensors="pt", truncation=False)
+        ids = enc.input_ids
+        total = int(ids.shape[1])
+        dropped = max(0, total - max_context)
+        if dropped > 0:
+            ids = ids[:, -max_context:].contiguous()
+            # Do not reuse an older conversation cache after a hard context trim;
+            # the logical prefix changed and reuse can make long prompts feel like
+            # they are "forgetting" pieces.
+            self.pass1_cache = None
+            self.pass2_cache = None
+            self.cache_ids = None
+            self.cached_logits = None
+            self.cached_pass1_logits = None
+            self.cached_pass2_logits = None
+            self.cache_backend = "truncated-rebuild"
+        self.last_prompt_total_tokens = total
+        self.last_prompt_kept_tokens = int(ids.shape[1])
+        self.last_prompt_dropped_tokens = dropped
+        return ids.to(self.device)
+    def _position_ids(self, T: int, offset: int = 0) -> torch.Tensor:
+        return torch.arange(offset, offset + T, device=self.device, dtype=torch.long).unsqueeze(0)
+    def _attn_prefill(self, attn, hidden_states: torch.Tensor, position_ids: torch.Tensor, cache: KVLayerCache, max_context: int) -> torch.Tensor:
+        """Run causal self-attention over a whole prompt and seed the layer's KV cache.
+        Args:
+            attn: Attention module exposing q/k/v/o projections, ``rotary_emb``,
+                head counts, ``head_dim``, ``hidden_size`` and ``scaling``.
+            hidden_states: Layer input; indexed as (batch, seq, hidden).
+            position_ids: Positions handed to the rotary embedding.
+            cache: Per-layer cache that is overwritten with this prompt's K/V.
+            max_context: Maximum number of positions the cache may retain.
+        Returns:
+            The attention output after ``o_proj``, reshaped to
+            (batch, seq, attn.hidden_size).
+        """
+        bsz, q_len, _ = hidden_states.size()
+        query_states = attn.q_proj(hidden_states)
+        key_states = attn.k_proj(hidden_states)
+        value_states = attn.v_proj(hidden_states)
+        # Split heads: (batch, seq, heads, head_dim) -> (batch, heads, seq, head_dim).
+        query_states = query_states.view(bsz, q_len, attn.num_heads, attn.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, attn.num_key_value_heads, attn.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, attn.num_key_value_heads, attn.head_dim).transpose(1, 2)
+        cos, sin = attn.rotary_emb(value_states, position_ids)
+        query_states, key_states = self.goku.apply_rotary_pos_emb(query_states, key_states, cos, sin)
+        # K/V go through repeat_kv *before* caching, so the cache stores the
+        # repeated tensors (presumably one copy per query head -- confirm
+        # against the runtime module's repeat_kv).
+        key_states = self.goku.repeat_kv(key_states, attn.num_key_value_groups)
+        value_states = self.goku.repeat_kv(value_states, attn.num_key_value_groups)
+        cache.set(key_states, value_states, max_context)
+        # Attention uses the untrimmed K/V of this call; only the cache is capped.
+        attn_output = F.scaled_dot_product_attention(
+            query_states, key_states, value_states, attn_mask=None, dropout_p=0.0, is_causal=True, scale=attn.scaling
+        )
+        attn_output = attn_output.transpose(1, 2).contiguous().reshape(bsz, q_len, attn.hidden_size)
+        return attn.o_proj(attn_output)
+    def _attn_step(self, attn, hidden_states: torch.Tensor, pos: int, cache: KVLayerCache, max_context: int) -> torch.Tensor:
+        """Run self-attention for exactly one new token against the layer's KV cache.
+        Args:
+            attn: Attention module (same attributes as used by _attn_prefill).
+            hidden_states: Layer input with a sequence length of 1.
+            pos: Position index of the new token, used for the rotary embedding.
+            cache: Per-layer cache; the new K/V are appended to it.
+            max_context: Maximum number of positions the cache may retain.
+        Returns:
+            The attention output after ``o_proj``, shape (batch, 1, attn.hidden_size).
+        Raises:
+            AssertionError: if more than one token is passed.
+            RuntimeError: if the cache is still empty after the append.
+        """
+        bsz, q_len, _ = hidden_states.size()
+        assert q_len == 1
+        query_states = attn.q_proj(hidden_states)
+        key_states = attn.k_proj(hidden_states)
+        value_states = attn.v_proj(hidden_states)
+        # Split heads: (batch, 1, heads, head_dim) -> (batch, heads, 1, head_dim).
+        query_states = query_states.view(bsz, q_len, attn.num_heads, attn.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, attn.num_key_value_heads, attn.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, attn.num_key_value_heads, attn.head_dim).transpose(1, 2)
+        position_ids = self._position_ids(1, pos)
+        cos, sin = attn.rotary_emb(value_states, position_ids)
+        query_states, key_states = self.goku.apply_rotary_pos_emb(query_states, key_states, cos, sin)
+        key_states = self.goku.repeat_kv(key_states, attn.num_key_value_groups)
+        value_states = self.goku.repeat_kv(value_states, attn.num_key_value_groups)
+        # append() also trims the cache to the newest max_context positions.
+        cache.append(key_states, value_states, max_context)
+        if cache.k is None or cache.v is None:
+            raise RuntimeError("KV cache append failed")
+        # is_causal=False: the single query is the newest position, so it may
+        # attend to every cached key without a mask.
+        attn_output = F.scaled_dot_product_attention(
+            query_states, cache.k, cache.v, attn_mask=None, dropout_p=0.0, is_causal=False, scale=attn.scaling
+        )
+        attn_output = attn_output.transpose(1, 2).contiguous().reshape(bsz, q_len, attn.hidden_size)
+        return attn.o_proj(attn_output)
+    def _layer_prefill(self, layer, hidden_states: torch.Tensor, position_ids: torch.Tensor, cache: KVLayerCache, max_context: int) -> torch.Tensor:
+        residual = hidden_states
+        x = layer.input_layernorm(hidden_states)
+        x = self._attn_prefill(layer.self_attn, x, position_ids, cache, max_context)
+        hidden_states = residual + x
+        residual = hidden_states
+        x = layer.post_attention_layernorm(hidden_states)
+        x = layer.mlp(x)
+        return residual + x
+    def _layer_step(self, layer, hidden_states: torch.Tensor, pos: int, cache: KVLayerCache, max_context: int) -> torch.Tensor:
+        residual = hidden_states
+        x = layer.input_layernorm(hidden_states)
+        x = self._attn_step(layer.self_attn, x, pos, cache, max_context)
+        hidden_states = residual + x
+        residual = hidden_states
+        x = layer.post_attention_layernorm(hidden_states)
+        x = layer.mlp(x)
+        return residual + x
+    @torch.no_grad()
+    def _prefill_pass1(self, input_ids: torch.Tensor, max_context: int, use_pass_embed: bool) -> Tuple[torch.Tensor, List[torch.Tensor], torch.Tensor, torch.Tensor]:
+        """Run the base decoder over the whole prompt at once and build the pass1 KV cache.
+        Args:
+            input_ids: Prompt token ids; the size of dim 1 is the sequence length.
+            max_context: Maximum number of positions each layer cache may retain.
+            use_pass_embed: Add ``pass_embed[0]`` to the token embeddings (only
+                takes effect when the model has pass2).
+        Returns:
+            ``(h, layer_states, position_ids, logits)``: the final hidden state
+            before the output norm, the hidden state after every layer, the
+            positions used, and the logits computed from the normed final state.
+        Side effects:
+            Replaces ``self.pass1_cache`` with the freshly filled cache.
+        """
+        T = int(input_ids.shape[1])
+        position_ids = self._position_ids(T, 0)
+        cache = DecoderKVCache(self.n_layers)
+        h = self.base.model.embed_tokens(input_ids)
+        if use_pass_embed and self.has_pass2:
+            h = h + self.model.pass_embed[0].to(dtype=h.dtype, device=h.device).view(1, 1, -1)
+        # Per-layer outputs are retained because pass2's adapters read them.
+        layer_states: List[torch.Tensor] = []
+        for i, layer in enumerate(self.base.model.layers):
+            h = self._layer_prefill(layer, h, position_ids, cache.layers[i], max_context)
+            layer_states.append(h)
+        normed = self.base.model.norm(h)
+        logits = self.base.lm_head(normed)
+        self.pass1_cache = cache
+        return h, layer_states, position_ids, logits
+    @torch.no_grad()
+    def _prefill_pass2(self, h1_resid: torch.Tensor, pass1_states: List[torch.Tensor], position_ids: torch.Tensor, max_context: int) -> torch.Tensor:
+        """Run the second pass over the whole prompt and build the pass2 KV cache.
+        The same base layers are reused, starting from pass1's final hidden state
+        plus ``pass_embed[1]``. Per layer, the layer's own update is scaled by a
+        sigmoid gate and an adapter term computed from the current state and the
+        matching pass1 state is added.
+        Args:
+            h1_resid: Final pre-norm hidden state returned by pass1.
+            pass1_states: Pass1 hidden state after each layer (one per layer).
+            position_ids: Positions used for the rotary embedding.
+            max_context: Maximum number of positions each layer cache may retain.
+        Returns:
+            Logits computed from the normed pass2 hidden state.
+        Raises:
+            RuntimeError: if the loaded checkpoint has no pass2 weights.
+        Side effects:
+            Replaces ``self.pass2_cache`` with the freshly filled cache.
+        """
+        if not self.has_pass2:
+            raise RuntimeError("pass2 requested but checkpoint has no pass2_state")
+        cache = DecoderKVCache(self.n_layers)
+        h2 = h1_resid + self.model.pass_embed[1].to(dtype=h1_resid.dtype, device=h1_resid.device).view(1, 1, -1)
+        for i, layer in enumerate(self.base.model.layers):
+            before = h2
+            layer_out = self._layer_prefill(layer, h2, position_ids, cache.layers[i], max_context)
+            # Isolate what the layer added so it can be gated separately.
+            layer_delta = layer_out - before
+            gate = torch.sigmoid(self.model.layer_gates[i]).to(dtype=h2.dtype, device=h2.device)
+            # h2 still equals `before` here: the adapter sees the layer input.
+            adapter_delta = self.model.adapters[i](h2, pass1_states[i])
+            h2 = before + gate * layer_delta + adapter_delta
+        normed = self.base.model.norm(h2)
+        logits = self.base.lm_head(normed)
+        self.pass2_cache = cache
+        return logits
+    @torch.no_grad()
+    def _step_pass1(self, token_id: torch.Tensor, pos: int, max_context: int, use_pass_embed: bool) -> Tuple[torch.Tensor, List[torch.Tensor], torch.Tensor]:
+        """Advance pass1 by one token, appending to the pass1 KV cache.
+        Args:
+            token_id: Ids of the single new token (sequence length 1 is
+                enforced by the attention step).
+            pos: Position index of the new token.
+            max_context: Maximum number of positions each layer cache may retain.
+            use_pass_embed: Add ``pass_embed[0]`` to the embedding (only takes
+                effect when the model has pass2).
+        Returns:
+            ``(h, states, logits)``: final pre-norm hidden state, the hidden
+            state after every layer, and the logits for this token.
+        """
+        # Lazily create the cache so a step can also start a fresh sequence.
+        if self.pass1_cache is None:
+            self.pass1_cache = DecoderKVCache(self.n_layers)
+        h = self.base.model.embed_tokens(token_id)
+        if use_pass_embed and self.has_pass2:
+            h = h + self.model.pass_embed[0].to(dtype=h.dtype, device=h.device).view(1, 1, -1)
+        states: List[torch.Tensor] = []
+        for i, layer in enumerate(self.base.model.layers):
+            h = self._layer_step(layer, h, pos, self.pass1_cache.layers[i], max_context)
+            states.append(h)
+        logits = self.base.lm_head(self.base.model.norm(h))
+        return h, states, logits
+    @torch.no_grad()
+    def _step_pass2(self, h1_resid: torch.Tensor, pass1_states: List[torch.Tensor], pos: int, max_context: int) -> torch.Tensor:
+        """Advance pass2 by one token, appending to the pass2 KV cache.
+        Mirrors _prefill_pass2 for a single position: starts from pass1's final
+        hidden state plus ``pass_embed[1]``, then per layer adds the gated layer
+        update and the adapter term.
+        Args:
+            h1_resid: Final pre-norm hidden state of pass1 for this token.
+            pass1_states: Pass1 hidden state after each layer for this token.
+            pos: Position index of the token.
+            max_context: Maximum number of positions each layer cache may retain.
+        Returns:
+            Logits for this token from the normed pass2 hidden state.
+        Raises:
+            RuntimeError: if the loaded checkpoint has no pass2 weights.
+        """
+        if not self.has_pass2:
+            raise RuntimeError("pass2 step requested but unavailable")
+        if self.pass2_cache is None:
+            self.pass2_cache = DecoderKVCache(self.n_layers)
+        h2 = h1_resid + self.model.pass_embed[1].to(dtype=h1_resid.dtype, device=h1_resid.device).view(1, 1, -1)
+        for i, layer in enumerate(self.base.model.layers):
+            before = h2
+            layer_out = self._layer_step(layer, h2, pos, self.pass2_cache.layers[i], max_context)
+            # Isolate what the layer added so it can be gated separately.
+            layer_delta = layer_out - before
+            gate = torch.sigmoid(self.model.layer_gates[i]).to(dtype=h2.dtype, device=h2.device)
+            adapter_delta = self.model.adapters[i](h2, pass1_states[i])
+            h2 = before + gate * layer_delta + adapter_delta
+        return self.base.lm_head(self.base.model.norm(h2))
+    def _ids_prefix_len(self, old: torch.Tensor, new: torch.Tensor) -> int:
+        if old is None or old.numel() == 0 or new.numel() == 0:
+            return 0
+        old1 = old[0]
+        new1 = new[0]
+        max_n = min(int(old1.numel()), int(new1.numel()))
+        if max_n == 0:
+            return 0
+        # Fast path: old is exact prefix of new.
+        if int(old1.numel()) <= int(new1.numel()) and torch.equal(old1, new1[: old1.numel()]):
+            return int(old1.numel())
+        # Conservative fallback, scan from max down; prompts are usually exact-prefix or reset.
+        for n in range(max_n, 0, -1):
+            if torch.equal(old1[:n], new1[:n]):
+                return n
+        return 0
+    @torch.no_grad()
+    def _token_prefill_context(self, input_ids: torch.Tensor, cfg: GenerationConfig, use_pass2: bool, use_pass_embed: bool, max_context: int) -> None:
+        """
+        Conservative cache builder.
+        It fills the same pass1/pass2 KV caches by walking the prompt one token at a time.
+        This is slower than vectorized prefill but much safer across checkpoint/runtime variants,
+        and it still gives a valid decode cache + persistent cache for the generated tokens.
+        Both caches and all cached logits are reset first. After the loop the
+        cached logits belong to the last prompt token; for an empty prompt they
+        stay None. ``cfg`` is accepted but not read here.
+        """
+        self.pass1_cache = DecoderKVCache(self.n_layers)
+        self.pass2_cache = DecoderKVCache(self.n_layers) if use_pass2 else None
+        self.cached_logits = None
+        self.cached_pass1_logits = None
+        self.cached_pass2_logits = None
+        T = int(input_ids.shape[1])
+        for pos in range(T):
+            tok = input_ids[:, pos:pos + 1]
+            h1, states, logits1 = self._step_pass1(tok, pos, max_context, use_pass_embed=use_pass_embed)
+            if use_pass2:
+                # With pass2 active its logits are the ones used for decoding;
+                # pass1's are kept alongside.
+                logits2 = self._step_pass2(h1, states, pos, max_context)
+                self.cached_logits = logits2
+                self.cached_pass1_logits = logits1
+                self.cached_pass2_logits = logits2
+            else:
+                self.cached_logits = logits1
+                self.cached_pass1_logits = logits1
+                self.cached_pass2_logits = None
+    @torch.no_grad()
+    def _prepare_cached_context(self, input_ids: torch.Tensor, cfg: GenerationConfig) -> Tuple[torch.Tensor, bool, int, int, str]:
+        mode = self._effective_mode(cfg.mode)
+        max_context = int(cfg.max_context_tokens)
+        use_pass2 = mode in {"vwm", "deep"} and self.has_pass2
+        use_pass_embed = bool(use_pass2)
+        T = int(input_ids.shape[1])
+        if T > max_context:
+            input_ids = input_ids[:, -max_context:]
+            T = max_context
+        # If mode/context changed, persistent cache is invalid.
+        cache_ok = (
+            cfg.persistent_cache
+            and self.cache_ids is not None
+            and self.cache_mode == mode
+            and self.cache_max_context == max_context
+            and self.pass1_cache is not None
+        )
+        prefix = self._ids_prefix_len(self.cache_ids, input_ids) if cache_ok else 0
+        cache_hit = bool(cache_ok and prefix == int(self.cache_ids.shape[1]) and prefix <= T and prefix > 0)
+        t0 = time.time()
+        if cache_hit:
+            # Process only suffix between prior cached prompt and new prompt.
+            suffix = input_ids[:, prefix:]
+            for j in range(int(suffix.shape[1])):
+                tok = suffix[:, j : j + 1]
+                pos = prefix + j
+                h1, states, logits1 = self._step_pass1(tok, pos, max_context, use_pass_embed=use_pass_embed)
+                if use_pass2:
+                    logits2 = self._step_pass2(h1, states, pos, max_context)
+                    self.cached_logits = logits2
+                    self.cached_pass1_logits = logits1
+                    self.cached_pass2_logits = logits2
+                else:
+                    self.cached_logits = logits1
+                    self.cached_pass1_logits = logits1
+                    self.cached_pass2_logits = None
+            self.cache_ids = input_ids.detach().clone()
+            self.cache_backend = "persistent-kv-suffix" if suffix.numel() else "persistent-kv-hit"
+            return input_ids, True, prefix, int(suffix.shape[1]), self.cache_backend
+        # Reset and prefill. Prefer vectorized prefill, but fall back to conservative
+        # token prefill if the runtime variant does not support our vectorized cache path.
+        self.pass1_cache = None
+        self.pass2_cache = None
+        backend = "vectorized-prefill"
+        if bool(cfg.vectorized_prefill):
+            try:
+                h1, states, pos_ids, logits1 = self._prefill_pass1(input_ids, max_context, use_pass_embed=use_pass_embed)
+                if use_pass2:
+                    logits2 = self._prefill_pass2(h1, states, pos_ids, max_context)
+                    self.cached_logits = logits2
+                    self.cached_pass1_logits = logits1
+                    self.cached_pass2_logits = logits2
+                else:
+                    self.cached_logits = logits1
+                    self.cached_pass1_logits = logits1
+                    self.cached_pass2_logits = None
+            except Exception as exc:
+                if os.getenv("LULUV2_CACHE_DEBUG", "0").strip().lower() in {"1", "true", "yes", "on"}:
+                    print("[cache] vectorized prefill failed; using token-prefill cache.")
+                    traceback.print_exc()
+                self._token_prefill_context(input_ids, cfg, use_pass2=use_pass2, use_pass_embed=use_pass_embed, max_context=max_context)
+                backend = "token-prefill-cache"
+        else:
+            self._token_prefill_context(input_ids, cfg, use_pass2=use_pass2, use_pass_embed=use_pass_embed, max_context=max_context)
+            backend = "token-prefill-cache"
+        self.cache_ids = input_ids.detach().clone()
+        self.cache_mode = mode
+        self.cache_max_context = max_context
+        self.cache_backend = backend
+        return input_ids, False, 0, T, self.cache_backend
+    def _effective_mode(self, mode: str) -> str:
+        mode = (mode or "vwm").lower()
+        if mode in {"fast", "base", "pass1"}:
+            return "fast"
+        if mode in {"deep", "32k", "long"}:
+            return "deep"
+        if mode in {"slow", "full"}:
+            return "slow"
+        return "vwm"
+    @torch.no_grad()
+    def pass_metrics_from_logits(self, logits1: Optional[torch.Tensor], logits2: Optional[torch.Tensor]) -> Tuple[Optional[float], Optional[float]]:
+        if logits1 is None or logits2 is None:
+            return None, None
+        try:
+            l1 = logits1[:, -1, :].float()
+            l2 = logits2[:, -1, :].float()
+            kl = F.kl_div(F.log_softmax(l2, dim=-1), F.softmax(l1, dim=-1), reduction="batchmean")
+            cos = F.cosine_similarity(l1, l2, dim=-1).mean()
+            return float(kl.item()), float(cos.item())
+        except Exception:
+            return None, None
+    def _apply_penalties(self, logits: torch.Tensor, generated: torch.Tensor, cfg: GenerationConfig) -> torch.Tensor:
+        if generated.numel() == 0:
+            return logits
+        out = logits.clone()
+        uniq, counts = torch.unique(generated.view(-1), return_counts=True)
+        if cfg.repetition_penalty != 1.0:
+            selected = out[:, uniq]
+            selected = torch.where(selected > 0, selected / float(cfg.repetition_penalty), selected * float(cfg.repetition_penalty))
+            out[:, uniq] = selected
+        if cfg.frequency_penalty:
+            out[:, uniq] -= float(cfg.frequency_penalty) * counts.to(out.dtype).unsqueeze(0)
+        n = int(cfg.no_repeat_ngram)
+        if n > 1 and generated.size(1) >= n - 1:
+            seq = generated[0].tolist()
+            prefix = tuple(seq[-(n - 1):])
+            banned = []
+            for i in range(len(seq) - n + 1):
+                if tuple(seq[i:i + n - 1]) == prefix:
+                    banned.append(seq[i + n - 1])
+            if banned:
+                out[:, list(set(banned))] = -float("inf")
+        return out
+    @torch.no_grad()
+    def _sample_next(self, logits: torch.Tensor, generated: torch.Tensor, cfg: GenerationConfig) -> Tuple[torch.Tensor, Dict[str, float]]:
+        work = self._apply_penalties(logits.float(), generated, cfg)
+        if cfg.greedy or cfg.temperature <= 0:
+            probs = torch.softmax(work, dim=-1)
+            next_id = torch.argmax(work, dim=-1, keepdim=True)
+        else:
+            work = work / max(float(cfg.temperature), 1e-6)
+            if cfg.top_k > 0:
+                k = min(int(cfg.top_k), work.size(-1))
+                thresh = torch.topk(work, k, dim=-1).values[..., -1, None]
+                work = torch.where(work >= thresh, work, torch.full_like(work, -float("inf")))
+            if 0.0 < cfg.top_p < 1.0:
+                sorted_logits, sorted_idx = torch.sort(work, descending=True, dim=-1)
+                sorted_probs = torch.softmax(sorted_logits, dim=-1)
+                cumprobs = torch.cumsum(sorted_probs, dim=-1)
+                remove = cumprobs > float(cfg.top_p)
+                shifted = remove.clone()
+                shifted[..., 1:] = remove[..., :-1]
+                shifted[..., 0] = False
+                sorted_logits = sorted_logits.masked_fill(shifted, -float("inf"))
+                work = torch.full_like(work, -float("inf")).scatter(1, sorted_idx, sorted_logits)
+            if 0.0 < cfg.min_p < 1.0:
+                probs_for_minp = torch.softmax(work, dim=-1)
+                max_prob = probs_for_minp.max(dim=-1, keepdim=True).values
+                keep = probs_for_minp >= float(cfg.min_p) * max_prob
+                work = work.masked_fill(~keep, -float("inf"))
+            probs = torch.softmax(work, dim=-1)
+            if torch.isnan(probs).any() or not torch.isfinite(probs.sum()) or float(probs.sum()) <= 0:
+                next_id = torch.argmax(logits, dim=-1, keepdim=True)
+                probs = torch.softmax(logits.float(), dim=-1)
+            else:
+                next_id = torch.multinomial(probs, 1)
+        prob = float(probs.gather(1, next_id).item()) if probs.numel() else 0.0
+        entropy = float((-(probs * torch.log(probs.clamp_min(1e-12))).sum(dim=-1)).mean().item()) if probs.numel() else 0.0
+        return next_id, {"prob": prob, "entropy": entropy}
+    @torch.no_grad()
+    def _slow_generate(self, ids: torch.Tensor, prompt_len: int, cfg: GenerationConfig) -> Generator[str, None, None]:
+        # Compatibility path: full prefix recompute every token.
+        eos_id = getattr(self.tokenizer, "eos_token_id", None)
+        last_text = ""
+        t0 = time.time()
+        for step in range(int(cfg.max_new_tokens)):
+            ctx = ids[:, -int(cfg.max_context_tokens):]
+            with self.amp_context():
+                out = self.model(ctx) if self._effective_mode(cfg.mode) != "fast" else self.base(ctx)
+                logits = out.logits[:, -1, :].float()
+            generated = ids[:, prompt_len:]
+            next_id, tok_stats = self._sample_next(logits, generated, cfg)
+            ids = torch.cat([ids, next_id.to(ids.device)], dim=-1)
+            token_id = int(next_id.item())
+            token_text = self.tokenizer.decode([token_id], skip_special_tokens=False)
+            self._record_token(step + 1, token_id, token_text, tok_stats)
+            if eos_id is not None and token_id == int(eos_id):
+                break
+            if (step + 1) % int(cfg.stream_every) == 0 or step == 0:
+                raw = self.tokenizer.decode(ids[0, prompt_len:], skip_special_tokens=True)
+                if any(s in raw for s in STOP_STRINGS):
+                    break
+                text = clean_text(raw)
+                if text and text != last_text:
+                    elapsed = time.time() - t0
+                    gen = int(ids.shape[1]) - prompt_len
+                    self.last_stats = GenerationStats(prompt_tokens=prompt_len, prompt_total_tokens=self.last_prompt_total_tokens, prompt_kept_tokens=self.last_prompt_kept_tokens, prompt_dropped_tokens=self.last_prompt_dropped_tokens, generated_tokens=gen, elapsed_sec=elapsed, tokens_per_sec=gen / max(elapsed, 1e-9), mode=cfg.mode, backend="slow-full-prefix", last_token=token_text, last_token_id=token_id, last_token_prob=tok_stats["prob"], last_entropy=tok_stats["entropy"], finish_reason="streaming")
+                    last_text = text
+                    yield text
+        final = clean_text(self.tokenizer.decode(ids[0, prompt_len:], skip_special_tokens=True))
+        if final:
+            yield final
+    def _record_token(self, i: int, token_id: int, token_text: str, tok_stats: Dict[str, float]) -> None:
+        self.recent_tokens.append({"i": i, "id": token_id, "text": token_text, "prob": tok_stats.get("prob", 0.0), "entropy": tok_stats.get("entropy", 0.0)})
+        self.recent_tokens = self.recent_tokens[-64:]
+    @torch.no_grad()
+    def generate(self, prompt: str, cfg: GenerationConfig) -> Generator[str, None, None]:
+        self.model.eval()
+        self.base.eval()
+        self.recent_tokens = []
+        mode = self._effective_mode(cfg.mode)
+        if mode == "deep":
+            cfg.max_context_tokens = max(int(cfg.max_context_tokens), 16384)
+        ids = self.encode(prompt, max_context_tokens=int(cfg.max_context_tokens))
+        prompt_len = int(ids.shape[1])
+        if self.last_prompt_dropped_tokens > 0:
+            print(f"[context] prompt clipped: kept={self.last_prompt_kept_tokens} total={self.last_prompt_total_tokens} dropped={self.last_prompt_dropped_tokens}")
+        t_start = time.time()
+        prefill_sec = 0.0
+        cache_hit = False
+        reused = 0
+        new_prefill = prompt_len
+        backend = ""
+        pass_kl = None
+        pass_cos = None
+        if (not cfg.use_cache) or mode == "slow":
+            yield from self._slow_generate(ids, prompt_len, cfg)
+            return
+        try:
+            with self.amp_context():
+                t_pref = time.time()
+                ids, cache_hit, reused, new_prefill, backend = self._prepare_cached_context(ids, cfg)
+                prefill_sec = time.time() - t_pref
+                pass_kl, pass_cos = self.pass_metrics_from_logits(self.cached_pass1_logits, self.cached_pass2_logits) if cfg.return_pass_metrics else (None, None)
+        except Exception as exc:
+            print(f"[cache] cached path failed; falling back to slow full-prefix: {type(exc).__name__}: {exc}")
+            if os.getenv("LULUV2_CACHE_DEBUG", "0").strip().lower() in {"1", "true", "yes", "on"}:
+                traceback.print_exc()
+            self.pass1_cache = None
+            self.pass2_cache = None
+            self.cache_ids = None
+            yield from self._slow_generate(ids, prompt_len, cfg)
+            return
+        eos_id = getattr(self.tokenizer, "eos_token_id", None)
+        last_text = ""
+        finish_reason = "length"
+        use_pass2 = mode in {"vwm", "deep"} and self.has_pass2
+        use_pass_embed = bool(use_pass2)
+        for step in range(int(cfg.max_new_tokens)):
+            logits = self.cached_logits[:, -1, :].float() if self.cached_logits is not None and self.cached_logits.dim() == 3 else self.cached_logits.float()
+            generated = ids[:, prompt_len:]
+            next_id, tok_stats = self._sample_next(logits, generated, cfg)
+            token_id = int(next_id.item())
+            token_text = self.tokenizer.decode([token_id], skip_special_tokens=False)
+            self._record_token(step + 1, token_id, token_text, tok_stats)
+            ids = torch.cat([ids, next_id.to(ids.device)], dim=-1)
+            if eos_id is not None and token_id == int(eos_id):
+                finish_reason = "eos"
+                break
+            pos = int(ids.shape[1]) - 1
+            try:
+                with self.amp_context():
+                    h1, states, logits1 = self._step_pass1(next_id.to(self.device), pos, int(cfg.max_context_tokens), use_pass_embed=use_pass_embed)
+                    if use_pass2:
+                        logits2 = self._step_pass2(h1, states, pos, int(cfg.max_context_tokens))
+                        self.cached_logits = logits2
+                        self.cached_pass1_logits = logits1
+                        self.cached_pass2_logits = logits2
+                    else:
+                        self.cached_logits = logits1
+                        self.cached_pass1_logits = logits1
+                        self.cached_pass2_logits = None
+                if self.cache_ids is not None:
+                    self.cache_ids = torch.cat([self.cache_ids, next_id.detach().to(self.cache_ids.device)], dim=-1)
+                    if self.cache_ids.shape[1] > int(cfg.max_context_tokens):
+                        self.cache_ids = self.cache_ids[:, -int(cfg.max_context_tokens):]
+            except Exception as exc:
+                print(f"[decode-cache] step failed; falling back for this request: {type(exc).__name__}: {exc}")
+                # Finish with slow path from current ids; do not pretend cache is valid.
+                self.cache_ids = None
+                yield from self._slow_generate(ids, prompt_len, cfg)
+                return
+            if (step + 1) % int(cfg.stream_every) == 0 or step == 0:
+                raw = self.tokenizer.decode(ids[0, prompt_len:], skip_special_tokens=True)
+                if any(s in raw for s in STOP_STRINGS):
+                    finish_reason = "stop_string"
+                    break
+                text = clean_text(raw)
+                if text and text != last_text:
+                    elapsed = time.time() - t_start
+                    gen = int(ids.shape[1]) - prompt_len
+                    self.last_stats = GenerationStats(
+                        prompt_tokens=prompt_len,
+                        prompt_total_tokens=self.last_prompt_total_tokens,
+                        prompt_kept_tokens=self.last_prompt_kept_tokens,
+                        prompt_dropped_tokens=self.last_prompt_dropped_tokens,
+                        generated_tokens=gen,
+                        elapsed_sec=elapsed,
+                        tokens_per_sec=gen / max(elapsed - prefill_sec, 1e-9),
+                        prefill_sec=prefill_sec,
+                        prefill_tps=(new_prefill / max(prefill_sec, 1e-9)),
+                        cache_hit=cache_hit,
+                        cache_reused_tokens=reused,
+                        cache_new_prefill_tokens=new_prefill,
+                        mode=mode,
+                        backend=backend,
+                        last_token=token_text,
+                        last_token_id=token_id,
+                        last_token_prob=tok_stats["prob"],
+                        last_entropy=tok_stats["entropy"],
+                        finish_reason="streaming",
+                        pass1_pass2_kl=pass_kl,
+                        pass1_pass2_logit_cosine=pass_cos,
+                    )
+                    last_text = text
+                    yield text
+        raw = self.tokenizer.decode(ids[0, prompt_len:], skip_special_tokens=True)
+        final = clean_text(raw)
+        elapsed = time.time() - t_start
+        gen = int(ids.shape[1]) - prompt_len
+        self.last_stats = GenerationStats(
+            prompt_tokens=prompt_len,
+            prompt_total_tokens=self.last_prompt_total_tokens,
+            prompt_kept_tokens=self.last_prompt_kept_tokens,
+            prompt_dropped_tokens=self.last_prompt_dropped_tokens,
+            generated_tokens=gen,
+            elapsed_sec=elapsed,
+            tokens_per_sec=gen / max(elapsed - prefill_sec, 1e-9),
+            prefill_sec=prefill_sec,
+            prefill_tps=(new_prefill / max(prefill_sec, 1e-9)),
+            cache_hit=cache_hit,
+            cache_reused_tokens=reused,
+            cache_new_prefill_tokens=new_prefill,
+            mode=mode,
+            backend=backend,
+            last_token=self.recent_tokens[-1]["text"] if self.recent_tokens else "",
+            last_token_id=self.recent_tokens[-1]["id"] if self.recent_tokens else -1,
+            last_token_prob=self.recent_tokens[-1]["prob"] if self.recent_tokens else 0.0,
+            last_entropy=self.recent_tokens[-1]["entropy"] if self.recent_tokens else 0.0,
+            finish_reason=finish_reason,
+            pass1_pass2_kl=pass_kl,
+            pass1_pass2_logit_cosine=pass_cos,
+        )
+        if final:
+            yield final
+    def clear_session_cache(self) -> None:
+        self.pass1_cache = None
+        self.pass2_cache = None
+        self.cache_ids = None
+        self.cached_logits = None
+        self.cached_pass1_logits = None
+        self.cached_pass2_logits = None
+        self.cache_backend = "cleared"
+    def stats_dict(self) -> Dict[str, Any]:
+        return {"generation": asdict(self.last_stats), "model": self.model_info, "system": system_snapshot(self)}
+    def stats_text(self) -> str:
+        s = self.last_stats
+        lines = [
+            f"Mode: {s.mode} | backend={s.backend}",
+            f"Prompt tokens: {s.prompt_tokens} kept / {getattr(s, 'prompt_total_tokens', s.prompt_tokens)} total / {getattr(s, 'prompt_dropped_tokens', 0)} dropped",
+            f"Generated tokens: {s.generated_tokens}",
+            f"Elapsed: {s.elapsed_sec:.2f}s | prefill={s.prefill_sec:.2f}s ({s.prefill_tps:.1f} tok/s)",
+            f"Decode speed: {s.tokens_per_sec:.2f} tok/s",
+            f"Cache: hit={s.cache_hit} reused={s.cache_reused_tokens} new_prefill={s.cache_new_prefill_tokens}",
+            f"Finish reason: {s.finish_reason}",
+            f"Last token: {s.last_token!r} id={s.last_token_id} p={s.last_token_prob:.4f} H={s.last_entropy:.2f}",
+        ]
+        if s.pass1_pass2_kl is not None:
+            lines.append(f"Pass1→Pass2 KL: {s.pass1_pass2_kl:.6f}")
+        if s.pass1_pass2_logit_cosine is not None:
+            lines.append(f"Pass1/Pass2 cosine: {s.pass1_pass2_logit_cosine:.6f}")
+        lines.extend([
+            "",
+            f"Checkpoint: {self.model_info['checkpoint']}",
+            f"Checkpoint size: {self.model_info['checkpoint_size']}",
+            f"Device: {self.model_info['device']} dtype={self.model_info['dtype']}",
+            f"Pass2 active: {self.model_info['has_pass2']}",
+            f"Params: {self.model_info['total_params']:,}",
+            f"VWM c modules: {self.model_info['vwm_c_modules']} ({self.model_info['vwm_c_params']:,} c params)",
+        ])
+        return "\n".join(lines)
+    def token_trace_text(self) -> str:
+        if not self.recent_tokens:
+            return "No tokens generated yet."
+        rows = []
+        for t in self.recent_tokens[-48:]:
+            safe = repr(t["text"])[1:-1]
+            rows.append(f"{t['i']:04d}  id={t['id']:<7}  p={t['prob']:.4f}  H={t['entropy']:.2f}  {safe}")
+        return "\n".join(rows)
+def system_snapshot(engine: Optional[LULUV2OptimizedEngine] = None) -> Dict[str, Any]:
+    snap: Dict[str, Any] = {
+        "python_ram": "n/a", "system_ram": "n/a", "system_ram_percent": 0.0,
+        "cpu_percent": 0.0, "gpu_name": "CUDA unavailable", "vram_allocated": "n/a",
+        "vram_reserved": "n/a", "vram_used": "n/a", "vram_total": "n/a",
+        "vram_percent": 0.0, "gpu_util_percent": None, "gpu_temp_c": None,
+    }
+    if psutil is not None:
+        try:
+            proc = psutil.Process(os.getpid())
+            vm = psutil.virtual_memory()
+            snap.update({
+                "python_ram": human_bytes(proc.memory_info().rss),
+                "system_ram": f"{human_bytes(vm.used)} / {human_bytes(vm.total)}",
+                "system_ram_percent": float(vm.percent),
+                "cpu_percent": float(psutil.cpu_percent(interval=0.0)),
+            })
+        except Exception:
+            pass
+    if torch.cuda.is_available():
+        try:
+            idx = torch.cuda.current_device()
+            props = torch.cuda.get_device_properties(idx)
+            allocated = int(torch.cuda.memory_allocated(idx))
+            reserved = int(torch.cuda.memory_reserved(idx))
+            total = int(props.total_memory)
+            snap.update({
+                "gpu_name": props.name,
+                "vram_allocated": human_bytes(allocated),
+                "vram_reserved": human_bytes(reserved),
+                "vram_used": human_bytes(allocated),
+                "vram_total": human_bytes(total),
+                "vram_percent": 100.0 * allocated / max(total, 1),
+            })
+            if pynvml is not None:
+                try:
+                    pynvml.nvmlInit()
+                    handle = pynvml.nvmlDeviceGetHandleByIndex(idx)
+                    util = pynvml.nvmlDeviceGetUtilizationRates(handle)
+                    mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
+                    temp = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
+                    snap.update({
+                        "gpu_util_percent": int(util.gpu),
+                        "vram_used": human_bytes(int(mem.used)),
+                        "vram_total": human_bytes(int(mem.total)),
+                        "vram_percent": 100.0 * float(mem.used) / max(float(mem.total), 1.0),
+                        "gpu_temp_c": int(temp),
+                    })
+                except Exception:
+                    pass
+        except Exception:
+            pass
+    return snap
+def system_usage(engine: Optional[LULUV2OptimizedEngine] = None) -> str:
+    snap = system_snapshot(engine)
+    lines = [
+        f"OS: {platform.system()} {platform.release()}",
+        f"Python RAM: {snap['python_ram']}",
+        f"System RAM: {snap['system_ram']} ({snap['system_ram_percent']:.1f}%)",
+        f"CPU: {snap['cpu_percent']:.1f}%",
+        "",
+        f"GPU: {snap['gpu_name']}",
+        f"VRAM used: {snap['vram_used']} / {snap['vram_total']} ({snap['vram_percent']:.1f}%)",
+        f"VRAM allocated: {snap['vram_allocated']}",
+        f"VRAM reserved: {snap['vram_reserved']}",
+    ]
+    if snap.get("gpu_util_percent") is not None:
+        lines.append(f"GPU util: {snap['gpu_util_percent']}%")
+    if snap.get("gpu_temp_c") is not None:
+        lines.append(f"GPU temp: {snap['gpu_temp_c']} C")
+    if engine is not None:
+        lines.extend(["", engine.stats_text()])
+    return "\n".join(lines)

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+torch>=2.1
+tokenizers>=0.15
+transformers>=4.40
+gradio>=4.0
+psutil>=5.9
+nvidia-ml-py>=12.0; platform_system != "Darwin"

run_chat.ps1 ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ $ErrorActionPreference = "Stop"
2	+ python .\app.py --ckpt .\LULUV2-bf16.pt --model-py .\luluv2_inference_runtime.py --tokenizer-dir .\tokenizer --inbrowser

run_chat.sh ADDED Viewed

	@@ -0,0 +1,3 @@

+#!/usr/bin/env bash
+set -euo pipefail
+python ./app.py --ckpt ./LULUV2-bf16.pt --model-py ./luluv2_inference_runtime.py --tokenizer-dir ./tokenizer --inbrowser

run_inference.py ADDED Viewed

	@@ -0,0 +1,46 @@

+#!/usr/bin/env python3
+"""Small CLI launcher for LULUV2 native-bf16 local inference."""
+from __future__ import annotations
+import argparse
+import torch
+from luluv2_live_inference import LULUV2LiveEngine, GenerationConfig
+def main():
+    p = argparse.ArgumentParser("LULUV2 local inference")
+    p.add_argument("--ckpt", default="LULUV2-bf16.pt", help="Path to the native-bf16 checkpoint file")
+    p.add_argument("--tokenizer-dir", default="tokenizer", help="Local tokenizer directory")
+    p.add_argument("--prompt", required=True, help="User prompt")
+    p.add_argument("--system", default="You are LuluV2, a helpful local AI assistant.")
+    p.add_argument("--max-new-tokens", type=int, default=512)
+    p.add_argument("--temperature", type=float, default=0.65)
+    p.add_argument("--top-p", type=float, default=0.90)
+    p.add_argument("--top-k", type=int, default=40)
+    p.add_argument("--device", default=None)
+    p.add_argument("--dtype", default="bf16", choices=["bf16", "fp16", "fp32"])
+    args = p.parse_args()
+    engine = LULUV2LiveEngine(
+        ckpt_path=args.ckpt,
+        model_py="luluv2_inference_runtime.py",
+        tokenizer_dir=args.tokenizer_dir,
+        device=args.device,
+        dtype=args.dtype,
+        local_files_only=True,
+        no_config_download=True,
+    )
+    cfg = GenerationConfig(
+        max_new_tokens=args.max_new_tokens,
+        temperature=args.temperature,
+        top_p=args.top_p,
+        top_k=args.top_k,
+    )
+    history = []
+    for text in engine.generate_stream(args.prompt, history, args.system, cfg):
+        print(text, end="", flush=True)
+    print()
+if __name__ == "__main__":
+    # Inference only: switch autograd off globally before the engine is built in main().
+    torch.set_grad_enabled(False)
+    main()