"""OUROBOROS Kernel Mint: a GPU-kernel demo (Gradio + custom JS).

Compose a fused operation, and a 1B model writes a real Triton kernel for it while an
immutable referee checks correctness against PyTorch and times it against torch.compile. The
whole interactive surface is a custom JS widget in one gr.HTML; the mint runs through Python
(no CORS) and hits the 1B model (default) or the 27B (Pro mode).
"""
from __future__ import annotations

import base64
import json
import os
import re
import sys
import time

import gradio as gr
import requests

# `spaces.GPU` only matters on ZeroGPU, where the GPU is granted lazily per call. On a dedicated
# GPU Space (e.g. L4) or locally the GPU is always attached, so the decorator must be a plain
# pass-through — otherwise it can fight the always-present CUDA context. Gate on the env var HF sets
# only on ZeroGPU hardware, so the same app.py runs unchanged on ZeroGPU, a paid GPU, and a laptop.
try:
    # With SPACES_ZERO_GPU unset, jump straight to the except branch so the no-op shim is
    # installed without importing the real `spaces` package.
    if not os.environ.get("SPACES_ZERO_GPU"):
        raise ImportError("not on ZeroGPU: use the no-op GPU shim")
    import spaces
except Exception:
    # Reached both when the env var is unset and when importing `spaces` itself fails.
    class _SpacesShim:
        """Stand-in for the `spaces` module whose `GPU` decorator returns functions unchanged."""

        @staticmethod
        def GPU(*_args, **_kwargs):
            # Accepts and ignores any decorator arguments (e.g. duration=...).
            def _decorator(fn):
                return fn
            return _decorator

    spaces = _SpacesShim()

# Hosted mint endpoints: the 1B backend (default) and the 27B backend (Pro mode). Both can be
# overridden through the environment.
ONE_B = os.environ.get("BACKEND_URL", "https://ymrohit--ouroboros-kernel-mint-mint-mint.modal.run")
PRO = os.environ.get("BACKEND_PRO_URL", "https://ymrohit--ouroboros-kernel-mint-pro-mint-mint.modal.run")
# Paths resolved relative to this file: the referee directory and the bundled seed kernels.
ROOT = os.path.dirname(__file__)
REFEREE = os.path.join(ROOT, "referee")
SEED_DIR = os.path.join(ROOT, "seed_kernels")
# Hugging Face repos tried for the local GGUF, in order: fine-tuned first, base as fallback.
LOCAL_FINE_TUNED_REPO = os.environ.get(
    "LOCAL_GGUF_REPO", "YMRohit/ouroboros-kernelsmith-minicpm5-1b-GGUF")
LOCAL_BASE_REPO = os.environ.get("LOCAL_GGUF_FALLBACK_REPO", "openbmb/MiniCPM5-1B-GGUF")
# Quantization tags in preference order; blank entries in the comma-separated list are dropped.
LOCAL_QUANT_PREFS = tuple(q.strip() for q in os.environ.get(
    "LOCAL_GGUF_QUANTS", "Q5_K_M,Q6_K,Q4_K_M,Q8_0,F16,BF16").split(",") if q.strip())
# Upper bound on drafts per local mint; local_mint clamps the requested k to this value.
LOCAL_MAX_ATTEMPTS = int(os.environ.get("LOCAL_MAX_ATTEMPTS", "2"))
# Process-wide llama.cpp model cache and the GGUF path last resolved by _get_local_llm.
_LOCAL_LLM = None
_LOCAL_LLM_PATH = None


def _ensure_referee_path():
    """Put the referee directory at the front of ``sys.path`` unless it is already listed."""
    if REFEREE in sys.path:
        return
    sys.path.insert(0, REFEREE)


# System prompt for the local model; _local_prompt_messages sends it as the first message.
LOCAL_SYS = ("You are an expert GPU kernel engineer. Write a single correct, fast Triton "
             "kernel. Output ONLY one fenced python code block defining `run(*inputs)` and "
             "its @triton.jit kernel. Accumulate reductions in float32. No prose.")

# The model was trained (rl_kernelsmith.py Proposer.prompt) with a real per-op SEED kernel as the
# style guide, and for almost every op that exemplar is rmsnorm's ROW-WISE reduction. We must hand
# the model that SAME exemplar at inference, or it goes off-distribution and writes flat elementwise
# code for reduction ops (wrong results). Seeds are bundled in ./seed_kernels; this inline copy is
# only a fallback if a file is missing. It is returned by _load_seed_kernel when the seed file
# cannot be read, and is prompt text only: nothing in this module executes it.
_FALLBACK_EXEMPLAR = """# GOLD seed kernel: fused RMSNorm, one row per program, fp32 accumulation.
@triton.jit
def _rmsnorm_kernel(x_ptr, w_ptr, y_ptr, stride, N, eps, BLOCK: tl.constexpr):
    row = tl.program_id(0)
    x_ptr += row * stride
    y_ptr += row * stride
    acc = tl.zeros([BLOCK], dtype=tl.float32)
    for off in range(0, N, BLOCK):
        cols = off + tl.arange(0, BLOCK)
        x = tl.load(x_ptr + cols, mask=cols < N, other=0.0).to(tl.float32)
        acc += x * x
    rms = tl.rsqrt(tl.sum(acc) / N + eps)
    for off in range(0, N, BLOCK):
        cols = off + tl.arange(0, BLOCK)
        mask = cols < N
        x = tl.load(x_ptr + cols, mask=mask, other=0.0).to(tl.float32)
        w = tl.load(w_ptr + cols, mask=mask, other=0.0).to(tl.float32)
        tl.store(y_ptr + cols, (x * rms * w), mask=mask)


def run(x, w):
    M, N = x.shape
    y = torch.empty_like(x)
    _rmsnorm_kernel[(M,)](x, w, y, x.stride(0), N, 1e-6, BLOCK=1024)
    return y
"""


def _load_seed_kernel(op: str) -> str:
    """Return the bundled seed-kernel source for ``op``.

    Reads ``<SEED_DIR>/<op>.py`` as UTF-8 rather than the locale's default encoding, so a seed
    file containing non-ASCII characters decodes the same way on every host. When the file
    cannot be opened or read (``OSError``), the inline ``_FALLBACK_EXEMPLAR`` is returned instead.
    """
    seed_path = os.path.join(SEED_DIR, f"{op}.py")
    try:
        with open(seed_path, encoding="utf-8") as f:
            return f.read()
    except OSError:
        return _FALLBACK_EXEMPLAR


def extract_kernel(text: str) -> str:
    """Return the kernel source embedded in a llama.cpp completion.

    The first fenced block (optionally tagged ``python``) is used when one is present;
    otherwise the whole text is used. Anything before the earliest of the markers
    ``@triton``, ``import ``, ``def run`` or ``def _`` is dropped, and the result is stripped.
    ``None`` or empty input yields an empty string.
    """
    raw = text or ""
    fenced = re.search(r"```(?:python)?\s*(.*?)```", raw, re.S)
    body = fenced.group(1) if fenced else raw

    positions = []
    for marker in ("@triton", "import ", "def run", "def _"):
        pos = body.find(marker)
        if pos >= 0:
            positions.append(pos)
    # No marker at all: keep the body from its beginning.
    start = min(positions) if positions else 0
    return body[start:].strip()


def _local_op_from_recipe(r: dict) -> tuple[str, str]:
    """Translate a UI recipe into ``(op, label)`` for the local mint.

    In ``classic`` mode the named op is returned as both op and label, and an op missing from
    ``SPECS`` raises ``ValueError``. Otherwise the recipe is read as a norm, an optional
    residual add and up to three activations (``gelu`` when none are given). A single
    activation whose composed name exists in ``SPECS`` maps to that name; anything else is
    encoded as a ``chain|<norm>|<0 or 1>|<acts>`` string.
    """
    _ensure_referee_path()
    from specs import SPECS

    if r.get("mode") == "classic":
        classic_op = (r.get("classic") or "softmax").strip()
        if classic_op in SPECS:
            return classic_op, classic_op
        raise ValueError(f"unknown op '{classic_op}'")

    if str(r.get("norm", "rmsnorm")).startswith("rms"):
        norm_full, norm_short, norm_title = "rmsnorm", "rms", "RMSNorm"
    else:
        norm_full, norm_short, norm_title = "layernorm", "layer", "LayerNorm"
    has_residual = bool(r.get("residual"))
    acts = [a for a in (r.get("acts") or []) if a][:3] or ["gelu"]

    act_chain = " -> ".join(acts)
    label = f"{'residual + ' if has_residual else ''}{norm_title} -> {act_chain}"
    spec_name = f"{'add_' if has_residual else ''}{norm_full}_{acts[0]}"
    if len(acts) == 1 and spec_name in SPECS:
        return spec_name, label
    return f"chain|{norm_short}|{'1' if has_residual else '0'}|{','.join(acts)}", label


def _local_prompt_messages(op: str, spark: str = "") -> list[dict[str, str]]:
    """Build the system + user chat messages that ask the local model for a kernel for ``op``.

    The user message carries the spec's notes and signature hint, a seed kernel for a
    different op as a style guide, and (when given) the user's free-text ``spark``.
    """
    _ensure_referee_path()
    from specs import get_spec, SPECS

    spec = get_spec(op)
    # Same exemplar rule as training: the real seed kernel of the first op in SPECS that is not
    # this one (almost always rmsnorm's row-wise reduction). This is the single thing that makes
    # the 1B write correct kernels here instead of elementwise guesses. If SPECS holds no other
    # op, the op's own seed is used.
    exemplar_op = op
    for candidate in SPECS:
        if candidate != op:
            exemplar_op = candidate
            break
    exemplar = _load_seed_kernel(exemplar_op)

    parts = [
        f"Op `{op}`: {spec.notes}\nSignature:\n{spec.signature_hint}\n\n",
        f"Here is a valid Triton kernel for a DIFFERENT op (`{exemplar_op}`) as a style guide:\n",
        f"```python\n{exemplar}\n```\n",
    ]
    if spark:
        parts.append(f"\nThe person building this kernel added their own idea: \"{spark}\". ")
        parts.append("Honour their idea where you can, but correctness is mandatory.")
    return [
        {"role": "system", "content": LOCAL_SYS},
        {"role": "user", "content": "".join(parts)},
    ]


def _render_prompt(messages: list[dict[str, str]]) -> str:
    # MiniCPM5 uses ChatML (<|im_start|>/<|im_end|>). It is a reasoning model, so we append the
    # empty <think></think> block (the enable_thinking=False convention) to make it answer
    # directly with the kernel instead of emitting a reasoning trace. Verified against the
    # tokenizer's apply_chat_template(enable_thinking=False) output.
    out = []
    for m in messages:
        out.append(f"<|im_start|>{m['role']}\n{m['content']}<|im_end|>\n")
    out.append("<|im_start|>assistant\n<think>\n\n</think>\n\n")
    return "".join(out)


def _common_gguf_names() -> list[str]:
    """List guessable GGUF filenames, grouped by quant preference and then by filename stem.

    Each (quant, stem) pair contributes ``<stem>-<quant>.gguf`` followed by
    ``<stem>.<quant>.gguf``.
    """
    stems = ("MiniCPM5-1B", "minicpm5-1b", "model", "ggml-model")
    return [
        candidate
        for quant in LOCAL_QUANT_PREFS
        for stem in stems
        for candidate in (f"{stem}-{quant}.gguf", f"{stem}.{quant}.gguf")
    ]


def _download_gguf_from_repo(repo_id: str) -> str:
    """Download one GGUF file from ``repo_id`` and return the path ``hf_hub_download`` gives back.

    Selection order:
      1. ``LOCAL_GGUF_FILE`` when set (download errors propagate unchanged).
      2. The first listed ``.gguf`` whose name contains a preferred quant tag, walking
         ``LOCAL_QUANT_PREFS`` in order.
      3. The first listed ``.gguf`` of any quant.
      4. When no ``.gguf`` was listed (including when listing failed), each conventional
         filename from ``_common_gguf_names`` is tried in turn.

    Raises:
        RuntimeError: every guessed filename failed; the last download error is attached as
            ``__cause__`` so its traceback is not lost.
    """
    from huggingface_hub import HfApi, hf_hub_download

    cache_dir = os.environ.get("LOCAL_GGUF_CACHE")

    def _fetch(filename: str) -> str:
        return hf_hub_download(repo_id=repo_id, filename=filename, cache_dir=cache_dir)

    explicit = os.environ.get("LOCAL_GGUF_FILE")
    if explicit:
        return _fetch(explicit)

    # Listing is best-effort: if it fails we still try the conventional filenames below.
    try:
        files = HfApi().list_repo_files(repo_id)
    except Exception:
        files = []
    ggufs = [f for f in files if f.lower().endswith(".gguf")]
    for quant in LOCAL_QUANT_PREFS:
        needle = quant.lower()
        hit = next((f for f in ggufs if needle in f.lower()), None)
        if hit:
            return _fetch(hit)
    if ggufs:
        return _fetch(ggufs[0])

    last = None
    for filename in _common_gguf_names():
        try:
            return _fetch(filename)
        except Exception as e:
            last = e
    # Chain the final failure: this raise happens outside the except block, so without
    # `from last` the underlying download error would survive only as truncated text.
    raise RuntimeError(f"no GGUF found in {repo_id}: {last}") from last


def _resolve_local_gguf() -> str:
    """Return a local GGUF path, trying the fine-tuned repo first and the base repo second.

    Raises:
        RuntimeError: both repos failed; the message lists each repo with its error type and
            the first 180 characters of the error text.
    """
    failures = []
    for repo_id in (LOCAL_FINE_TUNED_REPO, LOCAL_BASE_REPO):
        try:
            path = _download_gguf_from_repo(repo_id)
        except Exception as exc:
            failures.append(f"{repo_id}: {type(exc).__name__}: {str(exc)[:180]}")
        else:
            return path
    raise RuntimeError("could not load a local GGUF. " + " | ".join(failures))


# Set on the first _preload_cuda_libs() call so the filesystem scan runs at most once per process.
_CUDA_PRELOADED = False


def _preload_cuda_libs():
    """Best-effort preload of the CUDA runtime and cuBLAS shared libraries with RTLD_GLOBAL.

    Every failure (torch not importable, glob errors, a library that will not load) is
    swallowed, so this is a no-op when the libraries are absent.
    """
    # The CUDA llama.cpp wheel's libllama.so links libcudart.so.12 / libcublas*.so.12, which on HF
    # Spaces live inside torch's bundled nvidia-* packages and are NOT on the default loader path,
    # so the import dies with "libcudart.so.12: cannot open shared object file". Preloading them
    # with RTLD_GLOBAL lets libllama.so resolve their symbols.
    global _CUDA_PRELOADED
    if _CUDA_PRELOADED:
        return
    _CUDA_PRELOADED = True
    import ctypes
    import glob

    search_roots = []
    try:
        import torch
        search_roots.append(os.path.join(os.path.dirname(torch.__file__), "lib"))
    except Exception:
        pass
    try:
        import site
        if hasattr(site, "getsitepackages"):
            search_roots.extend(site.getsitepackages())
    except Exception:
        pass
    search_roots.extend(["/usr/local/lib/python3.10/site-packages", "/usr/local/cuda/lib64",
                         "/usr/lib/x86_64-linux-gnu", "/usr/local/lib"])

    def _load_first_match(pattern):
        # Walk the roots in order and stop at the first candidate that actually loads.
        for root in search_roots:
            try:
                candidates = sorted(glob.glob(os.path.join(root, "**", pattern), recursive=True))
            except Exception:
                candidates = []
            for path in candidates:
                try:
                    ctypes.CDLL(path, mode=ctypes.RTLD_GLOBAL)
                except Exception:
                    continue
                return True
        return False

    # Match versioned sonames too (e.g. libcudart.so.12.4.127 with no libcudart.so.12 symlink);
    # RTLD_GLOBAL makes the symbols visible to libllama.so regardless of the exact soname.
    for pattern in ("libcudart.so*", "libcublas.so*", "libcublasLt.so*"):
        _load_first_match(pattern)


# Confirm the CUDA llama.cpp lib loads at startup (loading the .so needs no GPU device, so it spends
# no ZeroGPU quota). It also warms the import so the in-@spaces.GPU call is instant.
# The probe is skipped when LOCAL_LLAMA_GPU_LAYERS is "0" (forced CPU). A failure here is only
# printed, never raised, so the app still starts.
if os.environ.get("LOCAL_LLAMA_GPU_LAYERS", "-1") != "0":
    try:
        _preload_cuda_libs()
        import llama_cpp as _lc_probe
        print(f"[startup] llama_cpp {_lc_probe.__version__} loaded OK", flush=True)
    except Exception as _e:
        # Log the error type and the first 200 characters of its message.
        print(f"[startup] llama_cpp load FAILED: {type(_e).__name__}: {str(_e)[:200]}", flush=True)


def _get_local_llm():
    """Return a llama.cpp ``Llama`` for the local GGUF, reusing the cached one when allowed.

    Also records the resolved GGUF path in the module-level ``_LOCAL_LLM_PATH``.
    """
    global _LOCAL_LLM, _LOCAL_LLM_PATH
    on_zero = bool(os.environ.get("SPACES_ZERO_GPU"))
    gpu_layers = int(os.environ.get("LOCAL_LLAMA_GPU_LAYERS", "-1"))
    uses_gpu = gpu_layers != 0
    # Default to full GPU offload (-1). On ZeroGPU the H200 is ~30x faster than its throttled
    # shared CPU for this 1B, and the GPU is granted only inside @spaces.GPU and detached between
    # calls, so a GPU-resident model can't be reused across mints: it is rebuilt on every call
    # there. Off ZeroGPU (dedicated GPU or CPU) the model is cached once. Set
    # LOCAL_LLAMA_GPU_LAYERS=0 to force CPU (e.g. when only a CPU-only llama.cpp wheel is
    # installed).
    rebuild_every_call = on_zero and uses_gpu
    if not rebuild_every_call and _LOCAL_LLM is not None:
        return _LOCAL_LLM
    if uses_gpu:
        _preload_cuda_libs()
    from llama_cpp import Llama

    _LOCAL_LLM_PATH = _resolve_local_gguf()
    default_threads = max(1, (os.cpu_count() or 4) - 1)
    threads = int(os.environ.get("LOCAL_LLAMA_THREADS", str(default_threads)))
    model = Llama(
        model_path=_LOCAL_LLM_PATH,
        n_ctx=int(os.environ.get("LOCAL_LLAMA_CTX", "4096")),
        n_threads=threads,
        n_gpu_layers=gpu_layers,
        verbose=bool(int(os.environ.get("LOCAL_LLAMA_VERBOSE", "0"))),
    )
    if not rebuild_every_call:
        _LOCAL_LLM = model
    return model


def _llama_complete(messages: list[dict[str, str]], temp: float, seed: int, llm=None) -> str:
    """Run one raw completion for ``messages`` and return the generated text.

    Args:
        messages: chat messages, rendered through ``_render_prompt``.
        temp: sampling temperature.
        seed: sampling seed for this completion.
        llm: an already-loaded model to reuse. When omitted, ``_get_local_llm()`` supplies one,
            which on ZeroGPU with GPU offload means a fresh model load.
    """
    # Use raw completion with our think-suppressed ChatML render (not create_chat_completion, which
    # leaves the reasoning trace on and makes a 1B ramble instead of writing the kernel).
    if llm is None:
        llm = _get_local_llm()
    max_tokens = int(os.environ.get("LOCAL_MAX_TOKENS", "768"))
    out = llm.create_completion(
        _render_prompt(messages),
        max_tokens=max_tokens,
        temperature=temp,
        top_p=0.97,
        seed=seed,
        stop=["<|im_end|>", "<|im_start|>"],
    )
    return out["choices"][0]["text"]


def _local_gpu_duration() -> int:
    """Return the ZeroGPU reservation window in seconds, clamped to ``[45, max(45, cap)]``.

    Reads ``LOCAL_GPU_DURATION`` (requested) and ``LOCAL_GPU_DURATION_CAP`` (cap); a value that
    is not an integer falls back to 120.
    """
    # This is a ZeroGPU reservation window, not a benchmark setting. Keep the default within the
    # usual free-tier budget, but make the cap explicit so slower/cold hardware can raise it without
    # code changes. The current Space had LOCAL_GPU_DURATION=135, which over-reserved quota; the
    # default cap trims that to 120 while still leaving headroom for cold local mints.
    try:
        requested = int(os.environ.get("LOCAL_GPU_DURATION", "120"))
    except ValueError:
        requested = 120
    try:
        cap = int(os.environ.get("LOCAL_GPU_DURATION_CAP", "120"))
    except ValueError:
        cap = 120
    return max(45, min(max(45, cap), requested))


LOCAL_GPU_DURATION = _local_gpu_duration()


@spaces.GPU(duration=LOCAL_GPU_DURATION)
def _local_gpu_mint(op: str, k: int, temp: float, spark: str) -> dict:
    """Draft up to ``k`` kernels for ``op`` and verify each one inside a single GPU window.

    Returns a dict with ``n_srcs`` (drafts that yielded source), ``statuses`` (one referee
    status per draft, ``"runtime_fail"`` when the referee itself raised) and ``best`` (source
    and result dict of the lowest-latency ``"ok"`` draft, or ``None``).
    """
    # Generation AND verification happen inside this one GPU window. That is required on ZeroGPU,
    # where the GPU exists only inside @spaces.GPU: llama.cpp offloads generation to the H200
    # (LOCAL_LLAMA_GPU_LAYERS=-1, seconds instead of minutes on the throttled CPU) and the referee
    # compiles + runs Triton, all here. Off ZeroGPU the decorator is a no-op and the GPU is always
    # present, so the same code path works on a dedicated GPU and locally too.
    _ensure_referee_path()
    # Time every verified kernel against eager AND torch.compile (default + max-autotune), so local
    # mode shows the honest baselines, not just the inflated vs-eager fusion win. The max-autotune
    # compile is a few seconds for these ops (inductor-cached across the k attempts), so it fits.
    from harness import evaluate_inprocess_full

    messages = _local_prompt_messages(op, spark)
    srcs = []
    base_seed = int(time.time() * 1000) & 0x7FFFFFFF
    # Load the model once per mint and share it across all k drafts. _get_local_llm() does not
    # cache on ZeroGPU with GPU offload, so fetching it per draft would reload the GGUF k times
    # inside the reservation window.
    llm = None
    for i in range(k):
        if llm is None:
            llm = _get_local_llm()
        text = _llama_complete(messages, temp=temp, seed=base_seed + i, llm=llm)
        src = extract_kernel(text)
        if src:
            srcs.append(src)
    # Drop our reference before verification; an uncached model can then be freed instead of
    # staying alive while the referee compiles and times kernels.
    llm = None

    statuses = []
    best = None
    n_shapes = int(os.environ.get("LOCAL_REFEREE_SHAPES", "2"))
    n_iters = int(os.environ.get("LOCAL_REFEREE_ITERS", "30"))
    for i, src in enumerate(srcs):
        try:
            res = evaluate_inprocess_full(src, op, n_shapes=n_shapes, n_iters=n_iters, seed=i)
        except Exception:
            # A crash in the referee counts as a failed attempt, not a failed mint.
            statuses.append("runtime_fail")
            continue
        statuses.append(res.status)
        # Keep the FASTEST verified kernel (lowest latency = best vs every baseline at once).
        if res.status == "ok" and (best is None or res.latency_ms < best["result"]["latency_ms"]):
            best = {"source": src, "result": res.to_dict()}
    return {"n_srcs": len(srcs), "statuses": statuses, "best": best}


def _local_explain(label: str, res) -> str:
    if res is None or res.get("status") != "ok":
        status = "no verified attempt" if res is None else res.get("status", "failed")
        return (f"The local referee rejected these kernels ({status}). It still compiled and "
                "checked them inside this Space; try minting again or simplify the operation.")
    se = float(res.get("speedup_eager", 0) or 0)
    honest = res.get("speedup_maxauto") or res.get("speedup_compile")
    bl = "torch.compile max-autotune" if res.get("speedup_maxauto") else "torch.compile"
    if honest:
        return (f"Verified correct. {float(honest):.2f}x faster than {bl} (the honest baseline that "
                f"also fuses and autotunes), and {se:.0f}x vs unfused PyTorch eager. The whole loop "
                "ran in this Space: the 1B wrote it via llama.cpp, the referee compiled, checked it "
                "against PyTorch, and timed it.")
    return (f"Verified correct, {se:.0f}x faster than unfused PyTorch eager. The 1B wrote it via "
            "llama.cpp and the referee checked and timed it, all inside this Space.")


def local_mint(recipe: dict) -> dict:
    """Run one local (in-Space) mint for ``recipe`` and return the result dict for the UI.

    Multi-activation chains are declined up front with ``unsupported: True``. Otherwise the
    requested ``k`` is clamped to ``[1, LOCAL_MAX_ATTEMPTS]``, ``temp`` to ``[0.2, 1.3]`` and
    ``spark`` to 200 characters before generation and verification run in ``_local_gpu_mint``.
    """
    # The 1B reliably writes the named ops and single-activation machines (norm + optional residual
    # + one activation), which is what it was trained on. Arbitrary multi-activation chains are
    # off-distribution and it can't write them correctly, so local mode declines them with a clear
    # message instead of looping through rejected attempts. Pro mode (the 27B) is the place for those.
    op, label = _local_op_from_recipe(recipe)
    if op.startswith("chain|"):
        return {"op": label, "verified": False, "local": True, "baseline": "eager",
                "statuses": [], "gen_seconds": 0.0, "verify_seconds": 0.0,
                "speedup_compile": None, "unsupported": True, "k": 0,
                "explanation": ("This machine stacks two activations, which is outside what the "
                                "kernelsmith models were trained on (the grammar is a norm plus one "
                                "activation), so neither the 1B nor the 27B writes it reliably. Drop "
                                "to a single activation and it mints with a verified kernel.")}

    requested_k = int(recipe.get("k", 1))
    k = max(1, min(LOCAL_MAX_ATTEMPTS, requested_k))
    requested_temp = float(recipe.get("temp", 0.7))
    temp = max(0.2, min(1.3, requested_temp))
    spark = (recipe.get("spark") or "").strip()[:200]
    # Pre-fetch the GGUF to disk OUTSIDE the GPU window, so the ZeroGPU 120s budget is spent on
    # generate + verify rather than a 1.15GB download.
    _resolve_local_gguf()

    started = time.time()
    work = _local_gpu_mint(op, k, temp, spark)
    elapsed = round(time.time() - started, 1)
    statuses = work.get("statuses") or []
    best = work.get("best")

    # Fields shared by both failure shapes; the remaining keys are appended in each branch.
    failure = {"op": label, "verified": False, "local": True, "baseline": "eager",
               "statuses": statuses, "gen_seconds": elapsed, "verify_seconds": 0.0,
               "speedup_compile": None}
    if not work.get("n_srcs"):
        failure["k"] = k
        failure["explanation"] = "llama.cpp returned no parseable kernel this time. Try minting again."
        return failure
    if not best:
        failure["explanation"] = _local_explain(label, None)
        failure["k"] = k
        return failure

    res = best["result"]

    def _rounded(key):
        # A missing or zero speedup is reported as None rather than 0.0.
        value = res.get(key)
        if not value:
            return None
        return round(float(value), 2)

    return {"op": label, "verified": True, "local": True, "baseline": "compile",
            "speedup_eager": round(float(res.get("speedup_eager", 0)), 2),
            "speedup_compile": _rounded("speedup_compile"),
            "speedup_maxauto": _rounded("speedup_maxauto"),
            "latency_ms": res.get("latency_ms"), "eager_ms": res.get("eager_ms"),
            "compile_ms": res.get("compile_ms"), "maxauto_ms": res.get("maxauto_ms"),
            "source": best["source"], "statuses": statuses, "gen_seconds": elapsed,
            "verify_seconds": 0.0, "explanation": _local_explain(label, res),
            "n_verified": statuses.count("ok"), "k": k}


def lb_url(u):
    """Derive the leaderboard URL from a mint endpoint URL.

    A URL that does not contain the mint host suffix is returned unchanged.
    """
    mint_suffix = "-mint-mint.modal.run"
    leaderboard_suffix = "-leaderboard.modal.run"
    return u.replace(mint_suffix, leaderboard_suffix)


def _blob(d):
    return f"<span data-blob='{base64.b64encode(json.dumps(d).encode()).decode()}'></span>"


def _with_nonce(d: dict, r: dict) -> dict:
    out = dict(d or {})
    if r.get("_n") is not None:
        out["_n"] = r.get("_n")
    return out


def do_mint(recipe_json: str) -> str:
    """Handle one mint request from the JS widget and return the result as a ``_blob`` span.

    ``recipe_json`` is the JSON-encoded recipe. A recipe with ``local`` set runs in-process via
    ``local_mint``; otherwise the request is posted to the 27B (``pro``) or 1B backend. Every
    outcome, including failure, is returned as a blob with an ``error`` key instead of raising,
    and the recipe's ``_n`` value is echoed back whenever the recipe could be parsed.
    """
    try:
        r = json.loads(recipe_json or "{}")
    except Exception:
        return _blob({"error": "bad recipe"})
    # Valid JSON that is not an object (a list, number, string) has no .get(); reject it here
    # rather than crashing below.
    if not isinstance(r, dict):
        return _blob({"error": "bad recipe"})
    if r.get("local"):
        try:
            return _blob(_with_nonce(local_mint(r), r))
        except Exception as e:
            return _blob(_with_nonce(
                {"error": f"Local offline mint failed: {type(e).__name__}: {str(e)[:240]}",
                 "local": True, "baseline": "eager"}, r))
    base = PRO if r.get("pro") else ONE_B
    try:
        if r.get("mode") == "classic":
            payload = {"op": r.get("classic", "softmax")}
        else:
            payload = {"blocks": {"norm": r.get("norm", "rmsnorm"),
                                  "residual": bool(r.get("residual")),
                                  "acts": r.get("acts") or ["gelu"]}}
        payload.update({"name": (r.get("name") or "anonymous")[:40], "k": int(r.get("k", 4)),
                        "temp": float(r.get("temp", 0.7)), "spark": (r.get("spark") or "")[:200]})
    except (TypeError, ValueError):
        # Non-numeric k/temp or a non-sliceable name/spark: report it as a bad recipe.
        return _blob(_with_nonce({"error": "bad recipe"}, r))
    # During a cold start Modal can return an empty body before the model finishes loading,
    # which makes .json() blow up. Retry a few times, then explain it plainly.
    attempts = 4
    last = ""
    for attempt in range(attempts):
        try:
            resp = requests.post(base, json=payload, timeout=600)
            if resp.status_code == 200 and resp.text.strip():
                try:
                    return _blob(_with_nonce(resp.json(), r))
                except ValueError:
                    last = "the model returned an unexpected response"
            else:
                last = f"the model is still waking up (HTTP {resp.status_code})"
        except Exception as e:
            last = str(e)
        # The user-facing message below is deliberately generic, so keep the real reason in the
        # server log.
        print(f"[mint] remote attempt {attempt + 1}/{attempts} failed: {last[:240]}", flush=True)
        # Back off between attempts only; sleeping after the last one just delays the error.
        if attempt < attempts - 1:
            time.sleep(4)
    pro = " The 27B in Pro mode takes longer to wake. Try again, or turn Pro off." if r.get("pro") else ""
    return _blob(_with_nonce(
        {"error": f"The model is still waking up. Give it about 30 seconds and press Mint again.{pro}"},
        r))


def fetch_lb(_=None):
    """Fetch and merge the leaderboard rows of both backends into one ``_blob`` span.

    Rows are de-duplicated on (op, name, ts, speedup_compile, speedup_eager). A backend that
    fails contributes an ``"<url>: <ExceptionType>"`` entry to ``errors`` (at most two are
    returned) instead of aborting the refresh. The positional argument is ignored.
    """
    rows, sources, errors = [], [], []
    seen = set()
    for backend in (ONE_B, PRO):
        try:
            url = lb_url(backend)
            got = requests.get(url, timeout=20).json().get("rows", [])
            if got:
                sources.append(url)
            for row in got:
                key = (row.get("op"), row.get("name"), row.get("ts"),
                       row.get("speedup_compile"), row.get("speedup_eager"))
                if key not in seen:
                    seen.add(key)
                    rows.append(row)
        except Exception as e:
            errors.append(f"{lb_url(backend)}: {type(e).__name__}")
    # Always include fetched_at so the hidden Gradio HTML changes even when the rows are identical.
    # That makes the visible Refresh button observably work.
    return _blob({"rows": rows, "fetched_at": time.time(), "sources": sources, "errors": errors[:2]})


# Stylesheet shipped next to this file; empty when it is absent or cannot be read. Opened with a
# context manager (no leaked handle) and an explicit encoding, in the same way as the replay
# index is loaded.
try:
    with open(os.path.join(os.path.dirname(__file__), "style.css"), encoding="utf-8") as _css_file:
        CSS = _css_file.read()
except OSError:
    CSS = ""

# Recorded mints, re-serialized compactly. "</" is escaped as "<\/" so the text cannot close an
# enclosing <script> element (presumably it is substituted for __KM_REPLAYS__ in APP_HTML; confirm
# where the placeholder is replaced). Any failure (missing file, invalid JSON) falls back to an
# empty object.
REPLAYS_PATH = os.path.join(os.path.dirname(__file__), "replays", "index.json")
try:
    with open(REPLAYS_PATH, encoding="utf-8") as f:
        REPLAYS_JSON = json.dumps(json.load(f), separators=(",", ":")).replace("</", "<\\/")
except Exception:
    REPLAYS_JSON = "{}"

APP_HTML = r"""
<div id="km" data-build="2026-06-15-leaderboard-refresh-fallback"
     data-lb-one="__KM_LB_ONE__"
     data-lb-pro="__KM_LB_PRO__">
  <div id="km-top">
    <div id="km-brand">
      <span class="mark"><span class="dot"></span>Kernel Mint</span>
      <span class="sub">an OUROBOROS demo</span>
    </div>
    <div id="km-tabs">
      <span id="km-tab-pill"></span>
      <button class="km-tab on" data-tab="build">Build</button>
      <button class="km-tab" data-tab="expert">Expert</button>
      <button class="km-tab" data-tab="lb">Leaderboard</button>
      <button class="km-tab" data-tab="docs">Run &amp; verify</button>
    </div>
  </div>
  <script id="km-replays-data" type="application/json">__KM_REPLAYS__</script>

  <!-- ============ BUILD ============ -->
  <div class="km-page" data-page="build">
    <header id="km-hero">
      <h1>A 1B model writes a real GPU kernel. <span class="g">A referee that can't be fooled checks it.</span></h1>
      <p>Compose an operation, and a 1-billion-parameter model writes a fused Triton kernel for it. The kernel is compiled, checked against PyTorch on adversarial inputs, and timed against PyTorch's own compiler before anything counts as a win.</p>
    </header>
    <div id="km-grid">
      <section class="km-card">
        <div class="km-seg" role="tablist">
          <button class="km-segb on" data-mode="build">Compose</button>
          <button class="km-segb" data-mode="classic">Classic ops</button>
        </div>

        <div id="km-compose">
          <div class="km-section-label">Pipeline</div>
          <div id="km-pipe">
            <div class="km-stage fixed"><span class="km-stage-k">input</span><span class="km-dim">a row of activations</span></div>
            <div class="km-flow">&darr;</div>
            <div class="km-stage"><span class="km-stage-k">residual</span><span class="km-pick" data-slot="mem"></span></div>
            <div class="km-flow">&darr;</div>
            <div class="km-stage"><span class="km-stage-k">normalize</span><span class="km-pick" data-slot="norm"></span></div>
            <div class="km-flow">&darr;</div>
            <div class="km-stage"><span class="km-stage-k">activate</span><span class="km-pick" data-slot="act"></span></div>
          </div>
        </div>

        <div id="km-classic" hidden>
          <div class="km-section-label">A named operation from a real transformer</div>
          <div class="km-pick" id="km-classics"></div>
        </div>

        <div id="km-info"></div>

        <div class="km-params">
          <div class="km-field">
            <label>Attempts the model drafts <b id="km-tries-v">4</b></label>
            <div class="km-stepper" id="km-tries"></div>
          </div>
          <div class="km-field">
            <label>Sampling temperature <b id="km-temp-v">0.70</b></label>
            <input type="range" class="km-range" id="km-temp" min="20" max="130" value="70">
          </div>
          <div class="km-field">
            <label>Name for the leaderboard</label>
            <input class="km-input" id="km-name" placeholder="optional" autocomplete="off">
          </div>
          <label class="km-toggle"><input type="checkbox" id="km-pro"><span class="km-switch"></span>Use the 27B model (slower, knows more)</label>
          <label class="km-toggle km-local-toggle"><input type="checkbox" id="km-local"><span class="km-switch"></span>Local (offline)</label>
          <p class="km-mode-help">Runs entirely in this Space: a 1B GGUF model (llama.cpp) writes the kernel on this Space's GPU, then the in-process referee compiles it, checks correctness against PyTorch, and times it against PyTorch eager, torch.compile, and max-autotune. No Modal or cloud model calls. Pro uses the 27B.</p>
        </div>

        <div class="km-actions">
          <button class="km-btn primary" id="km-build">Mint kernel</button>
          <button class="km-btn ghost" id="km-replay" data-replay-key="rmsnorm_gelu">Watch a recorded mint</button>
        </div>
        <div id="km-replay-picks" class="km-replay-picks" aria-label="recorded mints"></div>
      </section>

      <section class="km-card">
        <div class="km-section-label">The operation, stage by stage</div>
        <div id="km-viz"><svg id="km-svg" viewBox="0 0 320 150" preserveAspectRatio="none"></svg>
          <div id="km-stage-label">input</div></div>
        <div id="km-verdict"><div class="km-card"><div class="km-result-top"><div class="km-verdict-icon">&rarr;</div>
          <div><span class="km-run-label">ready</span><b>Press Mint to begin</b></div></div>
          <p class="km-dim" style="margin-top:10px">Compose a pipeline on the left and mint it, or watch a recorded verified mint to see the whole referee loop instantly.</p></div></div>
        <details id="km-source" class="km-source" hidden>
          <summary>Triton kernel source</summary>
          <pre id="km-code"></pre>
        </details>
        <div class="km-section-label">This session</div>
        <div id="km-hist" class="km-dim">No mints yet.</div>
      </section>
    </div>
  </div>

  <!-- ============ EXPERT ============ -->
  <div class="km-page" data-page="expert" hidden>
    <header id="km-hero">
      <h1>Free-test the verifier.</h1>
      <p>Describe the fused operation you want in plain language. The model writes a Triton kernel and the referee certifies it against PyTorch, torch.compile, and max-autotune. The referee can only certify operations it has a reference for, so your request is resolved to the nearest one it can check.</p>
    </header>
    <div class="km-expert-grid">
      <section class="km-card">
        <div class="km-section-label">Your brief</div>
        <textarea class="km-textarea" id="km-xbrief" placeholder="e.g. fused RMSNorm with a residual add and a SiLU activation, contiguous rows"></textarea>
        <div id="km-xresolved" class="km-resolved"><span class="k">resolves to</span><div class="v">rmsnorm with gelu</div></div>
        <div class="km-params">
          <div class="km-field">
            <label>Attempts the model drafts <b id="km-xtries-v">5</b></label>
            <div class="km-stepper" id="km-xtries"></div>
          </div>
          <div class="km-field">
            <label>Sampling temperature <b id="km-xtemp-v">0.70</b></label>
            <input type="range" class="km-range" id="km-xtemp" min="20" max="130" value="70">
          </div>
          <label class="km-toggle"><input type="checkbox" id="km-xpro"><span class="km-switch"></span>Use the 27B model</label>
          <label class="km-toggle km-local-toggle"><input type="checkbox" id="km-xlocal"><span class="km-switch"></span>Local (offline)</label>
          <p class="km-mode-help">Uses llama.cpp inside the Space and verifies on this Space's GPU, timed against torch.compile and max-autotune (the honest baselines), not just eager. Pro mode keeps the 27B on Modal.</p>
        </div>
        <div class="km-actions">
          <button class="km-btn primary" id="km-xbuild">Mint from brief</button>
        </div>
      </section>
      <section class="km-card">
        <div class="km-section-label">What the referee can verify</div>
        <p class="km-limits">It certifies fusions of a <b>normalization</b> (RMSNorm or LayerNorm), an optional <b>residual add</b>, and one <b>activation</b>, plus a set of named operators from real transformers. Click one to load it into your brief.</p>
        <div class="km-oplist" id="km-oplist"></div>
        <div id="km-xverdict" style="margin-top:16px"></div>
        <details id="km-xsource" class="km-source" hidden>
          <summary>Triton kernel source</summary>
          <pre class="km-code"></pre>
        </details>
      </section>
    </div>
  </div>

  <!-- ============ LEADERBOARD ============ -->
  <div class="km-page" data-page="lb" hidden>
    <section class="km-card">
      <div class="km-h">Leaderboard</div>
      <p class="km-lede">The best verified kernel per operation. Crowned rows were minted by the 27B model. Every speedup here was measured by the referee, not reported by the model.</p>
      <div id="km-lb-view" class="km-dim" style="margin-top:16px">Loading.</div>
      <div class="km-lb-tools">
        <button id="km-lb-refresh" class="km-mini">Refresh</button>
        <span id="km-lb-status" class="km-lb-status">Auto-refreshes while this tab is open.</span>
      </div>
    </section>
  </div>

  <!-- ============ RUN + VERIFY / DOCS ============ -->
  <div class="km-page" data-page="docs" hidden>
    <section class="km-docs">
      <div class="km-doc-hero">
        <span class="km-doc-kicker">reproduce the claim</span>
        <h2>Run the smith, then make the referee decide.</h2>
        <p>The Space has two execution paths. The <b>Local (offline)</b> switch runs the 1B GGUF model with llama.cpp inside this Space, then verifies the candidate kernel in-process on the same GPU. <b>Pro</b> uses the 27B adapter through the Modal backend. Both paths compare against PyTorch eager, torch.compile, and torch.compile max-autotune before a result counts.</p>
      </div>

      <div class="km-doc-metrics" aria-label="execution modes">
        <div><b>1B local</b><span>MiniCPM5-1B GGUF, llama.cpp, in-Space referee</span></div>
        <div><b>27B Pro</b><span>Qwen3.6-27B adapter, Modal H200 backend</span></div>
        <div><b>referee</b><span>compile, allclose, eager, compile, max-autotune</span></div>
      </div>

      <div class="km-doc-grid">
        <article class="km-doc-panel wide">
          <div class="km-doc-label">1. Use this Space properly</div>
          <h3>The easiest local run is already on this page.</h3>
          <ol class="km-doc-steps">
            <li>Open <b>Build</b> or <b>Expert</b>.</li>
            <li>Turn on <b>Local (offline)</b>. It disables Pro because the two paths are intentionally separate.</li>
            <li>Pick a named op such as <code>rmsnorm_gelu</code>, <code>softmax</code>, or <code>swiglu</code>, or compose one norm plus one activation.</li>
            <li>Press <b>Mint kernel</b>. The Space downloads or reuses the GGUF, drafts up to two kernels, compiles Triton, checks PyTorch correctness, and times eager, torch.compile, and max-autotune.</li>
          </ol>
          <div class="km-doc-callout">A certified Local result should show compiler baselines. If a cold ZeroGPU session times out during max-autotune, mint again. A result that only beats eager is not treated as the headline claim.</div>
        </article>

        <article class="km-doc-panel">
          <div class="km-doc-label">2. Run the Space clone</div>
          <h3>Same UI, your GPU.</h3>
          <p>Use this when you want the full app locally. With no `SPACES_ZERO_GPU` env var, the GPU decorator becomes a no-op and the local verifier runs directly on your attached CUDA device.</p>
          <div class="km-codeblock"><button class="km-copy">Copy</button>git clone https://huggingface.co/spaces/build-small-hackathon/ouroboros-kernel-mint
cd ouroboros-kernel-mint
python -m venv .venv
. .venv/bin/activate
python -m pip install -r requirements.txt
python app.py</div>
          <p>Useful knobs:</p>
          <div class="km-codeblock"><button class="km-copy">Copy</button>LOCAL_GGUF_REPO=YMRohit/ouroboros-kernelsmith-minicpm5-1b-GGUF
LOCAL_GGUF_QUANTS=Q5_K_M,Q4_K_M
LOCAL_LLAMA_GPU_LAYERS=-1
LOCAL_GPU_DURATION=120
python app.py</div>
        </article>

        <article class="km-doc-panel">
          <div class="km-doc-label">3. Run the 1B smith directly</div>
          <h3>GGUF path, no Modal.</h3>
          <p>This mirrors the Space-local path. It uses the fine-tuned GGUF repo first and falls back to OpenBMB's base GGUF only if needed.</p>
          <div class="km-codeblock"><button class="km-copy">Copy</button>python -m pip install torch triton huggingface_hub \
  --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu130 \
  llama-cpp-python==0.3.28</div>
          <div class="km-codeblock"><button class="km-copy">Copy</button>from huggingface_hub import HfApi, hf_hub_download
from llama_cpp import Llama

repo = "YMRohit/ouroboros-kernelsmith-minicpm5-1b-GGUF"
files = [f for f in HfApi().list_repo_files(repo) if f.lower().endswith(".gguf")]
filename = next((f for f in files if "Q5_K_M" in f), files[0])
gguf = hf_hub_download(repo, filename=filename)

llm = Llama(model_path=gguf, n_ctx=4096, n_gpu_layers=-1)

system = "You are an expert GPU kernel engineer. Output only one fenced python code block."
user = "Write a fused Triton kernel for row-wise softmax. Use stable max-subtraction. Return run(x)."
prompt = (
    f"&lt;|im_start|&gt;system\n{system}&lt;|im_end|&gt;\n"
    f"&lt;|im_start|&gt;user\n{user}&lt;|im_end|&gt;\n"
    "&lt;|im_start|&gt;assistant\n```python\n"
)
out = llm.create_completion(prompt, max_tokens=768, temperature=0.7, top_p=0.97)
print(out["choices"][0]["text"])</div>
        </article>

        <article class="km-doc-panel">
          <div class="km-doc-label">4. Run the 1B adapter directly</div>
          <h3>Transformers plus PEFT.</h3>
          <p>Use this when you want the published LoRA adapter instead of GGUF. It is slower to start than llama.cpp, but useful for inspection or further fine-tuning.</p>
          <div class="km-codeblock"><button class="km-copy">Copy</button>python -m pip install torch transformers peft accelerate triton</div>
          <div class="km-codeblock"><button class="km-copy">Copy</button>from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

tok = AutoTokenizer.from_pretrained("openbmb/MiniCPM5-1B", trust_remote_code=True)
base = AutoModelForCausalLM.from_pretrained(
    "openbmb/MiniCPM5-1B",
    trust_remote_code=True,
    torch_dtype="auto",
    device_map="auto",
)
model = PeftModel.from_pretrained(base, "YMRohit/ouroboros-kernelsmith-minicpm5-1b")
model.eval()</div>
        </article>

        <article class="km-doc-panel">
          <div class="km-doc-label">5. Run the 27B smith locally</div>
          <h3>Same prompt contract, much bigger hardware.</h3>
          <p>The 27B artifact is the stronger Qwen3.6-27B smith used for the 76 verified compiler-beating kernels. Run it locally if you have a large GPU or multi-GPU box. The training run used Modal H200s and peaked around 110 GB VRAM; inference is still a heavyweight path compared with the 1B GGUF.</p>
          <div class="km-codeblock"><button class="km-copy">Copy</button>python -m pip install torch transformers peft accelerate triton</div>
          <div class="km-codeblock"><button class="km-copy">Copy</button>from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

tok = AutoTokenizer.from_pretrained("Qwen/Qwen3.6-27B", trust_remote_code=True)
base = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen3.6-27B",
    trust_remote_code=True,
    torch_dtype="auto",
    device_map="auto",
)
model = PeftModel.from_pretrained(base, "YMRohit/ouroboros-kernelsmith-qwen3.6-27b")
model.eval()</div>
          <p>If that does not fit, use <b>Pro</b> in this Space. It calls the hosted 27B backend and still sends the output through the same three-baseline referee before returning a result.</p>
        </article>

        <article class="km-doc-panel wide">
          <div class="km-doc-label">6. Prompt contract</div>
          <h3>Ask for one operation, one code block, one <code>run(...)</code> entry point.</h3>
          <p>The model is not a general Python assistant here. It was trained to emit a single Triton kernel for a verifier-known operation. Keep the prompt narrow and include the exact operation, input tensors, output contract, and baseline target.</p>
          <div class="km-prompt-grid">
            <div>
              <b>System</b>
              <div class="km-codeblock"><button class="km-copy">Copy</button>You are an expert GPU kernel engineer. Write a single correct, fast Triton kernel. Output ONLY one fenced python code block defining run(*inputs) and its @triton.jit kernel. Accumulate reductions in float32. No prose.</div>
            </div>
            <div>
              <b>User template</b>
              <div class="km-codeblock"><button class="km-copy">Copy</button>Operation: add_rmsnorm_gelu
Inputs: x, residual, weight. Each row is one transformer hidden state.
Reference: y = gelu(rmsnorm(x + residual, weight)).
Return: one fenced python block with imports, one @triton.jit kernel, and run(x, residual, weight).
Target: correct vs PyTorch first, then faster than torch.compile max-autotune.</div>
            </div>
          </div>
          <div class="km-doc-chiprow">
            <span class="km-doc-chip">name the op</span>
            <span class="km-doc-chip">state tensor order</span>
            <span class="km-doc-chip">require fp32 reductions</span>
            <span class="km-doc-chip">forbid prose</span>
            <span class="km-doc-chip">verify before trusting</span>
          </div>
        </article>

        <article class="km-doc-panel">
          <div class="km-doc-label">Prompt example A</div>
          <h3>Residual RMSNorm plus GELU.</h3>
          <div class="km-codeblock"><button class="km-copy">Copy</button>Write a fused Triton kernel for add_rmsnorm_gelu.
Inputs are x, residual, and weight, all CUDA tensors.
Compute RMSNorm over each row after x + residual, multiply by weight, then apply GELU.
Use fp32 accumulation for the row reduction.
Return exactly one fenced python code block with run(x, residual, weight).</div>
        </article>

        <article class="km-doc-panel">
          <div class="km-doc-label">Prompt example B</div>
          <h3>Stable softmax.</h3>
          <div class="km-codeblock"><button class="km-copy">Copy</button>Write a fused Triton kernel for row-wise softmax.
Input x is a CUDA tensor shaped [M, N].
Use the stable max-subtraction form.
Return exactly one fenced python code block with run(x).
Do not include explanation text outside the code block.</div>
        </article>

        <article class="km-doc-panel">
          <div class="km-doc-label">Prompt example C</div>
          <h3>SwiGLU gate.</h3>
          <div class="km-codeblock"><button class="km-copy">Copy</button>Write a fused Triton kernel for swiglu.
Inputs are gate and up tensors with the same shape.
Compute silu(gate) * up elementwise.
Return exactly one fenced python code block with run(gate, up).
Keep the launch grid simple and contiguous-row friendly.</div>
        </article>

        <article class="km-doc-panel wide">
          <div class="km-doc-label">7. Verify a generated kernel</div>
          <h3>Never score the model output by eye.</h3>
          <p>Save the candidate as <code>candidate.py</code>, then send it to the referee. The result object reports correctness, latency, and speedups vs eager, torch.compile, and max-autotune.</p>
          <div class="km-codeblock"><button class="km-copy">Copy</button>git clone https://github.com/ymrohit/ouroboros-kernelsmith.git
cd ouroboros-kernelsmith
python -m pip install torch triton numpy</div>
          <div class="km-codeblock"><button class="km-copy">Copy</button>import pathlib
import sys

sys.path.insert(0, "referee")
from harness import evaluate_inprocess_full

kernel_src = pathlib.Path("candidate.py").read_text()
result = evaluate_inprocess_full(kernel_src, "add_rmsnorm_gelu", n_shapes=2, n_iters=30)
print(result.to_dict())</div>
          <div class="km-doc-callout">A kernel is submission-grade only if <code>status</code> is <code>ok</code>, correctness is true, and the compiler baselines are present. The strongest number is <code>speedup_maxauto</code> because that compares against torch.compile max-autotune.</div>
        </article>

        <article class="km-doc-panel wide">
          <div class="km-doc-label">8. What the referee can certify</div>
          <h3>Known operations only, by design.</h3>
          <p>The referee needs a PyTorch reference. It certifies norm and activation fusions plus named transformer operations. Unknown math can still be interesting, but this app will not label it a verified win until there is a reference spec.</p>
          <div id="km-docs-ops" class="km-oplist"></div>
        </article>

        <article class="km-doc-panel wide">
          <div class="km-doc-label">9. Glossary</div>
          <div id="km-learn-blocks"></div>
        </article>

        <article class="km-doc-panel wide">
          <div class="km-doc-label">Source and artifacts</div>
          <p>MIT licensed code: <a href="https://github.com/ymrohit/ouroboros-kernelsmith">ymrohit/ouroboros-kernelsmith</a>. Models: <a href="https://huggingface.co/YMRohit/ouroboros-kernelsmith-minicpm5-1b">MiniCPM5-1B smith</a>, <a href="https://huggingface.co/YMRohit/ouroboros-kernelsmith-minicpm5-1b-GGUF">MiniCPM5-1B GGUF</a>, and <a href="https://huggingface.co/YMRohit/ouroboros-kernelsmith-qwen3.6-27b">Qwen3.6-27B smith</a>. Evidence: <a href="https://huggingface.co/datasets/YMRohit/ouroboros-kernel-corpus">verified kernel corpus</a>.</p>
        </article>
      </div>
    </section>
  </div>
</div>
"""

# Fill the three build-time placeholders in the HTML shell: the replay JSON blob and the
# values returned by lb_url() for the 1B and Pro backends.
APP_HTML = APP_HTML.replace("__KM_REPLAYS__", REPLAYS_JSON)
APP_HTML = APP_HTML.replace("__KM_LB_ONE__", lb_url(ONE_B))
APP_HTML = APP_HTML.replace("__KM_LB_PRO__", lb_url(PRO))

APP_JS = r"""
() => {
  // Activation catalogue used by the composer. Each entry carries:
  //   lbl  - button label, verb - short phrase used in the stage caption,
  //   info - help text shown in the info line, f - scalar function used only for the preview plot.
  const ACTS = {
    gelu:{lbl:'GELU',verb:'smoothly bends',info:'GELU is the gentle bend used inside many transformers. It lets a little of the negative side through.',f:x=>0.5*x*(1+Math.tanh(0.7978845608*(x+0.044715*x*x*x)))},
    silu:{lbl:'SiLU',verb:'swishes',info:'SiLU, also called Swish, is x times sigmoid(x). It is common in LLaMA style feed-forward blocks.',f:x=>x/(1+Math.exp(-x))},
    relu:{lbl:'ReLU',verb:'clips negatives',info:'ReLU keeps positive values and clips negative values to zero. It is fast, classic, and direct.',f:x=>Math.max(x,0)},
    tanh:{lbl:'Tanh',verb:'squashes to -1..1',info:'Tanh squashes each value into the range from -1 to 1. Large values flatten near the top and bottom.',f:x=>Math.tanh(x)},
    sigmoid:{lbl:'Sigmoid',verb:'squashes to 0..1',info:'Sigmoid squashes each value into the range from 0 to 1. It is often used for gates and probabilities.',f:x=>1/(1+Math.exp(-x))},
    relu2:{lbl:'ReLU2',verb:'clips then squares',info:'Squared ReLU clips negative values to zero, then squares the remaining positive values.',f:x=>{const r=Math.max(x,0);return r*r;}},
    gelu_erf:{lbl:'GELU exact',verb:'bends exactly',info:'Exact GELU uses the erf form of GELU. It behaves almost like the tanh approximation, but follows the exact formula.',f:x=>{
      // 0.5*x*(1+erf(x/sqrt(2))). JavaScript has no Math.erf, so erf is evaluated with the
      // Abramowitz-Stegun 7.1.26 polynomial (absolute error about 1.5e-7) instead of reusing
      // the tanh approximation that the plain `gelu` entry already provides.
      const z=Math.abs(x)/Math.SQRT2, t=1/(1+0.3275911*z);
      const poly=((((1.061405429*t-1.453152027)*t+1.421413741)*t-0.284496736)*t+0.254829592)*t;
      const erf=Math.sign(x)*(1-poly*Math.exp(-z*z));
      return 0.5*x*(1+erf);
    }},
    leaky_relu:{lbl:'Leaky ReLU',verb:'leaks negatives',info:'Leaky ReLU is like ReLU, but negative values leak through at a small scale instead of becoming zero.',f:x=>x>0?x:0.01*x},
    elu:{lbl:'ELU',verb:'curves negatives',info:'ELU passes positive values through and curves negative values smoothly down toward -1.',f:x=>x>0?x:Math.exp(x)-1},
    mish:{lbl:'Mish',verb:'self gates',info:'Mish is a smooth self-gating activation, x times tanh of softplus(x).',f:x=>x*Math.tanh(Math.log(1+Math.exp(x)))},
  };
  // Normalisation choices offered in the composer (label + help text).
  const NORMS = {
    rmsnorm: {
      lbl: 'RMSNorm',
      info: 'RMSNorm makes a row of values a consistent size by dividing by root mean square, then scaling.',
    },
    layernorm: {
      lbl: 'LayerNorm',
      info: 'LayerNorm centers a row of values by subtracting the average, then rescales by the spread.',
    },
  };
  // Help text for the residual-add slot.
  const MEM = {
    info: 'Residual add means adding the input back before the next operation. This is the skip path used in deep models.',
  };
  // Stand-alone operations offered in "classic" mode. Key order is the button order.
  const CLASSICS = {
    softmax: {
      lbl: 'Softmax',
      info: 'Softmax turns a row of scores into probabilities that add to 1.',
    },
    softmax_scale: {
      lbl: 'Scaled softmax',
      info: 'Scaled softmax applies a scale before softmax. It is the attention score step used before attention weights.',
    },
    swiglu: {
      lbl: 'SwiGLU',
      info: 'SwiGLU is a gated feed-forward operation: SiLU on the gate multiplied by the up projection.',
    },
    geglu: {
      lbl: 'GeGLU',
      info: 'GeGLU is a gated feed-forward operation like SwiGLU, but with a GELU gate.',
    },
    rmsnorm: {
      lbl: 'RMSNorm',
      info: 'RMSNorm on its own, without an activation after it.',
    },
    layernorm: {
      lbl: 'LayerNorm',
      info: 'LayerNorm on its own, without an activation after it.',
    },
  };
  // Fixed sample row that the preview plot transforms.
  const INPUT = [
    0.6, -0.9, 1.4, -0.3, 0.2, -1.6, 0.8,
    1.1, -0.5, 0.0, 1.7, -1.1, 0.4, -0.2,
  ];

  // Widget mount point. Bail out when it is missing or already initialised, so everything
  // below (listeners, timers, state) is set up at most once per element.
  const root=document.getElementById('km');
  if(!root||root.dataset.init)return;
  root.dataset.init='1';
  // Query helpers scoped to the widget root: $ returns the first match, $$ an array of all matches.
  const $=s=>root.querySelector(s), $$=s=>Array.from(root.querySelectorAll(s));
  // Leaderboard endpoints read from the root's data attributes; empty values are dropped.
  const LB_URLS=[root.dataset.lbOne,root.dataset.lbPro].filter(Boolean);
  // Normalise text for display: en/em dashes become ' - ', the right arrow becomes '->', the
  // multiplication sign becomes 'x', characters in U+2600..U+27BF and surrogate pairs are
  // removed, then whitespace runs are collapsed to one space and the result is trimmed.
  const cleanText=s=>String(s??'').replace(/\u2013|\u2014/g,' - ').replace(/\u2192/g,'->').replace(/\u00d7/g,'x').replace(/[\u2600-\u27BF]/g,'').replace(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g,'').replace(/\s+/g,' ').trim();
  // cleanText() followed by HTML escaping of & < > " ' for values interpolated into innerHTML.
  const esc=s=>cleanText(s).replace(/[&<>"']/g,c=>({'&':'&amp;','<':'&lt;','>':'&gt;','"':'&quot;',"'":'&#39;'}[c]));
  // Recorded mints, parsed from the text of #km-replays-data; any parse failure yields {}.
  let REPLAYS={};
  try{REPLAYS=JSON.parse(($('#km-replays-data')||{}).textContent||'{}');}catch(e){REPLAYS={};}
  // Display names for the known replay keys (long label and short chip label).
  const REPLAY_META={
    rmsnorm_gelu:{label:'RMSNorm to GELU',short:'RMSNorm GELU'},
    softmax:{label:'Softmax',short:'Softmax'},
    add_layernorm_silu:{label:'Residual plus LayerNorm to SiLU',short:'Residual LayerNorm SiLU'}
  };
  // Replay keys in presentation order, limited to those actually present in REPLAYS.
  const REPLAY_ORDER=['rmsnorm_gelu','softmax','add_layernorm_silu'].filter(k=>REPLAYS[k]);

  // UI state: active tab, composer mode ('build' or 'classic'), the four build slots, and the
  // selected classic operation.
  const state={tab:'build',mode:'build',slots:{mem:false,norm:'rmsnorm',act:'gelu',act2:null},classic:'softmax'};
  // Attempt counts per tab, the last resolved expert recipe, and the mode of the last mint.
  let buildTries=4, expertTries=5, resolvedExpert=null, lastMintedMode='build';
  // In-flight mint bookkeeping (nonce + flag) and leaderboard refresh/poll timer handles.
  let activeMintNonce=null, awaitingMint=false, pendingLBRefresh=false, lbRefreshTimer=null, lbPollTimer=null, lbUiTimer=null;
  const HIST=[];

  // Start a Web Animations API animation on `el` when the element exists and supports
  // element.animate(); otherwise do nothing.
  function animate(el,frames,opts){
    if(!el||!el.animate)return;
    el.animate(frames,opts);
  }
  // Human-readable name for a recipe object: the classic op label in classic mode, otherwise
  // "residual add + <norm> + <activations...>" joined with ' + '. Unknown keys fall back to the raw key.
  function labelRecipe(r){
    if(!r)return 'unknown';
    if(r.mode==='classic'){
      const known=CLASSICS[r.classic];
      return (known&&known.lbl)||r.classic||'classic op';
    }
    const parts=r.residual?['residual add']:[];
    parts.push(NORMS[r.norm]?.lbl||r.norm);
    (r.acts||[]).forEach(key=>{
      parts.push(ACTS[key]?.lbl||key);
    });
    return parts.join(' + ');
  }
  // Default info-line text: the classic op's help text in classic mode, otherwise a sentence
  // listing the currently selected build slots.
  function machineInfo(){
    if(state.mode==='classic')return CLASSICS[state.classic].info;
    const {mem,norm,act,act2}=state.slots;
    const parts=[mem?'add input':'no residual add',NORMS[norm].lbl,ACTS[act].lbl];
    if(act2)parts.push(ACTS[act2].lbl);
    return 'Current operation: '+parts.join(' + ')+'. The model must fuse it into one verified kernel.';
  }
  // Show `t` (or the default machineInfo() text when `t` is falsy) in #km-info with a short fade-in.
  function setInfo(t){
    const box=$('#km-info');
    if(box){
      box.textContent=cleanText(t||machineInfo());
      animate(box,[{opacity:.45},{opacity:1}],{duration:180,easing:'cubic-bezier(0.16,1,0.3,1)'});
    }
  }

  // Size and slide the tab highlight pill so it sits under the currently active tab button.
  function positionTabPill(){
    const strip=$('#km-tabs'), pill=$('#km-tab-pill'), active=$('.km-tab.on');
    if(!(strip&&pill&&active))return;
    const stripBox=strip.getBoundingClientRect(), activeBox=active.getBoundingClientRect();
    const offset=activeBox.left-stripBox.left-4;
    pill.style.width=activeBox.width+'px';
    pill.style.transform='translateX('+offset+'px)';
  }
  // Switch the visible page: mark the matching tab button, hide every other page, reposition
  // the pill on the next frame, resolve the expert brief on the expert tab, and run leaderboard
  // refresh + polling only while the leaderboard tab is open.
  function showTab(tab){
    state.tab=tab;
    for(const btn of $$('.km-tab'))btn.classList.toggle('on',btn.dataset.tab===tab);
    for(const page of $$('.km-page'))page.hidden=page.dataset.page!==tab;
    requestAnimationFrame(positionTabPill);
    if(tab==='expert')resolveExpert();
    const onBoard=tab==='lb';
    if(onBoard)refreshLB(true);
    setLBPolling(onBoard);
  }
  // Wire the tab buttons, and keep the pill aligned when the window is resized.
  for(const btn of $$('.km-tab'))btn.addEventListener('click',()=>showTab(btn.dataset.tab));
  window.addEventListener('resize',positionTabPill);

  // Help text for one option (`key`) of a composer slot; '' for an unknown slot.
  function optionInfo(slot,key){
    switch(slot){
      case 'mem':
        return key==='add'?MEM.info:'No residual add. The kernel starts from the input row and does not add a skip value.';
      case 'norm':
        return NORMS[key].info;
      case 'act':
      case 'act2':
        return key==='none'?'No second activation. The pipeline stops after the first activation.':ACTS[key].info;
      default:
        return '';
    }
  }
  // Button label for one option (`key`) of a composer slot; the raw key for an unknown slot.
  function optionLabel(slot,key){
    switch(slot){
      case 'mem':
        return key==='add'?'add input':'none';
      case 'norm':
        return NORMS[key].lbl;
      case 'act':
      case 'act2':
        return key==='none'?'none':ACTS[key].lbl;
      default:
        return key;
    }
  }
  // Option key currently selected for a slot, mapping the boolean `mem` and nullable `act2`
  // state onto their 'add' / 'none' button keys.
  function selectedKey(slot){
    switch(slot){
      case 'mem':
        return state.slots.mem?'add':'none';
      case 'act2':
        return state.slots.act2||'none';
      default:
        return state.slots[slot];
    }
  }
  // Store a slot choice, refresh the highlighted button in that slot's picker, optionally
  // update the info line (skipped when `quiet` is truthy), and re-render the preview.
  function setSlot(slot,key,quiet){
    if(slot==='mem'){
      state.slots.mem=(key==='add');
    }else if(slot==='act2'){
      state.slots.act2=(key==='none')?null:key;
    }else{
      state.slots[slot]=key;
    }
    const picker=root.querySelector('.km-pick[data-slot="'+slot+'"]');
    if(picker){
      const chosen=selectedKey(slot);
      picker.querySelectorAll('.km-opt').forEach(btn=>btn.classList.toggle('on',btn.dataset.key===chosen));
    }
    if(!quiet)setInfo(optionInfo(slot,key));
    render();
  }
  // Rebuild the option buttons of every `.km-pick[data-slot]` container. Each button shows its
  // help text on hover/focus and commits the choice on click.
  function renderPickers(){
    const keysFor=slot=>{
      if(slot==='act2')return ['none'].concat(Object.keys(ACTS));
      if(slot==='act')return Object.keys(ACTS);
      if(slot==='norm')return Object.keys(NORMS);
      if(slot==='mem')return ['none','add'];
      return [];
    };
    for(const wrap of $$('.km-pick[data-slot]')){
      const slot=wrap.dataset.slot;
      const keys=keysFor(slot);
      wrap.innerHTML='';
      for(const key of keys){
        const describe=()=>setInfo(optionInfo(slot,key));
        const btn=document.createElement('button');
        btn.type='button';
        btn.className='km-opt';
        btn.dataset.key=key;
        btn.textContent=optionLabel(slot,key);
        btn.title=optionInfo(slot,key);
        btn.classList.toggle('on',key===selectedKey(slot));
        btn.addEventListener('mouseenter',describe);
        btn.addEventListener('focus',describe);
        btn.addEventListener('click',()=>setSlot(slot,key));
        wrap.appendChild(btn);
      }
    }
  }

  // Rebuild the classic-operation buttons inside #km-classics. Hover/focus shows the help
  // text; click selects the operation, moves the highlight, and re-renders the preview.
  function renderClassics(){
    const holder=$('#km-classics');
    if(!holder)return;
    holder.innerHTML='';
    for(const [key,entry] of Object.entries(CLASSICS)){
      const describe=()=>setInfo(entry.info);
      const btn=document.createElement('button');
      btn.type='button';
      btn.className='km-opt';
      btn.dataset.key=key;
      btn.textContent=entry.lbl;
      btn.title=entry.info;
      btn.classList.toggle('on',state.classic===key);
      btn.addEventListener('mouseenter',describe);
      btn.addEventListener('focus',describe);
      btn.addEventListener('click',()=>{
        state.classic=key;
        holder.querySelectorAll('.km-opt').forEach(other=>other.classList.toggle('on',other===btn));
        setInfo(entry.info);
        render();
      });
      holder.appendChild(btn);
    }
  }
  // Switch the composer between 'build' and 'classic': update the segmented control, show the
  // matching panel, reset the info line, and re-render the preview.
  function setComposerMode(mode){
    state.mode=mode;
    for(const seg of $$('.km-segb'))seg.classList.toggle('on',seg.dataset.mode===mode);
    const buildPanel=$('#km-compose'), classicPanel=$('#km-classic');
    if(buildPanel)buildPanel.hidden=(mode!=='build');
    if(classicPanel)classicPanel.hidden=(mode!=='classic');
    setInfo(machineInfo());
    render();
  }
  // Wire the segmented-control buttons to the mode switch.
  for(const seg of $$('.km-segb'))seg.addEventListener('click',()=>setComposerMode(seg.dataset.mode));

  // Build a 1..8 button stepper inside `id`. Selecting a value calls onChange(value), mirrors
  // it into `outId` (when present), and highlights the matching button. `initial` is applied
  // immediately after the buttons are created.
  function setupStepper(id,outId,initial,onChange){
    const wrap=$(id), out=$(outId);
    if(!wrap)return;
    wrap.innerHTML='';
    const choose=value=>{
      onChange(value);
      if(out)out.textContent=String(value);
      wrap.querySelectorAll('button').forEach(btn=>{
        btn.classList.toggle('on',Number(btn.dataset.v)===value);
      });
    };
    Array.from({length:8},(_,idx)=>idx+1).forEach(step=>{
      const btn=document.createElement('button');
      btn.type='button';
      btn.dataset.v=String(step);
      btn.textContent=String(step);
      btn.addEventListener('click',()=>choose(step));
      wrap.appendChild(btn);
    });
    choose(initial);
  }
  // Mirror a slider's value (divided by 100, two decimals) into `outId`, now and on every input event.
  function setupTemp(id,outId){
    const slider=$(id), out=$(outId);
    if(!slider)return;
    const show=()=>{
      if(!out)return;
      out.textContent=(Number(slider.value)/100).toFixed(2);
    };
    slider.addEventListener('input',show);
    show();
  }
  // Keep a Local checkbox and a Pro checkbox mutually exclusive: while Local is checked, Pro is
  // unchecked and disabled; checking Pro unchecks Local.
  function syncModePair(localSel,proSel){
    const local=$(localSel), pro=$(proSel);
    if(!local||!pro)return;
    const apply=()=>{
      if(local.checked)pro.checked=false;
      pro.disabled=local.checked;
    };
    local.addEventListener('change',apply);
    pro.addEventListener('change',()=>{
      if(pro.checked)local.checked=false;
      apply();
    });
    apply();
  }

  // Preview plot: target element, drawing-box geometry, and coordinate mappers.
  const svg=$('#km-svg');
  const W=320, H=150, PAD=12;
  const n=INPUT.length;
  // Index -> x coordinate, spreading the n points across the padded width.
  const xs=i=>PAD+i*(W-2*PAD)/(n-1);
  // Value -> y coordinate; values are clamped to [-2.2, 2.2] and 0 maps to the vertical centre.
  const ys=v=>{
    const clamped=Math.min(2.2,Math.max(-2.2,v));
    return H/2-clamped*(H/2-PAD)/2.2;
  };
  // Mutable plot state: circle nodes, the polyline, the currently drawn values, and the rAF handle.
  let dots=[], line=null, cur=INPUT.slice(), anim=null;
  // (Re)create the plot's SVG nodes: a horizontal midline, the polyline, and one circle per
  // value, then paint the current values.
  function buildSvg(){
    if(!svg)return;
    const NS='http://www.w3.org/2000/svg';
    const make=(tag,attrs)=>{
      const node=document.createElementNS(NS,tag);
      for(const [name,value] of Object.entries(attrs))node.setAttribute(name,value);
      return node;
    };
    svg.innerHTML='';
    svg.appendChild(make('line',{x1:0,x2:W,y1:H/2,y2:H/2,stroke:'#1f2d26','stroke-width':'1'}));
    line=make('polyline',{fill:'none',stroke:'#34e0a1','stroke-width':'2','stroke-linejoin':'round'});
    svg.appendChild(line);
    dots=cur.map((_,i)=>svg.appendChild(make('circle',{r:'3.4',cx:xs(i)})));
    paint(cur);
  }
  // Draw the values `v`: update the polyline points and move/colour each dot (green for
  // values >= 0, red for negative values).
  function paint(v){
    if(!line)return;
    const points=v.map((value,i)=>xs(i)+','+ys(value));
    line.setAttribute('points',points.join(' '));
    v.forEach((value,i)=>{
      const dot=dots[i];
      if(!dot)return;
      dot.setAttribute('cy',ys(value));
      dot.setAttribute('fill',value>=0?'#34e0a1':'#ff7a7a');
    });
  }
  // Cubic ease-out: 0 at t=0, 1 at t=1, decelerating towards the end.
  const ease=t=>1-(1-t)**3;
  // Animate the plot from the currently drawn values to `to` over `ms` milliseconds using the
  // eased progress; any animation already in flight is cancelled first.
  function morph(to,ms=480){
    if(!line)return;
    const start=cur.slice(), began=performance.now();
    if(anim)cancelAnimationFrame(anim);
    const tick=now=>{
      const progress=Math.min(1,(now-began)/ms);
      const eased=ease(progress);
      cur=start.map((from,i)=>from+(to[i]-from)*eased);
      paint(cur);
      if(progress<1){
        anim=requestAnimationFrame(tick);
      }else{
        // Finished: snap to an exact copy of the target values.
        cur=to.slice();
      }
    };
    anim=requestAnimationFrame(tick);
  }
  // Preview-only normalisation of the array `a`. 'layernorm' subtracts the mean and divides by
  // sqrt(variance + 1e-5); any other kind divides by sqrt(mean square + 1e-6).
  function normalize(a,kind){
    const count=a.length;
    const mean=a.reduce((acc,x)=>acc+x,0)/count;
    if(kind!=='layernorm'){
      const rms=Math.sqrt(a.reduce((acc,x)=>acc+x*x,0)/count+1e-6);
      return a.map(x=>x/rms);
    }
    const variance=a.reduce((acc,x)=>acc+(x-mean)*(x-mean),0)/count;
    const spread=Math.sqrt(variance+1e-5);
    return a.map(x=>(x-mean)/spread);
  }
  // Preview values for a classic operation applied to INPUT. The softmax variants are rescaled
  // (x4, -0.5) for the plot, the norms reuse normalize(), and the two gated ops multiply the
  // activation by a synthetic gate. Unknown keys return a copy of INPUT.
  function classicViz(key){
    const base=INPUT.slice();
    if(key.startsWith('softmax')){
      const scale=key==='softmax_scale'?0.7:1;
      const scaled=base.map(x=>x*scale);
      const peak=Math.max(...scaled);
      const exps=scaled.map(x=>Math.exp(x-peak));
      const total=exps.reduce((a,b)=>a+b,0);
      return exps.map(x=>x/total*4-0.5);
    }
    if(key==='rmsnorm'||key==='layernorm')return normalize(base,key);
    if(key==='swiglu')return base.map(x=>(x/(1+Math.exp(-x)))*(0.7+Math.abs(x)*0.3));
    if(key==='geglu')return base.map(x=>ACTS.gelu.f(x)*(0.7+Math.abs(x)*0.3));
    return base;
  }
  // List of preview stages ({l: caption, v: values}) for the current selection, starting with
  // the raw input. Classic mode has two stages; build mode adds one per enabled slot.
  function pipeline(){
    if(state.mode==='classic'){
      return [
        {l:'input',v:INPUT.slice()},
        {l:CLASSICS[state.classic].lbl,v:classicViz(state.classic)}
      ];
    }
    const {mem,norm,act,act2}=state.slots;
    const stages=[{l:'input',v:INPUT.slice()}];
    let row=INPUT.slice();
    if(mem){
      // Synthetic residual: half of the input value three positions further along the row.
      row=row.map((x,i)=>x+INPUT[(i+3)%n]*0.5);
      stages.push({l:'residual add',v:row.slice()});
    }
    row=normalize(row,norm);
    stages.push({l:NORMS[norm].lbl,v:row.slice()});
    const first=ACTS[act];
    row=row.map(first.f);
    stages.push({l:first.lbl+' '+first.verb,v:row.slice()});
    if(act2){
      const second=ACTS[act2];
      row=row.map(second.f);
      stages.push({l:second.lbl+' second pass',v:row.slice()});
    }
    return stages;
  }
  // Guard so only one stage-by-stage playback runs at a time.
  let playing=false;
  // Step through the pipeline stages, one every 680 ms, updating the stage caption and
  // morphing the plot to each stage's values.
  function play(){
    if(playing)return;
    playing=true;
    const stages=pipeline();
    let idx=0;
    const caption=$('#km-stage-label');
    const advance=()=>{
      if(idx>=stages.length){
        playing=false;
        return;
      }
      const current=stages[idx];
      if(caption){
        caption.textContent=cleanText(current.l);
        animate(caption,[{opacity:0,transform:'translateY(4px)'},{opacity:1,transform:'none'}],{duration:200,easing:'cubic-bezier(0.16,1,0.3,1)'});
      }
      morph(current.v);
      idx+=1;
      setTimeout(advance,680);
    };
    advance();
  }
  // Jump the preview straight to the final pipeline stage and refresh the replay offer.
  function render(){
    const stages=pipeline(), caption=$('#km-stage-label');
    const last=stages[stages.length-1];
    if(caption)caption.textContent=cleanText(last.l);
    morph(last.v,360);
    updateReplayOffer();
  }

  // Recipe object for the Build tab, assembled from the composer state and the Build tab's
  // controls. Local takes precedence over Pro; temperature is the slider value / 100
  // (70 when the slider is missing).
  function buildRecipe(){
    const tempEl=$('#km-temp'), proEl=$('#km-pro'), localEl=$('#km-local'), nameEl=$('#km-name');
    const local=Boolean(localEl&&localEl.checked);
    const wantsPro=Boolean(proEl&&proEl.checked);
    const acts=[state.slots.act];
    if(state.slots.act2)acts.push(state.slots.act2);
    const recipe={
      mode:state.mode,
      norm:state.slots.norm,
      residual:Boolean(state.slots.mem),
      acts:acts,
      classic:state.classic,
      spark:'',
      k:buildTries,
      temp:Number(tempEl?tempEl.value:70)/100,
      pro:!local&&wantsPro,
      local:local,
      name:nameEl?nameEl.value:''
    };
    return recipe;
  }
  // Recipe object for the Expert tab: a copy of the resolved expert recipe with the Expert
  // tab's attempt count, temperature, Pro/Local flags, the fixed name 'expert', and the brief
  // text as `spark`. Returns null when nothing is resolved or the brief was not recognised.
  function expertRecipe(){
    if(!resolvedExpert||resolvedExpert.unknown)return null;
    const tempEl=$('#km-xtemp'), proEl=$('#km-xpro'), localEl=$('#km-xlocal'), briefEl=$('#km-xbrief');
    const local=Boolean(localEl&&localEl.checked);
    const wantsPro=Boolean(proEl&&proEl.checked);
    return Object.assign({},resolvedExpert.recipe,{
      k:expertTries,
      temp:Number(tempEl?tempEl.value:70)/100,
      pro:!local&&wantsPro,
      local:local,
      name:'expert',
      spark:briefEl?briefEl.value:''
    });
  }
  // Recipe for the active tab; the Expert tab falls back to the Build recipe when no expert
  // recipe is available.
  function currentRecipe(){
    if(state.tab!=='expert')return buildRecipe();
    return expertRecipe()||buildRecipe();
  }
  // Hand a recipe to the Python side: write it as JSON into the `#km_recipe` textarea, fire an
  // 'input' event so the framework sees the new value, then click `#km_go` 80 ms later.
  // Also records the nonce and marks a mint as in flight. Does nothing when the textarea is missing.
  function bridge(recipe){
    const inp=document.querySelector('#km_recipe textarea');
    if(!inp)return;
    // _n is a nonce: it makes the recipe JSON differ on every mint so Gradio always registers a
    // value change and re-runs do_mint. Without it, minting the same recipe twice in a row left
    // the textarea value unchanged and the second submit was dropped (the "needs a double-click"
    // bug). The backend echoes it back so stale blobs from the previous run cannot repaint the old
    // result over the new loading card while Gradio is preparing the next response.
    const nonce=Date.now().toString(36)+'-'+Math.random().toString(36).slice(2);
    const payload=Object.assign({},recipe,{_n:nonce});
    activeMintNonce=nonce;
    awaitingMint=true;
    // Use the prototype's native value setter rather than assigning inp.value directly.
    const set=Object.getOwnPropertyDescriptor(window.HTMLTextAreaElement.prototype,'value').set;
    set.call(inp,JSON.stringify(payload));
    inp.dispatchEvent(new Event('input',{bubbles:true}));
    // NOTE: we intentionally do NOT dispatch 'change' here. The leaderboard fetch is triggered
    // separately via refreshLB() so it never competes with do_mint in Gradio's queue.
    setTimeout(()=>{
      // The clickable element is either a nested <button> or the #km_go element itself.
      const g=document.querySelector('#km_go button')||document.querySelector('#km_go');
      if(g)g.click();
    },80);
  }
  // Show a leaderboard status message; `kind` 'bad' or 'warn' selects the matching style class
  // and any other value clears both.
  function setLBStatus(text,kind){
    const status=$('#km-lb-status');
    if(!status)return;
    status.textContent=text;
    for(const tone of ['bad','warn'])status.classList.toggle(tone,kind===tone);
  }
  // Put the leaderboard Refresh button into (on) or out of (off) its busy state.
  function setLBButton(on){
    const button=$('#km-lb-refresh');
    if(button){
      button.disabled=on;
      button.textContent=on?'Refreshing...':'Refresh';
    }
  }
  // Fallback leaderboard load straight from the browser: fetch every URL in LB_URLS, merge the
  // rows (de-duplicated on op/name/ts/speedups), and render them. Per-URL failures are ignored;
  // the call rejects only when no URL produced any rows.
  async function refreshLBDirect(){
    const merged=[], seenKeys=new Set(), sources=[];
    const rowKey=row=>[row.op,row.name,row.ts,row.speedup_compile,row.speedup_eager].join('|');
    for(const url of LB_URLS){
      try{
        const res=await fetch(url,{cache:'no-store'});
        if(!res.ok)continue;
        const data=await res.json();
        const got=Array.isArray(data.rows)?data.rows:[];
        if(got.length)sources.push(url);
        got.forEach(row=>{
          const key=rowKey(row);
          if(!seenKeys.has(key)){
            seenKeys.add(key);
            merged.push(row);
          }
        });
      }catch(e){}
    }
    if(sources.length===0)throw new Error('direct leaderboard fetch failed');
    renderLB(merged,{fetched_at:Date.now()/1000,sources});
  }
  // Ask for fresh leaderboard data by clicking the `#km_lb_btn` bridge button. While a mint is
  // in flight the request is only queued. When the refresh is user-visible (manual, or the
  // leaderboard tab is open) the UI shows a busy state and arms an 8 s fallback that tries the
  // direct browser fetch.
  function refreshLB(manual=false){
    if(awaitingMint){
      pendingLBRefresh=true;
      if(manual)setLBStatus('Queued until the current mint finishes.','warn');
      return;
    }
    const visible=manual||state.tab==='lb';
    if(visible){
      setLBButton(true);
      setLBStatus('Checking leaderboard...','');
      if(lbUiTimer)clearTimeout(lbUiTimer);
      const onDirectFailure=()=>{
        setLBButton(false);
        setLBStatus('Still waiting for the Gradio refresh bridge. Direct browser fetch was blocked. Try once more or reopen the tab.','warn');
      };
      lbUiTimer=setTimeout(()=>{
        refreshLBDirect().catch(onDirectFailure);
      },8000);
    }
    const trigger=document.querySelector('#km_lb_btn button')||document.querySelector('#km_lb_btn');
    if(!trigger){
      setLBButton(false);
      setLBStatus('Refresh bridge is not mounted yet. Reopen the Leaderboard tab.','bad');
      return;
    }
    trigger.click();
  }
  // Clear the queued-refresh flag and run a non-manual leaderboard refresh after `delay` ms,
  // replacing any refresh that was already scheduled.
  function scheduleLBRefresh(delay=700){
    pendingLBRefresh=false;
    if(lbRefreshTimer){
      clearTimeout(lbRefreshTimer);
    }
    const fire=()=>refreshLB(false);
    lbRefreshTimer=setTimeout(fire,delay);
  }
  // Start or stop the 15 s leaderboard poll. Any existing interval is always cleared first; the
  // poll only refreshes while the leaderboard tab is the active tab.
  function setLBPolling(on){
    if(lbPollTimer){
      clearInterval(lbPollTimer);
      lbPollTimer=null;
    }
    if(!on)return;
    lbPollTimer=setInterval(()=>{
      if(state.tab==='lb')refreshLB(false);
    },15000);
  }
  // Handle of the pending safety timeout (null when none is armed).
  let mintSafetyTimer=null;
  // Enable or disable both Mint buttons (#km-build, #km-xbuild) and toggle their 'minting' class.
  // Turning minting on arms a 4-minute safety timeout; any call clears a previously armed one.
  function setMinting(on){
    ['#km-build','#km-xbuild'].forEach(id=>{const b=$(id); if(b){b.disabled=on; b.classList.toggle('minting',on);}});
    if(mintSafetyTimer){clearTimeout(mintSafetyTimer); mintSafetyTimer=null;}
    // Bulletproof re-enable: showResult() clears this, but if a result never arrives (hung backend)
    // the button must not stay dead. 4 min is longer than any real mint.
    if(on){
      mintSafetyTimer=setTimeout(()=>{
        // Giving up on the hung mint must also release the in-flight state. Otherwise
        // awaitingMint stays true and every refreshLB() call is parked as "queued" until some
        // later mint happens to finish.
        awaitingMint=false;
        activeMintNonce=null;
        setMinting(false);
        // Run the leaderboard refresh that was deferred while the mint was pending.
        if(pendingLBRefresh)scheduleLBRefresh();
      },240000);
    }
  }

  // Display name for a replay key: the configured label, else the recorded op name (cleaned),
  // else the key itself.
  function replayName(key){
    const meta=REPLAY_META[key], recorded=REPLAYS[key];
    return (meta&&meta.label)||(recorded&&cleanText(recorded.op))||key;
  }
  // Key of the recorded replay that matches the current selection exactly, or null. Only three
  // selections have recordings: classic softmax, RMSNorm+GELU, and residual+LayerNorm+SiLU
  // (the latter two without a second activation), and only when the recording is present.
  function matchingReplayKey(){
    const {mode,classic,slots}=state;
    if(mode==='classic'&&classic==='softmax'&&REPLAYS.softmax)return 'softmax';
    if(mode!=='build'||slots.act2)return null;
    if(!slots.mem&&slots.norm==='rmsnorm'&&slots.act==='gelu'&&REPLAYS.rmsnorm_gelu)return 'rmsnorm_gelu';
    if(slots.mem&&slots.norm==='layernorm'&&slots.act==='silu'&&REPLAYS.add_layernorm_silu)return 'add_layernorm_silu';
    return null;
  }
  // Point the replay button at the recording matching the current selection (or the first
  // available one), adjust its caption accordingly, and highlight the matching replay chip.
  function updateReplayOffer(){
    const button=$('#km-replay');
    if(!button||!REPLAY_ORDER.length)return;
    const matched=matchingReplayKey();
    const key=matched||REPLAY_ORDER[0];
    button.dataset.replayKey=key;
    button.textContent=matched?'Watch this recorded mint':'Watch a recorded mint';
    for(const chip of $$('.km-rpick'))chip.classList.toggle('on',chip.dataset.replayKey===key);
  }
  // Render one chip per available recording inside #km-replay-picks (hidden when there are
  // none), each showing the short name and the recorded speedup vs torch.compile.
  function renderReplayChoices(){
    const wrap=$('#km-replay-picks');
    if(!wrap)return;
    const none=!REPLAY_ORDER.length;
    wrap.hidden=none;
    if(none)return;
    const chip=key=>{
      const meta=REPLAY_META[key];
      const caption=(meta&&meta.short)||replayName(key);
      return "<button class='km-rpick' data-replay-key='"+esc(key)+"'>"+esc(caption)+" <span>"+sp3(REPLAYS[key].speedup_compile)+"x</span></button>";
    };
    wrap.innerHTML=REPLAY_ORDER.map(chip).join('');
    updateReplayOffer();
  }

  // Number formatters for speedups and timings. Each returns '?' when the value is missing or
  // not a finite number. null, undefined and '' are rejected explicitly because unary plus
  // coerces null and '' to 0, which would otherwise print a missing measurement as "0".
  // sp: one decimal, with a trailing ".0" dropped.
  const sp=x=>(x!=null&&x!==''&&Number.isFinite(+x))?(+x).toFixed(1).replace(/\.0$/,''):'?';
  // sp2: always two decimals.
  const sp2=x=>(x!=null&&x!==''&&Number.isFinite(+x))?(+x).toFixed(2):'?';
  // sp3: up to three decimals, trailing zeros (and a bare trailing dot) removed.
  const sp3=x=>(x!=null&&x!==''&&Number.isFinite(+x))?(+x).toFixed(3).replace(/0+$/,'').replace(/\.$/,''):'?';
  // True when a status string is 'ok' or 'pass', compared case-insensitively.
  const isOk=s=>{
    const status=String(s||'').toLowerCase();
    return status==='ok'||status==='pass';
  };
  // Summarise a mint result's attempts as {statuses, passed, total}. With a statuses array the
  // counts come from it; without one they fall back to d.n_verified and d.k (0 when absent).
  function attemptSummary(d){
    const statuses=Array.isArray(d.statuses)?d.statuses:[];
    let passed, total;
    if(statuses.length){
      passed=statuses.filter(isOk).length;
      total=statuses.length;
    }else{
      passed=Number.isFinite(+d.n_verified)?+d.n_verified:0;
      total=Number.isFinite(+d.k)?+d.k:(passed||0);
    }
    return {statuses,passed,total};
  }
  // HTML for the per-attempt status chips (at most `limit` of them when given); '' when the
  // result carries no statuses. Status text is escaped before interpolation.
  function attemptChips(d,limit){
    const all=attemptSummary(d).statuses;
    const shown=all.slice(0,limit??all.length);
    if(!shown.length)return '';
    const chips=shown.map((status,idx)=>{
      const passed=isOk(status);
      const tone=passed?'ok':'bad';
      const badge=passed?'OK':'FAIL';
      return "<span class='km-attempt "+tone+"'><b>"+badge+"</b> attempt "+(idx+1)+", "+esc(status)+"</span>";
    });
    return "<div class='km-attempts'>"+chips.join('')+"</div>";
  }
  // Result elements (verdict card, source disclosure, code node) for the Expert tab when
  // mode is 'expert', otherwise for the Build tab.
  function target(mode){
    const expert=mode==='expert';
    return {
      verdict:$(expert?'#km-xverdict':'#km-verdict'),
      source:$(expert?'#km-xsource':'#km-source'),
      code:$(expert?'#km-xsource .km-code':'#km-code')
    };
  }
  // Hide the kernel-source disclosure for the given tab and empty its code node.
  function clearSource(mode){
    const {source,code}=target(mode);
    if(source)source.hidden=true;
    if(code)code.textContent='';
  }
  // Show the kernel source of payload `d` in the collapsed disclosure for `mode`.
  // `kind` ('recorded' or anything else) picks the summary caption; a payload without source
  // clears the panel instead.
  function setSource(d,kind,mode){
    const {source:panel,code:codeEl}=target(mode);
    if(!panel||!codeEl)return;
    if(!d.source){
      clearSource(mode);
      return;
    }
    panel.hidden=false;
    panel.open=false;
    const heading=panel.querySelector('summary');
    if(heading){
      heading.textContent=(kind==='recorded')?'Recorded Triton kernel source':'Live Triton kernel source';
    }
    codeEl.textContent=d.source;
  }
  // Heuristic: does this backend error message look like a cold-start / transport problem
  // (as opposed to a real mint failure)? Matching is case-insensitive on a keyword list.
  function wakingError(msg){
    const text=String(msg||'');
    const wakeWords=/waking|wake|timeout|timed out|still|unexpected|http|modal|loading/i;
    return wakeWords.test(text);
  }
  // Render the outcome of a mint into the verdict panel of the active page.
  //   d    - result payload: error | verified, statuses / k / n_verified, speedup_* numbers,
  //          gen_seconds, op, explanation, source, local, _n (nonce echoed back by the backend).
  //   opts - {recorded, mode}: recorded marks a replayed run; mode overrides lastMintedMode.
  // Three cards are possible: error (with a replay offer), not verified, and verified.
  // Only a verified live build-page result is appended to the session history.
  function showResult(d,opts={}){
    // While a live mint is pending, ignore any payload that does not echo its nonce.
    if(!opts.recorded&&awaitingMint&&String(d&&d._n||'')!==String(activeMintNonce||'')){
      return;
    }
    if(!opts.recorded&&awaitingMint){
      awaitingMint=false;
      activeMintNonce=null;
    }
    const mode=opts.mode||lastMintedMode||'build';
    stopBuild();
    stopReplay();
    if(!opts.recorded) setMinting(false);  // result arrived -> re-enable Mint (replays don't disable it)
    const t=target(mode), v=t.verdict;
    if(!v)return;
    clearSource(mode);
    if(d.error){
      const local=!!d.local;
      // Remote errors that look like cold start get the friendlier "still waking" copy.
      const wake=!local&&wakingError(d.error);
      const replayKey=matchingReplayKey()||REPLAY_ORDER[0]||'';
      const replayBtn=replayKey?"<button class='km-inline-replay' data-replay-key='"+esc(replayKey)+"'>Watch a recorded mint while it wakes</button>":"";
      v.innerHTML="<div class='km-card err'>"+
        "<div class='km-result-top'><div class='km-verdict-icon bad'>!</div><div><span class='km-run-label'>"+(local?'local offline mint':'live mint')+"</span><b>"+(wake?'The live model is still waking':(local?'The local mint did not finish':'The live mint did not finish'))+"</b></div></div>"+
        "<p class='km-dim'>"+(wake?'The backend scales to zero, so the first live mint can take about 90 seconds. The replay below is a real earlier mint, not a mockup.':(local?esc(d.error):'Backend said: '+esc(d.error)))+"</p>"+
        (wake?replayBtn+"<p class='km-rawerr'>Backend said: "+esc(d.error)+"</p>":replayBtn)+"</div>";
      if(pendingLBRefresh)scheduleLBRefresh();
      return;
    }
    const a=attemptSummary(d);
    if(!d.verified){
      v.innerHTML="<div class='km-card miss'>"+
        "<div class='km-result-top'><div class='km-verdict-icon bad'>NO</div><div><span class='km-run-label'>referee verdict</span><b>Not verified yet</b></div></div>"+
        "<div class='km-ref-summary'><b>"+a.passed+" of "+(a.total||'?')+"</b> attempts passed the referee</div>"+
        attemptChips(d)+"<p class='km-dim'>"+esc(d.explanation||'Press Mint again, raise attempts, or change the operation.')+"</p></div>";
      if(pendingLBRefresh)scheduleLBRefresh();
      return;
    }
    const recorded=!!opts.recorded, local=!!d.local;
    // Headline the HONEST baseline (the compiler), not the inflated vs-eager fusion number.
    // Strongest available wins: max-autotune > torch.compile default > eager.
    const se=+d.speedup_eager, sc=+d.speedup_compile, sm=+d.speedup_maxauto;
    const hasC=Number.isFinite(sc)&&sc>0, hasM=Number.isFinite(sm)&&sm>0;
    const heroSpeed=hasM?sm:(hasC?sc:se);
    // The history row reuses the hero number, so its caption must say which baseline that is:
    // without any compiler figure the number is the eager speedup, not a compiler comparison.
    const histLabel=(hasM||hasC)?'vs compiler':'vs PyTorch eager';
    const missingLocalCompiler=local&&!recorded&&!hasC&&!hasM;
    const compilerGuard=missingLocalCompiler
      ? "<div class='km-baseline-guard'>Compiler baselines did not return for this Local run. Re-mint; a certified Local result should include torch.compile and max-autotune next to eager.</div>"
      : "";
    const runLabel=local?'local offline mint':(recorded?'recorded mint':'live mint');
    const noteText=recorded?'recorded backend run':(local?'local offline run':'live backend run');
    const note="<span class='km-pill-note "+(recorded?'recorded':(local?'local':'hit'))+"'>"+noteText+"</span>";
    const board=d.beat_champion?"<span class='km-crown'>27B champion beaten</span>":d.on_leaderboard?"<span class='km-board'>leaderboard result</span>":"";
    v.innerHTML="<div class='km-card ok km-result-card'>"+note+board+
      "<div class='km-result-top'><div class='km-verdict-icon'>OK</div><div><span class='km-run-label'>"+runLabel+"</span><b>Verified by the referee</b><span>"+esc(d.op||'kernel')+"</span></div></div>"+
      "<div class='km-ref-summary'><b>"+a.passed+" of "+(a.total||'?')+"</b> attempts passed the referee</div>"+
      attemptChips(d)+
      compilerGuard+
      // Every comparison as its own big number, strongest (most honest) baseline first, eager last
      // and labelled 'unfused' so the side-by-side is self-explanatory: the model's real edge is
      // the ~1.1x over the compiler; the big eager number is mostly the fusion win.
      "<div class='km-stats km-cmp'>"+
      (hasM?"<div class='km-cmp-key'><big>"+sp3(sm)+"x</big><span>vs torch.compile max-autotune</span></div>":"")+
      (hasC?"<div"+(hasM?"":" class='km-cmp-key'")+"><big>"+sp3(sc)+"x</big><span>vs torch.compile</span></div>":"")+
      "<div><big>"+sp(se)+"x</big><span>vs PyTorch eager (unfused)</span></div>"+
      "<div><big>"+(Number.isFinite(+d.gen_seconds)?sp(d.gen_seconds)+'s':'?')+"</big><span>generation</span></div></div>"+
      "<p class='km-dim'>"+esc(d.explanation||'')+"</p></div>";
    animate(v.firstElementChild,[{transform:'scale(0.96)',opacity:0},{transform:'scale(1)',opacity:1}],{duration:300,easing:'cubic-bezier(0.16,1,0.3,1)'});
    setSource(d,recorded?'recorded':'live',mode);
    if(!recorded&&mode==='build'){
      HIST.unshift("<code>"+esc(d.op||'kernel')+"</code><span>"+sp(heroSpeed)+"x "+histLabel+"</span>");
      const hist=$('#km-hist');
      if(hist)hist.innerHTML=HIST.slice(0,10).map(h=>"<div class='km-hrow'>"+h+"</div>").join('');
    }
    // The leaderboard refresh is deliberately decoupled from the mint call. Once the result has
    // arrived, the queue is free again, so refresh the board without hiding the just-rendered result.
    if(!recorded)scheduleLBRefresh(900);
  }

  // Handle of the pending replay step (a setTimeout id), or null when no replay is running.
  let replayTimer=null;
  // Cancel the pending replay step, if any.
  function stopReplay(){
    if(!replayTimer)return;
    clearTimeout(replayTimer);
    replayTimer=null;
  }
  // Play back the recorded mint stored under `key`: show a "recorded mint" card on the current
  // tab, reveal one attempt chip per timer step, then hand the record to showResult as a
  // recorded result. Does nothing when no record exists for the key.
  function startReplay(key){
    const rec=REPLAYS[key];
    if(!rec)return;
    lastMintedMode=state.tab==='expert'?'expert':'build';
    stopBuild();
    stopReplay();
    play();
    const panel=target(lastMintedMode).verdict;
    if(!panel)return;
    clearSource(lastMintedMode);
    const statuses=Array.isArray(rec.statuses)?rec.statuses:[];
    const total=statuses.length||(Number.isFinite(+rec.k)?+rec.k:4);
    const passed=statuses.length?statuses.filter(isOk).length:(Number.isFinite(+rec.n_verified)?+rec.n_verified:0);
    const tookText=Number.isFinite(+rec.gen_seconds)?sp(rec.gen_seconds)+'s':'real time';
    panel.innerHTML="<div class='km-card building replaying'>"+
      "<div class='km-result-top'><div class='km-verdict-icon'>RUN</div><div><span class='km-run-label recorded'>recorded mint</span><b>"+esc(replayName(key))+"</b><span>This is a genuine earlier backend run. Mint still starts a live run.</span></div></div>"+
      "<div class='km-replay-stage' id='km-replay-stage'>loading recorded attempts...</div>"+
      "<div class='km-attempts' id='km-replay-attempts'></div>"+
      "<div class='km-bar'><i id='km-replay-fill'></i></div>"+
      "<p class='km-dim'>The model took "+tookText+" on this recorded run. The replay shows the referee loop without waiting for cold start.</p></div>";
    const chipsEl=$('#km-replay-attempts'), stageEl=$('#km-replay-stage'), fillEl=$('#km-replay-fill');
    let shown=0;
    // Final hop: render the recorded payload as a normal (recorded) result card.
    const finish=()=>showResult(rec,{recorded:true,mode:lastMintedMode,replayKey:key});
    const step=()=>{
      if(shown>=total){
        if(stageEl)stageEl.textContent='best verified kernel selected';
        if(fillEl)fillEl.style.width='100%';
        replayTimer=setTimeout(finish,420);
        return;
      }
      // Without a recorded status, the first `passed` attempts are shown as ok.
      const status=statuses[shown]||(shown<passed?'ok':'not verified'), ok=isOk(status);
      if(stageEl)stageEl.textContent='referee verdict for attempt '+(shown+1)+' of '+total;
      if(chipsEl)chipsEl.insertAdjacentHTML('beforeend',"<span class='km-attempt "+(ok?'ok':'bad')+"'><b>"+(ok?'OK':'FAIL')+"</b> attempt "+(shown+1)+", "+esc(status)+"</span>");
      // total+1 leaves the last slice of the bar for the final "selected" step.
      if(fillEl)fillEl.style.width=Math.round(((shown+1)/(total+1))*100)+'%';
      shown++;
      replayTimer=setTimeout(step,360);
    };
    replayTimer=setTimeout(step,180);
  }

  // Handle of the running build-progress interval (setInterval id), or null when idle.
  let buildTimer=null;
  // Show the "building" card in the verdict panel for `mode` and start the progress ticker.
  //   mode   - 'build' or 'expert'; selects the panel via target(mode).
  //   recipe - the recipe being minted; only recipe.k (attempt count, default 4) and
  //            recipe.local (local offline mint) are read here.
  // The ticker runs until stopBuild() clears buildTimer.
  function startBuild(mode,recipe){
    stopReplay();
    const t=target(mode), v=t.verdict;
    if(!v)return;
    clearSource(mode);
    // On the build page prefer the replay that matches the current recipe; otherwise offer the first one.
    const k=recipe.k||4, started=Date.now(), replayKey=mode==='build'?(matchingReplayKey()||REPLAY_ORDER[0]||''):(REPLAY_ORDER[0]||'');
    const local=!!recipe.local;
    // Five phase labels per mode; the local list advertises at most two drafts (Math.min(k,2)).
    const stages=local?['loading local GGUF...','llama.cpp drafting '+Math.min(k,2)+' kernels...','compiling Triton on this Space GPU...','checking correctness vs PyTorch...','timing vs torch.compile max-autotune...']:
      ['model waking...','drafting '+k+' kernels...','compiling Triton...','checking correctness vs PyTorch...','timing vs torch.compile max-autotune...'];
    const stepHtml=stages.map((s,i)=>"<span class='"+(i===0?'on':'')+"'>"+esc(s)+"</span>").join('');
    v.innerHTML="<div class='km-card building'><b class='km-bstage' id='km-bstage'>"+esc(stages[0])+"</b>"+
      "<div class='km-build-steps' id='km-build-steps'>"+stepHtml+"</div>"+
      "<div class='km-bar'><i id='km-barfill'></i></div>"+
      "<span class='km-dim'>"+(local?"Local mode drafts up to two kernels with llama.cpp on this Space's GPU, then verifies them with the in-process referee and times them against PyTorch eager, torch.compile, and max-autotune. First run may also download/cache the GGUF.":"The model drafts up to "+k+" kernels. The referee compiles, correctness-checks, and times every candidate. First build of a session can take about 90 seconds while the model wakes.")+"</span>"+
      (replayKey?"<button class='km-inline-replay' data-replay-key='"+esc(replayKey)+"'>Watch a recorded mint while this runs</button>":"")+"</div>";
    // Honest progress: do_mint is one blocking call with no progress signal, so we cannot show a
    // live readout. Instead we walk the referee's real phases ONCE on a rough estimate, monotonically
    // (the index only ever increases), and park on the final phase until the result arrives. It never
    // loops back or resets the way the old modulo version did. A gentle pulse keeps the viz alive.
    const stageStart=[0,12,18,23,28];  // seconds at which each phase is assumed to begin
    // Snapshot the last pipeline stage's values as the pulse baseline; skipped if pipeline() throws.
    let baseVals=null; try{const st=pipeline();baseVals=st[st.length-1].v.slice();}catch(e){}
    buildTimer=setInterval(()=>{
      const elapsed=(Date.now()-started)/1000;
      // Latest phase whose start time has been reached.
      let ix=0; for(let j=0;j<stageStart.length;j++){if(elapsed>=stageStart[j])ix=j;}
      ix=Math.min(ix,stages.length-1);
      // After 45 s on the final phase, switch to a "still timing" label.
      const label=(ix===stages.length-1&&elapsed>45)?'still timing vs torch.compile max-autotune...':stages[ix];
      const el=$('#km-bstage');
      // Only touch the label (and animate it) when it actually changes.
      if(el&&el.textContent!==label){el.textContent=label;animate(el,[{opacity:.35,transform:'translateY(3px)'},{opacity:1,transform:'none'}],{duration:220,easing:'cubic-bezier(0.16,1,0.3,1)'});}
      $$('#km-build-steps span').forEach((s,j)=>{s.classList.toggle('on',j===ix);s.classList.toggle('done',j<ix);});
      const f=$('#km-barfill');
      // One percent per 900 ms, capped at 94% (reached after about 85 s); stopBuild() sets 100%.
      if(f)f.style.width=Math.min(94,(Date.now()-started)/900)+'%';
      // Scale the baseline values by a factor oscillating between 0.92 and 1.00.
      if(baseVals){const p=0.96+0.04*Math.sin(elapsed*1.5);morph(baseVals.map(x=>x*p),650);}
    },650);
  }
  // Stop the build-progress ticker, snap the progress bar to 100% and re-render.
  function stopBuild(){
    if(buildTimer){
      clearInterval(buildTimer);
      buildTimer=null;
    }
    const bar=$('#km-barfill');
    if(bar){
      bar.style.width='100%';
    }
    render();
  }

  // Render the leaderboard table (top 25 rows) into #km-lb-view and update the status line.
  //   rows - leaderboard entries (op, name, speedup_compile, speedup_eager, champion).
  //   meta - optional {fetched_at (epoch seconds), sources (array)} used for the status text.
  // Also clears the pending UI timer and resets the refresh button.
  function renderLB(rows,meta={}){
    const view=$('#km-lb-view');
    if(!view)return;
    if(lbUiTimer){
      clearTimeout(lbUiTimer);
      lbUiTimer=null;
    }
    setLBButton(false);
    const fetchedAt=Number(meta.fetched_at||0);
    let stamp='just now';
    if(fetchedAt){
      stamp=new Date(fetchedAt*1000).toLocaleTimeString([], {hour:'2-digit', minute:'2-digit', second:'2-digit'});
    }
    const nSources=Array.isArray(meta.sources)?meta.sources.length:0;
    let origin='.';
    if(nSources){
      origin=' from '+nSources+' endpoint'+(nSources>1?'s':'')+'.';
    }
    setLBStatus('Last checked '+stamp+origin,'');
    if(!rows.length){
      view.innerHTML='<i>No kernels yet. Be the first.</i>';
      return;
    }
    const body=rows.slice(0,25).map((r,i)=>"<tr class='"+(r.champion?'champ':'')+"'><td class='km-rank'>"+(i+1)+"</td><td><code>"+esc(r.op||'kernel')+"</code></td><td>"+esc(r.name||'anonymous')+"</td><td class='hot'>"+sp2(r.speedup_compile)+"x</td><td>"+sp(r.speedup_eager)+"x</td></tr>").join('');
    view.innerHTML="<table class='km-lbt'><thead><tr><th>#</th><th>machine</th><th>builder</th><th>vs compiler</th><th>vs PyTorch</th></tr></thead><tbody>"+
      body+"</tbody></table>";
  }
  // Observe the element matching selector `id` and hand every payload it carries to `cb`.
  // A payload is a descendant with a data-blob attribute holding base64-encoded JSON. It is
  // read once immediately and again on every child-list mutation inside the element.
  function watch(id,cb){
    const el=document.querySelector(id);
    if(!el)return;
    const read=()=>{
      const s=el.querySelector('[data-blob]');
      if(!s)return;
      try{
        cb(JSON.parse(atob(s.dataset.blob)));
      }catch(e){
        // Stay best-effort (a bad blob or a render error must not break the observer), but do not
        // hide the failure: a silently swallowed exception looks like "the mint did nothing".
        if(typeof console!=='undefined'&&console.warn)console.warn('km watch: could not handle payload from '+id,e);
      }
    };
    new MutationObserver(read).observe(el,{childList:true,subtree:true});
    read();
  }
  // After a short delay, start watching the two hidden bridge outputs: mint results go to
  // showResult, leaderboard payloads to renderLB.
  setTimeout(()=>{
    watch('#km_result',showResult);
    watch('#km_lb',payload=>renderLB(payload.rows||[],payload));
  },250);
  // Manual leaderboard refresh button.
  const rf=$('#km-lb-refresh');
  if(rf){
    rf.addEventListener('click',()=>refreshLB(true));
  }

  // Phrase tables for the expert brief parser. Each entry maps a recipe key to the phrases that
  // select it; resolveBrief lower-cases the brief before handing it to termMatches with these.
  // CLASSIC_TERMS: named whole operations (resolveBrief uses the first match as the classic key).
  const CLASSIC_TERMS=[
    {key:'softmax_scale',terms:['scaled softmax','scale softmax','attention scale']},
    {key:'swiglu',terms:['swiglu','swi glu']},
    {key:'geglu',terms:['geglu','ge glu']},
    {key:'softmax',terms:['softmax']}
  ];
  // ACT_TERMS: activation names and their spoken variants (resolveBrief keeps at most two matches).
  const ACT_TERMS=[
    {key:'gelu_erf',terms:['gelu_erf','exact gelu','erf gelu']},
    {key:'relu2',terms:['relu2','relu squared','squared relu','square relu']},
    {key:'leaky_relu',terms:['leaky_relu','leaky relu','leaky']},
    {key:'silu',terms:['silu','swish','swishes']},
    {key:'sigmoid',terms:['sigmoid']},
    {key:'gelu',terms:['gelu']},
    {key:'relu',terms:['relu']},
    {key:'tanh',terms:['tanh']},
    {key:'elu',terms:['elu']},
    {key:'mish',terms:['mish']}
  ];
  // Find which items are mentioned in `text` and in what order.
  //   text  - the (already lower-cased) brief to search.
  //   items - [{key, terms:[phrase,...]}]; a phrase matches only on word boundaries, so 'elu'
  //           does not fire inside 'gelu' and 'relu' does not fire inside 'leaky_relu'
  //           (underscores and digits are word characters, keeping relu2 and gelu_erf distinct).
  // Returns [{key, ix, len}] sorted by position, at most one entry per key (its first
  // surviving mention).
  //
  // Every occurrence of every phrase is considered, not just the first one. A mention that
  // sits inside a longer phrase ('relu' inside 'leaky relu', 'softmax' inside 'scaled softmax')
  // is dropped, but a later standalone mention of the same word still counts, so
  // 'leaky relu then relu' yields both leaky_relu and relu.
  function termMatches(text,items){
    const hits=[];
    items.forEach(item=>{
      item.terms.forEach(term=>{
        const re=new RegExp('\\b'+term.replace(/[.*+?^${}()|[\]\\]/g,'\\$&')+'\\b','g');
        let m;
        while((m=re.exec(text))!==null){
          hits.push({key:item.key,ix:m.index,len:term.length});
          // Guard against a zero-length match pinning lastIndex in place.
          if(m[0].length===0)re.lastIndex++;
        }
      });
    });
    // Earliest first; at the same position the longer phrase wins.
    hits.sort((a,b)=>a.ix-b.ix||b.len-a.len);
    // Keep a hit only if it does not overlap a span that was already kept.
    const kept=[];
    hits.forEach(h=>{
      if(!kept.some(k=>h.ix<k.ix+k.len&&k.ix<h.ix+h.len))kept.push(h);
    });
    // Collapse repeated mentions of one key to its first surviving occurrence.
    const seen=new Set();
    return kept.filter(h=>{
      if(seen.has(h.key))return false;
      seen.add(h.key);
      return true;
    });
  }
  // Translate a free-text brief into a mint recipe.
  // Returns {unknown:false,label,recipe} when the brief maps onto something mintable, or
  // {unknown:true,message} when it does not. Resolution order: a named classic op wins
  // outright; otherwise a norm (LayerNorm preferred over RMSNorm), an optional residual and up
  // to two activations are collected from the text.
  function resolveBrief(text){
    const raw=text||'', lower=raw.toLowerCase();
    const named=termMatches(lower,CLASSIC_TERMS);
    if(named.length){
      const key=named[0].key;
      return {unknown:false,label:CLASSICS[key].lbl,recipe:{mode:'classic',norm:'rmsnorm',residual:false,acts:[],classic:key,spark:raw}};
    }
    let normHit=null;
    if(lower.includes('layernorm')||lower.includes('layer norm')){
      normHit='layernorm';
    }else if(lower.includes('rmsnorm')||lower.includes('rms norm')||/\brms\b/.test(lower)){
      normHit='rmsnorm';
    }
    const residual=/residual|resid|\bskip\b|add input|\+ input/.test(lower);
    const acts=[];
    for(const hit of termMatches(lower,ACT_TERMS)){
      if(acts.length<2&&!acts.includes(hit.key))acts.push(hit.key);
    }
    if(acts.length){
      // Fused build: norm (RMSNorm when none was named), optional residual, activations.
      const recipe={mode:'build',norm:normHit||'rmsnorm',residual,acts,classic:'softmax',spark:raw};
      return {unknown:false,label:labelRecipe(recipe),recipe};
    }
    if(!normHit){
      return {unknown:true,message:'Could not map this brief. Use RMSNorm, LayerNorm, residual, and a single supported activation, or a named op such as softmax, scaled softmax, SwiGLU, or GeGLU.'};
    }
    if(!residual){
      // A bare norm is minted as its classic, norm-only op.
      return {unknown:false,label:CLASSICS[normHit].lbl,recipe:{mode:'classic',norm:normHit,residual:false,acts:[],classic:normHit,spark:raw}};
    }
    return {unknown:true,message:'This brief needs a supported activation after the residual or normalization so the referee can verify it.'};
  }
  // Re-resolve the expert brief box: store the result in resolvedExpert, show either the
  // resolved label or the explanation message, and disable the expert Mint button while the
  // brief cannot be mapped.
  function resolveExpert(){
    const box=$('#km-xbrief'), res=$('#km-xresolved');
    const out=res?res.querySelector('.v'):null;
    const btn=$('#km-xbuild');
    if(!box||!res||!out)return;
    resolvedExpert=resolveBrief(box.value);
    const unknown=!!resolvedExpert.unknown;
    res.classList.toggle('unknown',unknown);
    out.textContent=unknown?resolvedExpert.message:resolvedExpert.label;
    if(btn){
      btn.disabled=unknown;
    }
  }
  // Operation chips as [chip label, brief text] pairs. renderOpTags shows the label on each
  // chip and, for interactive chips, writes the brief text into the expert brief box on click.
  // The fixed list is followed by one generated chip per key of ACTS ("<lbl> activation").
  const OP_TAGS=[
    ['softmax','softmax over each row'],
    ['scaled_softmax','scaled softmax over each row'],
    ['swiglu','SwiGLU gated feed forward'],
    ['geglu','GeGLU gated feed forward'],
    ['rmsnorm','RMSNorm only'],
    ['layernorm','LayerNorm only'],
    ['add_rmsnorm_gelu','fused RMSNorm with residual and GELU'],
    ['layernorm_silu','fused LayerNorm with SiLU'],
    ['rmsnorm_mish','fused RMSNorm with Mish'],
    ['add_layernorm_silu','fused LayerNorm with residual and SiLU'],
    ['rmsnorm_gelu_relu2','fused RMSNorm with GELU then squared ReLU']
  ].concat(Object.keys(ACTS).map(k=>[k,ACTS[k].lbl+' activation']));
  // Fill the container matching `sel` with one chip button per OP_TAGS entry.
  // When `interactive` is true, clicking a chip puts its brief into #km-xbrief, fires an
  // 'input' event so the brief is re-resolved, and focuses the box.
  function renderOpTags(sel,interactive){
    const wrap=$(sel);
    if(!wrap)return;
    wrap.innerHTML='';
    const fillBrief=brief=>{
      const box=$('#km-xbrief');
      if(!box)return;
      box.value=brief;
      box.dispatchEvent(new Event('input',{bubbles:true}));
      box.focus();
    };
    for(const [label,brief] of OP_TAGS){
      const chip=document.createElement('button');
      chip.type='button';
      chip.className='km-optag';
      chip.textContent=label;
      chip.dataset.brief=brief;
      if(interactive){
        chip.addEventListener('click',()=>fillBrief(brief));
      }
      wrap.appendChild(chip);
    }
  }

  // Render the glossary into #km-learn-blocks: the residual entry first, then every norm,
  // activation and classic op as a name / description pair.
  function renderGlossary(){
    const host=$('#km-learn-blocks');
    if(!host)return;
    const entries=[['Residual',MEM.info]];
    [NORMS,ACTS,CLASSICS].forEach(table=>{
      Object.values(table).forEach(v=>entries.push([v.lbl,v.info]));
    });
    host.innerHTML=entries
      .map(([name,desc])=>"<div class='km-gloss'><b>"+esc(name)+"</b><span>"+esc(desc)+"</span></div>")
      .join('');
  }
  // Wire every .km-copy button to copy the text of its enclosing .km-codeblock (without the
  // button's own label) to the clipboard, flashing a short status on the button.
  function setupCopyButtons(){
    $$('.km-copy').forEach(btn=>{
      // Remember the idle label once. Reading it at click time would capture 'Copied' on a quick
      // second click and leave the button stuck on that text.
      const idle=btn.textContent;
      let resetTimer=null;
      const flash=label=>{
        if(resetTimer)clearTimeout(resetTimer);
        btn.textContent=label;
        resetTimer=setTimeout(()=>{btn.textContent=idle;resetTimer=null;},900);
      };
      btn.addEventListener('click',()=>{
        const block=btn.closest('.km-codeblock');
        if(!block)return;
        // Work on a clone so the button can be stripped without touching the live DOM.
        const clone=block.cloneNode(true), copy=clone.querySelector('.km-copy');
        if(copy)copy.remove();
        const text=clone.textContent.trim();
        if(navigator.clipboard&&navigator.clipboard.writeText){
          // Report the real outcome: a rejected write must not claim success.
          navigator.clipboard.writeText(text).then(()=>flash('Copied'),()=>flash('Copy failed'));
        }else{
          flash('Copy failed');
        }
      });
    });
  }

  // Delegated click handler: any element inside root carrying data-replay-key starts the
  // replay for that key, provided a recording exists.
  root.addEventListener('click',evt=>{
    const trigger=evt.target.closest('[data-replay-key]');
    if(!trigger||!root.contains(trigger))return;
    const key=trigger.dataset.replayKey;
    if(REPLAYS[key]){
      evt.preventDefault();
      startReplay(key);
    }
  });
  // Re-resolve the expert brief on every edit of its text box.
  const xb=$('#km-xbrief');
  if(xb){
    xb.addEventListener('input',resolveExpert);
  }
  // (second-activation builder removed: the kernelsmith grammar is norm + one activation)
  // Build-page Mint button: snapshot the composed recipe, show the building card and send the
  // recipe through the bridge.
  const buildBtn=$('#km-build');
  if(buildBtn){
    const onBuildMint=()=>{
      lastMintedMode='build';
      const recipe=buildRecipe();
      animate(buildBtn,[{transform:'scale(0.97)'},{transform:'scale(1)'}],{duration:140,easing:'cubic-bezier(0.16,1,0.3,1)'});
      play();
      setMinting(true);
      startBuild('build',recipe);
      bridge(recipe);
    };
    buildBtn.addEventListener('click',onBuildMint);
  }
  // Expert-page Mint button: re-resolve the brief first and bail out when it yields no recipe;
  // otherwise show the building card and send the recipe through the bridge.
  const xBuild=$('#km-xbuild');
  if(xBuild){
    const onExpertMint=()=>{
      resolveExpert();
      const recipe=expertRecipe();
      if(!recipe)return;
      lastMintedMode='expert';
      animate(xBuild,[{transform:'scale(0.97)'},{transform:'scale(1)'}],{duration:140,easing:'cubic-bezier(0.16,1,0.3,1)'});
      setMinting(true);
      startBuild('expert',recipe);
      bridge(recipe);
    };
    xBuild.addEventListener('click',onExpertMint);
  }

  // Wire the per-page controls (build page first, expert page second).
  // Each stepper callback stores the chosen value in buildTries / expertTries.
  // NOTE(review): the numeric third argument (4 / 5) is presumably an initial value or a bound;
  // confirm against setupStepper.
  setupStepper('#km-tries','#km-tries-v',4,v=>{buildTries=v;});
  setupStepper('#km-xtries','#km-xtries-v',5,v=>{expertTries=v;});
  // Temperature control and its value readout, one pair per page.
  setupTemp('#km-temp','#km-temp-v');
  setupTemp('#km-xtemp','#km-xtemp-v');
  // Pair each page's Local toggle with its Pro toggle.
  // NOTE(review): presumably makes the two mutually exclusive; confirm against syncModePair.
  syncModePair('#km-local','#km-pro');
  syncModePair('#km-xlocal','#km-xpro');
  // Build and expert each carry their own "Local (offline)" toggle. Local mode is the ONLY path
  // that computes the max-autotune number, so a page left in Modal mode shows just 2 comparisons.
  // Mirror the two toggles so both pages always run the same way and show the same set of
  // comparison numbers. The equality check stops the synthetic change events from ping-ponging.
  (function(){
    const buildToggle=$('#km-local'), expertToggle=$('#km-xlocal');
    if(!buildToggle||!expertToggle)return;
    const mirror=(from,to)=>{
      from.addEventListener('change',()=>{
        if(to.checked===from.checked)return;
        to.checked=from.checked;
        to.dispatchEvent(new Event('change'));
      });
    };
    mirror(buildToggle,expertToggle);
    mirror(expertToggle,buildToggle);
  })();
  // Initial render pass, run once after all handlers above are wired.
  renderPickers();
  renderClassics();
  renderReplayChoices();
  // Same chip list twice: clickable in #km-oplist, display-only in #km-docs-ops.
  renderOpTags('#km-oplist',true);
  renderOpTags('#km-docs-ops',false);
  renderGlossary();
  setupCopyButtons();
  buildSvg();
  setInfo(machineInfo());
  // Resolve whatever is already in the expert brief box so its label and button state are set.
  resolveExpert();
  // Start on the build tab, then draw.
  showTab('build');
  render();
}
"""

with gr.Blocks(title="Kernel Mint", head=f"<style>{CSS}</style>") as demo:
    gr.HTML(APP_HTML)

    # Bridge components. They must stay in the DOM so the custom JS can reach the inner
    # textarea/button: visible=False would remove them entirely, so they are hidden with CSS
    # instead (#km_recipe etc.).
    recipe_in = gr.Textbox(elem_id="km_recipe")
    result_out = gr.HTML(elem_id="km_result")
    lb_out = gr.HTML(elem_id="km_lb")
    go = gr.Button("go", elem_id="km_go")
    lb_btn = gr.Button("lb", elem_id="km_lb_btn")

    # Mint trigger. trigger_mode="multiple" because the default ("once") silently DROPS a click
    # that lands while a previous mint is still running: minting one kernel after another then
    # needed two clicks and the old result lingered. The frontend also disables the Mint button
    # during a mint (one at a time, no wasted ZeroGPU calls), so this is the belt-and-suspenders
    # that guarantees no click is ever swallowed.
    go.click(fn=do_mint, inputs=recipe_in, outputs=result_out, trigger_mode="multiple")

    # The leaderboard refresh has its OWN trigger, decoupled from minting. fetch_lb used to be
    # wired to recipe_in.change, so every mint (which sets recipe_in) also fired a leaderboard
    # fetch; on a slow backend that fetch sat ahead of do_mint in Gradio's queue and the mint
    # appeared to do nothing until a second click. Now the JS clicks #km_lb_btn only when it
    # actually wants the board.
    lb_btn.click(fn=fetch_lb, outputs=lb_out)

    # On page load: fetch the board once, and boot the custom JS widget.
    demo.load(fn=fetch_lb, outputs=lb_out)
    demo.load(fn=None, js=APP_JS)

if __name__ == "__main__":
    # Listen on all interfaces; the port comes from $PORT, defaulting to 7860.
    port = int(os.environ.get("PORT", 7860))
    demo.launch(server_name="0.0.0.0", server_port=port)