# Hugging Face Space file view: last commit "Make leaderboard refresh visible" (281cd85) by YMRohit, 90.7 kB.
"""OUROBOROS Kernel Mint: a GPU-kernel demo (Gradio + custom JS).
Compose a fused operation, and a 1B model writes a real Triton kernel for it while an
immutable referee checks correctness against PyTorch and times it against torch.compile. The
whole interactive surface is a custom JS widget in one gr.HTML; the mint runs through Python
(no CORS) and hits the 1B model (default) or the 27B (Pro mode).
"""
from __future__ import annotations
import base64
import json
import os
import re
import sys
import time
import gradio as gr
import requests
# `spaces.GPU` is only meaningful on ZeroGPU, where a GPU is granted lazily for each call. On a
# dedicated GPU Space (e.g. L4) or a local machine the GPU is always attached, so the decorator
# has to be a plain pass-through; otherwise it can fight the always-present CUDA context. HF sets
# SPACES_ZERO_GPU only on ZeroGPU hardware, so gating on it lets the same app.py run unchanged on
# ZeroGPU, a paid GPU, and a laptop.
try:
    if os.environ.get("SPACES_ZERO_GPU"):
        import spaces
    else:
        raise ImportError("not on ZeroGPU: use the no-op GPU shim")
except Exception:
    class _SpacesShim:
        """Stand-in for the `spaces` package whose `GPU(...)` decorator changes nothing."""

        @staticmethod
        def GPU(*_args, **_kwargs):
            """Accept any decorator arguments and hand the function back untouched."""
            def _passthrough(fn):
                return fn
            return _passthrough

    spaces = _SpacesShim()
# Remote mint endpoints. ONE_B is the default target; PRO is used when the recipe sets "pro".
# Both can be overridden through environment variables.
ONE_B = os.environ.get("BACKEND_URL", "https://ymrohit--ouroboros-kernel-mint-mint-mint.modal.run")
PRO = os.environ.get("BACKEND_PRO_URL", "https://ymrohit--ouroboros-kernel-mint-pro-mint-mint.modal.run")
# Directories next to this file: REFEREE is added to sys.path before `specs` / `harness` are
# imported, and SEED_DIR holds the per-op seed kernels used as prompt exemplars.
ROOT = os.path.dirname(__file__)
REFEREE = os.path.join(ROOT, "referee")
SEED_DIR = os.path.join(ROOT, "seed_kernels")
# Hub repos searched for the local GGUF, in order: the fine-tuned repo, then the base model.
LOCAL_FINE_TUNED_REPO = os.environ.get(
    "LOCAL_GGUF_REPO", "YMRohit/ouroboros-kernelsmith-minicpm5-1b-GGUF")
LOCAL_BASE_REPO = os.environ.get("LOCAL_GGUF_FALLBACK_REPO", "openbmb/MiniCPM5-1B-GGUF")
# Quantisation tags in preference order (comma-separated env var; blank entries are dropped).
LOCAL_QUANT_PREFS = tuple(q.strip() for q in os.environ.get(
    "LOCAL_GGUF_QUANTS", "Q5_K_M,Q6_K,Q4_K_M,Q8_0,F16,BF16").split(",") if q.strip())
# Upper bound applied to the recipe's `k` (kernels drafted) in local mode.
LOCAL_MAX_ATTEMPTS = int(os.environ.get("LOCAL_MAX_ATTEMPTS", "2"))
# Cached llama.cpp model and the GGUF path it was loaded from; both are set by _get_local_llm().
_LOCAL_LLM = None
_LOCAL_LLM_PATH = None
def _ensure_referee_path():
    """Put the bundled referee directory at the front of ``sys.path`` (once)."""
    if REFEREE in sys.path:
        return
    sys.path.insert(0, REFEREE)
# System message sent as the first chat turn of every local (llama.cpp) mint prompt.
LOCAL_SYS = ("You are an expert GPU kernel engineer. Write a single correct, fast Triton "
             "kernel. Output ONLY one fenced python code block defining `run(*inputs)` and "
             "its @triton.jit kernel. Accumulate reductions in float32. No prose.")
# The model was trained (rl_kernelsmith.py Proposer.prompt) with a real per-op SEED kernel as the
# style guide, and for almost every op that exemplar is rmsnorm's ROW-WISE reduction. We must hand
# the model that SAME exemplar at inference, or it goes off-distribution and writes flat elementwise
# code for reduction ops (wrong results). Seeds are bundled in ./seed_kernels; this inline copy is
# only a fallback if a file is missing.
# The literal must itself be syntactically valid Python (indented bodies): it is pasted into the
# prompt as "a valid Triton kernel", so a flattened copy would teach the model broken code.
_FALLBACK_EXEMPLAR = """# GOLD seed kernel: fused RMSNorm, one row per program, fp32 accumulation.
@triton.jit
def _rmsnorm_kernel(x_ptr, w_ptr, y_ptr, stride, N, eps, BLOCK: tl.constexpr):
    row = tl.program_id(0)
    x_ptr += row * stride
    y_ptr += row * stride
    acc = tl.zeros([BLOCK], dtype=tl.float32)
    for off in range(0, N, BLOCK):
        cols = off + tl.arange(0, BLOCK)
        x = tl.load(x_ptr + cols, mask=cols < N, other=0.0).to(tl.float32)
        acc += x * x
    rms = tl.rsqrt(tl.sum(acc) / N + eps)
    for off in range(0, N, BLOCK):
        cols = off + tl.arange(0, BLOCK)
        mask = cols < N
        x = tl.load(x_ptr + cols, mask=mask, other=0.0).to(tl.float32)
        w = tl.load(w_ptr + cols, mask=mask, other=0.0).to(tl.float32)
        tl.store(y_ptr + cols, (x * rms * w), mask=mask)
def run(x, w):
    M, N = x.shape
    y = torch.empty_like(x)
    _rmsnorm_kernel[(M,)](x, w, y, x.stride(0), N, 1e-6, BLOCK=1024)
    return y
"""
def _load_seed_kernel(op: str) -> str:
    """Return the bundled seed kernel source for ``op``.

    Reads ``<SEED_DIR>/<op>.py`` as UTF-8 (explicitly, so the result does not depend on the host
    locale). If the file is missing, unreadable, or not valid UTF-8, the inline
    ``_FALLBACK_EXEMPLAR`` is returned instead of raising.
    """
    seed_path = os.path.join(SEED_DIR, f"{op}.py")
    try:
        with open(seed_path, encoding="utf-8") as f:
            return f.read()
    except (OSError, UnicodeDecodeError):
        # UnicodeDecodeError is a ValueError, not an OSError, so it needs its own entry.
        return _FALLBACK_EXEMPLAR
def extract_kernel(text: str) -> str:
    """Pull a fenced Python kernel out of a llama.cpp completion.

    The first complete fenced block (with or without a ``python`` tag) is used when there is one.
    When the completion has no complete block (for example it was cut off at the token limit
    before the closing fence, or only a closing fence was emitted), any lone fence marker is
    dropped so it cannot leak into the returned source. Leading chatter before the first
    recognisable code marker is trimmed. Returns ``""`` for empty or ``None`` input.
    """
    text = text or ""
    fenced = re.search(r"```(?:python)?\s*(.*?)```", text, re.S)
    if fenced:
        body = fenced.group(1)
    else:
        # No opening/closing pair: strip a stray fence rather than returning it as code.
        body = re.sub(r"```(?:python)?", "", text)
    markers = ("@triton", "import ", "def run", "def _")
    starts = [pos for pos in (body.find(marker) for marker in markers) if pos >= 0]
    return body[min(starts, default=0):].strip()
def _local_op_from_recipe(r: dict) -> tuple[str, str]:
    """Translate a UI recipe into ``(op, label)`` for the local referee.

    Classic mode returns the named op for both values and raises ``ValueError`` when it is not
    in ``SPECS``. Compose mode returns a named spec when the recipe has exactly one activation
    and a matching spec exists; otherwise it returns a ``chain|...`` descriptor string.
    """
    _ensure_referee_path()
    from specs import SPECS

    if r.get("mode") == "classic":
        classic_op = (r.get("classic") or "softmax").strip()
        if classic_op in SPECS:
            return classic_op, classic_op
        raise ValueError(f"unknown op '{classic_op}'")

    is_rms = str(r.get("norm", "rmsnorm")).startswith("rms")
    normfull = "rmsnorm" if is_rms else "layernorm"
    norm = "rms" if is_rms else "layer"
    residual = bool(r.get("residual"))
    # Drop empty entries, keep at most three activations, default to gelu.
    acts = [a for a in (r.get("acts") or []) if a][:3] or ["gelu"]

    prefix = "residual + " if residual else ""
    norm_label = "RMSNorm" if is_rms else "LayerNorm"
    label = prefix + norm_label + " -> " + " -> ".join(acts)

    named = ("add_" if residual else "") + normfull + "_" + acts[0]
    if len(acts) == 1 and named in SPECS:
        return named, label
    chain = "|".join(("chain", norm, "1" if residual else "0", ",".join(acts)))
    return chain, label
def _local_prompt_messages(op: str, spark: str = "") -> list[dict[str, str]]:
    """Build the system + user chat messages asking the local model for a kernel for ``op``.

    The user turn carries the op's notes and signature hint from its spec, a seed kernel for a
    different op as a style guide, and (when given) the user's free-text ``spark``.
    """
    _ensure_referee_path()
    from specs import get_spec, SPECS

    spec = get_spec(op)
    # Same exemplar rule as training: the seed kernel of the first op in SPECS that is not this
    # one (almost always rmsnorm's row-wise reduction). This is the single thing that makes the
    # 1B write correct kernels here instead of elementwise guesses.
    exemplar_op = op
    for candidate in SPECS:
        if candidate != op:
            exemplar_op = candidate
            break
    exemplar = _load_seed_kernel(exemplar_op)

    sections = [
        f"Op `{op}`: {spec.notes}\nSignature:\n{spec.signature_hint}\n\n",
        f"Here is a valid Triton kernel for a DIFFERENT op (`{exemplar_op}`) as a style guide:\n",
        f"```python\n{exemplar}\n```\n",
    ]
    if spark:
        sections.append(f"\nThe person building this kernel added their own idea: \"{spark}\". "
                        "Honour their idea where you can, but correctness is mandatory.")
    return [
        {"role": "system", "content": LOCAL_SYS},
        {"role": "user", "content": "".join(sections)},
    ]
def _render_prompt(messages: list[dict[str, str]]) -> str:
    """Render chat messages as a ChatML prompt ending in an open assistant turn.

    MiniCPM5 uses ChatML (<|im_start|>/<|im_end|>). It is a reasoning model, so the assistant
    turn is pre-filled with an empty <think></think> block (the enable_thinking=False
    convention) to make it answer directly with the kernel instead of a reasoning trace.
    """
    turns = "".join(
        f"<|im_start|>{msg['role']}\n{msg['content']}<|im_end|>\n" for msg in messages
    )
    return turns + "<|im_start|>assistant\n<think>\n\n</think>\n\n"
def _common_gguf_names() -> list[str]:
    """List conventional GGUF filenames to guess, ordered by quant preference then stem.

    For every quant in ``LOCAL_QUANT_PREFS`` and every known stem, both the ``stem-quant.gguf``
    and ``stem.quant.gguf`` spellings are produced, in that order.
    """
    stems = ("MiniCPM5-1B", "minicpm5-1b", "model", "ggml-model")
    return [
        candidate
        for quant in LOCAL_QUANT_PREFS
        for stem in stems
        for candidate in (f"{stem}-{quant}.gguf", f"{stem}.{quant}.gguf")
    ]
def _download_gguf_from_repo(repo_id: str) -> str:
    """Download one GGUF file from ``repo_id`` and return the value of ``hf_hub_download``.

    Selection order: the file named by ``LOCAL_GGUF_FILE`` if set; else the first listed
    ``.gguf`` whose name contains a preferred quant tag (case-insensitive, in preference
    order); else the first listed ``.gguf``; else each conventional filename guess in turn.
    Raises ``RuntimeError`` when every guess fails.
    """
    from huggingface_hub import HfApi, hf_hub_download

    pinned = os.environ.get("LOCAL_GGUF_FILE")
    cache_dir = os.environ.get("LOCAL_GGUF_CACHE")

    def _fetch(filename):
        return hf_hub_download(repo_id=repo_id, filename=filename, cache_dir=cache_dir)

    if pinned:
        return _fetch(pinned)

    try:
        repo_files = HfApi().list_repo_files(repo_id)
    except Exception:
        # Listing is best-effort; with no listing we fall through to filename guessing.
        repo_files = []
    ggufs = [name for name in repo_files if name.lower().endswith(".gguf")]

    for quant in LOCAL_QUANT_PREFS:
        wanted = quant.lower()
        for name in ggufs:
            if wanted in name.lower():
                return _fetch(name)
    if ggufs:
        return _fetch(ggufs[0])

    last_error = None
    for guess in _common_gguf_names():
        try:
            return _fetch(guess)
        except Exception as exc:
            last_error = exc
    raise RuntimeError(f"no GGUF found in {repo_id}: {last_error}")
def _resolve_local_gguf() -> str:
    """Return a local GGUF path, trying the fine-tuned repo first and the base repo second.

    Raises ``RuntimeError`` summarising each repo's failure when neither yields a file.
    """
    failures = []
    for repo_id in (LOCAL_FINE_TUNED_REPO, LOCAL_BASE_REPO):
        try:
            gguf_path = _download_gguf_from_repo(repo_id)
        except Exception as exc:
            failures.append(f"{repo_id}: {type(exc).__name__}: {str(exc)[:180]}")
        else:
            return gguf_path
    raise RuntimeError("could not load a local GGUF. " + " | ".join(failures))
_CUDA_PRELOADED = False


def _preload_cuda_libs():
    """Best-effort preload of the CUDA runtime and cuBLAS shared libraries (runs once).

    The CUDA llama.cpp wheel's libllama.so links libcudart.so.12 / libcublas*.so.12, which on HF
    Spaces live inside torch's bundled nvidia-* packages and are NOT on the default loader path,
    so the import dies with "libcudart.so.12: cannot open shared object file". Loading them with
    RTLD_GLOBAL (in dependency order) lets libllama.so resolve their symbols. Every failure is
    swallowed on purpose: this is a no-op when the libraries are absent.
    """
    global _CUDA_PRELOADED
    if _CUDA_PRELOADED:
        return
    _CUDA_PRELOADED = True
    import ctypes
    import glob

    search_roots = []
    try:
        import torch
        search_roots.append(os.path.join(os.path.dirname(torch.__file__), "lib"))
    except Exception:
        pass
    try:
        import site
        search_roots += list(site.getsitepackages() if hasattr(site, "getsitepackages") else [])
    except Exception:
        pass
    search_roots += ["/usr/local/lib/python3.10/site-packages", "/usr/local/cuda/lib64",
                     "/usr/lib/x86_64-linux-gnu", "/usr/local/lib"]

    def _load_first(pattern):
        # Walk the roots in order and stop at the first file that actually loads.
        for root in search_roots:
            try:
                candidates = sorted(glob.glob(os.path.join(root, "**", pattern), recursive=True))
            except Exception:
                candidates = []
            for lib_path in candidates:
                try:
                    ctypes.CDLL(lib_path, mode=ctypes.RTLD_GLOBAL)
                except Exception:
                    continue
                return True
        return False

    # Match versioned sonames too (e.g. libcudart.so.12.4.127 with no libcudart.so.12 symlink);
    # RTLD_GLOBAL makes the symbols visible to libllama.so regardless of the exact soname.
    for pattern in ("libcudart.so*", "libcublas.so*", "libcublasLt.so*"):
        _load_first(pattern)
# Confirm the CUDA llama.cpp lib loads at startup (loading the .so needs no GPU device, so it spends
# no ZeroGPU quota). It also warms the import so the in-@spaces.GPU call is instant.
# The probe is skipped when LOCAL_LLAMA_GPU_LAYERS is "0", the setting that forces CPU.
if os.environ.get("LOCAL_LLAMA_GPU_LAYERS", "-1") != "0":
    try:
        _preload_cuda_libs()
        import llama_cpp as _lc_probe
        print(f"[startup] llama_cpp {_lc_probe.__version__} loaded OK", flush=True)
    except Exception as _e:
        # Best-effort: a failed probe is only logged so the app still starts.
        print(f"[startup] llama_cpp load FAILED: {type(_e).__name__}: {str(_e)[:200]}", flush=True)
def _get_local_llm():
    """Return a llama.cpp ``Llama`` for the local GGUF, reusing the cached one when allowed.

    Default is full GPU offload (-1). On ZeroGPU the H200 is ~30x faster than its throttled
    shared CPU for this 1B, and the GPU is granted only inside @spaces.GPU and detached between
    calls, so a GPU-resident model can't be reused across mints: it is rebuilt on every call
    there. Off ZeroGPU (dedicated GPU or CPU) the model is built once and cached. Set
    LOCAL_LLAMA_GPU_LAYERS=0 to force CPU (e.g. when only a CPU-only llama.cpp wheel exists).
    """
    global _LOCAL_LLM, _LOCAL_LLM_PATH
    on_zero = bool(os.environ.get("SPACES_ZERO_GPU"))
    gpu_layers = int(os.environ.get("LOCAL_LLAMA_GPU_LAYERS", "-1"))
    rebuild_each_call = on_zero and gpu_layers != 0

    if not rebuild_each_call and _LOCAL_LLM is not None:
        return _LOCAL_LLM

    if gpu_layers != 0:
        _preload_cuda_libs()
    from llama_cpp import Llama

    _LOCAL_LLM_PATH = _resolve_local_gguf()
    default_threads = max(1, (os.cpu_count() or 4) - 1)
    threads = int(os.environ.get("LOCAL_LLAMA_THREADS", str(default_threads)))
    model = Llama(
        model_path=_LOCAL_LLM_PATH,
        n_ctx=int(os.environ.get("LOCAL_LLAMA_CTX", "4096")),
        n_threads=threads,
        n_gpu_layers=gpu_layers,
        verbose=bool(int(os.environ.get("LOCAL_LLAMA_VERBOSE", "0"))),
    )
    if not rebuild_each_call:
        _LOCAL_LLM = model
    return model
def _llama_complete(messages: list[dict[str, str]], temp: float, seed: int) -> str:
    """Run one raw llama.cpp completion for ``messages`` and return the generated text.

    Uses raw completion with our think-suppressed ChatML render (not create_chat_completion,
    which leaves the reasoning trace on and makes a 1B ramble instead of writing the kernel).
    The token budget comes from LOCAL_MAX_TOKENS (default 768).
    """
    llm = _get_local_llm()
    token_budget = int(os.environ.get("LOCAL_MAX_TOKENS", "768"))
    prompt = _render_prompt(messages)
    completion = llm.create_completion(
        prompt,
        max_tokens=token_budget,
        temperature=temp,
        top_p=0.97,
        seed=seed,
        stop=["<|im_end|>", "<|im_start|>"],
    )
    first_choice = completion["choices"][0]
    return first_choice["text"]
def _local_gpu_duration() -> int:
    """Return the ZeroGPU reservation window in seconds for a local mint.

    This is a ZeroGPU reservation window, not a benchmark setting. The default stays within the
    usual free-tier budget, and the cap is explicit so slower/cold hardware can raise it without
    code changes. The current Space had LOCAL_GPU_DURATION=135, which over-reserved quota; the
    default cap trims that to 120 while still leaving headroom for cold local mints.

    Both LOCAL_GPU_DURATION and LOCAL_GPU_DURATION_CAP fall back to 120 when unset or not an
    integer; the result is never below 45 and never above ``max(45, cap)``.
    """
    def _int_env(name: str, fallback: int) -> int:
        try:
            return int(os.environ.get(name, str(fallback)))
        except Exception:
            return fallback

    requested = _int_env("LOCAL_GPU_DURATION", 120)
    cap = _int_env("LOCAL_GPU_DURATION_CAP", 120)
    ceiling = max(45, cap)
    return max(45, min(ceiling, requested))


LOCAL_GPU_DURATION = _local_gpu_duration()
@spaces.GPU(duration=LOCAL_GPU_DURATION)
def _local_gpu_mint(op: str, k: int, temp: float, spark: str) -> dict:
    """Draft up to ``k`` kernels for ``op`` with the local model and referee each one.

    Generation AND verification happen inside this one GPU window. That is required on ZeroGPU,
    where the GPU exists only inside @spaces.GPU: llama.cpp offloads generation to the H200
    (LOCAL_LLAMA_GPU_LAYERS=-1, seconds instead of minutes on the throttled CPU) and the referee
    compiles + runs Triton, all here. Off ZeroGPU the decorator is a no-op and the GPU is always
    present, so the same code path works on a dedicated GPU and locally too.

    Returns ``{"n_srcs", "statuses", "best"}`` where ``best`` is ``None`` or the fastest kernel
    with status ``"ok"`` as ``{"source", "result"}``.
    """
    _ensure_referee_path()
    # Time every verified kernel against eager AND torch.compile (default + max-autotune), so
    # local mode shows the honest baselines, not just the inflated vs-eager fusion win. The
    # max-autotune compile is a few seconds for these ops (inductor-cached across the k
    # attempts), so it fits.
    from harness import evaluate_inprocess_full

    messages = _local_prompt_messages(op, spark)
    base_seed = int(time.time() * 1000) & 0x7FFFFFFF
    drafts = (
        extract_kernel(_llama_complete(messages, temp=temp, seed=base_seed + attempt))
        for attempt in range(k)
    )
    srcs = [draft for draft in drafts if draft]

    n_shapes = int(os.environ.get("LOCAL_REFEREE_SHAPES", "2"))
    n_iters = int(os.environ.get("LOCAL_REFEREE_ITERS", "30"))
    statuses = []
    best = None
    for index, src in enumerate(srcs):
        try:
            res = evaluate_inprocess_full(src, op, n_shapes=n_shapes, n_iters=n_iters, seed=index)
        except Exception:
            statuses.append("runtime_fail")
            continue
        statuses.append(res.status)
        if res.status != "ok":
            continue
        # Keep the FASTEST verified kernel (lowest latency = best vs every baseline at once).
        if best is None or res.latency_ms < best["result"]["latency_ms"]:
            best = {"source": src, "result": res.to_dict()}
    return {"n_srcs": len(srcs), "statuses": statuses, "best": best}
def _local_explain(label: str, res) -> str:
    """Turn a referee result dict (or ``None``) into the user-facing explanation sentence.

    ``label`` is accepted for interface compatibility and is not used in the text.
    """
    if res is None or res.get("status") != "ok":
        shown = res.get("status", "failed") if res is not None else "no verified attempt"
        return (f"The local referee rejected these kernels ({shown}). It still compiled and "
                "checked them inside this Space; try minting again or simplify the operation.")

    eager_x = float(res.get("speedup_eager", 0) or 0)
    maxauto_x = res.get("speedup_maxauto")
    honest = maxauto_x or res.get("speedup_compile")
    if not honest:
        # No compiler baseline was measured: report only the eager comparison.
        return (f"Verified correct, {eager_x:.0f}x faster than unfused PyTorch eager. The 1B wrote it via "
                "llama.cpp and the referee checked and timed it, all inside this Space.")

    baseline = "torch.compile max-autotune" if maxauto_x else "torch.compile"
    return (f"Verified correct. {float(honest):.2f}x faster than {baseline} (the honest baseline that "
            f"also fuses and autotunes), and {eager_x:.0f}x vs unfused PyTorch eager. The whole loop "
            "ran in this Space: the 1B wrote it via llama.cpp, the referee compiled, checked it "
            "against PyTorch, and timed it.")
def local_mint(recipe: dict) -> dict:
    """Mint a kernel entirely inside this Space and return the result dict for the widget.

    The 1B reliably writes the named ops and single-activation machines (norm + optional
    residual + one activation), which is what it was trained on. Recipes that resolve to a
    ``chain|...`` op are declined up front with an explanatory, unverified result instead of
    looping through rejected attempts.

    ``k`` is clamped to ``1..LOCAL_MAX_ATTEMPTS``, ``temp`` to ``0.2..1.3`` and ``spark`` to 200
    characters. Exceptions from recipe resolution, the GGUF download, or the GPU call propagate
    to the caller.
    """
    op, label = _local_op_from_recipe(recipe)
    if op.startswith("chain|"):
        return {"op": label, "verified": False, "local": True, "baseline": "eager",
                "statuses": [], "gen_seconds": 0.0, "verify_seconds": 0.0,
                "speedup_compile": None, "unsupported": True, "k": 0,
                "explanation": ("This machine stacks two activations, which is outside what the "
                                "kernelsmith models were trained on (the grammar is a norm plus one "
                                "activation), so neither the 1B nor the 27B writes it reliably. Drop "
                                "to a single activation and it mints with a verified kernel.")}
    k = max(1, min(LOCAL_MAX_ATTEMPTS, int(recipe.get("k", 1))))
    temp = max(0.2, min(1.3, float(recipe.get("temp", 0.7))))
    spark = (recipe.get("spark") or "").strip()[:200]
    # Pre-fetch the GGUF to disk OUTSIDE the GPU window, so the ZeroGPU 120s budget is spent on
    # generate + verify rather than a 1.15GB download.
    _resolve_local_gguf()
    t0 = time.time()
    work = _local_gpu_mint(op, k, temp, spark)
    total_s = round(time.time() - t0, 1)
    statuses = work.get("statuses") or []
    best = work.get("best")
    if not work.get("n_srcs"):
        return {"op": label, "verified": False, "local": True, "baseline": "eager",
                "statuses": statuses, "gen_seconds": total_s, "verify_seconds": 0.0,
                "speedup_compile": None, "k": k,
                "explanation": "llama.cpp returned no parseable kernel this time. Try minting again."}
    if not best:
        return {"op": label, "verified": False, "local": True, "baseline": "eager",
                "statuses": statuses, "gen_seconds": total_s, "verify_seconds": 0.0,
                "speedup_compile": None, "explanation": _local_explain(label, None), "k": k}
    res = best["result"]

    def _su(key):
        # Missing, None and zero speedups are all reported as "not measured".
        v = res.get(key)
        return round(float(v), 2) if v else None

    # `or 0` mirrors _local_explain: a present-but-None eager speedup must not turn a verified
    # mint into a TypeError.
    speedup_eager = round(float(res.get("speedup_eager") or 0), 2)
    return {"op": label, "verified": True, "local": True, "baseline": "compile",
            "speedup_eager": speedup_eager,
            "speedup_compile": _su("speedup_compile"), "speedup_maxauto": _su("speedup_maxauto"),
            "latency_ms": res.get("latency_ms"), "eager_ms": res.get("eager_ms"),
            "compile_ms": res.get("compile_ms"), "maxauto_ms": res.get("maxauto_ms"),
            "source": best["source"], "statuses": statuses, "gen_seconds": total_s,
            "verify_seconds": 0.0, "explanation": _local_explain(label, res),
            "n_verified": sum(1 for s in statuses if s == "ok"), "k": k}
def lb_url(u):
    """Derive the leaderboard endpoint URL from a mint endpoint URL.

    Every ``-mint-mint.modal.run`` occurrence is swapped for ``-leaderboard.modal.run``; URLs
    without that suffix come back unchanged.
    """
    mint_suffix = "-mint-mint.modal.run"
    leaderboard_suffix = "-leaderboard.modal.run"
    return leaderboard_suffix.join(u.split(mint_suffix))
def _blob(d):
    """Wrap ``d`` as a ``<span>`` whose ``data-blob`` attribute is base64-encoded JSON."""
    raw = json.dumps(d).encode()
    encoded = base64.b64encode(raw).decode()
    return f"<span data-blob='{encoded}'></span>"
def _with_nonce(d: dict, r: dict) -> dict:
    """Return a copy of ``d`` carrying the recipe's ``_n`` value when the recipe has one.

    ``d`` may be ``None`` or empty (treated as ``{}``); neither argument is mutated.
    """
    tagged = dict(d or {})
    nonce = r.get("_n")
    if nonce is None:
        return tagged
    tagged["_n"] = nonce
    return tagged
def do_mint(recipe_json: str) -> str:
    """Run one mint for a JSON recipe and return the result as a ``_blob`` HTML span.

    Recipes with ``local`` set are minted inside this Space via ``local_mint``; all others are
    posted to the 1B backend, or to the 27B backend when ``pro`` is set. Every outcome, including
    failures, is returned as a blob with an ``error`` key rather than raised: malformed JSON, a
    recipe that is not a JSON object, and non-numeric ``k``/``temp`` all yield ``"bad recipe"``.
    """
    try:
        r = json.loads(recipe_json or "{}")
    except Exception:
        return _blob({"error": "bad recipe"})
    if not isinstance(r, dict):
        # Valid JSON that is not an object (list, string, null) has no fields to read.
        return _blob({"error": "bad recipe"})
    if r.get("local"):
        try:
            return _blob(_with_nonce(local_mint(r), r))
        except Exception as e:
            return _blob(_with_nonce(
                {"error": f"Local offline mint failed: {type(e).__name__}: {str(e)[:240]}",
                 "local": True, "baseline": "eager"}, r))
    base = PRO if r.get("pro") else ONE_B
    try:
        if r.get("mode") == "classic":
            payload = {"op": r.get("classic", "softmax")}
        else:
            payload = {"blocks": {"norm": r.get("norm", "rmsnorm"), "residual": bool(r.get("residual")),
                                  "acts": r.get("acts") or ["gelu"]}}
        payload.update({"name": (r.get("name") or "anonymous")[:40], "k": int(r.get("k", 4)),
                        "temp": float(r.get("temp", 0.7)), "spark": (r.get("spark") or "")[:200]})
    except (TypeError, ValueError):
        # Wrong field types (e.g. k="abc", name=5) are a bad recipe, not a server error.
        return _blob(_with_nonce({"error": "bad recipe"}, r))
    # During a cold start Modal can return an empty body before the model finishes loading,
    # which makes .json() blow up. Retry a few times, then explain it plainly.
    max_attempts = 4
    last = ""
    for attempt in range(max_attempts):
        try:
            resp = requests.post(base, json=payload, timeout=600)
            if resp.status_code == 200 and resp.text.strip():
                try:
                    return _blob(_with_nonce(resp.json(), r))
                except ValueError:
                    last = "the model returned an unexpected response"
            else:
                last = f"the model is still waking up (HTTP {resp.status_code})"
        except Exception as e:
            last = str(e)
        if attempt < max_attempts - 1:
            # Only pause between attempts; sleeping after the last one just delays the error.
            time.sleep(4)
    # The user-facing message stays plain; the concrete reason goes to the server log.
    print(f"[mint] backend gave no result after {max_attempts} attempts: {last[:200]}", flush=True)
    pro = " The 27B in Pro mode takes longer to wake. Try again, or turn Pro off." if r.get("pro") else ""
    return _blob(_with_nonce(
        {"error": f"The model is still waking up. Give it about 30 seconds and press Mint again.{pro}"},
        r))
def fetch_lb(_=None):
    """Fetch both backends' leaderboards, merge them without duplicates, return a ``_blob``.

    The ignored positional argument lets this be wired directly as a UI callback. A backend
    that fails is recorded in ``errors`` by URL and exception type and otherwise skipped.
    """
    merged, sources, errors = [], [], []
    seen_keys = set()
    for backend in (ONE_B, PRO):
        try:
            url = lb_url(backend)
            fetched = requests.get(url, timeout=20).json().get("rows", [])
            if fetched:
                sources.append(url)
            for row in fetched:
                row_key = (row.get("op"), row.get("name"), row.get("ts"),
                           row.get("speedup_compile"), row.get("speedup_eager"))
                if row_key not in seen_keys:
                    seen_keys.add(row_key)
                    merged.append(row)
        except Exception as exc:
            errors.append(f"{lb_url(backend)}: {type(exc).__name__}")
    # Always include fetched_at so the hidden Gradio HTML changes even when the rows are
    # identical. That makes the visible Refresh button observably work.
    return _blob({"rows": merged, "fetched_at": time.time(), "sources": sources,
                  "errors": errors[:2]})
# Optional stylesheet bundled next to this file; an absent or unreadable file means no custom CSS.
# Opened with a context manager (so the handle is closed) and an explicit encoding, and using
# try/except instead of an exists() check that could race with the open().
try:
    with open(os.path.join(os.path.dirname(__file__), "style.css"), encoding="utf-8") as f:
        CSS = f.read()
except OSError:
    CSS = ""
# Recorded mints, embedded in the page as compact JSON. "</" is escaped so the payload cannot
# close the <script> element it is placed in. Any load failure yields an empty object.
REPLAYS_PATH = os.path.join(os.path.dirname(__file__), "replays", "index.json")
try:
    with open(REPLAYS_PATH, encoding="utf-8") as f:
        REPLAYS_JSON = json.dumps(json.load(f), separators=(",", ":")).replace("</", "<\\/")
except Exception:
    REPLAYS_JSON = "{}"
APP_HTML = r"""
<div id="km" data-build="2026-06-15-leaderboard-refresh-fallback"
data-lb-one="__KM_LB_ONE__"
data-lb-pro="__KM_LB_PRO__">
<div id="km-top">
<div id="km-brand">
<span class="mark"><span class="dot"></span>Kernel Mint</span>
<span class="sub">an OUROBOROS demo</span>
</div>
<div id="km-tabs">
<span id="km-tab-pill"></span>
<button class="km-tab on" data-tab="build">Build</button>
<button class="km-tab" data-tab="expert">Expert</button>
<button class="km-tab" data-tab="lb">Leaderboard</button>
<button class="km-tab" data-tab="docs">Run &amp; verify</button>
</div>
</div>
<script id="km-replays-data" type="application/json">__KM_REPLAYS__</script>
<!-- ============ BUILD ============ -->
<div class="km-page" data-page="build">
<header id="km-hero">
<h1>A 1B model writes a real GPU kernel. <span class="g">A referee that can't be fooled checks it.</span></h1>
<p>Compose an operation, and a 1-billion-parameter model writes a fused Triton kernel for it. The kernel is compiled, checked against PyTorch on adversarial inputs, and timed against PyTorch's own compiler before anything counts as a win.</p>
</header>
<div id="km-grid">
<section class="km-card">
<div class="km-seg" role="tablist">
<button class="km-segb on" data-mode="build">Compose</button>
<button class="km-segb" data-mode="classic">Classic ops</button>
</div>
<div id="km-compose">
<div class="km-section-label">Pipeline</div>
<div id="km-pipe">
<div class="km-stage fixed"><span class="km-stage-k">input</span><span class="km-dim">a row of activations</span></div>
<div class="km-flow">&darr;</div>
<div class="km-stage"><span class="km-stage-k">residual</span><span class="km-pick" data-slot="mem"></span></div>
<div class="km-flow">&darr;</div>
<div class="km-stage"><span class="km-stage-k">normalize</span><span class="km-pick" data-slot="norm"></span></div>
<div class="km-flow">&darr;</div>
<div class="km-stage"><span class="km-stage-k">activate</span><span class="km-pick" data-slot="act"></span></div>
</div>
</div>
<div id="km-classic" hidden>
<div class="km-section-label">A named operation from a real transformer</div>
<div class="km-pick" id="km-classics"></div>
</div>
<div id="km-info"></div>
<div class="km-params">
<div class="km-field">
<label>Attempts the model drafts <b id="km-tries-v">4</b></label>
<div class="km-stepper" id="km-tries"></div>
</div>
<div class="km-field">
<label>Sampling temperature <b id="km-temp-v">0.70</b></label>
<input type="range" class="km-range" id="km-temp" min="20" max="130" value="70">
</div>
<div class="km-field">
<label>Name for the leaderboard</label>
<input class="km-input" id="km-name" placeholder="optional" autocomplete="off">
</div>
<label class="km-toggle"><input type="checkbox" id="km-pro"><span class="km-switch"></span>Use the 27B model (slower, knows more)</label>
<label class="km-toggle km-local-toggle"><input type="checkbox" id="km-local"><span class="km-switch"></span>Local (offline)</label>
<p class="km-mode-help">Runs entirely in this Space: a 1B GGUF model (llama.cpp) writes the kernel on this Space's GPU, then the in-process referee compiles it, checks correctness against PyTorch, and times it against PyTorch eager, torch.compile, and max-autotune. No Modal or cloud model calls. Pro uses the 27B.</p>
</div>
<div class="km-actions">
<button class="km-btn primary" id="km-build">Mint kernel</button>
<button class="km-btn ghost" id="km-replay" data-replay-key="rmsnorm_gelu">Watch a recorded mint</button>
</div>
<div id="km-replay-picks" class="km-replay-picks" aria-label="recorded mints"></div>
</section>
<section class="km-card">
<div class="km-section-label">The operation, stage by stage</div>
<div id="km-viz"><svg id="km-svg" viewBox="0 0 320 150" preserveAspectRatio="none"></svg>
<div id="km-stage-label">input</div></div>
<div id="km-verdict"><div class="km-card"><div class="km-result-top"><div class="km-verdict-icon">&rarr;</div>
<div><span class="km-run-label">ready</span><b>Press Mint to begin</b></div></div>
<p class="km-dim" style="margin-top:10px">Compose a pipeline on the left and mint it, or watch a recorded verified mint to see the whole referee loop instantly.</p></div></div>
<details id="km-source" class="km-source" hidden>
<summary>Triton kernel source</summary>
<pre id="km-code"></pre>
</details>
<div class="km-section-label">This session</div>
<div id="km-hist" class="km-dim">No mints yet.</div>
</section>
</div>
</div>
<!-- ============ EXPERT ============ -->
<div class="km-page" data-page="expert" hidden>
<header id="km-hero">
<h1>Free-test the verifier.</h1>
<p>Describe the fused operation you want in plain language. The model writes a Triton kernel and the referee certifies it against PyTorch, torch.compile, and max-autotune. The referee can only certify operations it has a reference for, so your request is resolved to the nearest one it can check.</p>
</header>
<div class="km-expert-grid">
<section class="km-card">
<div class="km-section-label">Your brief</div>
<textarea class="km-textarea" id="km-xbrief" placeholder="e.g. fused RMSNorm with a residual add and a SiLU activation, contiguous rows"></textarea>
<div id="km-xresolved" class="km-resolved"><span class="k">resolves to</span><div class="v">rmsnorm with gelu</div></div>
<div class="km-params">
<div class="km-field">
<label>Attempts the model drafts <b id="km-xtries-v">5</b></label>
<div class="km-stepper" id="km-xtries"></div>
</div>
<div class="km-field">
<label>Sampling temperature <b id="km-xtemp-v">0.70</b></label>
<input type="range" class="km-range" id="km-xtemp" min="20" max="130" value="70">
</div>
<label class="km-toggle"><input type="checkbox" id="km-xpro"><span class="km-switch"></span>Use the 27B model</label>
<label class="km-toggle km-local-toggle"><input type="checkbox" id="km-xlocal"><span class="km-switch"></span>Local (offline)</label>
<p class="km-mode-help">Uses llama.cpp inside the Space and verifies on this Space's GPU, timed against torch.compile and max-autotune (the honest baselines), not just eager. Pro mode keeps the 27B on Modal.</p>
</div>
<div class="km-actions">
<button class="km-btn primary" id="km-xbuild">Mint from brief</button>
</div>
</section>
<section class="km-card">
<div class="km-section-label">What the referee can verify</div>
<p class="km-limits">It certifies fusions of a <b>normalization</b> (RMSNorm or LayerNorm), an optional <b>residual add</b>, and one <b>activation</b>, plus a set of named operators from real transformers. Click one to load it into your brief.</p>
<div class="km-oplist" id="km-oplist"></div>
<div id="km-xverdict" style="margin-top:16px"></div>
<details id="km-xsource" class="km-source" hidden>
<summary>Triton kernel source</summary>
<pre class="km-code"></pre>
</details>
</section>
</div>
</div>
<!-- ============ LEADERBOARD ============ -->
<div class="km-page" data-page="lb" hidden>
<section class="km-card">
<div class="km-h">Leaderboard</div>
<p class="km-lede">The best verified kernel per operation. Crowned rows were minted by the 27B model. Every speedup here was measured by the referee, not reported by the model.</p>
<div id="km-lb-view" class="km-dim" style="margin-top:16px">Loading.</div>
<div class="km-lb-tools">
<button id="km-lb-refresh" class="km-mini">Refresh</button>
<span id="km-lb-status" class="km-lb-status">Auto-refreshes while this tab is open.</span>
</div>
</section>
</div>
<!-- ============ RUN + VERIFY / DOCS ============ -->
<div class="km-page" data-page="docs" hidden>
<section class="km-docs">
<div class="km-doc-hero">
<span class="km-doc-kicker">reproduce the claim</span>
<h2>Run the smith, then make the referee decide.</h2>
<p>The Space has two execution paths. The <b>Local (offline)</b> switch runs the 1B GGUF model with llama.cpp inside this Space, then verifies the candidate kernel in-process on the same GPU. <b>Pro</b> uses the 27B adapter through the Modal backend. Both paths compare against PyTorch eager, torch.compile, and torch.compile max-autotune before a result counts.</p>
</div>
<div class="km-doc-metrics" aria-label="execution modes">
<div><b>1B local</b><span>MiniCPM5-1B GGUF, llama.cpp, in-Space referee</span></div>
<div><b>27B Pro</b><span>Qwen3.6-27B adapter, Modal H200 backend</span></div>
<div><b>referee</b><span>compile, allclose, eager, compile, max-autotune</span></div>
</div>
<div class="km-doc-grid">
<article class="km-doc-panel wide">
<div class="km-doc-label">1. Use this Space properly</div>
<h3>The easiest local run is already on this page.</h3>
<ol class="km-doc-steps">
<li>Open <b>Build</b> or <b>Expert</b>.</li>
<li>Turn on <b>Local (offline)</b>. It disables Pro because the two paths are intentionally separate.</li>
<li>Pick a named op such as <code>rmsnorm_gelu</code>, <code>softmax</code>, or <code>swiglu</code>, or compose one norm plus one activation.</li>
<li>Press <b>Mint kernel</b>. The Space downloads or reuses the GGUF, drafts up to two kernels, compiles Triton, checks PyTorch correctness, and times eager, torch.compile, and max-autotune.</li>
</ol>
<div class="km-doc-callout">A certified Local result should show compiler baselines. If a cold ZeroGPU session times out during max-autotune, mint again. A result that only beats eager is not treated as the headline claim.</div>
</article>
<article class="km-doc-panel">
<div class="km-doc-label">2. Run the Space clone</div>
<h3>Same UI, your GPU.</h3>
<p>Use this when you want the full app locally. With no `SPACES_ZERO_GPU` env var, the GPU decorator becomes a no-op and the local verifier runs directly on your attached CUDA device.</p>
<div class="km-codeblock"><button class="km-copy">Copy</button>git clone https://huggingface.co/spaces/build-small-hackathon/ouroboros-kernel-mint
cd ouroboros-kernel-mint
python -m venv .venv
. .venv/bin/activate
python -m pip install -r requirements.txt
python app.py</div>
<p>Useful knobs:</p>
<div class="km-codeblock"><button class="km-copy">Copy</button>LOCAL_GGUF_REPO=YMRohit/ouroboros-kernelsmith-minicpm5-1b-GGUF
LOCAL_GGUF_QUANTS=Q5_K_M,Q4_K_M
LOCAL_LLAMA_GPU_LAYERS=-1
LOCAL_GPU_DURATION=120
python app.py</div>
</article>
<article class="km-doc-panel">
<div class="km-doc-label">3. Run the 1B smith directly</div>
<h3>GGUF path, no Modal.</h3>
<p>This mirrors the Space-local path. It uses the fine-tuned GGUF repo first and falls back to OpenBMB's base GGUF only if needed.</p>
<div class="km-codeblock"><button class="km-copy">Copy</button>python -m pip install torch triton huggingface_hub \
--extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu130 \
llama-cpp-python==0.3.28</div>
<div class="km-codeblock"><button class="km-copy">Copy</button>from huggingface_hub import HfApi, hf_hub_download
from llama_cpp import Llama
repo = "YMRohit/ouroboros-kernelsmith-minicpm5-1b-GGUF"
files = [f for f in HfApi().list_repo_files(repo) if f.lower().endswith(".gguf")]
filename = next((f for f in files if "Q5_K_M" in f), files[0])
gguf = hf_hub_download(repo, filename=filename)
llm = Llama(model_path=gguf, n_ctx=4096, n_gpu_layers=-1)
system = "You are an expert GPU kernel engineer. Output only one fenced python code block."
user = "Write a fused Triton kernel for row-wise softmax. Use stable max-subtraction. Return run(x)."
prompt = (
f"&lt;|im_start|&gt;system\n{system}&lt;|im_end|&gt;\n"
f"&lt;|im_start|&gt;user\n{user}&lt;|im_end|&gt;\n"
"&lt;|im_start|&gt;assistant\n```python\n"
)
out = llm.create_completion(prompt, max_tokens=768, temperature=0.7, top_p=0.97)
print(out["choices"][0]["text"])</div>
</article>
<article class="km-doc-panel">
<div class="km-doc-label">4. Run the 1B adapter directly</div>
<h3>Transformers plus PEFT.</h3>
<p>Use this when you want the published LoRA adapter instead of GGUF. It is slower to start than llama.cpp, but useful for inspection or further fine-tuning.</p>
<div class="km-codeblock"><button class="km-copy">Copy</button>python -m pip install torch transformers peft accelerate triton</div>
<div class="km-codeblock"><button class="km-copy">Copy</button>from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
tok = AutoTokenizer.from_pretrained("openbmb/MiniCPM5-1B", trust_remote_code=True)
base = AutoModelForCausalLM.from_pretrained(
"openbmb/MiniCPM5-1B",
trust_remote_code=True,
torch_dtype="auto",
device_map="auto",
)
model = PeftModel.from_pretrained(base, "YMRohit/ouroboros-kernelsmith-minicpm5-1b")
model.eval()</div>
</article>
<article class="km-doc-panel">
<div class="km-doc-label">5. Run the 27B smith locally</div>
<h3>Same prompt contract, much bigger hardware.</h3>
<p>The 27B artifact is the stronger Qwen3.6-27B smith used for the 76 verified compiler-beating kernels. Run it locally if you have a large GPU or multi-GPU box. The training run used Modal H200s and peaked around 110 GB VRAM; inference is still a heavyweight path compared with the 1B GGUF.</p>
<div class="km-codeblock"><button class="km-copy">Copy</button>python -m pip install torch transformers peft accelerate triton</div>
<div class="km-codeblock"><button class="km-copy">Copy</button>from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
tok = AutoTokenizer.from_pretrained("Qwen/Qwen3.6-27B", trust_remote_code=True)
base = AutoModelForCausalLM.from_pretrained(
"Qwen/Qwen3.6-27B",
trust_remote_code=True,
torch_dtype="auto",
device_map="auto",
)
model = PeftModel.from_pretrained(base, "YMRohit/ouroboros-kernelsmith-qwen3.6-27b")
model.eval()</div>
<p>If that does not fit, use <b>Pro</b> in this Space. It calls the hosted 27B backend and still sends the output through the same three-baseline referee before returning a result.</p>
</article>
<article class="km-doc-panel wide">
<div class="km-doc-label">6. Prompt contract</div>
<h3>Ask for one operation, one code block, one <code>run(...)</code> entry point.</h3>
<p>The model is not a general Python assistant here. It was trained to emit a single Triton kernel for a verifier-known operation. Keep the prompt narrow and include the exact operation, input tensors, output contract, and baseline target.</p>
<div class="km-prompt-grid">
<div>
<b>System</b>
<div class="km-codeblock"><button class="km-copy">Copy</button>You are an expert GPU kernel engineer. Write a single correct, fast Triton kernel. Output ONLY one fenced python code block defining run(*inputs) and its @triton.jit kernel. Accumulate reductions in float32. No prose.</div>
</div>
<div>
<b>User template</b>
<div class="km-codeblock"><button class="km-copy">Copy</button>Operation: add_rmsnorm_gelu
Inputs: x, residual, weight. Each row is one transformer hidden state.
Reference: y = gelu(rmsnorm(x + residual, weight)).
Return: one fenced python block with imports, one @triton.jit kernel, and run(x, residual, weight).
Target: correct vs PyTorch first, then faster than torch.compile max-autotune.</div>
</div>
</div>
<div class="km-doc-chiprow">
<span class="km-doc-chip">name the op</span>
<span class="km-doc-chip">state tensor order</span>
<span class="km-doc-chip">require fp32 reductions</span>
<span class="km-doc-chip">forbid prose</span>
<span class="km-doc-chip">verify before trusting</span>
</div>
</article>
<article class="km-doc-panel">
<div class="km-doc-label">Prompt example A</div>
<h3>Residual RMSNorm plus GELU.</h3>
<div class="km-codeblock"><button class="km-copy">Copy</button>Write a fused Triton kernel for add_rmsnorm_gelu.
Inputs are x, residual, and weight, all CUDA tensors.
Compute RMSNorm over each row after x + residual, multiply by weight, then apply GELU.
Use fp32 accumulation for the row reduction.
Return exactly one fenced python code block with run(x, residual, weight).</div>
</article>
<article class="km-doc-panel">
<div class="km-doc-label">Prompt example B</div>
<h3>Stable softmax.</h3>
<div class="km-codeblock"><button class="km-copy">Copy</button>Write a fused Triton kernel for row-wise softmax.
Input x is a CUDA tensor shaped [M, N].
Use the stable max-subtraction form.
Return exactly one fenced python code block with run(x).
Do not include explanation text outside the code block.</div>
</article>
<article class="km-doc-panel">
<div class="km-doc-label">Prompt example C</div>
<h3>SwiGLU gate.</h3>
<div class="km-codeblock"><button class="km-copy">Copy</button>Write a fused Triton kernel for swiglu.
Inputs are gate and up tensors with the same shape.
Compute silu(gate) * up elementwise.
Return exactly one fenced python code block with run(gate, up).
Keep the launch grid simple and contiguous-row friendly.</div>
</article>
<article class="km-doc-panel wide">
<div class="km-doc-label">7. Verify a generated kernel</div>
<h3>Never score the model output by eye.</h3>
<p>Save the candidate as <code>candidate.py</code>, then send it to the referee. The result object reports correctness, latency, and speedups vs eager, torch.compile, and max-autotune.</p>
<div class="km-codeblock"><button class="km-copy">Copy</button>git clone https://github.com/ymrohit/ouroboros-kernelsmith.git
cd ouroboros-kernelsmith
python -m pip install torch triton numpy</div>
<div class="km-codeblock"><button class="km-copy">Copy</button>import pathlib
import sys
sys.path.insert(0, "referee")
from harness import evaluate_inprocess_full
kernel_src = pathlib.Path("candidate.py").read_text()
result = evaluate_inprocess_full(kernel_src, "add_rmsnorm_gelu", n_shapes=2, n_iters=30)
print(result.to_dict())</div>
<div class="km-doc-callout">A kernel is submission-grade only if <code>status</code> is <code>ok</code>, correctness is true, and the compiler baselines are present. The strongest number is <code>speedup_maxauto</code> because that compares against torch.compile max-autotune.</div>
</article>
<article class="km-doc-panel wide">
<div class="km-doc-label">8. What the referee can certify</div>
<h3>Known operations only, by design.</h3>
<p>The referee needs a PyTorch reference. It certifies norm and activation fusions plus named transformer operations. Unknown math can still be interesting, but this app will not label it a verified win until there is a reference spec.</p>
<div id="km-docs-ops" class="km-oplist"></div>
</article>
<article class="km-doc-panel wide">
<div class="km-doc-label">9. Glossary</div>
<div id="km-learn-blocks"></div>
</article>
<article class="km-doc-panel wide">
<div class="km-doc-label">Source and artifacts</div>
<p>MIT licensed code: <a href="https://github.com/ymrohit/ouroboros-kernelsmith">ymrohit/ouroboros-kernelsmith</a>. Models: <a href="https://huggingface.co/YMRohit/ouroboros-kernelsmith-minicpm5-1b">MiniCPM5-1B smith</a>, <a href="https://huggingface.co/YMRohit/ouroboros-kernelsmith-minicpm5-1b-GGUF">MiniCPM5-1B GGUF</a>, and <a href="https://huggingface.co/YMRohit/ouroboros-kernelsmith-qwen3.6-27b">Qwen3.6-27B smith</a>. Evidence: <a href="https://huggingface.co/datasets/YMRohit/ouroboros-kernel-corpus">verified kernel corpus</a>.</p>
</article>
</div>
</section>
</div>
</div>
"""
# Substitute the three __KM_*__ placeholders in the HTML template, one step per placeholder and
# in the same order as before: the replays JSON first, then lb_url() of each backend.
APP_HTML = APP_HTML.replace("__KM_REPLAYS__", REPLAYS_JSON)
APP_HTML = APP_HTML.replace("__KM_LB_ONE__", lb_url(ONE_B))
APP_HTML = APP_HTML.replace("__KM_LB_PRO__", lb_url(PRO))
APP_JS = r"""
() => {
// Activation catalogue for the composer. Each entry carries the picker label (lbl), the verb used in
// the stage caption, the help text (info) and a scalar preview function (f) that the in-browser
// graph applies to every value of the sample row.
const ACTS = {
  gelu:{lbl:'GELU',verb:'smoothly bends',info:'GELU is the gentle bend used inside many transformers. It lets a little of the negative side through.',f:x=>0.5*x*(1+Math.tanh(0.7978845608*(x+0.044715*x*x*x)))},
  silu:{lbl:'SiLU',verb:'swishes',info:'SiLU, also called Swish, is x times sigmoid(x). It is common in LLaMA style feed-forward blocks.',f:x=>x/(1+Math.exp(-x))},
  relu:{lbl:'ReLU',verb:'clips negatives',info:'ReLU keeps positive values and clips negative values to zero. It is fast, classic, and direct.',f:x=>Math.max(x,0)},
  tanh:{lbl:'Tanh',verb:'squashes to -1..1',info:'Tanh squashes each value into the range from -1 to 1. Large values flatten near the top and bottom.',f:x=>Math.tanh(x)},
  sigmoid:{lbl:'Sigmoid',verb:'squashes to 0..1',info:'Sigmoid squashes each value into the range from 0 to 1. It is often used for gates and probabilities.',f:x=>1/(1+Math.exp(-x))},
  relu2:{lbl:'ReLU2',verb:'clips then squares',info:'Squared ReLU clips negative values to zero, then squares the remaining positive values.',f:x=>{const r=Math.max(x,0);return r*r;}},
  // Exact GELU is 0.5*x*(1+erf(x/sqrt(2))). JavaScript has no Math.erf, so erf is evaluated inline
  // with the Abramowitz-Stegun 7.1.26 polynomial (absolute error below 1.5e-7) instead of reusing
  // the tanh approximation that the plain `gelu` entry already draws.
  gelu_erf:{lbl:'GELU exact',verb:'bends exactly',info:'Exact GELU uses the erf form of GELU. It behaves almost like the tanh approximation, but follows the exact formula.',f:x=>{
    const z=Math.abs(x)*Math.SQRT1_2;
    const t=1/(1+0.3275911*z);
    const poly=((((1.061405429*t-1.453152027)*t+1.421413741)*t-0.284496736)*t+0.254829592)*t;
    const erfAbs=1-poly*Math.exp(-z*z);
    // erf is odd: restore the sign that Math.abs removed.
    return 0.5*x*(1+(x<0?-erfAbs:erfAbs));
  }},
  leaky_relu:{lbl:'Leaky ReLU',verb:'leaks negatives',info:'Leaky ReLU is like ReLU, but negative values leak through at a small scale instead of becoming zero.',f:x=>x>0?x:0.01*x},
  elu:{lbl:'ELU',verb:'curves negatives',info:'ELU passes positive values through and curves negative values smoothly down toward -1.',f:x=>x>0?x:Math.exp(x)-1},
  mish:{lbl:'Mish',verb:'self gates',info:'Mish is a smooth self-gating activation, x times tanh of softplus(x).',f:x=>x*Math.tanh(Math.log(1+Math.exp(x)))},
};
// Row normalisations offered in the composer: picker label (lbl) plus help text (info).
const NORMS = {
rmsnorm:{lbl:'RMSNorm',info:'RMSNorm makes a row of values a consistent size by dividing by root mean square, then scaling.'},
layernorm:{lbl:'LayerNorm',info:'LayerNorm centers a row of values by subtracting the average, then rescales by the spread.'},
};
// Help text shown for the residual-add ("mem") slot.
const MEM = {info:'Residual add means adding the input back before the next operation. This is the skip path used in deep models.'};
// Named operations selectable when the composer is in 'classic' mode, keyed by the op name that is
// sent in the recipe's `classic` field.
const CLASSICS = {
softmax:{lbl:'Softmax',info:'Softmax turns a row of scores into probabilities that add to 1.'},
softmax_scale:{lbl:'Scaled softmax',info:'Scaled softmax applies a scale before softmax. It is the attention score step used before attention weights.'},
swiglu:{lbl:'SwiGLU',info:'SwiGLU is a gated feed-forward operation: SiLU on the gate multiplied by the up projection.'},
geglu:{lbl:'GeGLU',info:'GeGLU is a gated feed-forward operation like SwiGLU, but with a GELU gate.'},
rmsnorm:{lbl:'RMSNorm',info:'RMSNorm on its own, without an activation after it.'},
layernorm:{lbl:'LayerNorm',info:'LayerNorm on its own, without an activation after it.'},
};
// Fixed sample row that the preview graph transforms; its length sets the number of plotted points.
const INPUT = [0.6,-0.9,1.4,-0.3,0.2,-1.6,0.8,1.1,-0.5,0.0,1.7,-1.1,0.4,-0.2];
// Widget bootstrap. Bail out when the #km root is missing or was already initialised, so running
// this script again does not attach the listeners twice.
const root=document.getElementById('km');
if(!root||root.dataset.init)return;
root.dataset.init='1';
// Query helpers scoped to the widget root: $ returns the first match, $$ returns an array of matches.
const $=s=>root.querySelector(s), $$=s=>Array.from(root.querySelectorAll(s));
// Leaderboard endpoints read from the root's data attributes; empty values are dropped.
const LB_URLS=[root.dataset.lbOne,root.dataset.lbPro].filter(Boolean);
// cleanText: stringify (null/undefined become ''), turn en/em dashes into ' - ', the right arrow into
// '->', the multiplication sign into 'x', drop characters in U+2600-U+27BF and surrogate pairs,
// then collapse whitespace runs to one space and trim.
const cleanText=s=>String(s??'').replace(/\u2013|\u2014/g,' - ').replace(/\u2192/g,'->').replace(/\u00d7/g,'x').replace(/[\u2600-\u27BF]/g,'').replace(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g,'').replace(/\s+/g,' ').trim();
// esc: cleanText followed by HTML-escaping of & < > " ' so the result can be concatenated into innerHTML.
const esc=s=>cleanText(s).replace(/[&<>"']/g,c=>({'&':'&amp;','<':'&lt;','>':'&gt;','"':'&quot;',"'":'&#39;'}[c]));
// Replay data is parsed from the text of #km-replays-data; a missing node or invalid JSON yields {}.
let REPLAYS={};
try{REPLAYS=JSON.parse(($('#km-replays-data')||{}).textContent||'{}');}catch(e){REPLAYS={};}
// Display names for the replay keys this widget knows about.
const REPLAY_META={
rmsnorm_gelu:{label:'RMSNorm to GELU',short:'RMSNorm GELU'},
softmax:{label:'Softmax',short:'Softmax'},
add_layernorm_silu:{label:'Residual plus LayerNorm to SiLU',short:'Residual LayerNorm SiLU'}
};
// Preferred replay order, restricted to the keys that are actually present in REPLAYS.
const REPLAY_ORDER=['rmsnorm_gelu','softmax','add_layernorm_silu'].filter(k=>REPLAYS[k]);
// UI state: active tab, composer mode ('build' or 'classic'), the selected slots and the classic op.
const state={tab:'build',mode:'build',slots:{mem:false,norm:'rmsnorm',act:'gelu',act2:null},classic:'softmax'};
let buildTries=4, expertTries=5, resolvedExpert=null, lastMintedMode='build';
// Mint / leaderboard coordination: the nonce of the in-flight mint, whether a result is still
// awaited, whether a leaderboard refresh was deferred meanwhile, and the ids of the active timers.
let activeMintNonce=null, awaitingMint=false, pendingLBRefresh=false, lbRefreshTimer=null, lbPollTimer=null, lbUiTimer=null;
const HIST=[];
// Run an animation on el through its own animate() method; silently do nothing when the element is
// missing or does not provide animate().
function animate(el,frames,opts){
  if(!el||!el.animate)return;
  el.animate(frames,opts);
}
// Human-readable name for a recipe object: the classic op label in classic mode, otherwise the
// composed parts ("residual add", norm, activations) joined with ' + '. Unknown keys fall back to
// the raw key; a missing recipe yields 'unknown'.
function labelRecipe(r){
  if(!r)return 'unknown';
  if(r.mode==='classic'){
    return CLASSICS[r.classic]?.lbl||r.classic||'classic op';
  }
  const parts=r.residual?['residual add']:[];
  parts.push(NORMS[r.norm]?.lbl||r.norm);
  (r.acts||[]).forEach(name=>{
    parts.push(ACTS[name]?.lbl||name);
  });
  return parts.join(' + ');
}
// Describe the currently selected operation: the classic op's help text in classic mode, otherwise
// a sentence listing the residual choice, the norm and the one or two activations.
function machineInfo(){
  if(state.mode==='classic')return CLASSICS[state.classic].info;
  const {mem,norm,act,act2}=state.slots;
  const parts=[mem?'add input':'no residual add',NORMS[norm].lbl,ACTS[act].lbl];
  if(act2)parts.push(ACTS[act2].lbl);
  return 'Current operation: '+parts.join(' + ')+'. The model must fuse it into one verified kernel.';
}
// Show help text in #km-info with a short fade-in. A falsy t falls back to the description of the
// current operation.
function setInfo(t){
  const box=$('#km-info');
  if(!box)return;
  const text=t||machineInfo();
  box.textContent=cleanText(text);
  animate(box,[{opacity:.45},{opacity:1}],{duration:180,easing:'cubic-bezier(0.16,1,0.3,1)'});
}
// Size and slide the highlight pill so it sits under the active tab button. Does nothing while the
// tab strip, the pill or an active tab is missing.
function positionTabPill(){
  const strip=$('#km-tabs');
  const pill=$('#km-tab-pill');
  const active=$('.km-tab.on');
  if(!(strip&&pill&&active))return;
  const stripBox=strip.getBoundingClientRect();
  const activeBox=active.getBoundingClientRect();
  const offset=activeBox.left-stripBox.left-4;
  pill.style.width=activeBox.width+'px';
  pill.style.transform='translateX('+offset+'px)';
}
// Switch the visible page: mark the matching tab button, hide every other page, reposition the
// pill on the next frame, and run the per-tab hooks (expert resolution, leaderboard refresh and
// polling, which is switched off for every tab other than 'lb').
function showTab(tab){
  state.tab=tab;
  for(const btn of $$('.km-tab')){
    btn.classList.toggle('on',btn.dataset.tab===tab);
  }
  for(const page of $$('.km-page')){
    page.hidden=page.dataset.page!==tab;
  }
  requestAnimationFrame(positionTabPill);
  if(tab==='expert')resolveExpert();
  const onBoard=tab==='lb';
  if(onBoard)refreshLB(true);
  setLBPolling(onBoard);
}
// Clicking a tab button shows the page named in its data-tab attribute.
$$('.km-tab').forEach(b=>b.addEventListener('click',()=>showTab(b.dataset.tab)));
// Keep the highlight pill aligned with the active tab when the window is resized.
window.addEventListener('resize',positionTabPill);
// Help text for one option of a composer slot ('mem', 'norm', 'act', 'act2'). Unknown slots give ''.
function optionInfo(slot,key){
  switch(slot){
    case 'mem':
      return key==='add'?MEM.info:'No residual add. The kernel starts from the input row and does not add a skip value.';
    case 'norm':
      return NORMS[key].info;
    case 'act':
    case 'act2':
      return key==='none'?'No second activation. The pipeline stops after the first activation.':ACTS[key].info;
    default:
      return '';
  }
}
// Button caption for one option of a composer slot. Unknown slots echo the key unchanged.
function optionLabel(slot,key){
  switch(slot){
    case 'mem':
      return key==='add'?'add input':'none';
    case 'norm':
      return NORMS[key].lbl;
    case 'act':
    case 'act2':
      return key==='none'?'none':ACTS[key].lbl;
    default:
      return key;
  }
}
// Option key currently chosen for a slot, using the picker's vocabulary: the boolean residual flag
// maps to 'add'/'none' and an empty second activation maps to 'none'.
function selectedKey(slot){
  const slots=state.slots;
  if(slot==='mem')return slots.mem?'add':'none';
  return slot==='act2'?(slots.act2||'none'):slots[slot];
}
// Store a slot choice in state, update the highlighted button of that slot's picker, show the
// option's help text unless quiet is set, and redraw the preview.
function setSlot(slot,key,quiet){
  switch(slot){
    case 'mem':
      state.slots.mem=(key==='add');
      break;
    case 'act2':
      state.slots.act2=(key==='none'?null:key);
      break;
    default:
      state.slots[slot]=key;
  }
  const picker=root.querySelector('.km-pick[data-slot="'+slot+'"]');
  if(picker){
    const chosen=selectedKey(slot);
    picker.querySelectorAll('.km-opt').forEach(btn=>{
      btn.classList.toggle('on',btn.dataset.key===chosen);
    });
  }
  if(!quiet)setInfo(optionInfo(slot,key));
  render();
}
// Rebuild the option buttons of every .km-pick[data-slot] container. Hovering or focusing a button
// shows its help text; clicking it selects the option.
function renderPickers(){
  const keysFor=slot=>{
    switch(slot){
      case 'mem': return ['none','add'];
      case 'norm': return Object.keys(NORMS);
      case 'act': return Object.keys(ACTS);
      case 'act2': return ['none'].concat(Object.keys(ACTS));
      default: return [];
    }
  };
  for(const wrap of $$('.km-pick[data-slot]')){
    const slot=wrap.dataset.slot;
    wrap.innerHTML='';
    for(const key of keysFor(slot)){
      const describe=()=>setInfo(optionInfo(slot,key));
      const btn=document.createElement('button');
      btn.type='button';
      btn.className='km-opt';
      btn.dataset.key=key;
      btn.textContent=optionLabel(slot,key);
      btn.title=optionInfo(slot,key);
      btn.classList.toggle('on',key===selectedKey(slot));
      btn.addEventListener('mouseenter',describe);
      btn.addEventListener('focus',describe);
      btn.addEventListener('click',()=>setSlot(slot,key));
      wrap.appendChild(btn);
    }
  }
}
// Rebuild the classic-op buttons inside #km-classics. Clicking one stores it as the selected
// classic op, moves the highlight, shows its help text and redraws the preview.
function renderClassics(){
  const holder=$('#km-classics');
  if(!holder)return;
  holder.innerHTML='';
  for(const key of Object.keys(CLASSICS)){
    const entry=CLASSICS[key];
    const describe=()=>setInfo(entry.info);
    const btn=document.createElement('button');
    btn.type='button';
    btn.className='km-opt';
    btn.dataset.key=key;
    btn.textContent=entry.lbl;
    btn.title=entry.info;
    btn.classList.toggle('on',state.classic===key);
    btn.addEventListener('mouseenter',describe);
    btn.addEventListener('focus',describe);
    btn.addEventListener('click',()=>{
      state.classic=key;
      holder.querySelectorAll('.km-opt').forEach(other=>{
        other.classList.toggle('on',other===btn);
      });
      setInfo(entry.info);
      render();
    });
    holder.appendChild(btn);
  }
}
// Switch the composer between 'build' and 'classic': highlight the segment button, show only the
// matching panel, then refresh the help text and the preview.
function setComposerMode(mode){
  state.mode=mode;
  for(const seg of $$('.km-segb')){
    seg.classList.toggle('on',seg.dataset.mode===mode);
  }
  const panels=[[$('#km-compose'),'build'],[$('#km-classic'),'classic']];
  for(const [panel,owner] of panels){
    if(panel)panel.hidden=mode!==owner;
  }
  setInfo(machineInfo());
  render();
}
// Clicking a segment button switches the composer to the mode named in its data-mode attribute.
$$('.km-segb').forEach(b=>b.addEventListener('click',()=>setComposerMode(b.dataset.mode)));
// Fill the container `id` with buttons 1..8. Choosing a value calls onChange(value), mirrors it
// into `outId` when that element exists, and highlights the matching button. The initial value is
// applied immediately, so onChange also fires once during setup.
function setupStepper(id,outId,initial,onChange){
  const wrap=$(id), out=$(outId);
  if(!wrap)return;
  wrap.innerHTML='';
  const choose=value=>{
    onChange(value);
    if(out)out.textContent=String(value);
    wrap.querySelectorAll('button').forEach(btn=>{
      btn.classList.toggle('on',Number(btn.dataset.v)===value);
    });
  };
  Array.from({length:8},(_,idx)=>idx+1).forEach(step=>{
    const btn=document.createElement('button');
    btn.type='button';
    btn.dataset.v=String(step);
    btn.textContent=String(step);
    btn.addEventListener('click',()=>choose(step));
    wrap.appendChild(btn);
  });
  choose(initial);
}
// Mirror a temperature slider into its readout: the slider value divided by 100, with two decimals.
// The readout is optional; a missing slider makes this a no-op.
function setupTemp(id,outId){
  const slider=$(id), readout=$(outId);
  if(!slider)return;
  function sync(){
    if(!readout)return;
    readout.textContent=(+slider.value/100).toFixed(2);
  }
  slider.addEventListener('input',sync);
  sync();
}
// Keep a Local/Pro checkbox pair mutually exclusive: while Local is checked, Pro is unchecked and
// disabled; checking Pro unchecks Local.
function syncModePair(localSel,proSel){
  const local=$(localSel), pro=$(proSel);
  if(!(local&&pro))return;
  const sync=()=>{
    if(!local.checked){
      pro.disabled=false;
      return;
    }
    pro.checked=false;
    pro.disabled=true;
  };
  local.addEventListener('change',sync);
  pro.addEventListener('change',()=>{
    if(pro.checked)local.checked=false;
    sync();
  });
  sync();
}
// Preview graph geometry: a W x H drawing area with a PAD margin and one point per INPUT value.
const svg=$('#km-svg'), W=320, H=150, PAD=12, n=INPUT.length;
// Index -> x coordinate, spreading the n points evenly between the left and right padding.
const xs=i=>PAD+i*(W-2*PAD)/(n-1);
// Value -> y coordinate: clamp to [-2.2, 2.2]; 0 sits on the horizontal midline and larger values
// get smaller y (drawn higher up).
const ys=v=>{const t=Math.max(-2.2,Math.min(2.2,v));return H/2-t*(H/2-PAD)/2.2;};
// dots/line are the SVG nodes created by buildSvg, cur holds the values currently drawn, and anim is
// the id of the pending requestAnimationFrame callback used by morph.
let dots=[], line=null, cur=INPUT.slice(), anim=null;
// (Re)create the preview graph inside the SVG: a midline, the polyline and one circle per value,
// then paint the current values.
function buildSvg(){
  if(!svg)return;
  const NS='http://www.w3.org/2000/svg';
  const make=(tag,attrs)=>{
    const node=document.createElementNS(NS,tag);
    for(const [name,value] of attrs)node.setAttribute(name,value);
    return node;
  };
  svg.innerHTML='';
  svg.appendChild(make('line',[
    ['x1',0],['x2',W],['y1',H/2],['y2',H/2],
    ['stroke','#1f2d26'],['stroke-width','1']
  ]));
  line=make('polyline',[
    ['fill','none'],['stroke','#34e0a1'],['stroke-width','2'],['stroke-linejoin','round']
  ]);
  svg.appendChild(line);
  dots=cur.map((_,i)=>{
    const dot=make('circle',[['r','3.4'],['cx',xs(i)]]);
    svg.appendChild(dot);
    return dot;
  });
  paint(cur);
}
// Draw the values v: update the polyline points, then move each dot and colour it green for values
// >= 0 and red otherwise. No-op until buildSvg has created the polyline.
function paint(v){
  if(!line)return;
  const points=v.map((value,i)=>xs(i)+','+ys(value));
  line.setAttribute('points',points.join(' '));
  v.forEach((value,i)=>{
    const dot=dots[i];
    if(!dot)return;
    dot.setAttribute('cy',ys(value));
    dot.setAttribute('fill',value>=0?'#34e0a1':'#ff7a7a');
  });
}
// Cubic ease-out: 0 -> 0, 1 -> 1, fast at the start and slowing toward the end.
const ease=t=>{
  const remaining=1-t;
  return 1-Math.pow(remaining,3);
};
// Animate the graph from the values currently drawn to `to` over `ms` milliseconds with the
// ease-out curve. A running animation is cancelled first; the last frame snaps cur to a copy of `to`.
function morph(to,ms=480){
  if(!line)return;
  const start=cur.slice();
  const began=performance.now();
  if(anim)cancelAnimationFrame(anim);
  const tick=now=>{
    const progress=Math.min(1,(now-began)/ms);
    const eased=ease(progress);
    cur=start.map((from,i)=>from+(to[i]-from)*eased);
    paint(cur);
    if(progress<1){
      anim=requestAnimationFrame(tick);
    }else{
      cur=to.slice();
    }
  };
  anim=requestAnimationFrame(tick);
}
// Normalise a row for the preview. 'layernorm' subtracts the mean and divides by
// sqrt(variance + 1e-5); any other kind divides by the root mean square, sqrt(mean(x^2) + 1e-6).
// Returns a new array and leaves the input untouched.
function normalize(a,kind){
  const count=a.length;
  let sum=0;
  for(const x of a)sum+=x;
  const mean=sum/count;
  if(kind==='layernorm'){
    let spread=0;
    for(const x of a)spread+=(x-mean)*(x-mean);
    const variance=spread/count;
    return a.map(x=>(x-mean)/Math.sqrt(variance+1e-5));
  }
  let squares=0;
  for(const x of a)squares+=x*x;
  const rms=Math.sqrt(squares/count+1e-6);
  return a.map(x=>x/rms);
}
// Preview values for a classic op applied to the sample row. The softmax variants are rescaled
// (x4, -0.5) after normalising; swiglu/geglu multiply the gate activation by a stand-in factor
// derived from |x|. Unknown keys return a copy of the input.
function classicViz(key){
  const row=INPUT.slice();
  if(key.startsWith('softmax')){
    const scale=key==='softmax_scale'?0.7:1;
    const scaled=row.map(x=>x*scale);
    const peak=Math.max(...scaled);
    const exps=scaled.map(x=>Math.exp(x-peak));
    const total=exps.reduce((a,b)=>a+b,0);
    return exps.map(x=>x/total*4-0.5);
  }
  if(key==='rmsnorm'||key==='layernorm'){
    return normalize(row,key);
  }
  if(key==='swiglu'){
    return row.map(x=>(x/(1+Math.exp(-x)))*(0.7+Math.abs(x)*0.3));
  }
  if(key==='geglu'){
    return row.map(x=>ACTS.gelu.f(x)*(0.7+Math.abs(x)*0.3));
  }
  return row;
}
// List the preview stages for the current selection as {l: caption, v: values}. Classic mode has
// two stages (input, op); build mode adds one stage per enabled step.
function pipeline(){
  const stages=[{l:'input',v:INPUT.slice()}];
  if(state.mode==='classic'){
    stages.push({l:CLASSICS[state.classic].lbl,v:classicViz(state.classic)});
    return stages;
  }
  const {mem,norm,act,act2}=state.slots;
  let row=INPUT.slice();
  if(mem){
    // The preview has no second tensor, so the residual is a shifted, half-scaled copy of the input.
    row=row.map((x,i)=>x+INPUT[(i+3)%n]*0.5);
    stages.push({l:'residual add',v:row.slice()});
  }
  row=normalize(row,norm);
  stages.push({l:NORMS[norm].lbl,v:row.slice()});
  row=row.map(ACTS[act].f);
  stages.push({l:ACTS[act].lbl+' '+ACTS[act].verb,v:row.slice()});
  if(act2){
    row=row.map(ACTS[act2].f);
    stages.push({l:ACTS[act2].lbl+' second pass',v:row.slice()});
  }
  return stages;
}
// True while a stage-by-stage playback is running; play() ignores calls made during that time.
let playing=false;
// Step through the pipeline stages, one every 680 ms: update the stage caption with a short
// fade-in and morph the graph to that stage's values.
function play(){
  if(playing)return;
  playing=true;
  const stages=pipeline();
  const caption=$('#km-stage-label');
  let idx=0;
  const advance=()=>{
    if(idx>=stages.length){
      playing=false;
      return;
    }
    const current=stages[idx];
    if(caption){
      caption.textContent=cleanText(current.l);
      animate(caption,[{opacity:0,transform:'translateY(4px)'},{opacity:1,transform:'none'}],{duration:200,easing:'cubic-bezier(0.16,1,0.3,1)'});
    }
    morph(current.v);
    idx+=1;
    setTimeout(advance,680);
  };
  advance();
}
// Jump the preview to the final pipeline stage (caption plus a 360 ms morph) and refresh the
// replay offer for the new selection.
function render(){
  const stages=pipeline();
  const last=stages[stages.length-1];
  const caption=$('#km-stage-label');
  if(caption)caption.textContent=cleanText(last.l);
  morph(last.v,360);
  updateReplayOffer();
}
// Collect the Build tab's settings into the recipe object sent to Python. Local wins over Pro when
// both boxes are checked; temperature is the slider value / 100 (70 when the slider is missing).
function buildRecipe(){
  const tempEl=$('#km-temp'), proEl=$('#km-pro'), localEl=$('#km-local'), nameEl=$('#km-name');
  const isChecked=el=>!!(el&&el.checked);
  const local=isChecked(localEl);
  const {mem,norm,act,act2}=state.slots;
  const sliderValue=tempEl?tempEl.value:70;
  return {
    mode:state.mode,
    norm:norm,
    residual:!!mem,
    acts:act2?[act,act2]:[act],
    classic:state.classic,
    spark:'',
    k:buildTries,
    temp:(+sliderValue)/100,
    pro:!local&&isChecked(proEl),
    local,
    name:nameEl?nameEl.value:''
  };
}
// Recipe for the Expert tab: a copy of the resolved recipe overlaid with the tab's attempt count,
// temperature, Local/Pro choice, the fixed name 'expert' and the brief text as `spark`. Returns
// null while nothing is resolved or the resolution is flagged unknown.
function expertRecipe(){
  if(!resolvedExpert||resolvedExpert.unknown)return null;
  const tempEl=$('#km-xtemp'), proEl=$('#km-xpro'), localEl=$('#km-xlocal'), briefEl=$('#km-xbrief');
  const isChecked=el=>!!(el&&el.checked);
  const local=isChecked(localEl);
  const sliderValue=tempEl?tempEl.value:70;
  return Object.assign({},resolvedExpert.recipe,{
    k:expertTries,
    temp:(+sliderValue)/100,
    pro:!local&&isChecked(proEl),
    local:local,
    name:'expert',
    spark:briefEl?briefEl.value:''
  });
}
// Recipe for the active tab: the expert recipe on the Expert tab when one is available, otherwise
// the Build tab recipe.
function currentRecipe(){
  if(state.tab!=='expert')return buildRecipe();
  return expertRecipe()||buildRecipe();
}
// Hand a recipe to the Python side: write it as JSON (plus a fresh nonce) into the #km_recipe
// textarea, fire an 'input' event on it, and click #km_go 80 ms later. Also marks a mint as in
// flight (activeMintNonce / awaitingMint). Returns silently when the textarea is not in the page.
function bridge(recipe){
const inp=document.querySelector('#km_recipe textarea');
if(!inp)return;
// _n is a nonce: it makes the recipe JSON differ on every mint so Gradio always registers a
// value change and re-runs do_mint. Without it, minting the same recipe twice in a row left
// the textarea value unchanged and the second submit was dropped (the "needs a double-click"
// bug). The backend echoes it back so stale blobs from the previous run cannot repaint the old
// result over the new loading card while Gradio is preparing the next response.
const nonce=Date.now().toString(36)+'-'+Math.random().toString(36).slice(2);
const payload=Object.assign({},recipe,{_n:nonce});
activeMintNonce=nonce;
awaitingMint=true;
// The value is written through the prototype's native setter rather than a plain assignment;
// NOTE(review): presumably so the host framework's value tracking notices the change - confirm.
const set=Object.getOwnPropertyDescriptor(window.HTMLTextAreaElement.prototype,'value').set;
set.call(inp,JSON.stringify(payload));
inp.dispatchEvent(new Event('input',{bubbles:true}));
// NOTE: we intentionally do NOT dispatch 'change' here. The leaderboard fetch is triggered
// separately via refreshLB() so it never competes with do_mint in Gradio's queue.
// The click is delayed by 80 ms after the input event; the button may be #km_go itself or a
// <button> nested inside it.
setTimeout(()=>{
const g=document.querySelector('#km_go button')||document.querySelector('#km_go');
if(g)g.click();
},80);
}
// Show a leaderboard status line. kind 'bad' or 'warn' turns on the matching CSS class; any other
// kind clears both.
function setLBStatus(text,kind){
  const status=$('#km-lb-status');
  if(!status)return;
  status.textContent=text;
  for(const tone of ['bad','warn']){
    status.classList.toggle(tone,kind===tone);
  }
}
// Put the leaderboard Refresh button into (on) or out of (off) its busy state: disabled with a
// 'Refreshing...' caption while busy.
function setLBButton(on){
  const button=$('#km-lb-refresh');
  if(!button)return;
  button.disabled=on;
  if(on){
    button.textContent='Refreshing...';
  }else{
    button.textContent='Refresh';
  }
}
// Fallback leaderboard load straight from the browser, bypassing the Gradio bridge.
// All endpoints in LB_URLS are fetched concurrently; their rows are merged in LB_URLS order and
// de-duplicated on (op, name, ts, speedup_compile, speedup_eager) before being handed to renderLB.
// `sources` lists every endpoint that answered with a usable payload.
// Throws only when NO endpoint could be reached or parsed: a reachable but empty leaderboard is a
// valid (empty) result, not a fetch failure.
async function refreshLBDirect(){
  const answers=await Promise.all(LB_URLS.map(async url=>{
    try{
      const res=await fetch(url,{cache:'no-store'});
      if(!res.ok)return null;
      const data=await res.json();
      return {url,rows:Array.isArray(data&&data.rows)?data.rows:[]};
    }catch(e){
      // Best effort: a blocked, failed or non-JSON endpoint is skipped; the others still count.
      return null;
    }
  }));
  const reached=answers.filter(Boolean);
  if(!reached.length)throw new Error('direct leaderboard fetch failed');
  const rows=[], seen=new Set(), sources=[];
  reached.forEach(answer=>{
    sources.push(answer.url);
    answer.rows.forEach(row=>{
      // Ignore malformed entries instead of failing the whole refresh.
      if(!row||typeof row!=='object')return;
      const key=[row.op,row.name,row.ts,row.speedup_compile,row.speedup_eager].join('|');
      if(seen.has(key))return;
      seen.add(key);
      rows.push(row);
    });
  });
  renderLB(rows,{fetched_at:Date.now()/1000,sources});
}
// Ask for a leaderboard refresh through the hidden Gradio button (#km_lb_btn).
// manual=true marks a user-initiated refresh (tab opened / Refresh pressed) and always shows
// progress in the UI; background calls only show progress while the Leaderboard tab is open.
function refreshLB(manual=false){
// While a mint is in flight the refresh is only recorded as pending, not run.
if(awaitingMint){
pendingLBRefresh=true;
if(manual)setLBStatus('Queued until the current mint finishes.','warn');
return;
}
if(manual||state.tab==='lb'){
setLBButton(true);
setLBStatus('Checking leaderboard...','');
// Arm (or re-arm) an 8 s fallback that runs the direct browser fetch; if that fails too, the button
// is released and a warning is shown.
// NOTE(review): nothing in this function cancels the timer after a successful bridge response;
// presumably the code that renders the leaderboard clears lbUiTimer - confirm.
if(lbUiTimer)clearTimeout(lbUiTimer);
lbUiTimer=setTimeout(()=>{
refreshLBDirect().catch(()=>{
setLBButton(false);
setLBStatus('Still waiting for the Gradio refresh bridge. Direct browser fetch was blocked. Try once more or reopen the tab.','warn');
});
},8000);
}
// The bridge button may be #km_lb_btn itself or a <button> nested inside it.
const btn=document.querySelector('#km_lb_btn button')||document.querySelector('#km_lb_btn');
if(btn)btn.click();
else{
setLBButton(false);
setLBStatus('Refresh bridge is not mounted yet. Reopen the Leaderboard tab.','bad');
}
}
// Run a background leaderboard refresh after `delay` ms (700 by default). Clears the pending flag
// and replaces any refresh that was already scheduled.
function scheduleLBRefresh(delay=700){
  pendingLBRefresh=false;
  if(lbRefreshTimer){
    clearTimeout(lbRefreshTimer);
  }
  lbRefreshTimer=setTimeout(function(){
    refreshLB(false);
  },delay);
}
// Start or stop the 15 s leaderboard poll. Any existing interval is cleared first; each tick only
// refreshes while the Leaderboard tab is the active one.
function setLBPolling(on){
  if(lbPollTimer){
    clearInterval(lbPollTimer);
    lbPollTimer=null;
  }
  if(!on)return;
  lbPollTimer=setInterval(()=>{
    if(state.tab==='lb')refreshLB(false);
  },15000);
}
// Id of the pending safety timeout that re-enables the Mint buttons, or null when none is armed.
let mintSafetyTimer=null;
// Enable or disable both Mint buttons (#km-build, #km-xbuild) and toggle their 'minting' class.
// Turning minting on arms a 4 minute safety timeout; any call clears a timeout armed earlier.
function setMinting(on){
  ['#km-build','#km-xbuild'].forEach(id=>{
    const b=$(id);
    if(b){
      b.disabled=on;
      b.classList.toggle('minting',on);
    }
  });
  if(mintSafetyTimer){
    clearTimeout(mintSafetyTimer);
    mintSafetyTimer=null;
  }
  if(!on)return;
  // Bulletproof re-enable: showResult() clears this, but if a result never arrives (hung backend)
  // the button must not stay dead. 4 min is longer than any real mint.
  mintSafetyTimer=setTimeout(()=>{
    // The mint is presumed dead, so also drop the in-flight markers. Otherwise refreshLB() keeps
    // answering "Queued until the current mint finishes." until some later mint completes.
    awaitingMint=false;
    activeMintNonce=null;
    setMinting(false);
    // Run the leaderboard refresh that was deferred while the mint was pending.
    if(pendingLBRefresh)scheduleLBRefresh();
  },240000);
}
// Display name for a replay key: the curated label when there is one, else the cleaned `op` of the
// recorded replay, else the key itself.
function replayName(key){
  const meta=REPLAY_META[key];
  if(meta&&meta.label)return meta.label;
  const recorded=REPLAYS[key];
  return (recorded&&cleanText(recorded.op))||key;
}
// Key of the recorded replay that matches the current selection exactly, or null. Matches exist for
// classic softmax, plain rmsnorm+gelu, and residual layernorm+silu, and only when that replay was
// actually shipped in REPLAYS.
function matchingReplayKey(){
  if(state.mode==='classic'){
    return state.classic==='softmax'&&REPLAYS.softmax?'softmax':null;
  }
  if(state.mode!=='build')return null;
  const {mem,norm,act,act2}=state.slots;
  if(act2)return null;
  if(!mem&&norm==='rmsnorm'&&act==='gelu'&&REPLAYS.rmsnorm_gelu)return 'rmsnorm_gelu';
  if(mem&&norm==='layernorm'&&act==='silu'&&REPLAYS.add_layernorm_silu)return 'add_layernorm_silu';
  return null;
}
// Point the replay button at the replay matching the current selection (or the first available
// one), adjust its caption accordingly, and highlight the same key among the replay picks.
function updateReplayOffer(){
  const button=$('#km-replay');
  if(!button||!REPLAY_ORDER.length)return;
  const matched=matchingReplayKey();
  const key=matched||REPLAY_ORDER[0];
  button.dataset.replayKey=key;
  button.textContent=matched?'Watch this recorded mint':'Watch a recorded mint';
  for(const pick of $$('.km-rpick')){
    pick.classList.toggle('on',pick.dataset.replayKey===key);
  }
}
// Render one pick button per available replay (short name plus its speedup vs torch.compile) into
// #km-replay-picks, hiding the container when there are no replays. Uses sp3, a const declared
// further down in this script, so it must only be called after that declaration has run.
function renderReplayChoices(){
  const holder=$('#km-replay-picks');
  if(!holder)return;
  if(!REPLAY_ORDER.length){
    holder.hidden=true;
    return;
  }
  holder.hidden=false;
  const buttons=REPLAY_ORDER.map(key=>{
    const caption=(REPLAY_META[key]&&REPLAY_META[key].short)||replayName(key);
    return "<button class='km-rpick' data-replay-key='"+esc(key)+"'>"+esc(caption)+" <span>"+sp3(REPLAYS[key].speedup_compile)+"x</span></button>";
  });
  holder.innerHTML=buttons.join('');
  updateReplayOffer();
}
// Number formatters for result cards; each returns '?' when the value is not a finite number.
// sp: one decimal, with a trailing '.0' removed.
const sp=x=>{
  const num=+x;
  if(!Number.isFinite(num))return '?';
  return num.toFixed(1).replace(/\.0$/,'');
};
// sp2: always two decimals.
const sp2=x=>{
  const num=+x;
  return Number.isFinite(num)?num.toFixed(2):'?';
};
// sp3: up to three decimals, with trailing zeros and a dangling decimal point removed.
const sp3=x=>{
  const num=+x;
  if(!Number.isFinite(num))return '?';
  return num.toFixed(3).replace(/0+$/,'').replace(/\.$/,'');
};
// isOk: true when a status reads 'ok' or 'pass', ignoring case.
const isOk=s=>{
  const status=String(s||'').toLowerCase();
  return status==='ok'||status==='pass';
};
// Summarise the attempts of a mint result. With a statuses array: passed = number of ok/pass
// entries and total = its length. Without one: passed falls back to n_verified (or 0) and total to
// k, then to passed, then to 0.
function attemptSummary(d){
  const statuses=Array.isArray(d.statuses)?d.statuses:[];
  const verified=+d.n_verified, requested=+d.k;
  let passed;
  if(statuses.length){
    passed=statuses.filter(isOk).length;
  }else{
    passed=Number.isFinite(verified)?verified:0;
  }
  const fallbackTotal=Number.isFinite(requested)?requested:(passed||0);
  const total=statuses.length||fallbackTotal;
  return {statuses,passed,total};
}
// HTML for the per-attempt chips of a result (OK / FAIL, attempt number, escaped status), limited
// to the first `limit` attempts when a limit is given. Returns '' when there are no statuses.
function attemptChips(d,limit){
  const all=attemptSummary(d).statuses;
  const shown=all.slice(0,limit??all.length);
  if(!shown.length)return '';
  const chips=shown.map((status,idx)=>{
    const ok=isOk(status);
    return "<span class='km-attempt "+(ok?'ok':'bad')+"'><b>"+(ok?'OK':'FAIL')+"</b> attempt "+(idx+1)+", "+esc(status)+"</span>";
  });
  return "<div class='km-attempts'>"+chips.join('')+"</div>";
}
// Look up the result elements for a tab: the verdict container, the collapsible source panel and
// the code node inside it. 'expert' selects the Expert tab's elements; anything else the Build tab's.
function target(mode){
  const expert=mode==='expert';
  return {
    verdict:$(expert?'#km-xverdict':'#km-verdict'),
    source:$(expert?'#km-xsource':'#km-source'),
    code:$(expert?'#km-xsource .km-code':'#km-code')
  };
}
// Hide the kernel source panel of the given tab and empty its code node (each only if present).
function clearSource(mode){
  const {source:panel,code}=target(mode);
  if(panel){
    panel.hidden=true;
  }
  if(code){
    code.textContent='';
  }
}
// Show the kernel source of a result in the tab's collapsed source panel, captioned as recorded or
// live. A result without source clears the panel instead.
function setSource(d,kind,mode){
  const {source:panel,code}=target(mode);
  if(!panel||!code)return;
  if(!d.source){
    clearSource(mode);
    return;
  }
  panel.hidden=false;
  panel.open=false;
  const summary=panel.querySelector('summary');
  if(summary){
    summary.textContent=kind==='recorded'?'Recorded Triton kernel source':'Live Triton kernel source';
  }
  code.textContent=d.source;
}
// True when an error message contains one of the keywords this widget treats as "backend still
// waking up" (case-insensitive). Empty or missing messages give false.
function wakingError(msg){
  const text=String(msg||'');
  const pattern=/waking|wake|timeout|timed out|still|unexpected|http|modal|loading/i;
  return pattern.test(text);
}
function showResult(d,opts={}){
if(!opts.recorded&&awaitingMint&&String(d&&d._n||'')!==String(activeMintNonce||'')){
return;
}
if(!opts.recorded&&awaitingMint){
awaitingMint=false;
activeMintNonce=null;
}
const mode=opts.mode||lastMintedMode||'build';
stopBuild();
stopReplay();
if(!opts.recorded) setMinting(false); // result arrived -> re-enable Mint (replays don't disable it)
const t=target(mode), v=t.verdict;
if(!v)return;
clearSource(mode);
if(d.error){
const local=!!d.local;
const wake=!local&&wakingError(d.error);
const replayKey=matchingReplayKey()||REPLAY_ORDER[0]||'';
const replayBtn=replayKey?"<button class='km-inline-replay' data-replay-key='"+esc(replayKey)+"'>Watch a recorded mint while it wakes</button>":"";
v.innerHTML="<div class='km-card err'>"+
"<div class='km-result-top'><div class='km-verdict-icon bad'>!</div><div><span class='km-run-label'>"+(local?'local offline mint':'live mint')+"</span><b>"+(wake?'The live model is still waking':(local?'The local mint did not finish':'The live mint did not finish'))+"</b></div></div>"+
"<p class='km-dim'>"+(wake?'The backend scales to zero, so the first live mint can take about 90 seconds. The replay below is a real earlier mint, not a mockup.':(local?esc(d.error):'Backend said: '+esc(d.error)))+"</p>"+
(wake?replayBtn+"<p class='km-rawerr'>Backend said: "+esc(d.error)+"</p>":replayBtn)+"</div>";
if(pendingLBRefresh)scheduleLBRefresh();
return;
}
const a=attemptSummary(d);
if(!d.verified){
v.innerHTML="<div class='km-card miss'>"+
"<div class='km-result-top'><div class='km-verdict-icon bad'>NO</div><div><span class='km-run-label'>referee verdict</span><b>Not verified yet</b></div></div>"+
"<div class='km-ref-summary'><b>"+a.passed+" of "+(a.total||'?')+"</b> attempts passed the referee</div>"+
attemptChips(d)+"<p class='km-dim'>"+esc(d.explanation||'Press Mint again, raise attempts, or change the operation.')+"</p></div>";
if(pendingLBRefresh)scheduleLBRefresh();
return;
}
const recorded=!!opts.recorded, local=!!d.local;
// Headline the HONEST baseline (the compiler), not the inflated vs-eager fusion number.
// Strongest available wins: max-autotune > torch.compile default > eager.
const se=+d.speedup_eager, sc=+d.speedup_compile, sm=+d.speedup_maxauto;
const hasC=Number.isFinite(sc)&&sc>0, hasM=Number.isFinite(sm)&&sm>0;
const heroSpeed=hasM?sm:(hasC?sc:se);
const heroLabel=hasM?'vs torch.compile max-autotune':(hasC?'vs torch.compile':'vs PyTorch eager');
const missingLocalCompiler=local&&!recorded&&!hasC&&!hasM;
const compilerGuard=missingLocalCompiler
? "<div class='km-baseline-guard'>Compiler baselines did not return for this Local run. Re-mint; a certified Local result should include torch.compile and max-autotune next to eager.</div>"
: "";
const runLabel=local?'local offline mint':(recorded?'recorded mint':'live mint');
const noteText=recorded?'recorded backend run':(local?'local offline run':'live backend run');
const note="<span class='km-pill-note "+(recorded?'recorded':(local?'local':'hit'))+"'>"+noteText+"</span>";
const board=d.beat_champion?"<span class='km-crown'>27B champion beaten</span>":d.on_leaderboard?"<span class='km-board'>leaderboard result</span>":"";
v.innerHTML="<div class='km-card ok km-result-card'>"+note+board+
"<div class='km-result-top'><div class='km-verdict-icon'>OK</div><div><span class='km-run-label'>"+runLabel+"</span><b>Verified by the referee</b><span>"+esc(d.op||'kernel')+"</span></div></div>"+
"<div class='km-ref-summary'><b>"+a.passed+" of "+(a.total||'?')+"</b> attempts passed the referee</div>"+
attemptChips(d)+
compilerGuard+
// Every comparison as its own big number, strongest (most honest) baseline first, eager last
// and labelled 'unfused' so the side-by-side is self-explanatory: the model's real edge is
// the ~1.1x over the compiler; the big eager number is mostly the fusion win.
"<div class='km-stats km-cmp'>"+
(hasM?"<div class='km-cmp-key'><big>"+sp3(sm)+"x</big><span>vs torch.compile max-autotune</span></div>":"")+
(hasC?"<div"+(hasM?"":" class='km-cmp-key'")+"><big>"+sp3(sc)+"x</big><span>vs torch.compile</span></div>":"")+
"<div><big>"+sp(se)+"x</big><span>vs PyTorch eager (unfused)</span></div>"+
"<div><big>"+(Number.isFinite(+d.gen_seconds)?sp(d.gen_seconds)+'s':'?')+"</big><span>generation</span></div></div>"+
"<p class='km-dim'>"+esc(d.explanation||'')+"</p></div>";
animate(v.firstElementChild,[{transform:'scale(0.96)',opacity:0},{transform:'scale(1)',opacity:1}],{duration:300,easing:'cubic-bezier(0.16,1,0.3,1)'});
setSource(d,recorded?'recorded':'live',mode);
if(!recorded&&mode==='build'){
HIST.unshift("<code>"+esc(d.op||'kernel')+"</code><span>"+sp(d.speedup_maxauto||d.speedup_compile||d.speedup_eager)+"x vs compiler</span>");
const hist=$('#km-hist');
if(hist)hist.innerHTML=HIST.slice(0,10).map(h=>"<div class='km-hrow'>"+h+"</div>").join('');
}
// The leaderboard refresh is deliberately decoupled from the mint call. Once the result has
// arrived, the queue is free again, so refresh the board without hiding the just-rendered result.
if(!recorded)scheduleLBRefresh(900);
}
// Handle of the pending recorded-replay timeout, or null when no replay step is scheduled.
let replayTimer=null;
// Cancel the pending replay step (if any) and forget its handle.
function stopReplay(){
  if(!replayTimer)return;
  clearTimeout(replayTimer);
  replayTimer=null;
}
// Play back the recorded mint stored at REPLAYS[key] inside the verdict area of the active tab.
// One attempt chip is revealed per timer tick; after the last chip the recorded payload is
// handed to showResult with recorded:true. Does nothing when the key has no recording or the
// verdict element is missing.
function startReplay(key){
const d=REPLAYS[key];
if(!d)return;
// The replay is rendered for the 'expert' tab only when that tab is active; anything else maps to 'build'.
lastMintedMode=state.tab==='expert'?'expert':'build';
// Cancel a running build animation and any earlier replay before taking over the verdict area.
stopBuild();
stopReplay();
play();
const t=target(lastMintedMode), v=t.verdict;
if(!v)return;
clearSource(lastMintedMode);
const statuses=Array.isArray(d.statuses)?d.statuses:[];
// Attempt count: the recorded status list if present, else d.k, else 4.
const total=statuses.length||(Number.isFinite(+d.k)?+d.k:4);
// Passed count: statuses accepted by isOk, else d.n_verified, else 0.
const passed=statuses.length?statuses.filter(isOk).length:(Number.isFinite(+d.n_verified)?+d.n_verified:0);
v.innerHTML="<div class='km-card building replaying'>"+
"<div class='km-result-top'><div class='km-verdict-icon'>RUN</div><div><span class='km-run-label recorded'>recorded mint</span><b>"+esc(replayName(key))+"</b><span>This is a genuine earlier backend run. Mint still starts a live run.</span></div></div>"+
"<div class='km-replay-stage' id='km-replay-stage'>loading recorded attempts...</div>"+
"<div class='km-attempts' id='km-replay-attempts'></div>"+
"<div class='km-bar'><i id='km-replay-fill'></i></div>"+
"<p class='km-dim'>The model took "+(Number.isFinite(+d.gen_seconds)?sp(d.gen_seconds)+'s':'real time')+" on this recorded run. The replay shows the referee loop without waiting for cold start.</p></div>";
const att=$('#km-replay-attempts'), stage=$('#km-replay-stage'), fill=$('#km-replay-fill');
let i=0;
// One step of the replay. While attempts remain it appends a chip and re-arms itself after
// 360 ms; afterwards it shows the final stage text and schedules showResult 420 ms later.
const tick=()=>{
if(i<total){
// With no recorded status list, the first `passed` attempts are shown as ok and the rest as not verified.
const st=statuses[i]||(i<passed?'ok':'not verified'), ok=isOk(st);
if(stage)stage.textContent='referee verdict for attempt '+(i+1)+' of '+total;
if(att)att.insertAdjacentHTML('beforeend',"<span class='km-attempt "+(ok?'ok':'bad')+"'><b>"+(ok?'OK':'FAIL')+"</b> attempt "+(i+1)+", "+esc(st)+"</span>");
// The bar is scaled over total+1 steps, so it only reaches 100% on the final (selection) step below.
if(fill)fill.style.width=Math.round(((i+1)/(total+1))*100)+'%';
i++;
replayTimer=setTimeout(tick,360);
return;
}
if(stage)stage.textContent='best verified kernel selected';
if(fill)fill.style.width='100%';
replayTimer=setTimeout(()=>showResult(d,{recorded:true,mode:lastMintedMode,replayKey:key}),420);
};
// First tick fires after a short 180 ms lead-in.
replayTimer=setTimeout(tick,180);
}
// Handle of the build-progress interval, or null when no build animation is running.
let buildTimer=null;
// Render the "building" card for `mode` ('build' or 'expert') and start the progress animation
// that runs until stopBuild() is called.
//   mode   - which verdict area to draw into (resolved through target(mode)).
//   recipe - the recipe being minted; reads recipe.k (attempt count, default 4) and recipe.local.
// Does nothing beyond cancelling timers when the verdict element is missing.
function startBuild(mode,recipe){
  stopReplay();
  // A previous build's interval may still be running if stopBuild() was never reached. Clear it
  // here, otherwise reassigning buildTimer below would orphan it and it could never be stopped.
  if(buildTimer){clearInterval(buildTimer);buildTimer=null;}
  const t=target(mode), v=t.verdict;
  if(!v)return;
  clearSource(mode);
  const k=recipe.k||4, started=Date.now(), replayKey=mode==='build'?(matchingReplayKey()||REPLAY_ORDER[0]||''):(REPLAY_ORDER[0]||'');
  const local=!!recipe.local;
  // Phase labels shown one after another; the local and remote paths use different wording.
  const stages=local?['loading local GGUF...','llama.cpp drafting '+Math.min(k,2)+' kernels...','compiling Triton on this Space GPU...','checking correctness vs PyTorch...','timing vs torch.compile max-autotune...']:
    ['model waking...','drafting '+k+' kernels...','compiling Triton...','checking correctness vs PyTorch...','timing vs torch.compile max-autotune...'];
  const stepHtml=stages.map((s,i)=>"<span class='"+(i===0?'on':'')+"'>"+esc(s)+"</span>").join('');
  v.innerHTML="<div class='km-card building'><b class='km-bstage' id='km-bstage'>"+esc(stages[0])+"</b>"+
    "<div class='km-build-steps' id='km-build-steps'>"+stepHtml+"</div>"+
    "<div class='km-bar'><i id='km-barfill'></i></div>"+
    "<span class='km-dim'>"+(local?"Local mode drafts up to two kernels with llama.cpp on this Space's GPU, then verifies them with the in-process referee and times them against PyTorch eager, torch.compile, and max-autotune. First run may also download/cache the GGUF.":"The model drafts up to "+k+" kernels. The referee compiles, correctness-checks, and times every candidate. First build of a session can take about 90 seconds while the model wakes.")+"</span>"+
    (replayKey?"<button class='km-inline-replay' data-replay-key='"+esc(replayKey)+"'>Watch a recorded mint while this runs</button>":"")+"</div>";
  // Honest progress: do_mint is one blocking call with no progress signal, so we cannot show a
  // live readout. Instead we walk the referee's real phases ONCE on a rough estimate, monotonically
  // (the index only ever increases), and park on the final phase until the result arrives. It never
  // loops back or resets the way the old modulo version did. A gentle pulse keeps the viz alive.
  const stageStart=[0,12,18,23,28]; // seconds at which each phase is assumed to begin
  // Baseline values for the pulse; if pipeline() is unavailable or throws, the pulse is skipped.
  let baseVals=null; try{const st=pipeline();baseVals=st[st.length-1].v.slice();}catch(e){}
  buildTimer=setInterval(()=>{
    const elapsed=(Date.now()-started)/1000;
    // Latest phase whose assumed start time has passed, clamped to the last label.
    let ix=0; for(let j=0;j<stageStart.length;j++){if(elapsed>=stageStart[j])ix=j;}
    ix=Math.min(ix,stages.length-1);
    // After 45 s on the final phase, switch to a "still ..." label so the wait does not look frozen.
    const label=(ix===stages.length-1&&elapsed>45)?'still timing vs torch.compile max-autotune...':stages[ix];
    const el=$('#km-bstage');
    if(el&&el.textContent!==label){el.textContent=label;animate(el,[{opacity:.35,transform:'translateY(3px)'},{opacity:1,transform:'none'}],{duration:220,easing:'cubic-bezier(0.16,1,0.3,1)'});}
    $$('#km-build-steps span').forEach((s,j)=>{s.classList.toggle('on',j===ix);s.classList.toggle('done',j<ix);});
    const f=$('#km-barfill');
    // The bar creeps forward with elapsed time and is capped at 94% until stopBuild() completes it.
    if(f)f.style.width=Math.min(94,(Date.now()-started)/900)+'%';
    if(baseVals){const p=0.96+0.04*Math.sin(elapsed*1.5);morph(baseVals.map(x=>x*p),650);}
  },650);
}
// Stop the build-progress interval, snap the progress bar to full width, and redraw via render().
function stopBuild(){
  if(buildTimer){
    clearInterval(buildTimer);
    buildTimer=null;
  }
  const barFill=$('#km-barfill');
  if(barFill){
    barFill.style.width='100%';
  }
  render();
}
// Draw the leaderboard table into #km-lb-view and update the "last checked" status line.
//   rows - leaderboard entries; reads op, name, champion, speedup_compile, speedup_eager.
//   meta - optional fetch metadata; reads fetched_at (seconds since epoch) and sources (array).
// Only the first 25 rows are rendered. Does nothing when the view element is missing.
function renderLB(rows,meta={}){
  const view=$('#km-lb-view');
  if(!view)return;
  // A result has arrived: cancel the pending UI timer and re-enable the refresh button.
  if(lbUiTimer){
    clearTimeout(lbUiTimer);
    lbUiTimer=null;
  }
  setLBButton(false);
  const fetchedAt=Number(meta.fetched_at||0);
  let stamp='just now';
  if(fetchedAt){
    stamp=new Date(fetchedAt*1000).toLocaleTimeString([], {hour:'2-digit', minute:'2-digit', second:'2-digit'});
  }
  const endpoints=Array.isArray(meta.sources)?meta.sources.length:0;
  let origin='.';
  if(endpoints){
    origin=' from '+endpoints+' endpoint'+(endpoints>1?'s':'')+'.';
  }
  setLBStatus('Last checked '+stamp+origin,'');
  if(!rows.length){
    view.innerHTML='<i>No kernels yet. Be the first.</i>';
    return;
  }
  const rowHtml=(r,i)=>
    "<tr class='"+(r.champion?'champ':'')+"'>"+
    "<td class='km-rank'>"+(i+1)+"</td>"+
    "<td><code>"+esc(r.op||'kernel')+"</code></td>"+
    "<td>"+esc(r.name||'anonymous')+"</td>"+
    "<td class='hot'>"+sp2(r.speedup_compile)+"x</td>"+
    "<td>"+sp(r.speedup_eager)+"x</td></tr>";
  const head="<table class='km-lbt'><thead><tr><th>#</th><th>machine</th><th>builder</th><th>vs compiler</th><th>vs PyTorch</th></tr></thead><tbody>";
  view.innerHTML=head+rows.slice(0,25).map(rowHtml).join('')+"</tbody></table>";
}
// Watch the element matching selector `id` for payloads and pass each decoded payload to `cb`.
// A payload is JSON, base64-encoded in the data-blob attribute of a descendant element. The
// element is read once immediately and again after every child-list mutation inside it.
// Does nothing when the selector matches no element.
function watch(id,cb){
  const el=document.querySelector(id);
  if(!el)return;
  // atob() yields one character per byte, so non-ASCII JSON text would come out garbled.
  // Re-decode the bytes as UTF-8; if they are not valid UTF-8 (or TextDecoder is unavailable),
  // keep the plain byte string so ASCII-only payloads behave exactly as before.
  const decode=b64=>{
    const bin=atob(b64);
    if(typeof TextDecoder==='undefined')return bin;
    try{
      return new TextDecoder('utf-8',{fatal:true}).decode(Uint8Array.from(bin,c=>c.charCodeAt(0)));
    }catch(e){
      return bin;
    }
  };
  const read=()=>{
    const s=el.querySelector('[data-blob]');
    if(!s||!s.dataset.blob)return;
    try{
      cb(JSON.parse(decode(s.dataset.blob)));
    }catch(e){
      // Still best-effort (never rethrow into the observer), but no longer silent: a malformed
      // payload or an exception inside the callback is now visible in the console.
      console.warn('[kernel-mint] could not handle payload from '+id,e);
    }
  };
  new MutationObserver(read).observe(el,{childList:true,subtree:true});
  read();
}
// Attach the two bridge watchers 250 ms after this script runs: mint results go to showResult,
// leaderboard payloads to renderLB (with an empty row list when the payload has no rows).
// NOTE(review): the delay presumably gives Gradio time to mount #km_result and #km_lb; confirm.
setTimeout(()=>{watch('#km_result',showResult);watch('#km_lb',d=>renderLB(d.rows||[],d));},250);
// Manual refresh button for the leaderboard, wired only when the button exists.
const rf=$('#km-lb-refresh');
if(rf)rf.addEventListener('click',()=>refreshLB(true));
// Phrase tables used by resolveBrief() through termMatches(). Each entry maps a recipe key to
// the lower-case phrases that select it. termMatches() matches on word boundaries and drops a
// match that overlaps an earlier or longer one, which is how 'scaled softmax' wins over 'softmax'.
// Named ops: when any of these matches, resolveBrief() returns a classic recipe for the first hit.
const CLASSIC_TERMS=[
{key:'softmax_scale',terms:['scaled softmax','scale softmax','attention scale']},
{key:'swiglu',terms:['swiglu','swi glu']},
{key:'geglu',terms:['geglu','ge glu']},
{key:'softmax',terms:['softmax']}
];
// Activations: resolveBrief() collects up to two distinct keys from these, in order of appearance.
const ACT_TERMS=[
{key:'gelu_erf',terms:['gelu_erf','exact gelu','erf gelu']},
{key:'relu2',terms:['relu2','relu squared','squared relu','square relu']},
{key:'leaky_relu',terms:['leaky_relu','leaky relu','leaky']},
{key:'silu',terms:['silu','swish','swishes']},
{key:'sigmoid',terms:['sigmoid']},
{key:'gelu',terms:['gelu']},
{key:'relu',terms:['relu']},
{key:'tanh',terms:['tanh']},
{key:'elu',terms:['elu']},
{key:'mish',terms:['mish']}
];
// Find which items are mentioned in `text`.
//   text  - the brief to scan (callers pass it lower-cased; matching is case-sensitive).
//   items - array of {key, terms:[...]} phrase tables.
// Returns at most one {key, ix, len} per item, ordered by position in the text, where ix is the
// start index and len the length of the phrase that was accepted for that item.
function termMatches(text,items){
  // word-boundary match so 'elu' does not fire inside 'gelu', 'relu' not inside 'leaky_relu', etc.
  // (underscores and digits are word chars, so \b correctly keeps relu2 and gelu_erf distinct.)
  //
  // Every occurrence of every term is collected, not just the first one per item. Otherwise an
  // item whose first occurrence sits inside a longer phrase ('relu' inside 'leaky relu') would
  // be discarded even when it also appears on its own later in the text.
  const hits=[];
  items.forEach(item=>{
    item.terms.forEach(term=>{
      if(!term)return; // an empty term would match everywhere with zero length
      const re=new RegExp('\\b'+term.replace(/[.*+?^${}()|[\]\\]/g,'\\$&')+'\\b','g');
      let m;
      while((m=re.exec(text))!==null){
        hits.push({key:item.key,ix:m.index,len:term.length});
      }
    });
  });
  // Earliest first; at the same start the longest phrase wins.
  hits.sort((a,b)=>a.ix-b.ix||b.len-a.len);
  // drop a shorter match that overlaps a longer one ('relu' inside 'leaky relu', 'softmax'
  // inside 'scaled softmax'), keeping the first/longest at each span. Accepted spans are tracked
  // separately from the result so that a repeated key still blocks overlaps but is reported once.
  const spans=[], kept=[], seen=new Set();
  hits.forEach(h=>{
    if(spans.some(k=>h.ix<k.ix+k.len&&k.ix<h.ix+h.len))return;
    spans.push(h);
    if(!seen.has(h.key)){seen.add(h.key);kept.push(h);}
  });
  return kept;
}
// Turn a free-text brief into a recipe.
// Returns {unknown:false,label,recipe} when the brief maps to a supported operation, or
// {unknown:true,message} when it does not. Resolution order: a named op (CLASSIC_TERMS) wins
// outright; otherwise a norm, an optional residual and up to two activations are combined.
function resolveBrief(text){
  const raw=text||'';
  const brief=raw.toLowerCase();
  const named=termMatches(brief,CLASSIC_TERMS);
  if(named.length){
    const classic=named[0].key;
    return {unknown:false,label:CLASSICS[classic].lbl,recipe:{mode:'classic',norm:'rmsnorm',residual:false,acts:[],classic:classic,spark:raw}};
  }
  // LayerNorm is checked first, so it wins when both norms are mentioned.
  let normHit=null;
  if(brief.includes('layernorm')||brief.includes('layer norm')){
    normHit='layernorm';
  }else if(brief.includes('rmsnorm')||brief.includes('rms norm')||/\brms\b/.test(brief)){
    normHit='rmsnorm';
  }
  const residual=/residual|resid|\bskip\b|add input|\+ input/.test(brief);
  // Up to two distinct activations, in the order they appear in the brief.
  const acts=[];
  for(const hit of termMatches(brief,ACT_TERMS)){
    if(acts.length<2&&!acts.includes(hit.key)){
      acts.push(hit.key);
    }
  }
  const hasActs=acts.length>0;
  if(!normHit&&!hasActs){
    return {unknown:true,message:'Could not map this brief. Use RMSNorm, LayerNorm, residual, and a single supported activation, or a named op such as softmax, scaled softmax, SwiGLU, or GeGLU.'};
  }
  if(!hasActs){
    // A bare norm with no residual is available as a classic op of the same name.
    if(normHit&&!residual){
      return {unknown:false,label:CLASSICS[normHit].lbl,recipe:{mode:'classic',norm:normHit,residual:false,acts:[],classic:normHit,spark:raw}};
    }
    return {unknown:true,message:'This brief needs a supported activation after the residual or normalization so the referee can verify it.'};
  }
  // Activations without an explicit norm fall back to RMSNorm.
  const recipe={mode:'build',norm:normHit||'rmsnorm',residual,acts,classic:'softmax',spark:raw};
  return {unknown:false,label:labelRecipe(recipe),recipe};
}
// Re-resolve the expert brief textarea and reflect the outcome in the UI: the resolved label
// (or the failure message) is shown in #km-xresolved, and the expert Mint button is disabled
// while the brief cannot be mapped. The result is stored in resolvedExpert.
function resolveExpert(){
  const box=$('#km-xbrief');
  const panel=$('#km-xresolved');
  const valueEl=panel?panel.querySelector('.v'):null;
  const mintBtn=$('#km-xbuild');
  if(!box||!panel||!valueEl)return;
  resolvedExpert=resolveBrief(box.value);
  const isUnknown=!!resolvedExpert.unknown;
  panel.classList.toggle('unknown',isUnknown);
  if(resolvedExpert.unknown){
    valueEl.textContent=resolvedExpert.message;
  }else{
    valueEl.textContent=resolvedExpert.label;
  }
  if(mintBtn){
    mintBtn.disabled=isUnknown;
  }
}
// Operation tags rendered by renderOpTags(). Each entry is [button label, brief text]; in the
// interactive list, clicking a tag writes the brief text into the expert brief box. The fixed
// entries are followed by one generated tag per key in ACTS, using that activation's label.
const OP_TAGS=[
['softmax','softmax over each row'],
['scaled_softmax','scaled softmax over each row'],
['swiglu','SwiGLU gated feed forward'],
['geglu','GeGLU gated feed forward'],
['rmsnorm','RMSNorm only'],
['layernorm','LayerNorm only'],
['add_rmsnorm_gelu','fused RMSNorm with residual and GELU'],
['layernorm_silu','fused LayerNorm with SiLU'],
['rmsnorm_mish','fused RMSNorm with Mish'],
['add_layernorm_silu','fused LayerNorm with residual and SiLU'],
['rmsnorm_gelu_relu2','fused RMSNorm with GELU then squared ReLU']
].concat(Object.keys(ACTS).map(k=>[k,ACTS[k].lbl+' activation']));
// Fill the container matching `sel` with one button per OP_TAGS entry.
//   sel         - selector of the container; nothing happens when it is missing.
//   interactive - when truthy, clicking a tag copies its brief into #km-xbrief, fires an
//                 'input' event on the box so listeners re-resolve it, and focuses the box.
function renderOpTags(sel,interactive){
  const host=$(sel);
  if(!host)return;
  host.innerHTML='';
  for(const [label,brief] of OP_TAGS){
    const tag=document.createElement('button');
    tag.type='button';
    tag.className='km-optag';
    tag.textContent=label;
    tag.dataset.brief=brief;
    if(interactive){
      tag.addEventListener('click',()=>{
        const box=$('#km-xbrief');
        if(!box)return;
        box.value=brief;
        box.dispatchEvent(new Event('input',{bubbles:true}));
        box.focus();
      });
    }
    host.appendChild(tag);
  }
}
// Render the glossary into #km-learn-blocks: the residual entry first, then every norm,
// activation and classic op, each as an escaped label/description pair.
function renderGlossary(){
  const host=$('#km-learn-blocks');
  if(!host)return;
  const entries=[['Residual',MEM.info]];
  [NORMS,ACTS,CLASSICS].forEach(group=>{
    Object.values(group).forEach(v=>{
      entries.push([v.lbl,v.info]);
    });
  });
  let html='';
  for(const [name,desc] of entries){
    html+="<div class='km-gloss'><b>"+esc(name)+"</b><span>"+esc(desc)+"</span></div>";
  }
  host.innerHTML=html;
}
// Wire every .km-copy button to copy the text of its enclosing .km-codeblock (minus the button's
// own label) to the clipboard, then flash a short status on the button for 900 ms.
function setupCopyButtons(){
  $$('.km-copy').forEach(btn=>{
    // Capture the idle label ONCE. Reading it at click time meant a second click within 900 ms
    // saw 'Copied' as the label to restore, leaving the button stuck on 'Copied'.
    const idleLabel=btn.textContent;
    let resetTimer=null;
    const flash=label=>{
      btn.textContent=label;
      // Only one pending reset at a time, so rapid clicks extend the flash instead of racing.
      if(resetTimer)clearTimeout(resetTimer);
      resetTimer=setTimeout(()=>{btn.textContent=idleLabel;resetTimer=null;},900);
    };
    btn.addEventListener('click',()=>{
      const block=btn.closest('.km-codeblock');
      if(!block)return;
      // Work on a clone so the copy button's own text can be stripped without touching the page.
      const clone=block.cloneNode(true), copy=clone.querySelector('.km-copy');
      if(copy)copy.remove();
      const text=clone.textContent.trim();
      // Report failure honestly: the write can be rejected (for example by a permissions policy
      // in an embedded frame) or the Clipboard API can be missing altogether.
      if(navigator.clipboard&&navigator.clipboard.writeText){
        navigator.clipboard.writeText(text).then(()=>flash('Copied'),()=>flash('Copy failed'));
      }else{
        flash('Copy failed');
      }
    });
  });
}
// Delegated click handler for replay buttons. The buttons are injected through innerHTML, so
// they are caught here on the root via their data-replay-key attribute instead of being wired
// one by one. Keys with no recording in REPLAYS are ignored.
root.addEventListener('click',ev=>{
  const trigger=ev.target.closest('[data-replay-key]');
  if(!trigger||!root.contains(trigger))return;
  const replayKey=trigger.dataset.replayKey;
  if(REPLAYS[replayKey]){
    ev.preventDefault();
    startReplay(replayKey);
  }
});
// Re-resolve the expert brief on every edit of the textarea.
const xb=$('#km-xbrief');
if(xb)xb.addEventListener('input',resolveExpert);
// (second-activation builder removed: the kernelsmith grammar is norm + one activation)
// Build-tab Mint button: build the recipe from the pickers, lock the UI with setMinting(true),
// show the build card, then hand the recipe to bridge().
// NOTE(review): bridge() is defined elsewhere; presumably it forwards the recipe to the hidden
// Gradio components. Confirm there.
const buildBtn=$('#km-build');
if(buildBtn)buildBtn.addEventListener('click',()=>{
lastMintedMode='build';
const r=buildRecipe();
animate(buildBtn,[{transform:'scale(0.97)'},{transform:'scale(1)'}],{duration:140,easing:'cubic-bezier(0.16,1,0.3,1)'});
play();
setMinting(true);
startBuild('build',r);
bridge(r);
});
// Expert-tab Mint button: same flow, but the brief is re-resolved first and the click is
// ignored when expertRecipe() yields nothing. Unlike the build handler it does not call play().
const xBuild=$('#km-xbuild');
if(xBuild)xBuild.addEventListener('click',()=>{
resolveExpert();
const r=expertRecipe();
if(!r)return;
lastMintedMode='expert';
animate(xBuild,[{transform:'scale(0.97)'},{transform:'scale(1)'}],{duration:140,easing:'cubic-bezier(0.16,1,0.3,1)'});
setMinting(true);
startBuild('expert',r);
bridge(r);
});
// Attempt steppers for the build and expert pages; the callbacks store the chosen value in
// buildTries / expertTries.
// NOTE(review): the numeric argument (4 and 5) is presumably the initial or default attempt
// count; confirm against setupStepper.
setupStepper('#km-tries','#km-tries-v',4,v=>{buildTries=v;});
setupStepper('#km-xtries','#km-xtries-v',5,v=>{expertTries=v;});
// Temperature controls, one per page.
setupTemp('#km-temp','#km-temp-v');
setupTemp('#km-xtemp','#km-xtemp-v');
// Pair each page's Local toggle with its Pro toggle.
// NOTE(review): presumably makes the two mutually exclusive; confirm against syncModePair.
syncModePair('#km-local','#km-pro');
syncModePair('#km-xlocal','#km-xpro');
// The build and expert pages each have their own "Local (offline)" toggle. Local mode is the ONLY
// path that computes the max-autotune number, so a page in Modal mode shows just 2 comparisons.
// Keep both toggles in lockstep so build and expert always run the same way and show the same
// set of comparison numbers (the guard stops the change events from ping-ponging).
(function(){
const a=$('#km-local'), b=$('#km-xlocal');
if(!a||!b)return;
// Mirror the state and re-dispatch 'change' on the other toggle so its own listeners run too;
// the inequality check makes the echoed event a no-op.
a.addEventListener('change',()=>{ if(b.checked!==a.checked){ b.checked=a.checked; b.dispatchEvent(new Event('change')); } });
b.addEventListener('change',()=>{ if(a.checked!==b.checked){ a.checked=b.checked; a.dispatchEvent(new Event('change')); } });
})();
// Initial setup calls, run once in this order when the widget script executes.
renderPickers();
renderClassics();
renderReplayChoices();
// The expert page gets clickable op tags; the docs page gets the same list as static labels.
renderOpTags('#km-oplist',true);
renderOpTags('#km-docs-ops',false);
renderGlossary();
setupCopyButtons();
buildSvg();
setInfo(machineInfo());
// Resolve whatever the expert brief box already contains so its Mint button state is correct.
resolveExpert();
// Start on the build tab, then draw.
showTab('build');
render();
}
"""
with gr.Blocks(title="Kernel Mint", head=f"<style>{CSS}</style>") as demo:
    # The whole interactive surface is one custom HTML widget.
    gr.HTML(APP_HTML)

    # Bridge components between the custom JS and Python. They must stay in the DOM: with
    # visible=False Gradio removes them entirely, so the JS could not reach the inner
    # textarea/button. They are hidden with CSS instead (#km_recipe etc.).
    recipe_in = gr.Textbox(elem_id="km_recipe")
    result_out = gr.HTML(elem_id="km_result")
    lb_out = gr.HTML(elem_id="km_lb")
    go = gr.Button("go", elem_id="km_go")
    lb_btn = gr.Button("lb", elem_id="km_lb_btn")

    # trigger_mode="multiple": the default ("once") silently DROPS a click that lands while a
    # previous mint is still running, so minting one kernel after another needed two clicks and
    # the old result lingered. The frontend also disables the Mint button during a mint (one at
    # a time, no wasted ZeroGPU calls); this is the belt-and-suspenders so no click is ever
    # swallowed.
    go.click(
        fn=do_mint,
        inputs=recipe_in,
        outputs=result_out,
        trigger_mode="multiple",
    )

    # Leaderboard refresh is its OWN trigger, decoupled from minting. Previously fetch_lb was
    # wired to recipe_in.change, so every mint (which sets recipe_in) also fired a leaderboard
    # fetch; on a slow backend that fetch sat ahead of do_mint in Gradio's queue and the mint
    # appeared to do nothing until a second click. Now the JS clicks #km_lb_btn only when it
    # actually wants the board.
    lb_btn.click(fn=fetch_lb, inputs=None, outputs=lb_out)

    # On page load: fetch the board once, then run the widget script in the browser.
    demo.load(fn=fetch_lb, inputs=None, outputs=lb_out)
    demo.load(fn=None, inputs=None, outputs=None, js=APP_JS)
if __name__ == "__main__":
    # Listen on all interfaces; the port comes from $PORT and falls back to 7860.
    port = int(os.environ.get("PORT", 7860))
    demo.launch(server_name="0.0.0.0", server_port=port)