Spaces:

build-small-hackathon
/

ouroboros-kernel-mint

Running on Zero

App Files Files Community

ouroboros-kernel-mint / app.py

YMRohit

Make leaderboard refresh visible

281cd85 18 days ago

Raw

History Blame Contribute Delete

90.7 kB

	"""OUROBOROS Kernel Mint: a GPU-kernel demo (Gradio + custom JS).

	Compose a fused operation, and a 1B model writes a real Triton kernel for it while an
	immutable referee checks correctness against PyTorch and times it against torch.compile. The
	whole interactive surface is a custom JS widget in one gr.HTML; the mint runs through Python
	(no CORS) and hits the 1B model (default) or the 27B (Pro mode).
	"""
	from __future__ import annotations

	import base64
	import json
	import os
	import re
	import sys
	import time

	import gradio as gr
	import requests

	# `spaces.GPU` only matters on ZeroGPU, where the GPU is granted lazily per call. On a dedicated
	# GPU Space (e.g. L4) or locally the GPU is always attached, so the decorator must be a plain
	# pass-through. Gate on the env var HF sets only on ZeroGPU hardware, so the same app.py runs
	# unchanged on ZeroGPU, a paid GPU, and a laptop.
	try:
	    if not os.environ.get("SPACES_ZERO_GPU"):
	        raise ImportError("not on ZeroGPU: use the no-op GPU shim")
	    import spaces
	except Exception:
	    class _SpacesShim:
	        """No-op stand-in for the `spaces` package when not running on ZeroGPU."""

	        @staticmethod
	        def GPU(*_args, **_kwargs):
	            """Return the decorated function unchanged.

	            Accepts both call styles: the parameterised form
	            `@spaces.GPU(duration=...)` (any positional/keyword arguments are ignored)
	            and the bare form `@spaces.GPU`, where the function itself is the only
	            positional argument.
	            """
	            # Bare decorator: the single positional argument is the function to wrap.
	            if len(_args) == 1 and callable(_args[0]) and not _kwargs:
	                return _args[0]

	            def _decorator(fn):
	                return fn
	            return _decorator

	    spaces = _SpacesShim()

	# Remote mint endpoints: the default 1B backend and the Pro backend. Both can be overridden
	# through the environment.
	ONE_B = os.environ.get(
	    "BACKEND_URL", "https://ymrohit--ouroboros-kernel-mint-mint-mint.modal.run")
	PRO = os.environ.get(
	    "BACKEND_PRO_URL", "https://ymrohit--ouroboros-kernel-mint-pro-mint-mint.modal.run")

	# Directories bundled next to this file.
	ROOT = os.path.dirname(__file__)
	REFEREE = os.path.join(ROOT, "referee")
	SEED_DIR = os.path.join(ROOT, "seed_kernels")

	# GGUF repositories tried in order by the local path: the fine-tuned model, then the base model.
	LOCAL_FINE_TUNED_REPO = os.environ.get(
	    "LOCAL_GGUF_REPO", "YMRohit/ouroboros-kernelsmith-minicpm5-1b-GGUF")
	LOCAL_BASE_REPO = os.environ.get("LOCAL_GGUF_FALLBACK_REPO", "openbmb/MiniCPM5-1B-GGUF")

	# Quantisation names in preference order; blank entries in the comma-separated list are dropped.
	LOCAL_QUANT_PREFS = tuple(
	    name
	    for name in (
	        part.strip()
	        for part in os.environ.get(
	            "LOCAL_GGUF_QUANTS", "Q5_K_M,Q6_K,Q4_K_M,Q8_0,F16,BF16").split(",")
	    )
	    if name
	)

	# Upper bound on the number of kernels drafted per local mint.
	LOCAL_MAX_ATTEMPTS = int(os.environ.get("LOCAL_MAX_ATTEMPTS", "2"))

	# Process-wide cache for the llama.cpp model and the GGUF path it was loaded from.
	_LOCAL_LLM = None
	_LOCAL_LLM_PATH = None


	def _ensure_referee_path():
	    """Put the bundled `referee` directory at the front of `sys.path`, at most once."""
	    if REFEREE in sys.path:
	        return
	    sys.path.insert(0, REFEREE)


	# System prompt sent as the first chat message of every local (llama.cpp) mint.
	LOCAL_SYS = ("You are an expert GPU kernel engineer. Write a single correct, fast Triton "
	"kernel. Output ONLY one fenced python code block defining `run(*inputs)` and "
	"its @triton.jit kernel. Accumulate reductions in float32. No prose.")

	# The model was trained (rl_kernelsmith.py Proposer.prompt) with a real per-op SEED kernel as the
	# style guide, and for almost every op that exemplar is rmsnorm's ROW-WISE reduction. We must hand
	# the model that SAME exemplar at inference, or it goes off-distribution and writes flat elementwise
	# code for reduction ops (wrong results). Seeds are bundled in ./seed_kernels; this inline copy is
	# only a fallback, returned by _load_seed_kernel when a seed file cannot be opened.
	# NOTE(review): as it appears here the kernel text below carries no indentation, so it is not
	# valid Python as written; confirm against the file on disk before relying on this fallback.
	_FALLBACK_EXEMPLAR = """# GOLD seed kernel: fused RMSNorm, one row per program, fp32 accumulation.
	@triton.jit
	def _rmsnorm_kernel(x_ptr, w_ptr, y_ptr, stride, N, eps, BLOCK: tl.constexpr):
	row = tl.program_id(0)
	x_ptr += row * stride
	y_ptr += row * stride
	acc = tl.zeros([BLOCK], dtype=tl.float32)
	for off in range(0, N, BLOCK):
	cols = off + tl.arange(0, BLOCK)
	x = tl.load(x_ptr + cols, mask=cols < N, other=0.0).to(tl.float32)
	acc += x * x
	rms = tl.rsqrt(tl.sum(acc) / N + eps)
	for off in range(0, N, BLOCK):
	cols = off + tl.arange(0, BLOCK)
	mask = cols < N
	x = tl.load(x_ptr + cols, mask=mask, other=0.0).to(tl.float32)
	w = tl.load(w_ptr + cols, mask=mask, other=0.0).to(tl.float32)
	tl.store(y_ptr + cols, (x * rms * w), mask=mask)


	def run(x, w):
	M, N = x.shape
	y = torch.empty_like(x)
	_rmsnorm_kernel[(M,)](x, w, y, x.stride(0), N, 1e-6, BLOCK=1024)
	return y
	"""


	def _load_seed_kernel(op: str) -> str:
	    """Return the bundled seed-kernel source for `op`.

	    Reads `<SEED_DIR>/<op>.py` as UTF-8 (explicit, so the result does not depend on the
	    platform's default encoding). If the file cannot be opened or read, the inline
	    `_FALLBACK_EXEMPLAR` is returned instead of raising.
	    """
	    seed_path = os.path.join(SEED_DIR, f"{op}.py")
	    try:
	        with open(seed_path, encoding="utf-8") as f:
	            return f.read()
	    except OSError:
	        # Missing or unreadable seed file: fall back to the inline exemplar.
	        return _FALLBACK_EXEMPLAR


	def extract_kernel(text: str) -> str:
	    """Pull a fenced Python kernel out of a llama.cpp completion.

	    The body of the first ``` fence (with or without a `python` tag) is used; when the
	    text has no complete fence, the whole text is used. Anything before the first
	    recognisable code marker (`@triton`, `import `, `def run`, `def _`) is dropped and
	    the result is stripped. `None` or empty input yields an empty string.
	    """
	    raw = text or ""
	    # `\s*` skips the newline after the opening fence; the lazy `(.*?)` with re.S captures the
	    # whole multi-line body up to the first closing fence.
	    fenced = re.search(r"```(?:python)?\s*(.*?)```", raw, re.S)
	    body = fenced.group(1) if fenced else raw
	    positions = (body.find(marker) for marker in ("@triton", "import ", "def run", "def _"))
	    start = min((pos for pos in positions if pos >= 0), default=0)
	    return body[start:].strip()


	def _local_op_from_recipe(r: dict) -> tuple[str, str]:
	    """Translate a UI recipe into an `(op, label)` pair for the local referee.

	    In "classic" mode the named op is returned as both op and label, and a `ValueError`
	    is raised when it is not in `SPECS`. Otherwise the recipe describes a norm, an
	    optional residual add and up to three activations: a single-activation recipe that
	    matches a named entry in `SPECS` returns that name, anything else returns a
	    `chain...` descriptor string.
	    """
	    _ensure_referee_path()
	    from specs import SPECS

	    if r.get("mode") == "classic":
	        classic_op = (r.get("classic") or "softmax").strip()
	        if classic_op in SPECS:
	            return classic_op, classic_op
	        raise ValueError(f"unknown op '{classic_op}'")

	    is_rms = str(r.get("norm", "rmsnorm")).startswith("rms")
	    normfull = "rmsnorm" if is_rms else "layernorm"
	    norm = "rms" if is_rms else "layer"
	    residual = bool(r.get("residual"))
	    # Keep at most three non-empty activations; default to a single gelu.
	    acts = [a for a in (r.get("acts") or []) if a][:3] or ["gelu"]

	    label = "".join((
	        "residual + " if residual else "",
	        "RMSNorm" if is_rms else "LayerNorm",
	        " -> ",
	        " -> ".join(acts),
	    ))
	    named = ("add_" if residual else "") + normfull + "_" + acts[0]
	    if len(acts) == 1 and named in SPECS:
	        return named, label
	    # NOTE(review): `\|` is not a recognised escape, so the backslash is kept in the string;
	    # local_mint tests for the same prefix. Confirm whether a bare `|` separator is intended.
	    return f"chain\|{norm}\|{'1' if residual else '0'}\|{','.join(acts)}", label


	def _local_prompt_messages(op: str, spark: str = "") -> list[dict[str, str]]:
	    """Build the system + user chat messages asking the model for a kernel for `op`.

	    The user message contains the op's notes and signature hint from its spec, a seed
	    kernel for a different op as a style guide, and, when `spark` is non-empty, the
	    user's own idea.
	    """
	    _ensure_referee_path()
	    from specs import get_spec, SPECS

	    spec = get_spec(op)
	    # Same exemplar rule as training (per the original author): take the first op in SPECS
	    # iteration order that is not `op` itself; fall back to `op` if there is no other.
	    exemplar_op = next((name for name in SPECS if name != op), op)
	    exemplar = _load_seed_kernel(exemplar_op)

	    pieces = [
	        f"Op `{op}`: {spec.notes}\nSignature:\n{spec.signature_hint}\n\n",
	        f"Here is a valid Triton kernel for a DIFFERENT op (`{exemplar_op}`) as a style guide:\n",
	        f"```python\n{exemplar}\n```\n",
	    ]
	    if spark:
	        pieces.append(f"\nThe person building this kernel added their own idea: \"{spark}\". ")
	        pieces.append("Honour their idea where you can, but correctness is mandatory.")

	    return [
	        {"role": "system", "content": LOCAL_SYS},
	        {"role": "user", "content": "".join(pieces)},
	    ]


	def _render_prompt(messages: list[dict[str, str]]) -> str:
	# MiniCPM5 uses ChatML (<\|im_start\|>/<\|im_end\|>). It is a reasoning model, so we append the
	# empty <think></think> block (the enable_thinking=False convention) to make it answer
	# directly with the kernel instead of emitting a reasoning trace. Verified against the
	# tokenizer's apply_chat_template(enable_thinking=False) output.
	out = []
	for m in messages:
	out.append(f"<\|im_start\|>{m['role']}\n{m['content']}<\|im_end\|>\n")
	out.append("<\|im_start\|>assistant\n<think>\n\n</think>\n\n")
	return "".join(out)


	def _common_gguf_names() -> list[str]:
	    """List conventional GGUF filenames to guess, ordered by quantisation preference.

	    For each quantisation in `LOCAL_QUANT_PREFS` and each known filename stem, both the
	    `stem-QUANT.gguf` and `stem.QUANT.gguf` spellings are produced, in that order.
	    """
	    stems = ("MiniCPM5-1B", "minicpm5-1b", "model", "ggml-model")
	    return [
	        f"{stem}{sep}{quant}.gguf"
	        for quant in LOCAL_QUANT_PREFS
	        for stem in stems
	        for sep in ("-", ".")
	    ]


	def _download_gguf_from_repo(repo_id: str) -> str:
	    """Download a GGUF file from `repo_id` and return the path given by `hf_hub_download`.

	    Selection order:
	      1. the file named by the `LOCAL_GGUF_FILE` env var, if set;
	      2. the first listed `.gguf` whose name contains a preferred quantisation
	         (case-insensitive), in `LOCAL_QUANT_PREFS` order;
	      3. the first listed `.gguf` of any kind;
	      4. each conventional filename from `_common_gguf_names()`, tried in turn.

	    `LOCAL_GGUF_CACHE`, if set, is passed as the download cache directory.

	    Raises:
	        RuntimeError: when no candidate could be downloaded; the last download error is
	            attached as the cause.
	    """
	    from huggingface_hub import HfApi, hf_hub_download

	    explicit = os.environ.get("LOCAL_GGUF_FILE")
	    cache_dir = os.environ.get("LOCAL_GGUF_CACHE")

	    def _fetch(filename):
	        return hf_hub_download(repo_id=repo_id, filename=filename, cache_dir=cache_dir)

	    if explicit:
	        return _fetch(explicit)

	    try:
	        files = HfApi().list_repo_files(repo_id)
	    except Exception:
	        # Best effort: if the listing fails we still try the conventional names below.
	        files = []
	    ggufs = [f for f in files if f.lower().endswith(".gguf")]
	    for quant in LOCAL_QUANT_PREFS:
	        hit = next((f for f in ggufs if quant.lower() in f.lower()), None)
	        if hit:
	            return _fetch(hit)
	    if ggufs:
	        return _fetch(ggufs[0])

	    last = None
	    for filename in _common_gguf_names():
	        try:
	            return _fetch(filename)
	        except Exception as e:
	            last = e
	    # Chain the last download failure so the traceback shows why the guesses failed.
	    raise RuntimeError(f"no GGUF found in {repo_id}: {last}") from last


	def _resolve_local_gguf() -> str:
	    """Return a GGUF path from the fine-tuned repo, or from the base repo if that fails.

	    Raises:
	        RuntimeError: when both repositories fail; the message lists one truncated
	            reason per repository.
	    """
	    failures = []
	    for candidate in (LOCAL_FINE_TUNED_REPO, LOCAL_BASE_REPO):
	        try:
	            return _download_gguf_from_repo(candidate)
	        except Exception as exc:
	            reason = str(exc)[:180]
	            failures.append(f"{candidate}: {type(exc).__name__}: {reason}")
	    raise RuntimeError("could not load a local GGUF. " + " \| ".join(failures))


	_CUDA_PRELOADED = False


	def _preload_cuda_libs():
	    """Preload the CUDA runtime and cuBLAS shared libraries with RTLD_GLOBAL, once per process.

	    Per the original author: the CUDA llama.cpp wheel's libllama.so links libcudart.so.12 /
	    libcublas*.so.12, which on HF Spaces live inside torch's bundled nvidia-* packages and are
	    NOT on the default loader path, so the import dies with "libcudart.so.12: cannot open
	    shared object file". Loading them globally first lets libllama.so resolve their symbols.

	    Best effort: missing directories, missing libraries and load failures are ignored, and
	    the search is not repeated on later calls even if nothing was loaded.
	    """
	    global _CUDA_PRELOADED
	    if _CUDA_PRELOADED:
	        return
	    _CUDA_PRELOADED = True
	    import ctypes
	    import glob

	    roots = []
	    try:
	        import torch
	        roots.append(os.path.join(os.path.dirname(torch.__file__), "lib"))
	    except Exception:
	        pass  # torch is optional here; the other roots are still searched
	    try:
	        import site
	        roots += list(site.getsitepackages() if hasattr(site, "getsitepackages") else [])
	    except Exception:
	        pass
	    roots += ["/usr/local/lib/python3.10/site-packages", "/usr/local/cuda/lib64",
	              "/usr/lib/x86_64-linux-gnu", "/usr/local/lib"]

	    # Every pattern ends in `*` so versioned sonames match too (e.g. libcudart.so.12.4.127
	    # with no libcudart.so.12 symlink); RTLD_GLOBAL makes the symbols visible to libllama.so
	    # regardless of the exact soname.
	    for pat in ("libcudart.so*", "libcublas.so*", "libcublasLt.so*"):
	        loaded = False
	        for root in roots:
	            if loaded:
	                break
	            try:
	                hits = sorted(glob.glob(os.path.join(root, "**", pat), recursive=True))
	            except Exception:
	                hits = []
	            for hit in hits:
	                try:
	                    ctypes.CDLL(hit, mode=ctypes.RTLD_GLOBAL)
	                    loaded = True
	                    break
	                except Exception:
	                    # Unloadable candidate (wrong arch, broken link...): try the next hit.
	                    continue


	# Import-time probe, skipped when LOCAL_LLAMA_GPU_LAYERS is "0": preload the CUDA libraries,
	# import llama_cpp, and print one "[startup]" line saying whether the import worked. Per the
	# original author, loading the .so needs no GPU device (so it spends no ZeroGPU quota) and it
	# warms the import so the in-@spaces.GPU call is instant.
	if os.environ.get("LOCAL_LLAMA_GPU_LAYERS", "-1") != "0":
	try:
	_preload_cuda_libs()
	import llama_cpp as _lc_probe
	print(f"[startup] llama_cpp {_lc_probe.__version__} loaded OK", flush=True)
	except Exception as _e:
	# Never fail app startup because of the probe; report the error and carry on.
	print(f"[startup] llama_cpp load FAILED: {type(_e).__name__}: {str(_e)[:200]}", flush=True)


	def _get_local_llm():
	    """Return a llama.cpp `Llama` for the local GGUF, reusing a cached one where allowed.

	    The model is rebuilt on every call when `SPACES_ZERO_GPU` is set and GPU offload is
	    enabled (`LOCAL_LLAMA_GPU_LAYERS` != 0). The original author's reason: on ZeroGPU the
	    GPU is granted only inside @spaces.GPU and detached between calls, so a GPU-resident
	    model cannot be reused across mints. In every other configuration the first model
	    built is cached in `_LOCAL_LLM` and returned afterwards.

	    Environment knobs read here: LOCAL_LLAMA_GPU_LAYERS (default -1, full offload; 0 forces
	    CPU), LOCAL_LLAMA_THREADS, LOCAL_LLAMA_CTX (default 4096), LOCAL_LLAMA_VERBOSE.
	    """
	    global _LOCAL_LLM, _LOCAL_LLM_PATH
	    on_zero = bool(os.environ.get("SPACES_ZERO_GPU"))
	    gpu_layers = int(os.environ.get("LOCAL_LLAMA_GPU_LAYERS", "-1"))
	    rebuild_each_call = on_zero and gpu_layers != 0

	    if not rebuild_each_call and _LOCAL_LLM is not None:
	        return _LOCAL_LLM

	    if gpu_layers != 0:
	        _preload_cuda_libs()
	    from llama_cpp import Llama

	    _LOCAL_LLM_PATH = _resolve_local_gguf()
	    default_threads = max(1, (os.cpu_count() or 4) - 1)
	    threads = int(os.environ.get("LOCAL_LLAMA_THREADS", str(default_threads)))
	    context_len = int(os.environ.get("LOCAL_LLAMA_CTX", "4096"))
	    verbose = bool(int(os.environ.get("LOCAL_LLAMA_VERBOSE", "0")))
	    llm = Llama(
	        model_path=_LOCAL_LLM_PATH,
	        n_ctx=context_len,
	        n_threads=threads,
	        n_gpu_layers=gpu_layers,
	        verbose=verbose,
	    )
	    if not rebuild_each_call:
	        _LOCAL_LLM = llm
	    return llm


	def _llama_complete(messages: list[dict[str, str]], temp: float, seed: int) -> str:
	    """Run one raw completion for `messages` on the local model and return its text.

	    Uses raw completion with the think-suppressed ChatML render rather than
	    create_chat_completion, which (per the original author) leaves the reasoning trace
	    on and makes a 1B ramble instead of writing the kernel. `LOCAL_MAX_TOKENS`
	    (default 768) bounds the completion length.
	    """
	    llm = _get_local_llm()
	    max_tokens = int(os.environ.get("LOCAL_MAX_TOKENS", "768"))
	    out = llm.create_completion(
	        _render_prompt(messages),
	        max_tokens=max_tokens,
	        temperature=temp,
	        top_p=0.97,
	        seed=seed,
	        # Stop at either ChatML turn marker. Written without backslashes: `\|` is not a
	        # Python escape and would leave a literal backslash that never matches the output.
	        stop=["<|im_end|>", "<|im_start|>"],
	    )
	    return out["choices"][0]["text"]


	def _local_gpu_duration() -> int:
	# This is a ZeroGPU reservation window, not a benchmark setting. Keep the default within the
	# usual free-tier budget, but make the cap explicit so slower/cold hardware can raise it without
	# code changes. The current Space had LOCAL_GPU_DURATION=135, which over-reserved quota; the
	# default cap trims that to 120 while still leaving headroom for cold local mints.
	try:
	requested = int(os.environ.get("LOCAL_GPU_DURATION", "120"))
	except Exception:
	requested = 120
	try:
	cap = int(os.environ.get("LOCAL_GPU_DURATION_CAP", "120"))
	except Exception:
	cap = 120
	return max(45, min(max(45, cap), requested))


	LOCAL_GPU_DURATION = _local_gpu_duration()


	@spaces.GPU(duration=LOCAL_GPU_DURATION)
	def _local_gpu_mint(op: str, k: int, temp: float, spark: str) -> dict:
	    """Draft up to `k` kernels for `op` and verify each one, all inside one GPU window.

	    Generation and verification share this call because, per the original author, on
	    ZeroGPU the GPU exists only inside @spaces.GPU; off ZeroGPU the decorator is a no-op,
	    so the same path runs on a dedicated GPU or locally.

	    Returns a dict with `n_srcs` (number of non-empty extracted kernels), `statuses`
	    (one per evaluated kernel; "runtime_fail" when the referee raised) and `best` (the
	    source and result dict of the lowest-latency kernel with status "ok", or None).
	    """
	    _ensure_referee_path()
	    # The "full" referee entry point; the original author notes it times against eager and
	    # torch.compile (default + max-autotune).
	    from harness import evaluate_inprocess_full

	    messages = _local_prompt_messages(op, spark)
	    base_seed = int(time.time() * 1000) & 0x7FFFFFFF
	    srcs = []
	    for attempt in range(k):
	        # A distinct seed per attempt so the drafts differ.
	        completion = _llama_complete(messages, temp=temp, seed=base_seed + attempt)
	        candidate = extract_kernel(completion)
	        if candidate:
	            srcs.append(candidate)

	    n_shapes = int(os.environ.get("LOCAL_REFEREE_SHAPES", "2"))
	    n_iters = int(os.environ.get("LOCAL_REFEREE_ITERS", "30"))
	    statuses = []
	    best = None
	    for idx, src in enumerate(srcs):
	        try:
	            res = evaluate_inprocess_full(src, op, n_shapes=n_shapes, n_iters=n_iters, seed=idx)
	        except Exception:
	            statuses.append("runtime_fail")
	            continue
	        statuses.append(res.status)
	        if res.status != "ok":
	            continue
	        # Keep the FASTEST verified kernel (lowest latency).
	        if best is None or res.latency_ms < best["result"]["latency_ms"]:
	            best = {"source": src, "result": res.to_dict()}
	    return {"n_srcs": len(srcs), "statuses": statuses, "best": best}


	def _local_explain(label: str, res) -> str:
	if res is None or res.get("status") != "ok":
	status = "no verified attempt" if res is None else res.get("status", "failed")
	return (f"The local referee rejected these kernels ({status}). It still compiled and "
	"checked them inside this Space; try minting again or simplify the operation.")
	se = float(res.get("speedup_eager", 0) or 0)
	honest = res.get("speedup_maxauto") or res.get("speedup_compile")
	bl = "torch.compile max-autotune" if res.get("speedup_maxauto") else "torch.compile"
	if honest:
	return (f"Verified correct. {float(honest):.2f}x faster than {bl} (the honest baseline that "
	f"also fuses and autotunes), and {se:.0f}x vs unfused PyTorch eager. The whole loop "
	"ran in this Space: the 1B wrote it via llama.cpp, the referee compiled, checked it "
	"against PyTorch, and timed it.")
	return (f"Verified correct, {se:.0f}x faster than unfused PyTorch eager. The 1B wrote it via "
	"llama.cpp and the referee checked and timed it, all inside this Space.")


	def local_mint(recipe: dict) -> dict:
	    """Mint a kernel entirely inside this Space and return the result dict for the UI.

	    Recipes that resolve to a `chain...` op (anything other than a named op or a norm
	    with one activation) are declined up front with an explanation instead of looping
	    through rejected attempts. Otherwise up to `LOCAL_MAX_ATTEMPTS` kernels are drafted
	    and verified by `_local_gpu_mint`, and the fastest verified one is reported.

	    Recipe keys read here: `k` (attempts, clamped to 1..LOCAL_MAX_ATTEMPTS), `temp`
	    (clamped to 0.2..1.3) and `spark` (stripped, truncated to 200 characters), plus
	    whatever `_local_op_from_recipe` reads. `gen_seconds` is the wall time of the whole
	    GPU call; `verify_seconds` is always reported as 0.0.
	    """
	    op, label = _local_op_from_recipe(recipe)

	    def _failed(statuses, seconds, explanation, **extra):
	        # Shared shape of every unverified result.
	        return {"op": label, "verified": False, "local": True, "baseline": "eager",
	                "statuses": statuses, "gen_seconds": seconds, "verify_seconds": 0.0,
	                "speedup_compile": None, "explanation": explanation, **extra}

	    if op.startswith("chain\|"):
	        return _failed(
	            [], 0.0,
	            ("This machine stacks two activations, which is outside what the "
	             "kernelsmith models were trained on (the grammar is a norm plus one "
	             "activation), so neither the 1B nor the 27B writes it reliably. Drop "
	             "to a single activation and it mints with a verified kernel."),
	            unsupported=True, k=0)

	    k = max(1, min(LOCAL_MAX_ATTEMPTS, int(recipe.get("k", 1))))
	    temp = max(0.2, min(1.3, float(recipe.get("temp", 0.7))))
	    spark = (recipe.get("spark") or "").strip()[:200]
	    # Pre-fetch the GGUF to disk OUTSIDE the GPU window, so the ZeroGPU budget is spent on
	    # generate + verify rather than on the model download.
	    _resolve_local_gguf()
	    t0 = time.time()
	    work = _local_gpu_mint(op, k, temp, spark)
	    total_s = round(time.time() - t0, 1)
	    statuses = work.get("statuses") or []
	    best = work.get("best")
	    if not work.get("n_srcs"):
	        return _failed(
	            statuses, total_s,
	            "llama.cpp returned no parseable kernel this time. Try minting again.", k=k)
	    if not best:
	        return _failed(statuses, total_s, _local_explain(label, None), k=k)
	    res = best["result"]

	    def _su(key):
	        v = res.get(key)
	        return round(float(v), 2) if v else None

	    return {"op": label, "verified": True, "local": True, "baseline": "compile",
	            # `or 0` mirrors _local_explain: a present-but-None speedup must not crash a
	            # mint that already verified.
	            "speedup_eager": round(float(res.get("speedup_eager", 0) or 0), 2),
	            "speedup_compile": _su("speedup_compile"), "speedup_maxauto": _su("speedup_maxauto"),
	            "latency_ms": res.get("latency_ms"), "eager_ms": res.get("eager_ms"),
	            "compile_ms": res.get("compile_ms"), "maxauto_ms": res.get("maxauto_ms"),
	            "source": best["source"], "statuses": statuses, "gen_seconds": total_s,
	            "verify_seconds": 0.0, "explanation": _local_explain(label, res),
	            "n_verified": sum(1 for s in statuses if s == "ok"), "k": k}


	def lb_url(u):
	    """Map a mint endpoint URL to its leaderboard URL by swapping the host suffix.

	    URLs that do not contain the mint suffix are returned unchanged.
	    """
	    mint_suffix = "-mint-mint.modal.run"
	    board_suffix = "-leaderboard.modal.run"
	    return board_suffix.join(u.split(mint_suffix))


	def _blob(d):
	return f"<span data-blob='{base64.b64encode(json.dumps(d).encode()).decode()}'></span>"


	def _with_nonce(d: dict, r: dict) -> dict:
	out = dict(d or {})
	if r.get("_n") is not None:
	out["_n"] = r.get("_n")
	return out


	def do_mint(recipe_json: str) -> str:
	    """Run one mint for the JSON recipe sent by the UI and return the result as an HTML blob.

	    A recipe with `local` set is minted in-process; any exception there is turned into an
	    error blob. Otherwise the recipe is posted to the Pro backend (when `pro` is set) or
	    the 1B backend. Errors are always reported inside the returned blob, and the
	    recipe's `_n` value is echoed back whenever the recipe parsed as a JSON object.
	    """
	    try:
	        r = json.loads(recipe_json or "{}")
	    except Exception:
	        return _blob({"error": "bad recipe"})
	    if not isinstance(r, dict):
	        # Valid JSON that is not an object (a list, number, ...) has no `.get`.
	        return _blob({"error": "bad recipe"})
	    if r.get("local"):
	        try:
	            return _blob(_with_nonce(local_mint(r), r))
	        except Exception as e:
	            return _blob(_with_nonce(
	                {"error": f"Local offline mint failed: {type(e).__name__}: {str(e)[:240]}",
	                 "local": True, "baseline": "eager"}, r))
	    base = PRO if r.get("pro") else ONE_B
	    if r.get("mode") == "classic":
	        payload = {"op": r.get("classic", "softmax")}
	    else:
	        payload = {"blocks": {"norm": r.get("norm", "rmsnorm"), "residual": bool(r.get("residual")),
	                              "acts": r.get("acts") or ["gelu"]}}
	    try:
	        k = int(r.get("k", 4))
	        temp = float(r.get("temp", 0.7))
	    except (TypeError, ValueError):
	        # Malformed numbers come back as an error blob rather than an uncaught exception.
	        return _blob(_with_nonce({"error": "bad recipe"}, r))
	    payload.update({"name": (r.get("name") or "anonymous")[:40], "k": k,
	                    "temp": temp, "spark": (r.get("spark") or "")[:200]})
	    # During a cold start Modal can return an empty body before the model finishes loading,
	    # which makes .json() blow up. Retry a few times, then explain it plainly.
	    attempts = 4
	    last = ""
	    for attempt in range(attempts):
	        try:
	            resp = requests.post(base, json=payload, timeout=600)
	            if resp.status_code == 200 and resp.text.strip():
	                try:
	                    return _blob(_with_nonce(resp.json(), r))
	                except ValueError:
	                    last = "the model returned an unexpected response"
	            else:
	                last = f"the model is still waking up (HTTP {resp.status_code})"
	        except Exception as e:
	            last = str(e)
	        if attempt < attempts - 1:
	            # Pause between attempts only; sleeping after the last one just delays the error.
	            time.sleep(4)
	    pro = " The 27B in Pro mode takes longer to wake. Try again, or turn Pro off." if r.get("pro") else ""
	    # `detail` is an extra field carrying the last failure reason; the message is unchanged.
	    return _blob(_with_nonce(
	        {"error": f"The model is still waking up. Give it about 30 seconds and press Mint again.{pro}",
	         "detail": last[:240]},
	        r))


	def fetch_lb(_=None):
	    """Fetch leaderboard rows from both backends and return them merged as an HTML blob.

	    Rows are de-duplicated on (op, name, ts, speedup_compile, speedup_eager). A backend
	    that fails is recorded by exception type in `errors` (at most two entries are
	    returned) and does not stop the other one. The ignored argument lets the function
	    be called with one positional value.
	    """
	    rows, sources, errors = [], [], []
	    seen = set()
	    for backend in (ONE_B, PRO):
	        try:
	            url = lb_url(backend)
	            got = requests.get(url, timeout=20).json().get("rows", [])
	            if got:
	                sources.append(url)
	            for row in got:
	                key = (row.get("op"), row.get("name"), row.get("ts"),
	                       row.get("speedup_compile"), row.get("speedup_eager"))
	                if key not in seen:
	                    seen.add(key)
	                    rows.append(row)
	        except Exception as e:
	            errors.append(f"{lb_url(backend)}: {type(e).__name__}")
	    # Always include fetched_at so the hidden Gradio HTML changes even when the rows are
	    # identical. That makes the visible Refresh button observably work.
	    return _blob({"rows": rows, "fetched_at": time.time(), "sources": sources, "errors": errors[:2]})


	# Optional stylesheet bundled next to this file; an absent or unreadable file yields no CSS.
	# Read through a context manager (the handle is closed) and as UTF-8, like the replays file.
	try:
	    with open(os.path.join(os.path.dirname(__file__), "style.css"), encoding="utf-8") as f:
	        CSS = f.read()
	except OSError:
	    CSS = ""

	# Recorded mints, re-serialised compactly. "</" is rewritten to "<\/" so the JSON can sit
	# inside a <script> element without a "</script>" in the data closing it early.
	REPLAYS_PATH = os.path.join(os.path.dirname(__file__), "replays", "index.json")
	try:
	    with open(REPLAYS_PATH, encoding="utf-8") as f:
	        REPLAYS_JSON = json.dumps(json.load(f), separators=(",", ":")).replace("</", "<\\/")
	except Exception:
	    # Missing or malformed replay index: fall back to an empty object.
	    REPLAYS_JSON = "{}"

	APP_HTML = r"""
	<div id="km" data-build="2026-06-15-leaderboard-refresh-fallback"
	data-lb-one="__KM_LB_ONE__"
	data-lb-pro="__KM_LB_PRO__">
	<div id="km-top">
	<div id="km-brand">
	<span class="mark"><span class="dot"></span>Kernel Mint</span>
	<span class="sub">an OUROBOROS demo</span>
	</div>
	<div id="km-tabs">
	<span id="km-tab-pill"></span>
	<button class="km-tab on" data-tab="build">Build</button>
	<button class="km-tab" data-tab="expert">Expert</button>
	<button class="km-tab" data-tab="lb">Leaderboard</button>
	<button class="km-tab" data-tab="docs">Run & verify</button>
	</div>
	</div>
	<script id="km-replays-data" type="application/json">__KM_REPLAYS__</script>

	<!-- ============ BUILD ============ -->
	<div class="km-page" data-page="build">
	<header id="km-hero">
	<h1>A 1B model writes a real GPU kernel. <span class="g">A referee that can't be fooled checks it.</span></h1>
	<p>Compose an operation, and a 1-billion-parameter model writes a fused Triton kernel for it. The kernel is compiled, checked against PyTorch on adversarial inputs, and timed against PyTorch's own compiler before anything counts as a win.</p>
	</header>
	<div id="km-grid">
	<section class="km-card">
	<div class="km-seg" role="tablist">
	<button class="km-segb on" data-mode="build">Compose</button>
	<button class="km-segb" data-mode="classic">Classic ops</button>
	</div>

	<div id="km-compose">
	<div class="km-section-label">Pipeline</div>
	<div id="km-pipe">
	<div class="km-stage fixed"><span class="km-stage-k">input</span><span class="km-dim">a row of activations</span></div>
	<div class="km-flow">↓</div>
	<div class="km-stage"><span class="km-stage-k">residual</span><span class="km-pick" data-slot="mem"></span></div>
	<div class="km-flow">↓</div>
	<div class="km-stage"><span class="km-stage-k">normalize</span><span class="km-pick" data-slot="norm"></span></div>
	<div class="km-flow">↓</div>
	<div class="km-stage"><span class="km-stage-k">activate</span><span class="km-pick" data-slot="act"></span></div>
	</div>
	</div>

	<div id="km-classic" hidden>
	<div class="km-section-label">A named operation from a real transformer</div>
	<div class="km-pick" id="km-classics"></div>
	</div>

	<div id="km-info"></div>

	<div class="km-params">
	<div class="km-field">
	<label>Attempts the model drafts <b id="km-tries-v">4</b></label>
	<div class="km-stepper" id="km-tries"></div>
	</div>
	<div class="km-field">
	<label>Sampling temperature <b id="km-temp-v">0.70</b></label>
	<input type="range" class="km-range" id="km-temp" min="20" max="130" value="70">
	</div>
	<div class="km-field">
	<label>Name for the leaderboard</label>
	<input class="km-input" id="km-name" placeholder="optional" autocomplete="off">
	</div>
	<label class="km-toggle"><input type="checkbox" id="km-pro"><span class="km-switch"></span>Use the 27B model (slower, knows more)</label>
	<label class="km-toggle km-local-toggle"><input type="checkbox" id="km-local"><span class="km-switch"></span>Local (offline)</label>
	<p class="km-mode-help">Runs entirely in this Space: a 1B GGUF model (llama.cpp) writes the kernel on this Space's GPU, then the in-process referee compiles it, checks correctness against PyTorch, and times it against PyTorch eager, torch.compile, and max-autotune. No Modal or cloud model calls. Pro uses the 27B.</p>
	</div>

	<div class="km-actions">
	<button class="km-btn primary" id="km-build">Mint kernel</button>
	<button class="km-btn ghost" id="km-replay" data-replay-key="rmsnorm_gelu">Watch a recorded mint</button>
	</div>
	<div id="km-replay-picks" class="km-replay-picks" aria-label="recorded mints"></div>
	</section>

	<section class="km-card">
	<div class="km-section-label">The operation, stage by stage</div>
	<div id="km-viz"><svg id="km-svg" viewBox="0 0 320 150" preserveAspectRatio="none"></svg>
	<div id="km-stage-label">input</div></div>
	<div id="km-verdict"><div class="km-card"><div class="km-result-top"><div class="km-verdict-icon">→</div>
	<div><span class="km-run-label">ready</span><b>Press Mint to begin</b></div></div>
	<p class="km-dim" style="margin-top:10px">Compose a pipeline on the left and mint it, or watch a recorded verified mint to see the whole referee loop instantly.</p></div></div>
	<details id="km-source" class="km-source" hidden>
	<summary>Triton kernel source</summary>
	<pre id="km-code"></pre>
	</details>
	<div class="km-section-label">This session</div>
	<div id="km-hist" class="km-dim">No mints yet.</div>
	</section>
	</div>
	</div>

	<!-- ============ EXPERT ============ -->
	<div class="km-page" data-page="expert" hidden>
	<header id="km-hero">
	<h1>Free-test the verifier.</h1>
	<p>Describe the fused operation you want in plain language. The model writes a Triton kernel and the referee certifies it against PyTorch, torch.compile, and max-autotune. The referee can only certify operations it has a reference for, so your request is resolved to the nearest one it can check.</p>
	</header>
	<div class="km-expert-grid">
	<section class="km-card">
	<div class="km-section-label">Your brief</div>
	<textarea class="km-textarea" id="km-xbrief" placeholder="e.g. fused RMSNorm with a residual add and a SiLU activation, contiguous rows"></textarea>
	<div id="km-xresolved" class="km-resolved"><span class="k">resolves to</span><div class="v">rmsnorm with gelu</div></div>
	<div class="km-params">
	<div class="km-field">
	<label>Attempts the model drafts <b id="km-xtries-v">5</b></label>
	<div class="km-stepper" id="km-xtries"></div>
	</div>
	<div class="km-field">
	<label>Sampling temperature <b id="km-xtemp-v">0.70</b></label>
	<input type="range" class="km-range" id="km-xtemp" min="20" max="130" value="70">
	</div>
	<label class="km-toggle"><input type="checkbox" id="km-xpro"><span class="km-switch"></span>Use the 27B model</label>
	<label class="km-toggle km-local-toggle"><input type="checkbox" id="km-xlocal"><span class="km-switch"></span>Local (offline)</label>
	<p class="km-mode-help">Uses llama.cpp inside the Space and verifies on this Space's GPU, timed against torch.compile and max-autotune (the honest baselines), not just eager. Pro mode keeps the 27B on Modal.</p>
	</div>
	<div class="km-actions">
	<button class="km-btn primary" id="km-xbuild">Mint from brief</button>
	</div>
	</section>
	<section class="km-card">
	<div class="km-section-label">What the referee can verify</div>
	<p class="km-limits">It certifies fusions of a <b>normalization</b> (RMSNorm or LayerNorm), an optional <b>residual add</b>, and one <b>activation</b>, plus a set of named operators from real transformers. Click one to load it into your brief.</p>
	<div class="km-oplist" id="km-oplist"></div>
	<div id="km-xverdict" style="margin-top:16px"></div>
	<details id="km-xsource" class="km-source" hidden>
	<summary>Triton kernel source</summary>
	<pre class="km-code"></pre>
	</details>
	</section>
	</div>
	</div>

	<!-- ============ LEADERBOARD ============ -->
	<div class="km-page" data-page="lb" hidden>
	<section class="km-card">
	<div class="km-h">Leaderboard</div>
	<p class="km-lede">The best verified kernel per operation. Crowned rows were minted by the 27B model. Every speedup here was measured by the referee, not reported by the model.</p>
	<div id="km-lb-view" class="km-dim" style="margin-top:16px">Loading.</div>
	<div class="km-lb-tools">
	<button id="km-lb-refresh" class="km-mini">Refresh</button>
	<span id="km-lb-status" class="km-lb-status">Auto-refreshes while this tab is open.</span>
	</div>
	</section>
	</div>

	<!-- ============ RUN + VERIFY / DOCS ============ -->
	<div class="km-page" data-page="docs" hidden>
	<section class="km-docs">
	<div class="km-doc-hero">
	<span class="km-doc-kicker">reproduce the claim</span>
	<h2>Run the smith, then make the referee decide.</h2>
	<p>The Space has two execution paths. The <b>Local (offline)</b> switch runs the 1B GGUF model with llama.cpp inside this Space, then verifies the candidate kernel in-process on the same GPU. <b>Pro</b> uses the 27B adapter through the Modal backend. Both paths compare against PyTorch eager, torch.compile, and torch.compile max-autotune before a result counts.</p>
	</div>

	<div class="km-doc-metrics" aria-label="execution modes">
	<div><b>1B local</b><span>MiniCPM5-1B GGUF, llama.cpp, in-Space referee</span></div>
	<div><b>27B Pro</b><span>Qwen3.6-27B adapter, Modal H200 backend</span></div>
	<div><b>referee</b><span>compile, allclose, eager, compile, max-autotune</span></div>
	</div>

	<div class="km-doc-grid">
	<article class="km-doc-panel wide">
	<div class="km-doc-label">1. Use this Space properly</div>
	<h3>The easiest local run is already on this page.</h3>
	<ol class="km-doc-steps">
	<li>Open <b>Build</b> or <b>Expert</b>.</li>
	<li>Turn on <b>Local (offline)</b>. It disables Pro because the two paths are intentionally separate.</li>
	<li>Pick a named op such as <code>rmsnorm_gelu</code>, <code>softmax</code>, or <code>swiglu</code>, or compose one norm plus one activation.</li>
	<li>Press <b>Mint kernel</b>. The Space downloads or reuses the GGUF, drafts up to two kernels, compiles Triton, checks PyTorch correctness, and times eager, torch.compile, and max-autotune.</li>
	</ol>
	<div class="km-doc-callout">A certified Local result should show compiler baselines. If a cold ZeroGPU session times out during max-autotune, mint again. A result that only beats eager is not treated as the headline claim.</div>
	</article>

	<article class="km-doc-panel">
	<div class="km-doc-label">2. Run the Space clone</div>
	<h3>Same UI, your GPU.</h3>
	<p>Use this when you want the full app locally. With no `SPACES_ZERO_GPU` env var, the GPU decorator becomes a no-op and the local verifier runs directly on your attached CUDA device.</p>
	<div class="km-codeblock"><button class="km-copy">Copy</button>git clone https://huggingface.co/spaces/build-small-hackathon/ouroboros-kernel-mint
	cd ouroboros-kernel-mint
	python -m venv .venv
	. .venv/bin/activate
	python -m pip install -r requirements.txt
	python app.py</div>
	<p>Useful knobs:</p>
	<div class="km-codeblock"><button class="km-copy">Copy</button>LOCAL_GGUF_REPO=YMRohit/ouroboros-kernelsmith-minicpm5-1b-GGUF
	LOCAL_GGUF_QUANTS=Q5_K_M,Q4_K_M
	LOCAL_LLAMA_GPU_LAYERS=-1
	LOCAL_GPU_DURATION=120
	python app.py</div>
	</article>

	<article class="km-doc-panel">
	<div class="km-doc-label">3. Run the 1B smith directly</div>
	<h3>GGUF path, no Modal.</h3>
	<p>This mirrors the Space-local path. It uses the fine-tuned GGUF repo first and falls back to OpenBMB's base GGUF only if needed.</p>
	<div class="km-codeblock"><button class="km-copy">Copy</button>python -m pip install torch triton huggingface_hub \
	--extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu130 \
	llama-cpp-python==0.3.28</div>
	<div class="km-codeblock"><button class="km-copy">Copy</button>from huggingface_hub import HfApi, hf_hub_download
	from llama_cpp import Llama

	repo = "YMRohit/ouroboros-kernelsmith-minicpm5-1b-GGUF"
	files = [f for f in HfApi().list_repo_files(repo) if f.lower().endswith(".gguf")]
	filename = next((f for f in files if "Q5_K_M" in f), files[0])
	gguf = hf_hub_download(repo, filename=filename)

	llm = Llama(model_path=gguf, n_ctx=4096, n_gpu_layers=-1)

	system = "You are an expert GPU kernel engineer. Output only one fenced python code block."
	user = "Write a fused Triton kernel for row-wise softmax. Use stable max-subtraction. Return run(x)."
	prompt = (
	f"&lt;|im_start|&gt;system\n{system}&lt;|im_end|&gt;\n"
	f"&lt;|im_start|&gt;user\n{user}&lt;|im_end|&gt;\n"
	"&lt;|im_start|&gt;assistant\n```python\n"
	)
	out = llm.create_completion(prompt, max_tokens=768, temperature=0.7, top_p=0.97)
	print(out["choices"][0]["text"])</div>
	</article>

	<article class="km-doc-panel">
	<div class="km-doc-label">4. Run the 1B adapter directly</div>
	<h3>Transformers plus PEFT.</h3>
	<p>Use this when you want the published LoRA adapter instead of GGUF. It is slower to start than llama.cpp, but useful for inspection or further fine-tuning.</p>
	<div class="km-codeblock"><button class="km-copy">Copy</button>python -m pip install torch transformers peft accelerate triton</div>
	<div class="km-codeblock"><button class="km-copy">Copy</button>from transformers import AutoModelForCausalLM, AutoTokenizer
	from peft import PeftModel

	tok = AutoTokenizer.from_pretrained("openbmb/MiniCPM5-1B", trust_remote_code=True)
	base = AutoModelForCausalLM.from_pretrained(
	"openbmb/MiniCPM5-1B",
	trust_remote_code=True,
	torch_dtype="auto",
	device_map="auto",
	)
	model = PeftModel.from_pretrained(base, "YMRohit/ouroboros-kernelsmith-minicpm5-1b")
	model.eval()</div>
	</article>

	<article class="km-doc-panel">
	<div class="km-doc-label">5. Run the 27B smith locally</div>
	<h3>Same prompt contract, much bigger hardware.</h3>
	<p>The 27B artifact is the stronger Qwen3.6-27B smith used for the 76 verified compiler-beating kernels. Run it locally if you have a large GPU or multi-GPU box. The training run used Modal H200s and peaked around 110 GB VRAM; inference is still a heavyweight path compared with the 1B GGUF.</p>
	<div class="km-codeblock"><button class="km-copy">Copy</button>python -m pip install torch transformers peft accelerate triton</div>
	<div class="km-codeblock"><button class="km-copy">Copy</button>from transformers import AutoModelForCausalLM, AutoTokenizer
	from peft import PeftModel

	tok = AutoTokenizer.from_pretrained("Qwen/Qwen3.6-27B", trust_remote_code=True)
	base = AutoModelForCausalLM.from_pretrained(
	"Qwen/Qwen3.6-27B",
	trust_remote_code=True,
	torch_dtype="auto",
	device_map="auto",
	)
	model = PeftModel.from_pretrained(base, "YMRohit/ouroboros-kernelsmith-qwen3.6-27b")
	model.eval()</div>
	<p>If that does not fit, use <b>Pro</b> in this Space. It calls the hosted 27B backend and still sends the output through the same three-baseline referee before returning a result.</p>
	</article>

	<article class="km-doc-panel wide">
	<div class="km-doc-label">6. Prompt contract</div>
	<h3>Ask for one operation, one code block, one <code>run(...)</code> entry point.</h3>
	<p>The model is not a general Python assistant here. It was trained to emit a single Triton kernel for a verifier-known operation. Keep the prompt narrow and include the exact operation, input tensors, output contract, and baseline target.</p>
	<div class="km-prompt-grid">
	<div>
	<b>System</b>
	<div class="km-codeblock"><button class="km-copy">Copy</button>You are an expert GPU kernel engineer. Write a single correct, fast Triton kernel. Output ONLY one fenced python code block defining run(*inputs) and its @triton.jit kernel. Accumulate reductions in float32. No prose.</div>
	</div>
	<div>
	<b>User template</b>
	<div class="km-codeblock"><button class="km-copy">Copy</button>Operation: add_rmsnorm_gelu
	Inputs: x, residual, weight. Each row is one transformer hidden state.
	Reference: y = gelu(rmsnorm(x + residual, weight)).
	Return: one fenced python block with imports, one @triton.jit kernel, and run(x, residual, weight).
	Target: correct vs PyTorch first, then faster than torch.compile max-autotune.</div>
	</div>
	</div>
	<div class="km-doc-chiprow">
	<span class="km-doc-chip">name the op</span>
	<span class="km-doc-chip">state tensor order</span>
	<span class="km-doc-chip">require fp32 reductions</span>
	<span class="km-doc-chip">forbid prose</span>
	<span class="km-doc-chip">verify before trusting</span>
	</div>
	</article>

	<article class="km-doc-panel">
	<div class="km-doc-label">Prompt example A</div>
	<h3>Residual RMSNorm plus GELU.</h3>
	<div class="km-codeblock"><button class="km-copy">Copy</button>Write a fused Triton kernel for add_rmsnorm_gelu.
	Inputs are x, residual, and weight, all CUDA tensors.
	Compute RMSNorm over each row after x + residual, multiply by weight, then apply GELU.
	Use fp32 accumulation for the row reduction.
	Return exactly one fenced python code block with run(x, residual, weight).</div>
	</article>

	<article class="km-doc-panel">
	<div class="km-doc-label">Prompt example B</div>
	<h3>Stable softmax.</h3>
	<div class="km-codeblock"><button class="km-copy">Copy</button>Write a fused Triton kernel for row-wise softmax.
	Input x is a CUDA tensor shaped [M, N].
	Use the stable max-subtraction form.
	Return exactly one fenced python code block with run(x).
	Do not include explanation text outside the code block.</div>
	</article>

	<article class="km-doc-panel">
	<div class="km-doc-label">Prompt example C</div>
	<h3>SwiGLU gate.</h3>
	<div class="km-codeblock"><button class="km-copy">Copy</button>Write a fused Triton kernel for swiglu.
	Inputs are gate and up tensors with the same shape.
	Compute silu(gate) * up elementwise.
	Return exactly one fenced python code block with run(gate, up).
	Keep the launch grid simple and contiguous-row friendly.</div>
	</article>

	<article class="km-doc-panel wide">
	<div class="km-doc-label">7. Verify a generated kernel</div>
	<h3>Never score the model output by eye.</h3>
	<p>Save the candidate as <code>candidate.py</code>, then send it to the referee. The result object reports correctness, latency, and speedups vs eager, torch.compile, and max-autotune.</p>
	<div class="km-codeblock"><button class="km-copy">Copy</button>git clone https://github.com/ymrohit/ouroboros-kernelsmith.git
	cd ouroboros-kernelsmith
	python -m pip install torch triton numpy</div>
	<div class="km-codeblock"><button class="km-copy">Copy</button>import pathlib
	import sys

	sys.path.insert(0, "referee")
	from harness import evaluate_inprocess_full

	kernel_src = pathlib.Path("candidate.py").read_text()
	result = evaluate_inprocess_full(kernel_src, "add_rmsnorm_gelu", n_shapes=2, n_iters=30)
	print(result.to_dict())</div>
	<div class="km-doc-callout">A kernel is submission-grade only if <code>status</code> is <code>ok</code>, correctness is true, and the compiler baselines are present. The strongest number is <code>speedup_maxauto</code> because that compares against torch.compile max-autotune.</div>
	</article>

	<article class="km-doc-panel wide">
	<div class="km-doc-label">8. What the referee can certify</div>
	<h3>Known operations only, by design.</h3>
	<p>The referee needs a PyTorch reference. It certifies norm and activation fusions plus named transformer operations. Unknown math can still be interesting, but this app will not label it a verified win until there is a reference spec.</p>
	<div id="km-docs-ops" class="km-oplist"></div>
	</article>

	<article class="km-doc-panel wide">
	<div class="km-doc-label">9. Glossary</div>
	<div id="km-learn-blocks"></div>
	</article>

	<article class="km-doc-panel wide">
	<div class="km-doc-label">Source and artifacts</div>
	<p>MIT licensed code: <a href="https://github.com/ymrohit/ouroboros-kernelsmith">ymrohit/ouroboros-kernelsmith</a>. Models: <a href="https://huggingface.co/YMRohit/ouroboros-kernelsmith-minicpm5-1b">MiniCPM5-1B smith</a>, <a href="https://huggingface.co/YMRohit/ouroboros-kernelsmith-minicpm5-1b-GGUF">MiniCPM5-1B GGUF</a>, and <a href="https://huggingface.co/YMRohit/ouroboros-kernelsmith-qwen3.6-27b">Qwen3.6-27B smith</a>. Evidence: <a href="https://huggingface.co/datasets/YMRohit/ouroboros-kernel-corpus">verified kernel corpus</a>.</p>
	</article>
	</div>
	</section>
	</div>
	</div>
	"""

	# Fill in the placeholders left in the widget markup: the replay JSON and the leaderboard
	# URLs derived from the 1B backend (ONE_B) and the Pro backend (PRO).
	APP_HTML = APP_HTML.replace("__KM_REPLAYS__", REPLAYS_JSON)
	APP_HTML = APP_HTML.replace("__KM_LB_ONE__", lb_url(ONE_B))
	APP_HTML = APP_HTML.replace("__KM_LB_PRO__", lb_url(PRO))

	APP_JS = r"""
	() => {
	// Activation catalogue for the composer. Each entry carries a button label (lbl), a verb for
	// the stage caption, a plain-language description (info) and a scalar function (f) used only
	// to draw the preview curve. The multiplication operators in the GELU formulas are restored;
	// without them the expression is not valid JavaScript.
	const ACTS = {
	  gelu:{lbl:'GELU',verb:'smoothly bends',info:'GELU is the gentle bend used inside many transformers. It lets a little of the negative side through.',f:x=>0.5*x*(1+Math.tanh(0.7978845608*(x+0.044715*x*x*x)))},
	  silu:{lbl:'SiLU',verb:'swishes',info:'SiLU, also called Swish, is x times sigmoid(x). It is common in LLaMA style feed-forward blocks.',f:x=>x/(1+Math.exp(-x))},
	  relu:{lbl:'ReLU',verb:'clips negatives',info:'ReLU keeps positive values and clips negative values to zero. It is fast, classic, and direct.',f:x=>Math.max(x,0)},
	  tanh:{lbl:'Tanh',verb:'squashes to -1..1',info:'Tanh squashes each value into the range from -1 to 1. Large values flatten near the top and bottom.',f:x=>Math.tanh(x)},
	  sigmoid:{lbl:'Sigmoid',verb:'squashes to 0..1',info:'Sigmoid squashes each value into the range from 0 to 1. It is often used for gates and probabilities.',f:x=>1/(1+Math.exp(-x))},
	  relu2:{lbl:'ReLU2',verb:'clips then squares',info:'Squared ReLU clips negative values to zero, then squares the remaining positive values.',f:x=>{const r=Math.max(x,0);return r*r;}},
	  // The preview curve for exact GELU reuses the tanh approximation used for 'gelu' above.
	  gelu_erf:{lbl:'GELU exact',verb:'bends exactly',info:'Exact GELU uses the erf form of GELU. It behaves almost like the tanh approximation, but follows the exact formula.',f:x=>0.5*x*(1+Math.tanh(0.7978845608*(x+0.044715*x*x*x)))},
	  leaky_relu:{lbl:'Leaky ReLU',verb:'leaks negatives',info:'Leaky ReLU is like ReLU, but negative values leak through at a small scale instead of becoming zero.',f:x=>x>0?x:0.01*x},
	  elu:{lbl:'ELU',verb:'curves negatives',info:'ELU passes positive values through and curves negative values smoothly down toward -1.',f:x=>x>0?x:Math.exp(x)-1},
	  mish:{lbl:'Mish',verb:'self gates',info:'Mish is a smooth self-gating activation, x times tanh of softplus(x).',f:x=>x*Math.tanh(Math.log(1+Math.exp(x)))},
	};
	// Normalisation choices offered in the composer: button label (lbl) and description (info).
	const NORMS = {
	rmsnorm:{lbl:'RMSNorm',info:'RMSNorm makes a row of values a consistent size by dividing by root mean square, then scaling.'},
	layernorm:{lbl:'LayerNorm',info:'LayerNorm centers a row of values by subtracting the average, then rescales by the spread.'},
	};
	// Description shown for the residual-add ("mem") slot.
	const MEM = {info:'Residual add means adding the input back before the next operation. This is the skip path used in deep models.'};
	// Named standalone operations selectable in classic mode, keyed by op name.
	const CLASSICS = {
	softmax:{lbl:'Softmax',info:'Softmax turns a row of scores into probabilities that add to 1.'},
	softmax_scale:{lbl:'Scaled softmax',info:'Scaled softmax applies a scale before softmax. It is the attention score step used before attention weights.'},
	swiglu:{lbl:'SwiGLU',info:'SwiGLU is a gated feed-forward operation: SiLU on the gate multiplied by the up projection.'},
	geglu:{lbl:'GeGLU',info:'GeGLU is a gated feed-forward operation like SwiGLU, but with a GELU gate.'},
	rmsnorm:{lbl:'RMSNorm',info:'RMSNorm on its own, without an activation after it.'},
	layernorm:{lbl:'LayerNorm',info:'LayerNorm on its own, without an activation after it.'},
	};
	// Fixed sample row that the preview chart pushes through the selected operation.
	const INPUT = [0.6,-0.9,1.4,-0.3,0.2,-1.6,0.8,1.1,-0.5,0.0,1.7,-1.1,0.4,-0.2];

	const root=document.getElementById('km');
	if(!root\|\|root.dataset.init)return;
	root.dataset.init='1';
	const $=s=>root.querySelector(s), $$=s=>Array.from(root.querySelectorAll(s));
	const LB_URLS=[root.dataset.lbOne,root.dataset.lbPro].filter(Boolean);
	const cleanText=s=>String(s??'').replace(/\u2013\|\u2014/g,' - ').replace(/\u2192/g,'->').replace(/\u00d7/g,'x').replace(/[\u2600-\u27BF]/g,'').replace(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g,'').replace(/\s+/g,' ').trim();
	const esc=s=>cleanText(s).replace(/[&<>"']/g,c=>({'&':'&','<':'<','>':'>','"':'"',"'":'''}[c]));
	let REPLAYS={};
	try{REPLAYS=JSON.parse(($('#km-replays-data')\|\|{}).textContent\|\|'{}');}catch(e){REPLAYS={};}
	// Display names for the recorded mints: a long label and a short label per replay key.
	const REPLAY_META={
	rmsnorm_gelu:{label:'RMSNorm to GELU',short:'RMSNorm GELU'},
	softmax:{label:'Softmax',short:'Softmax'},
	add_layernorm_silu:{label:'Residual plus LayerNorm to SiLU',short:'Residual LayerNorm SiLU'}
	};
	// Fixed presentation order, limited to replays that are actually present in REPLAYS.
	const REPLAY_ORDER=['rmsnorm_gelu','softmax','add_layernorm_silu'].filter(k=>REPLAYS[k]);

	// UI state: active tab, composer mode ('build' or 'classic'), the composed slots and the
	// selected classic op.
	const state={tab:'build',mode:'build',slots:{mem:false,norm:'rmsnorm',act:'gelu',act2:null},classic:'softmax'};
	// Attempt counts per tab, the last resolved expert brief, and the mode of the last mint.
	let buildTries=4, expertTries=5, resolvedExpert=null, lastMintedMode='build';
	// Mint/leaderboard bookkeeping: nonce of the in-flight mint, whether a result is awaited,
	// whether a leaderboard refresh is deferred, and the handles of the leaderboard timers.
	let activeMintNonce=null, awaitingMint=false, pendingLBRefresh=false, lbRefreshTimer=null, lbPollTimer=null, lbUiTimer=null;
	const HIST=[];

	// Run a Web Animations API animation when available; otherwise do nothing.
	function animate(el,frames,opts){
	  if(!el||!el.animate)return;
	  el.animate(frames,opts);
	}
	// Build a human-readable name for a recipe object.
	// Classic recipes use the CLASSICS label (falling back to the raw key, then 'classic op');
	// composed recipes join the optional residual add, the norm and every activation with ' + '.
	// Unknown keys fall back to the raw key. Returns 'unknown' for a missing recipe.
	function labelRecipe(r){
	  if(!r)return 'unknown';
	  if(r.mode==='classic')return CLASSICS[r.classic]?.lbl||r.classic||'classic op';
	  const parts=[];
	  if(r.residual)parts.push('residual add');
	  parts.push(NORMS[r.norm]?.lbl||r.norm);
	  (r.acts||[]).forEach(a=>parts.push(ACTS[a]?.lbl||a));
	  return parts.join(' + ');
	}
	// Describe the currently selected operation for the info line.
	// Classic mode returns the classic op's description; build mode lists the composed slots.
	function machineInfo(){
	  if(state.mode==='classic')return CLASSICS[state.classic].info;
	  const {mem,norm,act,act2}=state.slots;
	  const parts=[mem?'add input':'no residual add',NORMS[norm].lbl,ACTS[act].lbl];
	  if(act2)parts.push(ACTS[act2].lbl);
	  return 'Current operation: '+parts.join(' + ')+'. The model must fuse it into one verified kernel.';
	}
	// Show text in the #km-info line with a short fade-in. An empty or missing argument falls
	// back to the description of the current operation.
	function setInfo(t){
	  const info=$('#km-info');
	  if(!info)return;
	  info.textContent=cleanText(t||machineInfo());
	  animate(info,[{opacity:.45},{opacity:1}],{duration:180,easing:'cubic-bezier(0.16,1,0.3,1)'});
	}

	// Size and move the sliding highlight (#km-tab-pill) so it sits under the active tab button.
	// Does nothing until the tab bar, the pill and an active tab all exist.
	function positionTabPill(){
	  const tabs=$('#km-tabs'), pill=$('#km-tab-pill'), btn=$('.km-tab.on');
	  if(!tabs||!pill||!btn)return;
	  const tr=tabs.getBoundingClientRect(), br=btn.getBoundingClientRect();
	  pill.style.width=br.width+'px';
	  // Offset is relative to the tab bar's left edge, minus a fixed 4px adjustment.
	  pill.style.transform='translateX('+(br.left-tr.left-4)+'px)';
	}
	// Switch the visible page: mark the matching tab button, hide every other page, move the
	// pill on the next frame, resolve the expert brief when entering Expert, and refresh plus
	// poll the leaderboard only while its tab is open.
	function showTab(tab){
	  state.tab=tab;
	  for(const btn of $$('.km-tab'))btn.classList.toggle('on',btn.dataset.tab===tab);
	  for(const page of $$('.km-page'))page.hidden=page.dataset.page!==tab;
	  requestAnimationFrame(positionTabPill);
	  if(tab==='expert')resolveExpert();
	  const onBoard=tab==='lb';
	  if(onBoard)refreshLB(true);
	  setLBPolling(onBoard);
	}
	for(const tabBtn of $$('.km-tab'))tabBtn.addEventListener('click',()=>showTab(tabBtn.dataset.tab));
	window.addEventListener('resize',positionTabPill);

	// Return the description for one option of a composer slot ('mem', 'norm', 'act', 'act2').
	// Unknown slots yield an empty string.
	function optionInfo(slot,key){
	  if(slot==='mem')return key==='add'?MEM.info:'No residual add. The kernel starts from the input row and does not add a skip value.';
	  if(slot==='norm')return NORMS[key].info;
	  if(slot==='act'||slot==='act2')return key==='none'?'No second activation. The pipeline stops after the first activation.':ACTS[key].info;
	  return '';
	}
	// Return the button label for one option of a composer slot. Unknown slots echo the key.
	function optionLabel(slot,key){
	  if(slot==='mem')return key==='add'?'add input':'none';
	  if(slot==='norm')return NORMS[key].lbl;
	  if(slot==='act'||slot==='act2')return key==='none'?'none':ACTS[key].lbl;
	  return key;
	}
	// Translate the stored slot state into the option key that should appear selected:
	// the boolean 'mem' becomes 'add'/'none', and an empty 'act2' becomes 'none'.
	function selectedKey(slot){
	  if(slot==='mem')return state.slots.mem?'add':'none';
	  if(slot==='act2')return state.slots.act2||'none';
	  return state.slots[slot];
	}
	// Store a slot choice, update the highlighted option button, optionally show the option's
	// description (skipped when quiet is truthy), and redraw the preview.
	function setSlot(slot,key,quiet){
	  switch(slot){
	    case 'mem': state.slots.mem=key==='add'; break;
	    case 'act2': state.slots.act2=key==='none'?null:key; break;
	    default: state.slots[slot]=key;
	  }
	  const picker=root.querySelector('.km-pick[data-slot="'+slot+'"]');
	  if(picker){
	    const active=selectedKey(slot);
	    for(const opt of picker.querySelectorAll('.km-opt'))opt.classList.toggle('on',opt.dataset.key===active);
	  }
	  if(!quiet)setInfo(optionInfo(slot,key));
	  render();
	}
	// Rebuild the option buttons of every .km-pick[data-slot] container. Hovering or focusing a
	// button shows its description; clicking selects it.
	function renderPickers(){
	  for(const picker of $$('.km-pick[data-slot]')){
	    const slot=picker.dataset.slot;
	    let options;
	    switch(slot){
	      case 'mem': options=['none','add']; break;
	      case 'norm': options=Object.keys(NORMS); break;
	      case 'act': options=Object.keys(ACTS); break;
	      case 'act2': options=['none'].concat(Object.keys(ACTS)); break;
	      default: options=[];
	    }
	    picker.innerHTML='';
	    for(const key of options){
	      const describe=()=>setInfo(optionInfo(slot,key));
	      const btn=document.createElement('button');
	      btn.type='button';
	      btn.className='km-opt';
	      btn.dataset.key=key;
	      btn.textContent=optionLabel(slot,key);
	      btn.title=optionInfo(slot,key);
	      btn.classList.toggle('on',key===selectedKey(slot));
	      btn.addEventListener('mouseenter',describe);
	      btn.addEventListener('focus',describe);
	      btn.addEventListener('click',()=>setSlot(slot,key));
	      picker.appendChild(btn);
	    }
	  }
	}

	// Rebuild the classic-op buttons inside #km-classics. Clicking one selects that op,
	// highlights only that button, shows its description and redraws the preview.
	function renderClassics(){
	  const holder=$('#km-classics');
	  if(!holder)return;
	  holder.innerHTML='';
	  for(const [opKey,meta] of Object.entries(CLASSICS)){
	    const describe=()=>setInfo(meta.info);
	    const btn=document.createElement('button');
	    btn.type='button';
	    btn.className='km-opt';
	    btn.dataset.key=opKey;
	    btn.textContent=meta.lbl;
	    btn.title=meta.info;
	    btn.classList.toggle('on',state.classic===opKey);
	    btn.addEventListener('mouseenter',describe);
	    btn.addEventListener('focus',describe);
	    btn.addEventListener('click',()=>{
	      state.classic=opKey;
	      for(const other of holder.querySelectorAll('.km-opt'))other.classList.toggle('on',other===btn);
	      setInfo(meta.info);
	      render();
	    });
	    holder.appendChild(btn);
	  }
	}
	// Switch between the 'build' composer and the 'classic' op list: mark the segment button,
	// show only the matching panel, refresh the info line and redraw the preview.
	function setComposerMode(mode){
	  state.mode=mode;
	  for(const seg of $$('.km-segb'))seg.classList.toggle('on',seg.dataset.mode===mode);
	  const composePanel=$('#km-compose'), classicPanel=$('#km-classic');
	  if(composePanel)composePanel.hidden=(mode!=='build');
	  if(classicPanel)classicPanel.hidden=(mode!=='classic');
	  setInfo(machineInfo());
	  render();
	}
	for(const seg of $$('.km-segb'))seg.addEventListener('click',()=>setComposerMode(seg.dataset.mode));

	// Fill the container `id` with buttons 1..8. Choosing a value calls onChange(value), mirrors
	// it into the optional output element `outId`, and highlights the matching button. The
	// initial value is applied immediately.
	function setupStepper(id,outId,initial,onChange){
	  const holder=$(id), readout=$(outId);
	  if(!holder)return;
	  holder.innerHTML='';
	  const choose=value=>{
	    onChange(value);
	    if(readout)readout.textContent=String(value);
	    for(const btn of holder.querySelectorAll('button'))btn.classList.toggle('on',+btn.dataset.v===value);
	  };
	  Array.from({length:8},(_,idx)=>idx+1).forEach(value=>{
	    const btn=document.createElement('button');
	    btn.type='button';
	    btn.dataset.v=String(value);
	    btn.textContent=String(value);
	    btn.addEventListener('click',()=>choose(value));
	    holder.appendChild(btn);
	  });
	  choose(initial);
	}
	// Mirror a temperature slider into its readout: the slider value divided by 100, shown with
	// two decimals. Updates on every input event and once immediately.
	function setupTemp(id,outId){
	  const slider=$(id), readout=$(outId);
	  if(!slider)return;
	  function show(){
	    if(!readout)return;
	    readout.textContent=(+slider.value/100).toFixed(2);
	  }
	  slider.addEventListener('input',show);
	  show();
	}
	// Keep a Local/Pro checkbox pair mutually exclusive: turning Local on unticks and disables
	// Pro; turning Pro on unticks Local. Does nothing unless both checkboxes exist.
	function syncModePair(localSel,proSel){
	  const local=$(localSel), pro=$(proSel);
	  if(!local||!pro)return;
	  const sync=()=>{
	    if(local.checked){pro.checked=false;pro.disabled=true;}
	    else pro.disabled=false;
	  };
	  local.addEventListener('change',sync);
	  pro.addEventListener('change',()=>{if(pro.checked)local.checked=false;sync();});
	  sync();
	}

	// Preview chart geometry: a W x H drawing area with PAD padding and one point per INPUT value.
	const svg=$('#km-svg'), W=320, H=150, PAD=12, n=INPUT.length;
	// x coordinate of point i: the n points are spread evenly between the left and right padding.
	const xs=i=>PAD+i*(W-2*PAD)/(n-1);
	// y coordinate of value v: clamped to [-2.2, 2.2]; 0 maps to the vertical centre and larger
	// values move up.
	const ys=v=>{const t=Math.max(-2.2,Math.min(2.2,v));return H/2-t*(H/2-PAD)/2.2;};
	// Chart elements plus the currently displayed values and the pending animation-frame handle.
	let dots=[], line=null, cur=INPUT.slice(), anim=null;
	// Create the chart elements from scratch: a horizontal midline, the polyline and one circle
	// per value, then paint the current values.
	function buildSvg(){
	  if(!svg)return;
	  const SVG_NS='http://www.w3.org/2000/svg';
	  const make=tag=>document.createElementNS(SVG_NS,tag);
	  svg.innerHTML='';
	  const midline=make('line');
	  midline.setAttribute('x1',0);
	  midline.setAttribute('x2',W);
	  midline.setAttribute('y1',H/2);
	  midline.setAttribute('y2',H/2);
	  midline.setAttribute('stroke','#1f2d26');
	  midline.setAttribute('stroke-width','1');
	  svg.appendChild(midline);
	  line=make('polyline');
	  line.setAttribute('fill','none');
	  line.setAttribute('stroke','#34e0a1');
	  line.setAttribute('stroke-width','2');
	  line.setAttribute('stroke-linejoin','round');
	  svg.appendChild(line);
	  dots=cur.map((_,i)=>{
	    const dot=make('circle');
	    dot.setAttribute('r','3.4');
	    dot.setAttribute('cx',xs(i));
	    svg.appendChild(dot);
	    return dot;
	  });
	  paint(cur);
	}
	// Draw the values v: update the polyline points, then move each dot and colour it green for
	// non-negative values and red for negative ones.
	function paint(v){
	  if(!line)return;
	  const points=v.map((val,i)=>xs(i)+','+ys(val));
	  line.setAttribute('points',points.join(' '));
	  for(let i=0;i<v.length;i++){
	    const dot=dots[i];
	    if(!dot)continue;
	    dot.setAttribute('cy',ys(v[i]));
	    dot.setAttribute('fill',v[i]>=0?'#34e0a1':'#ff7a7a');
	  }
	}
	// Cubic ease-out: maps progress t in [0,1] to 1-(1-t)^3.
	const ease=t=>{
	  const remaining=1-t;
	  return 1-Math.pow(remaining,3);
	};
	// Animate the chart from the currently displayed values to `to` over `ms` milliseconds using
	// the ease curve. A running animation is cancelled first; on the last frame `cur` is set to
	// a copy of the target values.
	function morph(to,ms=480){
	  if(!line)return;
	  const start=cur.slice(), began=performance.now();
	  if(anim)cancelAnimationFrame(anim);
	  const tick=now=>{
	    const progress=Math.min(1,(now-began)/ms);
	    const eased=ease(progress);
	    cur=start.map((from,i)=>from+(to[i]-from)*eased);
	    paint(cur);
	    if(progress<1){
	      anim=requestAnimationFrame(tick);
	      return;
	    }
	    cur=to.slice();
	  };
	  anim=requestAnimationFrame(tick);
	}
	// Normalise the row `a` for the preview chart.
	// 'layernorm': subtract the mean and divide by sqrt(variance + 1e-5).
	// anything else: divide by sqrt(mean of squares + 1e-6), i.e. RMS normalisation.
	function normalize(a,kind){
	  const count=a.length;
	  let total=0;
	  for(const x of a)total+=x;
	  const mean=total/count;
	  if(kind==='layernorm'){
	    let sqDev=0;
	    for(const x of a)sqDev+=(x-mean)*(x-mean);
	    const variance=sqDev/count;
	    return a.map(x=>(x-mean)/Math.sqrt(variance+1e-5));
	  }
	  let sqSum=0;
	  for(const x of a)sqSum+=x*x;
	  const rms=Math.sqrt(sqSum/count+1e-6);
	  return a.map(x=>x/rms);
	}
	// Compute the preview curve for a classic op applied to INPUT.
	// - softmax / softmax_scale: stable softmax (scale 0.7 for the scaled variant), then
	//   stretched by 4 and shifted by -0.5 for display.
	// - rmsnorm / layernorm: the matching normalisation.
	// - swiglu / geglu: SiLU or GELU of x multiplied by a stand-in "up" value 0.7+0.3*|x|.
	// Any other key returns INPUT unchanged.
	function classicViz(key){
	  let v=INPUT.slice();
	  if(key.startsWith('softmax')){
	    const scale=key==='softmax_scale'?0.7:1;
	    const z=v.map(x=>x*scale), mx=Math.max(...z), e=z.map(x=>Math.exp(x-mx)), s=e.reduce((a,b)=>a+b,0);
	    v=e.map(x=>x/s*4-0.5);
	  }else if(key==='rmsnorm'||key==='layernorm'){
	    v=normalize(v,key);
	  }else if(key==='swiglu'){
	    v=v.map(x=>(x/(1+Math.exp(-x)))*(0.7+Math.abs(x)*0.3));
	  }else if(key==='geglu'){
	    v=v.map(x=>ACTS.gelu.f(x)*(0.7+Math.abs(x)*0.3));
	  }
	  return v;
	}
	// Return the preview stages as [{l: label, v: values}, ...], starting with the raw input.
	// Classic mode has a single stage after the input; build mode adds one stage per enabled
	// slot (residual add, norm, first activation, optional second activation).
	function pipeline(){
	  const stages=[{l:'input',v:INPUT.slice()}];
	  if(state.mode==='classic'){
	    stages.push({l:CLASSICS[state.classic].lbl,v:classicViz(state.classic)});
	    return stages;
	  }
	  const {mem,norm,act,act2}=state.slots;
	  let row=INPUT.slice();
	  if(mem){
	    // The residual is simulated with the input shifted by three positions at half strength.
	    row=row.map((x,i)=>x+INPUT[(i+3)%n]*0.5);
	    stages.push({l:'residual add',v:row.slice()});
	  }
	  row=normalize(row,norm);
	  stages.push({l:NORMS[norm].lbl,v:row.slice()});
	  row=row.map(ACTS[act].f);
	  stages.push({l:ACTS[act].lbl+' '+ACTS[act].verb,v:row.slice()});
	  if(act2){
	    row=row.map(ACTS[act2].f);
	    stages.push({l:ACTS[act2].lbl+' second pass',v:row.slice()});
	  }
	  return stages;
	}
	// Guard so overlapping play() calls are ignored while a walkthrough is running.
	let playing=false;
	// Step through the pipeline stages one at a time (680 ms apart), updating the stage label
	// and morphing the chart to each stage's values.
	function play(){
	  if(playing)return;
	  playing=true;
	  const stages=pipeline();
	  let idx=0;
	  const label=$('#km-stage-label');
	  const advance=()=>{
	    if(idx>=stages.length){
	      playing=false;
	      return;
	    }
	    const current=stages[idx];
	    if(label){
	      label.textContent=cleanText(current.l);
	      animate(label,[{opacity:0,transform:'translateY(4px)'},{opacity:1,transform:'none'}],{duration:200,easing:'cubic-bezier(0.16,1,0.3,1)'});
	    }
	    morph(current.v);
	    idx+=1;
	    setTimeout(advance,680);
	  };
	  advance();
	}
	// Show the final pipeline stage: set the stage label, morph the chart to its values over
	// 360 ms, and refresh the replay button.
	function render(){
	  const stages=pipeline(), label=$('#km-stage-label');
	  const finalStage=stages[stages.length-1];
	  if(label)label.textContent=cleanText(finalStage.l);
	  morph(finalStage.v,360);
	  updateReplayOffer();
	}

	// Assemble the mint recipe from the Build tab controls. The temperature slider value is
	// divided by 100 (70 when the slider is missing), and Pro is forced off whenever Local is on.
	function buildRecipe(){
	  const tempEl=$('#km-temp'), proEl=$('#km-pro'), localEl=$('#km-local'), nameEl=$('#km-name');
	  const local=Boolean(localEl&&localEl.checked);
	  const wantsPro=Boolean(proEl&&proEl.checked);
	  const acts=[state.slots.act];
	  if(state.slots.act2)acts.push(state.slots.act2);
	  const rawTemp=tempEl?tempEl.value:70;
	  const recipe={
	    mode:state.mode,
	    norm:state.slots.norm,
	    residual:!!state.slots.mem,
	    acts:acts,
	    classic:state.classic,
	    spark:'',
	    k:buildTries,
	    temp:(+rawTemp)/100,
	    pro:!local&&wantsPro,
	    local:local,
	    name:nameEl?nameEl.value:''
	  };
	  return recipe;
	}
	// Assemble the mint recipe for the Expert tab from the last resolved brief.
	// Returns null when nothing has been resolved or the brief resolved to an unknown op.
	// The resolved recipe is copied, then overlaid with the Expert tab's attempts, temperature,
	// Local/Pro switches (Pro is forced off whenever Local is on) and the raw brief text.
	function expertRecipe(){
	  if(!resolvedExpert||resolvedExpert.unknown)return null;
	  const tempEl=$('#km-xtemp'), proEl=$('#km-xpro'), localEl=$('#km-xlocal'), briefEl=$('#km-xbrief');
	  const local=!!(localEl&&localEl.checked);
	  const r=Object.assign({},resolvedExpert.recipe);
	  r.k=expertTries;
	  r.temp=(+(tempEl?tempEl.value:70))/100;
	  r.pro=!local&&!!(proEl&&proEl.checked);
	  r.local=local;
	  r.name='expert';
	  r.spark=briefEl?briefEl.value:'';
	  return r;
	}
	// Recipe for the active tab: the Expert recipe on the Expert tab when one is available,
	// otherwise the Build recipe.
	function currentRecipe(){
	  return state.tab==='expert'?(expertRecipe()||buildRecipe()):buildRecipe();
	}
	// Hand a recipe to the Python side: write it as JSON into the hidden #km_recipe textarea,
	// fire an input event so Gradio sees the change, then click the hidden #km_go button.
	function bridge(recipe){
	  const inp=document.querySelector('#km_recipe textarea');
	  if(!inp)return;
	  // _n is a nonce: it makes the recipe JSON differ on every mint so Gradio always registers a
	  // value change and re-runs do_mint. Without it, minting the same recipe twice in a row left
	  // the textarea value unchanged and the second submit was dropped (the "needs a double-click"
	  // bug). The backend echoes it back so stale blobs from the previous run cannot repaint the old
	  // result over the new loading card while Gradio is preparing the next response.
	  const nonce=Date.now().toString(36)+'-'+Math.random().toString(36).slice(2);
	  const payload=Object.assign({},recipe,{_n:nonce});
	  activeMintNonce=nonce;
	  awaitingMint=true;
	  // Go through the prototype's value setter rather than assigning inp.value directly.
	  const set=Object.getOwnPropertyDescriptor(window.HTMLTextAreaElement.prototype,'value').set;
	  set.call(inp,JSON.stringify(payload));
	  inp.dispatchEvent(new Event('input',{bubbles:true}));
	  // NOTE: we intentionally do NOT dispatch 'change' here. The leaderboard fetch is triggered
	  // separately via refreshLB() so it never competes with do_mint in Gradio's queue.
	  setTimeout(()=>{
	    const g=document.querySelector('#km_go button')||document.querySelector('#km_go');
	    if(g)g.click();
	  },80);
	}
	// Set the leaderboard status line and its severity class ('bad', 'warn', or neither).
	function setLBStatus(text,kind){
	  const statusEl=$('#km-lb-status');
	  if(!statusEl)return;
	  statusEl.textContent=text;
	  for(const cls of ['bad','warn'])statusEl.classList.toggle(cls,kind===cls);
	}
	// Put the leaderboard refresh button into (on) or out of (off) its busy state.
	function setLBButton(on){
	  const refreshBtn=$('#km-lb-refresh');
	  if(!refreshBtn)return;
	  refreshBtn.disabled=on;
	  if(on)refreshBtn.textContent='Refreshing...';
	  else refreshBtn.textContent='Refresh';
	}
	// Fetch the leaderboard straight from the browser, one URL in LB_URLS after another, merge
	// the rows and render them. Rows are de-duplicated on op, name, timestamp and both speedups.
	// Rejects with an Error when no URL returned at least one row.
	async function refreshLBDirect(){
	const rows=[], seen=new Set(), sources=[];
	for(const url of LB_URLS){
	try{
	const res=await fetch(url,{cache:'no-store'});
	if(!res.ok)continue;
	const data=await res.json();
	const got=Array.isArray(data.rows)?data.rows:[];
	// Only endpoints that returned rows count as sources.
	if(got.length)sources.push(url);
	got.forEach(row=>{
	const key=[row.op,row.name,row.ts,row.speedup_compile,row.speedup_eager].join('\|');
	if(seen.has(key))return;
	seen.add(key);
	rows.push(row);
	});
	// Best effort: a failing endpoint is skipped so the remaining ones can still be tried.
	}catch(e){}
	}
	if(!sources.length)throw new Error('direct leaderboard fetch failed');
	renderLB(rows,{fetched_at:Date.now()/1000,sources});
	}
	// Ask for a leaderboard refresh through the hidden Gradio button (#km_lb_btn).
	// While a mint is in flight the refresh is only recorded as pending. For manual refreshes,
	// or whenever the Leaderboard tab is open, the UI shows a busy state and arms an 8 s timer
	// that falls back to a direct browser fetch.
	function refreshLB(manual=false){
	  if(awaitingMint){
	    pendingLBRefresh=true;
	    if(manual)setLBStatus('Queued until the current mint finishes.','warn');
	    return;
	  }
	  if(manual||state.tab==='lb'){
	    setLBButton(true);
	    setLBStatus('Checking leaderboard...','');
	    if(lbUiTimer)clearTimeout(lbUiTimer);
	    lbUiTimer=setTimeout(()=>{
	      refreshLBDirect().catch(()=>{
	        setLBButton(false);
	        setLBStatus('Still waiting for the Gradio refresh bridge. Direct browser fetch was blocked. Try once more or reopen the tab.','warn');
	      });
	    },8000);
	  }
	  const btn=document.querySelector('#km_lb_btn button')||document.querySelector('#km_lb_btn');
	  if(btn)btn.click();
	  else{
	    setLBButton(false);
	    setLBStatus('Refresh bridge is not mounted yet. Reopen the Leaderboard tab.','bad');
	  }
	}
	// Run a non-manual leaderboard refresh after `delay` ms, replacing any refresh that is
	// already scheduled and clearing the pending flag.
	function scheduleLBRefresh(delay=700){
	  pendingLBRefresh=false;
	  if(lbRefreshTimer){
	    clearTimeout(lbRefreshTimer);
	  }
	  const fire=()=>refreshLB(false);
	  lbRefreshTimer=setTimeout(fire,delay);
	}
	// Start or stop the 15 s leaderboard poll. Any existing interval is cleared first; each
	// tick refreshes only while the Leaderboard tab is the active one.
	function setLBPolling(on){
	  if(lbPollTimer){
	    clearInterval(lbPollTimer);
	    lbPollTimer=null;
	  }
	  if(!on)return;
	  lbPollTimer=setInterval(()=>{
	    if(state.tab==='lb')refreshLB(false);
	  },15000);
	}
	// Handle of the timer that re-enables the Mint buttons if no result ever arrives.
	let mintSafetyTimer=null;
	// Enable or disable both Mint buttons (Build and Expert) and toggle their 'minting' class.
	function setMinting(on){
	  for(const sel of ['#km-build','#km-xbuild']){
	    const btn=$(sel);
	    if(!btn)continue;
	    btn.disabled=on;
	    btn.classList.toggle('minting',on);
	  }
	  if(mintSafetyTimer){
	    clearTimeout(mintSafetyTimer);
	    mintSafetyTimer=null;
	  }
	  // Bulletproof re-enable: showResult() clears this, but if a result never arrives (hung backend)
	  // the button must not stay dead. 4 min is longer than any real mint.
	  if(on){
	    mintSafetyTimer=setTimeout(()=>setMinting(false),240000);
	  }
	}

	// Display name for a replay: the curated label, else the cleaned op name stored in the
	// replay itself, else the raw key.
	function replayName(key){
	  return (REPLAY_META[key]&&REPLAY_META[key].label)||(REPLAYS[key]&&cleanText(REPLAYS[key].op))||key;
	}
	// Return the key of the recorded mint that matches the current selection exactly, or null.
	// Matches exist for classic softmax, for RMSNorm + GELU without residual, and for residual
	// LayerNorm + SiLU; a second activation never matches. The replay must be present in REPLAYS.
	function matchingReplayKey(){
	  const {mode,classic,slots}=state;
	  if(mode==='classic'&&classic==='softmax'&&REPLAYS.softmax)return 'softmax';
	  if(mode!=='build'||slots.act2)return null;
	  const isRmsGelu=!slots.mem&&slots.norm==='rmsnorm'&&slots.act==='gelu';
	  if(isRmsGelu&&REPLAYS.rmsnorm_gelu)return 'rmsnorm_gelu';
	  const isAddLnSilu=slots.mem&&slots.norm==='layernorm'&&slots.act==='silu';
	  if(isAddLnSilu&&REPLAYS.add_layernorm_silu)return 'add_layernorm_silu';
	  return null;
	}
	// Point the replay button at the recorded mint matching the current selection, or at the
	// first available replay when nothing matches, and highlight the corresponding quick-pick.
	function updateReplayOffer(){
	  const btn=$('#km-replay');
	  if(!btn||!REPLAY_ORDER.length)return;
	  const matched=matchingReplayKey();
	  const key=matched||REPLAY_ORDER[0];
	  btn.dataset.replayKey=key;
	  btn.textContent=matched?'Watch this recorded mint':'Watch a recorded mint';
	  $$('.km-rpick').forEach(b=>b.classList.toggle('on',b.dataset.replayKey===key));
	}
	// Render one quick-pick button per available replay, showing its short name and its
	// speedup over torch.compile. The container is hidden when there are no replays.
	function renderReplayChoices(){
	  const wrap=$('#km-replay-picks');
	  if(!wrap)return;
	  if(!REPLAY_ORDER.length){wrap.hidden=true;return;}
	  wrap.hidden=false;
	  wrap.innerHTML=REPLAY_ORDER.map(k=>"<button class='km-rpick' data-replay-key='"+esc(k)+"'>"+esc((REPLAY_META[k]&&REPLAY_META[k].short)||replayName(k))+" <span>"+sp3(REPLAYS[k].speedup_compile)+"x</span></button>").join('');
	  updateReplayOffer();
	}

	// Number formatters for speedups; anything that does not coerce to a finite number is '?'.
	// sp: one decimal, with a trailing '.0' dropped.
	const sp=x=>{
	  const num=+x;
	  if(!Number.isFinite(num))return '?';
	  return num.toFixed(1).replace(/\.0$/,'');
	};
	// sp2: always two decimals.
	const sp2=x=>{
	  const num=+x;
	  return Number.isFinite(num)?num.toFixed(2):'?';
	};
	// sp3: up to three decimals, with trailing zeros and a dangling dot removed.
	const sp3=x=>{
	  const num=+x;
	  if(!Number.isFinite(num))return '?';
	  return num.toFixed(3).replace(/0+$/,'').replace(/\.$/,'');
	};
	// True when a referee status is 'ok' or 'pass' (case-insensitive); falsy input is not ok.
	const isOk=s=>{
	  const status=String(s||'').toLowerCase();
	  return status==='ok'||status==='pass';
	};
	// Summarise the attempts of a mint result d as {statuses, passed, total}.
	// With a statuses array, both counts come from it. Without one, passed falls back to
	// d.n_verified (0 when not numeric) and total to d.k, then to passed.
	function attemptSummary(d){
	  const statuses=Array.isArray(d.statuses)?d.statuses:[];
	  const passed=statuses.length?statuses.filter(isOk).length:(Number.isFinite(+d.n_verified)?+d.n_verified:0);
	  const total=statuses.length||(Number.isFinite(+d.k)?+d.k:passed||0);
	  return {statuses,passed,total};
	}
	// Render one chip per attempt status (at most `limit` when given) as an HTML string.
	// Returns '' when there are no statuses to show.
	function attemptChips(d,limit){
	  const summary=attemptSummary(d);
	  const shown=summary.statuses.slice(0,limit??summary.statuses.length);
	  if(shown.length===0)return '';
	  const chips=shown.map((status,idx)=>{
	    const passedAttempt=isOk(status);
	    const cls=passedAttempt?'ok':'bad';
	    const word=passedAttempt?'OK':'FAIL';
	    return "<span class='km-attempt "+cls+"'><b>"+word+"</b> attempt "+(idx+1)+", "+esc(status)+"</span>";
	  });
	  return "<div class='km-attempts'>"+chips.join('')+"</div>";
	}
	// Look up the result elements (verdict card, source disclosure, code box) for the Expert
	// tab when mode is 'expert', otherwise for the Build tab.
	function target(mode){
	  const expert=mode==='expert';
	  return {
	    verdict:$(expert?'#km-xverdict':'#km-verdict'),
	    source:$(expert?'#km-xsource':'#km-source'),
	    code:$(expert?'#km-xsource .km-code':'#km-code')
	  };
	}
	// Hide the kernel-source disclosure for the given mode and empty its code box.
	function clearSource(mode){
	  const {source,code}=target(mode);
	  if(source){
	    source.hidden=true;
	  }
	  if(code){
	    code.textContent='';
	  }
	}
	// Show the kernel source of result d in the collapsed disclosure for the given mode.
	// The summary reads "Recorded ..." when kind is 'recorded', otherwise "Live ...".
	// A result without source clears and hides the disclosure instead.
	function setSource(d,kind,mode){
	  const t=target(mode);
	  if(!t.source||!t.code)return;
	  if(!d.source){clearSource(mode);return;}
	  t.source.hidden=false;
	  t.source.open=false;
	  const sum=t.source.querySelector('summary');
	  if(sum)sum.textContent=kind==='recorded'?'Recorded Triton kernel source':'Live Triton kernel source';
	  // textContent, not innerHTML: the source is shown verbatim and never parsed as markup.
	  t.code.textContent=d.source;
	}
	// Heuristic: does this error message look like the backend is still starting up?
	// True when the message contains any of the listed keywords (case-insensitive). The
	// alternatives must be separated by a bare '|'; an escaped pipe would be matched literally.
	function wakingError(msg){
	  return /waking|wake|timeout|timed out|still|unexpected|http|modal|loading/i.test(String(msg||''));
	}
	// Render the verdict card for a finished mint.
	//   d    - result payload. Fields read here: _n, error, local, verified, explanation, op, source,
	//          speedup_eager, speedup_compile, speedup_maxauto, gen_seconds, beat_champion,
	//          on_leaderboard (attemptSummary/attemptChips read further fields).
	//   opts - {recorded, mode}. recorded=true marks a replay: it skips the nonce check, does not
	//          re-enable Mint, adds no history row and schedules no leaderboard refresh.
	//          mode picks the target panel; it falls back to lastMintedMode, then 'build'.
	// Three outcomes are drawn: an error card, a "not verified" card, or the verified card with the
	// speedup comparisons and the kernel source.
	function showResult(d,opts={}){
	  // While a live mint is pending, drop any payload whose nonce is not the one we are waiting for.
	  if(!opts.recorded&&awaitingMint&&String(d&&d._n||'')!==String(activeMintNonce||'')){
	    return;
	  }
	  if(!opts.recorded&&awaitingMint){
	    awaitingMint=false;
	    activeMintNonce=null;
	  }
	  const mode=opts.mode||lastMintedMode||'build';
	  stopBuild();
	  stopReplay();
	  if(!opts.recorded) setMinting(false); // result arrived -> re-enable Mint (replays don't disable it)
	  const t=target(mode), v=t.verdict;
	  if(!v)return;
	  clearSource(mode);
	  if(d.error){
	    const local=!!d.local;
	    // Only remote errors can be "the backend is waking"; a local failure is shown verbatim.
	    const wake=!local&&wakingError(d.error);
	    const replayKey=matchingReplayKey()||REPLAY_ORDER[0]||'';
	    const replayBtn=replayKey?"<button class='km-inline-replay' data-replay-key='"+esc(replayKey)+"'>Watch a recorded mint while it wakes</button>":"";
	    v.innerHTML="<div class='km-card err'>"+
	      "<div class='km-result-top'><div class='km-verdict-icon bad'>!</div><div><span class='km-run-label'>"+(local?'local offline mint':'live mint')+"</span><b>"+(wake?'The live model is still waking':(local?'The local mint did not finish':'The live mint did not finish'))+"</b></div></div>"+
	      "<p class='km-dim'>"+(wake?'The backend scales to zero, so the first live mint can take about 90 seconds. The replay below is a real earlier mint, not a mockup.':(local?esc(d.error):'Backend said: '+esc(d.error)))+"</p>"+
	      (wake?replayBtn+"<p class='km-rawerr'>Backend said: "+esc(d.error)+"</p>":replayBtn)+"</div>";
	    if(pendingLBRefresh)scheduleLBRefresh();
	    return;
	  }
	  const a=attemptSummary(d);
	  if(!d.verified){
	    v.innerHTML="<div class='km-card miss'>"+
	      "<div class='km-result-top'><div class='km-verdict-icon bad'>NO</div><div><span class='km-run-label'>referee verdict</span><b>Not verified yet</b></div></div>"+
	      "<div class='km-ref-summary'><b>"+a.passed+" of "+(a.total||'?')+"</b> attempts passed the referee</div>"+
	      attemptChips(d)+"<p class='km-dim'>"+esc(d.explanation||'Press Mint again, raise attempts, or change the operation.')+"</p></div>";
	    if(pendingLBRefresh)scheduleLBRefresh();
	    return;
	  }
	  const recorded=!!opts.recorded, local=!!d.local;
	  // Headline the HONEST baseline (the compiler), not the inflated vs-eager fusion number.
	  // Strongest available wins: max-autotune > torch.compile default > eager.
	  const se=+d.speedup_eager, sc=+d.speedup_compile, sm=+d.speedup_maxauto;
	  const hasC=Number.isFinite(sc)&&sc>0, hasM=Number.isFinite(sm)&&sm>0;
	  const heroSpeed=hasM?sm:(hasC?sc:se);
	  // Short label for the history row; it must name eager when no compiler baseline came back.
	  const heroShort=(hasM||hasC)?'vs compiler':'vs PyTorch eager';
	  const missingLocalCompiler=local&&!recorded&&!hasC&&!hasM;
	  const compilerGuard=missingLocalCompiler
	    ? "<div class='km-baseline-guard'>Compiler baselines did not return for this Local run. Re-mint; a certified Local result should include torch.compile and max-autotune next to eager.</div>"
	    : "";
	  const runLabel=local?'local offline mint':(recorded?'recorded mint':'live mint');
	  const noteText=recorded?'recorded backend run':(local?'local offline run':'live backend run');
	  const note="<span class='km-pill-note "+(recorded?'recorded':(local?'local':'hit'))+"'>"+noteText+"</span>";
	  const board=d.beat_champion?"<span class='km-crown'>27B champion beaten</span>":d.on_leaderboard?"<span class='km-board'>leaderboard result</span>":"";
	  v.innerHTML="<div class='km-card ok km-result-card'>"+note+board+
	    "<div class='km-result-top'><div class='km-verdict-icon'>OK</div><div><span class='km-run-label'>"+runLabel+"</span><b>Verified by the referee</b><span>"+esc(d.op||'kernel')+"</span></div></div>"+
	    "<div class='km-ref-summary'><b>"+a.passed+" of "+(a.total||'?')+"</b> attempts passed the referee</div>"+
	    attemptChips(d)+
	    compilerGuard+
	    // Every comparison as its own big number, strongest (most honest) baseline first, eager last
	    // and labelled 'unfused' so the side-by-side is self-explanatory: the model's real edge is
	    // the ~1.1x over the compiler; the big eager number is mostly the fusion win.
	    "<div class='km-stats km-cmp'>"+
	    (hasM?"<div class='km-cmp-key'><big>"+sp3(sm)+"x</big><span>vs torch.compile max-autotune</span></div>":"")+
	    (hasC?"<div"+(hasM?"":" class='km-cmp-key'")+"><big>"+sp3(sc)+"x</big><span>vs torch.compile</span></div>":"")+
	    "<div><big>"+sp(se)+"x</big><span>vs PyTorch eager (unfused)</span></div>"+
	    "<div><big>"+(Number.isFinite(+d.gen_seconds)?sp(d.gen_seconds)+'s':'?')+"</big><span>generation</span></div></div>"+
	    "<p class='km-dim'>"+esc(d.explanation||'')+"</p></div>";
	  animate(v.firstElementChild,[{transform:'scale(0.96)',opacity:0},{transform:'scale(1)',opacity:1}],{duration:300,easing:'cubic-bezier(0.16,1,0.3,1)'});
	  setSource(d,recorded?'recorded':'live',mode);
	  if(!recorded&&mode==='build'){
	    // Reuse the card's baseline choice so the row never claims "vs compiler" for an eager number.
	    HIST.unshift("<code>"+esc(d.op||'kernel')+"</code><span>"+sp(heroSpeed)+"x "+heroShort+"</span>");
	    const hist=$('#km-hist');
	    if(hist)hist.innerHTML=HIST.slice(0,10).map(h=>"<div class='km-hrow'>"+h+"</div>").join('');
	  }
	  // The leaderboard refresh is deliberately decoupled from the mint call. Once the result has
	  // arrived, the queue is free again, so refresh the board without hiding the just-rendered result.
	  if(!recorded)scheduleLBRefresh(900);
	}

	// Handle of the pending replay step (a setTimeout id), or null when no replay is running.
	let replayTimer=null;
	// Cancel the pending replay step, if there is one.
	function stopReplay(){
	  if(!replayTimer)return;
	  clearTimeout(replayTimer);
	  replayTimer=null;
	}
	// Play back the recorded mint stored under REPLAYS[key] in the panel of the current tab.
	// One attempt chip is appended per tick, then the recorded payload is handed to showResult
	// with recorded:true. Unknown keys are ignored.
	function startReplay(key){
	  const d=REPLAYS[key];
	  if(!d)return;
	  lastMintedMode=state.tab==='expert'?'expert':'build';
	  stopBuild();
	  stopReplay();
	  play();
	  const t=target(lastMintedMode), v=t.verdict;
	  if(!v)return;
	  clearSource(lastMintedMode);
	  const statuses=Array.isArray(d.statuses)?d.statuses:[];
	  // Without per-attempt statuses, fall back to the recorded counts (d.k, default 4; d.n_verified, default 0).
	  const total=statuses.length||(Number.isFinite(+d.k)?+d.k:4);
	  const passed=statuses.length?statuses.filter(isOk).length:(Number.isFinite(+d.n_verified)?+d.n_verified:0);
	  v.innerHTML="<div class='km-card building replaying'>"+
	    "<div class='km-result-top'><div class='km-verdict-icon'>RUN</div><div><span class='km-run-label recorded'>recorded mint</span><b>"+esc(replayName(key))+"</b><span>This is a genuine earlier backend run. Mint still starts a live run.</span></div></div>"+
	    "<div class='km-replay-stage' id='km-replay-stage'>loading recorded attempts...</div>"+
	    "<div class='km-attempts' id='km-replay-attempts'></div>"+
	    "<div class='km-bar'><i id='km-replay-fill'></i></div>"+
	    "<p class='km-dim'>The model took "+(Number.isFinite(+d.gen_seconds)?sp(d.gen_seconds)+'s':'real time')+" on this recorded run. The replay shows the referee loop without waiting for cold start.</p></div>";
	  const att=$('#km-replay-attempts'), stage=$('#km-replay-stage'), fill=$('#km-replay-fill');
	  let i=0;
	  const tick=()=>{
	    if(i<total){
	      // A missing status is synthesized: the first `passed` attempts count as ok, the rest as failures.
	      const st=statuses[i]||(i<passed?'ok':'not verified'), ok=isOk(st);
	      if(stage)stage.textContent='referee verdict for attempt '+(i+1)+' of '+total;
	      if(att)att.insertAdjacentHTML('beforeend',"<span class='km-attempt "+(ok?'ok':'bad')+"'><b>"+(ok?'OK':'FAIL')+"</b> attempt "+(i+1)+", "+esc(st)+"</span>");
	      // total+1 keeps the bar short of 100% until the final "selected" step below.
	      if(fill)fill.style.width=Math.round(((i+1)/(total+1))*100)+'%';
	      i++;
	      replayTimer=setTimeout(tick,360);
	      return;
	    }
	    if(stage)stage.textContent='best verified kernel selected';
	    if(fill)fill.style.width='100%';
	    replayTimer=setTimeout(()=>showResult(d,{recorded:true,mode:lastMintedMode,replayKey:key}),420);
	  };
	  replayTimer=setTimeout(tick,180);
	}

	// Handle of the progress ticker (a setInterval id), or null when no build animation is running.
	let buildTimer=null;
	// Draw the "building" card for `mode` and start the estimated-progress ticker.
	//   mode   - forwarded to target() to pick the panel; 'build' also tries matchingReplayKey()
	//            for the inline replay button
	//   recipe - only recipe.k (attempt count, default 4) and recipe.local are read here
	// The ticker runs until stopBuild() clears it.
	function startBuild(mode,recipe){
	  stopReplay();
	  // Never leave an earlier ticker running: a second setInterval would orphan the first handle.
	  if(buildTimer){clearInterval(buildTimer);buildTimer=null;}
	  const t=target(mode), v=t.verdict;
	  if(!v)return;
	  clearSource(mode);
	  const k=recipe.k||4, started=Date.now(), replayKey=mode==='build'?(matchingReplayKey()||REPLAY_ORDER[0]||''):(REPLAY_ORDER[0]||'');
	  const local=!!recipe.local;
	  const stages=local?['loading local GGUF...','llama.cpp drafting '+Math.min(k,2)+' kernels...','compiling Triton on this Space GPU...','checking correctness vs PyTorch...','timing vs torch.compile max-autotune...']:
	    ['model waking...','drafting '+k+' kernels...','compiling Triton...','checking correctness vs PyTorch...','timing vs torch.compile max-autotune...'];
	  const stepHtml=stages.map((s,i)=>"<span class='"+(i===0?'on':'')+"'>"+esc(s)+"</span>").join('');
	  v.innerHTML="<div class='km-card building'><b class='km-bstage' id='km-bstage'>"+esc(stages[0])+"</b>"+
	    "<div class='km-build-steps' id='km-build-steps'>"+stepHtml+"</div>"+
	    "<div class='km-bar'><i id='km-barfill'></i></div>"+
	    "<span class='km-dim'>"+(local?"Local mode drafts up to two kernels with llama.cpp on this Space's GPU, then verifies them with the in-process referee and times them against PyTorch eager, torch.compile, and max-autotune. First run may also download/cache the GGUF.":"The model drafts up to "+k+" kernels. The referee compiles, correctness-checks, and times every candidate. First build of a session can take about 90 seconds while the model wakes.")+"</span>"+
	    (replayKey?"<button class='km-inline-replay' data-replay-key='"+esc(replayKey)+"'>Watch a recorded mint while this runs</button>":"")+"</div>";
	  // Honest progress: do_mint is one blocking call with no progress signal, so we cannot show a
	  // live readout. Instead we walk the referee's real phases ONCE on a rough estimate, monotonically
	  // (the index only ever increases), and park on the final phase until the result arrives. It never
	  // loops back or resets the way the old modulo version did. A gentle pulse keeps the viz alive.
	  const stageStart=[0,12,18,23,28]; // seconds at which each phase is assumed to begin
	  // Best effort: if the pipeline values cannot be read, the pulse is simply skipped.
	  let baseVals=null; try{const st=pipeline();baseVals=st[st.length-1].v.slice();}catch(e){}
	  buildTimer=setInterval(()=>{
	    const elapsed=(Date.now()-started)/1000;
	    let ix=0; for(let j=0;j<stageStart.length;j++){if(elapsed>=stageStart[j])ix=j;}
	    ix=Math.min(ix,stages.length-1);
	    const label=(ix===stages.length-1&&elapsed>45)?'still timing vs torch.compile max-autotune...':stages[ix];
	    const el=$('#km-bstage');
	    if(el&&el.textContent!==label){el.textContent=label;animate(el,[{opacity:.35,transform:'translateY(3px)'},{opacity:1,transform:'none'}],{duration:220,easing:'cubic-bezier(0.16,1,0.3,1)'});}
	    $$('#km-build-steps span').forEach((s,j)=>{s.classList.toggle('on',j===ix);s.classList.toggle('done',j<ix);});
	    const f=$('#km-barfill');
	    // Capped at 94% so the bar never looks finished before the result arrives.
	    if(f)f.style.width=Math.min(94,(Date.now()-started)/900)+'%';
	    // Scale the base values by a factor oscillating between 0.92 and 1.00.
	    if(baseVals){const p=0.96+0.04*Math.sin(elapsed*1.5);morph(baseVals.map(x=>x*p),650);}
	  },650);
	}
	// Stop the build progress ticker, snap the progress bar (if present) to 100%, and redraw.
	function stopBuild(){
	  if(buildTimer){
	    clearInterval(buildTimer);
	    buildTimer=null;
	  }
	  const bar=$('#km-barfill');
	  if(bar){bar.style.width='100%';}
	  render();
	}

	// Draw the leaderboard table into #km-lb-view and update the "last checked" status line.
	//   rows - leaderboard entries; op, name, champion, speedup_compile and speedup_eager are read.
	//          Only the first 25 are shown, in the order given.
	//   meta - optional {fetched_at, sources}; fetched_at is multiplied by 1000 before being passed
	//          to Date, so it is treated as seconds.
	function renderLB(rows,meta={}){
	  const el=$('#km-lb-view');
	  if(!el)return;
	  // Data arrived: cancel the pending UI timer and put the refresh button back to idle.
	  if(lbUiTimer){clearTimeout(lbUiTimer);lbUiTimer=null;}
	  setLBButton(false);
	  const fetched=Number(meta.fetched_at||0);
	  const stamp=fetched?new Date(fetched*1000).toLocaleTimeString([], {hour:'2-digit', minute:'2-digit', second:'2-digit'}):'just now';
	  const sourceCount=Array.isArray(meta.sources)?meta.sources.length:0;
	  setLBStatus('Last checked '+stamp+(sourceCount?' from '+sourceCount+' endpoint'+(sourceCount>1?'s':'')+'.':'.'),'');
	  if(!rows.length){el.innerHTML='<i>No kernels yet. Be the first.</i>';return;}
	  el.innerHTML="<table class='km-lbt'><thead><tr><th>#</th><th>machine</th><th>builder</th><th>vs compiler</th><th>vs PyTorch</th></tr></thead><tbody>"+
	    rows.slice(0,25).map((r,i)=>"<tr class='"+(r.champion?'champ':'')+"'><td class='km-rank'>"+(i+1)+"</td><td><code>"+esc(r.op||'kernel')+"</code></td><td>"+esc(r.name||'anonymous')+"</td><td class='hot'>"+sp2(r.speedup_compile)+"x</td><td>"+sp(r.speedup_eager)+"x</td></tr>").join('')+"</tbody></table>";
	}
	// Observe the element matching selector `id` and call cb(payload) whenever it contains a
	// [data-blob] node, where payload is the base64-decoded, JSON-parsed data-blob value.
	// Runs once immediately and again on every child-list mutation in the subtree.
	// A bad payload or a throwing handler is logged instead of being silently discarded, so a
	// failure in cb does not vanish without a trace; neither case stops the observer.
	function watch(id,cb){
	  const el=document.querySelector(id);
	  if(!el)return;
	  const read=()=>{
	    const s=el.querySelector('[data-blob]');
	    if(!s)return;
	    let payload;
	    try{payload=JSON.parse(atob(s.dataset.blob));}
	    catch(e){console.warn('km: could not decode payload in '+id,e);return;}
	    try{cb(payload);}
	    catch(e){console.error('km: handler for '+id+' failed',e);}
	  };
	  new MutationObserver(read).observe(el,{childList:true,subtree:true});
	  read();
	}
	// After a short delay, start watching the two bridge outputs: mint results go to showResult,
	// leaderboard payloads to renderLB (a payload without rows renders as an empty board).
	setTimeout(()=>{watch('#km_result',showResult);watch('#km_lb',d=>renderLB(d.rows||[],d));},250);
	// Manual leaderboard refresh button.
	const rf=$('#km-lb-refresh');
	if(rf)rf.addEventListener('click',()=>refreshLB(true));

	// Phrase tables for resolveBrief(): each entry maps a key to the phrases that select it.
	// resolveBrief lowercases the brief before matching, so every phrase here is lowercase.
	// CLASSIC_TERMS: named whole-op kernels (looked up in CLASSICS by key).
	const CLASSIC_TERMS=[
	{key:'softmax_scale',terms:['scaled softmax','scale softmax','attention scale']},
	{key:'swiglu',terms:['swiglu','swi glu']},
	{key:'geglu',terms:['geglu','ge glu']},
	{key:'softmax',terms:['softmax']}
	];
	// ACT_TERMS: activation keywords; termMatches() resolves overlaps such as 'relu' inside 'leaky relu'.
	const ACT_TERMS=[
	{key:'gelu_erf',terms:['gelu_erf','exact gelu','erf gelu']},
	{key:'relu2',terms:['relu2','relu squared','squared relu','square relu']},
	{key:'leaky_relu',terms:['leaky_relu','leaky relu','leaky']},
	{key:'silu',terms:['silu','swish','swishes']},
	{key:'sigmoid',terms:['sigmoid']},
	{key:'gelu',terms:['gelu']},
	{key:'relu',terms:['relu']},
	{key:'tanh',terms:['tanh']},
	{key:'elu',terms:['elu']},
	{key:'mish',terms:['mish']}
	];
	// Find which items are mentioned in `text`.
	//   text  - string to search (the caller passes it already lowercased)
	//   items - [{key, terms:[...]}]; every term is matched literally as a whole word
	// Returns [{key, ix, len}] in order of appearance. Each item contributes at most one match (its
	// earliest; the longest term wins a tie), and any match overlapping an already-kept one is dropped.
	function termMatches(text,items){
	  // word-boundary match so 'elu' does not fire inside 'gelu', 'relu' not inside 'leaky_relu', etc.
	  // (underscores and digits are word chars, so \b correctly keeps relu2 and gelu_erf distinct.)
	  const found=[];
	  items.forEach(item=>{
	    let best=-1, bestLen=0;
	    item.terms.forEach(term=>{
	      // Escape regex metacharacters so the term is matched literally.
	      const re=new RegExp('\\b'+term.replace(/[.*+?^${}()|[\]\\]/g,'\\$&')+'\\b');
	      const m=re.exec(text);
	      if(m&&(best<0||m.index<best||(m.index===best&&term.length>bestLen))){best=m.index;bestLen=term.length;}
	    });
	    if(best>=0)found.push({key:item.key,ix:best,len:bestLen});
	  });
	  // Earliest first; at the same position the longer match comes first.
	  found.sort((a,b)=>a.ix-b.ix||b.len-a.len);
	  // drop a shorter match that overlaps a longer one ('relu' inside 'leaky relu', 'softmax'
	  // inside 'scaled softmax'), keeping the first/longest at each span.
	  const kept=[];
	  found.forEach(m=>{ if(!kept.some(k=>m.ix<k.ix+k.len&&k.ix<m.ix+m.len))kept.push(m); });
	  return kept;
	}
	// Map a free-text brief onto a recipe the mint can run.
	// Returns {unknown:true, message} when nothing supported is recognized, otherwise
	// {unknown:false, label, recipe}. Resolution order:
	//   1. a named classic op (CLASSIC_TERMS), first match wins
	//   2. a norm with no residual and no activation, minted as the classic of the same key
	//   3. norm (default 'rmsnorm') + optional residual + up to two activations, in order of appearance
	// The original text is carried through unchanged as recipe.spark.
	function resolveBrief(text){
	  const raw=text||'', t=raw.toLowerCase();
	  const classics=termMatches(t,CLASSIC_TERMS);
	  if(classics.length){
	    const key=classics[0].key;
	    return {unknown:false,label:CLASSICS[key].lbl,recipe:{mode:'classic',norm:'rmsnorm',residual:false,acts:[],classic:key,spark:raw}};
	  }
	  // LayerNorm takes precedence over RMSNorm when both are mentioned.
	  const normHit=(t.includes('layernorm')||t.includes('layer norm'))?'layernorm':((t.includes('rmsnorm')||t.includes('rms norm')||/\brms\b/.test(t))?'rmsnorm':null);
	  const residual=/residual|resid|\bskip\b|add input|\+ input/.test(t);
	  const acts=[];
	  termMatches(t,ACT_TERMS).forEach(m=>{if(!acts.includes(m.key)&&acts.length<2)acts.push(m.key);});
	  if(!normHit&&!acts.length){
	    return {unknown:true,message:'Could not map this brief. Use RMSNorm, LayerNorm, residual, and a single supported activation, or a named op such as softmax, scaled softmax, SwiGLU, or GeGLU.'};
	  }
	  if(!acts.length){
	    if(normHit&&!residual){
	      return {unknown:false,label:CLASSICS[normHit].lbl,recipe:{mode:'classic',norm:normHit,residual:false,acts:[],classic:normHit,spark:raw}};
	    }
	    return {unknown:true,message:'This brief needs a supported activation after the residual or normalization so the referee can verify it.'};
	  }
	  const recipe={mode:'build',norm:normHit||'rmsnorm',residual,acts,classic:'softmax',spark:raw};
	  return {unknown:false,label:labelRecipe(recipe),recipe};
	}
	// Re-resolve the expert brief textarea and mirror the outcome in the UI: store it in
	// resolvedExpert, show the resolved label (or the error message), toggle the 'unknown' class,
	// and disable the expert Mint button while the brief cannot be resolved.
	function resolveExpert(){
	  const box=$('#km-xbrief'), res=$('#km-xresolved'), out=res?res.querySelector('.v'):null, btn=$('#km-xbuild');
	  if(!box||!res||!out)return;
	  resolvedExpert=resolveBrief(box.value);
	  res.classList.toggle('unknown',!!resolvedExpert.unknown);
	  out.textContent=resolvedExpert.unknown?resolvedExpert.message:resolvedExpert.label;
	  if(btn)btn.disabled=!!resolvedExpert.unknown;
	}
	// [label, brief] pairs rendered as tag buttons by renderOpTags(): the label is the button text,
	// the brief is the sentence an interactive tag writes into the expert brief box.
	// The hand-written entries come first, followed by one generated tag per key in ACTS.
	const OP_TAGS=[
	['softmax','softmax over each row'],
	['scaled_softmax','scaled softmax over each row'],
	['swiglu','SwiGLU gated feed forward'],
	['geglu','GeGLU gated feed forward'],
	['rmsnorm','RMSNorm only'],
	['layernorm','LayerNorm only'],
	['add_rmsnorm_gelu','fused RMSNorm with residual and GELU'],
	['layernorm_silu','fused LayerNorm with SiLU'],
	['rmsnorm_mish','fused RMSNorm with Mish'],
	['add_layernorm_silu','fused LayerNorm with residual and SiLU'],
	['rmsnorm_gelu_relu2','fused RMSNorm with GELU then squared ReLU']
	].concat(Object.keys(ACTS).map(k=>[k,ACTS[k].lbl+' activation']));
	// Rebuild the list of op tags inside the element matching `sel`, one button per OP_TAGS entry.
	// When `interactive` is truthy, clicking a tag writes its brief into #km-xbrief, fires a
	// bubbling 'input' event on it and focuses it; otherwise the buttons are inert.
	function renderOpTags(sel,interactive){
	  const host=$(sel);
	  if(!host)return;
	  host.innerHTML='';
	  for(const [label,brief] of OP_TAGS){
	    const tag=document.createElement('button');
	    tag.type='button';
	    tag.className='km-optag';
	    tag.textContent=label;
	    tag.dataset.brief=brief;
	    if(interactive){
	      tag.addEventListener('click',()=>{
	        const box=$('#km-xbrief');
	        if(!box)return;
	        box.value=brief;
	        box.dispatchEvent(new Event('input',{bubbles:true}));
	        box.focus();
	      });
	    }
	    host.appendChild(tag);
	  }
	}

	// Fill #km-learn-blocks with one glossary row per building block: the residual entry first,
	// then every entry of NORMS, ACTS and CLASSICS (label in bold, info text beside it).
	function renderGlossary(){
	  const host=$('#km-learn-blocks');
	  if(!host)return;
	  const pair=v=>[v.lbl,v.info];
	  const entries=[['Residual',MEM.info]].concat(
	    Object.values(NORMS).map(pair),
	    Object.values(ACTS).map(pair),
	    Object.values(CLASSICS).map(pair)
	  );
	  host.innerHTML=entries.map(([name,desc])=>"<div class='km-gloss'><b>"+esc(name)+"</b><span>"+esc(desc)+"</span></div>").join('');
	}
	// Wire every .km-copy button to copy the text of its enclosing .km-codeblock (minus the button's
	// own label) to the clipboard, then flash a short status on the button.
	// The resting label is captured once per button, not at click time: reading it on each click
	// made a second click within the 900 ms window restore 'Copied' and leave the button stuck.
	// A failed or unavailable clipboard write is reported as 'Copy failed' rather than 'Copied'.
	function setupCopyButtons(){
	  $$('.km-copy').forEach(btn=>{
	    const idle=btn.textContent;
	    let resetTimer=null;
	    const flash=label=>{
	      btn.textContent=label;
	      if(resetTimer)clearTimeout(resetTimer);
	      resetTimer=setTimeout(()=>{btn.textContent=idle;resetTimer=null;},900);
	    };
	    btn.addEventListener('click',()=>{
	      const block=btn.closest('.km-codeblock');
	      if(!block)return;
	      // Work on a clone so the button can be removed without touching the page.
	      const clone=block.cloneNode(true), copy=clone.querySelector('.km-copy');
	      if(copy)copy.remove();
	      const text=clone.textContent.trim();
	      if(navigator.clipboard&&navigator.clipboard.writeText){
	        navigator.clipboard.writeText(text).then(()=>flash('Copied'),()=>flash('Copy failed'));
	      }else{
	        flash('Copy failed');
	      }
	    });
	  });
	}

	// Delegated click handler for every "watch a recorded mint" button. These buttons are injected
	// via innerHTML by the result and build cards, so they are caught here on the widget root.
	root.addEventListener('click',e=>{
	  const b=e.target.closest('[data-replay-key]');
	  if(!b||!root.contains(b))return;
	  const key=b.dataset.replayKey;
	  if(!REPLAYS[key])return;
	  e.preventDefault();
	  startReplay(key);
	});
	// Expert brief box: re-resolve the brief on every edit.
	const xb=$('#km-xbrief');
	if(xb){xb.addEventListener('input',resolveExpert);}
	// (second-activation builder removed: the kernelsmith grammar is norm + one activation)
	// Build-tab Mint: take the recipe from the builder, show the progress card, send it over the bridge.
	const buildBtn=$('#km-build');
	if(buildBtn){
	  buildBtn.addEventListener('click',function(){
	    lastMintedMode='build';
	    const recipe=buildRecipe();
	    const frames=[{transform:'scale(0.97)'},{transform:'scale(1)'}];
	    animate(buildBtn,frames,{duration:140,easing:'cubic-bezier(0.16,1,0.3,1)'});
	    play();
	    setMinting(true);
	    startBuild('build',recipe);
	    bridge(recipe);
	  });
	}
	// Expert-tab Mint: re-resolve the brief first and do nothing when no recipe comes back.
	const xBuild=$('#km-xbuild');
	if(xBuild){
	  xBuild.addEventListener('click',function(){
	    resolveExpert();
	    const recipe=expertRecipe();
	    if(!recipe){return;}
	    lastMintedMode='expert';
	    const frames=[{transform:'scale(0.97)'},{transform:'scale(1)'}];
	    animate(xBuild,frames,{duration:140,easing:'cubic-bezier(0.16,1,0.3,1)'});
	    setMinting(true);
	    startBuild('expert',recipe);
	    bridge(recipe);
	  });
	}

	// Controls of the build tab and of the expert tab (the x-prefixed ids).
	setupStepper('#km-tries','#km-tries-v',4,v=>{buildTries=v;});
	setupStepper('#km-xtries','#km-xtries-v',5,v=>{expertTries=v;});
	setupTemp('#km-temp','#km-temp-v');
	setupTemp('#km-xtemp','#km-xtemp-v');
	syncModePair('#km-local','#km-pro');
	syncModePair('#km-xlocal','#km-xpro');
	// The build and expert pages each have their own "Local (offline)" toggle. Local mode is the ONLY
	// path that computes the max-autotune number, so a page in Modal mode shows just 2 comparisons.
	// Keep both toggles in lockstep so build and expert always run the same way and show the same
	// set of comparison numbers (the guard stops the change events from ping-ponging).
	(function(){
	  const a=$('#km-local'), b=$('#km-xlocal');
	  if(!a||!b)return;
	  a.addEventListener('change',()=>{ if(b.checked!==a.checked){ b.checked=a.checked; b.dispatchEvent(new Event('change')); } });
	  b.addEventListener('change',()=>{ if(a.checked!==b.checked){ a.checked=b.checked; a.dispatchEvent(new Event('change')); } });
	})();
	// First paint: build the static lists, then draw the initial state on the build tab.
	renderPickers();
	renderClassics();
	renderReplayChoices();
	renderOpTags('#km-oplist',true);
	renderOpTags('#km-docs-ops',false);
	renderGlossary();
	setupCopyButtons();
	buildSvg();
	setInfo(machineInfo());
	resolveExpert();
	showTab('build');
	render();
	}
	"""

	with gr.Blocks(title="Kernel Mint", head=f"<style>{CSS}</style>") as demo:
	    gr.HTML(APP_HTML)
	    # Bridge components: kept in the DOM (NOT visible=False, which removes them entirely so the
	    # custom JS can't reach the inner textarea/button) and hidden with CSS (#km_recipe etc.).
	    recipe_in = gr.Textbox(elem_id="km_recipe")
	    result_out = gr.HTML(elem_id="km_result")
	    lb_out = gr.HTML(elem_id="km_lb")
	    go = gr.Button("go", elem_id="km_go")
	    lb_btn = gr.Button("lb", elem_id="km_lb_btn")
	    # trigger_mode="multiple": the default ("once") silently DROPS a click that lands while a
	    # previous mint is still running, so minting one kernel after another needed two clicks and
	    # the old result lingered. The frontend also disables the Mint button during a mint (one at a
	    # time, no wasted ZeroGPU calls); this is the belt-and-suspenders so no click is ever swallowed.
	    go.click(
	        fn=do_mint,
	        inputs=recipe_in,
	        outputs=result_out,
	        trigger_mode="multiple",
	    )
	    # Leaderboard refresh is its OWN trigger, decoupled from minting. Previously fetch_lb was
	    # wired to recipe_in.change, so every mint (which sets recipe_in) also fired a leaderboard
	    # fetch; on a slow backend that fetch sat ahead of do_mint in Gradio's queue and the mint
	    # appeared to do nothing until a second click. Now the JS clicks #km_lb_btn only when it
	    # actually wants the board.
	    lb_btn.click(fn=fetch_lb, inputs=None, outputs=lb_out)
	    # On page load: fetch the board once, then run the widget script in the browser.
	    demo.load(fn=fetch_lb, inputs=None, outputs=lb_out)
	    demo.load(fn=None, inputs=None, outputs=None, js=APP_JS)

	if __name__ == "__main__":
	    # Listen on every interface; the port comes from $PORT and defaults to 7860.
	    port = int(os.environ.get("PORT", 7860))
	    demo.launch(server_name="0.0.0.0", server_port=port)