"""OUROBOROS Kernel Mint: a GPU-kernel demo (Gradio + custom JS). Compose a fused operation, and a 1B model writes a real Triton kernel for it while an immutable referee checks correctness against PyTorch and times it against torch.compile. The whole interactive surface is a custom JS widget in one gr.HTML; the mint runs through Python (no CORS) and hits the 1B model (default) or the 27B (Pro mode). """ from __future__ import annotations import base64 import json import os import re import sys import time import gradio as gr import requests # `spaces.GPU` only matters on ZeroGPU, where the GPU is granted lazily per call. On a dedicated # GPU Space (e.g. L4) or locally the GPU is always attached, so the decorator must be a plain # pass-through — otherwise it can fight the always-present CUDA context. Gate on the env var HF sets # only on ZeroGPU hardware, so the same app.py runs unchanged on ZeroGPU, a paid GPU, and a laptop. try: if not os.environ.get("SPACES_ZERO_GPU"): raise ImportError("not on ZeroGPU: use the no-op GPU shim") import spaces except Exception: class _SpacesShim: @staticmethod def GPU(*_args, **_kwargs): def _decorator(fn): return fn return _decorator spaces = _SpacesShim() ONE_B = os.environ.get("BACKEND_URL", "https://ymrohit--ouroboros-kernel-mint-mint-mint.modal.run") PRO = os.environ.get("BACKEND_PRO_URL", "https://ymrohit--ouroboros-kernel-mint-pro-mint-mint.modal.run") ROOT = os.path.dirname(__file__) REFEREE = os.path.join(ROOT, "referee") SEED_DIR = os.path.join(ROOT, "seed_kernels") LOCAL_FINE_TUNED_REPO = os.environ.get( "LOCAL_GGUF_REPO", "YMRohit/ouroboros-kernelsmith-minicpm5-1b-GGUF") LOCAL_BASE_REPO = os.environ.get("LOCAL_GGUF_FALLBACK_REPO", "openbmb/MiniCPM5-1B-GGUF") LOCAL_QUANT_PREFS = tuple(q.strip() for q in os.environ.get( "LOCAL_GGUF_QUANTS", "Q5_K_M,Q6_K,Q4_K_M,Q8_0,F16,BF16").split(",") if q.strip()) LOCAL_MAX_ATTEMPTS = int(os.environ.get("LOCAL_MAX_ATTEMPTS", "2")) _LOCAL_LLM = None _LOCAL_LLM_PATH = None def _ensure_referee_path(): if REFEREE not in sys.path: sys.path.insert(0, REFEREE) LOCAL_SYS = ("You are an expert GPU kernel engineer. Write a single correct, fast Triton " "kernel. Output ONLY one fenced python code block defining `run(*inputs)` and " "its @triton.jit kernel. Accumulate reductions in float32. No prose.") # The model was trained (rl_kernelsmith.py Proposer.prompt) with a real per-op SEED kernel as the # style guide, and for almost every op that exemplar is rmsnorm's ROW-WISE reduction. We must hand # the model that SAME exemplar at inference, or it goes off-distribution and writes flat elementwise # code for reduction ops (wrong results). Seeds are bundled in ./seed_kernels; this inline copy is # only a fallback if a file is missing. _FALLBACK_EXEMPLAR = """# GOLD seed kernel: fused RMSNorm, one row per program, fp32 accumulation. @triton.jit def _rmsnorm_kernel(x_ptr, w_ptr, y_ptr, stride, N, eps, BLOCK: tl.constexpr): row = tl.program_id(0) x_ptr += row * stride y_ptr += row * stride acc = tl.zeros([BLOCK], dtype=tl.float32) for off in range(0, N, BLOCK): cols = off + tl.arange(0, BLOCK) x = tl.load(x_ptr + cols, mask=cols < N, other=0.0).to(tl.float32) acc += x * x rms = tl.rsqrt(tl.sum(acc) / N + eps) for off in range(0, N, BLOCK): cols = off + tl.arange(0, BLOCK) mask = cols < N x = tl.load(x_ptr + cols, mask=mask, other=0.0).to(tl.float32) w = tl.load(w_ptr + cols, mask=mask, other=0.0).to(tl.float32) tl.store(y_ptr + cols, (x * rms * w), mask=mask) def run(x, w): M, N = x.shape y = torch.empty_like(x) _rmsnorm_kernel[(M,)](x, w, y, x.stride(0), N, 1e-6, BLOCK=1024) return y """ def _load_seed_kernel(op: str) -> str: try: with open(os.path.join(SEED_DIR, f"{op}.py")) as f: return f.read() except OSError: return _FALLBACK_EXEMPLAR def extract_kernel(text: str) -> str: """Pull a fenced Python kernel out of a llama.cpp completion.""" m = re.search(r"```(?:python)?\s*(.*?)```", text or "", re.S) body = m.group(1) if m else (text or "") starts = [body.find(k) for k in ("@triton", "import ", "def run", "def _") if body.find(k) >= 0] i = min(starts or [0]) return body[i:].strip() def _local_op_from_recipe(r: dict) -> tuple[str, str]: _ensure_referee_path() from specs import SPECS if r.get("mode") == "classic": op = (r.get("classic") or "softmax").strip() if op not in SPECS: raise ValueError(f"unknown op '{op}'") return op, op normfull = "rmsnorm" if str(r.get("norm", "rmsnorm")).startswith("rms") else "layernorm" norm = "rms" if normfull == "rmsnorm" else "layer" residual = bool(r.get("residual")) acts = [a for a in (r.get("acts") or []) if a][:3] or ["gelu"] label = ("residual + " if residual else "") + \ ("RMSNorm" if norm == "rms" else "LayerNorm") + " -> " + " -> ".join(acts) named = ("add_" if residual else "") + normfull + "_" + acts[0] if len(acts) == 1 and named in SPECS: return named, label return f"chain|{norm}|{'1' if residual else '0'}|{','.join(acts)}", label def _local_prompt_messages(op: str, spark: str = "") -> list[dict[str, str]]: _ensure_referee_path() from specs import get_spec, SPECS spec = get_spec(op) # Same exemplar rule as training: the real seed kernel for the next op in SPECS that isn't this # one (almost always rmsnorm's row-wise reduction). This is the single thing that makes the 1B # write correct kernels here instead of elementwise guesses. exemplar_op = next((o for o in SPECS if o != op), op) exemplar = _load_seed_kernel(exemplar_op) user = (f"Op `{op}`: {spec.notes}\nSignature:\n{spec.signature_hint}\n\n" f"Here is a valid Triton kernel for a DIFFERENT op (`{exemplar_op}`) as a style guide:\n" f"```python\n{exemplar}\n```\n") if spark: user += (f"\nThe person building this kernel added their own idea: \"{spark}\". " "Honour their idea where you can, but correctness is mandatory.") return [{"role": "system", "content": LOCAL_SYS}, {"role": "user", "content": user}] def _render_prompt(messages: list[dict[str, str]]) -> str: # MiniCPM5 uses ChatML (<|im_start|>/<|im_end|>). It is a reasoning model, so we append the # empty block (the enable_thinking=False convention) to make it answer # directly with the kernel instead of emitting a reasoning trace. Verified against the # tokenizer's apply_chat_template(enable_thinking=False) output. out = [] for m in messages: out.append(f"<|im_start|>{m['role']}\n{m['content']}<|im_end|>\n") out.append("<|im_start|>assistant\n\n\n\n\n") return "".join(out) def _common_gguf_names() -> list[str]: names = [] stems = ("MiniCPM5-1B", "minicpm5-1b", "model", "ggml-model") for q in LOCAL_QUANT_PREFS: for stem in stems: names.append(f"{stem}-{q}.gguf") names.append(f"{stem}.{q}.gguf") return names def _download_gguf_from_repo(repo_id: str) -> str: from huggingface_hub import HfApi, hf_hub_download explicit = os.environ.get("LOCAL_GGUF_FILE") cache_dir = os.environ.get("LOCAL_GGUF_CACHE") if explicit: return hf_hub_download(repo_id=repo_id, filename=explicit, cache_dir=cache_dir) files = [] try: files = HfApi().list_repo_files(repo_id) except Exception: files = [] ggufs = [f for f in files if f.lower().endswith(".gguf")] for quant in LOCAL_QUANT_PREFS: hit = next((f for f in ggufs if quant.lower() in f.lower()), None) if hit: return hf_hub_download(repo_id=repo_id, filename=hit, cache_dir=cache_dir) if ggufs: return hf_hub_download(repo_id=repo_id, filename=ggufs[0], cache_dir=cache_dir) last = None for filename in _common_gguf_names(): try: return hf_hub_download(repo_id=repo_id, filename=filename, cache_dir=cache_dir) except Exception as e: last = e raise RuntimeError(f"no GGUF found in {repo_id}: {last}") def _resolve_local_gguf() -> str: errors = [] for repo_id in (LOCAL_FINE_TUNED_REPO, LOCAL_BASE_REPO): try: return _download_gguf_from_repo(repo_id) except Exception as e: errors.append(f"{repo_id}: {type(e).__name__}: {str(e)[:180]}") raise RuntimeError("could not load a local GGUF. " + " | ".join(errors)) _CUDA_PRELOADED = False def _preload_cuda_libs(): # The CUDA llama.cpp wheel's libllama.so links libcudart.so.12 / libcublas*.so.12, which on HF # Spaces live inside torch's bundled nvidia-* packages and are NOT on the default loader path, # so the import dies with "libcudart.so.12: cannot open shared object file". Preload them with # RTLD_GLOBAL (in dependency order) so libllama.so resolves their symbols. No-op if absent. global _CUDA_PRELOADED if _CUDA_PRELOADED: return _CUDA_PRELOADED = True import ctypes import glob roots = [] try: import torch roots.append(os.path.join(os.path.dirname(torch.__file__), "lib")) except Exception: pass try: import site roots += list(site.getsitepackages() if hasattr(site, "getsitepackages") else []) except Exception: pass roots += ["/usr/local/lib/python3.10/site-packages", "/usr/local/cuda/lib64", "/usr/lib/x86_64-linux-gnu", "/usr/local/lib"] # Match versioned sonames too (e.g. libcudart.so.12.4.127 with no libcudart.so.12 symlink); # RTLD_GLOBAL makes the symbols visible to libllama.so regardless of the exact soname. for pat in ("libcudart.so*", "libcublas.so*", "libcublasLt.so*"): loaded = False for root in roots: if loaded: break try: hits = sorted(glob.glob(os.path.join(root, "**", pat), recursive=True)) except Exception: hits = [] for hit in hits: try: ctypes.CDLL(hit, mode=ctypes.RTLD_GLOBAL) loaded = True break except Exception: continue # Confirm the CUDA llama.cpp lib loads at startup (loading the .so needs no GPU device, so it spends # no ZeroGPU quota). It also warms the import so the in-@spaces.GPU call is instant. if os.environ.get("LOCAL_LLAMA_GPU_LAYERS", "-1") != "0": try: _preload_cuda_libs() import llama_cpp as _lc_probe print(f"[startup] llama_cpp {_lc_probe.__version__} loaded OK", flush=True) except Exception as _e: print(f"[startup] llama_cpp load FAILED: {type(_e).__name__}: {str(_e)[:200]}", flush=True) def _get_local_llm(): global _LOCAL_LLM, _LOCAL_LLM_PATH on_zero = bool(os.environ.get("SPACES_ZERO_GPU")) gpu_layers = int(os.environ.get("LOCAL_LLAMA_GPU_LAYERS", "-1")) # Default to full GPU offload (-1). On ZeroGPU the H200 is ~30x faster than its throttled # shared CPU for this 1B, and the GPU is granted only inside @spaces.GPU and detached between # calls, so a GPU-resident model can't be reused across mints: rebuild it each call there. # Off ZeroGPU (dedicated GPU or CPU) the model is cached once. Set LOCAL_LLAMA_GPU_LAYERS=0 # to force CPU (e.g. when only a CPU-only llama.cpp wheel is installed). if _LOCAL_LLM is not None and not (on_zero and gpu_layers != 0): return _LOCAL_LLM if gpu_layers != 0: _preload_cuda_libs() from llama_cpp import Llama _LOCAL_LLM_PATH = _resolve_local_gguf() threads = int(os.environ.get("LOCAL_LLAMA_THREADS", str(max(1, (os.cpu_count() or 4) - 1)))) llm = Llama( model_path=_LOCAL_LLM_PATH, n_ctx=int(os.environ.get("LOCAL_LLAMA_CTX", "4096")), n_threads=threads, n_gpu_layers=gpu_layers, verbose=bool(int(os.environ.get("LOCAL_LLAMA_VERBOSE", "0"))), ) if not (on_zero and gpu_layers != 0): _LOCAL_LLM = llm return llm def _llama_complete(messages: list[dict[str, str]], temp: float, seed: int) -> str: # Use raw completion with our think-suppressed ChatML render (not create_chat_completion, which # leaves the reasoning trace on and makes a 1B ramble instead of writing the kernel). llm = _get_local_llm() max_tokens = int(os.environ.get("LOCAL_MAX_TOKENS", "768")) out = llm.create_completion( _render_prompt(messages), max_tokens=max_tokens, temperature=temp, top_p=0.97, seed=seed, stop=["<|im_end|>", "<|im_start|>"], ) return out["choices"][0]["text"] def _local_gpu_duration() -> int: # This is a ZeroGPU reservation window, not a benchmark setting. Keep the default within the # usual free-tier budget, but make the cap explicit so slower/cold hardware can raise it without # code changes. The current Space had LOCAL_GPU_DURATION=135, which over-reserved quota; the # default cap trims that to 120 while still leaving headroom for cold local mints. try: requested = int(os.environ.get("LOCAL_GPU_DURATION", "120")) except Exception: requested = 120 try: cap = int(os.environ.get("LOCAL_GPU_DURATION_CAP", "120")) except Exception: cap = 120 return max(45, min(max(45, cap), requested)) LOCAL_GPU_DURATION = _local_gpu_duration() @spaces.GPU(duration=LOCAL_GPU_DURATION) def _local_gpu_mint(op: str, k: int, temp: float, spark: str) -> dict: # Generation AND verification happen inside this one GPU window. That is required on ZeroGPU, # where the GPU exists only inside @spaces.GPU: llama.cpp offloads generation to the H200 # (LOCAL_LLAMA_GPU_LAYERS=-1, seconds instead of minutes on the throttled CPU) and the referee # compiles + runs Triton, all here. Off ZeroGPU the decorator is a no-op and the GPU is always # present, so the same code path works on a dedicated GPU and locally too. _ensure_referee_path() # Time every verified kernel against eager AND torch.compile (default + max-autotune), so local # mode shows the honest baselines, not just the inflated vs-eager fusion win. The max-autotune # compile is a few seconds for these ops (inductor-cached across the k attempts), so it fits. from harness import evaluate_inprocess_full messages = _local_prompt_messages(op, spark) srcs = [] base_seed = int(time.time() * 1000) & 0x7FFFFFFF for i in range(k): text = _llama_complete(messages, temp=temp, seed=base_seed + i) src = extract_kernel(text) if src: srcs.append(src) statuses = [] best = None n_shapes = int(os.environ.get("LOCAL_REFEREE_SHAPES", "2")) n_iters = int(os.environ.get("LOCAL_REFEREE_ITERS", "30")) for i, src in enumerate(srcs): try: res = evaluate_inprocess_full(src, op, n_shapes=n_shapes, n_iters=n_iters, seed=i) except Exception: statuses.append("runtime_fail") continue statuses.append(res.status) # Keep the FASTEST verified kernel (lowest latency = best vs every baseline at once). if res.status == "ok" and (best is None or res.latency_ms < best["result"]["latency_ms"]): best = {"source": src, "result": res.to_dict()} return {"n_srcs": len(srcs), "statuses": statuses, "best": best} def _local_explain(label: str, res) -> str: if res is None or res.get("status") != "ok": status = "no verified attempt" if res is None else res.get("status", "failed") return (f"The local referee rejected these kernels ({status}). It still compiled and " "checked them inside this Space; try minting again or simplify the operation.") se = float(res.get("speedup_eager", 0) or 0) honest = res.get("speedup_maxauto") or res.get("speedup_compile") bl = "torch.compile max-autotune" if res.get("speedup_maxauto") else "torch.compile" if honest: return (f"Verified correct. {float(honest):.2f}x faster than {bl} (the honest baseline that " f"also fuses and autotunes), and {se:.0f}x vs unfused PyTorch eager. The whole loop " "ran in this Space: the 1B wrote it via llama.cpp, the referee compiled, checked it " "against PyTorch, and timed it.") return (f"Verified correct, {se:.0f}x faster than unfused PyTorch eager. The 1B wrote it via " "llama.cpp and the referee checked and timed it, all inside this Space.") def local_mint(recipe: dict) -> dict: # The 1B reliably writes the named ops and single-activation machines (norm + optional residual # + one activation), which is what it was trained on. Arbitrary multi-activation chains are # off-distribution and it can't write them correctly, so local mode declines them with a clear # message instead of looping through rejected attempts. Pro mode (the 27B) is the place for those. op, label = _local_op_from_recipe(recipe) if op.startswith("chain|"): return {"op": label, "verified": False, "local": True, "baseline": "eager", "statuses": [], "gen_seconds": 0.0, "verify_seconds": 0.0, "speedup_compile": None, "unsupported": True, "k": 0, "explanation": ("This machine stacks two activations, which is outside what the " "kernelsmith models were trained on (the grammar is a norm plus one " "activation), so neither the 1B nor the 27B writes it reliably. Drop " "to a single activation and it mints with a verified kernel.")} k = max(1, min(LOCAL_MAX_ATTEMPTS, int(recipe.get("k", 1)))) temp = max(0.2, min(1.3, float(recipe.get("temp", 0.7)))) spark = (recipe.get("spark") or "").strip()[:200] # Pre-fetch the GGUF to disk OUTSIDE the GPU window, so the ZeroGPU 120s budget is spent on # generate + verify rather than a 1.15GB download. _resolve_local_gguf() t0 = time.time() work = _local_gpu_mint(op, k, temp, spark) total_s = round(time.time() - t0, 1) statuses = work.get("statuses") or [] best = work.get("best") if not work.get("n_srcs"): return {"op": label, "verified": False, "local": True, "baseline": "eager", "statuses": statuses, "gen_seconds": total_s, "verify_seconds": 0.0, "speedup_compile": None, "k": k, "explanation": "llama.cpp returned no parseable kernel this time. Try minting again."} if not best: return {"op": label, "verified": False, "local": True, "baseline": "eager", "statuses": statuses, "gen_seconds": total_s, "verify_seconds": 0.0, "speedup_compile": None, "explanation": _local_explain(label, None), "k": k} res = best["result"] def _su(key): v = res.get(key) return round(float(v), 2) if v else None return {"op": label, "verified": True, "local": True, "baseline": "compile", "speedup_eager": round(float(res.get("speedup_eager", 0)), 2), "speedup_compile": _su("speedup_compile"), "speedup_maxauto": _su("speedup_maxauto"), "latency_ms": res.get("latency_ms"), "eager_ms": res.get("eager_ms"), "compile_ms": res.get("compile_ms"), "maxauto_ms": res.get("maxauto_ms"), "source": best["source"], "statuses": statuses, "gen_seconds": total_s, "verify_seconds": 0.0, "explanation": _local_explain(label, res), "n_verified": sum(1 for s in statuses if s == "ok"), "k": k} def lb_url(u): return u.replace("-mint-mint.modal.run", "-leaderboard.modal.run") def _blob(d): return f"" def _with_nonce(d: dict, r: dict) -> dict: out = dict(d or {}) if r.get("_n") is not None: out["_n"] = r.get("_n") return out def do_mint(recipe_json: str) -> str: try: r = json.loads(recipe_json or "{}") except Exception: return _blob({"error": "bad recipe"}) if r.get("local"): try: return _blob(_with_nonce(local_mint(r), r)) except Exception as e: return _blob(_with_nonce( {"error": f"Local offline mint failed: {type(e).__name__}: {str(e)[:240]}", "local": True, "baseline": "eager"}, r)) base = PRO if r.get("pro") else ONE_B if r.get("mode") == "classic": payload = {"op": r.get("classic", "softmax")} else: payload = {"blocks": {"norm": r.get("norm", "rmsnorm"), "residual": bool(r.get("residual")), "acts": r.get("acts") or ["gelu"]}} payload.update({"name": (r.get("name") or "anonymous")[:40], "k": int(r.get("k", 4)), "temp": float(r.get("temp", 0.7)), "spark": (r.get("spark") or "")[:200]}) # During a cold start Modal can return an empty body before the model finishes loading, # which makes .json() blow up. Retry a few times, then explain it plainly. last = "" for attempt in range(4): try: resp = requests.post(base, json=payload, timeout=600) if resp.status_code == 200 and resp.text.strip(): try: return _blob(_with_nonce(resp.json(), r)) except ValueError: last = "the model returned an unexpected response" else: last = f"the model is still waking up (HTTP {resp.status_code})" except Exception as e: last = str(e) time.sleep(4) pro = " The 27B in Pro mode takes longer to wake. Try again, or turn Pro off." if r.get("pro") else "" return _blob(_with_nonce( {"error": f"The model is still waking up. Give it about 30 seconds and press Mint again.{pro}"}, r)) def fetch_lb(_=None): rows = [] seen = set() sources = [] errors = [] for u in (ONE_B, PRO): try: url = lb_url(u) got = requests.get(url, timeout=20).json().get("rows", []) if got: sources.append(url) for row in got: key = (row.get("op"), row.get("name"), row.get("ts"), row.get("speedup_compile"), row.get("speedup_eager")) if key in seen: continue seen.add(key) rows.append(row) except Exception as e: errors.append(f"{lb_url(u)}: {type(e).__name__}") continue # Always include fetched_at so the hidden Gradio HTML changes even when the rows are identical. # That makes the visible Refresh button observably work. return _blob({"rows": rows, "fetched_at": time.time(), "sources": sources, "errors": errors[:2]}) CSS = open(os.path.join(os.path.dirname(__file__), "style.css")).read() if os.path.exists( os.path.join(os.path.dirname(__file__), "style.css")) else "" REPLAYS_PATH = os.path.join(os.path.dirname(__file__), "replays", "index.json") try: with open(REPLAYS_PATH, encoding="utf-8") as f: REPLAYS_JSON = json.dumps(json.load(f), separators=(",", ":")).replace("
Kernel Mint an OUROBOROS demo

A 1B model writes a real GPU kernel. A referee that can't be fooled checks it.

Compose an operation, and a 1-billion-parameter model writes a fused Triton kernel for it. The kernel is compiled, checked against PyTorch on adversarial inputs, and timed against PyTorch's own compiler before anything counts as a win.

inputa row of activations
residual
normalize
activate

Runs entirely in this Space: a 1B GGUF model (llama.cpp) writes the kernel on this Space's GPU, then the in-process referee compiles it, checks correctness against PyTorch, and times it against PyTorch eager, torch.compile, and max-autotune. No Modal or cloud model calls. Pro uses the 27B.

input
readyPress Mint to begin

Compose a pipeline on the left and mint it, or watch a recorded verified mint to see the whole referee loop instantly.

No mints yet.
""" APP_HTML = (APP_HTML .replace("__KM_REPLAYS__", REPLAYS_JSON) .replace("__KM_LB_ONE__", lb_url(ONE_B)) .replace("__KM_LB_PRO__", lb_url(PRO))) APP_JS = r""" () => { const ACTS = { gelu:{lbl:'GELU',verb:'smoothly bends',info:'GELU is the gentle bend used inside many transformers. It lets a little of the negative side through.',f:x=>0.5*x*(1+Math.tanh(0.7978845608*(x+0.044715*x*x*x)))}, silu:{lbl:'SiLU',verb:'swishes',info:'SiLU, also called Swish, is x times sigmoid(x). It is common in LLaMA style feed-forward blocks.',f:x=>x/(1+Math.exp(-x))}, relu:{lbl:'ReLU',verb:'clips negatives',info:'ReLU keeps positive values and clips negative values to zero. It is fast, classic, and direct.',f:x=>Math.max(x,0)}, tanh:{lbl:'Tanh',verb:'squashes to -1..1',info:'Tanh squashes each value into the range from -1 to 1. Large values flatten near the top and bottom.',f:x=>Math.tanh(x)}, sigmoid:{lbl:'Sigmoid',verb:'squashes to 0..1',info:'Sigmoid squashes each value into the range from 0 to 1. It is often used for gates and probabilities.',f:x=>1/(1+Math.exp(-x))}, relu2:{lbl:'ReLU2',verb:'clips then squares',info:'Squared ReLU clips negative values to zero, then squares the remaining positive values.',f:x=>{const r=Math.max(x,0);return r*r;}}, gelu_erf:{lbl:'GELU exact',verb:'bends exactly',info:'Exact GELU uses the erf form of GELU. It behaves almost like the tanh approximation, but follows the exact formula.',f:x=>0.5*x*(1+Math.tanh(0.7978845608*(x+0.044715*x*x*x)))}, leaky_relu:{lbl:'Leaky ReLU',verb:'leaks negatives',info:'Leaky ReLU is like ReLU, but negative values leak through at a small scale instead of becoming zero.',f:x=>x>0?x:0.01*x}, elu:{lbl:'ELU',verb:'curves negatives',info:'ELU passes positive values through and curves negative values smoothly down toward -1.',f:x=>x>0?x:Math.exp(x)-1}, mish:{lbl:'Mish',verb:'self gates',info:'Mish is a smooth self-gating activation, x times tanh of softplus(x).',f:x=>x*Math.tanh(Math.log(1+Math.exp(x)))}, }; const NORMS = { rmsnorm:{lbl:'RMSNorm',info:'RMSNorm makes a row of values a consistent size by dividing by root mean square, then scaling.'}, layernorm:{lbl:'LayerNorm',info:'LayerNorm centers a row of values by subtracting the average, then rescales by the spread.'}, }; const MEM = {info:'Residual add means adding the input back before the next operation. This is the skip path used in deep models.'}; const CLASSICS = { softmax:{lbl:'Softmax',info:'Softmax turns a row of scores into probabilities that add to 1.'}, softmax_scale:{lbl:'Scaled softmax',info:'Scaled softmax applies a scale before softmax. It is the attention score step used before attention weights.'}, swiglu:{lbl:'SwiGLU',info:'SwiGLU is a gated feed-forward operation: SiLU on the gate multiplied by the up projection.'}, geglu:{lbl:'GeGLU',info:'GeGLU is a gated feed-forward operation like SwiGLU, but with a GELU gate.'}, rmsnorm:{lbl:'RMSNorm',info:'RMSNorm on its own, without an activation after it.'}, layernorm:{lbl:'LayerNorm',info:'LayerNorm on its own, without an activation after it.'}, }; const INPUT = [0.6,-0.9,1.4,-0.3,0.2,-1.6,0.8,1.1,-0.5,0.0,1.7,-1.1,0.4,-0.2]; const root=document.getElementById('km'); if(!root||root.dataset.init)return; root.dataset.init='1'; const $=s=>root.querySelector(s), $$=s=>Array.from(root.querySelectorAll(s)); const LB_URLS=[root.dataset.lbOne,root.dataset.lbPro].filter(Boolean); const cleanText=s=>String(s??'').replace(/\u2013|\u2014/g,' - ').replace(/\u2192/g,'->').replace(/\u00d7/g,'x').replace(/[\u2600-\u27BF]/g,'').replace(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g,'').replace(/\s+/g,' ').trim(); const esc=s=>cleanText(s).replace(/[&<>"']/g,c=>({'&':'&','<':'<','>':'>','"':'"',"'":'''}[c])); let REPLAYS={}; try{REPLAYS=JSON.parse(($('#km-replays-data')||{}).textContent||'{}');}catch(e){REPLAYS={};} const REPLAY_META={ rmsnorm_gelu:{label:'RMSNorm to GELU',short:'RMSNorm GELU'}, softmax:{label:'Softmax',short:'Softmax'}, add_layernorm_silu:{label:'Residual plus LayerNorm to SiLU',short:'Residual LayerNorm SiLU'} }; const REPLAY_ORDER=['rmsnorm_gelu','softmax','add_layernorm_silu'].filter(k=>REPLAYS[k]); const state={tab:'build',mode:'build',slots:{mem:false,norm:'rmsnorm',act:'gelu',act2:null},classic:'softmax'}; let buildTries=4, expertTries=5, resolvedExpert=null, lastMintedMode='build'; let activeMintNonce=null, awaitingMint=false, pendingLBRefresh=false, lbRefreshTimer=null, lbPollTimer=null, lbUiTimer=null; const HIST=[]; function animate(el,frames,opts){ if(el&&el.animate)el.animate(frames,opts); } function labelRecipe(r){ if(!r)return 'unknown'; if(r.mode==='classic')return CLASSICS[r.classic]?.lbl||r.classic||'classic op'; const p=[]; if(r.residual)p.push('residual add'); p.push(NORMS[r.norm]?.lbl||r.norm); (r.acts||[]).forEach(a=>p.push(ACTS[a]?.lbl||a)); return p.join(' + '); } function machineInfo(){ if(state.mode==='classic')return CLASSICS[state.classic].info; const p=[]; p.push(state.slots.mem?'add input':'no residual add'); p.push(NORMS[state.slots.norm].lbl); p.push(ACTS[state.slots.act].lbl); if(state.slots.act2)p.push(ACTS[state.slots.act2].lbl); return 'Current operation: '+p.join(' + ')+'. The model must fuse it into one verified kernel.'; } function setInfo(t){ const info=$('#km-info'); if(!info)return; info.textContent=cleanText(t||machineInfo()); animate(info,[{opacity:.45},{opacity:1}],{duration:180,easing:'cubic-bezier(0.16,1,0.3,1)'}); } function positionTabPill(){ const tabs=$('#km-tabs'), pill=$('#km-tab-pill'), btn=$('.km-tab.on'); if(!tabs||!pill||!btn)return; const tr=tabs.getBoundingClientRect(), br=btn.getBoundingClientRect(); pill.style.width=br.width+'px'; pill.style.transform='translateX('+(br.left-tr.left-4)+'px)'; } function showTab(tab){ state.tab=tab; $$('.km-tab').forEach(b=>b.classList.toggle('on',b.dataset.tab===tab)); $$('.km-page').forEach(p=>{p.hidden=p.dataset.page!==tab;}); requestAnimationFrame(positionTabPill); if(tab==='expert')resolveExpert(); if(tab==='lb'){refreshLB(true); setLBPolling(true);} else setLBPolling(false); } $$('.km-tab').forEach(b=>b.addEventListener('click',()=>showTab(b.dataset.tab))); window.addEventListener('resize',positionTabPill); function optionInfo(slot,key){ if(slot==='mem')return key==='add'?MEM.info:'No residual add. The kernel starts from the input row and does not add a skip value.'; if(slot==='norm')return NORMS[key].info; if(slot==='act'||slot==='act2')return key==='none'?'No second activation. The pipeline stops after the first activation.':ACTS[key].info; return ''; } function optionLabel(slot,key){ if(slot==='mem')return key==='add'?'add input':'none'; if(slot==='norm')return NORMS[key].lbl; if(slot==='act'||slot==='act2')return key==='none'?'none':ACTS[key].lbl; return key; } function selectedKey(slot){ if(slot==='mem')return state.slots.mem?'add':'none'; if(slot==='act2')return state.slots.act2||'none'; return state.slots[slot]; } function setSlot(slot,key,quiet){ if(slot==='mem')state.slots.mem=key==='add'; else if(slot==='act2')state.slots.act2=key==='none'?null:key; else state.slots[slot]=key; const wrap=root.querySelector('.km-pick[data-slot="'+slot+'"]'); if(wrap)wrap.querySelectorAll('.km-opt').forEach(b=>b.classList.toggle('on',b.dataset.key===selectedKey(slot))); if(!quiet)setInfo(optionInfo(slot,key)); render(); } function renderPickers(){ $$('.km-pick[data-slot]').forEach(wrap=>{ const slot=wrap.dataset.slot; let keys=[]; if(slot==='mem')keys=['none','add']; if(slot==='norm')keys=Object.keys(NORMS); if(slot==='act')keys=Object.keys(ACTS); if(slot==='act2')keys=['none'].concat(Object.keys(ACTS)); wrap.innerHTML=''; keys.forEach(key=>{ const b=document.createElement('button'); b.type='button'; b.className='km-opt'; b.dataset.key=key; b.textContent=optionLabel(slot,key); b.title=optionInfo(slot,key); b.classList.toggle('on',key===selectedKey(slot)); b.addEventListener('mouseenter',()=>setInfo(optionInfo(slot,key))); b.addEventListener('focus',()=>setInfo(optionInfo(slot,key))); b.addEventListener('click',()=>setSlot(slot,key)); wrap.appendChild(b); }); }); } function renderClassics(){ const cwrap=$('#km-classics'); if(!cwrap)return; cwrap.innerHTML=''; Object.entries(CLASSICS).forEach(([k,v])=>{ const b=document.createElement('button'); b.type='button'; b.className='km-opt'; b.dataset.key=k; b.textContent=v.lbl; b.title=v.info; b.classList.toggle('on',state.classic===k); b.addEventListener('mouseenter',()=>setInfo(v.info)); b.addEventListener('focus',()=>setInfo(v.info)); b.addEventListener('click',()=>{ state.classic=k; cwrap.querySelectorAll('.km-opt').forEach(x=>x.classList.toggle('on',x===b)); setInfo(v.info); render(); }); cwrap.appendChild(b); }); } function setComposerMode(mode){ state.mode=mode; $$('.km-segb').forEach(b=>b.classList.toggle('on',b.dataset.mode===mode)); const compose=$('#km-compose'), classic=$('#km-classic'); if(compose)compose.hidden=mode!=='build'; if(classic)classic.hidden=mode!=='classic'; setInfo(machineInfo()); render(); } $$('.km-segb').forEach(b=>b.addEventListener('click',()=>setComposerMode(b.dataset.mode))); function setupStepper(id,outId,initial,onChange){ const wrap=$(id), out=$(outId); if(!wrap)return; wrap.innerHTML=''; const set=v=>{ onChange(v); if(out)out.textContent=String(v); wrap.querySelectorAll('button').forEach(b=>b.classList.toggle('on',+b.dataset.v===v)); }; for(let i=1;i<=8;i++){ const b=document.createElement('button'); b.type='button'; b.dataset.v=String(i); b.textContent=String(i); b.addEventListener('click',()=>set(i)); wrap.appendChild(b); } set(initial); } function setupTemp(id,outId){ const el=$(id), out=$(outId); if(!el)return; const sync=()=>{if(out)out.textContent=(+el.value/100).toFixed(2);}; el.addEventListener('input',sync); sync(); } function syncModePair(localSel,proSel){ const local=$(localSel), pro=$(proSel); if(!local||!pro)return; const sync=()=>{ if(local.checked){pro.checked=false;pro.disabled=true;} else pro.disabled=false; }; local.addEventListener('change',sync); pro.addEventListener('change',()=>{if(pro.checked)local.checked=false;sync();}); sync(); } const svg=$('#km-svg'), W=320, H=150, PAD=12, n=INPUT.length; const xs=i=>PAD+i*(W-2*PAD)/(n-1); const ys=v=>{const t=Math.max(-2.2,Math.min(2.2,v));return H/2-t*(H/2-PAD)/2.2;}; let dots=[], line=null, cur=INPUT.slice(), anim=null; function buildSvg(){ if(!svg)return; svg.innerHTML=''; const mid=document.createElementNS('http://www.w3.org/2000/svg','line'); mid.setAttribute('x1',0); mid.setAttribute('x2',W); mid.setAttribute('y1',H/2); mid.setAttribute('y2',H/2); mid.setAttribute('stroke','#1f2d26'); mid.setAttribute('stroke-width','1'); svg.appendChild(mid); line=document.createElementNS('http://www.w3.org/2000/svg','polyline'); line.setAttribute('fill','none'); line.setAttribute('stroke','#34e0a1'); line.setAttribute('stroke-width','2'); line.setAttribute('stroke-linejoin','round'); svg.appendChild(line); dots=cur.map((v,i)=>{const c=document.createElementNS('http://www.w3.org/2000/svg','circle');c.setAttribute('r','3.4');c.setAttribute('cx',xs(i));svg.appendChild(c);return c;}); paint(cur); } function paint(v){ if(!line)return; line.setAttribute('points',v.map((x,i)=>xs(i)+','+ys(x)).join(' ')); v.forEach((x,i)=>{if(dots[i]){dots[i].setAttribute('cy',ys(x));dots[i].setAttribute('fill',x>=0?'#34e0a1':'#ff7a7a');}}); } const ease=t=>1-Math.pow(1-t,3); function morph(to,ms=480){ if(!line)return; const from=cur.slice(), t0=performance.now(); if(anim)cancelAnimationFrame(anim); const step=t=>{ const k=Math.min(1,(t-t0)/ms), e=ease(k); cur=from.map((f,i)=>f+(to[i]-f)*e); paint(cur); if(k<1)anim=requestAnimationFrame(step); else cur=to.slice(); }; anim=requestAnimationFrame(step); } function normalize(a,kind){ const m=a.reduce((s,x)=>s+x,0)/a.length; if(kind==='layernorm'){ const v=a.reduce((s,x)=>s+(x-m)*(x-m),0)/a.length; return a.map(x=>(x-m)/Math.sqrt(v+1e-5)); } const r=Math.sqrt(a.reduce((s,x)=>s+x*x,0)/a.length+1e-6); return a.map(x=>x/r); } function classicViz(key){ let v=INPUT.slice(); if(key.startsWith('softmax')){ const scale=key==='softmax_scale'?0.7:1; const z=v.map(x=>x*scale), mx=Math.max(...z), e=z.map(x=>Math.exp(x-mx)), s=e.reduce((a,b)=>a+b,0); v=e.map(x=>x/s*4-0.5); }else if(key==='rmsnorm'||key==='layernorm'){ v=normalize(v,key); }else if(key==='swiglu'){ v=v.map(x=>(x/(1+Math.exp(-x)))*(0.7+Math.abs(x)*0.3)); }else if(key==='geglu'){ v=v.map(x=>ACTS.gelu.f(x)*(0.7+Math.abs(x)*0.3)); } return v; } function pipeline(){ if(state.mode==='classic')return [{l:'input',v:INPUT.slice()},{l:CLASSICS[state.classic].lbl,v:classicViz(state.classic)}]; const st=[{l:'input',v:INPUT.slice()}]; let v=INPUT.slice(); if(state.slots.mem){v=v.map((x,i)=>x+INPUT[(i+3)%n]*0.5);st.push({l:'residual add',v:v.slice()});} v=normalize(v,state.slots.norm);st.push({l:NORMS[state.slots.norm].lbl,v:v.slice()}); v=v.map(ACTS[state.slots.act].f);st.push({l:ACTS[state.slots.act].lbl+' '+ACTS[state.slots.act].verb,v:v.slice()}); if(state.slots.act2){v=v.map(ACTS[state.slots.act2].f);st.push({l:ACTS[state.slots.act2].lbl+' second pass',v:v.slice()});} return st; } let playing=false; function play(){ if(playing)return; playing=true; const st=pipeline(); let i=0; const stage=$('#km-stage-label'); const next=()=>{ if(i>=st.length){playing=false;return;} if(stage){stage.textContent=cleanText(st[i].l);animate(stage,[{opacity:0,transform:'translateY(4px)'},{opacity:1,transform:'none'}],{duration:200,easing:'cubic-bezier(0.16,1,0.3,1)'});} morph(st[i].v); i++; setTimeout(next,680); }; next(); } function render(){ const st=pipeline(), stage=$('#km-stage-label'); if(stage)stage.textContent=cleanText(st[st.length-1].l); morph(st[st.length-1].v,360); updateReplayOffer(); } function buildRecipe(){ const tempEl=$('#km-temp'), proEl=$('#km-pro'), localEl=$('#km-local'), nameEl=$('#km-name'); const local=!!(localEl&&localEl.checked); return { mode:state.mode, norm:state.slots.norm, residual:!!state.slots.mem, acts:[state.slots.act].concat(state.slots.act2?[state.slots.act2]:[]), classic:state.classic, spark:'', k:buildTries, temp:(+(tempEl?tempEl.value:70))/100, pro:!local&&!!(proEl&&proEl.checked), local, name:nameEl?nameEl.value:'' }; } function expertRecipe(){ if(!resolvedExpert||resolvedExpert.unknown)return null; const tempEl=$('#km-xtemp'), proEl=$('#km-xpro'), localEl=$('#km-xlocal'), briefEl=$('#km-xbrief'); const local=!!(localEl&&localEl.checked); const r=Object.assign({},resolvedExpert.recipe); r.k=expertTries; r.temp=(+(tempEl?tempEl.value:70))/100; r.pro=!local&&!!(proEl&&proEl.checked); r.local=local; r.name='expert'; r.spark=briefEl?briefEl.value:''; return r; } function currentRecipe(){ return state.tab==='expert'?(expertRecipe()||buildRecipe()):buildRecipe(); } function bridge(recipe){ const inp=document.querySelector('#km_recipe textarea'); if(!inp)return; // _n is a nonce: it makes the recipe JSON differ on every mint so Gradio always registers a // value change and re-runs do_mint. Without it, minting the same recipe twice in a row left // the textarea value unchanged and the second submit was dropped (the "needs a double-click" // bug). The backend echoes it back so stale blobs from the previous run cannot repaint the old // result over the new loading card while Gradio is preparing the next response. const nonce=Date.now().toString(36)+'-'+Math.random().toString(36).slice(2); const payload=Object.assign({},recipe,{_n:nonce}); activeMintNonce=nonce; awaitingMint=true; const set=Object.getOwnPropertyDescriptor(window.HTMLTextAreaElement.prototype,'value').set; set.call(inp,JSON.stringify(payload)); inp.dispatchEvent(new Event('input',{bubbles:true})); // NOTE: we intentionally do NOT dispatch 'change' here. The leaderboard fetch is triggered // separately via refreshLB() so it never competes with do_mint in Gradio's queue. setTimeout(()=>{ const g=document.querySelector('#km_go button')||document.querySelector('#km_go'); if(g)g.click(); },80); } function setLBStatus(text,kind){ const s=$('#km-lb-status'); if(!s)return; s.textContent=text; s.classList.toggle('bad',kind==='bad'); s.classList.toggle('warn',kind==='warn'); } function setLBButton(on){ const b=$('#km-lb-refresh'); if(!b)return; b.disabled=on; b.textContent=on?'Refreshing...':'Refresh'; } async function refreshLBDirect(){ const rows=[], seen=new Set(), sources=[]; for(const url of LB_URLS){ try{ const res=await fetch(url,{cache:'no-store'}); if(!res.ok)continue; const data=await res.json(); const got=Array.isArray(data.rows)?data.rows:[]; if(got.length)sources.push(url); got.forEach(row=>{ const key=[row.op,row.name,row.ts,row.speedup_compile,row.speedup_eager].join('|'); if(seen.has(key))return; seen.add(key); rows.push(row); }); }catch(e){} } if(!sources.length)throw new Error('direct leaderboard fetch failed'); renderLB(rows,{fetched_at:Date.now()/1000,sources}); } function refreshLB(manual=false){ if(awaitingMint){ pendingLBRefresh=true; if(manual)setLBStatus('Queued until the current mint finishes.','warn'); return; } if(manual||state.tab==='lb'){ setLBButton(true); setLBStatus('Checking leaderboard...',''); if(lbUiTimer)clearTimeout(lbUiTimer); lbUiTimer=setTimeout(()=>{ refreshLBDirect().catch(()=>{ setLBButton(false); setLBStatus('Still waiting for the Gradio refresh bridge. Direct browser fetch was blocked. Try once more or reopen the tab.','warn'); }); },8000); } const btn=document.querySelector('#km_lb_btn button')||document.querySelector('#km_lb_btn'); if(btn)btn.click(); else{ setLBButton(false); setLBStatus('Refresh bridge is not mounted yet. Reopen the Leaderboard tab.','bad'); } } function scheduleLBRefresh(delay=700){ pendingLBRefresh=false; if(lbRefreshTimer)clearTimeout(lbRefreshTimer); lbRefreshTimer=setTimeout(()=>refreshLB(false),delay); } function setLBPolling(on){ if(lbPollTimer){clearInterval(lbPollTimer);lbPollTimer=null;} if(on)lbPollTimer=setInterval(()=>{if(state.tab==='lb')refreshLB(false);},15000); } let mintSafetyTimer=null; function setMinting(on){ ['#km-build','#km-xbuild'].forEach(id=>{const b=$(id); if(b){b.disabled=on; b.classList.toggle('minting',on);}}); if(mintSafetyTimer){clearTimeout(mintSafetyTimer); mintSafetyTimer=null;} // Bulletproof re-enable: showResult() clears this, but if a result never arrives (hung backend) // the button must not stay dead. 4 min is longer than any real mint. if(on) mintSafetyTimer=setTimeout(()=>setMinting(false), 240000); } function replayName(key){return (REPLAY_META[key]&&REPLAY_META[key].label)||(REPLAYS[key]&&cleanText(REPLAYS[key].op))||key;} function matchingReplayKey(){ if(state.mode==='classic'&&state.classic==='softmax'&&REPLAYS.softmax)return 'softmax'; if(state.mode==='build'&&!state.slots.mem&&state.slots.norm==='rmsnorm'&&state.slots.act==='gelu'&&!state.slots.act2&&REPLAYS.rmsnorm_gelu)return 'rmsnorm_gelu'; if(state.mode==='build'&&state.slots.mem&&state.slots.norm==='layernorm'&&state.slots.act==='silu'&&!state.slots.act2&&REPLAYS.add_layernorm_silu)return 'add_layernorm_silu'; return null; } function updateReplayOffer(){ const btn=$('#km-replay'); if(!btn||!REPLAY_ORDER.length)return; const key=matchingReplayKey()||REPLAY_ORDER[0]; btn.dataset.replayKey=key; btn.textContent=matchingReplayKey()?'Watch this recorded mint':'Watch a recorded mint'; $$('.km-rpick').forEach(b=>b.classList.toggle('on',b.dataset.replayKey===key)); } function renderReplayChoices(){ const wrap=$('#km-replay-picks'); if(!wrap)return; if(!REPLAY_ORDER.length){wrap.hidden=true;return;} wrap.hidden=false; wrap.innerHTML=REPLAY_ORDER.map(k=>"").join(''); updateReplayOffer(); } const sp=x=>Number.isFinite(+x)?(+x).toFixed(1).replace(/\.0$/,''):'?'; const sp2=x=>Number.isFinite(+x)?(+x).toFixed(2):'?'; const sp3=x=>Number.isFinite(+x)?(+x).toFixed(3).replace(/0+$/,'').replace(/\.$/,''):'?'; const isOk=s=>String(s||'').toLowerCase()==='ok'||String(s||'').toLowerCase()==='pass'; function attemptSummary(d){ const statuses=Array.isArray(d.statuses)?d.statuses:[]; const passed=statuses.length?statuses.filter(isOk).length:(Number.isFinite(+d.n_verified)?+d.n_verified:0); const total=statuses.length||(Number.isFinite(+d.k)?+d.k:passed||0); return {statuses,passed,total}; } function attemptChips(d,limit){ const a=attemptSummary(d), statuses=a.statuses.slice(0,limit??a.statuses.length); if(!statuses.length)return ''; return "
"+statuses.map((s,i)=>{ const ok=isOk(s); return ""+(ok?'OK':'FAIL')+" attempt "+(i+1)+", "+esc(s)+""; }).join('')+"
"; } function target(mode){ if(mode==='expert'){ return {verdict:$('#km-xverdict'),source:$('#km-xsource'),code:$('#km-xsource .km-code')}; } return {verdict:$('#km-verdict'),source:$('#km-source'),code:$('#km-code')}; } function clearSource(mode){ const t=target(mode); if(t.source)t.source.hidden=true; if(t.code)t.code.textContent=''; } function setSource(d,kind,mode){ const t=target(mode); if(!t.source||!t.code)return; if(!d.source){clearSource(mode);return;} t.source.hidden=false; t.source.open=false; const sum=t.source.querySelector('summary'); if(sum)sum.textContent=kind==='recorded'?'Recorded Triton kernel source':'Live Triton kernel source'; t.code.textContent=d.source; } function wakingError(msg){return /waking|wake|timeout|timed out|still|unexpected|http|modal|loading/i.test(String(msg||''));} function showResult(d,opts={}){ if(!opts.recorded&&awaitingMint&&String(d&&d._n||'')!==String(activeMintNonce||'')){ return; } if(!opts.recorded&&awaitingMint){ awaitingMint=false; activeMintNonce=null; } const mode=opts.mode||lastMintedMode||'build'; stopBuild(); stopReplay(); if(!opts.recorded) setMinting(false); // result arrived -> re-enable Mint (replays don't disable it) const t=target(mode), v=t.verdict; if(!v)return; clearSource(mode); if(d.error){ const local=!!d.local; const wake=!local&&wakingError(d.error); const replayKey=matchingReplayKey()||REPLAY_ORDER[0]||''; const replayBtn=replayKey?"":""; v.innerHTML="
"+ "
!
"+(local?'local offline mint':'live mint')+""+(wake?'The live model is still waking':(local?'The local mint did not finish':'The live mint did not finish'))+"
"+ "

"+(wake?'The backend scales to zero, so the first live mint can take about 90 seconds. The replay below is a real earlier mint, not a mockup.':(local?esc(d.error):'Backend said: '+esc(d.error)))+"

"+ (wake?replayBtn+"

Backend said: "+esc(d.error)+"

":replayBtn)+"
"; if(pendingLBRefresh)scheduleLBRefresh(); return; } const a=attemptSummary(d); if(!d.verified){ v.innerHTML="
"+ "
NO
referee verdictNot verified yet
"+ "
"+a.passed+" of "+(a.total||'?')+" attempts passed the referee
"+ attemptChips(d)+"

"+esc(d.explanation||'Press Mint again, raise attempts, or change the operation.')+"

"; if(pendingLBRefresh)scheduleLBRefresh(); return; } const recorded=!!opts.recorded, local=!!d.local; // Headline the HONEST baseline (the compiler), not the inflated vs-eager fusion number. // Strongest available wins: max-autotune > torch.compile default > eager. const se=+d.speedup_eager, sc=+d.speedup_compile, sm=+d.speedup_maxauto; const hasC=Number.isFinite(sc)&&sc>0, hasM=Number.isFinite(sm)&&sm>0; const heroSpeed=hasM?sm:(hasC?sc:se); const heroLabel=hasM?'vs torch.compile max-autotune':(hasC?'vs torch.compile':'vs PyTorch eager'); const missingLocalCompiler=local&&!recorded&&!hasC&&!hasM; const compilerGuard=missingLocalCompiler ? "
Compiler baselines did not return for this Local run. Re-mint; a certified Local result should include torch.compile and max-autotune next to eager.
" : ""; const runLabel=local?'local offline mint':(recorded?'recorded mint':'live mint'); const noteText=recorded?'recorded backend run':(local?'local offline run':'live backend run'); const note=""+noteText+""; const board=d.beat_champion?"27B champion beaten":d.on_leaderboard?"leaderboard result":""; v.innerHTML="
"+note+board+ "
OK
"+runLabel+"Verified by the referee"+esc(d.op||'kernel')+"
"+ "
"+a.passed+" of "+(a.total||'?')+" attempts passed the referee
"+ attemptChips(d)+ compilerGuard+ // Every comparison as its own big number, strongest (most honest) baseline first, eager last // and labelled 'unfused' so the side-by-side is self-explanatory: the model's real edge is // the ~1.1x over the compiler; the big eager number is mostly the fusion win. "
"+ (hasM?"
"+sp3(sm)+"xvs torch.compile max-autotune
":"")+ (hasC?""+sp3(sc)+"xvs torch.compile
":"")+ "
"+sp(se)+"xvs PyTorch eager (unfused)
"+ "
"+(Number.isFinite(+d.gen_seconds)?sp(d.gen_seconds)+'s':'?')+"generation
"+ "

"+esc(d.explanation||'')+"

"; animate(v.firstElementChild,[{transform:'scale(0.96)',opacity:0},{transform:'scale(1)',opacity:1}],{duration:300,easing:'cubic-bezier(0.16,1,0.3,1)'}); setSource(d,recorded?'recorded':'live',mode); if(!recorded&&mode==='build'){ HIST.unshift(""+esc(d.op||'kernel')+""+sp(d.speedup_maxauto||d.speedup_compile||d.speedup_eager)+"x vs compiler"); const hist=$('#km-hist'); if(hist)hist.innerHTML=HIST.slice(0,10).map(h=>"
"+h+"
").join(''); } // The leaderboard refresh is deliberately decoupled from the mint call. Once the result has // arrived, the queue is free again, so refresh the board without hiding the just-rendered result. if(!recorded)scheduleLBRefresh(900); } let replayTimer=null; function stopReplay(){if(replayTimer){clearTimeout(replayTimer);replayTimer=null;}} function startReplay(key){ const d=REPLAYS[key]; if(!d)return; lastMintedMode=state.tab==='expert'?'expert':'build'; stopBuild(); stopReplay(); play(); const t=target(lastMintedMode), v=t.verdict; if(!v)return; clearSource(lastMintedMode); const statuses=Array.isArray(d.statuses)?d.statuses:[]; const total=statuses.length||(Number.isFinite(+d.k)?+d.k:4); const passed=statuses.length?statuses.filter(isOk).length:(Number.isFinite(+d.n_verified)?+d.n_verified:0); v.innerHTML="
"+ "
RUN
recorded mint"+esc(replayName(key))+"This is a genuine earlier backend run. Mint still starts a live run.
"+ "
loading recorded attempts...
"+ "
"+ "
"+ "

The model took "+(Number.isFinite(+d.gen_seconds)?sp(d.gen_seconds)+'s':'real time')+" on this recorded run. The replay shows the referee loop without waiting for cold start.

"; const att=$('#km-replay-attempts'), stage=$('#km-replay-stage'), fill=$('#km-replay-fill'); let i=0; const tick=()=>{ if(i"+(ok?'OK':'FAIL')+" attempt "+(i+1)+", "+esc(st)+""); if(fill)fill.style.width=Math.round(((i+1)/(total+1))*100)+'%'; i++; replayTimer=setTimeout(tick,360); return; } if(stage)stage.textContent='best verified kernel selected'; if(fill)fill.style.width='100%'; replayTimer=setTimeout(()=>showResult(d,{recorded:true,mode:lastMintedMode,replayKey:key}),420); }; replayTimer=setTimeout(tick,180); } let buildTimer=null; function startBuild(mode,recipe){ stopReplay(); const t=target(mode), v=t.verdict; if(!v)return; clearSource(mode); const k=recipe.k||4, started=Date.now(), replayKey=mode==='build'?(matchingReplayKey()||REPLAY_ORDER[0]||''):(REPLAY_ORDER[0]||''); const local=!!recipe.local; const stages=local?['loading local GGUF...','llama.cpp drafting '+Math.min(k,2)+' kernels...','compiling Triton on this Space GPU...','checking correctness vs PyTorch...','timing vs torch.compile max-autotune...']: ['model waking...','drafting '+k+' kernels...','compiling Triton...','checking correctness vs PyTorch...','timing vs torch.compile max-autotune...']; const stepHtml=stages.map((s,i)=>""+esc(s)+"").join(''); v.innerHTML="
"+esc(stages[0])+""+ "
"+stepHtml+"
"+ "
"+ ""+(local?"Local mode drafts up to two kernels with llama.cpp on this Space's GPU, then verifies them with the in-process referee and times them against PyTorch eager, torch.compile, and max-autotune. First run may also download/cache the GGUF.":"The model drafts up to "+k+" kernels. The referee compiles, correctness-checks, and times every candidate. First build of a session can take about 90 seconds while the model wakes.")+""+ (replayKey?"":"")+"
"; // Honest progress: do_mint is one blocking call with no progress signal, so we cannot show a // live readout. Instead we walk the referee's real phases ONCE on a rough estimate, monotonically // (the index only ever increases), and park on the final phase until the result arrives. It never // loops back or resets the way the old modulo version did. A gentle pulse keeps the viz alive. const stageStart=[0,12,18,23,28]; // seconds at which each phase is assumed to begin let baseVals=null; try{const st=pipeline();baseVals=st[st.length-1].v.slice();}catch(e){} buildTimer=setInterval(()=>{ const elapsed=(Date.now()-started)/1000; let ix=0; for(let j=0;j=stageStart[j])ix=j;} ix=Math.min(ix,stages.length-1); const label=(ix===stages.length-1&&elapsed>45)?'still timing vs torch.compile max-autotune...':stages[ix]; const el=$('#km-bstage'); if(el&&el.textContent!==label){el.textContent=label;animate(el,[{opacity:.35,transform:'translateY(3px)'},{opacity:1,transform:'none'}],{duration:220,easing:'cubic-bezier(0.16,1,0.3,1)'});} $$('#km-build-steps span').forEach((s,j)=>{s.classList.toggle('on',j===ix);s.classList.toggle('done',jx*p),650);} },650); } function stopBuild(){ if(buildTimer){clearInterval(buildTimer);buildTimer=null;} const f=$('#km-barfill'); if(f)f.style.width='100%'; render(); } function renderLB(rows,meta={}){ const el=$('#km-lb-view'); if(!el)return; if(lbUiTimer){clearTimeout(lbUiTimer);lbUiTimer=null;} setLBButton(false); const fetched=Number(meta.fetched_at||0); const stamp=fetched?new Date(fetched*1000).toLocaleTimeString([], {hour:'2-digit', minute:'2-digit', second:'2-digit'}):'just now'; const sourceCount=Array.isArray(meta.sources)?meta.sources.length:0; setLBStatus('Last checked '+stamp+(sourceCount?' from '+sourceCount+' endpoint'+(sourceCount>1?'s':'')+'.':'.'),''); if(!rows.length){el.innerHTML='No kernels yet. Be the first.';return;} el.innerHTML=""+ rows.slice(0,25).map((r,i)=>"").join('')+"
#machinebuildervs compilervs PyTorch
"+(i+1)+""+esc(r.op||'kernel')+""+esc(r.name||'anonymous')+""+sp2(r.speedup_compile)+"x"+sp(r.speedup_eager)+"x
"; } function watch(id,cb){ const el=document.querySelector(id); if(!el)return; const read=()=>{ const s=el.querySelector('[data-blob]'); if(s){try{cb(JSON.parse(atob(s.dataset.blob)));}catch(e){}} }; new MutationObserver(read).observe(el,{childList:true,subtree:true}); read(); } setTimeout(()=>{watch('#km_result',showResult);watch('#km_lb',d=>renderLB(d.rows||[],d));},250); const rf=$('#km-lb-refresh'); if(rf)rf.addEventListener('click',()=>refreshLB(true)); const CLASSIC_TERMS=[ {key:'softmax_scale',terms:['scaled softmax','scale softmax','attention scale']}, {key:'swiglu',terms:['swiglu','swi glu']}, {key:'geglu',terms:['geglu','ge glu']}, {key:'softmax',terms:['softmax']} ]; const ACT_TERMS=[ {key:'gelu_erf',terms:['gelu_erf','exact gelu','erf gelu']}, {key:'relu2',terms:['relu2','relu squared','squared relu','square relu']}, {key:'leaky_relu',terms:['leaky_relu','leaky relu','leaky']}, {key:'silu',terms:['silu','swish','swishes']}, {key:'sigmoid',terms:['sigmoid']}, {key:'gelu',terms:['gelu']}, {key:'relu',terms:['relu']}, {key:'tanh',terms:['tanh']}, {key:'elu',terms:['elu']}, {key:'mish',terms:['mish']} ]; function termMatches(text,items){ // word-boundary match so 'elu' does not fire inside 'gelu', 'relu' not inside 'leaky_relu', etc. // (underscores and digits are word chars, so \b correctly keeps relu2 and gelu_erf distinct.) const found=[]; items.forEach(item=>{ let best=-1, bestLen=0; item.terms.forEach(term=>{ const re=new RegExp('\\b'+term.replace(/[.*+?^${}()|[\]\\]/g,'\\$&')+'\\b'); const m=re.exec(text); if(m&&(best<0||m.indexbestLen))){best=m.index;bestLen=term.length;} }); if(best>=0)found.push({key:item.key,ix:best,len:bestLen}); }); found.sort((a,b)=>a.ix-b.ix||b.len-a.len); // drop a shorter match that overlaps a longer one ('relu' inside 'leaky relu', 'softmax' // inside 'scaled softmax'), keeping the first/longest at each span. const kept=[]; found.forEach(m=>{ if(!kept.some(k=>m.ix{if(!acts.includes(m.key)&&acts.length<2)acts.push(m.key);}); if(!normHit&&!acts.length){ return {unknown:true,message:'Could not map this brief. Use RMSNorm, LayerNorm, residual, and a single supported activation, or a named op such as softmax, scaled softmax, SwiGLU, or GeGLU.'}; } if(!acts.length){ if(normHit&&!residual){ return {unknown:false,label:CLASSICS[normHit].lbl,recipe:{mode:'classic',norm:normHit,residual:false,acts:[],classic:normHit,spark:raw}}; } return {unknown:true,message:'This brief needs a supported activation after the residual or normalization so the referee can verify it.'}; } const recipe={mode:'build',norm:normHit||'rmsnorm',residual,acts,classic:'softmax',spark:raw}; return {unknown:false,label:labelRecipe(recipe),recipe}; } function resolveExpert(){ const box=$('#km-xbrief'), res=$('#km-xresolved'), out=res?res.querySelector('.v'):null, btn=$('#km-xbuild'); if(!box||!res||!out)return; resolvedExpert=resolveBrief(box.value); res.classList.toggle('unknown',!!resolvedExpert.unknown); out.textContent=resolvedExpert.unknown?resolvedExpert.message:resolvedExpert.label; if(btn)btn.disabled=!!resolvedExpert.unknown; } const OP_TAGS=[ ['softmax','softmax over each row'], ['scaled_softmax','scaled softmax over each row'], ['swiglu','SwiGLU gated feed forward'], ['geglu','GeGLU gated feed forward'], ['rmsnorm','RMSNorm only'], ['layernorm','LayerNorm only'], ['add_rmsnorm_gelu','fused RMSNorm with residual and GELU'], ['layernorm_silu','fused LayerNorm with SiLU'], ['rmsnorm_mish','fused RMSNorm with Mish'], ['add_layernorm_silu','fused LayerNorm with residual and SiLU'], ['rmsnorm_gelu_relu2','fused RMSNorm with GELU then squared ReLU'] ].concat(Object.keys(ACTS).map(k=>[k,ACTS[k].lbl+' activation'])); function renderOpTags(sel,interactive){ const wrap=$(sel); if(!wrap)return; wrap.innerHTML=''; OP_TAGS.forEach(([label,brief])=>{ const b=document.createElement('button'); b.type='button'; b.className='km-optag'; b.textContent=label; b.dataset.brief=brief; if(interactive)b.addEventListener('click',()=>{ const box=$('#km-xbrief'); if(box){box.value=brief;box.dispatchEvent(new Event('input',{bubbles:true}));box.focus();} }); wrap.appendChild(b); }); } function renderGlossary(){ const lb=$('#km-learn-blocks'); if(!lb)return; const rows=[['Residual',MEM.info],...Object.values(NORMS).map(v=>[v.lbl,v.info]),...Object.values(ACTS).map(v=>[v.lbl,v.info]),...Object.values(CLASSICS).map(v=>[v.lbl,v.info])]; lb.innerHTML=rows.map(([n,d])=>"
"+esc(n)+""+esc(d)+"
").join(''); } function setupCopyButtons(){ $$('.km-copy').forEach(btn=>btn.addEventListener('click',()=>{ const block=btn.closest('.km-codeblock'); if(!block)return; const clone=block.cloneNode(true), copy=clone.querySelector('.km-copy'); if(copy)copy.remove(); const text=clone.textContent.trim(); const done=()=>{const old=btn.textContent;btn.textContent='Copied';setTimeout(()=>{btn.textContent=old;},900);}; if(navigator.clipboard&&navigator.clipboard.writeText)navigator.clipboard.writeText(text).then(done).catch(done); else done(); })); } root.addEventListener('click',e=>{ const b=e.target.closest('[data-replay-key]'); if(!b||!root.contains(b))return; const key=b.dataset.replayKey; if(!REPLAYS[key])return; e.preventDefault(); startReplay(key); }); const xb=$('#km-xbrief'); if(xb)xb.addEventListener('input',resolveExpert); // (second-activation builder removed: the kernelsmith grammar is norm + one activation) const buildBtn=$('#km-build'); if(buildBtn)buildBtn.addEventListener('click',()=>{ lastMintedMode='build'; const r=buildRecipe(); animate(buildBtn,[{transform:'scale(0.97)'},{transform:'scale(1)'}],{duration:140,easing:'cubic-bezier(0.16,1,0.3,1)'}); play(); setMinting(true); startBuild('build',r); bridge(r); }); const xBuild=$('#km-xbuild'); if(xBuild)xBuild.addEventListener('click',()=>{ resolveExpert(); const r=expertRecipe(); if(!r)return; lastMintedMode='expert'; animate(xBuild,[{transform:'scale(0.97)'},{transform:'scale(1)'}],{duration:140,easing:'cubic-bezier(0.16,1,0.3,1)'}); setMinting(true); startBuild('expert',r); bridge(r); }); setupStepper('#km-tries','#km-tries-v',4,v=>{buildTries=v;}); setupStepper('#km-xtries','#km-xtries-v',5,v=>{expertTries=v;}); setupTemp('#km-temp','#km-temp-v'); setupTemp('#km-xtemp','#km-xtemp-v'); syncModePair('#km-local','#km-pro'); syncModePair('#km-xlocal','#km-xpro'); // The build and expert pages each have their own "Local (offline)" toggle. Local mode is the ONLY // path that computes the max-autotune number, so a page in Modal mode shows just 2 comparisons. // Keep both toggles in lockstep so build and expert always run the same way and show the same // set of comparison numbers (the guard stops the change events from ping-ponging). (function(){ const a=$('#km-local'), b=$('#km-xlocal'); if(!a||!b)return; a.addEventListener('change',()=>{ if(b.checked!==a.checked){ b.checked=a.checked; b.dispatchEvent(new Event('change')); } }); b.addEventListener('change',()=>{ if(a.checked!==b.checked){ a.checked=b.checked; a.dispatchEvent(new Event('change')); } }); })(); renderPickers(); renderClassics(); renderReplayChoices(); renderOpTags('#km-oplist',true); renderOpTags('#km-docs-ops',false); renderGlossary(); setupCopyButtons(); buildSvg(); setInfo(machineInfo()); resolveExpert(); showTab('build'); render(); } """ with gr.Blocks(title="Kernel Mint", head=f"") as demo: gr.HTML(APP_HTML) # bridge components: kept in the DOM (NOT visible=False, which removes them entirely so the # custom JS can't reach the inner textarea/button) and hidden with CSS (#km_recipe etc.). recipe_in = gr.Textbox(elem_id="km_recipe") result_out = gr.HTML(elem_id="km_result") lb_out = gr.HTML(elem_id="km_lb") go = gr.Button("go", elem_id="km_go") lb_btn = gr.Button("lb", elem_id="km_lb_btn") # trigger_mode="multiple": the default ("once") silently DROPS a click that lands while a # previous mint is still running, so minting one kernel after another needed two clicks and the # old result lingered. The frontend also disables the Mint button during a mint (one at a time, # no wasted ZeroGPU calls); this is the belt-and-suspenders so no click is ever swallowed. go.click(do_mint, recipe_in, result_out, trigger_mode="multiple") # Leaderboard refresh is its OWN trigger, decoupled from minting. Previously fetch_lb was wired # to recipe_in.change, so every mint (which sets recipe_in) also fired a leaderboard fetch; on a # slow backend that fetch sat ahead of do_mint in Gradio's queue and the mint appeared to do # nothing until a second click. Now the JS clicks #km_lb_btn only when it actually wants the board. lb_btn.click(fetch_lb, None, lb_out) demo.load(fetch_lb, None, lb_out) demo.load(None, None, None, js=APP_JS) if __name__ == "__main__": demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)))