"""OUROBOROS Kernel Mint: a GPU-kernel demo (Gradio + custom JS).
Compose a fused operation, and a 1B model writes a real Triton kernel for it while an
immutable referee checks correctness against PyTorch and times it against torch.compile. The
whole interactive surface is a custom JS widget in one gr.HTML; the mint runs through Python
(no CORS) and hits the 1B model (default) or the 27B (Pro mode).
"""
from __future__ import annotations
import base64
import json
import os
import re
import sys
import time
import gradio as gr
import requests
# `spaces.GPU` only matters on ZeroGPU, where the GPU is granted lazily per call. On a dedicated
# GPU Space (e.g. L4) or locally the GPU is always attached, so the decorator must be a plain
# pass-through — otherwise it can fight the always-present CUDA context. Gate on the env var HF sets
# only on ZeroGPU hardware, so the same app.py runs unchanged on ZeroGPU, a paid GPU, and a laptop.
# Resolve `spaces`: the real package when the ZeroGPU env var is set, otherwise a local
# stand-in whose GPU decorator does nothing.
try:
    if not os.environ.get("SPACES_ZERO_GPU"):
        # Deliberate jump to the shim below; this is control flow, not a real import failure.
        raise ImportError("not on ZeroGPU: use the no-op GPU shim")
    import spaces
except Exception:
    class _SpacesShim:
        """Stand-in for the `spaces` package whose `GPU` decorator is a pass-through."""

        @staticmethod
        def GPU(*_args, **_kwargs):
            """Return the decorated function unchanged.

            Works both as a decorator factory (`@spaces.GPU(duration=...)`, the form used in
            this file) and as a bare decorator (`@spaces.GPU`). All options are ignored.
            """
            # Bare-decorator form: the function itself arrives as the only positional argument.
            if len(_args) == 1 and callable(_args[0]) and not _kwargs:
                return _args[0]

            def _decorator(fn):
                return fn

            return _decorator

    spaces = _SpacesShim()
# Remote mint endpoints, overridable through the environment. do_mint posts to PRO when the
# recipe sets "pro", otherwise to ONE_B.
ONE_B = os.environ.get("BACKEND_URL", "https://ymrohit--ouroboros-kernel-mint-mint-mint.modal.run")
PRO = os.environ.get("BACKEND_PRO_URL", "https://ymrohit--ouroboros-kernel-mint-pro-mint-mint.modal.run")
# Paths resolved relative to this file: the referee package directory and the seed kernels.
ROOT = os.path.dirname(__file__)
REFEREE = os.path.join(ROOT, "referee")
SEED_DIR = os.path.join(ROOT, "seed_kernels")
# Hub repos searched for a GGUF, in this order (see _resolve_local_gguf).
LOCAL_FINE_TUNED_REPO = os.environ.get(
"LOCAL_GGUF_REPO", "YMRohit/ouroboros-kernelsmith-minicpm5-1b-GGUF")
LOCAL_BASE_REPO = os.environ.get("LOCAL_GGUF_FALLBACK_REPO", "openbmb/MiniCPM5-1B-GGUF")
# Quantization tags in preference order; blank entries in the comma-separated value are dropped.
LOCAL_QUANT_PREFS = tuple(q.strip() for q in os.environ.get(
"LOCAL_GGUF_QUANTS", "Q5_K_M,Q6_K,Q4_K_M,Q8_0,F16,BF16").split(",") if q.strip())
# Upper bound on generation attempts per local mint (local_mint clamps the recipe's k to this).
LOCAL_MAX_ATTEMPTS = int(os.environ.get("LOCAL_MAX_ATTEMPTS", "2"))
# Cached llama.cpp model and the GGUF path it was loaded from; both are set by _get_local_llm.
_LOCAL_LLM = None
_LOCAL_LLM_PATH = None
def _ensure_referee_path():
    """Put the referee directory at the front of `sys.path` unless it is already listed."""
    if REFEREE in sys.path:
        return
    sys.path.insert(0, REFEREE)
# System message for the local model; _local_prompt_messages sends it as the "system" turn.
# This is runtime prompt text, so edit it deliberately.
LOCAL_SYS = ("You are an expert GPU kernel engineer. Write a single correct, fast Triton "
"kernel. Output ONLY one fenced python code block defining `run(*inputs)` and "
"its @triton.jit kernel. Accumulate reductions in float32. No prose.")
# The model was trained (rl_kernelsmith.py Proposer.prompt) with a real per-op SEED kernel as the
# style guide, and for almost every op that exemplar is rmsnorm's ROW-WISE reduction. We must hand
# the model that SAME exemplar at inference, or it goes off-distribution and writes flat elementwise
# code for reduction ops (wrong results). Seeds are bundled in ./seed_kernels; this inline copy is
# only a fallback if a file is missing.
# This text is pasted verbatim into the model prompt as "a valid Triton kernel", so it has to be
# syntactically valid Python: the indentation inside the string is significant.
_FALLBACK_EXEMPLAR = """# GOLD seed kernel: fused RMSNorm, one row per program, fp32 accumulation.
@triton.jit
def _rmsnorm_kernel(x_ptr, w_ptr, y_ptr, stride, N, eps, BLOCK: tl.constexpr):
    row = tl.program_id(0)
    x_ptr += row * stride
    y_ptr += row * stride
    acc = tl.zeros([BLOCK], dtype=tl.float32)
    for off in range(0, N, BLOCK):
        cols = off + tl.arange(0, BLOCK)
        x = tl.load(x_ptr + cols, mask=cols < N, other=0.0).to(tl.float32)
        acc += x * x
    rms = tl.rsqrt(tl.sum(acc) / N + eps)
    for off in range(0, N, BLOCK):
        cols = off + tl.arange(0, BLOCK)
        mask = cols < N
        x = tl.load(x_ptr + cols, mask=mask, other=0.0).to(tl.float32)
        w = tl.load(w_ptr + cols, mask=mask, other=0.0).to(tl.float32)
        tl.store(y_ptr + cols, (x * rms * w), mask=mask)
def run(x, w):
    M, N = x.shape
    y = torch.empty_like(x)
    _rmsnorm_kernel[(M,)](x, w, y, x.stride(0), N, 1e-6, BLOCK=1024)
    return y
"""
def _load_seed_kernel(op: str) -> str:
    """Return the bundled seed kernel source for `op`.

    Reads `<SEED_DIR>/<op>.py` as UTF-8 (explicit, like the replay index loader in this file, so
    the result does not depend on the process locale). Falls back to the inline
    `_FALLBACK_EXEMPLAR` when the file cannot be opened or read.
    """
    seed_path = os.path.join(SEED_DIR, f"{op}.py")
    try:
        with open(seed_path, encoding="utf-8") as f:
            return f.read()
    except OSError:
        return _FALLBACK_EXEMPLAR
def extract_kernel(text: str) -> str:
    """Pull a fenced Python kernel out of a llama.cpp completion.

    Takes the contents of the first fenced code block when there is one (the whole text
    otherwise), then drops anything before the earliest code marker (`@triton`, `import `,
    `def run`, `def _`). If no marker is present the body is returned as-is. The result is
    stripped of surrounding whitespace; `None` or empty input yields an empty string.
    """
    text = text or ""
    fenced = re.search(r"```(?:python)?\s*(.*?)```", text, re.S)
    body = fenced.group(1) if fenced else text
    positions = [body.find(marker) for marker in ("@triton", "import ", "def run", "def _")]
    hits = [pos for pos in positions if pos >= 0]
    if not hits:
        return body.strip()
    start = min(hits)
    # "import " also occurs inside "from pkg import name". Cutting at that point would leave
    # "import name", so when the marker sits in a from-import, back up to the start of its line.
    line_start = body.rfind("\n", 0, start) + 1
    if re.fullmatch(r"[ \t]*from[ \t]+[\w.]+[ \t]+", body[line_start:start]):
        start = line_start
    return body[start:].strip()
def _local_op_from_recipe(r: dict) -> tuple[str, str]:
    """Translate a UI recipe into `(op, label)`.

    Classic mode returns the named op for both values and raises `ValueError` when it is not in
    `SPECS`. Otherwise the recipe describes norm + optional residual + activations: a single
    activation whose composed name is in `SPECS` returns that name; anything else returns a
    `chain|<norm>|<0 or 1>|<acts>` descriptor. The label is the human-readable pipeline.
    """
    _ensure_referee_path()
    from specs import SPECS

    if r.get("mode") == "classic":
        classic_op = (r.get("classic") or "softmax").strip()
        if classic_op in SPECS:
            return classic_op, classic_op
        raise ValueError(f"unknown op '{classic_op}'")

    # Any norm value starting with "rms" means RMSNorm; everything else is LayerNorm.
    if str(r.get("norm", "rmsnorm")).startswith("rms"):
        norm_full, norm_short, norm_title = "rmsnorm", "rms", "RMSNorm"
    else:
        norm_full, norm_short, norm_title = "layernorm", "layer", "LayerNorm"
    has_residual = bool(r.get("residual"))

    # Drop empty entries, keep at most three, and default to GELU when nothing is left.
    chosen = [a for a in (r.get("acts") or []) if a][:3]
    if not chosen:
        chosen = ["gelu"]

    label = ("residual + " if has_residual else "") + norm_title + " -> " + " -> ".join(chosen)
    spec_name = ("add_" if has_residual else "") + norm_full + "_" + chosen[0]
    if len(chosen) == 1 and spec_name in SPECS:
        return spec_name, label
    residual_flag = "1" if has_residual else "0"
    return f"chain|{norm_short}|{residual_flag}|{','.join(chosen)}", label
def _local_prompt_messages(op: str, spark: str = "") -> list[dict[str, str]]:
    """Build the system + user chat messages that ask the local model for a kernel for `op`.

    The user message carries the spec's notes and signature hint, a seed kernel for a different
    op as a style guide, and (when given) the user's free-text `spark`.
    """
    _ensure_referee_path()
    from specs import get_spec, SPECS

    spec = get_spec(op)
    # Same exemplar rule as training: the first op in SPECS that is not this one (almost always
    # rmsnorm's row-wise reduction). Falls back to `op` itself if SPECS holds nothing else.
    exemplar_op = op
    for candidate in SPECS:
        if candidate != op:
            exemplar_op = candidate
            break
    exemplar = _load_seed_kernel(exemplar_op)

    parts = [
        f"Op `{op}`: {spec.notes}\nSignature:\n{spec.signature_hint}\n\n",
        f"Here is a valid Triton kernel for a DIFFERENT op (`{exemplar_op}`) as a style guide:\n",
        f"```python\n{exemplar}\n```\n",
    ]
    if spark:
        parts.append(f"\nThe person building this kernel added their own idea: \"{spark}\". ")
        parts.append("Honour their idea where you can, but correctness is mandatory.")
    return [
        {"role": "system", "content": LOCAL_SYS},
        {"role": "user", "content": "".join(parts)},
    ]
def _render_prompt(messages: list[dict[str, str]]) -> str:
    """Render chat messages as a ChatML prompt that ends in an open assistant turn.

    Each message becomes `<|im_start|>{role}\\n{content}<|im_end|>\\n`. The assistant turn is
    opened with an empty `<think></think>` block (the enable_thinking=False convention) so the
    model answers directly with the kernel instead of emitting a reasoning trace.
    """
    # MiniCPM5 uses ChatML (<|im_start|>/<|im_end|>) and is a reasoning model. The empty think
    # block must be spelled out literally: with only blank lines in its place the model is not
    # told that thinking is already finished.
    turns = [f"<|im_start|>{m['role']}\n{m['content']}<|im_end|>\n" for m in messages]
    turns.append("<|im_start|>assistant\n<think>\n\n</think>\n\n")
    return "".join(turns)
def _common_gguf_names() -> list[str]:
    """List conventional GGUF filenames to guess at, ordered by quantization preference.

    For each preferred quant and each known stem, yields `<stem>-<quant>.gguf` followed by
    `<stem>.<quant>.gguf`.
    """
    stems = ("MiniCPM5-1B", "minicpm5-1b", "model", "ggml-model")
    return [
        f"{stem}{sep}{quant}.gguf"
        for quant in LOCAL_QUANT_PREFS
        for stem in stems
        for sep in ("-", ".")
    ]
def _download_gguf_from_repo(repo_id: str) -> str:
    """Download one GGUF file from `repo_id` and return the path `hf_hub_download` gives back.

    Selection order: the file named by LOCAL_GGUF_FILE; else the first listed `.gguf` whose name
    contains a preferred quant tag (preferences tried in order); else the first listed `.gguf`;
    else each conventional filename in turn. Raises `RuntimeError` when nothing can be fetched.
    """
    from huggingface_hub import HfApi, hf_hub_download

    pinned = os.environ.get("LOCAL_GGUF_FILE")
    cache_dir = os.environ.get("LOCAL_GGUF_CACHE")

    def fetch(filename):
        return hf_hub_download(repo_id=repo_id, filename=filename, cache_dir=cache_dir)

    if pinned:
        return fetch(pinned)

    # A failed listing is not fatal: it just leaves the filename guesses below as the only route.
    try:
        listing = HfApi().list_repo_files(repo_id)
    except Exception:
        listing = []
    candidates = [name for name in listing if name.lower().endswith(".gguf")]

    for quant in LOCAL_QUANT_PREFS:
        needle = quant.lower()
        for name in candidates:
            if needle in name.lower():
                return fetch(name)
    if candidates:
        return fetch(candidates[0])

    last_error = None
    for guess in _common_gguf_names():
        try:
            return fetch(guess)
        except Exception as exc:
            last_error = exc
    raise RuntimeError(f"no GGUF found in {repo_id}: {last_error}")
def _resolve_local_gguf() -> str:
    """Return a local GGUF path, trying the fine-tuned repo first and the base repo second.

    Raises `RuntimeError` listing each repo's failure when neither yields a file.
    """
    failures = []
    for repo_id in (LOCAL_FINE_TUNED_REPO, LOCAL_BASE_REPO):
        try:
            path = _download_gguf_from_repo(repo_id)
        except Exception as exc:
            failures.append(f"{repo_id}: {type(exc).__name__}: {str(exc)[:180]}")
        else:
            return path
    raise RuntimeError("could not load a local GGUF. " + " | ".join(failures))
# Set on the first call to _preload_cuda_libs so the filesystem search runs at most once.
_CUDA_PRELOADED = False


def _preload_cuda_libs():
    """Best-effort preload of the CUDA runtime and cuBLAS shared libraries with RTLD_GLOBAL.

    Runs once per process. Every failure is swallowed, so this is a no-op when the libraries
    are absent.
    """
    # The CUDA llama.cpp wheel's libllama.so links libcudart.so.12 / libcublas*.so.12, which on HF
    # Spaces live inside torch's bundled nvidia-* packages and are NOT on the default loader path,
    # so the import dies with "libcudart.so.12: cannot open shared object file". Preloading them
    # globally (in dependency order) lets libllama.so resolve their symbols.
    global _CUDA_PRELOADED
    if _CUDA_PRELOADED:
        return
    _CUDA_PRELOADED = True

    import ctypes
    import glob

    search_roots = []
    try:
        import torch
        search_roots.append(os.path.join(os.path.dirname(torch.__file__), "lib"))
    except Exception:
        pass
    try:
        import site
        search_roots += list(site.getsitepackages() if hasattr(site, "getsitepackages") else [])
    except Exception:
        pass
    search_roots += ["/usr/local/lib/python3.10/site-packages", "/usr/local/cuda/lib64",
                     "/usr/lib/x86_64-linux-gnu", "/usr/local/lib"]

    def _load_first(pattern):
        # Walk the roots in order and stop at the first file that actually loads.
        for root in search_roots:
            try:
                candidates = sorted(glob.glob(os.path.join(root, "**", pattern), recursive=True))
            except Exception:
                candidates = []
            for path in candidates:
                try:
                    ctypes.CDLL(path, mode=ctypes.RTLD_GLOBAL)
                except Exception:
                    continue
                return True
        return False

    # Match versioned sonames too (e.g. libcudart.so.12.4.127 with no libcudart.so.12 symlink);
    # RTLD_GLOBAL makes the symbols visible to libllama.so regardless of the exact soname.
    for pattern in ("libcudart.so*", "libcublas.so*", "libcublasLt.so*"):
        _load_first(pattern)
# Confirm the CUDA llama.cpp lib loads at startup (loading the .so needs no GPU device, so it spends
# no ZeroGPU quota). It also warms the import so the in-@spaces.GPU call is instant.
# The probe is skipped when LOCAL_LLAMA_GPU_LAYERS is exactly "0" (the CPU-only setting).
if os.environ.get("LOCAL_LLAMA_GPU_LAYERS", "-1") != "0":
    try:
        # Preload the CUDA libraries first so the llama_cpp import can resolve them.
        _preload_cuda_libs()
        import llama_cpp as _lc_probe
        print(f"[startup] llama_cpp {_lc_probe.__version__} loaded OK", flush=True)
    except Exception as _e:
        # Best-effort: a failed probe is logged and the app carries on starting.
        print(f"[startup] llama_cpp load FAILED: {type(_e).__name__}: {str(_e)[:200]}", flush=True)
def _get_local_llm():
    """Return a llama.cpp model for the local GGUF, building it when no cached one applies.

    The instance is cached in `_LOCAL_LLM` except on ZeroGPU with GPU offload enabled, where a
    fresh model is built on every call.
    """
    global _LOCAL_LLM, _LOCAL_LLM_PATH
    gpu_layers = int(os.environ.get("LOCAL_LLAMA_GPU_LAYERS", "-1"))
    # Default to full GPU offload (-1). On ZeroGPU the GPU is granted only inside @spaces.GPU and
    # detached between calls, so a GPU-resident model can't be reused across mints: rebuild it
    # each call there. Off ZeroGPU (dedicated GPU or CPU) the model is cached once. Set
    # LOCAL_LLAMA_GPU_LAYERS=0 to force CPU (e.g. when only a CPU-only wheel is installed).
    rebuild_every_call = bool(os.environ.get("SPACES_ZERO_GPU")) and gpu_layers != 0
    if not rebuild_every_call and _LOCAL_LLM is not None:
        return _LOCAL_LLM

    if gpu_layers != 0:
        _preload_cuda_libs()
    from llama_cpp import Llama

    _LOCAL_LLM_PATH = _resolve_local_gguf()
    default_threads = max(1, (os.cpu_count() or 4) - 1)
    threads = int(os.environ.get("LOCAL_LLAMA_THREADS", str(default_threads)))
    context_size = int(os.environ.get("LOCAL_LLAMA_CTX", "4096"))
    verbose = bool(int(os.environ.get("LOCAL_LLAMA_VERBOSE", "0")))
    llm = Llama(
        model_path=_LOCAL_LLM_PATH,
        n_ctx=context_size,
        n_threads=threads,
        n_gpu_layers=gpu_layers,
        verbose=verbose,
    )
    if not rebuild_every_call:
        _LOCAL_LLM = llm
    return llm
def _llama_complete(messages: list[dict[str, str]], temp: float, seed: int) -> str:
    """Run one raw completion on the local model and return the generated text.

    Generation stops at a ChatML turn marker or after LOCAL_MAX_TOKENS tokens (default 768).
    """
    # Use raw completion with our think-suppressed ChatML render (not create_chat_completion, which
    # leaves the reasoning trace on and makes a 1B ramble instead of writing the kernel).
    model = _get_local_llm()
    token_budget = int(os.environ.get("LOCAL_MAX_TOKENS", "768"))
    prompt = _render_prompt(messages)
    response = model.create_completion(
        prompt,
        max_tokens=token_budget,
        temperature=temp,
        top_p=0.97,
        seed=seed,
        stop=["<|im_end|>", "<|im_start|>"],
    )
    first_choice = response["choices"][0]
    return first_choice["text"]
def _local_gpu_duration() -> int:
    """Return the ZeroGPU reservation window in seconds.

    Reads LOCAL_GPU_DURATION (requested) and LOCAL_GPU_DURATION_CAP (ceiling), both defaulting
    to 120 when unset or unparseable. The result is the request limited by the ceiling, and
    neither the ceiling nor the result goes below 45.
    """
    # This is a ZeroGPU reservation window, not a benchmark setting. The cap is explicit so
    # slower/cold hardware can raise it without code changes.
    def _env_seconds(name):
        try:
            return int(os.environ.get(name, "120"))
        except Exception:
            return 120

    requested = _env_seconds("LOCAL_GPU_DURATION")
    ceiling = max(45, _env_seconds("LOCAL_GPU_DURATION_CAP"))
    return max(45, min(ceiling, requested))


LOCAL_GPU_DURATION = _local_gpu_duration()
@spaces.GPU(duration=LOCAL_GPU_DURATION)
def _local_gpu_mint(op: str, k: int, temp: float, spark: str) -> dict:
    """Generate up to `k` candidate kernels for `op` and evaluate each with the referee.

    Returns a dict with `n_srcs` (candidates that parsed to non-empty source), `statuses` (one
    per evaluated candidate, `"runtime_fail"` when the referee call raised) and `best` (source
    plus result dict of the lowest-latency candidate whose status is `"ok"`, else `None`).
    """
    # Generation AND verification happen inside this one GPU window. That is required on ZeroGPU,
    # where the GPU exists only inside @spaces.GPU. Off ZeroGPU the decorator is a no-op, so the
    # same code path works on a dedicated GPU and locally too.
    _ensure_referee_path()
    from harness import evaluate_inprocess_full

    messages = _local_prompt_messages(op, spark)
    # Millisecond clock folded into 31 bits; each attempt offsets it so samples differ.
    base_seed = int(time.time() * 1000) & 0x7FFFFFFF
    candidates = []
    for attempt in range(k):
        completion = _llama_complete(messages, temp=temp, seed=base_seed + attempt)
        kernel = extract_kernel(completion)
        if kernel:
            candidates.append(kernel)

    n_shapes = int(os.environ.get("LOCAL_REFEREE_SHAPES", "2"))
    n_iters = int(os.environ.get("LOCAL_REFEREE_ITERS", "30"))
    statuses, winner = [], None
    for index, kernel in enumerate(candidates):
        try:
            outcome = evaluate_inprocess_full(
                kernel, op, n_shapes=n_shapes, n_iters=n_iters, seed=index)
        except Exception:
            statuses.append("runtime_fail")
            continue
        statuses.append(outcome.status)
        if outcome.status != "ok":
            continue
        # Keep the FASTEST verified kernel (lowest latency = best vs every baseline at once).
        if winner is None or outcome.latency_ms < winner["result"]["latency_ms"]:
            winner = {"source": kernel, "result": outcome.to_dict()}
    return {"n_srcs": len(candidates), "statuses": statuses, "best": winner}
def _local_explain(label: str, res) -> str:
    """Write the user-facing sentence(s) describing a local mint outcome.

    `res` is a referee result dict or `None`. Anything other than status `"ok"` produces the
    rejection message. A verified result reports the max-autotune speedup when present, else
    the torch.compile speedup, else only the eager speedup. `label` is accepted but not used.
    """
    verified = res is not None and res.get("status") == "ok"
    if not verified:
        if res is None:
            status = "no verified attempt"
        else:
            status = res.get("status", "failed")
        return (f"The local referee rejected these kernels ({status}). It still compiled and "
                "checked them inside this Space; try minting again or simplify the operation.")

    eager = float(res.get("speedup_eager", 0) or 0)
    maxauto = res.get("speedup_maxauto")
    honest = maxauto or res.get("speedup_compile")
    if not honest:
        return (f"Verified correct, {eager:.0f}x faster than unfused PyTorch eager. The 1B wrote it via "
                "llama.cpp and the referee checked and timed it, all inside this Space.")

    baseline = "torch.compile max-autotune" if maxauto else "torch.compile"
    return (f"Verified correct. {float(honest):.2f}x faster than {baseline} (the honest baseline that "
            f"also fuses and autotunes), and {eager:.0f}x vs unfused PyTorch eager. The whole loop "
            "ran in this Space: the 1B wrote it via llama.cpp, the referee compiled, checked it "
            "against PyTorch, and timed it.")
def local_mint(recipe: dict) -> dict:
    """Mint a kernel with the local model and return the result dict for the UI.

    Recipes that do not resolve to a named spec (a `chain|...` op) are declined up front with
    `unsupported=True` instead of looping through attempts that would be rejected. Otherwise
    the GGUF is fetched, candidates are generated and verified in one GPU call, and the result
    describes either the fastest verified kernel or why none was verified.

    Recipe keys read here: `k` (attempts, clamped to 1..LOCAL_MAX_ATTEMPTS), `temp` (clamped to
    0.2..1.3) and `spark` (trimmed to 200 characters), plus whatever `_local_op_from_recipe`
    reads. Errors from the helpers propagate to the caller.
    """
    op, label = _local_op_from_recipe(recipe)
    if op.startswith("chain|"):
        # A chain op is "chain|<norm>|<residual>|<act,act,...>". It is returned both for stacked
        # activations and for a single activation whose composed name has no spec, so the
        # explanation has to distinguish the two: telling someone who picked one activation to
        # "drop to a single activation" would be misleading.
        chained_acts = [a for a in op.split("|", 3)[-1].split(",") if a]
        if len(chained_acts) > 1:
            explanation = ("This machine stacks two activations, which is outside what the "
                           "kernelsmith models were trained on (the grammar is a norm plus one "
                           "activation), so neither the 1B nor the 27B writes it reliably. Drop "
                           "to a single activation and it mints with a verified kernel.")
        else:
            explanation = ("This norm and activation pairing has no reference spec in the local "
                           "referee, so a kernel for it could not be verified here. Pick a "
                           "different activation or one of the named operations and mint again.")
        return {"op": label, "verified": False, "local": True, "baseline": "eager",
                "statuses": [], "gen_seconds": 0.0, "verify_seconds": 0.0,
                "speedup_compile": None, "unsupported": True, "k": 0,
                "explanation": explanation}
    k = max(1, min(LOCAL_MAX_ATTEMPTS, int(recipe.get("k", 1))))
    temp = max(0.2, min(1.3, float(recipe.get("temp", 0.7))))
    spark = (recipe.get("spark") or "").strip()[:200]
    # Pre-fetch the GGUF to disk OUTSIDE the GPU window, so the ZeroGPU budget is spent on
    # generate + verify rather than on the model download.
    _resolve_local_gguf()
    t0 = time.time()
    work = _local_gpu_mint(op, k, temp, spark)
    # Generation and verification share one GPU call, so only their combined wall time is known;
    # it is reported as gen_seconds and verify_seconds stays 0.0.
    total_s = round(time.time() - t0, 1)
    statuses = work.get("statuses") or []
    best = work.get("best")
    if not work.get("n_srcs"):
        return {"op": label, "verified": False, "local": True, "baseline": "eager",
                "statuses": statuses, "gen_seconds": total_s, "verify_seconds": 0.0,
                "speedup_compile": None, "k": k,
                "explanation": "llama.cpp returned no parseable kernel this time. Try minting again."}
    if not best:
        return {"op": label, "verified": False, "local": True, "baseline": "eager",
                "statuses": statuses, "gen_seconds": total_s, "verify_seconds": 0.0,
                "speedup_compile": None, "explanation": _local_explain(label, None), "k": k}
    res = best["result"]

    def _su(key):
        # Missing or zero speedups are reported as None rather than 0.0.
        v = res.get(key)
        return round(float(v), 2) if v else None

    return {"op": label, "verified": True, "local": True, "baseline": "compile",
            "speedup_eager": round(float(res.get("speedup_eager", 0)), 2),
            "speedup_compile": _su("speedup_compile"), "speedup_maxauto": _su("speedup_maxauto"),
            "latency_ms": res.get("latency_ms"), "eager_ms": res.get("eager_ms"),
            "compile_ms": res.get("compile_ms"), "maxauto_ms": res.get("maxauto_ms"),
            "source": best["source"], "statuses": statuses, "gen_seconds": total_s,
            "verify_seconds": 0.0, "explanation": _local_explain(label, res),
            "n_verified": sum(1 for s in statuses if s == "ok"), "k": k}
def lb_url(u):
    """Turn a mint endpoint URL into the matching leaderboard endpoint URL.

    URLs that do not contain the mint host suffix come back unchanged.
    """
    mint_suffix = "-mint-mint.modal.run"
    board_suffix = "-leaderboard.modal.run"
    return board_suffix.join(u.split(mint_suffix))
def _blob(d):
    """Produce the string that do_mint and fetch_lb return for a result dict `d`.

    NOTE(review): as written this returns an empty string and never reads `d`, so every
    caller's payload is dropped. The f-string body looks like it lost its contents (presumably
    markup embedding an encoded copy of `d`); restore it from the original file before relying
    on this function.
    """
    return f""
def _with_nonce(d: dict, r: dict) -> dict:
    """Return a copy of `d` that echoes the recipe's `_n` value when one was sent.

    `d` itself is never modified; a falsy `d` is treated as an empty dict.
    """
    echoed = dict(d) if d else {}
    nonce = r.get("_n")
    if nonce is None:
        return echoed
    echoed["_n"] = nonce
    return echoed
def do_mint(recipe_json: str) -> str:
    """Run one mint for the recipe posted by the front end and return the encoded result.

    A recipe with `local` set is handled in-process by `local_mint`; any exception there is
    reported as an error payload. Otherwise the recipe is posted to the remote backend (PRO
    when `pro` is set, else ONE_B), retrying while the backend wakes up. Malformed input
    (invalid JSON, a JSON value that is not an object, or a non-numeric `k`/`temp`) yields a
    "bad recipe" error payload rather than an exception.
    """
    try:
        r = json.loads(recipe_json or "{}")
    except Exception:
        return _blob({"error": "bad recipe"})
    # Valid JSON is not necessarily an object ("[]", "null", "3"); everything below needs .get().
    if not isinstance(r, dict):
        return _blob({"error": "bad recipe"})
    if r.get("local"):
        try:
            return _blob(_with_nonce(local_mint(r), r))
        except Exception as e:
            return _blob(_with_nonce(
                {"error": f"Local offline mint failed: {type(e).__name__}: {str(e)[:240]}",
                 "local": True, "baseline": "eager"}, r))
    base = PRO if r.get("pro") else ONE_B
    if r.get("mode") == "classic":
        payload = {"op": r.get("classic", "softmax")}
    else:
        payload = {"blocks": {"norm": r.get("norm", "rmsnorm"), "residual": bool(r.get("residual")),
                              "acts": r.get("acts") or ["gelu"]}}
    try:
        k = int(r.get("k", 4))
        temp = float(r.get("temp", 0.7))
    except (TypeError, ValueError):
        return _blob(_with_nonce({"error": "bad recipe"}, r))
    payload.update({"name": (r.get("name") or "anonymous")[:40], "k": k,
                    "temp": temp, "spark": (r.get("spark") or "")[:200]})
    # During a cold start Modal can return an empty body before the model finishes loading,
    # which makes .json() blow up. Retry a few times, then explain it plainly.
    attempts = 4
    last = ""
    for attempt in range(attempts):
        try:
            resp = requests.post(base, json=payload, timeout=600)
            if resp.status_code == 200 and resp.text.strip():
                try:
                    return _blob(_with_nonce(resp.json(), r))
                except ValueError:
                    last = "the model returned an unexpected response"
            else:
                last = f"the model is still waking up (HTTP {resp.status_code})"
        except Exception as e:
            last = str(e)
        # Pause only between attempts; waiting after the last one just delays the error.
        if attempt < attempts - 1:
            time.sleep(4)
    # The user-facing message stays generic; the concrete reason goes to the log.
    print(f"[mint] backend gave no usable response after {attempts} attempts: {last}", flush=True)
    pro = " The 27B in Pro mode takes longer to wake. Try again, or turn Pro off." if r.get("pro") else ""
    return _blob(_with_nonce(
        {"error": f"The model is still waking up. Give it about 30 seconds and press Mint again.{pro}"},
        r))
def fetch_lb(_=None):
    """Fetch and merge the leaderboard rows of both backends and return the encoded payload.

    Rows are de-duplicated on (op, name, ts, speedup_compile, speedup_eager). A backend that
    fails is recorded in `errors` (at most two are reported) and does not stop the other. The
    argument is ignored.
    """
    rows, seen, sources, errors = [], set(), [], []
    for backend in (ONE_B, PRO):
        url = lb_url(backend)
        try:
            fetched = requests.get(url, timeout=20).json().get("rows", [])
            if fetched:
                sources.append(url)
            for row in fetched:
                fingerprint = (row.get("op"), row.get("name"), row.get("ts"),
                               row.get("speedup_compile"), row.get("speedup_eager"))
                if fingerprint not in seen:
                    seen.add(fingerprint)
                    rows.append(row)
        except Exception as exc:
            errors.append(f"{url}: {type(exc).__name__}")
    # Always include fetched_at so the hidden Gradio HTML changes even when the rows are identical.
    # That makes the visible Refresh button observably work.
    payload = {"rows": rows, "fetched_at": time.time(), "sources": sources, "errors": errors[:2]}
    return _blob(payload)
def _json_for_script(data) -> str:
    """Serialize `data` as compact JSON that is safe to inline inside an HTML script element.

    Every `</` becomes `<\\/` so the payload can never close the enclosing element early; a
    JSON parser reads `\\/` back as a plain `/`.
    """
    return json.dumps(data, separators=(",", ":")).replace("</", "<\\/")


# Optional stylesheet next to this file; a missing or unreadable file means no extra CSS.
try:
    with open(os.path.join(os.path.dirname(__file__), "style.css"), encoding="utf-8") as f:
        CSS = f.read()
except OSError:
    CSS = ""
# Recorded replays bundled with the app; anything wrong with the file falls back to "{}".
REPLAYS_PATH = os.path.join(os.path.dirname(__file__), "replays", "index.json")
try:
    with open(REPLAYS_PATH, encoding="utf-8") as f:
        REPLAYS_JSON = _json_for_script(json.load(f))
except Exception:
    REPLAYS_JSON = "{}"
APP_HTML = r"""
Kernel Mintan OUROBOROS demo
A 1B model writes a real GPU kernel. A referee that can't be fooled checks it.
Compose an operation, and a 1-billion-parameter model writes a fused Triton kernel for it. The kernel is compiled, checked against PyTorch on adversarial inputs, and timed against PyTorch's own compiler before anything counts as a win.
Pipeline
inputa row of activations
↓
residual
↓
normalize
↓
activate
A named operation from a real transformer
Runs entirely in this Space: a 1B GGUF model (llama.cpp) writes the kernel on this Space's GPU, then the in-process referee compiles it, checks correctness against PyTorch, and times it against PyTorch eager, torch.compile, and max-autotune. No Modal or cloud model calls. Pro uses the 27B.
The operation, stage by stage
input
→
readyPress Mint to begin
Compose a pipeline on the left and mint it, or watch a recorded verified mint to see the whole referee loop instantly.
Triton kernel source
This session
No mints yet.
Free-test the verifier.
Describe the fused operation you want in plain language. The model writes a Triton kernel and the referee certifies it against PyTorch, torch.compile, and max-autotune. The referee can only certify operations it has a reference for, so your request is resolved to the nearest one it can check.
Your brief
resolves to
rmsnorm with gelu
Uses llama.cpp inside the Space and verifies on this Space's GPU, timed against torch.compile and max-autotune (the honest baselines), not just eager. Pro mode keeps the 27B on Modal.
What the referee can verify
It certifies fusions of a normalization (RMSNorm or LayerNorm), an optional residual add, and one activation, plus a set of named operators from real transformers. Click one to load it into your brief.
Triton kernel source
Leaderboard
The best verified kernel per operation. Crowned rows were minted by the 27B model. Every speedup here was measured by the referee, not reported by the model.
Loading.
Auto-refreshes while this tab is open.
reproduce the claim
Run the smith, then make the referee decide.
The Space has two execution paths. The Local (offline) switch runs the 1B GGUF model with llama.cpp inside this Space, then verifies the candidate kernel in-process on the same GPU. Pro uses the 27B adapter through the Modal backend. Both paths compare against PyTorch eager, torch.compile, and torch.compile max-autotune before a result counts.
Turn on Local (offline). It disables Pro because the two paths are intentionally separate.
Pick a named op such as rmsnorm_gelu, softmax, or swiglu, or compose one norm plus one activation.
Press Mint kernel. The Space downloads or reuses the GGUF, drafts up to two kernels, compiles Triton, checks PyTorch correctness, and times eager, torch.compile, and max-autotune.
A certified Local result should show compiler baselines. If a cold ZeroGPU session times out during max-autotune, mint again. A result that only beats eager is not treated as the headline claim.
2. Run the Space clone
Same UI, your GPU.
Use this when you want the full app locally. With no `SPACES_ZERO_GPU` env var, the GPU decorator becomes a no-op and the local verifier runs directly on your attached CUDA device.
from huggingface_hub import HfApi, hf_hub_download
from llama_cpp import Llama
repo = "YMRohit/ouroboros-kernelsmith-minicpm5-1b-GGUF"
files = [f for f in HfApi().list_repo_files(repo) if f.lower().endswith(".gguf")]
filename = next((f for f in files if "Q5_K_M" in f), files[0])
gguf = hf_hub_download(repo, filename=filename)
llm = Llama(model_path=gguf, n_ctx=4096, n_gpu_layers=-1)
system = "You are an expert GPU kernel engineer. Output only one fenced python code block."
user = "Write a fused Triton kernel for row-wise softmax. Use stable max-subtraction. Return run(x)."
prompt = (
f"<|im_start|>system\n{system}<|im_end|>\n"
f"<|im_start|>user\n{user}<|im_end|>\n"
"<|im_start|>assistant\n```python\n"
)
out = llm.create_completion(prompt, max_tokens=768, temperature=0.7, top_p=0.97)
print(out["choices"][0]["text"])
4. Run the 1B adapter directly
Transformers plus PEFT.
Use this when you want the published LoRA adapter instead of GGUF. It is slower to start than llama.cpp, but useful for inspection or further fine-tuning.
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
tok = AutoTokenizer.from_pretrained("openbmb/MiniCPM5-1B", trust_remote_code=True)
base = AutoModelForCausalLM.from_pretrained(
"openbmb/MiniCPM5-1B",
trust_remote_code=True,
torch_dtype="auto",
device_map="auto",
)
model = PeftModel.from_pretrained(base, "YMRohit/ouroboros-kernelsmith-minicpm5-1b")
model.eval()
5. Run the 27B smith locally
Same prompt contract, much bigger hardware.
The 27B artifact is the stronger Qwen3.6-27B smith used for the 76 verified compiler-beating kernels. Run it locally if you have a large GPU or multi-GPU box. The training run used Modal H200s and peaked around 110 GB VRAM; inference is still a heavyweight path compared with the 1B GGUF.
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
tok = AutoTokenizer.from_pretrained("Qwen/Qwen3.6-27B", trust_remote_code=True)
base = AutoModelForCausalLM.from_pretrained(
"Qwen/Qwen3.6-27B",
trust_remote_code=True,
torch_dtype="auto",
device_map="auto",
)
model = PeftModel.from_pretrained(base, "YMRohit/ouroboros-kernelsmith-qwen3.6-27b")
model.eval()
If that does not fit, use Pro in this Space. It calls the hosted 27B backend and still sends the output through the same three-baseline referee before returning a result.
6. Prompt contract
Ask for one operation, one code block, one `run(...)` entry point.
The model is not a general Python assistant here. It was trained to emit a single Triton kernel for a verifier-known operation. Keep the prompt narrow and include the exact operation, input tensors, output contract, and baseline target.
System
You are an expert GPU kernel engineer. Write a single correct, fast Triton kernel. Output ONLY one fenced python code block defining run(*inputs) and its @triton.jit kernel. Accumulate reductions in float32. No prose.
User template
Operation: add_rmsnorm_gelu
Inputs: x, residual, weight. Each row is one transformer hidden state.
Reference: y = gelu(rmsnorm(x + residual, weight)).
Return: one fenced python block with imports, one @triton.jit kernel, and run(x, residual, weight).
Target: correct vs PyTorch first, then faster than torch.compile max-autotune.
name the opstate tensor orderrequire fp32 reductionsforbid proseverify before trusting
Prompt example A
Residual RMSNorm plus GELU.
Write a fused Triton kernel for add_rmsnorm_gelu.
Inputs are x, residual, and weight, all CUDA tensors.
Compute RMSNorm over each row after x + residual, multiply by weight, then apply GELU.
Use fp32 accumulation for the row reduction.
Return exactly one fenced python code block with run(x, residual, weight).
Prompt example B
Stable softmax.
Write a fused Triton kernel for row-wise softmax.
Input x is a CUDA tensor shaped [M, N].
Use the stable max-subtraction form.
Return exactly one fenced python code block with run(x).
Do not include explanation text outside the code block.
Prompt example C
SwiGLU gate.
Write a fused Triton kernel for swiglu.
Inputs are gate and up tensors with the same shape.
Compute silu(gate) * up elementwise.
Return exactly one fenced python code block with run(gate, up).
Keep the launch grid simple and contiguous-row friendly.
7. Verify a generated kernel
Never score the model output by eye.
Save the candidate as `candidate.py`, then send it to the referee. The result object reports correctness, latency, and speedups vs eager, torch.compile, and max-autotune.
import pathlib
import sys
sys.path.insert(0, "referee")
from harness import evaluate_inprocess_full
kernel_src = pathlib.Path("candidate.py").read_text()
result = evaluate_inprocess_full(kernel_src, "add_rmsnorm_gelu", n_shapes=2, n_iters=30)
print(result.to_dict())
A kernel is submission-grade only if `status` is `ok`, correctness is true, and the compiler baselines are present. The strongest number is `speedup_maxauto` because that compares against torch.compile max-autotune.
8. What the referee can certify
Known operations only, by design.
The referee needs a PyTorch reference. It certifies norm and activation fusions plus named transformer operations. Unknown math can still be interesting, but this app will not label it a verified win until there is a reference spec.
"""
# Fill the template placeholders: the inlined replay JSON and the two leaderboard URLs derived
# from the mint endpoints.
# NOTE(review): none of these placeholder tokens is visible in the APP_HTML literal as shown
# here; presumably they sit in markup that is not visible. Confirm against the original file.
APP_HTML = (APP_HTML
.replace("__KM_REPLAYS__", REPLAYS_JSON)
.replace("__KM_LB_ONE__", lb_url(ONE_B))
.replace("__KM_LB_PRO__", lb_url(PRO)))
APP_JS = r"""
() => {
const ACTS = {
gelu:{lbl:'GELU',verb:'smoothly bends',info:'GELU is the gentle bend used inside many transformers. It lets a little of the negative side through.',f:x=>0.5*x*(1+Math.tanh(0.7978845608*(x+0.044715*x*x*x)))},
silu:{lbl:'SiLU',verb:'swishes',info:'SiLU, also called Swish, is x times sigmoid(x). It is common in LLaMA style feed-forward blocks.',f:x=>x/(1+Math.exp(-x))},
relu:{lbl:'ReLU',verb:'clips negatives',info:'ReLU keeps positive values and clips negative values to zero. It is fast, classic, and direct.',f:x=>Math.max(x,0)},
tanh:{lbl:'Tanh',verb:'squashes to -1..1',info:'Tanh squashes each value into the range from -1 to 1. Large values flatten near the top and bottom.',f:x=>Math.tanh(x)},
sigmoid:{lbl:'Sigmoid',verb:'squashes to 0..1',info:'Sigmoid squashes each value into the range from 0 to 1. It is often used for gates and probabilities.',f:x=>1/(1+Math.exp(-x))},
relu2:{lbl:'ReLU2',verb:'clips then squares',info:'Squared ReLU clips negative values to zero, then squares the remaining positive values.',f:x=>{const r=Math.max(x,0);return r*r;}},
gelu_erf:{lbl:'GELU exact',verb:'bends exactly',info:'Exact GELU uses the erf form of GELU. It behaves almost like the tanh approximation, but follows the exact formula.',f:x=>0.5*x*(1+Math.tanh(0.7978845608*(x+0.044715*x*x*x)))},
leaky_relu:{lbl:'Leaky ReLU',verb:'leaks negatives',info:'Leaky ReLU is like ReLU, but negative values leak through at a small scale instead of becoming zero.',f:x=>x>0?x:0.01*x},
elu:{lbl:'ELU',verb:'curves negatives',info:'ELU passes positive values through and curves negative values smoothly down toward -1.',f:x=>x>0?x:Math.exp(x)-1},
mish:{lbl:'Mish',verb:'self gates',info:'Mish is a smooth self-gating activation, x times tanh of softplus(x).',f:x=>x*Math.tanh(Math.log(1+Math.exp(x)))},
};
// Normalization choices offered by the 'norm' picker; lbl is the button text, info the help text.
const NORMS = {
rmsnorm:{lbl:'RMSNorm',info:'RMSNorm makes a row of values a consistent size by dividing by root mean square, then scaling.'},
layernorm:{lbl:'LayerNorm',info:'LayerNorm centers a row of values by subtracting the average, then rescales by the spread.'},
};
// Help text for the residual-add ('mem') slot.
const MEM = {info:'Residual add means adding the input back before the next operation. This is the skip path used in deep models.'};
// Named standalone ops shown in classic mode; same lbl/info shape as NORMS.
const CLASSICS = {
softmax:{lbl:'Softmax',info:'Softmax turns a row of scores into probabilities that add to 1.'},
softmax_scale:{lbl:'Scaled softmax',info:'Scaled softmax applies a scale before softmax. It is the attention score step used before attention weights.'},
swiglu:{lbl:'SwiGLU',info:'SwiGLU is a gated feed-forward operation: SiLU on the gate multiplied by the up projection.'},
geglu:{lbl:'GeGLU',info:'GeGLU is a gated feed-forward operation like SwiGLU, but with a GELU gate.'},
rmsnorm:{lbl:'RMSNorm',info:'RMSNorm on its own, without an activation after it.'},
layernorm:{lbl:'LayerNorm',info:'LayerNorm on its own, without an activation after it.'},
};
// Fixed sample row that the chart starts from and every preview stage transforms.
const INPUT = [0.6,-0.9,1.4,-0.3,0.2,-1.6,0.8,1.1,-0.5,0.0,1.7,-1.1,0.4,-0.2];
// Widget root. Bail out when it is missing or already initialised (dataset.init marks a prior run).
const root=document.getElementById('km');
if(!root||root.dataset.init)return;
root.dataset.init='1';
// Selector helpers scoped to the widget root: $ returns the first match, $$ an array of all matches.
const $=s=>root.querySelector(s), $$=s=>Array.from(root.querySelectorAll(s));
// Leaderboard endpoints read from data attributes on the root; unset ones are dropped.
const LB_URLS=[root.dataset.lbOne,root.dataset.lbPro].filter(Boolean);
// Plain-text normaliser: en/em dashes become ' - ', the right arrow becomes '->', the multiplication
// sign becomes 'x', code points U+2600..U+27BF and surrogate pairs are removed, and whitespace runs
// collapse to one space before trimming. null/undefined become the empty string.
const cleanText=s=>String(s??'').replace(/\u2013|\u2014/g,' - ').replace(/\u2192/g,'->').replace(/\u00d7/g,'x').replace(/[\u2600-\u27BF]/g,'').replace(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g,'').replace(/\s+/g,' ').trim();
const esc=s=>cleanText(s).replace(/[&<>"']/g,c=>({'&':'&','<':'<','>':'>','"':'"',"'":'''}[c]));
// Recorded mints, parsed from the JSON text of #km-replays-data. A missing element or invalid JSON
// leaves an empty table.
let REPLAYS={};
try{REPLAYS=JSON.parse(($('#km-replays-data')||{}).textContent||'{}');}catch(e){REPLAYS={};}
// Display names for the known replay keys.
const REPLAY_META={
rmsnorm_gelu:{label:'RMSNorm to GELU',short:'RMSNorm GELU'},
softmax:{label:'Softmax',short:'Softmax'},
add_layernorm_silu:{label:'Residual plus LayerNorm to SiLU',short:'Residual LayerNorm SiLU'}
};
// Replay keys in display order, limited to those actually present in REPLAYS.
const REPLAY_ORDER=['rmsnorm_gelu','softmax','add_layernorm_silu'].filter(k=>REPLAYS[k]);
// UI state: active tab, composer mode ('build' or 'classic'), the four builder slots, and the
// selected classic op.
const state={tab:'build',mode:'build',slots:{mem:false,norm:'rmsnorm',act:'gelu',act2:null},classic:'softmax'};
// Attempt counts per page (updated by the steppers), the last resolved expert brief, and which
// page started the most recent mint or replay.
let buildTries=4, expertTries=5, resolvedExpert=null, lastMintedMode='build';
// Mint bookkeeping (nonce of the in-flight mint, whether a result is pending) and leaderboard timers.
let activeMintNonce=null, awaitingMint=false, pendingLBRefresh=false, lbRefreshTimer=null, lbPollTimer=null, lbUiTimer=null;
// HTML snippets of recent build-mode results, newest first.
const HIST=[];
// Run a Web Animations API animation when possible; a missing element or a missing el.animate
// is silently ignored.
function animate(el,frames,opts){
  if(!el||!el.animate)return;
  el.animate(frames,opts);
}
// Human-readable name for a recipe object.
// Classic recipes use the classic op label (falling back to the raw key, then 'classic op');
// build recipes join optional 'residual add', the norm label and each activation label with ' + '.
function labelRecipe(r){
  if(!r)return 'unknown';
  if(r.mode==='classic'){
    return CLASSICS[r.classic]?.lbl||r.classic||'classic op';
  }
  const normLabel=NORMS[r.norm]?.lbl||r.norm;
  const actLabels=(r.acts||[]).map(a=>ACTS[a]?.lbl||a);
  const head=r.residual?['residual add',normLabel]:[normLabel];
  return [...head,...actLabels].join(' + ');
}
// Describe the currently composed operation for the info panel.
// Classic mode returns the classic op's own help text; build mode lists the selected slots.
function machineInfo(){
  if(state.mode==='classic')return CLASSICS[state.classic].info;
  const {mem,norm,act,act2}=state.slots;
  const parts=[mem?'add input':'no residual add',NORMS[norm].lbl,ACTS[act].lbl];
  if(act2)parts.push(ACTS[act2].lbl);
  return 'Current operation: '+parts.join(' + ')+'. The model must fuse it into one verified kernel.';
}
// Show help text in #km-info with a short fade-in; a falsy argument falls back to machineInfo().
function setInfo(t){
  const panel=$('#km-info');
  if(!panel)return;
  const message=t||machineInfo();
  panel.textContent=cleanText(message);
  animate(panel,[{opacity:.45},{opacity:1}],{duration:180,easing:'cubic-bezier(0.16,1,0.3,1)'});
}
// Size and slide the highlight pill so it sits under the active tab button.
function positionTabPill(){
  const strip=$('#km-tabs'), pill=$('#km-tab-pill'), active=$('.km-tab.on');
  if(!(strip&&pill&&active))return;
  const stripBox=strip.getBoundingClientRect();
  const activeBox=active.getBoundingClientRect();
  // The 4px correction is applied to the offset of the button from the strip's left edge.
  const offset=activeBox.left-stripBox.left-4;
  pill.style.width=activeBox.width+'px';
  pill.style.transform='translateX('+offset+'px)';
}
// Switch the visible page: mark the matching tab button, hide the other pages, reposition the
// pill on the next frame, re-resolve the expert brief on the expert tab, and refresh plus poll the
// leaderboard only while its tab is open.
function showTab(tab){
  state.tab=tab;
  for(const btn of $$('.km-tab'))btn.classList.toggle('on',btn.dataset.tab===tab);
  for(const page of $$('.km-page'))page.hidden=page.dataset.page!==tab;
  requestAnimationFrame(positionTabPill);
  if(tab==='expert')resolveExpert();
  const onBoard=tab==='lb';
  if(onBoard)refreshLB(true);
  setLBPolling(onBoard);
}
// Tab buttons switch pages; the pill is re-measured whenever the window is resized.
for(const btn of $$('.km-tab'))btn.addEventListener('click',()=>showTab(btn.dataset.tab));
window.addEventListener('resize',positionTabPill);
// Help text for one option of a builder slot ('mem', 'norm', 'act' or 'act2').
// Unknown slots yield the empty string.
function optionInfo(slot,key){
  switch(slot){
    case 'mem':
      return key==='add'?MEM.info:'No residual add. The kernel starts from the input row and does not add a skip value.';
    case 'norm':
      return NORMS[key].info;
    case 'act':
    case 'act2':
      return key==='none'?'No second activation. The pipeline stops after the first activation.':ACTS[key].info;
    default:
      return '';
  }
}
// Button text for one option of a builder slot; unknown slots echo the key unchanged.
function optionLabel(slot,key){
  switch(slot){
    case 'mem':
      return key==='add'?'add input':'none';
    case 'norm':
      return NORMS[key].lbl;
    case 'act':
    case 'act2':
      return key==='none'?'none':ACTS[key].lbl;
    default:
      return key;
  }
}
// Option key currently selected for a slot, in the same vocabulary the picker buttons use
// ('add'/'none' for mem, 'none' for an empty second activation).
function selectedKey(slot){
  const slots=state.slots;
  switch(slot){
    case 'mem':
      return slots.mem?'add':'none';
    case 'act2':
      return slots.act2||'none';
    default:
      return slots[slot];
  }
}
// Store a slot choice, update the 'on' marker inside that slot's picker, optionally show the
// option's help text (skipped when quiet is truthy), then redraw the preview.
function setSlot(slot,key,quiet){
  const slots=state.slots;
  switch(slot){
    case 'mem':
      slots.mem=(key==='add');
      break;
    case 'act2':
      slots.act2=(key==='none')?null:key;
      break;
    default:
      slots[slot]=key;
  }
  const picker=root.querySelector('.km-pick[data-slot="'+slot+'"]');
  if(picker){
    const chosen=selectedKey(slot);
    for(const opt of picker.querySelectorAll('.km-opt'))opt.classList.toggle('on',opt.dataset.key===chosen);
  }
  if(!quiet)setInfo(optionInfo(slot,key));
  render();
}
// Rebuild the option buttons of every .km-pick[data-slot] container.
// Hover and focus preview an option's help text; click commits it through setSlot.
function renderPickers(){
  const keysFor=slot=>{
    if(slot==='act2')return ['none'].concat(Object.keys(ACTS));
    if(slot==='act')return Object.keys(ACTS);
    if(slot==='norm')return Object.keys(NORMS);
    if(slot==='mem')return ['none','add'];
    return [];
  };
  for(const wrap of $$('.km-pick[data-slot]')){
    const slot=wrap.dataset.slot;
    const describe=key=>setInfo(optionInfo(slot,key));
    wrap.innerHTML='';
    for(const key of keysFor(slot)){
      const opt=document.createElement('button');
      opt.type='button';
      opt.className='km-opt';
      opt.dataset.key=key;
      opt.textContent=optionLabel(slot,key);
      opt.title=optionInfo(slot,key);
      opt.classList.toggle('on',key===selectedKey(slot));
      opt.addEventListener('mouseenter',()=>describe(key));
      opt.addEventListener('focus',()=>describe(key));
      opt.addEventListener('click',()=>setSlot(slot,key));
      wrap.appendChild(opt);
    }
  }
}
// Rebuild the classic-op buttons inside #km-classics, one per CLASSICS entry.
// Clicking selects the op, moves the 'on' marker, shows its help text and redraws the preview.
function renderClassics(){
  const host=$('#km-classics');
  if(!host)return;
  host.innerHTML='';
  for(const [key,entry] of Object.entries(CLASSICS)){
    const opt=document.createElement('button');
    const describe=()=>setInfo(entry.info);
    opt.type='button';
    opt.className='km-opt';
    opt.dataset.key=key;
    opt.textContent=entry.lbl;
    opt.title=entry.info;
    opt.classList.toggle('on',state.classic===key);
    opt.addEventListener('mouseenter',()=>describe());
    opt.addEventListener('focus',()=>describe());
    opt.addEventListener('click',()=>{
      state.classic=key;
      for(const other of host.querySelectorAll('.km-opt'))other.classList.toggle('on',other===opt);
      describe();
      render();
    });
    host.appendChild(opt);
  }
}
// Switch the composer between 'build' and 'classic': mark the segment button, show only the
// matching panel, refresh the info text and redraw the preview.
function setComposerMode(mode){
  state.mode=mode;
  for(const seg of $$('.km-segb'))seg.classList.toggle('on',seg.dataset.mode===mode);
  const panels=[['#km-compose','build'],['#km-classic','classic']];
  for(const [sel,owner] of panels){
    const panel=$(sel);
    if(panel)panel.hidden=mode!==owner;
  }
  setInfo(machineInfo());
  render();
}
// Each segment button selects the mode named in its data-mode attribute.
for(const seg of $$('.km-segb'))seg.addEventListener('click',()=>setComposerMode(seg.dataset.mode));
// Build a 1..8 button stepper inside the element matched by id.
// Every selection (including the initial one) calls onChange(value), mirrors the value into the
// optional readout element matched by outId, and moves the 'on' marker to the chosen button.
function setupStepper(id,outId,initial,onChange){
  const host=$(id), readout=$(outId);
  if(!host)return;
  host.innerHTML='';
  const choose=value=>{
    onChange(value);
    if(readout)readout.textContent=String(value);
    for(const btn of host.querySelectorAll('button'))btn.classList.toggle('on',+btn.dataset.v===value);
  };
  Array.from({length:8},(_,ix)=>ix+1).forEach(value=>{
    const btn=document.createElement('button');
    btn.type='button';
    btn.dataset.v=String(value);
    btn.textContent=String(value);
    btn.addEventListener('click',()=>choose(value));
    host.appendChild(btn);
  });
  choose(initial);
}
// Keep a readout in sync with a slider: the slider value divided by 100 is shown with two
// decimals, both now and on every 'input' event.
function setupTemp(id,outId){
  const slider=$(id), readout=$(outId);
  if(!slider)return;
  const show=()=>{
    if(!readout)return;
    readout.textContent=(+slider.value/100).toFixed(2);
  };
  slider.addEventListener('input',show);
  show();
}
// Make the Local and Pro checkboxes mutually exclusive: while Local is checked, Pro is unchecked
// and disabled; checking Pro unchecks Local.
function syncModePair(localSel,proSel){
  const localBox=$(localSel), proBox=$(proSel);
  if(!localBox||!proBox)return;
  const enforce=()=>{
    if(!localBox.checked){
      proBox.disabled=false;
      return;
    }
    proBox.checked=false;
    proBox.disabled=true;
  };
  localBox.addEventListener('change',enforce);
  proBox.addEventListener('change',()=>{
    if(proBox.checked)localBox.checked=false;
    enforce();
  });
  enforce();
}
// Chart element plus its drawing box: W x H units with PAD units of inner margin; n is the number
// of sample points.
const svg=$('#km-svg'), W=320, H=150, PAD=12, n=INPUT.length;
// Horizontal position of point i, spreading the n points evenly between the left and right margins.
const xs=i=>PAD+i*(W-2*PAD)/(n-1);
// Vertical position of value v: values are clamped to [-2.2, 2.2], zero maps to mid-height and
// positive values go upward.
const ys=v=>{const t=Math.max(-2.2,Math.min(2.2,v));return H/2-t*(H/2-PAD)/2.2;};
// Drawn circles, the polyline, the values currently displayed, and the pending animation frame id.
let dots=[], line=null, cur=INPUT.slice(), anim=null;
// Create the chart scaffolding inside the svg: a horizontal mid line, the value polyline, and one
// circle per current value. Finishes by painting the current values.
function buildSvg(){
  if(!svg)return;
  const NS='http://www.w3.org/2000/svg';
  const make=(tag,attrs)=>{
    const node=document.createElementNS(NS,tag);
    for(const [attr,val] of Object.entries(attrs))node.setAttribute(attr,val);
    return node;
  };
  svg.innerHTML='';
  svg.appendChild(make('line',{x1:0,x2:W,y1:H/2,y2:H/2,stroke:'#1f2d26','stroke-width':'1'}));
  line=make('polyline',{fill:'none',stroke:'#34e0a1','stroke-width':'2','stroke-linejoin':'round'});
  svg.appendChild(line);
  dots=cur.map((_,i)=>{
    const dot=make('circle',{r:'3.4',cx:xs(i)});
    svg.appendChild(dot);
    return dot;
  });
  paint(cur);
}
// Draw the values v: update the polyline points and move each existing dot, colouring it by sign
// (non-negative versus negative).
function paint(v){
  if(!line)return;
  const points=v.map((val,i)=>xs(i)+','+ys(val));
  line.setAttribute('points',points.join(' '));
  v.forEach((val,i)=>{
    const dot=dots[i];
    if(!dot)return;
    dot.setAttribute('cy',ys(val));
    dot.setAttribute('fill',val>=0?'#34e0a1':'#ff7a7a');
  });
}
// Cubic ease-out: 0 maps to 0, 1 maps to 1, with most of the movement early on.
const ease=t=>1-(1-t)**3;
// Animate the chart from the currently displayed values to the values in `to` over ms
// milliseconds. Any animation already in flight is cancelled first; on the last frame the
// displayed values are set to a copy of the target.
function morph(to,ms=480){
  if(!line)return;
  const origin=cur.slice();
  const began=performance.now();
  if(anim)cancelAnimationFrame(anim);
  const frame=now=>{
    const progress=Math.min(1,(now-began)/ms);
    const eased=ease(progress);
    cur=origin.map((start,i)=>start+(to[i]-start)*eased);
    paint(cur);
    if(progress<1){
      anim=requestAnimationFrame(frame);
      return;
    }
    cur=to.slice();
  };
  anim=requestAnimationFrame(frame);
}
// Normalize a row of numbers for the preview chart.
// kind 'layernorm': subtract the mean and divide by sqrt(variance + 1e-5).
// Any other kind: divide by the root mean square, sqrt(mean of squares + 1e-6).
function normalize(a,kind){
  const count=a.length;
  const mean=a.reduce((acc,x)=>acc+x,0)/count;
  if(kind!=='layernorm'){
    const meanSquare=a.reduce((acc,x)=>acc+x*x,0)/count;
    const rms=Math.sqrt(meanSquare+1e-6);
    return a.map(x=>x/rms);
  }
  const variance=a.reduce((acc,x)=>acc+(x-mean)*(x-mean),0)/count;
  return a.map(x=>(x-mean)/Math.sqrt(variance+1e-5));
}
// Values to plot for a classic op, derived from the sample INPUT row.
// These are chart-friendly illustrations: softmax output is rescaled (times 4, minus 0.5) and the
// gated ops use a gate derived from the same value. Unrecognised keys return an input copy.
function classicViz(key){
  const base=INPUT.slice();
  const gate=x=>0.7+Math.abs(x)*0.3;
  if(key.startsWith('softmax')){
    const scale=key==='softmax_scale'?0.7:1;
    const scaled=base.map(x=>x*scale);
    const peak=Math.max(...scaled);
    const exps=scaled.map(x=>Math.exp(x-peak));
    const denom=exps.reduce((a,b)=>a+b,0);
    return exps.map(x=>x/denom*4-0.5);
  }
  if(key==='rmsnorm'||key==='layernorm')return normalize(base,key);
  if(key==='swiglu')return base.map(x=>(x/(1+Math.exp(-x)))*gate(x));
  if(key==='geglu')return base.map(x=>ACTS.gelu.f(x)*gate(x));
  return base;
}
// List the preview stages for the current selection as {l: label, v: values} entries.
// The first entry is always the raw input. Classic mode adds one stage; build mode adds the
// optional residual add, the normalization, the activation and the optional second activation.
function pipeline(){
  const stages=[{l:'input',v:INPUT.slice()}];
  if(state.mode==='classic'){
    stages.push({l:CLASSICS[state.classic].lbl,v:classicViz(state.classic)});
    return stages;
  }
  const {mem,norm,act,act2}=state.slots;
  let vals=INPUT.slice();
  const record=label=>stages.push({l:label,v:vals.slice()});
  if(mem){
    // Illustrative skip value: half of the sample three positions ahead (wrapping around).
    vals=vals.map((x,i)=>x+INPUT[(i+3)%n]*0.5);
    record('residual add');
  }
  vals=normalize(vals,norm);
  record(NORMS[norm].lbl);
  vals=vals.map(ACTS[act].f);
  record(ACTS[act].lbl+' '+ACTS[act].verb);
  if(act2){
    vals=vals.map(ACTS[act2].f);
    record(ACTS[act2].lbl+' second pass');
  }
  return stages;
}
// True while a stage-by-stage playback is running; play() is a no-op during that time.
let playing=false;
// Walk through the pipeline stages one at a time, 680 ms apart, updating the stage label and
// morphing the chart to each stage's values.
function play(){
  if(playing)return;
  playing=true;
  const steps=pipeline();
  const label=$('#km-stage-label');
  let cursor=0;
  const advance=()=>{
    if(cursor>=steps.length){
      playing=false;
      return;
    }
    const step=steps[cursor];
    if(label){
      label.textContent=cleanText(step.l);
      animate(label,[{opacity:0,transform:'translateY(4px)'},{opacity:1,transform:'none'}],{duration:200,easing:'cubic-bezier(0.16,1,0.3,1)'});
    }
    morph(step.v);
    cursor+=1;
    setTimeout(advance,680);
  };
  advance();
}
// Jump the preview to the final pipeline stage (label and chart) and refresh the replay offer.
function render(){
  const stages=pipeline();
  const last=stages[stages.length-1];
  const label=$('#km-stage-label');
  if(label)label.textContent=cleanText(last.l);
  morph(last.v,360);
  updateReplayOffer();
}
// Collect the build page's selection and controls into a recipe object.
// temp is the slider value divided by 100 (70 when the slider is missing); pro is only honoured
// when local mode is off.
function buildRecipe(){
  const isChecked=sel=>{
    const box=$(sel);
    return !!(box&&box.checked);
  };
  const tempEl=$('#km-temp'), nameEl=$('#km-name');
  const local=isChecked('#km-local');
  const acts=[state.slots.act];
  if(state.slots.act2)acts.push(state.slots.act2);
  const rawTemp=tempEl?tempEl.value:70;
  return {
    mode:state.mode,
    norm:state.slots.norm,
    residual:!!state.slots.mem,
    acts,
    classic:state.classic,
    spark:'',
    k:buildTries,
    temp:(+rawTemp)/100,
    pro:!local&&isChecked('#km-pro'),
    local,
    name:nameEl?nameEl.value:''
  };
}
// Recipe for the expert page: a copy of the resolved brief's recipe with the expert page's
// controls applied. Returns null when no brief is resolved or the brief was not understood.
function expertRecipe(){
  if(!resolvedExpert||resolvedExpert.unknown)return null;
  const tempEl=$('#km-xtemp'), proEl=$('#km-xpro'), localEl=$('#km-xlocal'), briefEl=$('#km-xbrief');
  const local=!!(localEl&&localEl.checked);
  const rawTemp=tempEl?tempEl.value:70;
  return {
    ...resolvedExpert.recipe,
    k:expertTries,
    temp:(+rawTemp)/100,
    pro:!local&&!!(proEl&&proEl.checked),
    local,
    name:'expert',
    spark:briefEl?briefEl.value:''
  };
}
// Recipe for the active tab; the expert tab falls back to the build recipe when its brief
// cannot be used.
function currentRecipe(){
  if(state.tab!=='expert')return buildRecipe();
  return expertRecipe()||buildRecipe();
}
// Hand a recipe to the Python side: write it as JSON into the hidden Gradio textarea, fire an
// 'input' event, then click the hidden go button shortly after.
function bridge(recipe){
  const field=document.querySelector('#km_recipe textarea');
  if(!field)return;
  // The _n nonce makes the JSON differ on every mint, so Gradio always sees a value change and
  // re-runs do_mint; without it a repeated identical recipe left the textarea unchanged and the
  // second submit was dropped (the "needs a double-click" bug). The backend echoes the nonce back
  // so a stale blob from the previous run cannot repaint the old result over the new loading card.
  const stamp=Date.now().toString(36);
  const salt=Math.random().toString(36).slice(2);
  const nonce=stamp+'-'+salt;
  const json=JSON.stringify({...recipe,_n:nonce});
  activeMintNonce=nonce;
  awaitingMint=true;
  // Go through the prototype's value setter, then announce the change with a bubbling event.
  const nativeSet=Object.getOwnPropertyDescriptor(window.HTMLTextAreaElement.prototype,'value').set;
  nativeSet.call(field,json);
  field.dispatchEvent(new Event('input',{bubbles:true}));
  // NOTE: 'change' is intentionally NOT dispatched. The leaderboard fetch is triggered separately
  // through refreshLB() so it never competes with do_mint in Gradio's queue.
  const pressGo=()=>{
    const go=document.querySelector('#km_go button')||document.querySelector('#km_go');
    if(go)go.click();
  };
  setTimeout(pressGo,80);
}
// Set the leaderboard status line and its tone; kind may be 'bad', 'warn', or anything else for
// neutral.
function setLBStatus(text,kind){
  const note=$('#km-lb-status');
  if(!note)return;
  note.textContent=text;
  for(const tone of ['bad','warn'])note.classList.toggle(tone,kind===tone);
}
// Put the leaderboard refresh button into (on) or out of (off) its busy state.
function setLBButton(on){
  const btn=$('#km-lb-refresh');
  if(!btn)return;
  btn.disabled=on;
  if(on)btn.textContent='Refreshing...';
  else btn.textContent='Refresh';
}
// Fallback leaderboard load straight from the browser, bypassing the Gradio bridge.
// Fetches every URL in LB_URLS, merges their rows (dropping duplicates that share op, name,
// timestamp and both speedups) and renders the result.
// Throws when no endpoint returned a usable payload, so the caller can report the failure.
async function refreshLBDirect(){
  const rows=[], seen=new Set(), sources=[];
  for(const url of LB_URLS){
    try{
      const res=await fetch(url,{cache:'no-store'});
      if(!res.ok)continue;
      const data=await res.json();
      // A payload without a rows array is not a leaderboard answer; skip it.
      if(!data||!Array.isArray(data.rows))continue;
      // An endpoint that answered with a valid (possibly empty) rows array counts as a source.
      // Counting only non-empty answers made an empty leaderboard look like a blocked fetch.
      sources.push(url);
      data.rows.forEach(row=>{
        const key=[row.op,row.name,row.ts,row.speedup_compile,row.speedup_eager].join('|');
        if(seen.has(key))return;
        seen.add(key);
        rows.push(row);
      });
    }catch(e){
      // Best effort: an unreachable or blocked endpoint, or invalid JSON, just moves on to the
      // next URL. Total failure is reported by the throw below.
    }
  }
  if(!sources.length)throw new Error('direct leaderboard fetch failed');
  renderLB(rows,{fetched_at:Date.now()/1000,sources});
}
// Ask the Python side for a fresh leaderboard by clicking the hidden Gradio button.
// While a mint is pending the request is only queued. When the board is visible (or the user
// asked explicitly) the UI shows a busy state and arms an 8 second fallback that fetches the
// endpoints directly from the browser.
function refreshLB(manual=false){
  if(awaitingMint){
    pendingLBRefresh=true;
    if(manual)setLBStatus('Queued until the current mint finishes.','warn');
    return;
  }
  const visible=manual||state.tab==='lb';
  if(visible){
    setLBButton(true);
    setLBStatus('Checking leaderboard...','');
    if(lbUiTimer)clearTimeout(lbUiTimer);
    const onDirectFail=()=>{
      setLBButton(false);
      setLBStatus('Still waiting for the Gradio refresh bridge. Direct browser fetch was blocked. Try once more or reopen the tab.','warn');
    };
    lbUiTimer=setTimeout(()=>{refreshLBDirect().catch(onDirectFail);},8000);
  }
  const trigger=document.querySelector('#km_lb_btn button')||document.querySelector('#km_lb_btn');
  if(!trigger){
    setLBButton(false);
    setLBStatus('Refresh bridge is not mounted yet. Reopen the Leaderboard tab.','bad');
    return;
  }
  trigger.click();
}
// Schedule one non-manual leaderboard refresh after delay milliseconds, replacing any refresh
// already scheduled and clearing the queued-refresh flag.
function scheduleLBRefresh(delay=700){
  const fire=()=>{refreshLB(false);};
  pendingLBRefresh=false;
  if(lbRefreshTimer)clearTimeout(lbRefreshTimer);
  lbRefreshTimer=setTimeout(fire,delay);
}
// Start or stop the 15 second leaderboard poll. Any existing poll is always cleared first, and a
// tick only refreshes while the leaderboard tab is the active one.
function setLBPolling(on){
  if(lbPollTimer){
    clearInterval(lbPollTimer);
    lbPollTimer=null;
  }
  if(!on)return;
  lbPollTimer=setInterval(()=>{
    if(state.tab!=='lb')return;
    refreshLB(false);
  },15000);
}
// Pending timer that re-enables the mint buttons if no result ever arrives.
let mintSafetyTimer=null;
// Enable or disable both mint buttons and toggle their 'minting' class.
function setMinting(on){
  for(const sel of ['#km-build','#km-xbuild']){
    const btn=$(sel);
    if(!btn)continue;
    btn.disabled=on;
    btn.classList.toggle('minting',on);
  }
  if(mintSafetyTimer){
    clearTimeout(mintSafetyTimer);
    mintSafetyTimer=null;
  }
  // Safety net: if the result never arrives (hung backend) the buttons must not stay dead.
  // Four minutes is longer than any real mint.
  if(on)mintSafetyTimer=setTimeout(()=>setMinting(false),240000);
}
// Display name for a replay: the curated label, else the cleaned op name stored in the
// recording, else the raw key.
function replayName(key){
  const meta=REPLAY_META[key];
  if(meta&&meta.label)return meta.label;
  const recording=REPLAYS[key];
  return (recording&&cleanText(recording.op))||key;
}
// Key of the recorded mint that exactly matches the current selection, or null when there is
// none (or that recording is not available).
function matchingReplayKey(){
  const {mode,classic,slots}=state;
  if(mode==='classic'){
    return classic==='softmax'&&REPLAYS.softmax?'softmax':null;
  }
  // Every build-mode recording uses a single activation.
  if(mode!=='build'||slots.act2)return null;
  if(!slots.mem&&slots.norm==='rmsnorm'&&slots.act==='gelu'&&REPLAYS.rmsnorm_gelu)return 'rmsnorm_gelu';
  if(slots.mem&&slots.norm==='layernorm'&&slots.act==='silu'&&REPLAYS.add_layernorm_silu)return 'add_layernorm_silu';
  return null;
}
// Point the replay button at the recording that matches the current selection, or at the first
// available recording otherwise, and highlight the corresponding replay pick.
function updateReplayOffer(){
  const btn=$('#km-replay');
  if(!btn||!REPLAY_ORDER.length)return;
  const exact=matchingReplayKey();
  const key=exact||REPLAY_ORDER[0];
  btn.dataset.replayKey=key;
  btn.textContent=exact?'Watch this recorded mint':'Watch a recorded mint';
  for(const pick of $$('.km-rpick'))pick.classList.toggle('on',pick.dataset.replayKey===key);
}
function renderReplayChoices(){
const wrap=$('#km-replay-picks');
if(!wrap)return;
if(!REPLAY_ORDER.length){wrap.hidden=true;return;}
wrap.hidden=false;
wrap.innerHTML=REPLAY_ORDER.map(k=>"").join('');
updateReplayOffer();
}
// Number formatters for result cards; anything that is not a finite number renders as '?'.
// sp: one decimal, with a trailing '.0' dropped.
const sp=x=>{
  const num=+x;
  if(!Number.isFinite(num))return '?';
  return num.toFixed(1).replace(/\.0$/,'');
};
// sp2: always two decimals.
const sp2=x=>{
  const num=+x;
  return Number.isFinite(num)?num.toFixed(2):'?';
};
// sp3: up to three decimals, with trailing zeros and a dangling decimal point removed.
const sp3=x=>{
  const num=+x;
  if(!Number.isFinite(num))return '?';
  return num.toFixed(3).replace(/0+$/,'').replace(/\.$/,'');
};
// True for an attempt status of 'ok' or 'pass', compared case-insensitively.
const isOk=s=>{
  const tag=String(s||'').toLowerCase();
  return tag==='ok'||tag==='pass';
};
// Summarise a mint result's attempts as {statuses, passed, total}.
// With a statuses array, both counts come from it. Without one, passed falls back to n_verified
// (or 0) and total to k (or to passed).
function attemptSummary(d){
  const statuses=Array.isArray(d.statuses)?d.statuses:[];
  if(statuses.length){
    return {statuses,passed:statuses.filter(isOk).length,total:statuses.length};
  }
  const finiteOr=(value,fallback)=>Number.isFinite(+value)?+value:fallback;
  const passed=finiteOr(d.n_verified,0);
  const total=finiteOr(d.k,passed||0);
  return {statuses,passed,total};
}
function attemptChips(d,limit){
const a=attemptSummary(d), statuses=a.statuses.slice(0,limit??a.statuses.length);
if(!statuses.length)return '';
return "
"+(local?'local offline mint':'live mint')+""+(wake?'The live model is still waking':(local?'The local mint did not finish':'The live mint did not finish'))+"
"+
"
"+(wake?'The backend scales to zero, so the first live mint can take about 90 seconds. The replay below is a real earlier mint, not a mockup.':(local?esc(d.error):'Backend said: '+esc(d.error)))+"
"+runLabel+"Verified by the referee"+esc(d.op||'kernel')+"
"+
"
"+a.passed+" of "+(a.total||'?')+" attempts passed the referee
"+
attemptChips(d)+
compilerGuard+
// Every comparison as its own big number, strongest (most honest) baseline first, eager last
// and labelled 'unfused' so the side-by-side is self-explanatory: the model's real edge is
// the ~1.1x over the compiler; the big eager number is mostly the fusion win.
"
";
animate(v.firstElementChild,[{transform:'scale(0.96)',opacity:0},{transform:'scale(1)',opacity:1}],{duration:300,easing:'cubic-bezier(0.16,1,0.3,1)'});
setSource(d,recorded?'recorded':'live',mode);
if(!recorded&&mode==='build'){
HIST.unshift(""+esc(d.op||'kernel')+""+sp(d.speedup_maxauto||d.speedup_compile||d.speedup_eager)+"x vs compiler");
const hist=$('#km-hist');
if(hist)hist.innerHTML=HIST.slice(0,10).map(h=>"
"+h+"
").join('');
}
// The leaderboard refresh is deliberately decoupled from the mint call. Once the result has
// arrived, the queue is free again, so refresh the board without hiding the just-rendered result.
if(!recorded)scheduleLBRefresh(900);
}
// Pending timer of the replay animation, if one is running.
let replayTimer=null;
// Cancel a running replay animation.
function stopReplay(){
  if(!replayTimer)return;
  clearTimeout(replayTimer);
  replayTimer=null;
}
function startReplay(key){
const d=REPLAYS[key];
if(!d)return;
lastMintedMode=state.tab==='expert'?'expert':'build';
stopBuild();
stopReplay();
play();
const t=target(lastMintedMode), v=t.verdict;
if(!v)return;
clearSource(lastMintedMode);
const statuses=Array.isArray(d.statuses)?d.statuses:[];
const total=statuses.length||(Number.isFinite(+d.k)?+d.k:4);
const passed=statuses.length?statuses.filter(isOk).length:(Number.isFinite(+d.n_verified)?+d.n_verified:0);
v.innerHTML="
"+
"
RUN
recorded mint"+esc(replayName(key))+"This is a genuine earlier backend run. Mint still starts a live run.
"+
"
loading recorded attempts...
"+
""+
"
"+
"
The model took "+(Number.isFinite(+d.gen_seconds)?sp(d.gen_seconds)+'s':'real time')+" on this recorded run. The replay shows the referee loop without waiting for cold start.
";
const att=$('#km-replay-attempts'), stage=$('#km-replay-stage'), fill=$('#km-replay-fill');
let i=0;
const tick=()=>{
if(i"+(ok?'OK':'FAIL')+" attempt "+(i+1)+", "+esc(st)+"");
if(fill)fill.style.width=Math.round(((i+1)/(total+1))*100)+'%';
i++;
replayTimer=setTimeout(tick,360);
return;
}
if(stage)stage.textContent='best verified kernel selected';
if(fill)fill.style.width='100%';
replayTimer=setTimeout(()=>showResult(d,{recorded:true,mode:lastMintedMode,replayKey:key}),420);
};
replayTimer=setTimeout(tick,180);
}
let buildTimer=null;
function startBuild(mode,recipe){
stopReplay();
const t=target(mode), v=t.verdict;
if(!v)return;
clearSource(mode);
const k=recipe.k||4, started=Date.now(), replayKey=mode==='build'?(matchingReplayKey()||REPLAY_ORDER[0]||''):(REPLAY_ORDER[0]||'');
const local=!!recipe.local;
const stages=local?['loading local GGUF...','llama.cpp drafting '+Math.min(k,2)+' kernels...','compiling Triton on this Space GPU...','checking correctness vs PyTorch...','timing vs torch.compile max-autotune...']:
['model waking...','drafting '+k+' kernels...','compiling Triton...','checking correctness vs PyTorch...','timing vs torch.compile max-autotune...'];
const stepHtml=stages.map((s,i)=>""+esc(s)+"").join('');
v.innerHTML="
"+esc(stages[0])+""+
"
"+stepHtml+"
"+
"
"+
""+(local?"Local mode drafts up to two kernels with llama.cpp on this Space's GPU, then verifies them with the in-process referee and times them against PyTorch eager, torch.compile, and max-autotune. First run may also download/cache the GGUF.":"The model drafts up to "+k+" kernels. The referee compiles, correctness-checks, and times every candidate. First build of a session can take about 90 seconds while the model wakes.")+""+
(replayKey?"":"")+"
";
// Honest progress: do_mint is one blocking call with no progress signal, so we cannot show a
// live readout. Instead we walk the referee's real phases ONCE on a rough estimate, monotonically
// (the index only ever increases), and park on the final phase until the result arrives. It never
// loops back or resets the way the old modulo version did. A gentle pulse keeps the viz alive.
const stageStart=[0,12,18,23,28]; // seconds at which each phase is assumed to begin
let baseVals=null; try{const st=pipeline();baseVals=st[st.length-1].v.slice();}catch(e){}
buildTimer=setInterval(()=>{
const elapsed=(Date.now()-started)/1000;
let ix=0; for(let j=0;j=stageStart[j])ix=j;}
ix=Math.min(ix,stages.length-1);
const label=(ix===stages.length-1&&elapsed>45)?'still timing vs torch.compile max-autotune...':stages[ix];
const el=$('#km-bstage');
if(el&&el.textContent!==label){el.textContent=label;animate(el,[{opacity:.35,transform:'translateY(3px)'},{opacity:1,transform:'none'}],{duration:220,easing:'cubic-bezier(0.16,1,0.3,1)'});}
$$('#km-build-steps span').forEach((s,j)=>{s.classList.toggle('on',j===ix);s.classList.toggle('done',jx*p),650);}
},650);
}
// End the build progress animation: stop its interval, fill the progress bar, and redraw the
// preview.
function stopBuild(){
  if(buildTimer){
    clearInterval(buildTimer);
    buildTimer=null;
  }
  const fill=$('#km-barfill');
  if(fill)fill.style.width='100%';
  render();
}
function renderLB(rows,meta={}){
const el=$('#km-lb-view');
if(!el)return;
if(lbUiTimer){clearTimeout(lbUiTimer);lbUiTimer=null;}
setLBButton(false);
const fetched=Number(meta.fetched_at||0);
const stamp=fetched?new Date(fetched*1000).toLocaleTimeString([], {hour:'2-digit', minute:'2-digit', second:'2-digit'}):'just now';
const sourceCount=Array.isArray(meta.sources)?meta.sources.length:0;
setLBStatus('Last checked '+stamp+(sourceCount?' from '+sourceCount+' endpoint'+(sourceCount>1?'s':'')+'.':'.'),'');
if(!rows.length){el.innerHTML='No kernels yet. Be the first.';return;}
el.innerHTML="
#
machine
builder
vs compiler
vs PyTorch
"+
rows.slice(0,25).map((r,i)=>"
"+(i+1)+"
"+esc(r.op||'kernel')+"
"+esc(r.name||'anonymous')+"
"+sp2(r.speedup_compile)+"x
"+sp(r.speedup_eager)+"x
").join('')+"
";
}
// Observe a Gradio output container and hand every payload it shows to cb.
// The payload is read from the data-blob attribute of the first descendant that has one: base64
// encoded JSON. It is read once immediately and again after every DOM mutation inside the
// container.
function watch(id,cb){
  const el=document.querySelector(id);
  if(!el)return;
  // atob yields one character per byte; decode those bytes as UTF-8 so non-ASCII text in the
  // payload survives. Pure ASCII payloads decode exactly as before.
  const decode=b64=>{
    const bytes=Uint8Array.from(atob(b64),ch=>ch.charCodeAt(0));
    return new TextDecoder('utf-8').decode(bytes);
  };
  const read=()=>{
    const s=el.querySelector('[data-blob]');
    if(!s)return;
    try{
      cb(JSON.parse(decode(s.dataset.blob)));
    }catch(e){
      // Still non-fatal (a partially written blob must not break the observer), but leave a trace
      // instead of hiding decode and callback errors completely.
      if(typeof console!=='undefined'&&console.warn)console.warn('km: could not apply payload from '+id,e);
    }
  };
  new MutationObserver(read).observe(el,{childList:true,subtree:true});
  read();
}
// After a 250 ms delay, start watching the two Gradio output containers: mint results go to
// showResult, leaderboard payloads to renderLB (rows plus the payload itself as metadata).
setTimeout(()=>{watch('#km_result',showResult);watch('#km_lb',d=>renderLB(d.rows||[],d));},250);
// The visible Refresh button requests a manual leaderboard refresh.
const rf=$('#km-lb-refresh');
if(rf)rf.addEventListener('click',()=>refreshLB(true));
// Phrases recognised in an expert brief for each classic op. Each entry maps a recipe key to the
// spellings that select it; entries have the {key, terms} shape that termMatches reads.
const CLASSIC_TERMS=[
{key:'softmax_scale',terms:['scaled softmax','scale softmax','attention scale']},
{key:'swiglu',terms:['swiglu','swi glu']},
{key:'geglu',terms:['geglu','ge glu']},
{key:'softmax',terms:['softmax']}
];
// Phrases recognised in an expert brief for each activation, in the same {key, terms} shape.
const ACT_TERMS=[
{key:'gelu_erf',terms:['gelu_erf','exact gelu','erf gelu']},
{key:'relu2',terms:['relu2','relu squared','squared relu','square relu']},
{key:'leaky_relu',terms:['leaky_relu','leaky relu','leaky']},
{key:'silu',terms:['silu','swish','swishes']},
{key:'sigmoid',terms:['sigmoid']},
{key:'gelu',terms:['gelu']},
{key:'relu',terms:['relu']},
{key:'tanh',terms:['tanh']},
{key:'elu',terms:['elu']},
{key:'mish',terms:['mish']}
];
function termMatches(text,items){
// word-boundary match so 'elu' does not fire inside 'gelu', 'relu' not inside 'leaky_relu', etc.
// (underscores and digits are word chars, so \b correctly keeps relu2 and gelu_erf distinct.)
const found=[];
items.forEach(item=>{
let best=-1, bestLen=0;
item.terms.forEach(term=>{
const re=new RegExp('\\b'+term.replace(/[.*+?^${}()|[\]\\]/g,'\\$&')+'\\b');
const m=re.exec(text);
if(m&&(best<0||m.indexbestLen))){best=m.index;bestLen=term.length;}
});
if(best>=0)found.push({key:item.key,ix:best,len:bestLen});
});
found.sort((a,b)=>a.ix-b.ix||b.len-a.len);
// drop a shorter match that overlaps a longer one ('relu' inside 'leaky relu', 'softmax'
// inside 'scaled softmax'), keeping the first/longest at each span.
const kept=[];
found.forEach(m=>{ if(!kept.some(k=>m.ix{if(!acts.includes(m.key)&&acts.length<2)acts.push(m.key);});
if(!normHit&&!acts.length){
return {unknown:true,message:'Could not map this brief. Use RMSNorm, LayerNorm, residual, and a single supported activation, or a named op such as softmax, scaled softmax, SwiGLU, or GeGLU.'};
}
if(!acts.length){
if(normHit&&!residual){
return {unknown:false,label:CLASSICS[normHit].lbl,recipe:{mode:'classic',norm:normHit,residual:false,acts:[],classic:normHit,spark:raw}};
}
return {unknown:true,message:'This brief needs a supported activation after the residual or normalization so the referee can verify it.'};
}
const recipe={mode:'build',norm:normHit||'rmsnorm',residual,acts,classic:'softmax',spark:raw};
return {unknown:false,label:labelRecipe(recipe),recipe};
}
// Re-interpret the expert brief text and show the outcome: the resolved label, or the
// explanation when the brief is not understood. The expert mint button is disabled for an
// unknown brief.
function resolveExpert(){
  const box=$('#km-xbrief'), res=$('#km-xresolved'), btn=$('#km-xbuild');
  const out=res?res.querySelector('.v'):null;
  if(!box||!res||!out)return;
  resolvedExpert=resolveBrief(box.value);
  const unknown=!!resolvedExpert.unknown;
  res.classList.toggle('unknown',unknown);
  out.textContent=resolvedExpert.unknown?resolvedExpert.message:resolvedExpert.label;
  if(btn)btn.disabled=unknown;
}
// Example operations shown as tag buttons. Each entry is [tag text, brief text]; renderOpTags puts
// the brief text into the expert brief box when an interactive tag is clicked.
// The hand-written list is followed by one generated entry per activation in ACTS.
const OP_TAGS=[
['softmax','softmax over each row'],
['scaled_softmax','scaled softmax over each row'],
['swiglu','SwiGLU gated feed forward'],
['geglu','GeGLU gated feed forward'],
['rmsnorm','RMSNorm only'],
['layernorm','LayerNorm only'],
['add_rmsnorm_gelu','fused RMSNorm with residual and GELU'],
['layernorm_silu','fused LayerNorm with SiLU'],
['rmsnorm_mish','fused RMSNorm with Mish'],
['add_layernorm_silu','fused LayerNorm with residual and SiLU'],
['rmsnorm_gelu_relu2','fused RMSNorm with GELU then squared ReLU']
].concat(Object.keys(ACTS).map(k=>[k,ACTS[k].lbl+' activation']));
// Render one tag button per OP_TAGS entry into the container matched by sel.
// When interactive is truthy, clicking a tag writes its brief into #km-xbrief, fires an 'input'
// event so the brief is re-resolved, and focuses the box.
function renderOpTags(sel,interactive){
  const host=$(sel);
  if(!host)return;
  host.innerHTML='';
  const fillBrief=brief=>{
    const box=$('#km-xbrief');
    if(!box)return;
    box.value=brief;
    box.dispatchEvent(new Event('input',{bubbles:true}));
    box.focus();
  };
  for(const [label,brief] of OP_TAGS){
    const tag=document.createElement('button');
    tag.type='button';
    tag.className='km-optag';
    tag.textContent=label;
    tag.dataset.brief=brief;
    if(interactive)tag.addEventListener('click',()=>fillBrief(brief));
    host.appendChild(tag);
  }
}
function renderGlossary(){
const lb=$('#km-learn-blocks');
if(!lb)return;
const rows=[['Residual',MEM.info],...Object.values(NORMS).map(v=>[v.lbl,v.info]),...Object.values(ACTS).map(v=>[v.lbl,v.info]),...Object.values(CLASSICS).map(v=>[v.lbl,v.info])];
lb.innerHTML=rows.map(([n,d])=>"
"+esc(n)+""+esc(d)+"
").join('');
}
// Wire every .km-copy button to copy the text of its enclosing .km-codeblock (without the
// button's own label) to the clipboard, and flash the outcome on the button for 900 ms.
function setupCopyButtons(){
  $$('.km-copy').forEach(btn=>{
    let idleLabel=btn.textContent;
    let resetTimer=null;
    // Show a temporary label, then restore the idle one. The idle label is only captured while
    // no flash is pending: capturing it on every click made a quick second click remember
    // 'Copied' as the label to restore, leaving the button stuck on it.
    const flash=label=>{
      if(resetTimer===null)idleLabel=btn.textContent;
      else clearTimeout(resetTimer);
      btn.textContent=label;
      resetTimer=setTimeout(()=>{
        btn.textContent=idleLabel;
        resetTimer=null;
      },900);
    };
    btn.addEventListener('click',()=>{
      const block=btn.closest('.km-codeblock');
      if(!block)return;
      // Work on a clone so the button can be removed from the copied text without touching the page.
      const clone=block.cloneNode(true), copy=clone.querySelector('.km-copy');
      if(copy)copy.remove();
      const text=clone.textContent.trim();
      if(navigator.clipboard&&navigator.clipboard.writeText){
        // Report the real outcome instead of claiming success when the write is rejected.
        navigator.clipboard.writeText(text).then(()=>flash('Copied'),()=>flash('Copy failed'));
      }else{
        // No async clipboard API available (for example an insecure context): nothing was copied.
        flash('Copy failed');
      }
    });
  });
}
// Delegated click handler: any element inside the widget that carries data-replay-key starts the
// recorded replay with that key, provided the recording exists.
root.addEventListener('click',e=>{
const b=e.target.closest('[data-replay-key]');
if(!b||!root.contains(b))return;
const key=b.dataset.replayKey;
if(!REPLAYS[key])return;
e.preventDefault();
startReplay(key);
});
// Re-resolve the expert brief on every edit of the brief box.
const xb=$('#km-xbrief');
if(xb)xb.addEventListener('input',resolveExpert);
// (second-activation builder removed: the kernelsmith grammar is norm + one activation)
// Build page mint button: play the stage preview, lock the mint buttons, show the progress card,
// then send the recipe to the backend.
const buildBtn=$('#km-build');
if(buildBtn)buildBtn.addEventListener('click',()=>{
lastMintedMode='build';
const r=buildRecipe();
animate(buildBtn,[{transform:'scale(0.97)'},{transform:'scale(1)'}],{duration:140,easing:'cubic-bezier(0.16,1,0.3,1)'});
play();
setMinting(true);
startBuild('build',r);
bridge(r);
});
// Expert page mint button: re-resolve the brief first and do nothing when it yields no recipe.
const xBuild=$('#km-xbuild');
if(xBuild)xBuild.addEventListener('click',()=>{
resolveExpert();
const r=expertRecipe();
if(!r)return;
lastMintedMode='expert';
animate(xBuild,[{transform:'scale(0.97)'},{transform:'scale(1)'}],{duration:140,easing:'cubic-bezier(0.16,1,0.3,1)'});
setMinting(true);
startBuild('expert',r);
bridge(r);
});
// Attempt steppers for the build and expert pages; the callbacks keep buildTries and expertTries
// in sync with the selected value.
setupStepper('#km-tries','#km-tries-v',4,v=>{buildTries=v;});
setupStepper('#km-xtries','#km-xtries-v',5,v=>{expertTries=v;});
// Temperature sliders and their readouts.
setupTemp('#km-temp','#km-temp-v');
setupTemp('#km-xtemp','#km-xtemp-v');
// Local and Pro are mutually exclusive on each page.
syncModePair('#km-local','#km-pro');
syncModePair('#km-xlocal','#km-xpro');
// The build and expert pages each have their own "Local (offline)" toggle. Local mode is the ONLY
// path that computes the max-autotune number, so a page in Modal mode shows just 2 comparisons.
// Keep both toggles in lockstep so build and expert always run the same way and show the same
// set of comparison numbers (the guard stops the change events from ping-ponging).
(function(){
const a=$('#km-local'), b=$('#km-xlocal');
if(!a||!b)return;
a.addEventListener('change',()=>{ if(b.checked!==a.checked){ b.checked=a.checked; b.dispatchEvent(new Event('change')); } });
b.addEventListener('change',()=>{ if(a.checked!==b.checked){ a.checked=b.checked; a.dispatchEvent(new Event('change')); } });
})();
// Initial render: pickers, classic ops, replay choices, op tags (clickable on the expert page,
// static in the docs), glossary, copy buttons and the chart.
renderPickers();
renderClassics();
renderReplayChoices();
renderOpTags('#km-oplist',true);
renderOpTags('#km-docs-ops',false);
renderGlossary();
setupCopyButtons();
buildSvg();
// Initial state: info text, expert brief resolution, build tab, and a first preview draw.
setInfo(machineInfo());
resolveExpert();
showTab('build');
render();
}
"""
with gr.Blocks(title="Kernel Mint", head=f"") as demo:
    # NOTE(review): `head` is an f-string with no placeholders and no content; confirm that
    # nothing was meant to be interpolated into the page head here.

    # The whole interactive surface is a single custom HTML widget.
    gr.HTML(value=APP_HTML)

    # Bridge components that carry data between the custom JS and Python. They have to stay in
    # the DOM and are hidden with CSS (#km_recipe etc.): visible=False would remove them
    # entirely, and then the custom JS could not reach the inner textarea/button.
    recipe_in = gr.Textbox(elem_id="km_recipe")
    result_out = gr.HTML(elem_id="km_result")
    lb_out = gr.HTML(elem_id="km_lb")
    go = gr.Button(value="go", elem_id="km_go")
    lb_btn = gr.Button(value="lb", elem_id="km_lb_btn")

    # Mint trigger. trigger_mode="multiple" is deliberate: the default ("once") silently drops
    # a click that arrives while an earlier mint is still running, which meant minting kernels
    # back to back took two clicks and left the old result on screen. The frontend already
    # disables the Mint button while a mint is in flight (one at a time, no wasted ZeroGPU
    # calls); this setting is the second line of defence so no click is ever swallowed.
    go.click(
        fn=do_mint,
        inputs=recipe_in,
        outputs=result_out,
        trigger_mode="multiple",
    )

    # Leaderboard refresh has its own trigger, independent of minting. fetch_lb used to hang off
    # recipe_in.change, so every mint (which sets recipe_in) also queued a leaderboard fetch; on
    # a slow backend that fetch sat ahead of do_mint in Gradio's queue and the mint looked like
    # it did nothing until a second click. The JS now clicks #km_lb_btn only when it actually
    # wants the board.
    lb_btn.click(fn=fetch_lb, inputs=None, outputs=lb_out)

    # On page load: fill the leaderboard once, then run the custom frontend script.
    demo.load(fn=fetch_lb, inputs=None, outputs=lb_out)
    demo.load(fn=None, inputs=None, outputs=None, js=APP_JS)
if __name__ == "__main__":
    # Script entry point: listen on every interface (0.0.0.0), on the port named by $PORT when
    # that variable is set and on 7860 otherwise.
    listen_port = int(os.environ.get("PORT", 7860))
    demo.launch(server_name="0.0.0.0", server_port=listen_port)