Spaces:
Running on Zero
Running on Zero
| """OUROBOROS Kernel Mint: a GPU-kernel demo (Gradio + custom JS). | |
| Compose a fused operation, and a 1B model writes a real Triton kernel for it while an | |
| immutable referee checks correctness against PyTorch and times it against torch.compile. The | |
| whole interactive surface is a custom JS widget in one gr.HTML; the mint runs through Python | |
| (no CORS) and hits the 1B model (default) or the 27B (Pro mode). | |
| """ | |
| from __future__ import annotations | |
| import base64 | |
| import json | |
| import os | |
| import re | |
| import sys | |
| import time | |
| import gradio as gr | |
| import requests | |
| # `spaces.GPU` only matters on ZeroGPU, where the GPU is granted lazily per call. On a dedicated | |
| # GPU Space (e.g. L4) or locally the GPU is always attached, so the decorator must be a plain | |
| # pass-through — otherwise it can fight the always-present CUDA context. Gate on the env var HF sets | |
| # only on ZeroGPU hardware, so the same app.py runs unchanged on ZeroGPU, a paid GPU, and a laptop. | |
# Bind `spaces` to the real package only when SPACES_ZERO_GPU is set; in every other case
# (env var unset, or the import itself failing) bind it to a pass-through stand-in.
try:
    if not os.environ.get("SPACES_ZERO_GPU"):
        # Raised on purpose so control falls into the except branch that installs the shim.
        raise ImportError("not on ZeroGPU: use the no-op GPU shim")
    import spaces
except Exception:
    class _SpacesShim:
        # Minimal stand-in exposing only `GPU`.
        def GPU(*_args, **_kwargs):
            # No explicit `self`: when called on the instance it lands in *_args and is ignored,
            # together with any decorator options. Only the call form `@spaces.GPU(...)` is
            # supported; used bare (`@spaces.GPU`) this would return `_decorator`, not `fn`.
            def _decorator(fn):
                return fn
            return _decorator
    spaces = _SpacesShim()
# Remote mint endpoints (overridable by env var): the default backend and the Pro backend.
ONE_B = os.environ.get("BACKEND_URL", "https://ymrohit--ouroboros-kernel-mint-mint-mint.modal.run")
PRO = os.environ.get("BACKEND_PRO_URL", "https://ymrohit--ouroboros-kernel-mint-pro-mint-mint.modal.run")
# Directories resolved relative to this file.
ROOT = os.path.dirname(__file__)
REFEREE = os.path.join(ROOT, "referee")
SEED_DIR = os.path.join(ROOT, "seed_kernels")
# Hugging Face repos tried, in this order, when resolving the local GGUF.
LOCAL_FINE_TUNED_REPO = os.environ.get(
    "LOCAL_GGUF_REPO", "YMRohit/ouroboros-kernelsmith-minicpm5-1b-GGUF")
LOCAL_BASE_REPO = os.environ.get("LOCAL_GGUF_FALLBACK_REPO", "openbmb/MiniCPM5-1B-GGUF")
# Quantization tags in preference order, parsed from a comma-separated list; blanks are dropped.
LOCAL_QUANT_PREFS = tuple(q.strip() for q in os.environ.get(
    "LOCAL_GGUF_QUANTS", "Q5_K_M,Q6_K,Q4_K_M,Q8_0,F16,BF16").split(",") if q.strip())
# Upper bound on attempts per local mint; a non-integer env value raises ValueError at import.
LOCAL_MAX_ATTEMPTS = int(os.environ.get("LOCAL_MAX_ATTEMPTS", "2"))
# Process-wide cache of the llama.cpp model and the GGUF path it was loaded from.
_LOCAL_LLM = None
_LOCAL_LLM_PATH = None
def _ensure_referee_path():
    """Put the bundled referee directory at the front of ``sys.path`` (idempotent)."""
    if REFEREE in sys.path:
        return
    sys.path.insert(0, REFEREE)
# System prompt placed first in every local-mode conversation sent to the model.
LOCAL_SYS = ("You are an expert GPU kernel engineer. Write a single correct, fast Triton "
             "kernel. Output ONLY one fenced python code block defining `run(*inputs)` and "
             "its @triton.jit kernel. Accumulate reductions in float32. No prose.")
| # The model was trained (rl_kernelsmith.py Proposer.prompt) with a real per-op SEED kernel as the | |
| # style guide, and for almost every op that exemplar is rmsnorm's ROW-WISE reduction. We must hand | |
| # the model that SAME exemplar at inference, or it goes off-distribution and writes flat elementwise | |
| # code for reduction ops (wrong results). Seeds are bundled in ./seed_kernels; this inline copy is | |
| # only a fallback if a file is missing. | |
| _FALLBACK_EXEMPLAR = """# GOLD seed kernel: fused RMSNorm, one row per program, fp32 accumulation. | |
| @triton.jit | |
| def _rmsnorm_kernel(x_ptr, w_ptr, y_ptr, stride, N, eps, BLOCK: tl.constexpr): | |
| row = tl.program_id(0) | |
| x_ptr += row * stride | |
| y_ptr += row * stride | |
| acc = tl.zeros([BLOCK], dtype=tl.float32) | |
| for off in range(0, N, BLOCK): | |
| cols = off + tl.arange(0, BLOCK) | |
| x = tl.load(x_ptr + cols, mask=cols < N, other=0.0).to(tl.float32) | |
| acc += x * x | |
| rms = tl.rsqrt(tl.sum(acc) / N + eps) | |
| for off in range(0, N, BLOCK): | |
| cols = off + tl.arange(0, BLOCK) | |
| mask = cols < N | |
| x = tl.load(x_ptr + cols, mask=mask, other=0.0).to(tl.float32) | |
| w = tl.load(w_ptr + cols, mask=mask, other=0.0).to(tl.float32) | |
| tl.store(y_ptr + cols, (x * rms * w), mask=mask) | |
| def run(x, w): | |
| M, N = x.shape | |
| y = torch.empty_like(x) | |
| _rmsnorm_kernel[(M,)](x, w, y, x.stride(0), N, 1e-6, BLOCK=1024) | |
| return y | |
| """ | |
def _load_seed_kernel(op: str) -> str:
    """Return the bundled seed-kernel source for ``op``.

    Reads ``<SEED_DIR>/<op>.py``. If the file cannot be opened or read (``OSError``),
    the inline ``_FALLBACK_EXEMPLAR`` is returned instead.
    """
    seed_path = os.path.join(SEED_DIR, f"{op}.py")
    try:
        # Explicit UTF-8 so the prompt text does not depend on the host locale
        # (matches how this file reads its other bundled text assets).
        with open(seed_path, encoding="utf-8") as f:
            return f.read()
    except OSError:
        return _FALLBACK_EXEMPLAR
def extract_kernel(text: str) -> str:
    """Pull a fenced Python kernel out of a llama.cpp completion.

    Takes the first fenced block when one exists (otherwise the whole text), then drops
    anything before the earliest kernel marker and strips surrounding whitespace.
    """
    raw = text or ""
    fenced = re.search(r"```(?:python)?\s*(.*?)```", raw, re.S)
    body = fenced.group(1) if fenced else raw

    # Offsets of every marker that actually occurs in the body.
    offsets = []
    for marker in ("@triton", "import ", "def run", "def _"):
        at = body.find(marker)
        if at >= 0:
            offsets.append(at)

    start = min(offsets) if offsets else 0
    return body[start:].strip()
def _local_op_from_recipe(r: dict) -> tuple[str, str]:
    """Translate a UI recipe into ``(op, label)``.

    Classic mode returns the named op for both values and raises ``ValueError`` when it is
    not in ``SPECS``. Compose mode returns a named op when the recipe has exactly one
    activation and the resulting name exists in ``SPECS``; otherwise it returns a
    ``chain|<norm>|<0 or 1>|<acts>`` descriptor. ``label`` is the human-readable pipeline.
    """
    _ensure_referee_path()
    from specs import SPECS

    if r.get("mode") == "classic":
        classic_op = (r.get("classic") or "softmax").strip()
        if classic_op in SPECS:
            return classic_op, classic_op
        raise ValueError(f"unknown op '{classic_op}'")

    # Anything whose norm does not start with "rms" is treated as layernorm.
    if str(r.get("norm", "rmsnorm")).startswith("rms"):
        normfull, norm, norm_title = "rmsnorm", "rms", "RMSNorm"
    else:
        normfull, norm, norm_title = "layernorm", "layer", "LayerNorm"

    residual = bool(r.get("residual"))
    # At most three non-empty activations; default to gelu when none are given.
    acts = [a for a in (r.get("acts") or []) if a][:3] or ["gelu"]

    label = f"{'residual + ' if residual else ''}{norm_title} -> {' -> '.join(acts)}"
    named = f"{'add_' if residual else ''}{normfull}_{acts[0]}"
    if len(acts) == 1 and named in SPECS:
        return named, label
    return f"chain|{norm}|{'1' if residual else '0'}|{','.join(acts)}", label
def _local_prompt_messages(op: str, spark: str = "") -> list[dict[str, str]]:
    """Build the system + user chat messages asking the model to write a kernel for ``op``.

    The user message carries the op's notes and signature hint from its spec, a seed kernel
    for a different op as a style guide, and (when given) the user's free-text ``spark``.
    """
    _ensure_referee_path()
    from specs import get_spec, SPECS

    spec = get_spec(op)

    # Same exemplar rule as training: the first op in SPECS that is not this one. If SPECS
    # holds nothing else, fall back to the op itself.
    exemplar_op = op
    for candidate in SPECS:
        if candidate != op:
            exemplar_op = candidate
            break
    exemplar = _load_seed_kernel(exemplar_op)

    parts = [
        f"Op `{op}`: {spec.notes}\nSignature:\n{spec.signature_hint}\n\n",
        f"Here is a valid Triton kernel for a DIFFERENT op (`{exemplar_op}`) as a style guide:\n",
        f"```python\n{exemplar}\n```\n",
    ]
    if spark:
        parts.append(f"\nThe person building this kernel added their own idea: \"{spark}\". ")
        parts.append("Honour their idea where you can, but correctness is mandatory.")

    return [
        {"role": "system", "content": LOCAL_SYS},
        {"role": "user", "content": "".join(parts)},
    ]
def _render_prompt(messages: list[dict[str, str]]) -> str:
    """Render chat messages as a ChatML prompt ending in an opened assistant turn.

    MiniCPM5 uses ChatML (<|im_start|>/<|im_end|>). It is a reasoning model, so the assistant
    turn is opened with an empty <think></think> block (the enable_thinking=False convention)
    to make it answer directly with the kernel instead of emitting a reasoning trace.
    """
    turns = [f"<|im_start|>{m['role']}\n{m['content']}<|im_end|>\n" for m in messages]
    assistant_open = "<|im_start|>assistant\n<think>\n\n</think>\n\n"
    return "".join(turns) + assistant_open
def _common_gguf_names() -> list[str]:
    """List conventional GGUF filenames to try, in quant-preference order.

    For each preferred quant and each known stem, yields ``<stem>-<quant>.gguf`` followed by
    ``<stem>.<quant>.gguf``.
    """
    stems = ("MiniCPM5-1B", "minicpm5-1b", "model", "ggml-model")
    return [
        f"{stem}{sep}{q}.gguf"
        for q in LOCAL_QUANT_PREFS
        for stem in stems
        for sep in ("-", ".")
    ]
def _download_gguf_from_repo(repo_id: str) -> str:
    """Download one GGUF file from ``repo_id`` and return its local path.

    Selection order:
      1. ``LOCAL_GGUF_FILE`` if set (downloaded as-is; errors propagate).
      2. The first listed ``.gguf`` whose name contains a preferred quant tag
         (case-insensitive substring match, in ``LOCAL_QUANT_PREFS`` order).
      3. The first listed ``.gguf`` of any kind.
      4. Conventional filenames from ``_common_gguf_names()``, tried one by one.

    ``LOCAL_GGUF_CACHE`` (optional) is passed through as the download cache directory.

    Raises:
        RuntimeError: when nothing could be downloaded. The most recent underlying error
            (download failure, or the repo-listing failure if no download was attempted)
            is attached as the cause.
    """
    from huggingface_hub import HfApi, hf_hub_download

    cache_dir = os.environ.get("LOCAL_GGUF_CACHE")

    def _fetch(filename):
        return hf_hub_download(repo_id=repo_id, filename=filename, cache_dir=cache_dir)

    explicit = os.environ.get("LOCAL_GGUF_FILE")
    if explicit:
        return _fetch(explicit)

    # Listing is best-effort: if it fails we still try conventional names below, but we keep
    # the error so the final message is not an uninformative "None".
    last = None
    try:
        files = HfApi().list_repo_files(repo_id)
    except Exception as e:
        last = e
        files = []

    ggufs = [f for f in files if f.lower().endswith(".gguf")]
    for quant in LOCAL_QUANT_PREFS:
        hit = next((f for f in ggufs if quant.lower() in f.lower()), None)
        if hit:
            return _fetch(hit)
    if ggufs:
        return _fetch(ggufs[0])

    for filename in _common_gguf_names():
        try:
            return _fetch(filename)
        except Exception as e:
            last = e
    # Chain the root cause so the traceback shows why the last attempt failed.
    raise RuntimeError(f"no GGUF found in {repo_id}: {last}") from last
def _resolve_local_gguf() -> str:
    """Return a local GGUF path, trying the fine-tuned repo first and then the base repo.

    Raises:
        RuntimeError: if both repos fail; the message lists each repo's error, truncated.
    """
    failures = []
    for candidate in (LOCAL_FINE_TUNED_REPO, LOCAL_BASE_REPO):
        try:
            path = _download_gguf_from_repo(candidate)
        except Exception as exc:
            failures.append(f"{candidate}: {type(exc).__name__}: {str(exc)[:180]}")
        else:
            return path
    raise RuntimeError("could not load a local GGUF. " + " | ".join(failures))
# Set on the first call to _preload_cuda_libs so the directory scan runs at most once.
_CUDA_PRELOADED = False
def _preload_cuda_libs():
    """Best-effort preload of the CUDA runtime/cuBLAS shared libraries with RTLD_GLOBAL.

    Runs once per process (guarded by ``_CUDA_PRELOADED``). Every failure is swallowed; the
    function returns nothing and never raises for a missing library.
    """
    # The CUDA llama.cpp wheel's libllama.so links libcudart.so.12 / libcublas*.so.12, which on HF
    # Spaces live inside torch's bundled nvidia-* packages and are NOT on the default loader path,
    # so the import dies with "libcudart.so.12: cannot open shared object file". Preload them with
    # RTLD_GLOBAL (in dependency order) so libllama.so resolves their symbols. No-op if absent.
    global _CUDA_PRELOADED
    if _CUDA_PRELOADED:
        return
    # Flag is set before the scan, so a partially failed attempt is not retried.
    _CUDA_PRELOADED = True
    import ctypes
    import glob
    # Candidate directories, searched in this order: torch's lib dir, site-packages, fixed paths.
    roots = []
    try:
        import torch
        roots.append(os.path.join(os.path.dirname(torch.__file__), "lib"))
    except Exception:
        pass
    try:
        import site
        roots += list(site.getsitepackages() if hasattr(site, "getsitepackages") else [])
    except Exception:
        pass
    roots += ["/usr/local/lib/python3.10/site-packages", "/usr/local/cuda/lib64",
              "/usr/lib/x86_64-linux-gnu", "/usr/local/lib"]
    # Match versioned sonames too (e.g. libcudart.so.12.4.127 with no libcudart.so.12 symlink);
    # RTLD_GLOBAL makes the symbols visible to libllama.so regardless of the exact soname.
    for pat in ("libcudart.so*", "libcublas.so*", "libcublasLt.so*"):
        loaded = False
        for root in roots:
            # Stop scanning further roots once one file for this pattern has loaded.
            if loaded:
                break
            try:
                hits = sorted(glob.glob(os.path.join(root, "**", pat), recursive=True))
            except Exception:
                hits = []
            for hit in hits:
                try:
                    ctypes.CDLL(hit, mode=ctypes.RTLD_GLOBAL)
                    loaded = True
                    break
                except Exception:
                    # This candidate would not load; try the next match.
                    continue
# Confirm the CUDA llama.cpp lib loads at startup (loading the .so needs no GPU device, so it spends
# no ZeroGPU quota). It also warms the import so the in-@spaces.GPU call is instant.
# Skipped when LOCAL_LLAMA_GPU_LAYERS is exactly "0" (CPU-only). A failure here is only logged;
# it never aborts module import.
if os.environ.get("LOCAL_LLAMA_GPU_LAYERS", "-1") != "0":
    try:
        _preload_cuda_libs()
        import llama_cpp as _lc_probe
        print(f"[startup] llama_cpp {_lc_probe.__version__} loaded OK", flush=True)
    except Exception as _e:
        print(f"[startup] llama_cpp load FAILED: {type(_e).__name__}: {str(_e)[:200]}", flush=True)
def _get_local_llm():
    """Return a llama.cpp ``Llama`` for the local GGUF, cached when it is safe to cache.

    Default is full GPU offload (``LOCAL_LLAMA_GPU_LAYERS=-1``). On ZeroGPU with any GPU
    offload, the GPU is granted only inside @spaces.GPU and detached between calls, so a
    GPU-resident model can't be reused across mints: it is rebuilt on every call and never
    stored. Everywhere else (dedicated GPU, or CPU via ``LOCAL_LLAMA_GPU_LAYERS=0``) the
    first instance is kept in ``_LOCAL_LLM`` and reused.
    """
    global _LOCAL_LLM, _LOCAL_LLM_PATH

    gpu_layers = int(os.environ.get("LOCAL_LLAMA_GPU_LAYERS", "-1"))
    on_zero = bool(os.environ.get("SPACES_ZERO_GPU"))
    rebuild_every_call = on_zero and gpu_layers != 0

    if not rebuild_every_call and _LOCAL_LLM is not None:
        return _LOCAL_LLM

    if gpu_layers != 0:
        _preload_cuda_libs()
    from llama_cpp import Llama

    _LOCAL_LLM_PATH = _resolve_local_gguf()
    default_threads = max(1, (os.cpu_count() or 4) - 1)
    threads = int(os.environ.get("LOCAL_LLAMA_THREADS", str(default_threads)))
    context_len = int(os.environ.get("LOCAL_LLAMA_CTX", "4096"))
    verbose = bool(int(os.environ.get("LOCAL_LLAMA_VERBOSE", "0")))

    model = Llama(
        model_path=_LOCAL_LLM_PATH,
        n_ctx=context_len,
        n_threads=threads,
        n_gpu_layers=gpu_layers,
        verbose=verbose,
    )
    if rebuild_every_call:
        return model
    _LOCAL_LLM = model
    return model
def _llama_complete(messages: list[dict[str, str]], temp: float, seed: int) -> str:
    """Run one raw completion on the local model and return the generated text.

    Uses raw completion with our think-suppressed ChatML render (not create_chat_completion,
    which leaves the reasoning trace on and makes a 1B ramble instead of writing the kernel).
    ``LOCAL_MAX_TOKENS`` (default 768) bounds the output length.
    """
    llm = _get_local_llm()
    sampling = {
        "max_tokens": int(os.environ.get("LOCAL_MAX_TOKENS", "768")),
        "temperature": temp,
        "top_p": 0.97,
        "seed": seed,
        "stop": ["<|im_end|>", "<|im_start|>"],
    }
    completion = llm.create_completion(_render_prompt(messages), **sampling)
    first_choice = completion["choices"][0]
    return first_choice["text"]
def _local_gpu_duration() -> int:
    """Return the ZeroGPU reservation window in seconds for a local mint.

    This is a reservation window, not a benchmark setting. ``LOCAL_GPU_DURATION`` is the
    requested value and ``LOCAL_GPU_DURATION_CAP`` the ceiling; both default to 120 and fall
    back to 120 when unparsable. The result is never below 45 and never above
    ``max(45, cap)``, so slower/cold hardware can raise the cap without code changes.
    """
    def _env_seconds(name):
        # Unparsable values fall back to the default instead of failing at import time.
        try:
            return int(os.environ.get(name, "120"))
        except Exception:
            return 120

    requested = _env_seconds("LOCAL_GPU_DURATION")
    cap = _env_seconds("LOCAL_GPU_DURATION_CAP")
    ceiling = max(45, cap)
    return max(45, min(ceiling, requested))
| LOCAL_GPU_DURATION = _local_gpu_duration() | |
def _local_gpu_mint(op: str, k: int, temp: float, spark: str) -> dict:
    """Draft up to ``k`` kernels with the local model, then verify and time each one.

    Generation AND verification happen inside this one call. That is required on ZeroGPU,
    where the GPU exists only inside @spaces.GPU; off ZeroGPU the same code path works on a
    dedicated GPU and locally too.

    Returns a dict with ``n_srcs`` (non-empty kernels extracted), ``statuses`` (one entry per
    extracted kernel; ``"runtime_fail"`` when the referee itself raised), and ``best`` (the
    fastest kernel with status ``"ok"`` as ``{"source", "result"}``, or ``None``).
    """
    _ensure_referee_path()
    # Time every verified kernel against eager AND torch.compile (default + max-autotune), so
    # local mode shows the honest baselines, not just the inflated vs-eager fusion win.
    from harness import evaluate_inprocess_full

    messages = _local_prompt_messages(op, spark)

    # One distinct seed per attempt, derived from the wall clock and kept within 31 bits.
    base_seed = int(time.time() * 1000) & 0x7FFFFFFF
    candidates = []
    for attempt in range(k):
        completion = _llama_complete(messages, temp=temp, seed=base_seed + attempt)
        kernel_src = extract_kernel(completion)
        if kernel_src:
            candidates.append(kernel_src)

    n_shapes = int(os.environ.get("LOCAL_REFEREE_SHAPES", "2"))
    n_iters = int(os.environ.get("LOCAL_REFEREE_ITERS", "30"))

    statuses = []
    best = None
    for idx, kernel_src in enumerate(candidates):
        try:
            res = evaluate_inprocess_full(kernel_src, op, n_shapes=n_shapes, n_iters=n_iters, seed=idx)
        except Exception:
            statuses.append("runtime_fail")
            continue
        statuses.append(res.status)
        if res.status != "ok":
            continue
        # Keep the FASTEST verified kernel (lowest latency = best vs every baseline at once).
        if best is None or res.latency_ms < best["result"]["latency_ms"]:
            best = {"source": kernel_src, "result": res.to_dict()}

    return {"n_srcs": len(candidates), "statuses": statuses, "best": best}
| def _local_explain(label: str, res) -> str: | |
| if res is None or res.get("status") != "ok": | |
| status = "no verified attempt" if res is None else res.get("status", "failed") | |
| return (f"The local referee rejected these kernels ({status}). It still compiled and " | |
| "checked them inside this Space; try minting again or simplify the operation.") | |
| se = float(res.get("speedup_eager", 0) or 0) | |
| honest = res.get("speedup_maxauto") or res.get("speedup_compile") | |
| bl = "torch.compile max-autotune" if res.get("speedup_maxauto") else "torch.compile" | |
| if honest: | |
| return (f"Verified correct. {float(honest):.2f}x faster than {bl} (the honest baseline that " | |
| f"also fuses and autotunes), and {se:.0f}x vs unfused PyTorch eager. The whole loop " | |
| "ran in this Space: the 1B wrote it via llama.cpp, the referee compiled, checked it " | |
| "against PyTorch, and timed it.") | |
| return (f"Verified correct, {se:.0f}x faster than unfused PyTorch eager. The 1B wrote it via " | |
| "llama.cpp and the referee checked and timed it, all inside this Space.") | |
def local_mint(recipe: dict) -> dict:
    """Mint a kernel entirely inside this process and return the result payload for the UI.

    Steps: resolve the recipe to an op, decline unsupported chains up front, clamp the
    sampling parameters, pre-fetch the GGUF, then generate + verify via ``_local_gpu_mint``.

    Args:
        recipe: UI recipe dict. Reads ``k`` (clamped to 1..LOCAL_MAX_ATTEMPTS), ``temp``
            (clamped to 0.2..1.3), ``spark`` (trimmed to 200 chars), plus the fields
            consumed by ``_local_op_from_recipe``.

    Returns:
        A dict that always has ``op``, ``verified``, ``local``, ``baseline``, ``statuses``,
        ``gen_seconds``, ``verify_seconds``, ``speedup_compile``, ``k`` and ``explanation``.
        Verified results add the speedups, timings, ``source`` and ``n_verified``; declined
        chains add ``unsupported: True``.

    Raises:
        Whatever ``_local_op_from_recipe``, ``_resolve_local_gguf`` or ``_local_gpu_mint``
        raise, and ``ValueError``/``TypeError`` for a non-numeric ``k`` or ``temp``.
    """
    # The 1B reliably writes the named ops and single-activation machines (norm + optional
    # residual + one activation), which is what it was trained on. Anything that resolves to a
    # "chain|..." descriptor is off-distribution, so local mode declines it with a clear
    # message instead of looping through rejected attempts.
    # NOTE(review): the message below says "stacks two activations", but a chain descriptor is
    # also produced for three activations or for a single activation whose named op is not in
    # SPECS - confirm the wording against the UI's allowed choices.
    op, label = _local_op_from_recipe(recipe)

    # Fields shared by every unverified outcome.
    unverified = {"op": label, "verified": False, "local": True, "baseline": "eager",
                  "verify_seconds": 0.0, "speedup_compile": None}

    if op.startswith("chain|"):
        return {**unverified, "statuses": [], "gen_seconds": 0.0, "unsupported": True, "k": 0,
                "explanation": ("This machine stacks two activations, which is outside what the "
                                "kernelsmith models were trained on (the grammar is a norm plus one "
                                "activation), so neither the 1B nor the 27B writes it reliably. Drop "
                                "to a single activation and it mints with a verified kernel.")}

    k = max(1, min(LOCAL_MAX_ATTEMPTS, int(recipe.get("k", 1))))
    temp = max(0.2, min(1.3, float(recipe.get("temp", 0.7))))
    spark = (recipe.get("spark") or "").strip()[:200]

    # Pre-fetch the GGUF to disk OUTSIDE the GPU window, so the ZeroGPU budget is spent on
    # generate + verify rather than on the model download.
    _resolve_local_gguf()

    t0 = time.time()
    work = _local_gpu_mint(op, k, temp, spark)
    # Generation and verification share one window, so only the combined time is known; it is
    # reported as gen_seconds and verify_seconds stays 0.0.
    total_s = round(time.time() - t0, 1)

    statuses = work.get("statuses") or []
    best = work.get("best")

    if not work.get("n_srcs"):
        return {**unverified, "statuses": statuses, "gen_seconds": total_s, "k": k,
                "explanation": "llama.cpp returned no parseable kernel this time. Try minting again."}
    if not best:
        return {**unverified, "statuses": statuses, "gen_seconds": total_s, "k": k,
                "explanation": _local_explain(label, None)}

    res = best["result"]

    def _su(key):
        # Missing, None or zero speedups are reported as None rather than 0.0.
        v = res.get(key)
        return round(float(v), 2) if v else None

    return {"op": label, "verified": True, "local": True, "baseline": "compile",
            # `or 0` keeps a present-but-None value from raising TypeError in float(),
            # matching how _local_explain reads the same field.
            "speedup_eager": round(float(res.get("speedup_eager") or 0), 2),
            "speedup_compile": _su("speedup_compile"), "speedup_maxauto": _su("speedup_maxauto"),
            "latency_ms": res.get("latency_ms"), "eager_ms": res.get("eager_ms"),
            "compile_ms": res.get("compile_ms"), "maxauto_ms": res.get("maxauto_ms"),
            "source": best["source"], "statuses": statuses, "gen_seconds": total_s,
            "verify_seconds": 0.0, "explanation": _local_explain(label, res),
            "n_verified": sum(1 for s in statuses if s == "ok"), "k": k}
def lb_url(u):
    """Derive a backend's leaderboard URL from its mint URL.

    URLs that do not contain the mint suffix are returned unchanged.
    """
    mint_suffix = "-mint-mint.modal.run"
    leaderboard_suffix = "-leaderboard.modal.run"
    return u.replace(mint_suffix, leaderboard_suffix)
def _blob(d):
    """Wrap ``d`` as base64-encoded JSON inside a ``<span data-blob='...'>`` element."""
    payload = json.dumps(d).encode()
    encoded = base64.b64encode(payload).decode()
    return f"<span data-blob='{encoded}'></span>"
def _with_nonce(d: dict, r: dict) -> dict:
    """Return a shallow copy of ``d`` carrying the recipe's ``_n`` value when it has one.

    ``d`` may be ``None`` or empty (treated as ``{}``); the input dict is never mutated.
    """
    merged = dict(d or {})
    nonce = r.get("_n")
    if nonce is None:
        return merged
    merged["_n"] = nonce
    return merged
def do_mint(recipe_json: str) -> str:
    """Handle one mint request from the UI and return the result as an HTML blob.

    Args:
        recipe_json: JSON-encoded recipe object. ``local`` selects the in-process path,
            ``pro`` selects the Pro backend; otherwise the default backend is used.

    Returns:
        ``_blob(...)`` of the result dict. Failures are reported inside the blob as an
        ``error`` field rather than raised; the recipe's ``_n`` value is echoed back when
        present.
    """
    try:
        r = json.loads(recipe_json or "{}")
    except Exception:
        return _blob({"error": "bad recipe"})
    # Valid JSON that is not an object (a list, number, string...) has no `.get`.
    if not isinstance(r, dict):
        return _blob({"error": "bad recipe"})

    if r.get("local"):
        try:
            return _blob(_with_nonce(local_mint(r), r))
        except Exception as e:
            return _blob(_with_nonce(
                {"error": f"Local offline mint failed: {type(e).__name__}: {str(e)[:240]}",
                 "local": True, "baseline": "eager"}, r))

    base = PRO if r.get("pro") else ONE_B
    if r.get("mode") == "classic":
        payload = {"op": r.get("classic", "softmax")}
    else:
        payload = {"blocks": {"norm": r.get("norm", "rmsnorm"), "residual": bool(r.get("residual")),
                              "acts": r.get("acts") or ["gelu"]}}
    try:
        payload.update({"name": (r.get("name") or "anonymous")[:40], "k": int(r.get("k", 4)),
                        "temp": float(r.get("temp", 0.7)), "spark": (r.get("spark") or "")[:200]})
    except (TypeError, ValueError):
        # Non-numeric k/temp or a non-string name/spark: report it instead of raising.
        return _blob(_with_nonce({"error": "bad recipe"}, r))

    # During a cold start Modal can return an empty body before the model finishes loading,
    # which makes .json() blow up. Retry a few times, then explain it plainly.
    attempts = 4
    last = ""
    for attempt in range(attempts):
        try:
            resp = requests.post(base, json=payload, timeout=600)
            if resp.status_code == 200 and resp.text.strip():
                try:
                    return _blob(_with_nonce(resp.json(), r))
                except ValueError:
                    last = "the model returned an unexpected response"
            else:
                last = f"the model is still waking up (HTTP {resp.status_code})"
        except Exception as e:
            last = str(e)
        # Back off between attempts only; sleeping after the final one just delays the error.
        if attempt < attempts - 1:
            time.sleep(4)

    # Keep the real cause in the server log; the user-facing text stays generic.
    print(f"[mint] backend gave no usable response after {attempts} attempts: {last[:200]}", flush=True)
    pro = " The 27B in Pro mode takes longer to wake. Try again, or turn Pro off." if r.get("pro") else ""
    return _blob(_with_nonce(
        {"error": f"The model is still waking up. Give it about 30 seconds and press Mint again.{pro}"},
        r))
def fetch_lb(_=None):
    """Fetch and merge leaderboard rows from both backends, returned as an HTML blob.

    Rows are de-duplicated on (op, name, ts, speedup_compile, speedup_eager). A backend that
    fails contributes an entry to ``errors`` instead of aborting the refresh. The ignored
    positional argument lets this be wired directly to a UI event.
    """
    key_fields = ("op", "name", "ts", "speedup_compile", "speedup_eager")
    rows, sources, errors = [], [], []
    seen = set()

    for backend in (ONE_B, PRO):
        try:
            url = lb_url(backend)
            fetched = requests.get(url, timeout=20).json().get("rows", [])
            if fetched:
                sources.append(url)
            for row in fetched:
                fingerprint = tuple(row.get(field) for field in key_fields)
                if fingerprint not in seen:
                    seen.add(fingerprint)
                    rows.append(row)
        except Exception as e:
            errors.append(f"{lb_url(backend)}: {type(e).__name__}")

    # Always include fetched_at so the hidden Gradio HTML changes even when the rows are
    # identical. That makes the visible Refresh button observably work.
    return _blob({"rows": rows, "fetched_at": time.time(), "sources": sources, "errors": errors[:2]})
# Optional stylesheet bundled next to this file. Read through a context manager so the file
# handle is closed, with an explicit encoding, and without a separate exists() check that
# could race with the open. A missing or unreadable file yields an empty stylesheet.
_STYLE_PATH = os.path.join(os.path.dirname(__file__), "style.css")
try:
    with open(_STYLE_PATH, encoding="utf-8") as f:
        CSS = f.read()
except OSError:
    CSS = ""
# Recorded mints. The JSON is re-serialized compactly and "</" is escaped so the text can be
# placed inside a <script> element without closing it early. Any failure falls back to an
# empty object.
REPLAYS_PATH = os.path.join(os.path.dirname(__file__), "replays", "index.json")
try:
    with open(REPLAYS_PATH, encoding="utf-8") as f:
        REPLAYS_JSON = json.dumps(json.load(f), separators=(",", ":")).replace("</", "<\\/")
except Exception:
    REPLAYS_JSON = "{}"
| APP_HTML = r""" | |
| <div id="km" data-build="2026-06-15-leaderboard-refresh-fallback" | |
| data-lb-one="__KM_LB_ONE__" | |
| data-lb-pro="__KM_LB_PRO__"> | |
| <div id="km-top"> | |
| <div id="km-brand"> | |
| <span class="mark"><span class="dot"></span>Kernel Mint</span> | |
| <span class="sub">an OUROBOROS demo</span> | |
| </div> | |
| <div id="km-tabs"> | |
| <span id="km-tab-pill"></span> | |
| <button class="km-tab on" data-tab="build">Build</button> | |
| <button class="km-tab" data-tab="expert">Expert</button> | |
| <button class="km-tab" data-tab="lb">Leaderboard</button> | |
| <button class="km-tab" data-tab="docs">Run & verify</button> | |
| </div> | |
| </div> | |
| <script id="km-replays-data" type="application/json">__KM_REPLAYS__</script> | |
| <!-- ============ BUILD ============ --> | |
| <div class="km-page" data-page="build"> | |
| <header id="km-hero"> | |
| <h1>A 1B model writes a real GPU kernel. <span class="g">A referee that can't be fooled checks it.</span></h1> | |
| <p>Compose an operation, and a 1-billion-parameter model writes a fused Triton kernel for it. The kernel is compiled, checked against PyTorch on adversarial inputs, and timed against PyTorch's own compiler before anything counts as a win.</p> | |
| </header> | |
| <div id="km-grid"> | |
| <section class="km-card"> | |
| <div class="km-seg" role="tablist"> | |
| <button class="km-segb on" data-mode="build">Compose</button> | |
| <button class="km-segb" data-mode="classic">Classic ops</button> | |
| </div> | |
| <div id="km-compose"> | |
| <div class="km-section-label">Pipeline</div> | |
| <div id="km-pipe"> | |
| <div class="km-stage fixed"><span class="km-stage-k">input</span><span class="km-dim">a row of activations</span></div> | |
| <div class="km-flow">↓</div> | |
| <div class="km-stage"><span class="km-stage-k">residual</span><span class="km-pick" data-slot="mem"></span></div> | |
| <div class="km-flow">↓</div> | |
| <div class="km-stage"><span class="km-stage-k">normalize</span><span class="km-pick" data-slot="norm"></span></div> | |
| <div class="km-flow">↓</div> | |
| <div class="km-stage"><span class="km-stage-k">activate</span><span class="km-pick" data-slot="act"></span></div> | |
| </div> | |
| </div> | |
| <div id="km-classic" hidden> | |
| <div class="km-section-label">A named operation from a real transformer</div> | |
| <div class="km-pick" id="km-classics"></div> | |
| </div> | |
| <div id="km-info"></div> | |
| <div class="km-params"> | |
| <div class="km-field"> | |
| <label>Attempts the model drafts <b id="km-tries-v">4</b></label> | |
| <div class="km-stepper" id="km-tries"></div> | |
| </div> | |
| <div class="km-field"> | |
| <label>Sampling temperature <b id="km-temp-v">0.70</b></label> | |
| <input type="range" class="km-range" id="km-temp" min="20" max="130" value="70"> | |
| </div> | |
| <div class="km-field"> | |
| <label>Name for the leaderboard</label> | |
| <input class="km-input" id="km-name" placeholder="optional" autocomplete="off"> | |
| </div> | |
| <label class="km-toggle"><input type="checkbox" id="km-pro"><span class="km-switch"></span>Use the 27B model (slower, knows more)</label> | |
| <label class="km-toggle km-local-toggle"><input type="checkbox" id="km-local"><span class="km-switch"></span>Local (offline)</label> | |
| <p class="km-mode-help">Runs entirely in this Space: a 1B GGUF model (llama.cpp) writes the kernel on this Space's GPU, then the in-process referee compiles it, checks correctness against PyTorch, and times it against PyTorch eager, torch.compile, and max-autotune. No Modal or cloud model calls. Pro uses the 27B.</p> | |
| </div> | |
| <div class="km-actions"> | |
| <button class="km-btn primary" id="km-build">Mint kernel</button> | |
| <button class="km-btn ghost" id="km-replay" data-replay-key="rmsnorm_gelu">Watch a recorded mint</button> | |
| </div> | |
| <div id="km-replay-picks" class="km-replay-picks" aria-label="recorded mints"></div> | |
| </section> | |
| <section class="km-card"> | |
| <div class="km-section-label">The operation, stage by stage</div> | |
| <div id="km-viz"><svg id="km-svg" viewBox="0 0 320 150" preserveAspectRatio="none"></svg> | |
| <div id="km-stage-label">input</div></div> | |
| <div id="km-verdict"><div class="km-card"><div class="km-result-top"><div class="km-verdict-icon">→</div> | |
| <div><span class="km-run-label">ready</span><b>Press Mint to begin</b></div></div> | |
| <p class="km-dim" style="margin-top:10px">Compose a pipeline on the left and mint it, or watch a recorded verified mint to see the whole referee loop instantly.</p></div></div> | |
| <details id="km-source" class="km-source" hidden> | |
| <summary>Triton kernel source</summary> | |
| <pre id="km-code"></pre> | |
| </details> | |
| <div class="km-section-label">This session</div> | |
| <div id="km-hist" class="km-dim">No mints yet.</div> | |
| </section> | |
| </div> | |
| </div> | |
| <!-- ============ EXPERT ============ --> | |
| <div class="km-page" data-page="expert" hidden> | |
| <header id="km-hero"> | |
| <h1>Free-test the verifier.</h1> | |
| <p>Describe the fused operation you want in plain language. The model writes a Triton kernel and the referee certifies it against PyTorch, torch.compile, and max-autotune. The referee can only certify operations it has a reference for, so your request is resolved to the nearest one it can check.</p> | |
| </header> | |
| <div class="km-expert-grid"> | |
| <section class="km-card"> | |
| <div class="km-section-label">Your brief</div> | |
| <textarea class="km-textarea" id="km-xbrief" placeholder="e.g. fused RMSNorm with a residual add and a SiLU activation, contiguous rows"></textarea> | |
| <div id="km-xresolved" class="km-resolved"><span class="k">resolves to</span><div class="v">rmsnorm with gelu</div></div> | |
| <div class="km-params"> | |
| <div class="km-field"> | |
| <label>Attempts the model drafts <b id="km-xtries-v">5</b></label> | |
| <div class="km-stepper" id="km-xtries"></div> | |
| </div> | |
| <div class="km-field"> | |
| <label>Sampling temperature <b id="km-xtemp-v">0.70</b></label> | |
| <input type="range" class="km-range" id="km-xtemp" min="20" max="130" value="70"> | |
| </div> | |
| <label class="km-toggle"><input type="checkbox" id="km-xpro"><span class="km-switch"></span>Use the 27B model</label> | |
| <label class="km-toggle km-local-toggle"><input type="checkbox" id="km-xlocal"><span class="km-switch"></span>Local (offline)</label> | |
| <p class="km-mode-help">Uses llama.cpp inside the Space and verifies on this Space's GPU, timed against torch.compile and max-autotune (the honest baselines), not just eager. Pro mode keeps the 27B on Modal.</p> | |
| </div> | |
| <div class="km-actions"> | |
| <button class="km-btn primary" id="km-xbuild">Mint from brief</button> | |
| </div> | |
| </section> | |
| <section class="km-card"> | |
| <div class="km-section-label">What the referee can verify</div> | |
| <p class="km-limits">It certifies fusions of a <b>normalization</b> (RMSNorm or LayerNorm), an optional <b>residual add</b>, and one <b>activation</b>, plus a set of named operators from real transformers. Click one to load it into your brief.</p> | |
| <div class="km-oplist" id="km-oplist"></div> | |
| <div id="km-xverdict" style="margin-top:16px"></div> | |
| <details id="km-xsource" class="km-source" hidden> | |
| <summary>Triton kernel source</summary> | |
| <pre class="km-code"></pre> | |
| </details> | |
| </section> | |
| </div> | |
| </div> | |
| <!-- ============ LEADERBOARD ============ --> | |
| <div class="km-page" data-page="lb" hidden> | |
| <section class="km-card"> | |
| <div class="km-h">Leaderboard</div> | |
| <p class="km-lede">The best verified kernel per operation. Crowned rows were minted by the 27B model. Every speedup here was measured by the referee, not reported by the model.</p> | |
| <div id="km-lb-view" class="km-dim" style="margin-top:16px">Loading.</div> | |
| <div class="km-lb-tools"> | |
| <button id="km-lb-refresh" class="km-mini">Refresh</button> | |
| <span id="km-lb-status" class="km-lb-status">Auto-refreshes while this tab is open.</span> | |
| </div> | |
| </section> | |
| </div> | |
| <!-- ============ RUN + VERIFY / DOCS ============ --> | |
| <div class="km-page" data-page="docs" hidden> | |
| <section class="km-docs"> | |
| <div class="km-doc-hero"> | |
| <span class="km-doc-kicker">reproduce the claim</span> | |
| <h2>Run the smith, then make the referee decide.</h2> | |
| <p>The Space has two execution paths. The <b>Local (offline)</b> switch runs the 1B GGUF model with llama.cpp inside this Space, then verifies the candidate kernel in-process on the same GPU. <b>Pro</b> uses the 27B adapter through the Modal backend. Both paths compare against PyTorch eager, torch.compile, and torch.compile max-autotune before a result counts.</p> | |
| </div> | |
| <div class="km-doc-metrics" aria-label="execution modes"> | |
| <div><b>1B local</b><span>MiniCPM5-1B GGUF, llama.cpp, in-Space referee</span></div> | |
| <div><b>27B Pro</b><span>Qwen3.6-27B adapter, Modal H200 backend</span></div> | |
| <div><b>referee</b><span>Triton compile, allclose, eager, torch.compile, max-autotune</span></div> | |
| </div> | |
| <div class="km-doc-grid"> | |
| <article class="km-doc-panel wide"> | |
| <div class="km-doc-label">1. Use this Space properly</div> | |
| <h3>The easiest local run is already on this page.</h3> | |
| <ol class="km-doc-steps"> | |
| <li>Open <b>Build</b> or <b>Expert</b>.</li> | |
| <li>Turn on <b>Local (offline)</b>. It disables Pro because the two paths are intentionally separate.</li> | |
| <li>Pick a named op such as <code>rmsnorm_gelu</code>, <code>softmax</code>, or <code>swiglu</code>, or compose one norm plus one activation.</li> | |
| <li>Press <b>Mint kernel</b>. The Space downloads or reuses the GGUF, drafts up to two kernels, compiles Triton, checks PyTorch correctness, and times eager, torch.compile, and max-autotune.</li> | |
| </ol> | |
| <div class="km-doc-callout">A certified Local result should show compiler baselines. If a cold ZeroGPU session times out during max-autotune, mint again. A result that only beats eager is not treated as the headline claim.</div> | |
| </article> | |
| <article class="km-doc-panel"> | |
| <div class="km-doc-label">2. Run the Space clone</div> | |
| <h3>Same UI, your GPU.</h3> | |
| <p>Use this when you want the full app locally. With no <code>SPACES_ZERO_GPU</code> env var, the GPU decorator becomes a no-op and the local verifier runs directly on your attached CUDA device.</p> | |
| <div class="km-codeblock"><button class="km-copy">Copy</button>git clone https://huggingface.co/spaces/build-small-hackathon/ouroboros-kernel-mint | |
| cd ouroboros-kernel-mint | |
| python -m venv .venv | |
| . .venv/bin/activate | |
| python -m pip install -r requirements.txt | |
| python app.py</div> | |
| <p>Useful knobs:</p> | |
| <div class="km-codeblock"><button class="km-copy">Copy</button>LOCAL_GGUF_REPO=YMRohit/ouroboros-kernelsmith-minicpm5-1b-GGUF | |
| LOCAL_GGUF_QUANTS=Q5_K_M,Q4_K_M | |
| LOCAL_LLAMA_GPU_LAYERS=-1 | |
| LOCAL_GPU_DURATION=120 | |
| python app.py</div> | |
| </article> | |
| <article class="km-doc-panel"> | |
| <div class="km-doc-label">3. Run the 1B smith directly</div> | |
| <h3>GGUF path, no Modal.</h3> | |
| <p>This mirrors the Space-local path. It uses the fine-tuned GGUF repo first and falls back to OpenBMB's base GGUF only if needed.</p> | |
| <div class="km-codeblock"><button class="km-copy">Copy</button>python -m pip install torch triton huggingface_hub \ | |
| --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu130 \ | |
| llama-cpp-python==0.3.28</div> | |
| <div class="km-codeblock"><button class="km-copy">Copy</button>from huggingface_hub import HfApi, hf_hub_download | |
| from llama_cpp import Llama | |
| repo = "YMRohit/ouroboros-kernelsmith-minicpm5-1b-GGUF" | |
| files = [f for f in HfApi().list_repo_files(repo) if f.lower().endswith(".gguf")] | |
| filename = next((f for f in files if "Q5_K_M" in f), files[0]) | |
| gguf = hf_hub_download(repo, filename=filename) | |
| llm = Llama(model_path=gguf, n_ctx=4096, n_gpu_layers=-1) | |
| system = "You are an expert GPU kernel engineer. Output only one fenced python code block." | |
| user = "Write a fused Triton kernel for row-wise softmax. Use stable max-subtraction. Return run(x)." | |
| prompt = ( | |
| f"<|im_start|>system\n{system}<|im_end|>\n" | |
| f"<|im_start|>user\n{user}<|im_end|>\n" | |
| "<|im_start|>assistant\n```python\n" | |
| ) | |
| out = llm.create_completion(prompt, max_tokens=768, temperature=0.7, top_p=0.97) | |
| print(out["choices"][0]["text"])</div> | |
| </article> | |
| <article class="km-doc-panel"> | |
| <div class="km-doc-label">4. Run the 1B adapter directly</div> | |
| <h3>Transformers plus PEFT.</h3> | |
| <p>Use this when you want the published LoRA adapter instead of GGUF. It is slower to start than llama.cpp, but useful for inspection or further fine-tuning.</p> | |
| <div class="km-codeblock"><button class="km-copy">Copy</button>python -m pip install torch transformers peft accelerate triton</div> | |
| <div class="km-codeblock"><button class="km-copy">Copy</button>from transformers import AutoModelForCausalLM, AutoTokenizer | |
| from peft import PeftModel | |
| tok = AutoTokenizer.from_pretrained("openbmb/MiniCPM5-1B", trust_remote_code=True) | |
| base = AutoModelForCausalLM.from_pretrained( | |
| "openbmb/MiniCPM5-1B", | |
| trust_remote_code=True, | |
| torch_dtype="auto", | |
| device_map="auto", | |
| ) | |
| model = PeftModel.from_pretrained(base, "YMRohit/ouroboros-kernelsmith-minicpm5-1b") | |
| model.eval()</div> | |
| </article> | |
| <article class="km-doc-panel"> | |
| <div class="km-doc-label">5. Run the 27B smith locally</div> | |
| <h3>Same prompt contract, much bigger hardware.</h3> | |
| <p>The 27B artifact is the stronger Qwen3.6-27B smith used for the 76 verified compiler-beating kernels. Run it locally if you have a large GPU or multi-GPU box. The training run used Modal H200s and peaked around 110 GB VRAM; inference is still a heavyweight path compared with the 1B GGUF.</p> | |
| <div class="km-codeblock"><button class="km-copy">Copy</button>python -m pip install torch transformers peft accelerate triton</div> | |
| <div class="km-codeblock"><button class="km-copy">Copy</button>from transformers import AutoModelForCausalLM, AutoTokenizer | |
| from peft import PeftModel | |
| tok = AutoTokenizer.from_pretrained("Qwen/Qwen3.6-27B", trust_remote_code=True) | |
| base = AutoModelForCausalLM.from_pretrained( | |
| "Qwen/Qwen3.6-27B", | |
| trust_remote_code=True, | |
| torch_dtype="auto", | |
| device_map="auto", | |
| ) | |
| model = PeftModel.from_pretrained(base, "YMRohit/ouroboros-kernelsmith-qwen3.6-27b") | |
| model.eval()</div> | |
| <p>If that does not fit, use <b>Pro</b> in this Space. It calls the hosted 27B backend and still sends the output through the same three-baseline referee before returning a result.</p> | |
| </article> | |
| <article class="km-doc-panel wide"> | |
| <div class="km-doc-label">6. Prompt contract</div> | |
| <h3>Ask for one operation, one code block, one <code>run(...)</code> entry point.</h3> | |
| <p>The model is not a general Python assistant here. It was trained to emit a single Triton kernel for a verifier-known operation. Keep the prompt narrow and include the exact operation, input tensors, output contract, and baseline target.</p> | |
| <div class="km-prompt-grid"> | |
| <div> | |
| <b>System</b> | |
| <div class="km-codeblock"><button class="km-copy">Copy</button>You are an expert GPU kernel engineer. Write a single correct, fast Triton kernel. Output ONLY one fenced python code block defining run(*inputs) and its @triton.jit kernel. Accumulate reductions in float32. No prose.</div> | |
| </div> | |
| <div> | |
| <b>User template</b> | |
| <div class="km-codeblock"><button class="km-copy">Copy</button>Operation: add_rmsnorm_gelu | |
| Inputs: x, residual, weight. Each row is one transformer hidden state. | |
| Reference: y = gelu(rmsnorm(x + residual, weight)). | |
| Return: one fenced python block with imports, one @triton.jit kernel, and run(x, residual, weight). | |
| Target: correct vs PyTorch first, then faster than torch.compile max-autotune.</div> | |
| </div> | |
| </div> | |
| <div class="km-doc-chiprow"> | |
| <span class="km-doc-chip">name the op</span> | |
| <span class="km-doc-chip">state tensor order</span> | |
| <span class="km-doc-chip">require fp32 reductions</span> | |
| <span class="km-doc-chip">forbid prose</span> | |
| <span class="km-doc-chip">verify before trusting</span> | |
| </div> | |
| </article> | |
| <article class="km-doc-panel"> | |
| <div class="km-doc-label">Prompt example A</div> | |
| <h3>Residual RMSNorm plus GELU.</h3> | |
| <div class="km-codeblock"><button class="km-copy">Copy</button>Write a fused Triton kernel for add_rmsnorm_gelu. | |
| Inputs are x, residual, and weight, all CUDA tensors. | |
| Compute RMSNorm over each row after x + residual, multiply by weight, then apply GELU. | |
| Use fp32 accumulation for the row reduction. | |
| Return exactly one fenced python code block with run(x, residual, weight).</div> | |
| </article> | |
| <article class="km-doc-panel"> | |
| <div class="km-doc-label">Prompt example B</div> | |
| <h3>Stable softmax.</h3> | |
| <div class="km-codeblock"><button class="km-copy">Copy</button>Write a fused Triton kernel for row-wise softmax. | |
| Input x is a CUDA tensor shaped [M, N]. | |
| Use the stable max-subtraction form. | |
| Return exactly one fenced python code block with run(x). | |
| Do not include explanation text outside the code block.</div> | |
| </article> | |
| <article class="km-doc-panel"> | |
| <div class="km-doc-label">Prompt example C</div> | |
| <h3>SwiGLU gate.</h3> | |
| <div class="km-codeblock"><button class="km-copy">Copy</button>Write a fused Triton kernel for swiglu. | |
| Inputs are gate and up tensors with the same shape. | |
| Compute silu(gate) * up elementwise. | |
| Return exactly one fenced python code block with run(gate, up). | |
| Keep the launch grid simple and contiguous-row friendly.</div> | |
| </article> | |
| <article class="km-doc-panel wide"> | |
| <div class="km-doc-label">7. Verify a generated kernel</div> | |
| <h3>Never score the model output by eye.</h3> | |
| <p>Save the candidate as <code>candidate.py</code>, then send it to the referee. The result object reports correctness, latency, and speedups vs eager, torch.compile, and max-autotune.</p> | |
| <div class="km-codeblock"><button class="km-copy">Copy</button>git clone https://github.com/ymrohit/ouroboros-kernelsmith.git | |
| cd ouroboros-kernelsmith | |
| python -m pip install torch triton numpy</div> | |
| <div class="km-codeblock"><button class="km-copy">Copy</button>import pathlib | |
| import sys | |
| sys.path.insert(0, "referee") | |
| from harness import evaluate_inprocess_full | |
| kernel_src = pathlib.Path("candidate.py").read_text() | |
| result = evaluate_inprocess_full(kernel_src, "add_rmsnorm_gelu", n_shapes=2, n_iters=30) | |
| print(result.to_dict())</div> | |
| <div class="km-doc-callout">A kernel is submission-grade only if <code>status</code> is <code>ok</code>, correctness is true, and the compiler baselines are present. The strongest number is <code>speedup_maxauto</code> because that compares against torch.compile max-autotune.</div> | |
| </article> | |
| <article class="km-doc-panel wide"> | |
| <div class="km-doc-label">8. What the referee can certify</div> | |
| <h3>Known operations only, by design.</h3> | |
| <p>The referee needs a PyTorch reference. It certifies norm and activation fusions plus named transformer operations. Unknown math can still be interesting, but this app will not label it a verified win until there is a reference spec.</p> | |
| <div id="km-docs-ops" class="km-oplist"></div> | |
| </article> | |
| <article class="km-doc-panel wide"> | |
| <div class="km-doc-label">9. Glossary</div> | |
| <div id="km-learn-blocks"></div> | |
| </article> | |
| <article class="km-doc-panel wide"> | |
| <div class="km-doc-label">Source and artifacts</div> | |
| <p>MIT licensed code: <a href="https://github.com/ymrohit/ouroboros-kernelsmith">ymrohit/ouroboros-kernelsmith</a>. Models: <a href="https://huggingface.co/YMRohit/ouroboros-kernelsmith-minicpm5-1b">MiniCPM5-1B smith</a>, <a href="https://huggingface.co/YMRohit/ouroboros-kernelsmith-minicpm5-1b-GGUF">MiniCPM5-1B GGUF</a>, and <a href="https://huggingface.co/YMRohit/ouroboros-kernelsmith-qwen3.6-27b">Qwen3.6-27B smith</a>. Evidence: <a href="https://huggingface.co/datasets/YMRohit/ouroboros-kernel-corpus">verified kernel corpus</a>.</p> | |
| </article> | |
| </div> | |
| </section> | |
| </div> | |
| </div> | |
| """ | |
# Fill in the placeholders left in the HTML template, one substitution at a
# time and in a fixed order: the recorded replays first, then the two
# leaderboard URLs (1B backend, then Pro backend).
APP_HTML = APP_HTML.replace("__KM_REPLAYS__", REPLAYS_JSON)
APP_HTML = APP_HTML.replace("__KM_LB_ONE__", lb_url(ONE_B))
APP_HTML = APP_HTML.replace("__KM_LB_PRO__", lb_url(PRO))
| APP_JS = r""" | |
| () => { | |
// Activation catalogue for the composer. Each entry carries:
//   lbl  - button label
//   verb - short phrase appended to the stage label while the preview plays
//   info - help text shown in the info panel
//   f    - scalar preview function applied to every plotted value
const ACTS = {
  gelu: {
    lbl: 'GELU',
    verb: 'smoothly bends',
    info: 'GELU is the gentle bend used inside many transformers. It lets a little of the negative side through.',
    // tanh approximation of GELU
    f: x => 0.5 * x * (1 + Math.tanh(0.7978845608 * (x + 0.044715 * x * x * x))),
  },
  silu: {
    lbl: 'SiLU',
    verb: 'swishes',
    info: 'SiLU, also called Swish, is x times sigmoid(x). It is common in LLaMA style feed-forward blocks.',
    f: x => x / (1 + Math.exp(-x)),
  },
  relu: {
    lbl: 'ReLU',
    verb: 'clips negatives',
    info: 'ReLU keeps positive values and clips negative values to zero. It is fast, classic, and direct.',
    f: x => Math.max(x, 0),
  },
  tanh: {
    lbl: 'Tanh',
    verb: 'squashes to -1..1',
    info: 'Tanh squashes each value into the range from -1 to 1. Large values flatten near the top and bottom.',
    f: x => Math.tanh(x),
  },
  sigmoid: {
    lbl: 'Sigmoid',
    verb: 'squashes to 0..1',
    info: 'Sigmoid squashes each value into the range from 0 to 1. It is often used for gates and probabilities.',
    f: x => 1 / (1 + Math.exp(-x)),
  },
  relu2: {
    lbl: 'ReLU2',
    verb: 'clips then squares',
    info: 'Squared ReLU clips negative values to zero, then squares the remaining positive values.',
    f: x => { const r = Math.max(x, 0); return r * r; },
  },
  gelu_erf: {
    lbl: 'GELU exact',
    verb: 'bends exactly',
    info: 'Exact GELU uses the erf form of GELU. It behaves almost like the tanh approximation, but follows the exact formula.',
    // Previously this was a copy of the tanh approximation, so the "exact"
    // preview was not exact. JavaScript has no Math.erf, so erf is evaluated
    // with the Abramowitz-Stegun 7.1.26 polynomial (absolute error below
    // 1.5e-7). The helper is kept private to this entry so no new name is
    // added to the surrounding scope.
    f: (() => {
      const erf = z => {
        const sign = z < 0 ? -1 : 1;
        const a = Math.abs(z);
        const t = 1 / (1 + 0.3275911 * a);
        const poly = ((((1.061405429 * t - 1.453152027) * t + 1.421413741) * t - 0.284496736) * t + 0.254829592) * t;
        return sign * (1 - poly * Math.exp(-a * a));
      };
      return x => 0.5 * x * (1 + erf(x * Math.SQRT1_2));
    })(),
  },
  leaky_relu: {
    lbl: 'Leaky ReLU',
    verb: 'leaks negatives',
    info: 'Leaky ReLU is like ReLU, but negative values leak through at a small scale instead of becoming zero.',
    f: x => (x > 0 ? x : 0.01 * x),
  },
  elu: {
    lbl: 'ELU',
    verb: 'curves negatives',
    info: 'ELU passes positive values through and curves negative values smoothly down toward -1.',
    f: x => (x > 0 ? x : Math.exp(x) - 1),
  },
  mish: {
    lbl: 'Mish',
    verb: 'self gates',
    info: 'Mish is a smooth self-gating activation, x times tanh of softplus(x).',
    f: x => x * Math.tanh(Math.log(1 + Math.exp(x))),
  },
};
// Normalization choices offered in the composer (label + help text).
const NORMS = {
  rmsnorm: {
    lbl: 'RMSNorm',
    info: 'RMSNorm makes a row of values a consistent size by dividing by root mean square, then scaling.',
  },
  layernorm: {
    lbl: 'LayerNorm',
    info: 'LayerNorm centers a row of values by subtracting the average, then rescales by the spread.',
  },
};

// Help text for the residual-add toggle.
const MEM = {
  info: 'Residual add means adding the input back before the next operation. This is the skip path used in deep models.',
};

// Named standalone operators offered in "classic" mode.
const CLASSICS = {
  softmax: {
    lbl: 'Softmax',
    info: 'Softmax turns a row of scores into probabilities that add to 1.',
  },
  softmax_scale: {
    lbl: 'Scaled softmax',
    info: 'Scaled softmax applies a scale before softmax. It is the attention score step used before attention weights.',
  },
  swiglu: {
    lbl: 'SwiGLU',
    info: 'SwiGLU is a gated feed-forward operation: SiLU on the gate multiplied by the up projection.',
  },
  geglu: {
    lbl: 'GeGLU',
    info: 'GeGLU is a gated feed-forward operation like SwiGLU, but with a GELU gate.',
  },
  rmsnorm: {
    lbl: 'RMSNorm',
    info: 'RMSNorm on its own, without an activation after it.',
  },
  layernorm: {
    lbl: 'LayerNorm',
    info: 'LayerNorm on its own, without an activation after it.',
  },
};

// Fixed sample row that the preview plot pushes through the pipeline.
const INPUT = [
  0.6, -0.9, 1.4, -0.3, 0.2, -1.6, 0.8,
  1.1, -0.5, 0.0, 1.7, -1.1, 0.4, -0.2,
];
| const root=document.getElementById('km'); | |
| if(!root||root.dataset.init)return; | |
| root.dataset.init='1'; | |
| const $=s=>root.querySelector(s), $$=s=>Array.from(root.querySelectorAll(s)); | |
| const LB_URLS=[root.dataset.lbOne,root.dataset.lbPro].filter(Boolean); | |
// Normalise display text: nullish input becomes '', en/em dashes become ' - ',
// arrows become '->', multiplication signs become 'x', symbols in
// U+2600-U+27BF and surrogate pairs are removed, whitespace runs collapse to
// a single space, and the result is trimmed.
const cleanText = s => {
  const rules = [
    [/\u2013|\u2014/g, ' - '],
    [/\u2192/g, '->'],
    [/\u00d7/g, 'x'],
    [/[\u2600-\u27BF]/g, ''],
    [/[\uD800-\uDBFF][\uDC00-\uDFFF]/g, ''],
    [/\s+/g, ' '],
  ];
  let text = String(s ?? '');
  for (const [pattern, replacement] of rules) text = text.replace(pattern, replacement);
  return text.trim();
};
// HTML-escape a value for safe interpolation into markup: the text is first
// normalised with cleanText, then & < > " ' are replaced by entities.
// The entity names are assembled at runtime ('&' + name + ';') so the source
// never contains a literal entity; an HTML-decoding step had previously
// collapsed the lookup table into identity mappings (and an unterminated
// quote), which disabled escaping entirely.
const esc = (() => {
  const entityNames = {'&': 'amp', '<': 'lt', '>': 'gt', '"': 'quot', "'": '#39'};
  return s => cleanText(s).replace(/[&<>"']/g, ch => '&' + entityNames[ch] + ';');
})();
// Recorded mints embedded in the page as JSON; any read/parse failure falls back to {}.
let REPLAYS = {};
try {
  const embedded = $('#km-replays-data')?.textContent || '{}';
  REPLAYS = JSON.parse(embedded);
} catch (e) {
  REPLAYS = {};
}

// Display names for the recorded mints.
const REPLAY_META = {
  rmsnorm_gelu: {label: 'RMSNorm to GELU', short: 'RMSNorm GELU'},
  softmax: {label: 'Softmax', short: 'Softmax'},
  add_layernorm_silu: {label: 'Residual plus LayerNorm to SiLU', short: 'Residual LayerNorm SiLU'},
};

// Preferred replay order, restricted to recordings that are actually present.
const REPLAY_ORDER = ['rmsnorm_gelu', 'softmax', 'add_layernorm_silu'].filter(key => REPLAYS[key]);

// Composer state: active tab, composer mode, chosen pipeline slots, chosen classic op.
const state = {
  tab: 'build',
  mode: 'build',
  slots: {mem: false, norm: 'rmsnorm', act: 'gelu', act2: null},
  classic: 'softmax',
};

// Attempt counts per surface, the last resolved expert brief, and the mode of the last mint.
let buildTries = 4;
let expertTries = 5;
let resolvedExpert = null;
let lastMintedMode = 'build';

// Mint / leaderboard coordination flags and timer handles.
let activeMintNonce = null;
let awaitingMint = false;
let pendingLBRefresh = false;
let lbRefreshTimer = null;
let lbPollTimer = null;
let lbUiTimer = null;

const HIST = [];
// Run el.animate(frames, opts) when the element exists and supports it; otherwise do nothing.
function animate(el, frames, opts) {
  if (!el || !el.animate) return;
  el.animate(frames, opts);
}
// Human-readable name for a recipe. A missing recipe is 'unknown'; a classic
// recipe uses the operator label (falling back to its key, then 'classic op');
// a composed recipe lists residual / norm / activations joined by ' + '.
function labelRecipe(r) {
  if (!r) return 'unknown';
  if (r.mode === 'classic') {
    return CLASSICS[r.classic]?.lbl || r.classic || 'classic op';
  }
  const parts = [
    ...(r.residual ? ['residual add'] : []),
    NORMS[r.norm]?.lbl || r.norm,
    ...(r.acts || []).map(a => ACTS[a]?.lbl || a),
  ];
  return parts.join(' + ');
}
// Default info-panel text: the classic operator's help text in classic mode,
// otherwise a one-line summary of the currently composed pipeline.
function machineInfo() {
  if (state.mode === 'classic') return CLASSICS[state.classic].info;
  const {mem, norm, act, act2} = state.slots;
  const steps = [
    mem ? 'add input' : 'no residual add',
    NORMS[norm].lbl,
    ACTS[act].lbl,
  ];
  if (act2) steps.push(ACTS[act2].lbl);
  return 'Current operation: ' + steps.join(' + ') + '. The model must fuse it into one verified kernel.';
}
// Write help text into the info panel (falling back to machineInfo() when t
// is empty) and give it a short fade-in.
function setInfo(t) {
  const panel = $('#km-info');
  if (!panel) return;
  const message = t || machineInfo();
  panel.textContent = cleanText(message);
  const fade = [{opacity: .45}, {opacity: 1}];
  animate(panel, fade, {duration: 180, easing: 'cubic-bezier(0.16,1,0.3,1)'});
}
// Size and slide the highlight pill so it sits under the active tab button.
function positionTabPill() {
  const strip = $('#km-tabs');
  const pill = $('#km-tab-pill');
  const active = $('.km-tab.on');
  if (!(strip && pill && active)) return;
  const stripBox = strip.getBoundingClientRect();
  const activeBox = active.getBoundingClientRect();
  // Horizontal offset of the active tab inside the strip, less a 4px inset.
  const offset = activeBox.left - stripBox.left - 4;
  pill.style.width = `${activeBox.width}px`;
  pill.style.transform = `translateX(${offset}px)`;
}
// Switch the visible page: update tab buttons and page visibility, re-place
// the pill on the next frame, resolve the expert brief when entering Expert,
// and start or stop leaderboard polling depending on the destination.
function showTab(tab) {
  state.tab = tab;
  for (const button of $$('.km-tab')) {
    button.classList.toggle('on', button.dataset.tab === tab);
  }
  for (const page of $$('.km-page')) {
    page.hidden = page.dataset.page !== tab;
  }
  requestAnimationFrame(positionTabPill);
  if (tab === 'expert') resolveExpert();
  const onLeaderboard = tab === 'lb';
  if (onLeaderboard) refreshLB(true);
  setLBPolling(onLeaderboard);
}

// Wire each tab button to showTab and keep the pill aligned on resize.
for (const button of $$('.km-tab')) {
  button.addEventListener('click', () => showTab(button.dataset.tab));
}
window.addEventListener('resize', positionTabPill);
// Help text for one option of a picker slot ('mem', 'norm', 'act', 'act2').
// Unknown slots yield an empty string.
function optionInfo(slot, key) {
  switch (slot) {
    case 'mem':
      if (key === 'add') return MEM.info;
      return 'No residual add. The kernel starts from the input row and does not add a skip value.';
    case 'norm':
      return NORMS[key].info;
    case 'act':
    case 'act2':
      if (key === 'none') return 'No second activation. The pipeline stops after the first activation.';
      return ACTS[key].info;
    default:
      return '';
  }
}
// Button label for one option of a picker slot. Unknown slots echo the key.
function optionLabel(slot, key) {
  switch (slot) {
    case 'mem':
      return key === 'add' ? 'add input' : 'none';
    case 'norm':
      return NORMS[key].lbl;
    case 'act':
    case 'act2':
      return key === 'none' ? 'none' : ACTS[key].lbl;
    default:
      return key;
  }
}
// Option key currently selected for a slot, using the picker's vocabulary:
// the boolean residual flag maps to 'add'/'none' and an empty second
// activation maps to 'none'.
function selectedKey(slot) {
  const slots = state.slots;
  switch (slot) {
    case 'mem':
      return slots.mem ? 'add' : 'none';
    case 'act2':
      return slots.act2 || 'none';
    default:
      return slots[slot];
  }
}
// Store a picker choice, refresh the highlighted option in that picker,
// optionally show the option's help text (skipped when quiet), and redraw.
function setSlot(slot, key, quiet) {
  // Translate the picker key into the stored form: boolean for the residual
  // toggle, null for "no second activation", the key itself otherwise.
  let stored = key;
  if (slot === 'mem') stored = key === 'add';
  else if (slot === 'act2' && key === 'none') stored = null;
  state.slots[slot] = stored;

  const picker = root.querySelector(`.km-pick[data-slot="${slot}"]`);
  if (picker) {
    const chosen = selectedKey(slot);
    for (const option of picker.querySelectorAll('.km-opt')) {
      option.classList.toggle('on', option.dataset.key === chosen);
    }
  }
  if (!quiet) setInfo(optionInfo(slot, key));
  render();
}
// Rebuild every slot picker: one button per option, highlighted when
// selected, showing help text on hover/focus and committing on click.
function renderPickers() {
  const activationKeys = Object.keys(ACTS);
  const optionsBySlot = new Map([
    ['mem', ['none', 'add']],
    ['norm', Object.keys(NORMS)],
    ['act', activationKeys],
    ['act2', ['none', ...activationKeys]],
  ]);
  for (const wrap of $$('.km-pick[data-slot]')) {
    const slot = wrap.dataset.slot;
    // Unrecognised slots simply render no options.
    const keys = optionsBySlot.get(slot) || [];
    wrap.innerHTML = '';
    for (const key of keys) {
      const describe = () => setInfo(optionInfo(slot, key));
      const button = document.createElement('button');
      button.type = 'button';
      button.className = 'km-opt';
      button.dataset.key = key;
      button.textContent = optionLabel(slot, key);
      button.title = optionInfo(slot, key);
      button.classList.toggle('on', key === selectedKey(slot));
      button.addEventListener('mouseenter', describe);
      button.addEventListener('focus', describe);
      button.addEventListener('click', () => setSlot(slot, key));
      wrap.appendChild(button);
    }
  }
}
// Rebuild the classic-operator picker: one button per CLASSICS entry.
// Clicking selects the operator, moves the highlight, shows its help text
// and redraws the preview.
function renderClassics() {
  const container = $('#km-classics');
  if (!container) return;
  container.innerHTML = '';
  for (const [key, entry] of Object.entries(CLASSICS)) {
    const describe = () => setInfo(entry.info);
    const button = document.createElement('button');
    button.type = 'button';
    button.className = 'km-opt';
    button.dataset.key = key;
    button.textContent = entry.lbl;
    button.title = entry.info;
    button.classList.toggle('on', state.classic === key);
    button.addEventListener('mouseenter', describe);
    button.addEventListener('focus', describe);
    button.addEventListener('click', () => {
      state.classic = key;
      for (const other of container.querySelectorAll('.km-opt')) {
        other.classList.toggle('on', other === button);
      }
      setInfo(entry.info);
      render();
    });
    container.appendChild(button);
  }
}
// Switch the composer between 'build' and 'classic': update the segmented
// control, show only the matching panel, refresh the info text and redraw.
function setComposerMode(mode) {
  state.mode = mode;
  for (const segment of $$('.km-segb')) {
    segment.classList.toggle('on', segment.dataset.mode === mode);
  }
  const panels = [
    ['#km-compose', 'build'],
    ['#km-classic', 'classic'],
  ];
  for (const [selector, ownerMode] of panels) {
    const panel = $(selector);
    if (panel) panel.hidden = mode !== ownerMode;
  }
  setInfo(machineInfo());
  render();
}

// Wire the segmented-control buttons to the mode switch.
for (const segment of $$('.km-segb')) {
  segment.addEventListener('click', () => setComposerMode(segment.dataset.mode));
}
// Build a 1..8 stepper inside the element matched by `id`. Choosing a value
// calls onChange(value), mirrors it into the `outId` element when present,
// and highlights the matching button. The initial value is applied at once.
function setupStepper(id, outId, initial, onChange) {
  const wrap = $(id);
  const out = $(outId);
  if (!wrap) return;
  wrap.innerHTML = '';

  const choose = value => {
    onChange(value);
    if (out) out.textContent = String(value);
    for (const button of wrap.querySelectorAll('button')) {
      button.classList.toggle('on', Number(button.dataset.v) === value);
    }
  };

  const values = Array.from({length: 8}, (_, index) => index + 1);
  for (const value of values) {
    const button = document.createElement('button');
    button.type = 'button';
    button.dataset.v = String(value);
    button.textContent = String(value);
    button.addEventListener('click', () => choose(value));
    wrap.appendChild(button);
  }
  choose(initial);
}
// Mirror a slider's value, divided by 100 and shown with two decimals, into
// the `outId` element; updates on every input event and once immediately.
function setupTemp(id, outId) {
  const slider = $(id);
  const readout = $(outId);
  if (!slider) return;
  const refresh = () => {
    if (!readout) return;
    readout.textContent = (Number(slider.value) / 100).toFixed(2);
  };
  slider.addEventListener('input', refresh);
  refresh();
}
// Keep a Local/Pro checkbox pair mutually exclusive: while Local is checked,
// Pro is unchecked and disabled; checking Pro unchecks Local.
function syncModePair(localSel, proSel) {
  const local = $(localSel);
  const pro = $(proSel);
  if (!local || !pro) return;

  const enforce = () => {
    const localOn = Boolean(local.checked);
    if (localOn) pro.checked = false;
    pro.disabled = localOn;
  };

  local.addEventListener('change', enforce);
  pro.addEventListener('change', () => {
    if (pro.checked) local.checked = false;
    enforce();
  });
  enforce();
}
// Preview plot: target element, canvas size, padding and number of points.
const svg = $('#km-svg');
const W = 320;
const H = 150;
const PAD = 12;
const n = INPUT.length;

// Map a point index to an x coordinate (evenly spaced between the paddings).
const xs = i => PAD + i * (W - 2 * PAD) / (n - 1);

// Map a value to a y coordinate; values are clamped to [-2.2, 2.2] and zero sits mid-height.
const ys = v => {
  const clamped = Math.max(-2.2, Math.min(2.2, v));
  return H / 2 - clamped * (H / 2 - PAD) / 2.2;
};

// Mutable plot state: circle nodes, the polyline, the values currently drawn,
// and the pending animation-frame handle.
let dots = [];
let line = null;
let cur = [...INPUT];
let anim = null;
// (Re)create the plot: a horizontal mid-line, the polyline, and one circle
// per value, then draw the current values.
function buildSvg() {
  if (!svg) return;
  const SVG_NS = 'http://www.w3.org/2000/svg';
  // Create an SVG element and apply a bag of attributes to it.
  const make = (tag, attrs) => {
    const node = document.createElementNS(SVG_NS, tag);
    for (const [name, value] of Object.entries(attrs)) node.setAttribute(name, value);
    return node;
  };

  svg.innerHTML = '';
  svg.appendChild(make('line', {
    x1: 0, x2: W, y1: H / 2, y2: H / 2,
    stroke: '#1f2d26', 'stroke-width': '1',
  }));
  line = make('polyline', {
    fill: 'none', stroke: '#34e0a1',
    'stroke-width': '2', 'stroke-linejoin': 'round',
  });
  svg.appendChild(line);
  // appendChild returns the node it appended, so the map collects the circles.
  dots = cur.map((_, i) => svg.appendChild(make('circle', {r: '3.4', cx: xs(i)})));
  paint(cur);
}
// Draw a row of values: update the polyline's points and move/recolour each
// dot (green for values >= 0, red for negatives). No-op before buildSvg().
function paint(v) {
  if (!line) return;
  const points = v.map((value, i) => `${xs(i)},${ys(value)}`);
  line.setAttribute('points', points.join(' '));
  for (let i = 0; i < v.length; i += 1) {
    const dot = dots[i];
    if (!dot) continue;
    dot.setAttribute('cy', ys(v[i]));
    dot.setAttribute('fill', v[i] >= 0 ? '#34e0a1' : '#ff7a7a');
  }
}
// Cubic ease-out: 0 at t=0, 1 at t=1, decelerating toward the end.
const ease = t => 1 - (1 - t) ** 3;
// Animate the plot from the currently drawn values to `to` over `ms`
// milliseconds using the cubic ease. A morph already in flight is cancelled
// and the new one starts from wherever the plot currently is.
function morph(to, ms = 480) {
  if (!line) return;
  const start = cur.slice();
  const startedAt = performance.now();
  if (anim) cancelAnimationFrame(anim);

  const tick = now => {
    const progress = Math.min(1, (now - startedAt) / ms);
    const eased = ease(progress);
    cur = start.map((from, i) => from + (to[i] - from) * eased);
    paint(cur);
    if (progress < 1) {
      anim = requestAnimationFrame(tick);
    } else {
      // Finished: keep an exact copy of the target as the drawn state.
      cur = to.slice();
    }
  };
  anim = requestAnimationFrame(tick);
}
// Preview-only normalization of a row. 'layernorm' subtracts the mean and
// divides by sqrt(variance + 1e-5); any other kind divides by the root mean
// square, sqrt(mean(x^2) + 1e-6). No learned scale is applied.
function normalize(a, kind) {
  const count = a.length;
  if (kind === 'layernorm') {
    let total = 0;
    for (const x of a) total += x;
    const mean = total / count;
    let squaredDev = 0;
    for (const x of a) squaredDev += (x - mean) * (x - mean);
    const spread = Math.sqrt(squaredDev / count + 1e-5);
    return a.map(x => (x - mean) / spread);
  }
  let sumSquares = 0;
  for (const x of a) sumSquares += x * x;
  const rms = Math.sqrt(sumSquares / count + 1e-6);
  return a.map(x => x / rms);
}
// Illustrative output row for a classic operator, computed from INPUT.
// These are preview shapes for the plot, not reference implementations:
// softmax is rescaled (x4, -0.5) to fill the chart and the gated ops use a
// synthetic "up" factor derived from |x|. Unknown keys return INPUT unchanged.
function classicViz(key) {
  const row = INPUT.slice();
  if (key.startsWith('softmax')) {
    const scale = key === 'softmax_scale' ? 0.7 : 1;
    const scaled = row.map(x => x * scale);
    const peak = Math.max(...scaled);
    const exps = scaled.map(x => Math.exp(x - peak));
    const total = exps.reduce((acc, e) => acc + e, 0);
    return exps.map(e => e / total * 4 - 0.5);
  }
  if (key === 'rmsnorm' || key === 'layernorm') {
    return normalize(row, key);
  }
  const gateScale = x => 0.7 + Math.abs(x) * 0.3;
  if (key === 'swiglu') {
    return row.map(x => (x / (1 + Math.exp(-x))) * gateScale(x));
  }
  if (key === 'geglu') {
    return row.map(x => ACTS.gelu.f(x) * gateScale(x));
  }
  return row;
}
// Build the list of preview stages ({l: label, v: values}) for the current
// selection. Classic mode has two stages (input, operator); build mode walks
// input -> optional residual add -> norm -> activation -> optional second activation.
function pipeline() {
  if (state.mode === 'classic') {
    return [
      {l: 'input', v: INPUT.slice()},
      {l: CLASSICS[state.classic].lbl, v: classicViz(state.classic)},
    ];
  }
  const {mem, norm, act, act2} = state.slots;
  const stages = [{l: 'input', v: INPUT.slice()}];
  let row = INPUT.slice();
  const record = label => stages.push({l: label, v: row.slice()});

  if (mem) {
    // Synthetic residual: a shifted, half-scaled copy of the input row.
    row = row.map((x, i) => x + INPUT[(i + 3) % n] * 0.5);
    record('residual add');
  }
  row = normalize(row, norm);
  record(NORMS[norm].lbl);

  const first = ACTS[act];
  row = row.map(x => first.f(x));
  record(first.lbl + ' ' + first.verb);

  if (act2) {
    const second = ACTS[act2];
    row = row.map(x => second.f(x));
    record(second.lbl + ' second pass');
  }
  return stages;
}
// Guard so overlapping play() calls are ignored while a run is in progress.
let playing = false;

// Step through the pipeline stages one at a time, 680ms apart, updating the
// stage label and morphing the plot to each stage's values.
function play() {
  if (playing) return;
  playing = true;
  const stages = pipeline();
  const label = $('#km-stage-label');
  let index = 0;

  const advance = () => {
    if (index >= stages.length) {
      playing = false;
      return;
    }
    const current = stages[index];
    if (label) {
      label.textContent = cleanText(current.l);
      animate(
        label,
        [{opacity: 0, transform: 'translateY(4px)'}, {opacity: 1, transform: 'none'}],
        {duration: 200, easing: 'cubic-bezier(0.16,1,0.3,1)'},
      );
    }
    morph(current.v);
    index += 1;
    setTimeout(advance, 680);
  };
  advance();
}
// Jump the preview to the final pipeline stage (label + quick morph) and
// refresh the recorded-mint offer for the current selection.
function render() {
  const stages = pipeline();
  const finalStage = stages[stages.length - 1];
  const label = $('#km-stage-label');
  if (label) label.textContent = cleanText(finalStage.l);
  morph(finalStage.v, 360);
  updateReplayOffer();
}
// Assemble the mint request for the Build tab from the composer state and
// its controls. Local takes precedence: `pro` is only true when Local is off.
// A missing temperature slider falls back to 70 (i.e. 0.70).
function buildRecipe() {
  const tempEl = $('#km-temp');
  const proEl = $('#km-pro');
  const localEl = $('#km-local');
  const nameEl = $('#km-name');
  const {slots} = state;
  const local = Boolean(localEl && localEl.checked);
  const proChecked = Boolean(proEl && proEl.checked);
  const rawTemp = tempEl ? tempEl.value : 70;
  return {
    mode: state.mode,
    norm: slots.norm,
    residual: Boolean(slots.mem),
    acts: slots.act2 ? [slots.act, slots.act2] : [slots.act],
    classic: state.classic,
    spark: '',
    k: buildTries,
    temp: Number(rawTemp) / 100,
    pro: !local && proChecked,
    local,
    name: nameEl ? nameEl.value : '',
  };
}
// Assemble the mint request for the Expert tab: a copy of the resolved
// brief's recipe overlaid with the Expert controls. Returns null when no
// brief has been resolved or it resolved to an unknown operation.
function expertRecipe() {
  if (!resolvedExpert || resolvedExpert.unknown) return null;
  const tempEl = $('#km-xtemp');
  const proEl = $('#km-xpro');
  const localEl = $('#km-xlocal');
  const briefEl = $('#km-xbrief');
  const local = Boolean(localEl && localEl.checked);
  const proChecked = Boolean(proEl && proEl.checked);
  const rawTemp = tempEl ? tempEl.value : 70;
  return {
    ...resolvedExpert.recipe,
    k: expertTries,
    temp: Number(rawTemp) / 100,
    pro: !local && proChecked,
    local,
    name: 'expert',
    spark: briefEl ? briefEl.value : '',
  };
}
// Recipe for whichever surface is active; the Expert tab falls back to the
// Build recipe when no usable expert recipe exists.
function currentRecipe() {
  if (state.tab === 'expert') {
    const expert = expertRecipe();
    if (expert) return expert;
  }
  return buildRecipe();
}
// Hand a recipe to the Python side through the hidden Gradio textbox, then
// click the hidden "go" button shortly afterwards.
function bridge(recipe) {
  const box = document.querySelector('#km_recipe textarea');
  if (!box) return;

  // A fresh nonce (_n) makes every submitted JSON unique, so Gradio always
  // sees a value change and re-runs do_mint even when the same recipe is
  // minted twice in a row (otherwise the second submit was dropped and the
  // button seemed to need a double-click). The backend echoes the nonce back
  // so a stale payload from the previous run cannot repaint the old result
  // over the new loading card while the next response is being prepared.
  const nonce = Date.now().toString(36) + '-' + Math.random().toString(36).slice(2);
  const payload = {...recipe, _n: nonce};
  activeMintNonce = nonce;
  awaitingMint = true;

  // Write through the prototype's native value setter, then announce the
  // change with a bubbling 'input' event.
  const valueProp = Object.getOwnPropertyDescriptor(window.HTMLTextAreaElement.prototype, 'value');
  valueProp.set.call(box, JSON.stringify(payload));
  box.dispatchEvent(new Event('input', {bubbles: true}));

  // NOTE: 'change' is deliberately NOT dispatched here. The leaderboard fetch
  // is triggered separately via refreshLB() so it never competes with do_mint
  // in Gradio's queue.
  setTimeout(() => {
    const go = document.querySelector('#km_go button') || document.querySelector('#km_go');
    if (go) go.click();
  }, 80);
}
// Show a leaderboard status message; `kind` of 'bad' or 'warn' turns on the
// matching style class and any other value clears both.
function setLBStatus(text, kind) {
  const status = $('#km-lb-status');
  if (!status) return;
  status.textContent = text;
  for (const level of ['bad', 'warn']) {
    status.classList.toggle(level, kind === level);
  }
}
// Put the leaderboard Refresh button into (or out of) its busy state.
function setLBButton(on) {
  const button = $('#km-lb-refresh');
  if (!button) return;
  button.disabled = on;
  button.textContent = on ? 'Refreshing...' : 'Refresh';
}
// Fallback path: fetch each leaderboard URL straight from the browser, one
// after another, merge the rows (de-duplicated on op/name/ts/speedups) and
// render them. Rejects when no endpoint contributed any rows.
async function refreshLBDirect() {
  const merged = [];
  const seenKeys = new Set();
  const sources = [];
  for (const url of LB_URLS) {
    try {
      const response = await fetch(url, {cache: 'no-store'});
      if (!response.ok) continue;
      const payload = await response.json();
      const batch = Array.isArray(payload.rows) ? payload.rows : [];
      // Only endpoints that returned at least one row count as sources.
      if (batch.length) sources.push(url);
      for (const row of batch) {
        const fingerprint = [row.op, row.name, row.ts, row.speedup_compile, row.speedup_eager].join('|');
        if (seenKeys.has(fingerprint)) continue;
        seenKeys.add(fingerprint);
        merged.push(row);
      }
    } catch (e) {
      // Best effort: an endpoint that fails is skipped and the next one is tried.
    }
  }
  if (!sources.length) throw new Error('direct leaderboard fetch failed');
  renderLB(merged, {fetched_at: Date.now() / 1000, sources});
}
// Ask for a leaderboard refresh through the hidden Gradio button. While a
// mint is in flight the request is only recorded as pending. When the
// refresh is user-visible (manual, or the Leaderboard tab is open) the UI
// shows a busy state and arms an 8s timer that tries a direct browser fetch.
function refreshLB(manual = false) {
  if (awaitingMint) {
    pendingLBRefresh = true;
    if (manual) setLBStatus('Queued until the current mint finishes.', 'warn');
    return;
  }

  const visible = manual || state.tab === 'lb';
  if (visible) {
    setLBButton(true);
    setLBStatus('Checking leaderboard...', '');
    if (lbUiTimer) clearTimeout(lbUiTimer);
    const onDirectFailure = () => {
      setLBButton(false);
      setLBStatus('Still waiting for the Gradio refresh bridge. Direct browser fetch was blocked. Try once more or reopen the tab.', 'warn');
    };
    lbUiTimer = setTimeout(() => {
      refreshLBDirect().catch(onDirectFailure);
    }, 8000);
  }

  const bridgeButton = document.querySelector('#km_lb_btn button') || document.querySelector('#km_lb_btn');
  if (!bridgeButton) {
    setLBButton(false);
    setLBStatus('Refresh bridge is not mounted yet. Reopen the Leaderboard tab.', 'bad');
    return;
  }
  bridgeButton.click();
}
// Clear the pending flag and schedule one non-manual leaderboard refresh
// after `delay` ms, replacing any refresh that was already scheduled.
function scheduleLBRefresh(delay = 700) {
  pendingLBRefresh = false;
  if (lbRefreshTimer) clearTimeout(lbRefreshTimer);
  const fire = () => refreshLB(false);
  lbRefreshTimer = setTimeout(fire, delay);
}
// Start (on=true) or stop (on=false) the 15-second leaderboard poll. Any running poll is
// always cancelled first; a tick only refreshes while the leaderboard tab is showing.
function setLBPolling(on){
  if(lbPollTimer){
    clearInterval(lbPollTimer);
    lbPollTimer=null;
  }
  if(!on)return;
  lbPollTimer=setInterval(()=>{
    if(state.tab==='lb')refreshLB(false);
  },15000);
}
let mintSafetyTimer=null;
// Disable (on=true) or re-enable (on=false) both Mint buttons and toggle their 'minting' class.
function setMinting(on){
  for(const selector of ['#km-build','#km-xbuild']){
    const btn=$(selector);
    if(!btn)continue;
    btn.disabled=on;
    btn.classList.toggle('minting',on);
  }
  if(mintSafetyTimer){
    clearTimeout(mintSafetyTimer);
    mintSafetyTimer=null;
  }
  // Safety net: showResult() normally re-enables the buttons, but if a result never arrives
  // (hung backend) they must not stay dead. 4 min is longer than any real mint.
  if(on){
    mintSafetyTimer=setTimeout(()=>setMinting(false),240000);
  }
}
// Display name for a recorded mint: the curated label if there is one, else the cleaned op
// string of the recording, else the key itself.
function replayName(key){
  const meta=REPLAY_META[key];
  if(meta&&meta.label)return meta.label;
  const recording=REPLAYS[key];
  return (recording&&cleanText(recording.op))||key;
}
// Return the key of the recorded mint that matches the operation currently composed in
// `state`, or null when there is no such recording.
function matchingReplayKey(){
  if(state.mode==='classic'){
    return (state.classic==='softmax'&&REPLAYS.softmax)?'softmax':null;
  }
  if(state.mode!=='build')return null;
  const slots=state.slots;
  if(slots.act2)return null;   // every recorded build uses a single activation
  if(!slots.mem&&slots.norm==='rmsnorm'&&slots.act==='gelu'&&REPLAYS.rmsnorm_gelu)return 'rmsnorm_gelu';
  if(slots.mem&&slots.norm==='layernorm'&&slots.act==='silu'&&REPLAYS.add_layernorm_silu)return 'add_layernorm_silu';
  return null;
}
// Point the main replay button at the recording that matches the current operation (or the
// first recording when none matches) and highlight the same choice among the pick buttons.
function updateReplayOffer(){
  const offerBtn=$('#km-replay');
  if(!offerBtn||!REPLAY_ORDER.length)return;
  const matched=matchingReplayKey();
  const key=matched||REPLAY_ORDER[0];
  offerBtn.dataset.replayKey=key;
  offerBtn.textContent=matched?'Watch this recorded mint':'Watch a recorded mint';
  $$('.km-rpick').forEach(pick=>{
    pick.classList.toggle('on',pick.dataset.replayKey===key);
  });
}
// Build one pick button per recorded mint (short name plus its vs-compile speedup). The
// container is hidden when there are no recordings.
function renderReplayChoices(){
  const holder=$('#km-replay-picks');
  if(!holder)return;
  const noRecordings=!REPLAY_ORDER.length;
  holder.hidden=noRecordings;
  if(noRecordings)return;
  const buttons=REPLAY_ORDER.map(k=>{
    const shortName=(REPLAY_META[k]&&REPLAY_META[k].short)||replayName(k);
    return "<button class='km-rpick' data-replay-key='"+esc(k)+"'>"+esc(shortName)+" <span>"+sp3(REPLAYS[k].speedup_compile)+"x</span></button>";
  });
  holder.innerHTML=buttons.join('');
  updateReplayOffer();
}
// Coerce a metric to a number, or NaN when it carries no number at all. Unary plus alone
// turns null, '' and whitespace into 0 (and booleans into 0/1), which printed a missing
// speedup as "0x" instead of "?". Only numbers and non-blank strings are accepted.
const spNum=x=>typeof x==='number'?x:((typeof x==='string'&&x.trim()!=='')?+x:NaN);
// One decimal with a trailing ".0" dropped (1 -> "1", 2.46 -> "2.5"); "?" when not a finite number.
const sp=x=>{const n=spNum(x);return Number.isFinite(n)?n.toFixed(1).replace(/\.0$/,''):'?';};
// Always two decimals (1.5 -> "1.50"); "?" when not a finite number.
const sp2=x=>{const n=spNum(x);return Number.isFinite(n)?n.toFixed(2):'?';};
// Up to three decimals with trailing zeros trimmed (1.1 -> "1.1", 10 -> "10"); "?" when not finite.
const sp3=x=>{const n=spNum(x);return Number.isFinite(n)?n.toFixed(3).replace(/0+$/,'').replace(/\.$/,''):'?';};
// True when a referee status string means success ('ok' or 'pass', case-insensitive).
const isOk=s=>{const v=String(s||'').toLowerCase();return v==='ok'||v==='pass';};
// Summarise the referee attempts of a result payload as {statuses, passed, total}.
// With a statuses array the counts come from it; otherwise they fall back to the numeric
// n_verified and k fields of the payload.
function attemptSummary(d){
  const statuses=Array.isArray(d.statuses)?d.statuses:[];
  let passed, total;
  if(statuses.length){
    passed=statuses.filter(isOk).length;
    total=statuses.length;
  }else{
    passed=Number.isFinite(+d.n_verified)?+d.n_verified:0;
    total=Number.isFinite(+d.k)?+d.k:(passed||0);
  }
  return {statuses,passed,total};
}
// HTML for one chip per attempt status (OK / FAIL plus the escaped status text). `limit`
// caps how many chips are shown; returns '' when there are no statuses.
function attemptChips(d,limit){
  const all=attemptSummary(d).statuses;
  const shown=all.slice(0,limit??all.length);
  if(shown.length===0)return '';
  const chips=shown.map((status,idx)=>{
    const ok=isOk(status);
    return "<span class='km-attempt "+(ok?'ok':'bad')+"'><b>"+(ok?'OK':'FAIL')+"</b> attempt "+(idx+1)+", "+esc(status)+"</span>";
  });
  return "<div class='km-attempts'>"+chips.join('')+"</div>";
}
// Look up the verdict card, the source disclosure and the code element for a page.
// mode 'expert' selects the expert page; anything else selects the build page.
function target(mode){
  const expert=mode==='expert';
  return {
    verdict:$(expert?'#km-xverdict':'#km-verdict'),
    source:$(expert?'#km-xsource':'#km-source'),
    code:$(expert?'#km-xsource .km-code':'#km-code')
  };
}
// Hide the kernel-source panel of the given page and empty its code element.
function clearSource(mode){
  const {source,code}=target(mode);
  if(source){
    source.hidden=true;
  }
  if(code){
    code.textContent='';
  }
}
// Show the kernel source of result `d` in the (collapsed) source panel of the given page.
// kind 'recorded' labels the panel as a recorded kernel; any other kind as a live one.
// A payload without source just clears the panel.
function setSource(d,kind,mode){
  const {source:panel,code:codeEl}=target(mode);
  if(!panel||!codeEl)return;
  if(!d.source){
    clearSource(mode);
    return;
  }
  panel.hidden=false;
  panel.open=false;
  const summary=panel.querySelector('summary');
  if(summary){
    summary.textContent=kind==='recorded'?'Recorded Triton kernel source':'Live Triton kernel source';
  }
  codeEl.textContent=d.source;
}
// Heuristic: does this error text match one of the keywords treated as "the backend is
// still starting up" (waking, timeout, http, loading, ...)?
function wakingError(msg){
  const text=String(msg||'');
  const hints=/waking|wake|timeout|timed out|still|unexpected|http|modal|loading/i;
  return hints.test(text);
}
// Render the outcome of a mint into the verdict card of the right page.
//   d    - result payload: {error}, an unverified run, or a verified run with speedups.
//   opts - {recorded: true for a replayed run, mode: 'build' | 'expert'}.
// A live result is ignored while a mint is in flight unless it carries that mint's nonce.
function showResult(d,opts={}){
  if(!opts.recorded&&awaitingMint&&String(d&&d._n||'')!==String(activeMintNonce||'')){
    return;
  }
  if(!opts.recorded&&awaitingMint){
    awaitingMint=false;
    activeMintNonce=null;
  }
  const mode=opts.mode||lastMintedMode||'build';
  stopBuild();
  stopReplay();
  if(!opts.recorded) setMinting(false); // result arrived -> re-enable Mint (replays don't disable it)
  const t=target(mode), v=t.verdict;
  if(!v)return;
  clearSource(mode);
  if(d.error){
    const local=!!d.local;
    const wake=!local&&wakingError(d.error);
    const replayKey=matchingReplayKey()||REPLAY_ORDER[0]||'';
    const replayBtn=replayKey?"<button class='km-inline-replay' data-replay-key='"+esc(replayKey)+"'>Watch a recorded mint while it wakes</button>":"";
    v.innerHTML="<div class='km-card err'>"+
      "<div class='km-result-top'><div class='km-verdict-icon bad'>!</div><div><span class='km-run-label'>"+(local?'local offline mint':'live mint')+"</span><b>"+(wake?'The live model is still waking':(local?'The local mint did not finish':'The live mint did not finish'))+"</b></div></div>"+
      "<p class='km-dim'>"+(wake?'The backend scales to zero, so the first live mint can take about 90 seconds. The replay below is a real earlier mint, not a mockup.':(local?esc(d.error):'Backend said: '+esc(d.error)))+"</p>"+
      (wake?replayBtn+"<p class='km-rawerr'>Backend said: "+esc(d.error)+"</p>":replayBtn)+"</div>";
    if(pendingLBRefresh)scheduleLBRefresh();
    return;
  }
  const a=attemptSummary(d);
  if(!d.verified){
    v.innerHTML="<div class='km-card miss'>"+
      "<div class='km-result-top'><div class='km-verdict-icon bad'>NO</div><div><span class='km-run-label'>referee verdict</span><b>Not verified yet</b></div></div>"+
      "<div class='km-ref-summary'><b>"+a.passed+" of "+(a.total||'?')+"</b> attempts passed the referee</div>"+
      attemptChips(d)+"<p class='km-dim'>"+esc(d.explanation||'Press Mint again, raise attempts, or change the operation.')+"</p></div>";
    if(pendingLBRefresh)scheduleLBRefresh();
    return;
  }
  const recorded=!!opts.recorded, local=!!d.local;
  // Headline the HONEST baseline (the compiler), not the inflated vs-eager fusion number.
  // Strongest available wins: max-autotune > torch.compile default > eager.
  const sc=+d.speedup_compile, sm=+d.speedup_maxauto;
  const hasC=Number.isFinite(sc)&&sc>0, hasM=Number.isFinite(sm)&&sm>0;
  // The history row shows this one number, labelled with the baseline it really came from
  // (it used to read "vs compiler" even when only the eager speedup was available).
  const heroSpeed=hasM?sm:(hasC?sc:d.speedup_eager);
  const heroTag=(hasM||hasC)?'vs compiler':'vs eager';
  // sp() yields '?' for a non-numeric value; only a real number gets the 's' unit.
  const genShown=sp(d.gen_seconds);
  const genText=genShown==='?'?'?':genShown+'s';
  const missingLocalCompiler=local&&!recorded&&!hasC&&!hasM;
  const compilerGuard=missingLocalCompiler
    ? "<div class='km-baseline-guard'>Compiler baselines did not return for this Local run. Re-mint; a certified Local result should include torch.compile and max-autotune next to eager.</div>"
    : "";
  const runLabel=local?'local offline mint':(recorded?'recorded mint':'live mint');
  const noteText=recorded?'recorded backend run':(local?'local offline run':'live backend run');
  const note="<span class='km-pill-note "+(recorded?'recorded':(local?'local':'hit'))+"'>"+noteText+"</span>";
  const board=d.beat_champion?"<span class='km-crown'>27B champion beaten</span>":d.on_leaderboard?"<span class='km-board'>leaderboard result</span>":"";
  v.innerHTML="<div class='km-card ok km-result-card'>"+note+board+
    "<div class='km-result-top'><div class='km-verdict-icon'>OK</div><div><span class='km-run-label'>"+runLabel+"</span><b>Verified by the referee</b><span>"+esc(d.op||'kernel')+"</span></div></div>"+
    "<div class='km-ref-summary'><b>"+a.passed+" of "+(a.total||'?')+"</b> attempts passed the referee</div>"+
    attemptChips(d)+
    compilerGuard+
    // Every comparison as its own big number, strongest (most honest) baseline first, eager last
    // and labelled 'unfused' so the side-by-side is self-explanatory: the model's real edge is
    // the ~1.1x over the compiler; the big eager number is mostly the fusion win.
    "<div class='km-stats km-cmp'>"+
    (hasM?"<div class='km-cmp-key'><big>"+sp3(sm)+"x</big><span>vs torch.compile max-autotune</span></div>":"")+
    (hasC?"<div"+(hasM?"":" class='km-cmp-key'")+"><big>"+sp3(sc)+"x</big><span>vs torch.compile</span></div>":"")+
    // The raw field goes to sp() so a missing eager number is not coerced to 0 first.
    "<div><big>"+sp(d.speedup_eager)+"x</big><span>vs PyTorch eager (unfused)</span></div>"+
    "<div><big>"+genText+"</big><span>generation</span></div></div>"+
    "<p class='km-dim'>"+esc(d.explanation||'')+"</p></div>";
  animate(v.firstElementChild,[{transform:'scale(0.96)',opacity:0},{transform:'scale(1)',opacity:1}],{duration:300,easing:'cubic-bezier(0.16,1,0.3,1)'});
  setSource(d,recorded?'recorded':'live',mode);
  if(!recorded&&mode==='build'){
    HIST.unshift("<code>"+esc(d.op||'kernel')+"</code><span>"+sp(heroSpeed)+"x "+heroTag+"</span>");
    const hist=$('#km-hist');
    if(hist)hist.innerHTML=HIST.slice(0,10).map(h=>"<div class='km-hrow'>"+h+"</div>").join('');
  }
  // The leaderboard refresh is deliberately decoupled from the mint call. Once the result has
  // arrived, the queue is free again, so refresh the board without hiding the just-rendered result.
  if(!recorded)scheduleLBRefresh(900);
}
let replayTimer=null;
// Cancel the pending replay step, if any.
function stopReplay(){
  if(!replayTimer)return;
  clearTimeout(replayTimer);
  replayTimer=null;
}
// Play back the recorded mint stored under `key`: reveal its attempt chips one at a time
// (360 ms apart) on the page that is currently open, then hand the recorded payload to
// showResult as a recorded result.
function startReplay(key){
  const rec=REPLAYS[key];
  if(!rec)return;
  lastMintedMode=state.tab==='expert'?'expert':'build';
  stopBuild();
  stopReplay();
  play();
  const card=target(lastMintedMode).verdict;
  if(!card)return;
  clearSource(lastMintedMode);
  const statuses=Array.isArray(rec.statuses)?rec.statuses:[];
  const total=statuses.length||(Number.isFinite(+rec.k)?+rec.k:4);
  const passed=statuses.length?statuses.filter(isOk).length:(Number.isFinite(+rec.n_verified)?+rec.n_verified:0);
  const tookText=Number.isFinite(+rec.gen_seconds)?sp(rec.gen_seconds)+'s':'real time';
  card.innerHTML="<div class='km-card building replaying'>"+
    "<div class='km-result-top'><div class='km-verdict-icon'>RUN</div><div><span class='km-run-label recorded'>recorded mint</span><b>"+esc(replayName(key))+"</b><span>This is a genuine earlier backend run. Mint still starts a live run.</span></div></div>"+
    "<div class='km-replay-stage' id='km-replay-stage'>loading recorded attempts...</div>"+
    "<div class='km-attempts' id='km-replay-attempts'></div>"+
    "<div class='km-bar'><i id='km-replay-fill'></i></div>"+
    "<p class='km-dim'>The model took "+tookText+" on this recorded run. The replay shows the referee loop without waiting for cold start.</p></div>";
  const chipsEl=$('#km-replay-attempts'), stageEl=$('#km-replay-stage'), fillEl=$('#km-replay-fill');
  let step=0;
  const advance=()=>{
    if(step>=total){
      // All attempts shown: fill the bar, then switch to the final verdict card.
      if(stageEl)stageEl.textContent='best verified kernel selected';
      if(fillEl)fillEl.style.width='100%';
      replayTimer=setTimeout(()=>showResult(rec,{recorded:true,mode:lastMintedMode,replayKey:key}),420);
      return;
    }
    // Without a recorded status for this step, the first `passed` attempts count as ok.
    const status=statuses[step]||(step<passed?'ok':'not verified');
    const ok=isOk(status);
    if(stageEl)stageEl.textContent='referee verdict for attempt '+(step+1)+' of '+total;
    if(chipsEl)chipsEl.insertAdjacentHTML('beforeend',"<span class='km-attempt "+(ok?'ok':'bad')+"'><b>"+(ok?'OK':'FAIL')+"</b> attempt "+(step+1)+", "+esc(status)+"</span>");
    if(fillEl)fillEl.style.width=Math.round(((step+1)/(total+1))*100)+'%';
    step+=1;
    replayTimer=setTimeout(advance,360);
  };
  replayTimer=setTimeout(advance,180);
}
let buildTimer=null;
// Show the "minting..." card on the given page and walk its phase labels on a timer until
// stopBuild() is called.
//   mode   - 'build' or 'expert' (selects the verdict card to draw into).
//   recipe - the recipe being minted; `k` (attempts, default 4) and `local` are read here.
function startBuild(mode,recipe){
  stopReplay();
  // A build card may still be ticking (for example the safety timer re-enabled Mint while
  // the previous call never returned). Clear that interval before the handle is overwritten,
  // otherwise it can never be cancelled and keeps running alongside the new one.
  if(buildTimer){clearInterval(buildTimer);buildTimer=null;}
  const t=target(mode), v=t.verdict;
  if(!v)return;
  clearSource(mode);
  const k=recipe.k||4, started=Date.now(), replayKey=mode==='build'?(matchingReplayKey()||REPLAY_ORDER[0]||''):(REPLAY_ORDER[0]||'');
  const local=!!recipe.local;
  const stages=local?['loading local GGUF...','llama.cpp drafting '+Math.min(k,2)+' kernels...','compiling Triton on this Space GPU...','checking correctness vs PyTorch...','timing vs torch.compile max-autotune...']:
    ['model waking...','drafting '+k+' kernels...','compiling Triton...','checking correctness vs PyTorch...','timing vs torch.compile max-autotune...'];
  const stepHtml=stages.map((s,i)=>"<span class='"+(i===0?'on':'')+"'>"+esc(s)+"</span>").join('');
  v.innerHTML="<div class='km-card building'><b class='km-bstage' id='km-bstage'>"+esc(stages[0])+"</b>"+
    "<div class='km-build-steps' id='km-build-steps'>"+stepHtml+"</div>"+
    "<div class='km-bar'><i id='km-barfill'></i></div>"+
    "<span class='km-dim'>"+(local?"Local mode drafts up to two kernels with llama.cpp on this Space's GPU, then verifies them with the in-process referee and times them against PyTorch eager, torch.compile, and max-autotune. First run may also download/cache the GGUF.":"The model drafts up to "+k+" kernels. The referee compiles, correctness-checks, and times every candidate. First build of a session can take about 90 seconds while the model wakes.")+"</span>"+
    (replayKey?"<button class='km-inline-replay' data-replay-key='"+esc(replayKey)+"'>Watch a recorded mint while this runs</button>":"")+"</div>";
  // Honest progress: do_mint is one blocking call with no progress signal, so we cannot show a
  // live readout. Instead we walk the referee's real phases ONCE on a rough estimate, monotonically
  // (the index only ever increases), and park on the final phase until the result arrives. It never
  // loops back or resets the way the old modulo version did. A gentle pulse keeps the viz alive.
  const stageStart=[0,12,18,23,28]; // seconds at which each phase is assumed to begin
  let baseVals=null; try{const st=pipeline();baseVals=st[st.length-1].v.slice();}catch(e){}
  buildTimer=setInterval(()=>{
    const elapsed=(Date.now()-started)/1000;
    let ix=0; for(let j=0;j<stageStart.length;j++){if(elapsed>=stageStart[j])ix=j;}
    ix=Math.min(ix,stages.length-1);
    const label=(ix===stages.length-1&&elapsed>45)?'still timing vs torch.compile max-autotune...':stages[ix];
    const el=$('#km-bstage');
    if(el&&el.textContent!==label){el.textContent=label;animate(el,[{opacity:.35,transform:'translateY(3px)'},{opacity:1,transform:'none'}],{duration:220,easing:'cubic-bezier(0.16,1,0.3,1)'});}
    $$('#km-build-steps span').forEach((s,j)=>{s.classList.toggle('on',j===ix);s.classList.toggle('done',j<ix);});
    const f=$('#km-barfill');
    // The bar creeps up with elapsed time and is capped at 94% until the result arrives.
    if(f)f.style.width=Math.min(94,(Date.now()-started)/900)+'%';
    if(baseVals){const p=0.96+0.04*Math.sin(elapsed*1.5);morph(baseVals.map(x=>x*p),650);}
  },650);
}
// Stop the build-progress ticker, snap the progress bar to 100% and redraw via render().
function stopBuild(){
  if(buildTimer){
    clearInterval(buildTimer);
    buildTimer=null;
  }
  const bar=$('#km-barfill');
  if(bar){
    bar.style.width='100%';
  }
  render();
}
// Draw the leaderboard table (top 25 rows) and update the "last checked" status line.
//   rows - leaderboard entries, rendered in the order given.
//   meta - optional {fetched_at: unix seconds, sources: endpoints that answered}.
// Also cancels the pending direct-fetch fallback and re-enables the Refresh button.
function renderLB(rows,meta={}){
  const view=$('#km-lb-view');
  if(!view)return;
  if(lbUiTimer){
    clearTimeout(lbUiTimer);
    lbUiTimer=null;
  }
  setLBButton(false);
  const fetched=Number(meta.fetched_at||0);
  const stamp=fetched?new Date(fetched*1000).toLocaleTimeString([], {hour:'2-digit', minute:'2-digit', second:'2-digit'}):'just now';
  const sourceCount=Array.isArray(meta.sources)?meta.sources.length:0;
  const origin=sourceCount?' from '+sourceCount+' endpoint'+(sourceCount>1?'s':'')+'.':'.';
  setLBStatus('Last checked '+stamp+origin,'');
  if(!rows.length){
    view.innerHTML='<i>No kernels yet. Be the first.</i>';
    return;
  }
  const body=rows.slice(0,25).map((r,i)=>
    "<tr class='"+(r.champion?'champ':'')+"'><td class='km-rank'>"+(i+1)+"</td><td><code>"+esc(r.op||'kernel')+"</code></td><td>"+esc(r.name||'anonymous')+"</td><td class='hot'>"+sp2(r.speedup_compile)+"x</td><td>"+sp(r.speedup_eager)+"x</td></tr>"
  ).join('');
  view.innerHTML="<table class='km-lbt'><thead><tr><th>#</th><th>machine</th><th>builder</th><th>vs compiler</th><th>vs PyTorch</th></tr></thead><tbody>"+body+"</tbody></table>";
}
// Observe the element matching selector `id`; whenever its subtree changes (and once right
// away), decode the base64 JSON held in the data-blob attribute of its first [data-blob]
// descendant and pass the parsed value to `cb`.
function watch(id,cb){
  const host=document.querySelector(id);
  if(!host)return;
  const deliver=()=>{
    const carrier=host.querySelector('[data-blob]');
    if(!carrier)return;
    try{
      cb(JSON.parse(atob(carrier.dataset.blob)));
    }catch(e){
      // A blob that fails to decode, or a callback that throws, is dropped silently.
    }
  };
  const observer=new MutationObserver(deliver);
  observer.observe(host,{childList:true,subtree:true});
  deliver();
}
// After 250 ms, start watching the two hidden output components for result and leaderboard
// payloads.
setTimeout(()=>{
  watch('#km_result',showResult);
  watch('#km_lb',payload=>renderLB(payload.rows||[],payload));
},250);
// The visible Refresh button triggers a manual leaderboard refresh.
const rf=$('#km-lb-refresh');
if(rf){
  rf.addEventListener('click',()=>{refreshLB(true);});
}
// Phrases that select a named "classic" operation.
const CLASSIC_TERMS=[
  {key:'softmax_scale',terms:['scaled softmax','scale softmax','attention scale']},
  {key:'swiglu',terms:['swiglu','swi glu']},
  {key:'geglu',terms:['geglu','ge glu']},
  {key:'softmax',terms:['softmax']}
];
// Phrases that select an activation function.
const ACT_TERMS=[
  {key:'gelu_erf',terms:['gelu_erf','exact gelu','erf gelu']},
  {key:'relu2',terms:['relu2','relu squared','squared relu','square relu']},
  {key:'leaky_relu',terms:['leaky_relu','leaky relu','leaky']},
  {key:'silu',terms:['silu','swish','swishes']},
  {key:'sigmoid',terms:['sigmoid']},
  {key:'gelu',terms:['gelu']},
  {key:'relu',terms:['relu']},
  {key:'tanh',terms:['tanh']},
  {key:'elu',terms:['elu']},
  {key:'mish',terms:['mish']}
];
// Find which items of a term table are mentioned in `text`.
//   text  - the (already lower-cased) brief.
//   items - a table such as CLASSIC_TERMS or ACT_TERMS: [{key, terms:[...]}].
// Returns [{key, ix, len}] in order of appearance, at most one entry per key, where ix/len
// locate the phrase that matched.
function termMatches(text,items){
  // word-boundary match so 'elu' does not fire inside 'gelu', 'relu' not inside 'leaky_relu', etc.
  // (underscores and digits are word chars, so the boundary keeps relu2 and gelu_erf distinct.)
  // Every occurrence of every phrase is collected, not just the first one: otherwise a term
  // whose first occurrence sits inside a longer phrase ('relu' in 'leaky relu then relu')
  // is lost even though it also appears on its own later in the text.
  const spans=[];
  items.forEach(item=>{
    item.terms.forEach(term=>{
      const re=new RegExp('\\b'+term.replace(/[.*+?^${}()|[\]\\]/g,'\\$&')+'\\b','g');
      let m;
      while((m=re.exec(text))!==null){
        spans.push({key:item.key,ix:m.index,len:term.length});
        if(re.lastIndex===m.index)re.lastIndex++;   // guard against an empty phrase looping forever
      }
    });
  });
  spans.sort((a,b)=>a.ix-b.ix||b.len-a.len);
  // drop a shorter match that overlaps a longer one ('relu' inside 'leaky relu', 'softmax'
  // inside 'scaled softmax'), keeping the first/longest at each span.
  const kept=[];
  spans.forEach(s=>{ if(!kept.some(k=>s.ix<k.ix+k.len&&k.ix<s.ix+s.len))kept.push(s); });
  // Report each key once, at its earliest surviving position.
  const seenKeys=new Set();
  return kept.filter(s=>{
    if(seenKeys.has(s.key))return false;
    seenKeys.add(s.key);
    return true;
  });
}
// Map a free-text brief to a recipe.
// Returns {unknown:false, label, recipe} when the brief maps to something mintable, or
// {unknown:true, message} when it does not. A named classic op wins over everything else;
// otherwise the brief is read as [residual +] norm + up to two activations.
function resolveBrief(text){
  const raw=text||'';
  const lower=raw.toLowerCase();
  const classicHits=termMatches(lower,CLASSIC_TERMS);
  if(classicHits.length){
    const key=classicHits[0].key;
    return {unknown:false,label:CLASSICS[key].lbl,recipe:{mode:'classic',norm:'rmsnorm',residual:false,acts:[],classic:key,spark:raw}};
  }
  let normHit=null;
  if(lower.includes('layernorm')||lower.includes('layer norm')){
    normHit='layernorm';
  }else if(lower.includes('rmsnorm')||lower.includes('rms norm')||/\brms\b/.test(lower)){
    normHit='rmsnorm';
  }
  const residual=/residual|resid|\bskip\b|add input|\+ input/.test(lower);
  const acts=[];
  for(const hit of termMatches(lower,ACT_TERMS)){
    if(acts.length<2&&!acts.includes(hit.key))acts.push(hit.key);
  }
  if(!acts.length){
    if(!normHit){
      return {unknown:true,message:'Could not map this brief. Use RMSNorm, LayerNorm, residual, and a single supported activation, or a named op such as softmax, scaled softmax, SwiGLU, or GeGLU.'};
    }
    if(!residual){
      // A bare normalization is minted as the classic op of the same name.
      return {unknown:false,label:CLASSICS[normHit].lbl,recipe:{mode:'classic',norm:normHit,residual:false,acts:[],classic:normHit,spark:raw}};
    }
    return {unknown:true,message:'This brief needs a supported activation after the residual or normalization so the referee can verify it.'};
  }
  // With an activation but no explicit norm, the recipe defaults to rmsnorm.
  const recipe={mode:'build',norm:normHit||'rmsnorm',residual,acts,classic:'softmax',spark:raw};
  return {unknown:false,label:labelRecipe(recipe),recipe};
}
// Re-resolve the expert brief box: store the result in `resolvedExpert`, show either the
// resolved label or the explanation, and disable the expert Mint button for an unknown brief.
function resolveExpert(){
  const briefBox=$('#km-xbrief'), panel=$('#km-xresolved'), mintBtn=$('#km-xbuild');
  const valueEl=panel?panel.querySelector('.v'):null;
  if(!briefBox||!panel||!valueEl)return;
  resolvedExpert=resolveBrief(briefBox.value);
  const unknown=!!resolvedExpert.unknown;
  panel.classList.toggle('unknown',unknown);
  valueEl.textContent=resolvedExpert.unknown?resolvedExpert.message:resolvedExpert.label;
  if(mintBtn){
    mintBtn.disabled=unknown;
  }
}
// [tag label, brief text] pairs offered as quick picks, followed by one tag per activation
// in ACTS.
const OP_TAGS=[
  ['softmax','softmax over each row'],
  ['scaled_softmax','scaled softmax over each row'],
  ['swiglu','SwiGLU gated feed forward'],
  ['geglu','GeGLU gated feed forward'],
  ['rmsnorm','RMSNorm only'],
  ['layernorm','LayerNorm only'],
  ['add_rmsnorm_gelu','fused RMSNorm with residual and GELU'],
  ['layernorm_silu','fused LayerNorm with SiLU'],
  ['rmsnorm_mish','fused RMSNorm with Mish'],
  ['add_layernorm_silu','fused LayerNorm with residual and SiLU'],
  ['rmsnorm_gelu_relu2','fused RMSNorm with GELU then squared ReLU']
].concat(Object.keys(ACTS).map(k=>[k,ACTS[k].lbl+' activation']));
// Fill the container matching `sel` with one button per OP_TAGS entry. When `interactive`
// is set, clicking a tag writes its brief into the expert brief box, fires an input event
// on the box and focuses it.
function renderOpTags(sel,interactive){
  const holder=$(sel);
  if(!holder)return;
  holder.innerHTML='';
  for(const [label,brief] of OP_TAGS){
    const tag=document.createElement('button');
    tag.type='button';
    tag.className='km-optag';
    tag.textContent=label;
    tag.dataset.brief=brief;
    if(interactive){
      tag.addEventListener('click',()=>{
        const briefBox=$('#km-xbrief');
        if(!briefBox)return;
        briefBox.value=brief;
        briefBox.dispatchEvent(new Event('input',{bubbles:true}));
        briefBox.focus();
      });
    }
    holder.appendChild(tag);
  }
}
// Render the glossary: the residual entry first, then every norm, activation and classic op,
// each as a name/description pair taken from its `lbl` and `info`.
function renderGlossary(){
  const holder=$('#km-learn-blocks');
  if(!holder)return;
  const pair=v=>[v.lbl,v.info];
  const rows=[['Residual',MEM.info]].concat(
    Object.values(NORMS).map(pair),
    Object.values(ACTS).map(pair),
    Object.values(CLASSICS).map(pair)
  );
  holder.innerHTML=rows.map(([name,desc])=>"<div class='km-gloss'><b>"+esc(name)+"</b><span>"+esc(desc)+"</span></div>").join('');
}
// Wire every .km-copy button to copy the text of its enclosing .km-codeblock (without the
// button's own label) to the clipboard, then flash the outcome on the button for 900 ms.
function setupCopyButtons(){
  $$('.km-copy').forEach(btn=>{
    // The resting label is captured once, at wiring time. Reading it on each click made a
    // quick second click capture 'Copied' as the label to restore, leaving the button stuck.
    const idleLabel=btn.textContent;
    let resetTimer=null;
    const flash=label=>{
      btn.textContent=label;
      if(resetTimer)clearTimeout(resetTimer);
      resetTimer=setTimeout(()=>{btn.textContent=idleLabel;resetTimer=null;},900);
    };
    btn.addEventListener('click',()=>{
      const block=btn.closest('.km-codeblock');
      if(!block)return;
      // Work on a clone so the button can be removed from the copied text.
      const clone=block.cloneNode(true), copy=clone.querySelector('.km-copy');
      if(copy)copy.remove();
      const text=clone.textContent.trim();
      if(navigator.clipboard&&navigator.clipboard.writeText){
        // Report what actually happened instead of claiming success when the write is rejected.
        navigator.clipboard.writeText(text).then(()=>flash('Copied'),()=>flash('Copy failed'));
      }else{
        // No clipboard API available: nothing was copied.
        flash('Copy failed');
      }
    });
  });
}
// One delegated click handler on the widget root: any element carrying data-replay-key
// starts the replay stored under that key.
root.addEventListener('click',evt=>{
  const hit=evt.target.closest('[data-replay-key]');
  if(!hit||!root.contains(hit))return;
  const key=hit.dataset.replayKey;
  if(!REPLAYS[key])return;
  evt.preventDefault();
  startReplay(key);
});
// Re-resolve the expert brief on every input event of the box.
const xb=$('#km-xbrief');
if(xb){
  xb.addEventListener('input',resolveExpert);
}
| // (second-activation builder removed: the kernelsmith grammar is norm + one activation) | |
// Short press animation shared by both Mint buttons.
const pressMintButton=btn=>animate(btn,[{transform:'scale(0.97)'},{transform:'scale(1)'}],{duration:140,easing:'cubic-bezier(0.16,1,0.3,1)'});
// Build page Mint: build the recipe, lock the buttons, show the progress card, and send the
// recipe through the bridge.
const buildBtn=$('#km-build');
if(buildBtn){
  buildBtn.addEventListener('click',()=>{
    lastMintedMode='build';
    const recipe=buildRecipe();
    pressMintButton(buildBtn);
    play();
    setMinting(true);
    startBuild('build',recipe);
    bridge(recipe);
  });
}
// Expert page Mint: same flow, but the recipe comes from the resolved brief and nothing
// happens when there is no recipe.
const xBuild=$('#km-xbuild');
if(xBuild){
  xBuild.addEventListener('click',()=>{
    resolveExpert();
    const recipe=expertRecipe();
    if(!recipe)return;
    lastMintedMode='expert';
    pressMintButton(xBuild);
    setMinting(true);
    startBuild('expert',recipe);
    bridge(recipe);
  });
}
// Attempt steppers, temperature controls and Local/Pro toggles for both pages.
setupStepper('#km-tries','#km-tries-v',4,v=>{buildTries=v;});
setupStepper('#km-xtries','#km-xtries-v',5,v=>{expertTries=v;});
setupTemp('#km-temp','#km-temp-v');
setupTemp('#km-xtemp','#km-xtemp-v');
syncModePair('#km-local','#km-pro');
syncModePair('#km-xlocal','#km-xpro');
// The build and expert pages each have their own "Local (offline)" toggle. Local mode is the ONLY
// path that computes the max-autotune number, so a page in Modal mode shows just 2 comparisons.
// Keep both toggles in lockstep so build and expert always run the same way and show the same
// set of comparison numbers (the guard stops the change events from ping-ponging).
(function(){
  const buildToggle=$('#km-local'), expertToggle=$('#km-xlocal');
  if(!buildToggle||!expertToggle)return;
  const mirror=(from,to)=>from.addEventListener('change',()=>{
    if(to.checked===from.checked)return;
    to.checked=from.checked;
    to.dispatchEvent(new Event('change'));
  });
  mirror(buildToggle,expertToggle);
  mirror(expertToggle,buildToggle);
})();
// First paint, in the same order as before.
renderPickers();
renderClassics();
renderReplayChoices();
renderOpTags('#km-oplist',true);
renderOpTags('#km-docs-ops',false);
renderGlossary();
setupCopyButtons();
buildSvg();
setInfo(machineInfo());
resolveExpert();
showTab('build');
render();
| } | |
| """ | |
with gr.Blocks(title="Kernel Mint", head=f"<style>{CSS}</style>") as demo:
    gr.HTML(APP_HTML)
    # Bridge components: kept in the DOM (NOT visible=False, which removes them entirely so the
    # custom JS can't reach the inner textarea/button) and hidden with CSS (#km_recipe etc.).
    recipe_in = gr.Textbox(elem_id="km_recipe")
    result_out = gr.HTML(elem_id="km_result")
    lb_out = gr.HTML(elem_id="km_lb")
    go = gr.Button("go", elem_id="km_go")
    lb_btn = gr.Button("lb", elem_id="km_lb_btn")

    # trigger_mode="multiple": the default ("once") silently DROPS a click that lands while a
    # previous mint is still running, so minting one kernel after another needed two clicks and
    # the old result lingered. The frontend also disables the Mint button during a mint (one at
    # a time, no wasted ZeroGPU calls); this is the belt-and-suspenders so no click is ever
    # swallowed.
    go.click(
        fn=do_mint,
        inputs=recipe_in,
        outputs=result_out,
        trigger_mode="multiple",
    )

    # Leaderboard refresh is its OWN trigger, decoupled from minting. Previously fetch_lb was
    # wired to recipe_in.change, so every mint (which sets recipe_in) also fired a leaderboard
    # fetch; on a slow backend that fetch sat ahead of do_mint in Gradio's queue and the mint
    # appeared to do nothing until a second click. Now the JS clicks #km_lb_btn only when it
    # actually wants the board.
    lb_btn.click(fn=fetch_lb, inputs=None, outputs=lb_out)

    # On page load: fetch the leaderboard once, then run the custom widget script.
    demo.load(fn=fetch_lb, inputs=None, outputs=lb_out)
    demo.load(fn=None, inputs=None, outputs=None, js=APP_JS)

if __name__ == "__main__":
    port = int(os.environ.get("PORT", 7860))
    demo.launch(server_name="0.0.0.0", server_port=port)