q / index.html
Humuhumu33's picture
keep-warm pulse: hold GPU boost clock while typing so casual chat decodes at boosted rate
70592b3 verified
Raw
History Blame Contribute Delete
35.5 kB
<!doctype html><html><head><meta charset=utf8><meta name=viewport content="width=device-width,initial-scale=1,viewport-fit=cover">
<title>Q — private AI, in your browser</title>
<style>
:root{--bg:#0b0e14;--panel:#141922;--ink:#e6e9ef;--dim:#8a94a6;--q:#7c5cff;--u:#1f6feb;--line:#1e2531}
*{box-sizing:border-box}html,body{height:100%}
body{margin:0;font:15px/1.55 -apple-system,Segoe UI,Roboto,system-ui,monospace;background:var(--bg);color:var(--ink);display:flex;flex-direction:column;overscroll-behavior:none}
header{padding:10px 16px;border-bottom:1px solid var(--line);display:flex;align-items:center;gap:10px;flex:0 0 auto}
header b{font-weight:600}header .s{color:var(--dim);font-size:12px;overflow:hidden;text-overflow:ellipsis;white-space:nowrap}
#log{flex:1;overflow:auto;padding:16px;display:flex;flex-direction:column;gap:12px;-webkit-overflow-scrolling:touch}
.msg{max-width:82%;padding:9px 13px;border-radius:12px;white-space:pre-wrap;word-wrap:break-word}
.u{align-self:flex-end;background:var(--u)}.a{align-self:flex-start;background:var(--panel);border:1px solid #232b3a}
.a.think{color:var(--dim);font-style:italic}
footer{padding:12px 16px;border-top:1px solid var(--line);display:flex;gap:8px;flex:0 0 auto;padding-bottom:calc(12px + env(safe-area-inset-bottom))}
#in{flex:1;background:var(--panel);border:1px solid #232b3a;color:var(--ink);border-radius:10px;padding:10px 12px;font:inherit;resize:none;max-height:140px}
button{background:var(--q);color:#fff;border:0;border-radius:10px;padding:0 18px;font:inherit;cursor:pointer}button:disabled{opacity:.4;cursor:default}
.stat{color:var(--dim);font-size:11px;margin-top:3px}
</style></head><body>
<header><b>Q</b> <span class=s id=st>booting…</span></header>
<div id=log></div>
<footer><textarea id=in rows=1 placeholder="Message Q…" disabled></textarea><button id=send disabled>Send</button></footer>
<script type=module>
import { ready, loadModel, MODELS, defaultModelIndex } from "./core/loader.js";
import { createEngine } from "./core/engine.js";
import { selfPersona, selfFacts } from "./core/q-self.mjs"; // ONE grounded self-knowledge, shared with the messenger + voice
// ── Page handles + model selection ──
// Tiny querySelector shorthand used for the four fixed elements of the chat UI.
const $ = (s) => document.querySelector(s);
const log = $("#log"), input = $("#in"), send = $("#send"), st = $("#st");
// Append one chat bubble (`cls` is "u", "a" or "a think") to the log, scroll to the bottom, return the element.
const bubble = (cls, text = "") => {
  const d = document.createElement("div");
  d.className = "msg " + cls;
  d.textContent = text;
  log.appendChild(d);
  log.scrollTop = log.scrollHeight;
  return d;
};
const params = new URLSearchParams(location.search);
const pick = params.get("m");
// ?m=<pattern> selects a model by name. The pattern comes straight from the URL, so it may not be a valid
// regular expression ("?m=(" or "?m=c++"): compile it ONCE, and if it does not compile fall back to a
// case-insensitive substring match instead of throwing here (this runs outside any try/catch, so a throw
// would leave the page stuck on "booting…").
const nameMatcher = (pattern) => {
  try {
    const rx = new RegExp(pattern, "i");
    return (name) => rx.test(name);
  } catch {
    const needle = pattern.toLowerCase();
    return (name) => String(name).toLowerCase().includes(needle);
  }
};
const matchesPick = pick ? nameMatcher(pick) : null;
// No ?m → the loader's default model; ?m with no match → the first model in the list.
let m = pick ? (MODELS.find((x) => matchesPick(x.name)) || MODELS[0]) : MODELS[defaultModelIndex()];
// STREAM FROM HF: a bare link streams the HOLOGRAMTECH BitNet κ-object from Hugging Face; ?hf=<org/repo> or
// ?kappa=<absolute-url> override. The κ-object is content-addressed + pinned, so the host is an UNTRUSTED CDN —
// every block is re-derived (Law L5); a bad byte is rejected. Blocks cache locally after first load (0-net on return).
{
  let hf = params.get("hf");
  const kappa = params.get("kappa");
  if (!hf && !kappa && !pick) hf = "HOLOGRAMTECH/q-bitnet-2b"; // bare link → stream BitNet from Hologram's HF repo
  if (hf || kappa) {
    // ?kappa wins over ?hf for the base URL; trailing slashes are stripped.
    const base = (kappa || `https://huggingface.co/${hf}/resolve/main`).replace(/\/+$/, "");
    const bit = MODELS.find((x) => (x.fam || "").toLowerCase() === "bitnet") || m;
    // Display label only. A relative or malformed ?kappa must not throw here (this runs outside any
    // try/catch): resolve against the page URL, and if even that fails show the raw value.
    let via = hf;
    if (!via) {
      try { via = new URL(base, location.href).host; } catch { via = base; }
    }
    // drop the model's SHA-256 manifest pin: a ?hf repo may carry a different (e.g. BLAKE3) manifest, so the
    // stale pin would falsely reject it. Per-block κ verification still applies (untrusted-CDN-safe).
    m = { ...bit, kappaUrl: base, manifestKappa: undefined, name: bit.name + " · via " + via };
  }
}
// ?stream=layer → page the model layer-by-layer instead of resident. For t2 (BitNet) this exercises the DRAFT
// t2-streaming engine path (correctness first; the .qvf remote path adds fast-first-token). Default = resident.
{
  // Any ?stream value other than "false"/"resident" is passed through to the model descriptor as-is.
  const sm = params.get("stream");
  if (sm && sm !== "false" && sm !== "resident") m = { ...m, stream: sm };
}
// ?verify=gpu → re-derive each BLAKE3 weight-block κ ENTIRELY on the GPU (2.74 GB/s) instead of pure-JS BLAKE3.
if (params.get("verify") === "gpu") globalThis.__gpuVerify = true;
// ?spec → speculative decode (n-gram draft + batched-K verify). Byte-identical to greedy; big wins on echo-heavy
// text (code/quote/retrieval), no gain on free-form chat. ?bench=spec runs the A/B measurement harness after load.
{
  // URLSearchParams.get() returns "" for a bare `?spec` and null when the key is absent, so a truthiness
  // test would ignore the documented bare form. Presence enables it; "false"/"0" are an explicit opt-out
  // (mirrors how ?stream treats "false").
  const spec = params.get("spec");
  const specOn = spec !== null && spec !== "false" && spec !== "0";
  if (specOn || params.get("bench") === "spec") globalThis.__spec = true;
}
// GROUND the model as on-device Q (a base/instruct model has NO self-knowledge — without this it confabulates
// a generic "I run on OpenAI/AWS cloud servers" identity, which is false). Injected as the SYSTEM turn.
function frameSystem() {
  // Build the SYSTEM turn: the shared persona text wrapped in the chat template of the active model
  // family. The first family flag set on `m` wins; with no flag the persona is followed by a blank line.
  const persona = selfPersona({ model: m, engine });
  const templates = [
    ["llama3", (p) => `<|start_header_id|>system<|end_header_id|>\n\n${p}<|eot_id|>`],
    ["qwen", (p) => `<|im_start|>system\n${p}<|im_end|>\n`],
    ["olmo", (p) => `<|system|>\n${p}\n`],
  ];
  const match = templates.find(([flag]) => m[flag]);
  return match ? match[1](persona) : persona + "\n\n";
}
// GROUNDED IDENTITY (the anti-confabulation guard). A 2B model reverts to its training prior — "I'm GPT-3.5 on
// AWS" — when asked what/where it is, no matter the system prompt. But identity is not a guess: it is the TRUTH
// of THIS running instance. So provenance questions are answered DETERMINISTICALLY from the live facts (the real
// resident model + its κ + the real host it streamed from), never from the model. Grounded, not performed.
// Case-insensitive alternation tested against the raw user text in generate(); any match short-circuits the
// model and returns groundedIdentity() instead.
// NOTE(review): several branches are broad — e.g. `\bwho\b.*\b(made|built|created…)\b` also matches
// "who created Python?" and `\bwhat…\b.*\b(model|ai…)\b` matches "what is a good model for regression?" —
// so unrelated questions can receive the canned identity reply. Confirm this trade-off is intended.
const IDENTITY_RX = /\b(are|r)\s*(you|u)\b.*\b(gpt|chatgpt|openai|claude|anthropic|gemini|bard|llama|language model|an? ai|running|local|on[- ]?device|in the browser|on (a )?server|in the cloud|hosted)\b|\bwhat( kind of| sort of| type of)?\b.*\b(model|llm|ai|are you|based on|powered by|architecture|run on|running)\b|\bwho\b.*\b(are you|made|built|created|trained|develop)\b|\bwhere\b.*\b(run|running|host|hosted|are you|live|located)\b|\bpowered by\b|\bwhat are you\b|\b(openai|chatgpt|gpt-?\d|aws|amazon web|google cloud|cloud server)\b|\b(local|cloud|server)\b.*\bmodel\b|\bdo you run\b/i;
function groundedIdentity() {
  // Deterministic identity answer assembled from the live facts of this instance (no model call).
  const facts = selfFacts({ model: m, engine });
  // Model label: reported fact first, then the selected descriptor's name, then a generic phrase.
  const modelName = facts.model || (m && m.name) || "an on-device model";
  // A missing or "local" weight source is reported as "Hugging Face".
  const hasRemoteSource = facts.weightsFrom && !/^local$/i.test(facts.weightsFrom);
  const source = hasRemoteSource ? facts.weightsFrom : "Hugging Face";
  const quantNote = facts.quant ? ` (${facts.quant})` : "";
  const sentences = [
    `I'm Q. I run the ${modelName}${quantNote} entirely in your browser on WebGPU — not GPT, not OpenAI, and not on any server or cloud. `,
    `My weights streamed from ${source} and are content-addressed: every block is re-derived byte-for-byte as it loads, so nothing can be tampered with and no host has to be trusted. `,
    `Once I'm loaded, nothing you type ever leaves your device.`,
  ];
  return sentences.join("");
}
// Session state shared by the functions in this script:
//   engine  — set from createEngine() once the model has loaded (null until then)
//   convIds — token ids of the conversation so far (replaced with res.ids after each generated turn)
//   busy    — true while generate() is running
//   armed   — true once the engine is ready to take turns
//   pending — text submitted before the model was armed, replayed by the boot sequence
let engine = null, convIds = [], busy = false, armed = false, pending = null;
// Unlock the composer immediately (the markup ships it disabled) so the user can type while the model loads.
input.disabled = send.disabled = false; input.placeholder = "Message Q… (model loading — will send the moment it's ready)"; input.focus();
/**
 * Run one chat turn: show the user bubble (unless `skipUser`), then either answer from the grounded
 * identity text or stream a reply from the engine into a new assistant bubble.
 * @param {string} text - the user's message
 * @param {boolean} [skipUser] - true when the user bubble is already on screen
 */
async function generate(text, skipUser) {
  // Shared "turn finished" routine: clear the busy flag, re-enable the composer, refocus it.
  const release = () => {
    busy = false;
    input.disabled = send.disabled = false;
    input.focus();
  };
  // Small footer line rendered under a reply bubble.
  const makeStatLine = () => {
    const line = document.createElement("div");
    line.className = "stat";
    return line;
  };

  busy = true;
  input.disabled = send.disabled = true;
  if (!skipUser) bubble("u", text);

  // Answer identity/provenance questions from the grounded truth, not the model's confabulation.
  if (IDENTITY_RX.test(text)) {
    const grounded = bubble("a", groundedIdentity());
    const note = makeStatLine();
    note.textContent = "grounded · from this instance";
    grounded.after(note);
    release();
    return;
  }

  const reply = bubble("a think", "…");
  let awaitingFirst = true;
  const statLine = makeStatLine();

  // Streaming callback: the first non-empty text swaps the placeholder styling for the real reply;
  // every call then mirrors the reported text into the bubble and refreshes the (detached) stat line.
  const onToken = ({ text: shown, stats }) => {
    if (awaitingFirst && shown) {
      reply.classList.remove("think");
      reply.textContent = "";
      awaitingFirst = false;
    }
    reply.textContent = shown;
    log.scrollTop = log.scrollHeight;
    if (!stats) return;
    const rate = stats.tokps ? stats.tokps.toFixed(0) + " tok/s" : "";
    const gpuMs = stats.msExec ? " · " + stats.msExec.toFixed(1) + "ms GPU/tok" : "";
    const ttft = stats.ttft ? " · TTFT " + Math.round(stats.ttft) + "ms" : "";
    statLine.textContent = rate + gpuMs + ttft;
  };

  try {
    // Only the opening turn carries the system prompt (and BOS, when the model wants one).
    const openingTurn = convIds.length === 0;
    let framed = engine.frameTurn(text, !openingTurn);
    if (openingTurn) framed = frameSystem() + framed;
    let turnIds = engine.tokenize(framed);
    if (m.bos && engine.bosId != null && openingTurn) turnIds = [engine.bosId, ...turnIds];
    const result = await engine.generate(convIds.concat(turnIds), { maxNew: m.cap || 256, onToken });
    if (awaitingFirst) {
      // Nothing was streamed: fall back to the final text.
      reply.classList.remove("think");
      reply.textContent = result.text || "(no output)";
    }
    convIds = result.ids;
    reply.after(statLine); // the stat line is attached only after a successful turn
  } catch (err) {
    reply.classList.remove("think");
    reply.textContent = "⚠ " + err.message;
  }
  release();
}
// Warm the GPU to boost clock, then PRIME the system-prompt KV so the first real turn reuses it instead of
// re-prefilling ~140 tokens cold (that was the ~12s TTFT). sync() keeps the cache only if it's a clean prefix
// of the first turn — frameSystem() ends on the atomic <|eot_id|> token, so tokenize(system) is exactly that.
/**
 * Best-effort warm-up: run a short throwaway decode, then prefill the system prompt into the KV cache.
 * Does nothing when the raw GPU handle lacks sync/generate/reset; any failure is logged, never thrown.
 */
async function warmUp() {
  const gpu = engine && engine._gpu;
  if (!gpu || !gpu.sync || !gpu.generate || !gpu.reset) return;
  try {
    // throwaway via decode() (not generate()) so it BUILDS the batched-decode pipelines now — else the first real
    // message pays ~1.7s of one-time shader compilation. Also boosts the GPU clock.
    gpu.reset();
    // Invoke with `gpu` as the receiver: calling `(gpu.decode || gpu.generate)(…)` detaches the method, so an
    // implementation that reads `this` would throw into the catch below and silently skip the priming step.
    const throwaway = gpu.decode || gpu.generate;
    await throwaway.call(gpu, engine.tokenize("Hello there."), 24, m.rep ?? 1.3);
    gpu.reset();
    let primeIds = engine.tokenize(frameSystem());
    if (m.bos && engine.bosId != null) primeIds = [engine.bosId, ...primeIds];
    await gpu.sync(primeIds, true); // prefill the system prompt into KV; leaves cache == primeIds
  } catch (e) { console.warn("warmUp", e); }
}
// STATIC grounded greeting — no model call (so it can't confabulate a generic "friendly AI assistant" line, and
// it doesn't disturb the primed system-prompt cache). Instant.
async function proactiveGreeting() {
  // Static greeting built from the instance facts — no model call. Leaves the composer unlocked and focused.
  const facts = selfFacts({ model: m, engine });
  const remote = facts.weightsFrom && !/^local$/i.test(facts.weightsFrom);
  const source = remote ? facts.weightsFrom : "Hugging Face";
  const gpuNote = facts.gpu ? " on your GPU" : "";
  const greeting = `Hey — I'm Q, running entirely in your browser${gpuNote}, no server. My weights streamed from ${source} and are verified by re-derivation, so nothing you type ever leaves your device. What can I help you with?`;
  bubble("a", greeting);
  busy = false;
  send.disabled = false;
  input.disabled = false;
  input.focus();
}
/**
 * Submit the composer text. Once the model is armed the text goes straight to generate(); before that it
 * is parked in `pending` for the boot sequence to replay.
 */
function onSend() {
  const text = input.value.trim();
  if (!text || busy) return;
  input.value = "";
  input.style.height = "auto";
  if (armed) { generate(text); return; }
  // Not armed yet: show the user's message now and queue it.
  bubble("u", text);
  if (pending == null) {
    pending = text;
    // One placeholder per load — the boot sequence removes it when it replays `pending`.
    const w = bubble("a think", "…starting the model, one moment…");
    w.dataset.pending = "1";
  } else {
    // A message is already queued: append instead of overwriting it, so the earlier message is not
    // silently dropped and no second placeholder is left behind after start-up.
    pending = pending + "\n" + text;
  }
}
// Composer wiring: click or plain Enter submits (Shift+Enter keeps the newline); typing auto-grows the box.
send.onclick = onSend;
input.onkeydown = (ev) => {
  const plainEnter = ev.key === "Enter" && !ev.shiftKey;
  if (!plainEnter) return;
  ev.preventDefault();
  onSend();
};
input.oninput = () => {
  // Collapse first so scrollHeight reflects the content, then grow up to the 140px cap.
  const box = input.style;
  box.height = "auto";
  box.height = Math.min(140, input.scrollHeight) + "px";
};
// KEEP-WARM: casual chat lets the GPU cool between messages, so each reply decodes at a medium clock (~40ms/tok)
// instead of boosted (~20ms). While the user is typing/focused, run a tiny THROWAWAY GPU compute (scratch buffer,
// never touches the model KV) to hold the clock up → the next message decodes boosted. Pauses during generation
// and when the tab is hidden. This is the idle-cooling fix for sustained high tok/s in normal use.
// Keep-warm state: the device, pipeline and bind group are built lazily by ensureWarmKit();
// _warmBusy guards against overlapping pulses, _warmTimer debounces the keydown trigger.
let _warmDev = null, _warmPipe = null, _warmBg = null, _warmBusy = false, _warmTimer = null;
/**
 * Lazily build the throwaway compute pipeline and its private scratch buffer.
 * @returns {boolean} true when the kit is fully built, false when no device is available or creation failed
 */
function ensureWarmKit() {
  if (_warmPipe && _warmBg) return true;
  const dev = engine && engine._gpu && engine._gpu._dev && engine._gpu._dev();
  if (!dev) return false;
  const WGSL = [
    "@group(0) @binding(0) var<storage,read_write> b: array<vec4<f32>>;",
    "@compute @workgroup_size(64) fn main(@builtin(global_invocation_id) g:vec3<u32>){",
    "let i = g.x % 65536u; var a = b[i]; for(var k=0u;k<1024u;k++){ a = a*1.0000001 + vec4<f32>(0.5); } b[i] = a; }",
  ].join("\n");
  try {
    // Build everything into locals and commit only once ALL pieces exist. Committing the pipeline first
    // meant a failure in createBuffer/createBindGroup left a half-built kit that later calls reported
    // as ready (pipeline set, bind group still null).
    const pipe = dev.createComputePipeline({ layout: "auto", compute: { module: dev.createShaderModule({ code: WGSL }), entryPoint: "main" } });
    const buf = dev.createBuffer({ size: 65536 * 16, usage: GPUBufferUsage.STORAGE });
    const bg = dev.createBindGroup({ layout: pipe.getBindGroupLayout(0), entries: [{ binding: 0, resource: { buffer: buf } }] });
    _warmDev = dev;
    _warmPipe = pipe;
    _warmBg = bg;
  } catch (e) { console.warn("warmkit", e); return false; }
  return true;
}
/**
 * Submit throwaway compute passes for roughly `ms` milliseconds, one at a time, stopping early as soon as
 * real inference starts or the tab is hidden. Best effort: errors are ignored.
 * @param {number} [ms=220] - pulse duration in milliseconds
 */
async function warmPulse(ms = 220) {
  // never compete with real inference
  const blocked = busy || _warmBusy || document.hidden || !armed;
  if (blocked) return;
  if (!ensureWarmKit()) return;

  // Encode and submit a single pass over the scratch buffer.
  const submitOnce = () => {
    const encoder = _warmDev.createCommandEncoder();
    const pass = encoder.beginComputePass();
    pass.setPipeline(_warmPipe);
    pass.setBindGroup(0, _warmBg);
    pass.dispatchWorkgroups(4096);
    pass.end();
    _warmDev.queue.submit([encoder.finish()]);
  };

  _warmBusy = true;
  try {
    const deadline = performance.now() + ms;
    for (;;) {
      if (!(performance.now() < deadline)) break;
      if (busy || document.hidden) break;
      submitOnce();
      await _warmDev.queue.onSubmittedWorkDone();
    }
  } catch (err) {
    /* ignore — best effort */
  } finally {
    _warmBusy = false;
  }
}
// Keep-warm triggers: a longer pulse when the composer gains focus, and a 50 ms-debounced pulse per keystroke.
input.addEventListener("focus", () => {
  return warmPulse(300);
});
input.addEventListener("keydown", () => {
  clearTimeout(_warmTimer);
  const firePulse = () => warmPulse(220);
  _warmTimer = setTimeout(firePulse, 50);
});
// ── SPEC-DECODE A/B HARNESS (?bench=spec) ── measures baseline greedy vs speculative on the operator's real
// GPU across echo-heavy and free-form prompts: byte-identical check (G1), mean accepted tokens/verify (G2),
// and decode tok/s for both. One load, one table — the honest verdict on whether spec-decode earns its place.
// Prompts for runSpecBench(): `tag` is the row label shown in the results table, `text` is the user turn
// passed to engine.frameTurn(). Two echo-heavy prompts and one free-form prompt.
const SPEC_BENCH = [
{ tag: "code / edit (echo-heavy)", text: "Here is a function:\n\nfunction add(a, b) {\n return a + b;\n}\n\nRewrite it exactly the same but rename add to sum." },
{ tag: "retrieval / quote", text: "Passage: \"The quick brown fox jumps over the lazy dog near the river bank at dawn.\" Repeat that passage back to me word for word." },
{ tag: "free-form chat", text: "In one short sentence, why is the sky blue?" },
];
// Element-wise strict equality of two arrays (same length, same values in the same order).
const eqArr = (a, b) => {
  if (a.length !== b.length) return false;
  const firstMismatch = a.some((value, idx) => value !== b[idx]);
  return !firstMismatch;
};
// Spec-decode A/B harness. For each SPEC_BENCH prompt it runs the same token ids twice — once with
// globalThis.__spec = false, once with true — resetting the engine before each run, then replaces the chat
// log with a results table and logs the raw rows to the console. The composer is disabled for the run and
// is not re-enabled here.
async function runSpecBench() {
log.innerHTML = ""; input.disabled = send.disabled = true;
// Bail out (with an explanatory bubble) when the engine reports no spec support.
if (!engine.specAvailable) { st.textContent = "spec-decode unavailable for this model"; bubble("a", "This model can't use the batched-verify head (specAvailable=false)."); return; }
const N = 192, rep = m.rep ?? 1.3, rows = []; // long gen so DECODE dominates (prefill was the old confound)
// Last seen spec counters; per-prompt figures are computed as deltas against these.
// NOTE(review): this treats stats.spec as cumulative across runs, while runDiscreteBench reads the same
// counters as absolute values after engine.reset() — confirm which matches the engine.
let prev = { windows: 0, drafted: 0, accepted: 0 };
// warm up to boost the GPU clock (cold vs boosted differs ~2.4×) — measure only when warm
st.textContent = "warming up (boosting GPU clock)…";
globalThis.__spec = false; engine.reset();
await engine.generate(engine.tokenize(engine.frameTurn("Write one sentence about the sea.", false)), { maxNew: 48, repPenalty: rep });
for (const b of SPEC_BENCH) {
st.textContent = `bench: ${b.tag}…`;
const ids = engine.tokenize(engine.frameTurn(b.text, false)); // NO long system prompt — keep prefill small
// Baseline run (spec off), wall-clock timed.
globalThis.__spec = false; engine.reset();
const t0 = performance.now(); const r0 = await engine.generate(ids.slice(), { maxNew: N, repPenalty: rep }); const w0 = performance.now() - t0;
// Speculative run (spec on) over a copy of the same ids, wall-clock timed.
globalThis.__spec = true; engine.reset();
const t1 = performance.now(); const r1 = await engine.generate(ids.slice(), { maxNew: N, repPenalty: rep }); const w1 = performance.now() - t1;
globalThis.__spec = false;
// When the run reports no spec stats, fall back to `prev`, which makes every delta below zero.
const cur = (r1.stats && r1.stats.spec) || prev;
const dd = { windows: cur.windows - prev.windows, drafted: cur.drafted - prev.drafted, accepted: cur.accepted - prev.accepted }; prev = { windows: cur.windows, drafted: cur.drafted, accepted: cur.accepted };
// same     — outputs token-identical between the two runs
// *Tok     — output tokens per wall-clock second (prefill time included)
// perVerify — 1 + accepted/windows; accept — accepted/drafted (both 0 when the denominator is 0)
rows.push({
tag: b.tag, same: eqArr(r0.outIds, r1.outIds), nB: r0.outIds.length, nS: r1.outIds.length,
baseTok: r0.outIds.length / (w0 / 1000), specTok: r1.outIds.length / (w1 / 1000),
perVerify: dd.windows ? 1 + dd.accepted / dd.windows : 0, accept: dd.drafted ? dd.accepted / dd.drafted : 0, windows: dd.windows,
});
}
const allSame = rows.every((r) => r.same);
// One decimal below 10, none above.
const fmt = (x) => x.toFixed(x < 10 ? 1 : 0);
// Results table. The speedup cell is green above 1.05×, red below 0.95×, dim otherwise.
const tbl = `<div style="font-family:ui-monospace,monospace;font-size:13px;max-width:900px;margin:0 auto;padding:8px">
<div style="font-size:18px;font-weight:700;margin-bottom:4px">Speculative decode — measured on your GPU</div>
<div style="color:var(--dim);margin-bottom:12px">BitNet-2B · n-gram draft + batched-K verify · greedy, byte-exact by construction</div>
<table style="width:100%;border-collapse:collapse">
<tr style="color:var(--dim);text-align:left"><th style="padding:6px 8px">workload</th><th style="padding:6px 8px;text-align:right">baseline</th><th style="padding:6px 8px;text-align:right">spec</th><th style="padding:6px 8px;text-align:right">speedup</th><th style="padding:6px 8px;text-align:right">tok/verify</th><th style="padding:6px 8px;text-align:right">accept</th><th style="padding:6px 8px;text-align:right">byte-exact</th></tr>
${rows.map((r) => `<tr style="border-top:1px solid var(--line)"><td style="padding:6px 8px">${r.tag}</td><td style="padding:6px 8px;text-align:right">${fmt(r.baseTok)} tok/s</td><td style="padding:6px 8px;text-align:right">${fmt(r.specTok)} tok/s</td><td style="padding:6px 8px;text-align:right;color:${r.specTok > r.baseTok * 1.05 ? "#48c26c" : r.specTok < r.baseTok * 0.95 ? "#f0616d" : "var(--dim)"}">${(r.specTok / r.baseTok).toFixed(2)}×</td><td style="padding:6px 8px;text-align:right">${r.perVerify.toFixed(2)}</td><td style="padding:6px 8px;text-align:right">${(r.accept * 100).toFixed(0)}%</td><td style="padding:6px 8px;text-align:right;color:${r.same ? "#48c26c" : "#f0616d"}">${r.same ? "✓ identical" : "✗ DIVERGED"}</td></tr>`).join("")}
</table>
<div style="margin-top:14px;font-weight:600;color:${allSame ? "#48c26c" : "#f0616d"}">${allSame ? "✓ G1 PASS — spec output is byte-identical to greedy on every prompt." : "✗ G1 FAIL — spec diverged from greedy; not shippable until fixed (see console)."}</div>
</div>`;
log.innerHTML = tbl;
st.textContent = "spec-decode bench · done";
console.log("[specbench]", rows);
}
// ── LIVE DECODE PROFILE (?bench=perf) ── warms the GPU to boost clock, then measures the REAL decode path
// (engine.generate) steady-state tok/s at constant clock — separating "is there a lever left" from the boost-clock
// noise that makes cold vs warm runs differ ~2.4×. Compares to the 220 tok/s bandwidth roofline.
// Live decode profile. One 64-token warm-up run, then three timed 128-token runs of the same prompt
// (system prompt included) with spec-decode off; renders a per-run table plus a verdict comparing the best
// reported steady rate against two hard-coded reference figures. The composer stays disabled afterwards.
async function runPerfBench() {
log.innerHTML = ""; input.disabled = send.disabled = true;
const rep = m.rep ?? 1.3;
const ids = engine.tokenize(frameSystem() + engine.frameTurn("Write a detailed paragraph about how ocean currents move heat around the planet.", false));
globalThis.__spec = false;
st.textContent = "warming up (boosting GPU clock)…";
engine.reset(); await engine.generate(ids.slice(), { maxNew: 64, repPenalty: rep }); // warmup → boost clock + warm caches
const runs = [];
for (let i = 0; i < 3; i++) {
st.textContent = `measuring run ${i + 1}/3…`;
engine.reset();
const t0 = performance.now();
const r = await engine.generate(ids.slice(), { maxNew: 128, repPenalty: rep });
const dt = performance.now() - t0;
// e2e = output tokens over wall-clock time; steady / msExec are taken from the engine's own stats (0 when absent).
runs.push({ n: r.outIds.length, wall: dt, e2e: r.outIds.length / (dt / 1000), steady: (r.stats && r.stats.tokps) || 0, msExec: (r.stats && r.stats.msExec) || 0 });
}
// Best run = highest engine-reported steady rate (sorted on a copy so `runs` keeps its run order).
const best = runs.slice().sort((a, b) => b.steady - a.steady)[0];
const ROOF = 220, KERNEL = 158; // measured: bandwidth roofline · boosted sustained single-matmul
const pct = 100 * best.steady / ROOF, msTok = best.steady ? 1000 / best.steady : 0;
// "Near" the kernel rate means at least 60% of KERNEL; this only selects the verdict text and colour.
const near = best.steady >= 0.6 * KERNEL;
const tbl = `<div style="font-family:ui-monospace,monospace;font-size:13px;max-width:860px;margin:0 auto;padding:8px">
<div style="font-size:18px;font-weight:700;margin-bottom:4px">Live decode — measured at boosted clock</div>
<div style="color:var(--dim);margin-bottom:12px">BitNet-2B · real engine.generate path · warmed then timed ×3 · bandwidth roofline 220 tok/s</div>
<table style="width:100%;border-collapse:collapse">
<tr style="color:var(--dim);text-align:left"><th style="padding:6px 8px">run</th><th style="padding:6px 8px;text-align:right">steady tok/s</th><th style="padding:6px 8px;text-align:right">end-to-end tok/s</th><th style="padding:6px 8px;text-align:right">ms/token</th><th style="padding:6px 8px;text-align:right">GPU ms/tok</th></tr>
${runs.map((r, i) => `<tr style="border-top:1px solid var(--line)"><td style="padding:6px 8px">run ${i + 1}</td><td style="padding:6px 8px;text-align:right">${r.steady.toFixed(0)}</td><td style="padding:6px 8px;text-align:right">${r.e2e.toFixed(0)}</td><td style="padding:6px 8px;text-align:right">${(r.steady ? 1000 / r.steady : 0).toFixed(1)}</td><td style="padding:6px 8px;text-align:right">${r.msExec ? r.msExec.toFixed(1) : "—"}</td></tr>`).join("")}
</table>
<div style="margin-top:12px">Best steady: <b>${best.steady.toFixed(0)} tok/s</b> = <b>${pct.toFixed(0)}%</b> of the 220 bandwidth roofline (sustained single-matmul reference ≈ ${KERNEL} tok/s).</div>
<div style="margin-top:10px;font-weight:600;color:${near ? "#48c26c" : "#e0a94a"}">${near
? "✓ Live decode is near the sustained-kernel rate — little recoverable overhead. The kernel/roofline is the ceiling; further tok/s needs fewer weight-bytes (lower-bit/MoE), spec-decode on echo text, or more bandwidth (discrete GPU)."
: `⚠ Live decode (${best.steady.toFixed(0)}) sits well below the sustained kernel (~${KERNEL}) at the SAME clock — the gap is per-token CPU round-trips (fences / JS embed / detokenize) letting the GPU idle. Decode-loop saturation is the real lever, and it's what also unlocks spec-decode's ~free batched verify.`}</div>
</div>`;
log.innerHTML = tbl; st.textContent = "live decode profile · done"; console.log("[perfbench]", runs);
}
// ── DISCRETE-GPU VALIDATION (?bench=discrete) ── the whole thesis in one page, on whatever GPU opens it:
// (1) real VRAM bandwidth + decode roofline, (2) live BitNet tok/s warmed, (3) does spec-decode FLIP from the
// iGPU loss to a win once bandwidth-bound? Confirms the discrete GPU (bandwidth >350 GB/s) is actually in use.
// WGSL for measureVramBW(). BW_FILL: each invocation walks the u32 buffer from its global id in steps of
// P.y, writing i*2654435761+1 into d[i] for every i below P.x.
const BW_FILL = `@group(0) @binding(0) var<storage,read_write> d: array<u32>;
@group(0) @binding(1) var<uniform> P: vec4<u32>;
@compute @workgroup_size(256) fn main(@builtin(global_invocation_id) g:vec3<u32>){ let n=P.x; var i=g.x; loop{ if(i>=n){break;} d[i]=(i*2654435761u+1u); i=i+P.y; } }`;
// BW_READ: each invocation XOR-accumulates the vec4<u32> elements it visits (same start/stride scheme,
// count in P.x, stride in P.y) and writes the folded result to sink[g.x].
const BW_READ = `@group(0) @binding(0) var<storage,read> d: array<vec4<u32>>;
@group(0) @binding(1) var<storage,read_write> sink: array<u32>;
@group(0) @binding(2) var<uniform> P: vec4<u32>;
@compute @workgroup_size(256) fn main(@builtin(global_invocation_id) g:vec3<u32>){ let n=P.x; let stride=P.y; var acc=vec4<u32>(0u); var i=g.x; loop{ if(i>=n){break;} acc=acc^d[i]; i=i+stride; } sink[g.x]=acc.x^acc.y^acc.z^acc.w; }`;
/**
 * Measure read bandwidth of GPU storage memory: fill one large buffer, then time batches of 32 full-buffer
 * read passes and keep the fastest batch.
 * @param {GPUDevice} dev - device to measure on
 * @returns {Promise<number>} GiB read per second in the fastest batch
 */
async function measureVramBW(dev) {
  const L = dev.limits;
  // Test buffer: as large as the device allows, capped at 512 MiB, rounded down to whole 16-byte vectors.
  const bytes = Math.floor(Math.min(L.maxStorageBufferBindingSize, L.maxBufferSize, 512 * 1024 * 1024) / 16) * 16, nVec = bytes / 16;
  const wg = Math.min(L.maxComputeWorkgroupsPerDimension, 65535), TOTAL = wg * 256; // TOTAL = invocations per dispatch = loop stride
  // Every buffer is tracked so ALL of them (including the uniform buffer, which was never destroyed
  // before) are released, even when a later step throws and the measurement is abandoned.
  const owned = [];
  const alloc = (desc) => { const b = dev.createBuffer(desc); owned.push(b); return b; };
  try {
    const buf = alloc({ size: bytes, usage: GPUBufferUsage.STORAGE });
    const sink = alloc({ size: TOTAL * 4, usage: GPUBufferUsage.STORAGE });
    const P = alloc({ size: 16, usage: GPUBufferUsage.UNIFORM | GPUBufferUsage.COPY_DST });

    // Fill pass: element count in u32 units.
    dev.queue.writeBuffer(P, 0, new Uint32Array([bytes / 4, TOTAL, 0, 0]));
    const fp = dev.createComputePipeline({ layout: "auto", compute: { module: dev.createShaderModule({ code: BW_FILL }), entryPoint: "main" } });
    const fbg = dev.createBindGroup({ layout: fp.getBindGroupLayout(0), entries: [{ binding: 0, resource: { buffer: buf } }, { binding: 1, resource: { buffer: P } }] });
    {
      const e = dev.createCommandEncoder();
      const p = e.beginComputePass();
      p.setPipeline(fp); p.setBindGroup(0, fbg); p.dispatchWorkgroups(wg); p.end();
      dev.queue.submit([e.finish()]);
      await dev.queue.onSubmittedWorkDone();
    }

    // Read passes: element count in vec4 units.
    dev.queue.writeBuffer(P, 0, new Uint32Array([nVec, TOTAL, 0, 0]));
    const rp = dev.createComputePipeline({ layout: "auto", compute: { module: dev.createShaderModule({ code: BW_READ }), entryPoint: "main" } });
    const rbg = dev.createBindGroup({ layout: rp.getBindGroupLayout(0), entries: [{ binding: 0, resource: { buffer: buf } }, { binding: 1, resource: { buffer: sink } }, { binding: 2, resource: { buffer: P } }] });
    // Encode `passes` read passes, then time submit → completion (encoding itself is not timed).
    const run = async (passes) => {
      const e = dev.createCommandEncoder();
      for (let k = 0; k < passes; k++) {
        const p = e.beginComputePass();
        p.setPipeline(rp); p.setBindGroup(0, rbg); p.dispatchWorkgroups(wg); p.end();
      }
      const t0 = performance.now();
      dev.queue.submit([e.finish()]);
      await dev.queue.onSubmittedWorkDone();
      return performance.now() - t0;
    };
    await run(4); // untimed warm-up
    let best = 1e9;
    for (let k = 0; k < 5; k++) best = Math.min(best, await run(32));
    return (bytes * 32 / 1073741824) / (best / 1000);
  } finally {
    for (const b of owned) b.destroy();
  }
}
// Discrete-GPU validation page: measures memory bandwidth, warmed live decode rate and the spec-decode
// speedup on one retrieval prompt, then replaces the chat log with a summary table and verdict.
// The composer stays disabled afterwards.
async function runDiscreteBench() {
log.innerHTML = ""; input.disabled = send.disabled = true;
const rep = m.rep ?? 1.3, gpu = engine._gpu, dev = gpu && gpu._dev && gpu._dev();
// Adapter label is informational only — any failure leaves it as "unknown".
// NOTE(review): this queries a fresh high-performance adapter, which is not necessarily the one behind `dev`.
let adapterStr = "unknown";
try { const a = await navigator.gpu.requestAdapter({ powerPreference: "high-performance" }); const inf = (a && a.info) || {}; adapterStr = ((inf.vendor || "") + " " + (inf.architecture || "") + " " + (inf.device || "")).trim() || "unknown"; } catch {}
st.textContent = "measuring VRAM bandwidth…";
// A failed measurement (including a missing device) is logged and leaves gbps at 0.
let gbps = 0; try { gbps = await measureVramBW(dev); } catch (e) { console.warn("bw", e); }
const roofTok = gbps / 0.69, discrete = gbps > 350;
st.textContent = "warming + measuring live decode…";
globalThis.__spec = false;
engine.reset(); await engine.generate(engine.tokenize(engine.frameTurn("Write one sentence about the sea.", false)), { maxNew: 48, repPenalty: rep });
// Best engine-reported steady rate over three runs.
let live = 0;
for (let i = 0; i < 3; i++) { engine.reset(); const r = await engine.generate(engine.tokenize(engine.frameTurn("Write a detailed paragraph about ocean currents.", false)), { maxNew: 128, repPenalty: rep }); live = Math.max(live, (r.stats && r.stats.tokps) || 0); }
// With no bandwidth figure the roofline is 0, and dividing by it printed "Infinity%" / "NaN%".
const roofPct = roofTok > 0 ? `${(100*live/roofTok).toFixed(0)}%` : "n/a";
st.textContent = "measuring spec-decode flip…";
const sIds = engine.tokenize(engine.frameTurn("Passage: \"The quick brown fox jumps over the lazy dog near the river bank at dawn.\" Repeat that passage back to me word for word.", false));
// Baseline (spec off), wall-clock tokens per second.
globalThis.__spec = false; engine.reset(); let t0 = performance.now(); const rb = await engine.generate(sIds.slice(), { maxNew: 192, repPenalty: rep }); const bTok = rb.outIds.length / ((performance.now() - t0) / 1000);
let sTok = 0, perVerify = 0, accept = 0, same = true, hasSpec = !!engine.specAvailable;
if (hasSpec) {
// Same ids with spec on; the counters are read from this run's stats.
globalThis.__spec = true; engine.reset(); t0 = performance.now(); const rs = await engine.generate(sIds.slice(), { maxNew: 192, repPenalty: rep }); sTok = rs.outIds.length / ((performance.now() - t0) / 1000); globalThis.__spec = false;
const sp = (rs.stats && rs.stats.spec) || null;
perVerify = sp && sp.windows ? 1 + sp.accepted / sp.windows : 0; accept = sp && sp.drafted ? 100 * sp.accepted / sp.drafted : 0;
same = rb.outIds.length === rs.outIds.length && rb.outIds.every((x, i) => x === rs.outIds[i]);
}
// "Flip" = spec-decode is more than 10% faster than the baseline.
const flip = hasSpec && sTok > bTok * 1.1, spRatio = bTok ? sTok / bTok : 0;
const row = (k, v) => `<tr style="border-top:1px solid var(--line)"><td style="padding:7px 8px;color:var(--dim)">${k}</td><td style="padding:7px 8px;font-weight:600">${v}</td></tr>`;
log.innerHTML = `<div style="font-family:ui-monospace,monospace;font-size:13px;max-width:820px;margin:0 auto;padding:8px">
<div style="font-size:18px;font-weight:700;margin-bottom:4px">Discrete-GPU validation</div>
<div style="color:var(--dim);margin-bottom:12px">adapter: ${adapterStr} · BitNet-2B</div>
<table style="width:100%;border-collapse:collapse">
${row("GPU in use", `<span style="color:${discrete ? "#48c26c" : "#e0a94a"}">${discrete ? "DISCRETE ✓" : "integrated (bandwidth "+gbps.toFixed(0)+" GB/s — not a discrete GPU)"}</span>`)}
${row("VRAM bandwidth", `${gbps.toFixed(0)} GB/s`)}
${row("Decode roofline", `${roofTok.toFixed(0)} tok/s`)}
${row("Live decode (warmed)", `${live.toFixed(0)} tok/s · ${roofPct} of roofline`)}
${row("Spec-decode (retrieval)", hasSpec ? `${sTok.toFixed(0)} vs ${bTok.toFixed(0)} baseline · <b style="color:${flip ? "#48c26c" : "#e0a94a"}">${spRatio.toFixed(2)}×</b> · ${perVerify.toFixed(2)} tok/verify · ${accept.toFixed(0)}% accept · ${same ? "byte-exact ✓" : "DIVERGED ✗"}` : "unavailable")}
</table>
<div style="margin-top:14px;font-size:15px;font-weight:600;color:${flip ? "#48c26c" : "#e0a94a"}">${!discrete
? "⚠ This is still an integrated GPU (bandwidth ≤350 GB/s). Open on a machine with a discrete GPU to validate the >1000 path — the browser may be picking the iGPU for power saving."
: flip
? `✓ CONFIRMED: on discrete silicon spec-decode FLIPPED to a ${spRatio.toFixed(2)}× win (byte-exact), and the roofline is ${roofTok.toFixed(0)} tok/s. The code we built delivers the high-throughput path unchanged on this hardware.`
: `Spec-decode is ${spRatio.toFixed(2)}× here — not yet a clear win. Bandwidth ${gbps.toFixed(0)} GB/s (roofline ${roofTok.toFixed(0)}); the forward may still be compute-bound on this GPU. Send me the numbers.`}</div>
</div>`;
st.textContent = "discrete validation · done"; console.log("[discrete]", { adapterStr, gbps, roofTok, live, bTok, sTok, perVerify, accept, same });
}
// ── PER-PASS GPU TRACE (?bench=trace) ── where does a token's ~40ms go? Runs a profiled forward through the
// step() path (window.__profile → timestamp-query'd ns per pass) and dumps the breakdown sorted by cost, so the
// 83% non-weight overhead is named exactly (attention / argmax / lm_head / norms / dispatch count) — no guessing.
// Per-pass GPU trace. Warms the raw GPU handle with a 40-token run, repeats a 24-token run with
// window.__profile set, then renders window.__profileData.passes as a table sorted by GPU time.
// The composer stays disabled afterwards.
async function runTraceBench() {
log.innerHTML = ""; input.disabled = send.disabled = true;
const rep = m.rep ?? 1.3, gpu = engine._gpu;
if (!gpu || !gpu.generate) { st.textContent = "trace unavailable (no raw handle)"; bubble("a", "engine._gpu.generate missing"); return; }
const ids = engine.tokenize(engine.frameTurn("Write a detailed paragraph about how mountains form over geological time.", false));
globalThis.__spec = false;
st.textContent = "warming up (boosting clock)…";
gpu.reset(); await gpu.generate(ids.slice(), 40, rep); // warm → boost clock; __profileData will hold the LAST token's passes
st.textContent = "tracing a token…";
// The flag is raised only around the traced run.
// NOTE(review): if gpu.generate() throws here, window.__profile is left at 1.
window.__profile = 1; gpu.reset(); await gpu.generate(ids.slice(), 24, rep); window.__profile = 0;
const pd = window.__profileData;
if (!pd || !pd.passes) { st.textContent = "no profile data (timestamp-query unsupported?)"; bubble("a", "window.__profileData empty — this GPU/browser may lack the timestamp-query feature."); return; }
// One row per pass tag, most expensive first.
const items = Object.entries(pd.passes).map(([tag, v]) => ({ tag, ms: v.ms, n: v.n })).sort((a, b) => b.ms - a.ms);
// Total: the reported pass sum when present (and non-zero), otherwise the sum of the rows.
const tot = pd.passSumMs || items.reduce((s, x) => s + x.ms, 0);
// Bar width in px, scaled so the most expensive pass is 280px wide.
const bar = (x) => Math.round(280 * x / (items[0].ms || 1));
const tbl = `<div style="font-family:ui-monospace,monospace;font-size:13px;max-width:820px;margin:0 auto;padding:8px">
<div style="font-size:18px;font-weight:700;margin-bottom:4px">Per-pass GPU trace — one token</div>
<div style="color:var(--dim);margin-bottom:12px">BitNet-2B · warmed · ${pd.nPasses} dispatches · GPU pass-sum ${tot.toFixed(1)} ms · wall-span ${(pd.gpuSpanMs||0).toFixed(1)} ms</div>
<table style="width:100%;border-collapse:collapse">
<tr style="color:var(--dim);text-align:left"><th style="padding:5px 8px">pass</th><th style="padding:5px 8px;text-align:right">GPU ms</th><th style="padding:5px 8px;text-align:right">% tok</th><th style="padding:5px 8px;text-align:right">count</th><th></th></tr>
${items.map(x => `<tr style="border-top:1px solid var(--line)"><td style="padding:5px 8px">${x.tag}</td><td style="padding:5px 8px;text-align:right">${x.ms.toFixed(2)}</td><td style="padding:5px 8px;text-align:right">${(100*x.ms/tot).toFixed(0)}%</td><td style="padding:5px 8px;text-align:right">${x.n}</td><td style="padding:5px 8px"><span style="display:inline-block;height:9px;border-radius:2px;background:var(--q);width:${bar(x.ms)}px"></span></td></tr>`).join("")}
</table>
<div style="margin-top:12px;color:var(--dim)">The top rows are the lever. Weight-matmul passes that dominate ⇒ we're near bandwidth (little to win). Attention / argmax / lm_head / norms dominating ⇒ that's the non-weight overhead to cut (fuse passes, cheaper argmax, fewer dispatches).</div>
</div>`;
log.innerHTML = tbl; st.textContent = "per-pass trace · done"; console.log("[trace]", pd);
}
// ── Boot sequence ── load the model, build the engine, then hand over to a ?bench harness, to the message the
// user queued while loading, or to the static greeting.
// Set when start-up fails, so the verification ticker below stops as well. It used to stop only on `armed`,
// which meant that after a failed load it kept running and repainted the status line with
// "verifying on your GPU…" on every frame, hiding the error.
let bootFailed = false;
try {
  if (!navigator.gpu) throw new Error("This browser has no WebGPU — open in Chrome, Edge, or a recent mobile browser.");
  st.textContent = `loading ${m.name} (${m.size})…`;
  // The load animation IS the proof: each weight block re-derived on YOUR GPU (Law L5), a live honest GB/s.
  // Takes over the status only once blocks start verifying; before that, the loader's own messages show.
  (function vTick() {
    if (armed || bootFailed) return;
    const v = globalThis.__vs;
    if (v && v.n) {
      const gb = v.bytes/1073741824, gbps = v.ms > 0 ? gb/(v.ms/1000) : 0;
      st.textContent = `🛡 verifying on your GPU · ${v.n} blocks · ${gb.toFixed(2)} GB${gbps ? " · " + gbps.toFixed(1) + " GB/s" : ""}`;
    }
    requestAnimationFrame(vTick);
  })();
  // Loader messages are shown only until the first verified block is reported.
  const verifying = () => !!(globalThis.__vs && globalThis.__vs.n);
  const loaded = await loadModel(m, {
    onStatus: (s) => { if (s && !verifying()) st.textContent = `${m.name}: ${s}`; },
    onProgress: (d, t, w) => { if (!verifying()) st.textContent = `${m.name}: ${w} ${t ? Math.round(100 * d / t) : 0}%`; },
  });
  if (!loaded || !loaded.gpu) throw new Error("model load failed");
  engine = await createEngine(m, loaded);
  armed = true;
  const readyLabel = `${m.name} · ${m.size} · in your browser · ready`;
  st.textContent = readyLabel;
  input.placeholder = "Message Q…";
  const bench = params.get("bench");
  if (bench === "spec") { globalThis.__spec = false; await runSpecBench(); }
  else if (bench === "perf") { await runPerfBench(); }
  else if (bench === "trace") { await runTraceBench(); }
  else if (bench === "discrete") { globalThis.__spec = false; await runDiscreteBench(); }
  else if (pending) {
    // A message was queued during load: warm up, drop the "starting" placeholder, answer it.
    st.textContent = "warming up…";
    await warmUp();
    st.textContent = readyLabel;
    const w = [...log.querySelectorAll(".a")].reverse().find((x) => x.dataset.pending);
    if (w) w.remove();
    const p = pending;
    pending = null;
    generate(p, true); // the user bubble is already on screen
  } else {
    st.textContent = "warming up…";
    await warmUp();
    st.textContent = readyLabel;
    await proactiveGreeting();
  }
} catch (e) {
  bootFailed = true;
  st.textContent = "⚠ " + e.message;
  bubble("a", "Could not start: " + e.message);
}
</script></body></html>