Spaces:

abhijitramesh
/

webgpu-bench

Running

App Files Files Community

GitHub Actions committed 26 days ago

Commit

44a16ab

1 Parent(s): b861182

sync from abhijitramesh/webgpu-bench@43e1f069db

Browse files

Files changed (4) hide show

css/style.css +0 -6
js/run/bench-worker.js +40 -14
js/run/controller.js +27 -16
js/run/core.js +35 -13

css/style.css CHANGED Viewed

@@ -1918,12 +1918,6 @@ a:hover { color: var(--info); }
   font-size: 12px;
   font-family: var(--font-mono);
 }
-.run-family-warning {
-  margin-left: auto;
-  color: var(--accent-amber);
-  font-size: 11px;
-  font-weight: 600;
-}
 .run-variant-list {
   display: flex;

   font-size: 12px;
   font-family: var(--font-mono);
 }
 .run-variant-list {
   display: flex;

js/run/bench-worker.js CHANGED Viewed

@@ -35,7 +35,22 @@
 const post = (msg) => self.postMessage(msg);
 const log = (line) => post({ type: 'log', line });
-const status = (s, msg) => post({ type: 'status', status: s, msg });
 // ─── OPFS-backed model loading (wllama-style) ───
 // For >2GB GGUFs we can't put the whole file on the WASM heap (TypedArray
@@ -283,7 +298,7 @@ async function runOne({ params, stream, buffer, opfsPath }) {
   const Module = await self.createBenchModule({
     locateFile: (filename) => `/build/${buildType}/${filename}`,
     print: (text) => log(`[wasm] ${text}`),
-    printErr: (text) => log(`[wasm:err] ${text}`),
     onAbort: (reason) => {
       const msg = `WASM aborted: ${reason}`;
       result.error = msg;
@@ -405,7 +420,7 @@ async function runOne({ params, stream, buffer, opfsPath }) {
   // useful pp/tg numbers via synthetic-token paths.
   if (consistencyPrompt) {
     try {
-      status('consistency', 'Running consistency check...');
       log(`bench_run("...", ${consistencyNPredict}) — consistency phase`);
       const raw = await Module.ccall(
         'bench_run', 'string',
@@ -427,11 +442,18 @@ async function runOne({ params, stream, buffer, opfsPath }) {
         );
         const ev = parseBenchResult('bench_eval_tokens', evalRaw);
         result.consistency = { ...result.consistency, ...ev };
-        log(
-          `Consistency: ${(ev.agreement_rate * 100).toFixed(1)}% top-1 agreement (` +
-          `${ev.n_agree}/${ev.n_tokens})` +
-          (ev.first_disagreement >= 0 ? ` — first diverge @ ${ev.first_disagreement}` : '')
-        );
       }
     } catch (err) {
       log(`Consistency phase failed: ${err.message} — continuing to perf phase`);
@@ -450,14 +472,14 @@ async function runOne({ params, stream, buffer, opfsPath }) {
     if (wantPp) {
       try {
         if (!noWarmup) {
-          status('perf', `warmup pp${nPrompt}`);
           log(`bench_pp(${nPrompt}) — warmup`);
           const raw = await Module.ccall('bench_pp', 'string', ['number'], [nPrompt], { async: true });
           parseBenchResult('bench_pp warmup', raw);
         }
         const samples_ns = [];
         for (let i = 0; i < nReps; i++) {
-          status('perf', `pp${nPrompt} ${i + 1}/${nReps}`);
           const t0 = performance.now();
           const raw = await Module.ccall('bench_pp', 'string', ['number'], [nPrompt], { async: true });
           const t_ns = (performance.now() - t0) * 1e6;
@@ -474,14 +496,18 @@ async function runOne({ params, stream, buffer, opfsPath }) {
     if (wantTg) {
       try {
         if (!noWarmup) {
-          status('perf', `warmup tg`);
-          log('bench_tg(1) — warmup');
-          const raw = await Module.ccall('bench_tg', 'string', ['number'], [1], { async: true });
           parseBenchResult('bench_tg warmup', raw);
         }
         const samples_ns = [];
         for (let i = 0; i < nReps; i++) {
-          status('perf', `tg${nGen} ${i + 1}/${nReps}`);
           const t0 = performance.now();
           const raw = await Module.ccall('bench_tg', 'string', ['number'], [nGen], { async: true });
           const t_ns = (performance.now() - t0) * 1e6;

 const post = (msg) => self.postMessage(msg);
 const log = (line) => post({ type: 'log', line });
+// sinceMs: optional epoch ms. Forwarded to controller so the row ticks an
+// elapsed counter while a long-running ccall (warmup, big-model rep) is in
+// flight — JSPI doesn't yield often enough on CPU paths to drive ticks here.
+const status = (s, msg, sinceMs) => post({ type: 'status', status: s, msg, sinceMs });
+// Below this many compared tokens, the consistency agreement rate is
+// statistical noise (e.g. early-EOS models that produce 1 token always
+// report 100%). Mirror of CONSISTENCY_MIN_TOKENS in core.js.
+const CONSISTENCY_MIN_TOKENS = 8;
+// llama.cpp/ggml emit info, warnings, AND errors all to stderr. Tag only the
+// actually-bad lines as :err so real failures stand out. Mirror in core.js.
+function classifyWasmStderr(text) {
+  return /\b(error|abort(ed)?|failed|fatal|panic|assert)\b|GGML_ASSERT/i.test(text)
+    ? '[wasm:err]' : '[wasm]';
+}
 // ─── OPFS-backed model loading (wllama-style) ───
 // For >2GB GGUFs we can't put the whole file on the WASM heap (TypedArray
   const Module = await self.createBenchModule({
     locateFile: (filename) => `/build/${buildType}/${filename}`,
     print: (text) => log(`[wasm] ${text}`),
+    printErr: (text) => log(`${classifyWasmStderr(text)} ${text}`),
     onAbort: (reason) => {
       const msg = `WASM aborted: ${reason}`;
       result.error = msg;
   // useful pp/tg numbers via synthetic-token paths.
   if (consistencyPrompt) {
     try {
+      status('consistency', 'Running consistency check...', Date.now());
       log(`bench_run("...", ${consistencyNPredict}) — consistency phase`);
       const raw = await Module.ccall(
         'bench_run', 'string',
         );
         const ev = parseBenchResult('bench_eval_tokens', evalRaw);
         result.consistency = { ...result.consistency, ...ev };
+        if (ev.n_tokens < CONSISTENCY_MIN_TOKENS) {
+          log(
+            `Consistency: insufficient samples (${ev.n_tokens} token` +
+            `${ev.n_tokens === 1 ? '' : 's'} before EOS) — agreement rate not meaningful`
+          );
+        } else {
+          log(
+            `Consistency: ${(ev.agreement_rate * 100).toFixed(1)}% top-1 agreement (` +
+            `${ev.n_agree}/${ev.n_tokens})` +
+            (ev.first_disagreement >= 0 ? ` — first diverge @ ${ev.first_disagreement}` : '')
+          );
+        }
       }
     } catch (err) {
       log(`Consistency phase failed: ${err.message} — continuing to perf phase`);
     if (wantPp) {
       try {
         if (!noWarmup) {
+          status('perf', `warmup pp${nPrompt}`, Date.now());
           log(`bench_pp(${nPrompt}) — warmup`);
           const raw = await Module.ccall('bench_pp', 'string', ['number'], [nPrompt], { async: true });
           parseBenchResult('bench_pp warmup', raw);
         }
         const samples_ns = [];
         for (let i = 0; i < nReps; i++) {
+          status('perf', `pp${nPrompt} ${i + 1}/${nReps}`, Date.now());
           const t0 = performance.now();
           const raw = await Module.ccall('bench_pp', 'string', ['number'], [nPrompt], { async: true });
           const t_ns = (performance.now() - t0) * 1e6;
     if (wantTg) {
       try {
         if (!noWarmup) {
+          // Run the full nGen-token decode loop as warmup (was bench_tg(1)).
+          // A 1-token warmup exercises the decode kernel once, which leaves
+          // the first timed rep absorbing pipeline-cache / shader-specialize
+          // cost on every subsequent step.
+          status('perf', `warmup tg${nGen}`, Date.now());
+          log(`bench_tg(${nGen}) — warmup`);
+          const raw = await Module.ccall('bench_tg', 'string', ['number'], [nGen], { async: true });
           parseBenchResult('bench_tg warmup', raw);
         }
         const samples_ns = [];
         for (let i = 0; i < nReps; i++) {
+          status('perf', `tg${nGen} ${i + 1}/${nReps}`, Date.now());
           const t0 = performance.now();
           const raw = await Module.ccall('bench_tg', 'string', ['number'], [nGen], { async: true });
           const t_ns = (performance.now() - t0) * 1e6;

js/run/controller.js CHANGED Viewed

@@ -168,10 +168,10 @@ function isQuickVariant(v) {
 }
 function computeWarnings(modelName, quant) {
-  const w = [];
-  if (/^granite-4/i.test(modelName)) w.push('needs SSM_SCAN');
-  if (quant === 'Q1_0') w.push('needs Q1_0');
-  return w;
 }
 function cacheKey(v) { return `${v.repo}/${v.filename}`; }
@@ -432,13 +432,6 @@ function renderModels() {
     stats.textContent = `${variants.length} variants · ${fitsCount} fit · ${quickFitCount} quick`;
     header.append(toggleBtn, selectAll, nameLabel, paramChip, stats);
-    if (/^granite-4/i.test(family)) {
-      const w = document.createElement('span');
-      w.className = 'run-family-warning';
-      w.textContent = '⚠ needs SSM_SCAN in llama.cpp';
-      header.appendChild(w);
-    }
     familyEl.appendChild(header);
     const list = document.createElement('div');
@@ -856,12 +849,29 @@ function progressRowFor(v) {
     `;
     tbody.appendChild(tr);
   }
   return {
-    setStatus(status, msg) {
       tr.className = `run-row-${rowClassFor(status)}`;
-      tr.querySelector('.status').textContent = msg ? `${status} — ${msg}` : status;
     },
     setProgress(fraction, downloaded, total) {
       const pct = (fraction * 100).toFixed(1);
       const detail = total > 0
         ? `${pct}% (${formatSize(downloaded / (1024 * 1024))} / ${formatSize(total / (1024 * 1024))})`
@@ -869,6 +879,7 @@ function progressRowFor(v) {
       tr.querySelector('.status').textContent = detail ? `downloading ${detail}` : 'downloading';
     },
     fillFromRecord(record) {
       tr.className = `run-row-${record.status === 'done' ? 'ok' : 'error'}`;
       tr.querySelector('.status').textContent = record.status;
       // Format llama-bench style: "avg \u00b1 stddev" with the test name as
@@ -1219,7 +1230,7 @@ function runInWorker({
     worker.onmessage = (e) => {
       const msg = e.data || {};
-      if (msg.type === 'status') onStatus?.(msg.status, msg.msg);
       else if (msg.type === 'progress') onProgress?.(msg.fraction, msg.downloaded, msg.total);
       else if (msg.type === 'log') onLog?.(msg.line);
       else if (msg.type === 'result') finish(msg.record);
@@ -1454,7 +1465,7 @@ async function runVariantWithIterations(v, row) {
         nCtx: DEFAULT_N_CTX,
         nGpuLayers: 0,
       }, {
-        onStatus: (status, msg) => row.setStatus(`cpu/${status}`, msg),
         onProgress: (fr, downloaded, total) => row.setProgress(fr, downloaded, total),
         onLog: logLine,
       });
@@ -1499,7 +1510,7 @@ async function runVariantWithIterations(v, row) {
       nCtx: DEFAULT_N_CTX,
       nGpuLayers: DEFAULT_N_GPU_LAYERS,
     }, {
-      onStatus: (s, m) => row.setStatus(`gpu/${s}`, m),
       onProgress: (fr, d, t) => row.setProgress(fr, d, t),
       onLog: logLine,
     });

 }
 function computeWarnings(modelName, quant) {
+  // SSM_SCAN and Q1_0 are both supported in the bundled llama.cpp
+  // (ggml-webgpu.cpp). granite-4 ran cleanly in the apr-30 run; Q1_0 is
+  // wired into the fast-path dequant table. No warnings to surface today.
+  return [];
 }
 function cacheKey(v) { return `${v.repo}/${v.filename}`; }
     stats.textContent = `${variants.length} variants · ${fitsCount} fit · ${quickFitCount} quick`;
     header.append(toggleBtn, selectAll, nameLabel, paramChip, stats);
     familyEl.appendChild(header);
     const list = document.createElement('div');
     `;
     tbody.appendChild(tr);
   }
+  let tickInterval = null;
+  const stopTicker = () => {
+    if (tickInterval !== null) { clearInterval(tickInterval); tickInterval = null; }
+  };
   return {
+    // sinceMs: optional epoch ms. When set, the cell ticks once a second so
+    // long-running phases (CPU pp512 warmup, big-model rep calls) show
+    // wall-clock progress instead of looking hung. Cleared on next setStatus.
+    setStatus(status, msg, sinceMs) {
+      stopTicker();
       tr.className = `run-row-${rowClassFor(status)}`;
+      const cell = tr.querySelector('.status');
+      const render = () => {
+        const base = msg ? `${status} — ${msg}` : status;
+        cell.textContent = sinceMs
+          ? `${base} (${Math.floor((Date.now() - sinceMs) / 1000)}s)`
+          : base;
+      };
+      render();
+      if (sinceMs) tickInterval = setInterval(render, 1000);
     },
     setProgress(fraction, downloaded, total) {
+      stopTicker();
       const pct = (fraction * 100).toFixed(1);
       const detail = total > 0
         ? `${pct}% (${formatSize(downloaded / (1024 * 1024))} / ${formatSize(total / (1024 * 1024))})`
       tr.querySelector('.status').textContent = detail ? `downloading ${detail}` : 'downloading';
     },
     fillFromRecord(record) {
+      stopTicker();
       tr.className = `run-row-${record.status === 'done' ? 'ok' : 'error'}`;
       tr.querySelector('.status').textContent = record.status;
       // Format llama-bench style: "avg \u00b1 stddev" with the test name as
     worker.onmessage = (e) => {
       const msg = e.data || {};
+      if (msg.type === 'status') onStatus?.(msg.status, msg.msg, msg.sinceMs);
       else if (msg.type === 'progress') onProgress?.(msg.fraction, msg.downloaded, msg.total);
       else if (msg.type === 'log') onLog?.(msg.line);
       else if (msg.type === 'result') finish(msg.record);
         nCtx: DEFAULT_N_CTX,
         nGpuLayers: 0,
       }, {
+        onStatus: (status, msg, sinceMs) => row.setStatus(`cpu/${status}`, msg, sinceMs),
         onProgress: (fr, downloaded, total) => row.setProgress(fr, downloaded, total),
         onLog: logLine,
       });
       nCtx: DEFAULT_N_CTX,
       nGpuLayers: DEFAULT_N_GPU_LAYERS,
     }, {
+      onStatus: (s, m, sinceMs) => row.setStatus(`gpu/${s}`, m, sinceMs),
       onProgress: (fr, d, t) => row.setProgress(fr, d, t),
       onLog: logLine,
     });

js/run/core.js CHANGED Viewed

@@ -10,6 +10,17 @@ const DEFAULT_N_PROMPT = 512;
 const DEFAULT_N_GEN = 128;
 const DEFAULT_N_REPS = 5;
 async function loadBenchScriptOnce(buildType) {
   if (typeof globalThis.createBenchModule === 'function') return;
   const script = document.createElement('script');
@@ -96,7 +107,7 @@ async function runBenchActions(Module, {
   // aborting the whole run.
   if (consistencyPrompt) {
     try {
-      onStatus?.('consistency', 'Running consistency check...');
       onLog?.(`bench_run("...", ${consistencyNPredict}) — consistency phase`);
       const raw = await Module.ccall(
         'bench_run', 'string',
@@ -118,11 +129,18 @@ async function runBenchActions(Module, {
         );
         const ev = parseBenchResult('bench_eval_tokens', evalRaw);
         out.consistency = { ...out.consistency, ...ev };
-        onLog?.(
-          `Consistency: ${(ev.agreement_rate * 100).toFixed(1)}% top-1 agreement (` +
-          `${ev.n_agree}/${ev.n_tokens})` +
-          (ev.first_disagreement >= 0 ? ` — first diverge @ ${ev.first_disagreement}` : '')
-        );
       }
     } catch (err) {
       onLog?.(`Consistency phase failed: ${err.message} — continuing to perf phase`);
@@ -142,14 +160,14 @@ async function runBenchActions(Module, {
     if (wantPp) {
       try {
         if (!noWarmup) {
-          onStatus?.('perf', `warmup pp${nPrompt}`);
           onLog?.(`bench_pp(${nPrompt}) — warmup`);
           const raw = await Module.ccall('bench_pp', 'string', ['number'], [nPrompt], { async: true });
           parseBenchResult('bench_pp warmup', raw);
         }
         const samples_ns = [];
         for (let i = 0; i < nReps; i++) {
-          onStatus?.('perf', `pp${nPrompt} ${i + 1}/${nReps}`);
           const t0 = performance.now();
           const raw = await Module.ccall('bench_pp', 'string', ['number'], [nPrompt], { async: true });
           const t_ns = (performance.now() - t0) * 1e6;
@@ -166,14 +184,18 @@ async function runBenchActions(Module, {
     if (wantTg) {
       try {
         if (!noWarmup) {
-          onStatus?.('perf', `warmup tg`);
-          onLog?.('bench_tg(1) — warmup');
-          const raw = await Module.ccall('bench_tg', 'string', ['number'], [1], { async: true });
           parseBenchResult('bench_tg warmup', raw);
         }
         const samples_ns = [];
         for (let i = 0; i < nReps; i++) {
-          onStatus?.('perf', `tg${nGen} ${i + 1}/${nReps}`);
           const t0 = performance.now();
           const raw = await Module.ccall('bench_tg', 'string', ['number'], [nGen], { async: true });
           const t_ns = (performance.now() - t0) * 1e6;
@@ -269,7 +291,7 @@ export async function runBenchmarkCore({
     Module = await globalThis.createBenchModule({
       print: (text) => onLog(`[wasm] ${text}`),
-      printErr: (text) => onLog(`[wasm:err] ${text}`),
       onAbort: (reason) => {
         const msg = `WASM aborted: ${reason}`;
         result.error = msg;

 const DEFAULT_N_GEN = 128;
 const DEFAULT_N_REPS = 5;
+// Below this many compared tokens, the agreement rate is statistical noise
+// (e.g. early-EOS models that produce 1 token will always report 100%).
+const CONSISTENCY_MIN_TOKENS = 8;
+// llama.cpp/ggml emit info, warnings, AND errors all to stderr. Tag only the
+// actually-bad lines as :err so real failures stand out from routine output.
+function classifyWasmStderr(text) {
+  return /\b(error|abort(ed)?|failed|fatal|panic|assert)\b|GGML_ASSERT/i.test(text)
+    ? '[wasm:err]' : '[wasm]';
+}
 async function loadBenchScriptOnce(buildType) {
   if (typeof globalThis.createBenchModule === 'function') return;
   const script = document.createElement('script');
   // aborting the whole run.
   if (consistencyPrompt) {
     try {
+      onStatus?.('consistency', 'Running consistency check...', Date.now());
       onLog?.(`bench_run("...", ${consistencyNPredict}) — consistency phase`);
       const raw = await Module.ccall(
         'bench_run', 'string',
         );
         const ev = parseBenchResult('bench_eval_tokens', evalRaw);
         out.consistency = { ...out.consistency, ...ev };
+        if (ev.n_tokens < CONSISTENCY_MIN_TOKENS) {
+          onLog?.(
+            `Consistency: insufficient samples (${ev.n_tokens} token` +
+            `${ev.n_tokens === 1 ? '' : 's'} before EOS) — agreement rate not meaningful`
+          );
+        } else {
+          onLog?.(
+            `Consistency: ${(ev.agreement_rate * 100).toFixed(1)}% top-1 agreement (` +
+            `${ev.n_agree}/${ev.n_tokens})` +
+            (ev.first_disagreement >= 0 ? ` — first diverge @ ${ev.first_disagreement}` : '')
+          );
+        }
       }
     } catch (err) {
       onLog?.(`Consistency phase failed: ${err.message} — continuing to perf phase`);
     if (wantPp) {
       try {
         if (!noWarmup) {
+          onStatus?.('perf', `warmup pp${nPrompt}`, Date.now());
           onLog?.(`bench_pp(${nPrompt}) — warmup`);
           const raw = await Module.ccall('bench_pp', 'string', ['number'], [nPrompt], { async: true });
           parseBenchResult('bench_pp warmup', raw);
         }
         const samples_ns = [];
         for (let i = 0; i < nReps; i++) {
+          onStatus?.('perf', `pp${nPrompt} ${i + 1}/${nReps}`, Date.now());
           const t0 = performance.now();
           const raw = await Module.ccall('bench_pp', 'string', ['number'], [nPrompt], { async: true });
           const t_ns = (performance.now() - t0) * 1e6;
     if (wantTg) {
       try {
         if (!noWarmup) {
+          // Run the full nGen-token decode loop as warmup (was bench_tg(1)).
+          // A 1-token warmup exercises the decode kernel once, which leaves
+          // the first timed rep absorbing pipeline-cache / shader-specialize
+          // cost on every subsequent step.
+          onStatus?.('perf', `warmup tg${nGen}`, Date.now());
+          onLog?.(`bench_tg(${nGen}) — warmup`);
+          const raw = await Module.ccall('bench_tg', 'string', ['number'], [nGen], { async: true });
           parseBenchResult('bench_tg warmup', raw);
         }
         const samples_ns = [];
         for (let i = 0; i < nReps; i++) {
+          onStatus?.('perf', `tg${nGen} ${i + 1}/${nReps}`, Date.now());
           const t0 = performance.now();
           const raw = await Module.ccall('bench_tg', 'string', ['number'], [nGen], { async: true });
           const t_ns = (performance.now() - t0) * 1e6;
     Module = await globalThis.createBenchModule({
       print: (text) => onLog(`[wasm] ${text}`),
+      printErr: (text) => onLog(`${classifyWasmStderr(text)} ${text}`),
       onAbort: (reason) => {
         const msg = `WASM aborted: ${reason}`;
         result.error = msg;