Spaces:

abhijitramesh
/

webgpu-bench

Running

App Files Files Community

GitHub Actions committed on Apr 29

Commit

e72601b

1 Parent(s): e6a49d5

sync from abhijitramesh/webgpu-bench@1be8b82935

Browse files

Files changed (7) hide show

harness.js +23 -8
js/dataset.js +14 -0
js/run/bench-worker.js +146 -98
js/run/controller.js +154 -118
js/run/core.js +201 -90
js/tables.js +26 -6
run.html +9 -1

harness.js CHANGED Viewed

@@ -21,13 +21,22 @@ window.addEventListener('unhandledrejection', (e) => {
 (async function () {
   const params = new URLSearchParams(window.location.search);
-  const modelFile   = params.get('model')        || '';
-  const hfRepo      = params.get('hfRepo')       || 'unsloth/Llama-3.2-1B-Instruct-GGUF';
-  const prompt      = params.get('prompt')       || 'Hello, how are you?';
-  const nPredict    = parseInt(params.get('nPredict')   || '128', 10);
-  const nCtx        = parseInt(params.get('nCtx')       || '2048', 10);
-  const nGpuLayers  = parseInt(params.get('nGpuLayers') || '999', 10);
-  const refTokenIds = params.get('refTokenIds') || null;
   const hasJspi = 'Suspending' in WebAssembly;
@@ -73,7 +82,13 @@ window.addEventListener('unhandledrejection', (e) => {
   const result = await runBenchmarkCore({
     source: localSource(),
-    modelFile, hfRepo, prompt, nPredict, nCtx, nGpuLayers, refTokenIds,
     onStatus, onProgress, onLog,
   });

 (async function () {
   const params = new URLSearchParams(window.location.search);
+  const modelFile           = params.get('model')         || '';
+  const hfRepo              = params.get('hfRepo')        || 'unsloth/Llama-3.2-1B-Instruct-GGUF';
+  const consistencyPrompt   = params.get('prompt')        || 'Hello, how are you?';
+  const consistencyNPredict = parseInt(params.get('nPredict')   || '128', 10);
+  const nPrompt             = parseInt(params.get('nPrompt')    || '512', 10);
+  const nGen                = parseInt(params.get('nGen')       || '128', 10);
+  const nReps               = parseInt(params.get('nReps')      || '5', 10);
+  const nCtx                = parseInt(params.get('nCtx')       || '2048', 10);
+  const nGpuLayers          = parseInt(params.get('nGpuLayers') || '999', 10);
+  const refTokenIds         = params.get('refTokenIds') || null;
+  // mode=perf → skip consistency entirely (e.g. for the GPU perf-only pass).
+  // mode=consistency → skip perf (e.g. CPU baseline pass that just needs token_ids).
+  // default 'both' runs both phases in one model load.
+  const mode                = params.get('mode') || 'both';
+  const runConsistency      = mode !== 'perf';
+  const runPerf             = mode !== 'consistency';
   const hasJspi = 'Suspending' in WebAssembly;
   const result = await runBenchmarkCore({
     source: localSource(),
+    modelFile, hfRepo,
+    consistencyPrompt, consistencyNPredict, refTokenIds,
+    runConsistency,
+    nPrompt: runPerf ? nPrompt : 0,
+    nGen:    runPerf ? nGen    : 0,
+    nReps,
+    nCtx, nGpuLayers,
     onStatus, onProgress, onLog,
   });

js/dataset.js CHANGED Viewed

@@ -120,6 +120,13 @@ async function fetchRunsBatch(datasetRepo, files) {
    produces. Keep field-for-field aligned with build-site.js so the merged
    results are indistinguishable from the baseline. */
 function flattenForDashboard(r, slug) {
   return {
     machineSlug: slug,
     timestamp: r.timestamp,
@@ -137,6 +144,13 @@ function flattenForDashboard(r, slug) {
     wallTimeMs: r.wallTimeMs,
     prefill_tok_s: r.metrics?.prefill_tok_s ?? null,
     decode_tok_s: r.metrics?.decode_tok_s ?? null,
     n_p_eval: r.metrics?.n_p_eval ?? null,
     t_p_eval_ms: r.metrics?.t_p_eval_ms ?? null,
     n_eval: r.metrics?.n_eval ?? null,

    produces. Keep field-for-field aligned with build-site.js so the merged
    results are indistinguishable from the baseline. */
 function flattenForDashboard(r, slug) {
+  // New-format records have metrics.tests = [{name:'pp512',...},{name:'tg128',...}].
+  // Old-format records have flat metrics.prefill_tok_s / decode_tok_s only.
+  // Surface both shapes so the table can render llama-bench-style "avg \u00b1 stddev"
+  // when stddev is available without breaking on older rows.
+  const tests = Array.isArray(r.metrics?.tests) ? r.metrics.tests : null;
+  const pp = tests?.find(t => t.name?.startsWith('pp')) || null;
+  const tg = tests?.find(t => t.name?.startsWith('tg')) || null;
   return {
     machineSlug: slug,
     timestamp: r.timestamp,
     wallTimeMs: r.wallTimeMs,
     prefill_tok_s: r.metrics?.prefill_tok_s ?? null,
     decode_tok_s: r.metrics?.decode_tok_s ?? null,
+    // llama-bench shape: per-test stddev + the test labels (pp{N} / tg{N})
+    prefill_stddev_ts: pp?.stddev_ts ?? r.metrics?.prefill_tok_s_stdev ?? null,
+    decode_stddev_ts:  tg?.stddev_ts ?? r.metrics?.decode_tok_s_stdev  ?? null,
+    pp_test_name: pp?.name ?? null,
+    tg_test_name: tg?.name ?? null,
+    pp_n_prompt: pp?.n_prompt ?? r.nPrompt ?? null,
+    tg_n_gen:    tg?.n_gen    ?? r.nGen    ?? null,
     n_p_eval: r.metrics?.n_p_eval ?? null,
     t_p_eval_ms: r.metrics?.t_p_eval_ms ?? null,
     n_eval: r.metrics?.n_eval ?? null,

js/run/bench-worker.js CHANGED Viewed

@@ -6,12 +6,15 @@
 //
 //   main → worker: {
 //     type: 'run',
-//     params: { buildType, prompt, nPredict, nCtx, nGpuLayers, refTokenIds,
-//               contentLength },
-//     // Exactly one of these — depends on whether the runtime supports
-//     // transferable ReadableStreams (most desktops do; iOS Safari and some
-//     // mobile Chrome configs don't, in which case the main thread drains
-//     // the stream into an ArrayBuffer and transfers the buffer instead):
 //     stream?: ReadableStream<Uint8Array>,  // TRANSFERRED
 //     buffer?: ArrayBuffer                  // TRANSFERRED (mobile fallback)
 //   }
@@ -25,15 +28,57 @@
 // decode loops ignore signals, and termination is the only reliable way to
 // stop an in-flight WASM call.
 //
-// NOTE ON DUPLICATION: the orchestration below mirrors runBenchmarkCore()
-// in site/js/run/core.js. core.js stays the authoritative main-thread path
-// (used by harness.js + runner.js Playwright harness). When changing one,
-// change the other.
 const post = (msg) => self.postMessage(msg);
 const log = (line) => post({ type: 'log', line });
 const status = (s, msg) => post({ type: 'status', status: s, msg });
 self.onmessage = async (e) => {
   const { type } = e.data || {};
   if (type !== 'run') {
@@ -58,12 +103,18 @@ self.onmessage = async (e) => {
 async function runOne({ params, stream, buffer }) {
   const {
     buildType,
-    prompt,
-    nPredict,
     nCtx,
     nGpuLayers,
     refTokenIds,
-    contentLength,
   } = params;
   if (!stream && !buffer) {
     throw new Error('runOne: exactly one of `stream` or `buffer` must be provided');
@@ -85,9 +136,6 @@ async function runOne({ params, stream, buffer }) {
     try {
       const adapter = await self.navigator.gpu.requestAdapter();
       if (adapter) {
-        // GPUAdapterInfo is a host object — structured-clone can't serialize
-        // it across postMessage. Copy the fields we care about into a plain
-        // object before storing on result.
         const info = adapter.info;
         result.gpuAdapterInfo = info ? {
           vendor: info.vendor || '',
@@ -118,10 +166,6 @@ async function runOne({ params, stream, buffer }) {
   }
   const Module = await self.createBenchModule({
-    // In a worker loaded via importScripts(), Emscripten can't infer the
-    // script's directory and falls back to self.location (this worker's
-    // own URL), which makes it look for bench.wasm next to bench-worker.js.
-    // Pin the lookup to the build directory so it grabs the right file.
     locateFile: (filename) => `/build/${buildType}/${filename}`,
     print: (text) => log(`[wasm] ${text}`),
     printErr: (text) => log(`[wasm:err] ${text}`),
@@ -135,15 +179,6 @@ async function runOne({ params, stream, buffer }) {
   log('WASM module loaded');
   // ─── Stream the model into the WASM heap (HeapFS-style) ───
-  // Avoid the JS-side MEMFS staging buffer by allocating space inside the
-  // WASM heap with _malloc and writing chunks directly via HEAPU8.set. Then
-  // register the file with MEMFS using a Uint8Array view backed by the heap
-  // region, so llama.cpp's mmap can take the zero-copy branch in MEMFS.mmap
-  // (which fires when contents.buffer === HEAP8.buffer).
-  //
-  // Heap growth during bench_init/bench_load detaches old views, so we
-  // override node.contents with a getter that always rebuilds the view
-  // from the saved pointer + length against the current Module.HEAPU8.
   if (!(contentLength > 0)) {
     throw new Error('content-length is required for streaming into WASM heap');
   }
@@ -168,10 +203,6 @@ async function runOne({ params, stream, buffer }) {
         post({ type: 'progress', fraction: downloaded / contentLength, downloaded, total: contentLength });
       }
     } else {
-      // Buffered path (mobile fallback): the whole file is already in
-      // memory. Copy it into the WASM heap in one shot. Progress was
-      // emitted on the main thread while buffering, so we just report 100%
-      // here for the loading phase.
       const view = new Uint8Array(buffer);
       if (view.byteLength !== contentLength) {
         log(`warning: buffer size ${view.byteLength} != content-length ${contentLength}`);
@@ -182,21 +213,15 @@ async function runOne({ params, stream, buffer }) {
     }
     log(`Model written to WASM heap @ 0x${modelPtr.toString(16)} (${(downloaded / (1024 * 1024)).toFixed(1)} MB)`);
-    // Register as a MEMFS file with a heap-backed view. canOwn=true so MEMFS
-    // doesn't make its own copy.
     const view = new Uint8Array(Module.HEAPU8.buffer, modelPtr, contentLength);
     Module.FS.createDataFile('/', 'model.gguf', view, true, false, true);
-    // Replace contents with a getter — heap growth (e.g. when llama.cpp
-    // allocates KV cache) replaces Module.HEAPU8.buffer, which would
-    // detach our static view. The getter rebuilds against the live buffer.
     const node = Module.FS.lookupPath('/model.gguf').node;
     Object.defineProperty(node, 'contents', {
       get: () => new Uint8Array(Module.HEAPU8.buffer, modelPtr, contentLength),
       set: () => { /* read-only file */ },
       configurable: true,
     });
-    // usedBytes is read by MEMFS for stat() — keep it accurate.
     node.usedBytes = contentLength;
   } catch (err) {
     Module._free(modelPtr);
@@ -222,86 +247,109 @@ async function runOne({ params, stream, buffer }) {
   if (loadResult !== 0) throw new Error(`bench_load failed: ${loadResult}`);
   log('Model loaded');
-  // Drop the MEMFS node — the bytes themselves stay alive in the WASM heap
-  // because llama.cpp's mmap captured a pointer into our _malloc'd region.
-  // We free that region after bench_exit.
   try {
     Module.FS.unlink('/model.gguf');
   } catch (err) {
     log(`Warning: could not remove model FS node: ${err.message}`);
   }
-  // ─── Inference ───
-  status('running', 'Running inference...');
-  const resultJson = await Module.ccall(
-    'bench_run',
-    'string',
-    ['string', 'number'],
-    [prompt, nPredict],
-    { async: true },
-  );
-  log(`bench_run returned: ${String(resultJson).substring(0, 200)}`);
-  const inferResult = JSON.parse(resultJson);
-  if (inferResult.error) throw new Error(`Inference error: ${inferResult.error}`);
-  const prefillTokS = inferResult.t_p_eval_ms > 0
-    ? (inferResult.n_p_eval / (inferResult.t_p_eval_ms / 1000)).toFixed(2)
-    : 'N/A';
-  const decodeTokS = inferResult.t_eval_ms > 0
-    ? (inferResult.n_eval / (inferResult.t_eval_ms / 1000)).toFixed(2)
-    : 'N/A';
-  result.metrics = {
-    ...inferResult,
-    prefill_tok_s: parseFloat(prefillTokS) || 0,
-    decode_tok_s: parseFloat(decodeTokS) || 0,
-  };
-  result.output = inferResult.output || '';
-  // ─── Consistency check ───
-  if (refTokenIds && nGpuLayers > 0 && inferResult.token_ids?.length > 0) {
-    log('Running forced-decoding consistency check...');
-    const evalJson = await Module.ccall(
-      'bench_eval_tokens',
-      'string',
-      ['string', 'string'],
-      [prompt, refTokenIds],
       { async: true },
     );
-    const evalResult = JSON.parse(evalJson);
-    if (evalResult.error) {
-      log(`Consistency check error: ${evalResult.error}`);
-    } else {
-      result.consistency = evalResult;
       log(
-        `Consistency: ${(evalResult.agreement_rate * 100).toFixed(1)}% top-1 agreement (` +
-        `${evalResult.n_agree}/${evalResult.n_tokens} tokens)`,
       );
-      if (evalResult.first_disagreement >= 0) {
-        log(`First disagreement at token position ${evalResult.first_disagreement}`);
       }
     }
   }
   await Module.ccall('bench_exit', null, [], [], { async: true });
-  // Free the heap-resident model bytes now that llama.cpp has unmapped.
   if (modelPtr) {
     Module._free(modelPtr);
     modelPtr = 0;
   }
   result.status = 'done';
-  status('done', `Done! Prefill: ${prefillTokS} tok/s | Decode: ${decodeTokS} tok/s`);
-  log(
-    `Prefill: ${prefillTokS} tok/s (${inferResult.n_p_eval} tokens in ` +
-    `${inferResult.t_p_eval_ms.toFixed(0)} ms)`,
-  );
-  log(
-    `Decode:  ${decodeTokS} tok/s (${inferResult.n_eval} tokens in ` +
-    `${inferResult.t_eval_ms.toFixed(0)} ms)`,
-  );
-  log(`Output: ${(inferResult.output || '').substring(0, 200)}`);
   return result;
 }

 //
 //   main → worker: {
 //     type: 'run',
+//     params: {
+//       buildType, contentLength,
+//       // model load
+//       nCtx, nGpuLayers,
+//       // consistency phase (set consistencyPrompt to '' to skip)
+//       consistencyPrompt, consistencyNPredict, refTokenIds,
+//       // perf phase
+//       nPrompt, nGen, nReps, noWarmup,
+//     },
 //     stream?: ReadableStream<Uint8Array>,  // TRANSFERRED
 //     buffer?: ArrayBuffer                  // TRANSFERRED (mobile fallback)
 //   }
 // decode loops ignore signals, and termination is the only reliable way to
 // stop an in-flight WASM call.
 //
+// NOTE ON DUPLICATION: the orchestration below mirrors runBenchmarkCore() +
+// runBenchActions() in site/js/run/core.js. core.js stays the authoritative
+// main-thread path (used by harness.js + runner.js Playwright harness). When
+// changing one, change the other.
 const post = (msg) => self.postMessage(msg);
 const log = (line) => post({ type: 'log', line });
 const status = (s, msg) => post({ type: 'status', status: s, msg });
// Aggregate raw nanosecond samples into the llama-bench result shape.
// Mirrors core.js buildTest — keep them identical.
//
// @param {string} name        Test label, e.g. `pp512` or `tg128`.
// @param {number} n_prompt    Prompt tokens processed per repetition.
// @param {number} n_gen       Tokens generated per repetition.
// @param {number[]} samples_ns  Wall time of each repetition, in nanoseconds.
// @returns {object} `{ name, n_prompt, n_gen, avg_ns, stddev_ns, avg_ts,
//   stddev_ts, samples_ns, samples_ts }`. Nanosecond fields are rounded to
//   integers, tokens/second fields to two decimals. A missing or empty sample
//   list yields an all-zero record with empty sample arrays.
function buildTest(name, n_prompt, n_gen, samples_ns) {
  // Treat a missing sample list like an empty one instead of throwing on
  // `.length`; both produce the zeroed record below.
  const samples = samples_ns ?? [];
  const n = samples.length;
  if (n === 0) {
    return { name, n_prompt, n_gen, avg_ns: 0, stddev_ns: 0, avg_ts: 0, stddev_ts: 0, samples_ns: [], samples_ts: [] };
  }

  // Mean plus sample standard deviation (n - 1 denominator). With a single
  // value there is no spread to estimate, so stddev is reported as 0.
  const meanAndStddev = (values) => {
    const mean = values.reduce((acc, v) => acc + v, 0) / n;
    if (n < 2) return { mean, stddev: 0 };
    const sumSquares = values.reduce((acc, v) => acc + (v - mean) * (v - mean), 0);
    return { mean, stddev: Math.sqrt(sumSquares / (n - 1)) };
  };

  const round2 = (x) => Math.round(x * 100) / 100;

  // Throughput is computed per sample and then averaged (not derived from
  // the average time). A non-positive duration maps to 0 t/s rather than
  // dividing by zero.
  const n_tokens = n_prompt + n_gen;
  const samples_ts = samples.map((t) => (t > 0 ? (1e9 * n_tokens) / t : 0));

  const ns = meanAndStddev(samples);
  const ts = meanAndStddev(samples_ts);

  return {
    name,
    n_prompt,
    n_gen,
    avg_ns: Math.round(ns.mean),
    stddev_ns: Math.round(ns.stddev),
    avg_ts: round2(ts.mean),
    stddev_ts: round2(ts.stddev),
    samples_ns: samples.map((t) => Math.round(t)),
    samples_ts: samples_ts.map(round2),
  };
}
// Decode the JSON string returned by a bench_* WASM export.
//
// @param {string} label  Prefix for error messages (the export being called).
// @param {string} raw    JSON text handed back from C.
// @returns {object} The parsed result object.
// @throws {Error} When `raw` is not valid JSON (the SyntaxError is attached
//   as `cause`), when it parses to something other than an object, or when
//   the object carries a truthy `error` field.
function parseBenchResult(label, raw) {
  let parsed;
  try {
    parsed = JSON.parse(raw);
  } catch (err) {
    throw new Error(`${label}: invalid JSON from C (${err.message})`, { cause: err });
  }
  // `null` (and bare numbers/strings) are valid JSON but not a result record;
  // reading `.error` off null would otherwise surface as an unlabelled
  // TypeError.
  if (parsed === null || typeof parsed !== 'object') {
    throw new Error(
      `${label}: expected a JSON object from C, got ${String(raw).substring(0, 200)}`,
    );
  }
  if (parsed.error) throw new Error(`${label}: ${parsed.error}`);
  return parsed;
}
 self.onmessage = async (e) => {
   const { type } = e.data || {};
   if (type !== 'run') {
 async function runOne({ params, stream, buffer }) {
   const {
     buildType,
+    contentLength,
     nCtx,
     nGpuLayers,
+    // consistency
+    consistencyPrompt,
+    consistencyNPredict,
     refTokenIds,
+    // perf
+    nPrompt,
+    nGen,
+    nReps,
+    noWarmup,
   } = params;
   if (!stream && !buffer) {
     throw new Error('runOne: exactly one of `stream` or `buffer` must be provided');
     try {
       const adapter = await self.navigator.gpu.requestAdapter();
       if (adapter) {
         const info = adapter.info;
         result.gpuAdapterInfo = info ? {
           vendor: info.vendor || '',
   }
   const Module = await self.createBenchModule({
     locateFile: (filename) => `/build/${buildType}/${filename}`,
     print: (text) => log(`[wasm] ${text}`),
     printErr: (text) => log(`[wasm:err] ${text}`),
   log('WASM module loaded');
   // ─── Stream the model into the WASM heap (HeapFS-style) ───
   if (!(contentLength > 0)) {
     throw new Error('content-length is required for streaming into WASM heap');
   }
         post({ type: 'progress', fraction: downloaded / contentLength, downloaded, total: contentLength });
       }
     } else {
       const view = new Uint8Array(buffer);
       if (view.byteLength !== contentLength) {
         log(`warning: buffer size ${view.byteLength} != content-length ${contentLength}`);
     }
     log(`Model written to WASM heap @ 0x${modelPtr.toString(16)} (${(downloaded / (1024 * 1024)).toFixed(1)} MB)`);
     const view = new Uint8Array(Module.HEAPU8.buffer, modelPtr, contentLength);
     Module.FS.createDataFile('/', 'model.gguf', view, true, false, true);
     const node = Module.FS.lookupPath('/model.gguf').node;
     Object.defineProperty(node, 'contents', {
       get: () => new Uint8Array(Module.HEAPU8.buffer, modelPtr, contentLength),
       set: () => { /* read-only file */ },
       configurable: true,
     });
     node.usedBytes = contentLength;
   } catch (err) {
     Module._free(modelPtr);
   if (loadResult !== 0) throw new Error(`bench_load failed: ${loadResult}`);
   log('Model loaded');
   try {
     Module.FS.unlink('/model.gguf');
   } catch (err) {
     log(`Warning: could not remove model FS node: ${err.message}`);
   }
+  // ─── Consistency phase ───
+  if (consistencyPrompt) {
+    status('consistency', 'Running consistency check...');
+    log(`bench_run("...", ${consistencyNPredict}) — consistency phase`);
+    const raw = await Module.ccall(
+      'bench_run', 'string',
+      ['string', 'number'],
+      [consistencyPrompt, consistencyNPredict],
       { async: true },
     );
+    const r = parseBenchResult('bench_run', raw);
+    result.output = r.output || '';
+    result.consistency = { token_ids: r.token_ids || [] };
+    if (refTokenIds) {
+      log('bench_eval_tokens — forced-decode vs CPU baseline');
+      const evalRaw = await Module.ccall(
+        'bench_eval_tokens', 'string',
+        ['string', 'string'],
+        [consistencyPrompt, refTokenIds],
+        { async: true },
+      );
+      const ev = parseBenchResult('bench_eval_tokens', evalRaw);
+      result.consistency = { ...result.consistency, ...ev };
       log(
+        `Consistency: ${(ev.agreement_rate * 100).toFixed(1)}% top-1 agreement (` +
+        `${ev.n_agree}/${ev.n_tokens})` +
+        (ev.first_disagreement >= 0 ? ` — first diverge @ ${ev.first_disagreement}` : '')
       );
+    }
+  }
+  // ─── Perf phase (llama-bench style) ───
+  const wantPp = nPrompt > 0;
+  const wantTg = nGen > 0;
+  if (wantPp || wantTg) {
+    const tests = [];
+    if (wantPp) {
+      if (!noWarmup) {
+        status('perf', `warmup pp${nPrompt}`);
+        log(`bench_pp(${nPrompt}) — warmup`);
+        const raw = await Module.ccall('bench_pp', 'string', ['number'], [nPrompt], { async: true });
+        parseBenchResult('bench_pp warmup', raw);
+      }
+      const samples_ns = [];
+      for (let i = 0; i < nReps; i++) {
+        status('perf', `pp${nPrompt} ${i + 1}/${nReps}`);
+        const t0 = performance.now();
+        const raw = await Module.ccall('bench_pp', 'string', ['number'], [nPrompt], { async: true });
+        const t_ns = (performance.now() - t0) * 1e6;
+        parseBenchResult('bench_pp', raw);
+        samples_ns.push(t_ns);
+        log(`pp${nPrompt} run ${i + 1}/${nReps}: ${(t_ns / 1e6).toFixed(1)} ms (${(1e9 * nPrompt / t_ns).toFixed(1)} t/s)`);
       }
+      tests.push(buildTest(`pp${nPrompt}`, nPrompt, 0, samples_ns));
     }
+    if (wantTg) {
+      if (!noWarmup) {
+        status('perf', `warmup tg`);
+        log('bench_tg(1) — warmup');
+        const raw = await Module.ccall('bench_tg', 'string', ['number'], [1], { async: true });
+        parseBenchResult('bench_tg warmup', raw);
+      }
+      const samples_ns = [];
+      for (let i = 0; i < nReps; i++) {
+        status('perf', `tg${nGen} ${i + 1}/${nReps}`);
+        const t0 = performance.now();
+        const raw = await Module.ccall('bench_tg', 'string', ['number'], [nGen], { async: true });
+        const t_ns = (performance.now() - t0) * 1e6;
+        parseBenchResult('bench_tg', raw);
+        samples_ns.push(t_ns);
+        log(`tg${nGen} run ${i + 1}/${nReps}: ${(t_ns / 1e6).toFixed(1)} ms (${(1e9 * nGen / t_ns).toFixed(1)} t/s)`);
+      }
+      tests.push(buildTest(`tg${nGen}`, 0, nGen, samples_ns));
+    }
+    result.metrics = {
+      tests,
+      n_prompt: wantPp ? nPrompt : 0,
+      n_gen: wantTg ? nGen : 0,
+      n_reps: nReps,
+    };
   }
   await Module.ccall('bench_exit', null, [], [], { async: true });
   if (modelPtr) {
     Module._free(modelPtr);
     modelPtr = 0;
   }
   result.status = 'done';
+  const summary = result.metrics?.tests
+    ?.map(t => `${t.name}: ${t.avg_ts.toFixed(2)} ± ${t.stddev_ts.toFixed(2)} t/s`)
+    .join(' | ') || 'no perf';
+  status('done', `Done! ${summary}`);
   return result;
 }

js/run/controller.js CHANGED Viewed

@@ -22,7 +22,9 @@ const DEFAULT_N_PREDICT = 128;
 const DEFAULT_N_CTX = 2048;
 const DEFAULT_N_GPU_LAYERS = 999;
 const YIELD_BETWEEN_RUNS_MS = 500;
-const YIELD_BETWEEN_ITERATIONS_MS = 200;
 const DEFAULT_ITERATIONS = 5;
 const MIN_ITERATIONS_FOR_SUBMIT = 5;
@@ -39,6 +41,8 @@ const state = {
   results: [],         // result records from the current session
   hfSession: null,     // { accessToken, expiresAt, userName } when signed in
   iterations: DEFAULT_ITERATIONS,
   mounted: false,
   // Tracks variants the Run pipeline downloaded this session (as opposed to
   // the standalone Download button or pre-existing cache). Only these are
@@ -628,15 +632,34 @@ function wireBatchSelect() {
   });
 }
-function wireIterationsInput() {
-  const el = $('iterations-input');
-  if (!el) return;
-  el.value = String(state.iterations);
-  el.addEventListener('change', () => {
-    const n = Math.max(1, Math.min(50, parseInt(el.value, 10) || DEFAULT_ITERATIONS));
-    state.iterations = n;
-    el.value = String(n);
-  });
 }
 function submittableResults() {
@@ -767,8 +790,8 @@ function ensureProgressTable() {
           <th>Model</th>
           <th>Variant</th>
           <th>Status</th>
-          <th class="num">Prefill tok/s</th>
-          <th class="num">Decode tok/s</th>
           <th class="num">Wall s</th>
           <th>Error</th>
         </tr>
@@ -815,11 +838,21 @@ function progressRowFor(v) {
     fillFromRecord(record) {
       tr.className = `run-row-${record.status === 'done' ? 'ok' : 'error'}`;
       tr.querySelector('.status').textContent = record.status;
-      tr.querySelector('.prefill').textContent = record.metrics?.prefill_tok_s ?? '—';
-      tr.querySelector('.decode').textContent = record.metrics?.decode_tok_s ?? '—';
       tr.querySelector('.wall').textContent = record.wallTimeMs
         ? (record.wallTimeMs / 1000).toFixed(1)
-        : '—';
       tr.querySelector('.err').textContent = record.error || '';
     },
   };
@@ -1209,12 +1242,19 @@ async function runBenchmarkInWorker(v, params, callbacks) {
   const record = await runInWorker({
     params: {
       buildType: 'Suspending' in WebAssembly ? 'jspi' : 'asyncify',
-      prompt: params.prompt,
-      nPredict: params.nPredict,
       nCtx: params.nCtx,
       nGpuLayers: params.nGpuLayers,
-      refTokenIds: params.refTokenIds,
-      contentLength: fetched.contentLength,
     },
     stream: fetched.stream,
     onStatus: callbacks.onStatus,
@@ -1225,22 +1265,29 @@ async function runBenchmarkInWorker(v, params, callbacks) {
   return record;
 }
-// Runs one variant: CPU baseline (1x, for reference token IDs + consistency),
-// then N GPU iterations (consistency check on the first only to save time).
 // Returns an aggregate that makeRecord consumes.
 async function runVariantWithIterations(v, row) {
-  const iterations = Math.max(1, state.iterations || DEFAULT_ITERATIONS);
   // ─── CPU baseline ───
   row.setStatus('cpu-baseline', 'generating reference tokens');
   let cpuResult;
   try {
     cpuResult = await runBenchmarkInWorker(v, {
-      prompt: DEFAULT_PROMPT,
-      nPredict: DEFAULT_N_PREDICT,
       nCtx: DEFAULT_N_CTX,
       nGpuLayers: 0,
-      refTokenIds: null,
     }, {
       onStatus: (status, msg) => row.setStatus(`cpu/${status}`, msg),
       onProgress: (fr, downloaded, total) => row.setProgress(fr, downloaded, total),
@@ -1251,113 +1298,94 @@ async function runVariantWithIterations(v, row) {
   }
   // CPU baseline is "best effort": if it fails (typically OOM on a tight
-  // tab), keep going with GPU runs but skip the consistency check, since
-  // we'd have no reference token IDs to compare against. The user still
-  // gets prefill/decode metrics — just no agreement-rate number.
   const cpuOk = cpuResult.status === 'done';
   if (!cpuOk) {
     logLine(
-      `CPU baseline failed (${cpuResult.error || 'unknown'}) — proceeding with GPU runs, skipping consistency check.`
     );
     row.setStatus('cpu-skipped', 'continuing with GPU only');
   }
-  const refTokenIds = cpuOk ? (cpuResult.metrics?.token_ids || []).join(',') : '';
-  // ─── GPU iterations ───
-  const gpuSamples = [];
-  let consistency = null;
-  let gpuCore = null;
-  for (let i = 0; i < iterations; i++) {
-    if (state.aborted) break;
-    row.setStatus('gpu-run', `iteration ${i + 1}/${iterations}`);
-    let gpuResult;
-    try {
-      gpuResult = await runBenchmarkInWorker(v, {
-        prompt: DEFAULT_PROMPT,
-        nPredict: DEFAULT_N_PREDICT,
-        nCtx: DEFAULT_N_CTX,
-        nGpuLayers: DEFAULT_N_GPU_LAYERS,
-        refTokenIds: i === 0 ? (refTokenIds || null) : null,
-      }, {
-        onStatus: (s, m) => row.setStatus(`gpu${i + 1}/${s}`, m),
-        onProgress: (fr, d, t) => row.setProgress(fr, d, t),
-        onLog: logLine,
-      });
-    } catch (err) {
-      gpuResult = { status: 'error', error: err.message || String(err) };
-    }
-    if (gpuResult.status !== 'done') {
-      return {
-        status: 'error',
-        error: `GPU iteration ${i + 1} failed: ${gpuResult.error || 'unknown'}`,
-        iterations: gpuSamples.length,
-        cpu: cpuResult,
-        gpuSamples,
-        consistency,
-        gpuCore: gpuCore || gpuResult,
-      };
-    }
-    gpuSamples.push({
-      prefill_tok_s: gpuResult.metrics?.prefill_tok_s ?? 0,
-      decode_tok_s: gpuResult.metrics?.decode_tok_s ?? 0,
-      n_p_eval: gpuResult.metrics?.n_p_eval ?? 0,
-      n_eval: gpuResult.metrics?.n_eval ?? 0,
-      t_p_eval_ms: gpuResult.metrics?.t_p_eval_ms ?? 0,
-      t_eval_ms: gpuResult.metrics?.t_eval_ms ?? 0,
     });
-    if (i === 0) {
-      consistency = gpuResult.consistency || null;
-      gpuCore = gpuResult;
-    }
-    await sleep(YIELD_BETWEEN_ITERATIONS_MS);
   }
   return {
-    status: gpuSamples.length > 0 ? 'done' : 'error',
-    error: gpuSamples.length === 0 ? 'no GPU iterations completed' : null,
-    iterations: gpuSamples.length,
     cpu: cpuResult,
-    gpuSamples,
-    consistency,
-    gpuCore,
   };
 }
-function mean(arr, key) {
-  if (arr.length === 0) return 0;
-  return arr.reduce((a, x) => a + (x[key] || 0), 0) / arr.length;
-}
-function stdev(arr, key) {
-  if (arr.length < 2) return 0;
-  const m = mean(arr, key);
-  return Math.sqrt(arr.reduce((a, x) => a + ((x[key] || 0) - m) ** 2, 0) / arr.length);
-}
 // Rounds a number to two decimal places; anything that is not a finite
 // number (NaN, ±Infinity, non-numeric values) collapses to 0.
 function round2(n) {
   if (!Number.isFinite(n)) {
     return 0;
   }
   const fixed = n.toFixed(2);
   return Number.parseFloat(fixed);
 }
 function makeRecord(v, vr, machine, browser, wallTimeMs) {
-  const first = vr.gpuSamples[0] || {};
-  const metrics = vr.gpuSamples.length > 0 ? {
-    prefill_tok_s: round2(mean(vr.gpuSamples, 'prefill_tok_s')),
-    decode_tok_s: round2(mean(vr.gpuSamples, 'decode_tok_s')),
-    prefill_tok_s_stdev: round2(stdev(vr.gpuSamples, 'prefill_tok_s')),
-    decode_tok_s_stdev: round2(stdev(vr.gpuSamples, 'decode_tok_s')),
-    prefill_samples: vr.gpuSamples.map(s => round2(s.prefill_tok_s)),
-    decode_samples: vr.gpuSamples.map(s => round2(s.decode_tok_s)),
-    iterations: vr.iterations,
-    n_p_eval: first.n_p_eval,
-    n_eval: first.n_eval,
-    t_p_eval_ms: first.t_p_eval_ms,
-    t_eval_ms: first.t_eval_ms,
   } : null;
-  const cpuBaseline = vr.cpu?.status === 'done' && vr.cpu.metrics ? {
-    prefill_tok_s: vr.cpu.metrics.prefill_tok_s,
-    decode_tok_s: vr.cpu.metrics.decode_tok_s,
   } : null;
   return {
@@ -1371,21 +1399,24 @@ function makeRecord(v, vr, machine, browser, wallTimeMs) {
     browser,
     nCtx: DEFAULT_N_CTX,
     nPredict: DEFAULT_N_PREDICT,
     nGpuLayers: DEFAULT_N_GPU_LAYERS,
     timestamp: new Date().toISOString(),
     wallTimeMs,
-    webgpuAvailable: vr.gpuCore?.webgpuAvailable ?? !!navigator.gpu,
-    gpuAdapterInfo: vr.gpuCore?.gpuAdapterInfo ?? null,
-    buildType: vr.gpuCore?.buildType ?? null,
     // llama.cpp version stamped from build-info.json. Lets us correlate
     // result drift with llama.cpp upgrades over time.
     llamaCppCommit: state.buildInfo?.llamaCppCommit ?? null,
     llamaCppDescribe: state.buildInfo?.llamaCppDescribe ?? null,
     dawnTag: state.buildInfo?.dawnTag ?? null,
     metrics,
-    consistency: vr.consistency ?? null,
     cpu_baseline: cpuBaseline,
-    output: vr.gpuCore?.output || '',
     machine,
     source: `webgpu-bench/site (${state.surface})`,
   };
@@ -1501,11 +1532,16 @@ function generateMarkdown(results) {
   let body = '';
   if (passed.length) {
     body += `## Passed (${passed.length})\n\n`;
-    body += `| Model | Variant | Size | Prefill tok/s | Decode tok/s | Wall s |\n`;
     body += `|---|---|---:|---:|---:|---:|\n`;
     for (const r of passed) {
       body += `| ${r.model} | ${r.variant} | ${formatSize(r.sizeMB)} | ${
-        r.metrics?.prefill_tok_s ?? '—'} | ${r.metrics?.decode_tok_s ?? '—'} | ${
         (r.wallTimeMs / 1000).toFixed(1)} |\n`;
     }
     body += `\n`;
@@ -1707,7 +1743,7 @@ export async function mountRunSection() {
   wireFilters();
   wireFamilySearch();
   wireBatchSelect();
-  wireIterationsInput();
   wireRunHandlers();
   wireAbortHandler();
   wirePurgeHandler();

 const DEFAULT_N_CTX = 2048;
 const DEFAULT_N_GPU_LAYERS = 999;
 const YIELD_BETWEEN_RUNS_MS = 500;
+// llama-bench defaults: -p 512 -n 128 -r 5
+const DEFAULT_N_PROMPT = 512;
+const DEFAULT_N_GEN = 128;
 const DEFAULT_ITERATIONS = 5;
 const MIN_ITERATIONS_FOR_SUBMIT = 5;
   results: [],         // result records from the current session
   hfSession: null,     // { accessToken, expiresAt, userName } when signed in
   iterations: DEFAULT_ITERATIONS,
+  nPrompt: DEFAULT_N_PROMPT,
+  nGen: DEFAULT_N_GEN,
   mounted: false,
   // Tracks variants the Run pipeline downloaded this session (as opposed to
   // the standalone Download button or pre-existing cache). Only these are
   });
 }
/**
 * Hook up the three perf-tuning number inputs of the Run tab.
 *
 * Each input is seeded from `state`, and on every `change` event the typed
 * value is normalised, stored back on `state`, and echoed into the field so
 * the user sees the value that will actually be used:
 *   - `iterations-input` -> `state.iterations`, clamped to [1, 50]; empty,
 *     non-numeric or 0 input falls back to DEFAULT_ITERATIONS before clamping.
 *   - `n-prompt-input`   -> `state.nPrompt`, clamped to [0, 4096]; non-numeric
 *     input falls back to DEFAULT_N_PROMPT.
 *   - `n-gen-input`      -> `state.nGen`, clamped to [0, 4096]; non-numeric
 *     input falls back to DEFAULT_N_GEN.
 *
 * An input whose element cannot be found via `$` is silently skipped.
 */
function wirePerfInputs() {
  // Shared wiring: seed the field, then normalise + store + echo on change.
  const bind = (elementId, stateKey, normalise) => {
    const input = $(elementId);
    if (!input) return;
    input.value = String(state[stateKey]);
    input.addEventListener('change', () => {
      state[stateKey] = normalise(parseInt(input.value, 10));
      input.value = String(state[stateKey]);
    });
  };

  // Token counts: clamp first; a NaN parse survives Math.min/Math.max as NaN
  // and is then replaced by the fallback.
  const tokenCount = (fallback) => (parsed) => {
    const bounded = Math.max(0, Math.min(4096, parsed));
    return Number.isFinite(bounded) ? bounded : fallback;
  };

  // Reps: `||` deliberately treats both NaN and 0 as "use the default".
  bind('iterations-input', 'iterations',
    (parsed) => Math.max(1, Math.min(50, parsed || DEFAULT_ITERATIONS)));
  bind('n-prompt-input', 'nPrompt', tokenCount(DEFAULT_N_PROMPT));
  bind('n-gen-input', 'nGen', tokenCount(DEFAULT_N_GEN));
}
 function submittableResults() {
           <th>Model</th>
           <th>Variant</th>
           <th>Status</th>
+          <th class="num" title="Prompt processing throughput (avg \u00b1 stddev t/s)">pp tok/s</th>
+          <th class="num" title="Text generation throughput (avg \u00b1 stddev t/s)">tg tok/s</th>
           <th class="num">Wall s</th>
           <th>Error</th>
         </tr>
     fillFromRecord(record) {
       tr.className = `run-row-${record.status === 'done' ? 'ok' : 'error'}`;
       tr.querySelector('.status').textContent = record.status;
+      // Format llama-bench style: "avg \u00b1 stddev" with the test name as
+      // the cell tooltip so users see the exact pp/tg N that was measured.
+      const tests = record.metrics?.tests || [];
+      const pp = tests.find(t => t.name?.startsWith('pp'));
+      const tg = tests.find(t => t.name?.startsWith('tg'));
+      const fmt = (t) => t ? `${t.avg_ts.toFixed(2)} \u00b1 ${t.stddev_ts.toFixed(2)}` : '\u2014';
+      const ppCell = tr.querySelector('.prefill');
+      ppCell.textContent = fmt(pp);
+      if (pp) ppCell.title = pp.name;
+      const tgCell = tr.querySelector('.decode');
+      tgCell.textContent = fmt(tg);
+      if (tg) tgCell.title = tg.name;
       tr.querySelector('.wall').textContent = record.wallTimeMs
         ? (record.wallTimeMs / 1000).toFixed(1)
+        : '\u2014';
       tr.querySelector('.err').textContent = record.error || '';
     },
   };
   const record = await runInWorker({
     params: {
       buildType: 'Suspending' in WebAssembly ? 'jspi' : 'asyncify',
+      contentLength: fetched.contentLength,
+      // Model load
       nCtx: params.nCtx,
       nGpuLayers: params.nGpuLayers,
+      // Consistency phase — empty consistencyPrompt skips it
+      consistencyPrompt: params.consistencyPrompt || '',
+      consistencyNPredict: params.consistencyNPredict || DEFAULT_N_PREDICT,
+      refTokenIds: params.refTokenIds || null,
+      // Perf phase — set both to 0 to skip
+      nPrompt: params.nPrompt ?? 0,
+      nGen:    params.nGen    ?? 0,
+      nReps:   params.nReps   ?? DEFAULT_ITERATIONS,
+      noWarmup: !!params.noWarmup,
     },
     stream: fetched.stream,
     onStatus: callbacks.onStatus,
   return record;
 }
+// Runs one variant: CPU consistency baseline (one model load, generates
+// reference token IDs via bench_run), then GPU pass (one model load that
+// does both consistency forced-decoding and the llama-bench-style perf
+// sweep — pp + tg with warmup + nReps timed reps each).
 // Returns an aggregate that makeRecord consumes.
 async function runVariantWithIterations(v, row) {
+  const nReps = Math.max(1, state.iterations || DEFAULT_ITERATIONS);
+  const nPrompt = Math.max(0, state.nPrompt ?? DEFAULT_N_PROMPT);
+  const nGen = Math.max(0, state.nGen ?? DEFAULT_N_GEN);
   // ─── CPU baseline ───
+  // Pure consistency pass — capture token_ids; no perf metrics on CPU.
   row.setStatus('cpu-baseline', 'generating reference tokens');
   let cpuResult;
   try {
     cpuResult = await runBenchmarkInWorker(v, {
+      consistencyPrompt: DEFAULT_PROMPT,
+      consistencyNPredict: DEFAULT_N_PREDICT,
+      refTokenIds: null,
+      nPrompt: 0,
+      nGen: 0,
       nCtx: DEFAULT_N_CTX,
       nGpuLayers: 0,
     }, {
       onStatus: (status, msg) => row.setStatus(`cpu/${status}`, msg),
       onProgress: (fr, downloaded, total) => row.setProgress(fr, downloaded, total),
   }
   // CPU baseline is "best effort": if it fails (typically OOM on a tight
+  // tab), keep going with the GPU pass but skip consistency. Perf metrics
+  // are independent of consistency so they're still reported.
   const cpuOk = cpuResult.status === 'done';
   if (!cpuOk) {
     logLine(
+      `CPU baseline failed (${cpuResult.error || 'unknown'}) — proceeding with GPU run, skipping consistency check.`
     );
     row.setStatus('cpu-skipped', 'continuing with GPU only');
   }
+  const refTokenIds = cpuOk ? (cpuResult.consistency?.token_ids || []).join(',') : '';
+  if (state.aborted) {
+    return { status: 'error', error: 'aborted', cpu: cpuResult, gpu: null };
+  }
+  // ─── GPU pass: consistency + perf in one model load ───
+  row.setStatus('gpu-run', 'loading model');
+  let gpuResult;
+  try {
+    gpuResult = await runBenchmarkInWorker(v, {
+      consistencyPrompt: DEFAULT_PROMPT,
+      consistencyNPredict: DEFAULT_N_PREDICT,
+      refTokenIds: refTokenIds || null,
+      nPrompt,
+      nGen,
+      nReps,
+      nCtx: DEFAULT_N_CTX,
+      nGpuLayers: DEFAULT_N_GPU_LAYERS,
+    }, {
+      onStatus: (s, m) => row.setStatus(`gpu/${s}`, m),
+      onProgress: (fr, d, t) => row.setProgress(fr, d, t),
+      onLog: logLine,
     });
+  } catch (err) {
+    gpuResult = { status: 'error', error: err.message || String(err) };
   }
   return {
+    status: gpuResult.status === 'done' ? 'done' : 'error',
+    error: gpuResult.status === 'done' ? null : (gpuResult.error || 'GPU run failed'),
     cpu: cpuResult,
+    gpu: gpuResult,
   };
 }
 // Round a finite number to two decimals; anything that is not a finite
 // number (NaN, +/-Infinity, strings, null, undefined) becomes 0.
 function round2(n) {
   if (!Number.isFinite(n)) return 0;
   return parseFloat(n.toFixed(2));
 }
/**
 * Look up one test result in a `metrics.tests` array by name prefix.
 *
 * @param {Array<object>|null|undefined} tests - Test entries, each expected
 *   to carry a string `name` such as "pp512" or "tg128".
 * @param {string} prefix - Name prefix to match, e.g. 'pp' or 'tg'.
 * @returns {object|null} The first entry whose `name` is a string starting
 *   with `prefix`, or null when `tests` is not an array or nothing matches
 *   (e.g. nPrompt=0 means no pp test was run).
 */
function findTest(tests, prefix) {
  if (!Array.isArray(tests)) return null;
  // `t?.name` tolerates null/undefined holes in the array instead of throwing.
  const match = tests.find(
    (t) => typeof t?.name === 'string' && t.name.startsWith(prefix),
  );
  return match ?? null;
}
 function makeRecord(v, vr, machine, browser, wallTimeMs) {
+  const gpu = vr.gpu;
+  const tests = gpu?.metrics?.tests || null;
+  const pp = findTest(tests, 'pp');
+  const tg = findTest(tests, 'tg');
+  // Llama-bench shape lives under metrics.tests; flat prefill_tok_s /
+  // decode_tok_s are kept for backward compat with the existing dashboard
+  // table cells until those are migrated to read from tests directly.
+  const metrics = tests ? {
+    tests,
+    n_prompt: gpu.metrics.n_prompt,
+    n_gen: gpu.metrics.n_gen,
+    n_reps: gpu.metrics.n_reps,
+    iterations: gpu.metrics.n_reps,
+    prefill_tok_s: pp ? round2(pp.avg_ts) : 0,
+    decode_tok_s:  tg ? round2(tg.avg_ts) : 0,
+    prefill_tok_s_stdev: pp ? round2(pp.stddev_ts) : 0,
+    decode_tok_s_stdev:  tg ? round2(tg.stddev_ts) : 0,
+    prefill_samples: pp ? pp.samples_ts : [],
+    decode_samples:  tg ? tg.samples_ts : [],
+    n_p_eval: pp ? pp.n_prompt : 0,
+    n_eval:   tg ? tg.n_gen    : 0,
+    t_p_eval_ms: pp ? round2(pp.avg_ns / 1e6) : 0,
+    t_eval_ms:   tg ? round2(tg.avg_ns / 1e6) : 0,
   } : null;
+  const cpuBaseline = vr.cpu?.status === 'done' && vr.cpu.consistency?.token_ids?.length ? {
+    // CPU pass no longer measures perf — only token_ids for consistency.
+    // Keep the field present but null-valued so dashboards that look it up
+    // don't crash; downstream code can treat null as "not measured".
+    prefill_tok_s: null,
+    decode_tok_s: null,
   } : null;
   return {
     browser,
     nCtx: DEFAULT_N_CTX,
     nPredict: DEFAULT_N_PREDICT,
+    nPrompt: gpu?.metrics?.n_prompt ?? 0,
+    nGen: gpu?.metrics?.n_gen ?? 0,
+    nReps: gpu?.metrics?.n_reps ?? 0,
     nGpuLayers: DEFAULT_N_GPU_LAYERS,
     timestamp: new Date().toISOString(),
     wallTimeMs,
+    webgpuAvailable: gpu?.webgpuAvailable ?? !!navigator.gpu,
+    gpuAdapterInfo: gpu?.gpuAdapterInfo ?? null,
+    buildType: gpu?.buildType ?? null,
     // llama.cpp version stamped from build-info.json. Lets us correlate
     // result drift with llama.cpp upgrades over time.
     llamaCppCommit: state.buildInfo?.llamaCppCommit ?? null,
     llamaCppDescribe: state.buildInfo?.llamaCppDescribe ?? null,
     dawnTag: state.buildInfo?.dawnTag ?? null,
     metrics,
+    consistency: gpu?.consistency ?? null,
     cpu_baseline: cpuBaseline,
+    output: gpu?.output || '',
     machine,
     source: `webgpu-bench/site (${state.surface})`,
   };
   let body = '';
   if (passed.length) {
     body += `## Passed (${passed.length})\n\n`;
+    // llama-bench-style markdown: separate pp / tg columns with avg \u00b1 stddev.
+    body += `| Model | Variant | Size | pp tok/s | tg tok/s | Wall s |\n`;
     body += `|---|---|---:|---:|---:|---:|\n`;
+    const fmtTest = (tests, prefix) => {
+      const t = tests?.find(x => x.name?.startsWith(prefix));
+      return t ? `${t.avg_ts.toFixed(2)} \u00b1 ${t.stddev_ts.toFixed(2)} (${t.name})` : '\u2014';
+    };
     for (const r of passed) {
       body += `| ${r.model} | ${r.variant} | ${formatSize(r.sizeMB)} | ${
+        fmtTest(r.metrics?.tests, 'pp')} | ${fmtTest(r.metrics?.tests, 'tg')} | ${
         (r.wallTimeMs / 1000).toFixed(1)} |\n`;
     }
     body += `\n`;
   wireFilters();
   wireFamilySearch();
   wireBatchSelect();
+  wirePerfInputs();
   wireRunHandlers();
   wireAbortHandler();
   wirePurgeHandler();

js/run/core.js CHANGED Viewed

@@ -1,8 +1,14 @@
 // Benchmark core: load GGUF via a source adapter, init llama.cpp WASM,
-// run inference, collect metrics. Used by harness.js (URL-param driven, for
-// runner.js) and by the Run-tab controller (UI driven).
 const DEFAULT_PROMPT = 'Hello, how are you?';
 async function loadBenchScriptOnce(buildType) {
   if (typeof globalThis.createBenchModule === 'function') return;
@@ -18,15 +24,186 @@ async function loadBenchScriptOnce(buildType) {
   }
 }
 export async function runBenchmarkCore({
   source,
   modelFile,
   hfRepo,
-  prompt = DEFAULT_PROMPT,
-  nPredict = 128,
   nCtx = 2048,
   nGpuLayers = 999,
-  refTokenIds = null,
   onStatus = () => {},
   onProgress = () => {},
   onLog = () => {},
@@ -46,14 +223,13 @@ export async function runBenchmarkCore({
     webgpuAvailable: !!navigator.gpu,
     gpuAdapterInfo: null,
     metrics: null,
     output: '',
   };
-  // Declared outside the try so the catch can free our heap allocation.
   let Module;
   try {
-    // WebGPU adapter probe — informational only.
     if (navigator.gpu) {
       try {
         const adapter = await navigator.gpu.requestAdapter();
@@ -70,7 +246,6 @@ export async function runBenchmarkCore({
       onLog('WebGPU: not available in this browser');
     }
-    // Load the Emscripten glue script once per page.
     onStatus('loading_wasm', `Loading WASM module (${buildType})...`);
     onLog(`JSPI supported: ${hasJspi} — using ${buildType} variant`);
     await loadBenchScriptOnce(buildType);
@@ -78,7 +253,6 @@ export async function runBenchmarkCore({
     Module = await globalThis.createBenchModule({
       print: (text) => onLog(`[wasm] ${text}`),
       printErr: (text) => onLog(`[wasm:err] ${text}`),
-      // Catch Emscripten abort() — Firefox can abort during Asyncify init.
       onAbort: (reason) => {
         const msg = `WASM aborted: ${reason}`;
         result.error = msg;
@@ -88,7 +262,6 @@ export async function runBenchmarkCore({
     });
     onLog('WASM module loaded');
-    // Download model via the injected source adapter.
     onStatus('downloading', `Downloading ${modelFile}...`);
     onLog(`Fetching model via source: ${hfRepo}/${modelFile}`);
     const { stream, contentLength } = await source.fetchModel(hfRepo, modelFile);
@@ -96,13 +269,8 @@ export async function runBenchmarkCore({
       contentLength ? `${(contentLength / (1024 * 1024)).toFixed(1)} MB` : 'unknown'
     }`);
-    // Stream the GGUF directly into the WASM heap (HeapFS-style) to avoid a
-    // duplicate JS-side MEMFS staging buffer. _malloc reserves a region in
-    // the linear memory; HEAPU8.set writes chunks in place. We then expose
-    // the region as a MEMFS file with `canOwn=true` so MEMFS does not copy,
-    // and override node.contents with a getter that always rebuilds the
-    // view from the saved pointer — this survives the heap growth that
-    // llama.cpp triggers during bench_init/bench_load.
     if (!(contentLength > 0)) {
       throw new Error('content-length is required for streaming into WASM heap');
     }
@@ -137,19 +305,14 @@ export async function runBenchmarkCore({
       Module._free(modelPtr);
       throw err;
     }
-    // Track on the result object so we can free in the success/exit paths.
     result._modelPtr = modelPtr;
-    // Init backend.
     onStatus('initializing', 'Initializing llama.cpp backends...');
-    onLog('Calling bench_init()...');
     const initResult = await Module.ccall('bench_init', 'number', [], [], { async: true });
     if (initResult !== 0) throw new Error(`bench_init failed: ${initResult}`);
     onLog('Backends initialized');
-    // Load model.
     onStatus('loading_model', `Loading model (ctx=${nCtx}, gpu_layers=${nGpuLayers})...`);
-    onLog(`Calling bench_load("/model.gguf", ${nCtx}, ${nGpuLayers})...`);
     const loadResult = await Module.ccall(
       'bench_load',
       'number',
@@ -160,89 +323,38 @@ export async function runBenchmarkCore({
     if (loadResult !== 0) throw new Error(`bench_load failed: ${loadResult}`);
     onLog('Model loaded');
-    // Drop the MEMFS node — llama.cpp's mmap captured a pointer into the
-    // _malloc'd region in the WASM heap, so the bytes themselves stay alive
-    // until we _free below after bench_exit.
     try {
       Module.FS.unlink('/model.gguf');
     } catch (e) {
       onLog(`Warning: could not remove model FS node: ${e.message}`);
     }
-    // Run inference.
-    onStatus('running', 'Running inference...');
-    onLog(`Calling bench_run(prompt, ${nPredict})...`);
-    const resultJson = await Module.ccall(
-      'bench_run',
-      'string',
-      ['string', 'number'],
-      [prompt, nPredict],
-      { async: true },
-    );
-    onLog(`bench_run returned: ${String(resultJson).substring(0, 200)}`);
-    const inferResult = JSON.parse(resultJson);
-    if (inferResult.error) throw new Error(`Inference error: ${inferResult.error}`);
-    const prefillTokS = inferResult.t_p_eval_ms > 0
-      ? (inferResult.n_p_eval / (inferResult.t_p_eval_ms / 1000)).toFixed(2)
-      : 'N/A';
-    const decodeTokS = inferResult.t_eval_ms > 0
-      ? (inferResult.n_eval / (inferResult.t_eval_ms / 1000)).toFixed(2)
-      : 'N/A';
-    result.metrics = {
-      ...inferResult,
-      prefill_tok_s: parseFloat(prefillTokS) || 0,
-      decode_tok_s: parseFloat(decodeTokS) || 0,
-    };
-    result.output = inferResult.output || '';
-    // Forced-decoding consistency check against a CPU reference token sequence.
-    if (refTokenIds && nGpuLayers > 0 && inferResult.token_ids?.length > 0) {
-      onLog('Running forced-decoding consistency check...');
-      const evalJson = await Module.ccall(
-        'bench_eval_tokens',
-        'string',
-        ['string', 'string'],
-        [prompt, refTokenIds],
-        { async: true },
-      );
-      const evalResult = JSON.parse(evalJson);
-      if (evalResult.error) {
-        onLog(`Consistency check error: ${evalResult.error}`);
-      } else {
-        result.consistency = evalResult;
-        onLog(
-          `Consistency: ${(evalResult.agreement_rate * 100).toFixed(1)}% top-1 agreement (` +
-          `${evalResult.n_agree}/${evalResult.n_tokens} tokens)`,
-        );
-        if (evalResult.first_disagreement >= 0) {
-          onLog(`First disagreement at token position ${evalResult.first_disagreement}`);
-        }
-      }
-    }
     onLog('Calling bench_exit()...');
     await Module.ccall('bench_exit', null, [], [], { async: true });
-    // Free the heap-resident model bytes now that llama.cpp has unmapped.
     if (result._modelPtr) {
       Module._free(result._modelPtr);
       delete result._modelPtr;
     }
     result.status = 'done';
-    onStatus('done', `Done! Prefill: ${prefillTokS} tok/s | Decode: ${decodeTokS} tok/s`);
-    onLog(
-      `Prefill: ${prefillTokS} tok/s (${inferResult.n_p_eval} tokens in ` +
-      `${inferResult.t_p_eval_ms.toFixed(0)} ms)`,
-    );
-    onLog(
-      `Decode:  ${decodeTokS} tok/s (${inferResult.n_eval} tokens in ` +
-      `${inferResult.t_eval_ms.toFixed(0)} ms)`,
-    );
-    onLog(`Output: ${(inferResult.output || '').substring(0, 200)}`);
     return result;
   } catch (err) {
     result.error = err.message || String(err);
@@ -250,7 +362,6 @@ export async function runBenchmarkCore({
     onStatus('error', `Error: ${err.message}`);
     onLog(`ERROR: ${err.message}`);
     if (err.stack) onLog(err.stack);
-    // Best-effort: release the model heap region so a re-run can reuse it.
     if (result._modelPtr && Module?._free) {
       try { Module._free(result._modelPtr); } catch { /* ignore */ }
       delete result._modelPtr;

 // Benchmark core: load GGUF via a source adapter, init llama.cpp WASM,
+// then run a consistency phase (forced-decoding against a CPU baseline) and
+// a perf phase (llama-bench-style pp/tg with warmup + n_reps timed reps).
+// Used by harness.js (URL-param driven, for runner.js) and by the Run-tab
+// controller (which runs the same logic in a Web Worker — see bench-worker.js).
 const DEFAULT_PROMPT = 'Hello, how are you?';
+const DEFAULT_N_PREDICT = 128;
+const DEFAULT_N_PROMPT = 512;
+const DEFAULT_N_GEN = 128;
+const DEFAULT_N_REPS = 5;
 async function loadBenchScriptOnce(buildType) {
   if (typeof globalThis.createBenchModule === 'function') return;
   }
 }
/**
 * Fold raw per-repetition timings into one llama-bench-shaped test record.
 *
 * Throughput is derived per sample (`1e9 * n_tokens / t_ns`, or 0 for a
 * non-positive timing) and `avg_ts` / `stddev_ts` are the mean and sample
 * standard deviation of those per-sample values. They are NOT derived from
 * `avg_ns` / `stddev_ns`, because the ns -> t/s mapping is not linear.
 * Both standard deviations use Bessel's correction (divide by n - 1) and are
 * 0 when there is a single sample.
 *
 * @param {string} name - Test label, e.g. "pp512" or "tg128".
 * @param {number} n_prompt - Prompt tokens processed per repetition.
 * @param {number} n_gen - Tokens generated per repetition.
 * @param {number[]} samples_ns - Elapsed time of each repetition, in ns.
 * @returns {{name: string, n_prompt: number, n_gen: number, avg_ns: number,
 *   stddev_ns: number, avg_ts: number, stddev_ts: number,
 *   samples_ns: number[], samples_ts: number[]}} ns values are rounded to
 *   integers, t/s values to two decimals; all zero/empty for no samples.
 */
function buildTest(name, n_prompt, n_gen, samples_ns) {
  const count = samples_ns.length;
  if (count === 0) {
    return {
      name,
      n_prompt,
      n_gen,
      avg_ns: 0,
      stddev_ns: 0,
      avg_ts: 0,
      stddev_ts: 0,
      samples_ns: [],
      samples_ts: [],
    };
  }

  const mean = (values) => values.reduce((acc, v) => acc + v, 0) / count;
  const sampleStddev = (values, centre) => {
    if (count <= 1) return Math.sqrt(0);
    const squares = values.reduce((acc, v) => acc + (v - centre) * (v - centre), 0);
    return Math.sqrt(squares / (count - 1));
  };
  const toHundredths = (x) => Math.round(x * 100) / 100;

  const avg_ns = mean(samples_ns);
  const stddev_ns = sampleStddev(samples_ns, avg_ns);

  const n_tokens = n_prompt + n_gen;
  const samples_ts = samples_ns.map((t) => (t > 0 ? (1e9 * n_tokens) / t : 0));
  const avg_ts = mean(samples_ts);
  const stddev_ts = sampleStddev(samples_ts, avg_ts);

  return {
    name,
    n_prompt,
    n_gen,
    avg_ns: Math.round(avg_ns),
    stddev_ns: Math.round(stddev_ns),
    avg_ts: toHundredths(avg_ts),
    stddev_ts: toHundredths(stddev_ts),
    samples_ns: samples_ns.map((t) => Math.round(t)),
    samples_ts: samples_ts.map((ts) => toHundredths(ts)),
  };
}
/**
 * Decode the JSON string returned by a bench_* C export.
 *
 * @param {string} label - Name of the call, used as the prefix of every
 *   error message so failures can be traced to the C entry point.
 * @param {string} raw - JSON text produced by the C side.
 * @returns {object} The parsed payload.
 * @throws {Error} `${label}: invalid JSON from C (...)` when `raw` does not
 *   parse, or parses to something other than an object (JSON.parse accepts
 *   `null` and bare primitives, which would otherwise surface later as an
 *   unlabelled TypeError or a silently wrong result).
 * @throws {Error} `${label}: <error>` when the payload carries a truthy
 *   `error` field.
 */
function parseBenchResult(label, raw) {
  let parsed;
  try {
    parsed = JSON.parse(raw);
  } catch (e) {
    // Keep the original SyntaxError reachable for debugging.
    throw new Error(`${label}: invalid JSON from C (${e.message})`, { cause: e });
  }
  if (parsed === null || typeof parsed !== 'object') {
    const got = parsed === null ? 'null' : typeof parsed;
    throw new Error(`${label}: invalid JSON from C (expected an object, got ${got})`);
  }
  if (parsed.error) throw new Error(`${label}: ${parsed.error}`);
  return parsed;
}
/**
 * Drive the consistency and perf phases on a WASM Module whose model is
 * already loaded.
 *
 * A parallel implementation lives in the worker (bench-worker.js); keep the
 * two in sync.
 *
 * Consistency phase (runs when `consistencyPrompt` is truthy):
 *   - `bench_run(consistencyPrompt, consistencyNPredict)` produces `output`
 *     and `token_ids`.
 *   - When `refTokenIds` is truthy, `bench_eval_tokens(consistencyPrompt,
 *     refTokenIds)` is called as well and its fields are merged into
 *     `consistency` (agreement rate etc.).
 *
 * Perf phase (runs when `nPrompt > 0` and/or `nGen > 0`), llama-bench style:
 *   - pp: optional warmup `bench_pp(nPrompt)`, then `nReps` timed calls.
 *   - tg: optional warmup `bench_tg(1)`, then `nReps` timed `bench_tg(nGen)`.
 *   Each timed call is wall-clocked with performance.now() around the ccall
 *   and converted from ms to ns. Warmup results are parsed (so C-side errors
 *   surface) but not recorded.
 *   NOTE(review): per the original notes, bench_pp/bench_tg use synthetic
 *   random tokens and clear the KV cache per call; that is C-side behaviour
 *   not visible here.
 *
 * @param {object} Module - Emscripten module exposing `ccall`.
 * @param {object} opts
 * @param {string} [opts.consistencyPrompt] - Non-empty => run consistency.
 * @param {number} [opts.consistencyNPredict] - Tokens bench_run generates.
 * @param {string} [opts.refTokenIds] - CSV of reference token IDs.
 * @param {number} opts.nPrompt - Prompt tokens for the pp test (<= 0 skips).
 * @param {number} opts.nGen - Generated tokens for the tg test (<= 0 skips).
 * @param {number} opts.nReps - Timed repetitions per test.
 * @param {boolean} [opts.noWarmup] - Truthy => skip the warmup calls.
 * @param {Function} [opts.onStatus] - `(phase, message)` progress callback.
 * @param {Function} [opts.onLog] - `(line)` log callback.
 * @returns {Promise<{metrics: ?{tests: object[], n_prompt: number,
 *   n_gen: number, n_reps: number}, consistency: ?object, output: string}>}
 * @throws {Error} Whatever parseBenchResult raises for a failed bench_* call.
 */
async function runBenchActions(Module, {
  // Consistency phase
  consistencyPrompt,
  consistencyNPredict,
  refTokenIds,
  // Perf phase
  nPrompt, nGen, nReps, noWarmup,
  // Reporting
  onStatus, onLog,
}) {
  // Every bench_* export used here is async and returns a JSON string.
  const callBench = (fn, argTypes, args) =>
    Module.ccall(fn, 'string', argTypes, args, { async: true });

  const out = { metrics: null, consistency: null, output: '' };

  // ─── Consistency phase ───
  if (consistencyPrompt) {
    onStatus?.('consistency', 'Running consistency check...');
    onLog?.(`bench_run("...", ${consistencyNPredict}) — consistency phase`);
    const runRaw = await callBench(
      'bench_run',
      ['string', 'number'],
      [consistencyPrompt, consistencyNPredict],
    );
    const generated = parseBenchResult('bench_run', runRaw);
    out.output = generated.output || '';
    out.consistency = { token_ids: generated.token_ids || [] };

    if (refTokenIds) {
      onLog?.('bench_eval_tokens — forced-decode vs CPU baseline');
      const evalRaw = await callBench(
        'bench_eval_tokens',
        ['string', 'string'],
        [consistencyPrompt, refTokenIds],
      );
      const ev = parseBenchResult('bench_eval_tokens', evalRaw);
      out.consistency = { ...out.consistency, ...ev };
      const agreementPct = (ev.agreement_rate * 100).toFixed(1);
      const tally = `${ev.n_agree}/${ev.n_tokens}`;
      const divergence = ev.first_disagreement >= 0
        ? ` — first diverge @ ${ev.first_disagreement}`
        : '';
      onLog?.(`Consistency: ${agreementPct}% top-1 agreement (${tally})${divergence}`);
    }
  }

  // ─── Perf phase (llama-bench style) ───
  const wantPp = nPrompt > 0;
  const wantTg = nGen > 0;
  if (!wantPp && !wantTg) return out;

  // One warmup (unless disabled) followed by nReps timed calls of `fn`.
  // Resolves to the elapsed time of each timed call, in nanoseconds.
  const collectSamples = async ({ fn, tokens, tag, warmupTokens, warmupStatus, warmupLog }) => {
    if (!noWarmup) {
      onStatus?.('perf', warmupStatus);
      onLog?.(warmupLog);
      const warmRaw = await callBench(fn, ['number'], [warmupTokens]);
      parseBenchResult(`${fn} warmup`, warmRaw);
    }
    const samples_ns = [];
    for (let i = 0; i < nReps; i++) {
      const progress = `${i + 1}/${nReps}`;
      onStatus?.('perf', `${tag} ${progress}`);
      const startedAt = performance.now();
      const raw = await callBench(fn, ['number'], [tokens]);
      const elapsedNs = (performance.now() - startedAt) * 1e6;
      parseBenchResult(fn, raw);
      samples_ns.push(elapsedNs);
      onLog?.(`${tag} run ${progress}: ${(elapsedNs / 1e6).toFixed(1)} ms (${(1e9 * tokens / elapsedNs).toFixed(1)} t/s)`);
    }
    return samples_ns;
  };

  const tests = [];
  if (wantPp) {
    const tag = `pp${nPrompt}`;
    const samples_ns = await collectSamples({
      fn: 'bench_pp',
      tokens: nPrompt,
      tag,
      warmupTokens: nPrompt,
      warmupStatus: `warmup pp${nPrompt}`,
      warmupLog: `bench_pp(${nPrompt}) — warmup`,
    });
    tests.push(buildTest(tag, nPrompt, 0, samples_ns));
  }
  if (wantTg) {
    const tag = `tg${nGen}`;
    const samples_ns = await collectSamples({
      fn: 'bench_tg',
      tokens: nGen,
      tag,
      warmupTokens: 1,
      warmupStatus: `warmup tg`,
      warmupLog: 'bench_tg(1) — warmup',
    });
    tests.push(buildTest(tag, 0, nGen, samples_ns));
  }

  out.metrics = {
    tests,
    n_prompt: wantPp ? nPrompt : 0,
    n_gen: wantTg ? nGen : 0,
    n_reps: nReps,
  };
  return out;
}
+// Public entry. Loads the WASM module + model, then dispatches to
+// runBenchActions for the actual workload. Returns a flat record shape
+// consumed by harness.js (window.__BENCH) and by controller.makeRecord.
 export async function runBenchmarkCore({
   source,
   modelFile,
   hfRepo,
+  // consistency phase
+  consistencyPrompt = DEFAULT_PROMPT,
+  consistencyNPredict = DEFAULT_N_PREDICT,
+  refTokenIds = null,
+  runConsistency = true,    // false ⇒ skip consistency phase entirely
+  // perf phase
+  nPrompt = DEFAULT_N_PROMPT,
+  nGen = DEFAULT_N_GEN,
+  nReps = DEFAULT_N_REPS,
+  noWarmup = false,
+  // model load
   nCtx = 2048,
   nGpuLayers = 999,
+  // reporting
   onStatus = () => {},
   onProgress = () => {},
   onLog = () => {},
     webgpuAvailable: !!navigator.gpu,
     gpuAdapterInfo: null,
     metrics: null,
+    consistency: null,
     output: '',
   };
   let Module;
   try {
     if (navigator.gpu) {
       try {
         const adapter = await navigator.gpu.requestAdapter();
       onLog('WebGPU: not available in this browser');
     }
     onStatus('loading_wasm', `Loading WASM module (${buildType})...`);
     onLog(`JSPI supported: ${hasJspi} — using ${buildType} variant`);
     await loadBenchScriptOnce(buildType);
     Module = await globalThis.createBenchModule({
       print: (text) => onLog(`[wasm] ${text}`),
       printErr: (text) => onLog(`[wasm:err] ${text}`),
       onAbort: (reason) => {
         const msg = `WASM aborted: ${reason}`;
         result.error = msg;
     });
     onLog('WASM module loaded');
     onStatus('downloading', `Downloading ${modelFile}...`);
     onLog(`Fetching model via source: ${hfRepo}/${modelFile}`);
     const { stream, contentLength } = await source.fetchModel(hfRepo, modelFile);
       contentLength ? `${(contentLength / (1024 * 1024)).toFixed(1)} MB` : 'unknown'
     }`);
+    // Stream the GGUF directly into the WASM heap (HeapFS-style) — see worker
+    // for the full explanation of why we override node.contents with a getter.
     if (!(contentLength > 0)) {
       throw new Error('content-length is required for streaming into WASM heap');
     }
       Module._free(modelPtr);
       throw err;
     }
     result._modelPtr = modelPtr;
     onStatus('initializing', 'Initializing llama.cpp backends...');
     const initResult = await Module.ccall('bench_init', 'number', [], [], { async: true });
     if (initResult !== 0) throw new Error(`bench_init failed: ${initResult}`);
     onLog('Backends initialized');
     onStatus('loading_model', `Loading model (ctx=${nCtx}, gpu_layers=${nGpuLayers})...`);
     const loadResult = await Module.ccall(
       'bench_load',
       'number',
     if (loadResult !== 0) throw new Error(`bench_load failed: ${loadResult}`);
     onLog('Model loaded');
     try {
       Module.FS.unlink('/model.gguf');
     } catch (e) {
       onLog(`Warning: could not remove model FS node: ${e.message}`);
     }
+    // ─── Consistency + perf phases ───
+    onStatus('running', 'Running benchmark...');
+    const actions = await runBenchActions(Module, {
+      consistencyPrompt: runConsistency ? consistencyPrompt : null,
+      consistencyNPredict,
+      refTokenIds,
+      nPrompt, nGen, nReps, noWarmup,
+      onStatus, onLog,
+    });
+    result.metrics = actions.metrics;
+    result.consistency = actions.consistency;
+    result.output = actions.output;
     onLog('Calling bench_exit()...');
     await Module.ccall('bench_exit', null, [], [], { async: true });
     if (result._modelPtr) {
       Module._free(result._modelPtr);
       delete result._modelPtr;
     }
     result.status = 'done';
+    const summary = result.metrics?.tests
+      ?.map(t => `${t.name}: ${t.avg_ts.toFixed(2)} ± ${t.stddev_ts.toFixed(2)} t/s`)
+      .join(' | ') || 'no perf';
+    onStatus('done', `Done! ${summary}`);
     return result;
   } catch (err) {
     result.error = err.message || String(err);
     onStatus('error', `Error: ${err.message}`);
     onLog(`ERROR: ${err.message}`);
     if (err.stack) onLog(err.stack);
     if (result._modelPtr && Module?._free) {
       try { Module._free(result._modelPtr); } catch { /* ignore */ }
       delete result._modelPtr;

js/tables.js CHANGED Viewed

@@ -76,10 +76,10 @@ export function renderResultsTable(results) {
     { key: 'status', label: 'Status', priority: 1 },
     { key: 'buildType', label: 'Build', priority: 3 },
     { key: 'webgpuAvailable', label: 'WebGPU', priority: 3 },
-    { key: 'decode_tok_s', label: 'Decode tok/s', priority: 1 },
-    { key: 'prefill_tok_s', label: 'Prefill tok/s', priority: 3 },
-    { key: 'cpu_baseline_decode_tok_s', label: 'CPU decode tok/s', priority: 2 },
-    { key: 'cpu_baseline_prefill_tok_s', label: 'CPU prefill tok/s', priority: 3 },
     { key: 'n_eval', label: 'n_eval', priority: 3 },
     { key: 't_eval_ms', label: 't_eval (ms)', priority: 3 },
     { key: 'n_p_eval', label: 'n_p_eval', priority: 3 },
@@ -133,9 +133,29 @@ export function renderResultsTable(results) {
         case 'decode_tok_s':
         case 'prefill_tok_s':
         case 'cpu_baseline_decode_tok_s':
-        case 'cpu_baseline_prefill_tok_s':
-          html += `<span class="mono">${formatTokS(r[col.key])}</span>`;
           break;
         case 't_eval_ms':
         case 't_p_eval_ms':
           html += `<span class="mono">${formatMs(r[col.key])}</span>`;

     { key: 'status', label: 'Status', priority: 1 },
     { key: 'buildType', label: 'Build', priority: 3 },
     { key: 'webgpuAvailable', label: 'WebGPU', priority: 3 },
+    { key: 'decode_tok_s', label: 'tg tok/s', priority: 1 },
+    { key: 'prefill_tok_s', label: 'pp tok/s', priority: 3 },
+    { key: 'cpu_baseline_decode_tok_s', label: 'CPU tg tok/s', priority: 2 },
+    { key: 'cpu_baseline_prefill_tok_s', label: 'CPU pp tok/s', priority: 3 },
     { key: 'n_eval', label: 'n_eval', priority: 3 },
     { key: 't_eval_ms', label: 't_eval (ms)', priority: 3 },
     { key: 'n_p_eval', label: 'n_p_eval', priority: 3 },
         case 'decode_tok_s':
         case 'prefill_tok_s':
         case 'cpu_baseline_decode_tok_s':
+        case 'cpu_baseline_prefill_tok_s': {
+          // llama-bench style "avg \u00b1 stddev" with the pp{N} / tg{N} test
+          // label as a tooltip when the new schema is present. Older records
+          // without stddev fall back to the bare avg from formatTokS.
+          const isDecode = col.key === 'decode_tok_s';
+          const isPrefill = col.key === 'prefill_tok_s';
+          const stddev = isDecode ? r.decode_stddev_ts
+            : isPrefill ? r.prefill_stddev_ts
+            : null;
+          const testName = isDecode ? r.tg_test_name
+            : isPrefill ? r.pp_test_name
+            : null;
+          const avg = r[col.key];
+          let cell;
+          if (avg != null && stddev != null) {
+            cell = `${formatTokS(avg)} \u00b1 ${formatTokS(stddev)}`;
+          } else {
+            cell = formatTokS(avg);
+          }
+          const titleAttr = testName ? ` title="${escapeHtml(testName)}"` : '';
+          html += `<span class="mono"${titleAttr}>${cell}</span>`;
           break;
+        }
         case 't_eval_ms':
         case 't_p_eval_ms':
           html += `<span class="mono">${formatMs(r[col.key])}</span>`;

run.html CHANGED Viewed

@@ -125,7 +125,15 @@
               </div>
             </div>
             <div class="filter-group">
-              <label class="filter-label" for="iterations-input">Iterations</label>
               <input type="number" id="iterations-input" class="filter-select run-iter-input" value="5" min="1" max="50" step="1">
             </div>
           </div>

               </div>
             </div>
             <div class="filter-group">
+              <label class="filter-label" for="n-prompt-input">Prompt tokens (-p)</label>
+              <input type="number" id="n-prompt-input" class="filter-select run-iter-input" value="512" min="0" max="4096" step="1">
+            </div>
+            <div class="filter-group">
+              <label class="filter-label" for="n-gen-input">Gen tokens (-n)</label>
+              <input type="number" id="n-gen-input" class="filter-select run-iter-input" value="128" min="0" max="4096" step="1">
+            </div>
+            <div class="filter-group">
+              <label class="filter-label" for="iterations-input">Reps (-r)</label>
               <input type="number" id="iterations-input" class="filter-select run-iter-input" value="5" min="1" max="50" step="1">
             </div>
           </div>