GitHub Actions committed on
Commit
e72601b
·
1 Parent(s): e6a49d5

sync from abhijitramesh/webgpu-bench@1be8b82935

Browse files
Files changed (7) hide show
  1. harness.js +23 -8
  2. js/dataset.js +14 -0
  3. js/run/bench-worker.js +146 -98
  4. js/run/controller.js +154 -118
  5. js/run/core.js +201 -90
  6. js/tables.js +26 -6
  7. run.html +9 -1
harness.js CHANGED
@@ -21,13 +21,22 @@ window.addEventListener('unhandledrejection', (e) => {
21
 
22
  (async function () {
23
  const params = new URLSearchParams(window.location.search);
24
- const modelFile = params.get('model') || '';
25
- const hfRepo = params.get('hfRepo') || 'unsloth/Llama-3.2-1B-Instruct-GGUF';
26
- const prompt = params.get('prompt') || 'Hello, how are you?';
27
- const nPredict = parseInt(params.get('nPredict') || '128', 10);
28
- const nCtx = parseInt(params.get('nCtx') || '2048', 10);
29
- const nGpuLayers = parseInt(params.get('nGpuLayers') || '999', 10);
30
- const refTokenIds = params.get('refTokenIds') || null;
 
 
 
 
 
 
 
 
 
31
 
32
  const hasJspi = 'Suspending' in WebAssembly;
33
 
@@ -73,7 +82,13 @@ window.addEventListener('unhandledrejection', (e) => {
73
 
74
  const result = await runBenchmarkCore({
75
  source: localSource(),
76
- modelFile, hfRepo, prompt, nPredict, nCtx, nGpuLayers, refTokenIds,
 
 
 
 
 
 
77
  onStatus, onProgress, onLog,
78
  });
79
 
 
21
 
22
  (async function () {
23
  const params = new URLSearchParams(window.location.search);
24
+ const modelFile = params.get('model') || '';
25
+ const hfRepo = params.get('hfRepo') || 'unsloth/Llama-3.2-1B-Instruct-GGUF';
26
+ const consistencyPrompt = params.get('prompt') || 'Hello, how are you?';
27
+ const consistencyNPredict = parseInt(params.get('nPredict') || '128', 10);
28
+ const nPrompt = parseInt(params.get('nPrompt') || '512', 10);
29
+ const nGen = parseInt(params.get('nGen') || '128', 10);
30
+ const nReps = parseInt(params.get('nReps') || '5', 10);
31
+ const nCtx = parseInt(params.get('nCtx') || '2048', 10);
32
+ const nGpuLayers = parseInt(params.get('nGpuLayers') || '999', 10);
33
+ const refTokenIds = params.get('refTokenIds') || null;
34
+ // mode=perf → skip consistency entirely (e.g. for the GPU perf-only pass).
35
+ // mode=consistency → skip perf (e.g. CPU baseline pass that just needs token_ids).
36
+ // default 'both' runs both phases in one model load.
37
+ const mode = params.get('mode') || 'both';
38
+ const runConsistency = mode !== 'perf';
39
+ const runPerf = mode !== 'consistency';
40
 
41
  const hasJspi = 'Suspending' in WebAssembly;
42
 
 
82
 
83
  const result = await runBenchmarkCore({
84
  source: localSource(),
85
+ modelFile, hfRepo,
86
+ consistencyPrompt, consistencyNPredict, refTokenIds,
87
+ runConsistency,
88
+ nPrompt: runPerf ? nPrompt : 0,
89
+ nGen: runPerf ? nGen : 0,
90
+ nReps,
91
+ nCtx, nGpuLayers,
92
  onStatus, onProgress, onLog,
93
  });
94
 
js/dataset.js CHANGED
@@ -120,6 +120,13 @@ async function fetchRunsBatch(datasetRepo, files) {
120
  produces. Keep field-for-field aligned with build-site.js so the merged
121
  results are indistinguishable from the baseline. */
122
  function flattenForDashboard(r, slug) {
 
 
 
 
 
 
 
123
  return {
124
  machineSlug: slug,
125
  timestamp: r.timestamp,
@@ -137,6 +144,13 @@ function flattenForDashboard(r, slug) {
137
  wallTimeMs: r.wallTimeMs,
138
  prefill_tok_s: r.metrics?.prefill_tok_s ?? null,
139
  decode_tok_s: r.metrics?.decode_tok_s ?? null,
 
 
 
 
 
 
 
140
  n_p_eval: r.metrics?.n_p_eval ?? null,
141
  t_p_eval_ms: r.metrics?.t_p_eval_ms ?? null,
142
  n_eval: r.metrics?.n_eval ?? null,
 
120
  produces. Keep field-for-field aligned with build-site.js so the merged
121
  results are indistinguishable from the baseline. */
122
  function flattenForDashboard(r, slug) {
123
+ // New-format records have metrics.tests = [{name:'pp512',...},{name:'tg128',...}].
124
+ // Old-format records have flat metrics.prefill_tok_s / decode_tok_s only.
125
+ // Surface both shapes so the table can render llama-bench-style "avg ± stddev"
126
+ // when stddev is available without breaking on older rows.
127
+ const tests = Array.isArray(r.metrics?.tests) ? r.metrics.tests : null;
128
+ const pp = tests?.find(t => t.name?.startsWith('pp')) || null;
129
+ const tg = tests?.find(t => t.name?.startsWith('tg')) || null;
130
  return {
131
  machineSlug: slug,
132
  timestamp: r.timestamp,
 
144
  wallTimeMs: r.wallTimeMs,
145
  prefill_tok_s: r.metrics?.prefill_tok_s ?? null,
146
  decode_tok_s: r.metrics?.decode_tok_s ?? null,
147
+ // llama-bench shape: per-test stddev + the test labels (pp{N} / tg{N})
148
+ prefill_stddev_ts: pp?.stddev_ts ?? r.metrics?.prefill_tok_s_stdev ?? null,
149
+ decode_stddev_ts: tg?.stddev_ts ?? r.metrics?.decode_tok_s_stdev ?? null,
150
+ pp_test_name: pp?.name ?? null,
151
+ tg_test_name: tg?.name ?? null,
152
+ pp_n_prompt: pp?.n_prompt ?? r.nPrompt ?? null,
153
+ tg_n_gen: tg?.n_gen ?? r.nGen ?? null,
154
  n_p_eval: r.metrics?.n_p_eval ?? null,
155
  t_p_eval_ms: r.metrics?.t_p_eval_ms ?? null,
156
  n_eval: r.metrics?.n_eval ?? null,
js/run/bench-worker.js CHANGED
@@ -6,12 +6,15 @@
6
  //
7
  // main → worker: {
8
  // type: 'run',
9
- // params: { buildType, prompt, nPredict, nCtx, nGpuLayers, refTokenIds,
10
- // contentLength },
11
- // // Exactly one of these — depends on whether the runtime supports
12
- // // transferable ReadableStreams (most desktops do; iOS Safari and some
13
- // // mobile Chrome configs don't, in which case the main thread drains
14
- // // the stream into an ArrayBuffer and transfers the buffer instead):
 
 
 
15
  // stream?: ReadableStream<Uint8Array>, // TRANSFERRED
16
  // buffer?: ArrayBuffer // TRANSFERRED (mobile fallback)
17
  // }
@@ -25,15 +28,57 @@
25
  // decode loops ignore signals, and termination is the only reliable way to
26
  // stop an in-flight WASM call.
27
  //
28
- // NOTE ON DUPLICATION: the orchestration below mirrors runBenchmarkCore()
29
- // in site/js/run/core.js. core.js stays the authoritative main-thread path
30
- // (used by harness.js + runner.js Playwright harness). When changing one,
31
- // change the other.
32
 
33
  const post = (msg) => self.postMessage(msg);
34
  const log = (line) => post({ type: 'log', line });
35
  const status = (s, msg) => post({ type: 'status', status: s, msg });
36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  self.onmessage = async (e) => {
38
  const { type } = e.data || {};
39
  if (type !== 'run') {
@@ -58,12 +103,18 @@ self.onmessage = async (e) => {
58
  async function runOne({ params, stream, buffer }) {
59
  const {
60
  buildType,
61
- prompt,
62
- nPredict,
63
  nCtx,
64
  nGpuLayers,
 
 
 
65
  refTokenIds,
66
- contentLength,
 
 
 
 
67
  } = params;
68
  if (!stream && !buffer) {
69
  throw new Error('runOne: exactly one of `stream` or `buffer` must be provided');
@@ -85,9 +136,6 @@ async function runOne({ params, stream, buffer }) {
85
  try {
86
  const adapter = await self.navigator.gpu.requestAdapter();
87
  if (adapter) {
88
- // GPUAdapterInfo is a host object — structured-clone can't serialize
89
- // it across postMessage. Copy the fields we care about into a plain
90
- // object before storing on result.
91
  const info = adapter.info;
92
  result.gpuAdapterInfo = info ? {
93
  vendor: info.vendor || '',
@@ -118,10 +166,6 @@ async function runOne({ params, stream, buffer }) {
118
  }
119
 
120
  const Module = await self.createBenchModule({
121
- // In a worker loaded via importScripts(), Emscripten can't infer the
122
- // script's directory and falls back to self.location (this worker's
123
- // own URL), which makes it look for bench.wasm next to bench-worker.js.
124
- // Pin the lookup to the build directory so it grabs the right file.
125
  locateFile: (filename) => `/build/${buildType}/${filename}`,
126
  print: (text) => log(`[wasm] ${text}`),
127
  printErr: (text) => log(`[wasm:err] ${text}`),
@@ -135,15 +179,6 @@ async function runOne({ params, stream, buffer }) {
135
  log('WASM module loaded');
136
 
137
  // ─── Stream the model into the WASM heap (HeapFS-style) ───
138
- // Avoid the JS-side MEMFS staging buffer by allocating space inside the
139
- // WASM heap with _malloc and writing chunks directly via HEAPU8.set. Then
140
- // register the file with MEMFS using a Uint8Array view backed by the heap
141
- // region, so llama.cpp's mmap can take the zero-copy branch in MEMFS.mmap
142
- // (which fires when contents.buffer === HEAP8.buffer).
143
- //
144
- // Heap growth during bench_init/bench_load detaches old views, so we
145
- // override node.contents with a getter that always rebuilds the view
146
- // from the saved pointer + length against the current Module.HEAPU8.
147
  if (!(contentLength > 0)) {
148
  throw new Error('content-length is required for streaming into WASM heap');
149
  }
@@ -168,10 +203,6 @@ async function runOne({ params, stream, buffer }) {
168
  post({ type: 'progress', fraction: downloaded / contentLength, downloaded, total: contentLength });
169
  }
170
  } else {
171
- // Buffered path (mobile fallback): the whole file is already in
172
- // memory. Copy it into the WASM heap in one shot. Progress was
173
- // emitted on the main thread while buffering, so we just report 100%
174
- // here for the loading phase.
175
  const view = new Uint8Array(buffer);
176
  if (view.byteLength !== contentLength) {
177
  log(`warning: buffer size ${view.byteLength} != content-length ${contentLength}`);
@@ -182,21 +213,15 @@ async function runOne({ params, stream, buffer }) {
182
  }
183
  log(`Model written to WASM heap @ 0x${modelPtr.toString(16)} (${(downloaded / (1024 * 1024)).toFixed(1)} MB)`);
184
 
185
- // Register as a MEMFS file with a heap-backed view. canOwn=true so MEMFS
186
- // doesn't make its own copy.
187
  const view = new Uint8Array(Module.HEAPU8.buffer, modelPtr, contentLength);
188
  Module.FS.createDataFile('/', 'model.gguf', view, true, false, true);
189
 
190
- // Replace contents with a getter — heap growth (e.g. when llama.cpp
191
- // allocates KV cache) replaces Module.HEAPU8.buffer, which would
192
- // detach our static view. The getter rebuilds against the live buffer.
193
  const node = Module.FS.lookupPath('/model.gguf').node;
194
  Object.defineProperty(node, 'contents', {
195
  get: () => new Uint8Array(Module.HEAPU8.buffer, modelPtr, contentLength),
196
  set: () => { /* read-only file */ },
197
  configurable: true,
198
  });
199
- // usedBytes is read by MEMFS for stat() — keep it accurate.
200
  node.usedBytes = contentLength;
201
  } catch (err) {
202
  Module._free(modelPtr);
@@ -222,86 +247,109 @@ async function runOne({ params, stream, buffer }) {
222
  if (loadResult !== 0) throw new Error(`bench_load failed: ${loadResult}`);
223
  log('Model loaded');
224
 
225
- // Drop the MEMFS node — the bytes themselves stay alive in the WASM heap
226
- // because llama.cpp's mmap captured a pointer into our _malloc'd region.
227
- // We free that region after bench_exit.
228
  try {
229
  Module.FS.unlink('/model.gguf');
230
  } catch (err) {
231
  log(`Warning: could not remove model FS node: ${err.message}`);
232
  }
233
 
234
- // ─── Inference ───
235
- status('running', 'Running inference...');
236
- const resultJson = await Module.ccall(
237
- 'bench_run',
238
- 'string',
239
- ['string', 'number'],
240
- [prompt, nPredict],
241
- { async: true },
242
- );
243
- log(`bench_run returned: ${String(resultJson).substring(0, 200)}`);
244
-
245
- const inferResult = JSON.parse(resultJson);
246
- if (inferResult.error) throw new Error(`Inference error: ${inferResult.error}`);
247
-
248
- const prefillTokS = inferResult.t_p_eval_ms > 0
249
- ? (inferResult.n_p_eval / (inferResult.t_p_eval_ms / 1000)).toFixed(2)
250
- : 'N/A';
251
- const decodeTokS = inferResult.t_eval_ms > 0
252
- ? (inferResult.n_eval / (inferResult.t_eval_ms / 1000)).toFixed(2)
253
- : 'N/A';
254
-
255
- result.metrics = {
256
- ...inferResult,
257
- prefill_tok_s: parseFloat(prefillTokS) || 0,
258
- decode_tok_s: parseFloat(decodeTokS) || 0,
259
- };
260
- result.output = inferResult.output || '';
261
-
262
- // ─── Consistency check ───
263
- if (refTokenIds && nGpuLayers > 0 && inferResult.token_ids?.length > 0) {
264
- log('Running forced-decoding consistency check...');
265
- const evalJson = await Module.ccall(
266
- 'bench_eval_tokens',
267
- 'string',
268
- ['string', 'string'],
269
- [prompt, refTokenIds],
270
  { async: true },
271
  );
272
- const evalResult = JSON.parse(evalJson);
273
- if (evalResult.error) {
274
- log(`Consistency check error: ${evalResult.error}`);
275
- } else {
276
- result.consistency = evalResult;
 
 
 
 
 
 
 
 
 
277
  log(
278
- `Consistency: ${(evalResult.agreement_rate * 100).toFixed(1)}% top-1 agreement (` +
279
- `${evalResult.n_agree}/${evalResult.n_tokens} tokens)`,
 
280
  );
281
- if (evalResult.first_disagreement >= 0) {
282
- log(`First disagreement at token position ${evalResult.first_disagreement}`);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
283
  }
 
284
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
285
  }
286
 
287
  await Module.ccall('bench_exit', null, [], [], { async: true });
288
 
289
- // Free the heap-resident model bytes now that llama.cpp has unmapped.
290
  if (modelPtr) {
291
  Module._free(modelPtr);
292
  modelPtr = 0;
293
  }
294
 
295
  result.status = 'done';
296
- status('done', `Done! Prefill: ${prefillTokS} tok/s | Decode: ${decodeTokS} tok/s`);
297
- log(
298
- `Prefill: ${prefillTokS} tok/s (${inferResult.n_p_eval} tokens in ` +
299
- `${inferResult.t_p_eval_ms.toFixed(0)} ms)`,
300
- );
301
- log(
302
- `Decode: ${decodeTokS} tok/s (${inferResult.n_eval} tokens in ` +
303
- `${inferResult.t_eval_ms.toFixed(0)} ms)`,
304
- );
305
- log(`Output: ${(inferResult.output || '').substring(0, 200)}`);
306
  return result;
307
  }
 
6
  //
7
  // main → worker: {
8
  // type: 'run',
9
+ // params: {
10
+ // buildType, contentLength,
11
+ // // model load
12
+ // nCtx, nGpuLayers,
13
+ // // consistency phase (set consistencyPrompt to '' to skip)
14
+ // consistencyPrompt, consistencyNPredict, refTokenIds,
15
+ // // perf phase
16
+ // nPrompt, nGen, nReps, noWarmup,
17
+ // },
18
  // stream?: ReadableStream<Uint8Array>, // TRANSFERRED
19
  // buffer?: ArrayBuffer // TRANSFERRED (mobile fallback)
20
  // }
 
28
  // decode loops ignore signals, and termination is the only reliable way to
29
  // stop an in-flight WASM call.
30
  //
31
+ // NOTE ON DUPLICATION: the orchestration below mirrors runBenchmarkCore() +
32
+ // runBenchActions() in site/js/run/core.js. core.js stays the authoritative
33
+ // main-thread path (used by harness.js + runner.js Playwright harness). When
34
+ // changing one, change the other.
35
 
36
  const post = (msg) => self.postMessage(msg);
37
  const log = (line) => post({ type: 'log', line });
38
  const status = (s, msg) => post({ type: 'status', status: s, msg });
39
 
40
+ // Aggregate raw nanosecond samples into the llama-bench result shape.
41
+ // Mirrors core.js buildTest — keep them identical.
42
+ function buildTest(name, n_prompt, n_gen, samples_ns) {
43
+ const n = samples_ns.length;
44
+ if (n === 0) {
45
+ return { name, n_prompt, n_gen, avg_ns: 0, stddev_ns: 0, avg_ts: 0, stddev_ts: 0, samples_ns: [], samples_ts: [] };
46
+ }
47
+ const avg_ns = samples_ns.reduce((a, b) => a + b, 0) / n;
48
+ const var_ns = n > 1
49
+ ? samples_ns.reduce((a, b) => a + (b - avg_ns) * (b - avg_ns), 0) / (n - 1)
50
+ : 0;
51
+ const stddev_ns = Math.sqrt(var_ns);
52
+ const n_tokens = n_prompt + n_gen;
53
+ const samples_ts = samples_ns.map(t => t > 0 ? (1e9 * n_tokens) / t : 0);
54
+ const avg_ts = samples_ts.reduce((a, b) => a + b, 0) / n;
55
+ const var_ts = n > 1
56
+ ? samples_ts.reduce((a, b) => a + (b - avg_ts) * (b - avg_ts), 0) / (n - 1)
57
+ : 0;
58
+ const stddev_ts = Math.sqrt(var_ts);
59
+ const round2 = x => Math.round(x * 100) / 100;
60
+ return {
61
+ name,
62
+ n_prompt,
63
+ n_gen,
64
+ avg_ns: Math.round(avg_ns),
65
+ stddev_ns: Math.round(stddev_ns),
66
+ avg_ts: round2(avg_ts),
67
+ stddev_ts: round2(stddev_ts),
68
+ samples_ns: samples_ns.map(Math.round),
69
+ samples_ts: samples_ts.map(round2),
70
+ };
71
+ }
72
+
73
+ function parseBenchResult(label, raw) {
74
+ let r;
75
+ try { r = JSON.parse(raw); } catch (e) {
76
+ throw new Error(`${label}: invalid JSON from C (${e.message})`);
77
+ }
78
+ if (r.error) throw new Error(`${label}: ${r.error}`);
79
+ return r;
80
+ }
81
+
82
  self.onmessage = async (e) => {
83
  const { type } = e.data || {};
84
  if (type !== 'run') {
 
103
  async function runOne({ params, stream, buffer }) {
104
  const {
105
  buildType,
106
+ contentLength,
 
107
  nCtx,
108
  nGpuLayers,
109
+ // consistency
110
+ consistencyPrompt,
111
+ consistencyNPredict,
112
  refTokenIds,
113
+ // perf
114
+ nPrompt,
115
+ nGen,
116
+ nReps,
117
+ noWarmup,
118
  } = params;
119
  if (!stream && !buffer) {
120
  throw new Error('runOne: exactly one of `stream` or `buffer` must be provided');
 
136
  try {
137
  const adapter = await self.navigator.gpu.requestAdapter();
138
  if (adapter) {
 
 
 
139
  const info = adapter.info;
140
  result.gpuAdapterInfo = info ? {
141
  vendor: info.vendor || '',
 
166
  }
167
 
168
  const Module = await self.createBenchModule({
 
 
 
 
169
  locateFile: (filename) => `/build/${buildType}/${filename}`,
170
  print: (text) => log(`[wasm] ${text}`),
171
  printErr: (text) => log(`[wasm:err] ${text}`),
 
179
  log('WASM module loaded');
180
 
181
  // ─── Stream the model into the WASM heap (HeapFS-style) ───
 
 
 
 
 
 
 
 
 
182
  if (!(contentLength > 0)) {
183
  throw new Error('content-length is required for streaming into WASM heap');
184
  }
 
203
  post({ type: 'progress', fraction: downloaded / contentLength, downloaded, total: contentLength });
204
  }
205
  } else {
 
 
 
 
206
  const view = new Uint8Array(buffer);
207
  if (view.byteLength !== contentLength) {
208
  log(`warning: buffer size ${view.byteLength} != content-length ${contentLength}`);
 
213
  }
214
  log(`Model written to WASM heap @ 0x${modelPtr.toString(16)} (${(downloaded / (1024 * 1024)).toFixed(1)} MB)`);
215
 
 
 
216
  const view = new Uint8Array(Module.HEAPU8.buffer, modelPtr, contentLength);
217
  Module.FS.createDataFile('/', 'model.gguf', view, true, false, true);
218
 
 
 
 
219
  const node = Module.FS.lookupPath('/model.gguf').node;
220
  Object.defineProperty(node, 'contents', {
221
  get: () => new Uint8Array(Module.HEAPU8.buffer, modelPtr, contentLength),
222
  set: () => { /* read-only file */ },
223
  configurable: true,
224
  });
 
225
  node.usedBytes = contentLength;
226
  } catch (err) {
227
  Module._free(modelPtr);
 
247
  if (loadResult !== 0) throw new Error(`bench_load failed: ${loadResult}`);
248
  log('Model loaded');
249
 
 
 
 
250
  try {
251
  Module.FS.unlink('/model.gguf');
252
  } catch (err) {
253
  log(`Warning: could not remove model FS node: ${err.message}`);
254
  }
255
 
256
+ // ─── Consistency phase ───
257
+ if (consistencyPrompt) {
258
+ status('consistency', 'Running consistency check...');
259
+ log(`bench_run("...", ${consistencyNPredict}) — consistency phase`);
260
+ const raw = await Module.ccall(
261
+ 'bench_run', 'string',
262
+ ['string', 'number'],
263
+ [consistencyPrompt, consistencyNPredict],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
264
  { async: true },
265
  );
266
+ const r = parseBenchResult('bench_run', raw);
267
+ result.output = r.output || '';
268
+ result.consistency = { token_ids: r.token_ids || [] };
269
+
270
+ if (refTokenIds) {
271
+ log('bench_eval_tokens — forced-decode vs CPU baseline');
272
+ const evalRaw = await Module.ccall(
273
+ 'bench_eval_tokens', 'string',
274
+ ['string', 'string'],
275
+ [consistencyPrompt, refTokenIds],
276
+ { async: true },
277
+ );
278
+ const ev = parseBenchResult('bench_eval_tokens', evalRaw);
279
+ result.consistency = { ...result.consistency, ...ev };
280
  log(
281
+ `Consistency: ${(ev.agreement_rate * 100).toFixed(1)}% top-1 agreement (` +
282
+ `${ev.n_agree}/${ev.n_tokens})` +
283
+ (ev.first_disagreement >= 0 ? ` — first diverge @ ${ev.first_disagreement}` : '')
284
  );
285
+ }
286
+ }
287
+
288
+ // ─── Perf phase (llama-bench style) ───
289
+ const wantPp = nPrompt > 0;
290
+ const wantTg = nGen > 0;
291
+ if (wantPp || wantTg) {
292
+ const tests = [];
293
+
294
+ if (wantPp) {
295
+ if (!noWarmup) {
296
+ status('perf', `warmup pp${nPrompt}`);
297
+ log(`bench_pp(${nPrompt}) — warmup`);
298
+ const raw = await Module.ccall('bench_pp', 'string', ['number'], [nPrompt], { async: true });
299
+ parseBenchResult('bench_pp warmup', raw);
300
+ }
301
+ const samples_ns = [];
302
+ for (let i = 0; i < nReps; i++) {
303
+ status('perf', `pp${nPrompt} ${i + 1}/${nReps}`);
304
+ const t0 = performance.now();
305
+ const raw = await Module.ccall('bench_pp', 'string', ['number'], [nPrompt], { async: true });
306
+ const t_ns = (performance.now() - t0) * 1e6;
307
+ parseBenchResult('bench_pp', raw);
308
+ samples_ns.push(t_ns);
309
+ log(`pp${nPrompt} run ${i + 1}/${nReps}: ${(t_ns / 1e6).toFixed(1)} ms (${(1e9 * nPrompt / t_ns).toFixed(1)} t/s)`);
310
  }
311
+ tests.push(buildTest(`pp${nPrompt}`, nPrompt, 0, samples_ns));
312
  }
313
+
314
+ if (wantTg) {
315
+ if (!noWarmup) {
316
+ status('perf', `warmup tg`);
317
+ log('bench_tg(1) — warmup');
318
+ const raw = await Module.ccall('bench_tg', 'string', ['number'], [1], { async: true });
319
+ parseBenchResult('bench_tg warmup', raw);
320
+ }
321
+ const samples_ns = [];
322
+ for (let i = 0; i < nReps; i++) {
323
+ status('perf', `tg${nGen} ${i + 1}/${nReps}`);
324
+ const t0 = performance.now();
325
+ const raw = await Module.ccall('bench_tg', 'string', ['number'], [nGen], { async: true });
326
+ const t_ns = (performance.now() - t0) * 1e6;
327
+ parseBenchResult('bench_tg', raw);
328
+ samples_ns.push(t_ns);
329
+ log(`tg${nGen} run ${i + 1}/${nReps}: ${(t_ns / 1e6).toFixed(1)} ms (${(1e9 * nGen / t_ns).toFixed(1)} t/s)`);
330
+ }
331
+ tests.push(buildTest(`tg${nGen}`, 0, nGen, samples_ns));
332
+ }
333
+
334
+ result.metrics = {
335
+ tests,
336
+ n_prompt: wantPp ? nPrompt : 0,
337
+ n_gen: wantTg ? nGen : 0,
338
+ n_reps: nReps,
339
+ };
340
  }
341
 
342
  await Module.ccall('bench_exit', null, [], [], { async: true });
343
 
 
344
  if (modelPtr) {
345
  Module._free(modelPtr);
346
  modelPtr = 0;
347
  }
348
 
349
  result.status = 'done';
350
+ const summary = result.metrics?.tests
351
+ ?.map(t => `${t.name}: ${t.avg_ts.toFixed(2)} ± ${t.stddev_ts.toFixed(2)} t/s`)
352
+ .join(' | ') || 'no perf';
353
+ status('done', `Done! ${summary}`);
 
 
 
 
 
 
354
  return result;
355
  }
js/run/controller.js CHANGED
@@ -22,7 +22,9 @@ const DEFAULT_N_PREDICT = 128;
22
  const DEFAULT_N_CTX = 2048;
23
  const DEFAULT_N_GPU_LAYERS = 999;
24
  const YIELD_BETWEEN_RUNS_MS = 500;
25
- const YIELD_BETWEEN_ITERATIONS_MS = 200;
 
 
26
  const DEFAULT_ITERATIONS = 5;
27
  const MIN_ITERATIONS_FOR_SUBMIT = 5;
28
 
@@ -39,6 +41,8 @@ const state = {
39
  results: [], // result records from the current session
40
  hfSession: null, // { accessToken, expiresAt, userName } when signed in
41
  iterations: DEFAULT_ITERATIONS,
 
 
42
  mounted: false,
43
  // Tracks variants the Run pipeline downloaded this session (as opposed to
44
  // the standalone Download button or pre-existing cache). Only these are
@@ -628,15 +632,34 @@ function wireBatchSelect() {
628
  });
629
  }
630
 
631
- function wireIterationsInput() {
632
- const el = $('iterations-input');
633
- if (!el) return;
634
- el.value = String(state.iterations);
635
- el.addEventListener('change', () => {
636
- const n = Math.max(1, Math.min(50, parseInt(el.value, 10) || DEFAULT_ITERATIONS));
637
- state.iterations = n;
638
- el.value = String(n);
639
- });
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
640
  }
641
 
642
  function submittableResults() {
@@ -767,8 +790,8 @@ function ensureProgressTable() {
767
  <th>Model</th>
768
  <th>Variant</th>
769
  <th>Status</th>
770
- <th class="num">Prefill tok/s</th>
771
- <th class="num">Decode tok/s</th>
772
  <th class="num">Wall s</th>
773
  <th>Error</th>
774
  </tr>
@@ -815,11 +838,21 @@ function progressRowFor(v) {
815
  fillFromRecord(record) {
816
  tr.className = `run-row-${record.status === 'done' ? 'ok' : 'error'}`;
817
  tr.querySelector('.status').textContent = record.status;
818
- tr.querySelector('.prefill').textContent = record.metrics?.prefill_tok_s ?? '—';
819
- tr.querySelector('.decode').textContent = record.metrics?.decode_tok_s ?? '—';
 
 
 
 
 
 
 
 
 
 
820
  tr.querySelector('.wall').textContent = record.wallTimeMs
821
  ? (record.wallTimeMs / 1000).toFixed(1)
822
- : '';
823
  tr.querySelector('.err').textContent = record.error || '';
824
  },
825
  };
@@ -1209,12 +1242,19 @@ async function runBenchmarkInWorker(v, params, callbacks) {
1209
  const record = await runInWorker({
1210
  params: {
1211
  buildType: 'Suspending' in WebAssembly ? 'jspi' : 'asyncify',
1212
- prompt: params.prompt,
1213
- nPredict: params.nPredict,
1214
  nCtx: params.nCtx,
1215
  nGpuLayers: params.nGpuLayers,
1216
- refTokenIds: params.refTokenIds,
1217
- contentLength: fetched.contentLength,
 
 
 
 
 
 
 
1218
  },
1219
  stream: fetched.stream,
1220
  onStatus: callbacks.onStatus,
@@ -1225,22 +1265,29 @@ async function runBenchmarkInWorker(v, params, callbacks) {
1225
  return record;
1226
  }
1227
 
1228
- // Runs one variant: CPU baseline (1x, for reference token IDs + consistency),
1229
- // then N GPU iterations (consistency check on the first only to save time).
 
 
1230
  // Returns an aggregate that makeRecord consumes.
1231
  async function runVariantWithIterations(v, row) {
1232
- const iterations = Math.max(1, state.iterations || DEFAULT_ITERATIONS);
 
 
1233
 
1234
  // ─── CPU baseline ───
 
1235
  row.setStatus('cpu-baseline', 'generating reference tokens');
1236
  let cpuResult;
1237
  try {
1238
  cpuResult = await runBenchmarkInWorker(v, {
1239
- prompt: DEFAULT_PROMPT,
1240
- nPredict: DEFAULT_N_PREDICT,
 
 
 
1241
  nCtx: DEFAULT_N_CTX,
1242
  nGpuLayers: 0,
1243
- refTokenIds: null,
1244
  }, {
1245
  onStatus: (status, msg) => row.setStatus(`cpu/${status}`, msg),
1246
  onProgress: (fr, downloaded, total) => row.setProgress(fr, downloaded, total),
@@ -1251,113 +1298,94 @@ async function runVariantWithIterations(v, row) {
1251
  }
1252
 
1253
  // CPU baseline is "best effort": if it fails (typically OOM on a tight
1254
- // tab), keep going with GPU runs but skip the consistency check, since
1255
- // we'd have no reference token IDs to compare against. The user still
1256
- // gets prefill/decode metrics — just no agreement-rate number.
1257
  const cpuOk = cpuResult.status === 'done';
1258
  if (!cpuOk) {
1259
  logLine(
1260
- `CPU baseline failed (${cpuResult.error || 'unknown'}) — proceeding with GPU runs, skipping consistency check.`
1261
  );
1262
  row.setStatus('cpu-skipped', 'continuing with GPU only');
1263
  }
1264
 
1265
- const refTokenIds = cpuOk ? (cpuResult.metrics?.token_ids || []).join(',') : '';
1266
 
1267
- // ─── GPU iterations ───
1268
- const gpuSamples = [];
1269
- let consistency = null;
1270
- let gpuCore = null;
1271
-
1272
- for (let i = 0; i < iterations; i++) {
1273
- if (state.aborted) break;
1274
- row.setStatus('gpu-run', `iteration ${i + 1}/${iterations}`);
1275
- let gpuResult;
1276
- try {
1277
- gpuResult = await runBenchmarkInWorker(v, {
1278
- prompt: DEFAULT_PROMPT,
1279
- nPredict: DEFAULT_N_PREDICT,
1280
- nCtx: DEFAULT_N_CTX,
1281
- nGpuLayers: DEFAULT_N_GPU_LAYERS,
1282
- refTokenIds: i === 0 ? (refTokenIds || null) : null,
1283
- }, {
1284
- onStatus: (s, m) => row.setStatus(`gpu${i + 1}/${s}`, m),
1285
- onProgress: (fr, d, t) => row.setProgress(fr, d, t),
1286
- onLog: logLine,
1287
- });
1288
- } catch (err) {
1289
- gpuResult = { status: 'error', error: err.message || String(err) };
1290
- }
1291
-
1292
- if (gpuResult.status !== 'done') {
1293
- return {
1294
- status: 'error',
1295
- error: `GPU iteration ${i + 1} failed: ${gpuResult.error || 'unknown'}`,
1296
- iterations: gpuSamples.length,
1297
- cpu: cpuResult,
1298
- gpuSamples,
1299
- consistency,
1300
- gpuCore: gpuCore || gpuResult,
1301
- };
1302
- }
1303
 
1304
- gpuSamples.push({
1305
- prefill_tok_s: gpuResult.metrics?.prefill_tok_s ?? 0,
1306
- decode_tok_s: gpuResult.metrics?.decode_tok_s ?? 0,
1307
- n_p_eval: gpuResult.metrics?.n_p_eval ?? 0,
1308
- n_eval: gpuResult.metrics?.n_eval ?? 0,
1309
- t_p_eval_ms: gpuResult.metrics?.t_p_eval_ms ?? 0,
1310
- t_eval_ms: gpuResult.metrics?.t_eval_ms ?? 0,
 
 
 
 
 
 
 
 
 
 
1311
  });
1312
- if (i === 0) {
1313
- consistency = gpuResult.consistency || null;
1314
- gpuCore = gpuResult;
1315
- }
1316
-
1317
- await sleep(YIELD_BETWEEN_ITERATIONS_MS);
1318
  }
1319
 
1320
  return {
1321
- status: gpuSamples.length > 0 ? 'done' : 'error',
1322
- error: gpuSamples.length === 0 ? 'no GPU iterations completed' : null,
1323
- iterations: gpuSamples.length,
1324
  cpu: cpuResult,
1325
- gpuSamples,
1326
- consistency,
1327
- gpuCore,
1328
  };
1329
  }
1330
 
1331
- function mean(arr, key) {
1332
- if (arr.length === 0) return 0;
1333
- return arr.reduce((a, x) => a + (x[key] || 0), 0) / arr.length;
1334
- }
1335
- function stdev(arr, key) {
1336
- if (arr.length < 2) return 0;
1337
- const m = mean(arr, key);
1338
- return Math.sqrt(arr.reduce((a, x) => a + ((x[key] || 0) - m) ** 2, 0) / arr.length);
1339
- }
1340
  function round2(n) { return Number.isFinite(n) ? parseFloat(n.toFixed(2)) : 0; }
1341
 
 
 
 
 
 
 
 
1342
  function makeRecord(v, vr, machine, browser, wallTimeMs) {
1343
- const first = vr.gpuSamples[0] || {};
1344
- const metrics = vr.gpuSamples.length > 0 ? {
1345
- prefill_tok_s: round2(mean(vr.gpuSamples, 'prefill_tok_s')),
1346
- decode_tok_s: round2(mean(vr.gpuSamples, 'decode_tok_s')),
1347
- prefill_tok_s_stdev: round2(stdev(vr.gpuSamples, 'prefill_tok_s')),
1348
- decode_tok_s_stdev: round2(stdev(vr.gpuSamples, 'decode_tok_s')),
1349
- prefill_samples: vr.gpuSamples.map(s => round2(s.prefill_tok_s)),
1350
- decode_samples: vr.gpuSamples.map(s => round2(s.decode_tok_s)),
1351
- iterations: vr.iterations,
1352
- n_p_eval: first.n_p_eval,
1353
- n_eval: first.n_eval,
1354
- t_p_eval_ms: first.t_p_eval_ms,
1355
- t_eval_ms: first.t_eval_ms,
 
 
 
 
 
 
 
 
 
 
 
1356
  } : null;
1357
 
1358
- const cpuBaseline = vr.cpu?.status === 'done' && vr.cpu.metrics ? {
1359
- prefill_tok_s: vr.cpu.metrics.prefill_tok_s,
1360
- decode_tok_s: vr.cpu.metrics.decode_tok_s,
 
 
 
1361
  } : null;
1362
 
1363
  return {
@@ -1371,21 +1399,24 @@ function makeRecord(v, vr, machine, browser, wallTimeMs) {
1371
  browser,
1372
  nCtx: DEFAULT_N_CTX,
1373
  nPredict: DEFAULT_N_PREDICT,
 
 
 
1374
  nGpuLayers: DEFAULT_N_GPU_LAYERS,
1375
  timestamp: new Date().toISOString(),
1376
  wallTimeMs,
1377
- webgpuAvailable: vr.gpuCore?.webgpuAvailable ?? !!navigator.gpu,
1378
- gpuAdapterInfo: vr.gpuCore?.gpuAdapterInfo ?? null,
1379
- buildType: vr.gpuCore?.buildType ?? null,
1380
  // llama.cpp version stamped from build-info.json. Lets us correlate
1381
  // result drift with llama.cpp upgrades over time.
1382
  llamaCppCommit: state.buildInfo?.llamaCppCommit ?? null,
1383
  llamaCppDescribe: state.buildInfo?.llamaCppDescribe ?? null,
1384
  dawnTag: state.buildInfo?.dawnTag ?? null,
1385
  metrics,
1386
- consistency: vr.consistency ?? null,
1387
  cpu_baseline: cpuBaseline,
1388
- output: vr.gpuCore?.output || '',
1389
  machine,
1390
  source: `webgpu-bench/site (${state.surface})`,
1391
  };
@@ -1501,11 +1532,16 @@ function generateMarkdown(results) {
1501
  let body = '';
1502
  if (passed.length) {
1503
  body += `## Passed (${passed.length})\n\n`;
1504
- body += `| Model | Variant | Size | Prefill tok/s | Decode tok/s | Wall s |\n`;
 
1505
  body += `|---|---|---:|---:|---:|---:|\n`;
 
 
 
 
1506
  for (const r of passed) {
1507
  body += `| ${r.model} | ${r.variant} | ${formatSize(r.sizeMB)} | ${
1508
- r.metrics?.prefill_tok_s ?? ''} | ${r.metrics?.decode_tok_s ?? ''} | ${
1509
  (r.wallTimeMs / 1000).toFixed(1)} |\n`;
1510
  }
1511
  body += `\n`;
@@ -1707,7 +1743,7 @@ export async function mountRunSection() {
1707
  wireFilters();
1708
  wireFamilySearch();
1709
  wireBatchSelect();
1710
- wireIterationsInput();
1711
  wireRunHandlers();
1712
  wireAbortHandler();
1713
  wirePurgeHandler();
 
22
  const DEFAULT_N_CTX = 2048;
23
  const DEFAULT_N_GPU_LAYERS = 999;
24
  const YIELD_BETWEEN_RUNS_MS = 500;
25
+ // llama-bench defaults: -p 512 -n 128 -r 5
26
+ const DEFAULT_N_PROMPT = 512;
27
+ const DEFAULT_N_GEN = 128;
28
  const DEFAULT_ITERATIONS = 5;
29
  const MIN_ITERATIONS_FOR_SUBMIT = 5;
30
 
 
41
  results: [], // result records from the current session
42
  hfSession: null, // { accessToken, expiresAt, userName } when signed in
43
  iterations: DEFAULT_ITERATIONS,
44
+ nPrompt: DEFAULT_N_PROMPT,
45
+ nGen: DEFAULT_N_GEN,
46
  mounted: false,
47
  // Tracks variants the Run pipeline downloaded this session (as opposed to
48
  // the standalone Download button or pre-existing cache). Only these are
 
632
  });
633
  }
634
 
635
/**
 * Binds the perf-settings inputs to `state`.
 *
 * - `#iterations-input` -> `state.iterations`, clamped to 1..50; an
 *   unparsable (or zero) entry falls back to DEFAULT_ITERATIONS.
 * - `#n-prompt-input`   -> `state.nPrompt`, clamped to 0..MAX_TOKEN_COUNT.
 * - `#n-gen-input`      -> `state.nGen`,    clamped to 0..MAX_TOKEN_COUNT.
 *
 * Each input is seeded from the current state and rewritten with the
 * normalized value after every change, so the UI always shows what will
 * actually be used. Inputs missing from the page are skipped.
 */
function wirePerfInputs() {
  // NOTE(review): runs in this file load the model with nCtx = DEFAULT_N_CTX
  // (2048), while this cap allows up to 4096 tokens. Confirm that bench_pp /
  // bench_tg accept token counts above the context size, or lower the cap.
  const MAX_TOKEN_COUNT = 4096;

  // The prompt-length and generation-length inputs share identical
  // parse/clamp/fallback behavior, so wire them through a single helper.
  const wireTokenCountInput = (elementId, stateKey, fallback) => {
    const input = $(elementId);
    if (!input) return;
    input.value = String(state[stateKey]);
    input.addEventListener('change', () => {
      // parseInt yields NaN for unparsable text; NaN propagates through
      // Math.min/Math.max and is replaced by the fallback below.
      const n = Math.max(0, Math.min(MAX_TOKEN_COUNT, parseInt(input.value, 10)));
      state[stateKey] = Number.isFinite(n) ? n : fallback;
      input.value = String(state[stateKey]);
    });
  };

  const reps = $('iterations-input');
  if (reps) {
    reps.value = String(state.iterations);
    reps.addEventListener('change', () => {
      const n = Math.max(1, Math.min(50, parseInt(reps.value, 10) || DEFAULT_ITERATIONS));
      state.iterations = n;
      reps.value = String(n);
    });
  }

  wireTokenCountInput('n-prompt-input', 'nPrompt', DEFAULT_N_PROMPT);
  wireTokenCountInput('n-gen-input', 'nGen', DEFAULT_N_GEN);
}
664
 
665
  function submittableResults() {
 
790
  <th>Model</th>
791
  <th>Variant</th>
792
  <th>Status</th>
793
+ <th class="num" title="Prompt processing throughput (avg \u00b1 stddev t/s)">pp tok/s</th>
794
+ <th class="num" title="Text generation throughput (avg \u00b1 stddev t/s)">tg tok/s</th>
795
  <th class="num">Wall s</th>
796
  <th>Error</th>
797
  </tr>
 
838
  fillFromRecord(record) {
839
  tr.className = `run-row-${record.status === 'done' ? 'ok' : 'error'}`;
840
  tr.querySelector('.status').textContent = record.status;
841
+ // Format llama-bench style: "avg \u00b1 stddev" with the test name as
842
+ // the cell tooltip so users see the exact pp/tg N that was measured.
843
+ const tests = record.metrics?.tests || [];
844
+ const pp = tests.find(t => t.name?.startsWith('pp'));
845
+ const tg = tests.find(t => t.name?.startsWith('tg'));
846
+ const fmt = (t) => t ? `${t.avg_ts.toFixed(2)} \u00b1 ${t.stddev_ts.toFixed(2)}` : '\u2014';
847
+ const ppCell = tr.querySelector('.prefill');
848
+ ppCell.textContent = fmt(pp);
849
+ if (pp) ppCell.title = pp.name;
850
+ const tgCell = tr.querySelector('.decode');
851
+ tgCell.textContent = fmt(tg);
852
+ if (tg) tgCell.title = tg.name;
853
  tr.querySelector('.wall').textContent = record.wallTimeMs
854
  ? (record.wallTimeMs / 1000).toFixed(1)
855
+ : '\u2014';
856
  tr.querySelector('.err').textContent = record.error || '';
857
  },
858
  };
 
1242
  const record = await runInWorker({
1243
  params: {
1244
  buildType: 'Suspending' in WebAssembly ? 'jspi' : 'asyncify',
1245
+ contentLength: fetched.contentLength,
1246
+ // Model load
1247
  nCtx: params.nCtx,
1248
  nGpuLayers: params.nGpuLayers,
1249
+ // Consistency phase — empty consistencyPrompt skips it
1250
+ consistencyPrompt: params.consistencyPrompt || '',
1251
+ consistencyNPredict: params.consistencyNPredict || DEFAULT_N_PREDICT,
1252
+ refTokenIds: params.refTokenIds || null,
1253
+ // Perf phase — set both to 0 to skip
1254
+ nPrompt: params.nPrompt ?? 0,
1255
+ nGen: params.nGen ?? 0,
1256
+ nReps: params.nReps ?? DEFAULT_ITERATIONS,
1257
+ noWarmup: !!params.noWarmup,
1258
  },
1259
  stream: fetched.stream,
1260
  onStatus: callbacks.onStatus,
 
1265
  return record;
1266
  }
1267
 
1268
+ // Runs one variant: CPU consistency baseline (one model load, generates
1269
+ // reference token IDs via bench_run), then GPU pass (one model load that
1270
+ // does both consistency forced-decoding and the llama-bench-style perf
1271
+ // sweep — pp + tg with warmup + nReps timed reps each).
1272
  // Returns an aggregate that makeRecord consumes.
1273
  async function runVariantWithIterations(v, row) {
1274
+ const nReps = Math.max(1, state.iterations || DEFAULT_ITERATIONS);
1275
+ const nPrompt = Math.max(0, state.nPrompt ?? DEFAULT_N_PROMPT);
1276
+ const nGen = Math.max(0, state.nGen ?? DEFAULT_N_GEN);
1277
 
1278
  // ─── CPU baseline ───
1279
+ // Pure consistency pass — capture token_ids; no perf metrics on CPU.
1280
  row.setStatus('cpu-baseline', 'generating reference tokens');
1281
  let cpuResult;
1282
  try {
1283
  cpuResult = await runBenchmarkInWorker(v, {
1284
+ consistencyPrompt: DEFAULT_PROMPT,
1285
+ consistencyNPredict: DEFAULT_N_PREDICT,
1286
+ refTokenIds: null,
1287
+ nPrompt: 0,
1288
+ nGen: 0,
1289
  nCtx: DEFAULT_N_CTX,
1290
  nGpuLayers: 0,
 
1291
  }, {
1292
  onStatus: (status, msg) => row.setStatus(`cpu/${status}`, msg),
1293
  onProgress: (fr, downloaded, total) => row.setProgress(fr, downloaded, total),
 
1298
  }
1299
 
1300
  // CPU baseline is "best effort": if it fails (typically OOM on a tight
1301
+ // tab), keep going with the GPU pass but skip consistency. Perf metrics
1302
+ // are independent of consistency so they're still reported.
 
1303
  const cpuOk = cpuResult.status === 'done';
1304
  if (!cpuOk) {
1305
  logLine(
1306
+ `CPU baseline failed (${cpuResult.error || 'unknown'}) — proceeding with GPU run, skipping consistency check.`
1307
  );
1308
  row.setStatus('cpu-skipped', 'continuing with GPU only');
1309
  }
1310
 
1311
+ const refTokenIds = cpuOk ? (cpuResult.consistency?.token_ids || []).join(',') : '';
1312
 
1313
+ if (state.aborted) {
1314
+ return { status: 'error', error: 'aborted', cpu: cpuResult, gpu: null };
1315
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1316
 
1317
+ // ─── GPU pass: consistency + perf in one model load ───
1318
+ row.setStatus('gpu-run', 'loading model');
1319
+ let gpuResult;
1320
+ try {
1321
+ gpuResult = await runBenchmarkInWorker(v, {
1322
+ consistencyPrompt: DEFAULT_PROMPT,
1323
+ consistencyNPredict: DEFAULT_N_PREDICT,
1324
+ refTokenIds: refTokenIds || null,
1325
+ nPrompt,
1326
+ nGen,
1327
+ nReps,
1328
+ nCtx: DEFAULT_N_CTX,
1329
+ nGpuLayers: DEFAULT_N_GPU_LAYERS,
1330
+ }, {
1331
+ onStatus: (s, m) => row.setStatus(`gpu/${s}`, m),
1332
+ onProgress: (fr, d, t) => row.setProgress(fr, d, t),
1333
+ onLog: logLine,
1334
  });
1335
+ } catch (err) {
1336
+ gpuResult = { status: 'error', error: err.message || String(err) };
 
 
 
 
1337
  }
1338
 
1339
  return {
1340
+ status: gpuResult.status === 'done' ? 'done' : 'error',
1341
+ error: gpuResult.status === 'done' ? null : (gpuResult.error || 'GPU run failed'),
 
1342
  cpu: cpuResult,
1343
+ gpu: gpuResult,
 
 
1344
  };
1345
  }
1346
 
 
 
 
 
 
 
 
 
 
1347
  function round2(n) { return Number.isFinite(n) ? parseFloat(n.toFixed(2)) : 0; }
1348
 
1349
/**
 * Looks up a pp/tg test result in a `metrics.tests` array.
 *
 * @param {Array<object>|*} tests - Candidate list; anything that is not an
 *   array yields null.
 * @param {string} prefix - Test-name prefix to match (e.g. 'pp' or 'tg').
 * @returns {object|null} The first entry whose string `name` starts with
 *   `prefix`, or null when no such test was run (e.g. nPrompt=0 means there
 *   is no pp test).
 */
function findTest(tests, prefix) {
  if (!Array.isArray(tests)) return null;
  // `t?.name` tolerates null/undefined holes in the list instead of throwing.
  const match = tests.find(
    (t) => typeof t?.name === 'string' && t.name.startsWith(prefix),
  );
  return match || null;
}
1355
+
1356
  function makeRecord(v, vr, machine, browser, wallTimeMs) {
1357
+ const gpu = vr.gpu;
1358
+ const tests = gpu?.metrics?.tests || null;
1359
+ const pp = findTest(tests, 'pp');
1360
+ const tg = findTest(tests, 'tg');
1361
+
1362
+ // Llama-bench shape lives under metrics.tests; flat prefill_tok_s /
1363
+ // decode_tok_s are kept for backward compat with the existing dashboard
1364
+ // table cells until those are migrated to read from tests directly.
1365
+ const metrics = tests ? {
1366
+ tests,
1367
+ n_prompt: gpu.metrics.n_prompt,
1368
+ n_gen: gpu.metrics.n_gen,
1369
+ n_reps: gpu.metrics.n_reps,
1370
+ iterations: gpu.metrics.n_reps,
1371
+ prefill_tok_s: pp ? round2(pp.avg_ts) : 0,
1372
+ decode_tok_s: tg ? round2(tg.avg_ts) : 0,
1373
+ prefill_tok_s_stdev: pp ? round2(pp.stddev_ts) : 0,
1374
+ decode_tok_s_stdev: tg ? round2(tg.stddev_ts) : 0,
1375
+ prefill_samples: pp ? pp.samples_ts : [],
1376
+ decode_samples: tg ? tg.samples_ts : [],
1377
+ n_p_eval: pp ? pp.n_prompt : 0,
1378
+ n_eval: tg ? tg.n_gen : 0,
1379
+ t_p_eval_ms: pp ? round2(pp.avg_ns / 1e6) : 0,
1380
+ t_eval_ms: tg ? round2(tg.avg_ns / 1e6) : 0,
1381
  } : null;
1382
 
1383
+ const cpuBaseline = vr.cpu?.status === 'done' && vr.cpu.consistency?.token_ids?.length ? {
1384
+ // CPU pass no longer measures perf — only token_ids for consistency.
1385
+ // Keep the field present but null-valued so dashboards that look it up
1386
+ // don't crash; downstream code can treat null as "not measured".
1387
+ prefill_tok_s: null,
1388
+ decode_tok_s: null,
1389
  } : null;
1390
 
1391
  return {
 
1399
  browser,
1400
  nCtx: DEFAULT_N_CTX,
1401
  nPredict: DEFAULT_N_PREDICT,
1402
+ nPrompt: gpu?.metrics?.n_prompt ?? 0,
1403
+ nGen: gpu?.metrics?.n_gen ?? 0,
1404
+ nReps: gpu?.metrics?.n_reps ?? 0,
1405
  nGpuLayers: DEFAULT_N_GPU_LAYERS,
1406
  timestamp: new Date().toISOString(),
1407
  wallTimeMs,
1408
+ webgpuAvailable: gpu?.webgpuAvailable ?? !!navigator.gpu,
1409
+ gpuAdapterInfo: gpu?.gpuAdapterInfo ?? null,
1410
+ buildType: gpu?.buildType ?? null,
1411
  // llama.cpp version stamped from build-info.json. Lets us correlate
1412
  // result drift with llama.cpp upgrades over time.
1413
  llamaCppCommit: state.buildInfo?.llamaCppCommit ?? null,
1414
  llamaCppDescribe: state.buildInfo?.llamaCppDescribe ?? null,
1415
  dawnTag: state.buildInfo?.dawnTag ?? null,
1416
  metrics,
1417
+ consistency: gpu?.consistency ?? null,
1418
  cpu_baseline: cpuBaseline,
1419
+ output: gpu?.output || '',
1420
  machine,
1421
  source: `webgpu-bench/site (${state.surface})`,
1422
  };
 
1532
  let body = '';
1533
  if (passed.length) {
1534
  body += `## Passed (${passed.length})\n\n`;
1535
+ // llama-bench-style markdown: separate pp / tg columns with avg \u00b1 stddev.
1536
+ body += `| Model | Variant | Size | pp tok/s | tg tok/s | Wall s |\n`;
1537
  body += `|---|---|---:|---:|---:|---:|\n`;
1538
+ const fmtTest = (tests, prefix) => {
1539
+ const t = tests?.find(x => x.name?.startsWith(prefix));
1540
+ return t ? `${t.avg_ts.toFixed(2)} \u00b1 ${t.stddev_ts.toFixed(2)} (${t.name})` : '\u2014';
1541
+ };
1542
  for (const r of passed) {
1543
  body += `| ${r.model} | ${r.variant} | ${formatSize(r.sizeMB)} | ${
1544
+ fmtTest(r.metrics?.tests, 'pp')} | ${fmtTest(r.metrics?.tests, 'tg')} | ${
1545
  (r.wallTimeMs / 1000).toFixed(1)} |\n`;
1546
  }
1547
  body += `\n`;
 
1743
  wireFilters();
1744
  wireFamilySearch();
1745
  wireBatchSelect();
1746
+ wirePerfInputs();
1747
  wireRunHandlers();
1748
  wireAbortHandler();
1749
  wirePurgeHandler();
js/run/core.js CHANGED
@@ -1,8 +1,14 @@
1
  // Benchmark core: load GGUF via a source adapter, init llama.cpp WASM,
2
- // run inference, collect metrics. Used by harness.js (URL-param driven, for
3
- // runner.js) and by the Run-tab controller (UI driven).
 
 
4
 
5
  const DEFAULT_PROMPT = 'Hello, how are you?';
 
 
 
 
6
 
7
  async function loadBenchScriptOnce(buildType) {
8
  if (typeof globalThis.createBenchModule === 'function') return;
@@ -18,15 +24,186 @@ async function loadBenchScriptOnce(buildType) {
18
  }
19
  }
20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  export async function runBenchmarkCore({
22
  source,
23
  modelFile,
24
  hfRepo,
25
- prompt = DEFAULT_PROMPT,
26
- nPredict = 128,
 
 
 
 
 
 
 
 
 
27
  nCtx = 2048,
28
  nGpuLayers = 999,
29
- refTokenIds = null,
30
  onStatus = () => {},
31
  onProgress = () => {},
32
  onLog = () => {},
@@ -46,14 +223,13 @@ export async function runBenchmarkCore({
46
  webgpuAvailable: !!navigator.gpu,
47
  gpuAdapterInfo: null,
48
  metrics: null,
 
49
  output: '',
50
  };
51
 
52
- // Declared outside the try so the catch can free our heap allocation.
53
  let Module;
54
 
55
  try {
56
- // WebGPU adapter probe — informational only.
57
  if (navigator.gpu) {
58
  try {
59
  const adapter = await navigator.gpu.requestAdapter();
@@ -70,7 +246,6 @@ export async function runBenchmarkCore({
70
  onLog('WebGPU: not available in this browser');
71
  }
72
 
73
- // Load the Emscripten glue script once per page.
74
  onStatus('loading_wasm', `Loading WASM module (${buildType})...`);
75
  onLog(`JSPI supported: ${hasJspi} — using ${buildType} variant`);
76
  await loadBenchScriptOnce(buildType);
@@ -78,7 +253,6 @@ export async function runBenchmarkCore({
78
  Module = await globalThis.createBenchModule({
79
  print: (text) => onLog(`[wasm] ${text}`),
80
  printErr: (text) => onLog(`[wasm:err] ${text}`),
81
- // Catch Emscripten abort() — Firefox can abort during Asyncify init.
82
  onAbort: (reason) => {
83
  const msg = `WASM aborted: ${reason}`;
84
  result.error = msg;
@@ -88,7 +262,6 @@ export async function runBenchmarkCore({
88
  });
89
  onLog('WASM module loaded');
90
 
91
- // Download model via the injected source adapter.
92
  onStatus('downloading', `Downloading ${modelFile}...`);
93
  onLog(`Fetching model via source: ${hfRepo}/${modelFile}`);
94
  const { stream, contentLength } = await source.fetchModel(hfRepo, modelFile);
@@ -96,13 +269,8 @@ export async function runBenchmarkCore({
96
  contentLength ? `${(contentLength / (1024 * 1024)).toFixed(1)} MB` : 'unknown'
97
  }`);
98
 
99
- // Stream the GGUF directly into the WASM heap (HeapFS-style) to avoid a
100
- // duplicate JS-side MEMFS staging buffer. _malloc reserves a region in
101
- // the linear memory; HEAPU8.set writes chunks in place. We then expose
102
- // the region as a MEMFS file with `canOwn=true` so MEMFS does not copy,
103
- // and override node.contents with a getter that always rebuilds the
104
- // view from the saved pointer — this survives the heap growth that
105
- // llama.cpp triggers during bench_init/bench_load.
106
  if (!(contentLength > 0)) {
107
  throw new Error('content-length is required for streaming into WASM heap');
108
  }
@@ -137,19 +305,14 @@ export async function runBenchmarkCore({
137
  Module._free(modelPtr);
138
  throw err;
139
  }
140
- // Track on the result object so we can free in the success/exit paths.
141
  result._modelPtr = modelPtr;
142
 
143
- // Init backend.
144
  onStatus('initializing', 'Initializing llama.cpp backends...');
145
- onLog('Calling bench_init()...');
146
  const initResult = await Module.ccall('bench_init', 'number', [], [], { async: true });
147
  if (initResult !== 0) throw new Error(`bench_init failed: ${initResult}`);
148
  onLog('Backends initialized');
149
 
150
- // Load model.
151
  onStatus('loading_model', `Loading model (ctx=${nCtx}, gpu_layers=${nGpuLayers})...`);
152
- onLog(`Calling bench_load("/model.gguf", ${nCtx}, ${nGpuLayers})...`);
153
  const loadResult = await Module.ccall(
154
  'bench_load',
155
  'number',
@@ -160,89 +323,38 @@ export async function runBenchmarkCore({
160
  if (loadResult !== 0) throw new Error(`bench_load failed: ${loadResult}`);
161
  onLog('Model loaded');
162
 
163
- // Drop the MEMFS node — llama.cpp's mmap captured a pointer into the
164
- // _malloc'd region in the WASM heap, so the bytes themselves stay alive
165
- // until we _free below after bench_exit.
166
  try {
167
  Module.FS.unlink('/model.gguf');
168
  } catch (e) {
169
  onLog(`Warning: could not remove model FS node: ${e.message}`);
170
  }
171
 
172
- // Run inference.
173
- onStatus('running', 'Running inference...');
174
- onLog(`Calling bench_run(prompt, ${nPredict})...`);
175
- const resultJson = await Module.ccall(
176
- 'bench_run',
177
- 'string',
178
- ['string', 'number'],
179
- [prompt, nPredict],
180
- { async: true },
181
- );
182
- onLog(`bench_run returned: ${String(resultJson).substring(0, 200)}`);
183
-
184
- const inferResult = JSON.parse(resultJson);
185
- if (inferResult.error) throw new Error(`Inference error: ${inferResult.error}`);
186
-
187
- const prefillTokS = inferResult.t_p_eval_ms > 0
188
- ? (inferResult.n_p_eval / (inferResult.t_p_eval_ms / 1000)).toFixed(2)
189
- : 'N/A';
190
- const decodeTokS = inferResult.t_eval_ms > 0
191
- ? (inferResult.n_eval / (inferResult.t_eval_ms / 1000)).toFixed(2)
192
- : 'N/A';
193
-
194
- result.metrics = {
195
- ...inferResult,
196
- prefill_tok_s: parseFloat(prefillTokS) || 0,
197
- decode_tok_s: parseFloat(decodeTokS) || 0,
198
- };
199
- result.output = inferResult.output || '';
200
-
201
- // Forced-decoding consistency check against a CPU reference token sequence.
202
- if (refTokenIds && nGpuLayers > 0 && inferResult.token_ids?.length > 0) {
203
- onLog('Running forced-decoding consistency check...');
204
- const evalJson = await Module.ccall(
205
- 'bench_eval_tokens',
206
- 'string',
207
- ['string', 'string'],
208
- [prompt, refTokenIds],
209
- { async: true },
210
- );
211
- const evalResult = JSON.parse(evalJson);
212
- if (evalResult.error) {
213
- onLog(`Consistency check error: ${evalResult.error}`);
214
- } else {
215
- result.consistency = evalResult;
216
- onLog(
217
- `Consistency: ${(evalResult.agreement_rate * 100).toFixed(1)}% top-1 agreement (` +
218
- `${evalResult.n_agree}/${evalResult.n_tokens} tokens)`,
219
- );
220
- if (evalResult.first_disagreement >= 0) {
221
- onLog(`First disagreement at token position ${evalResult.first_disagreement}`);
222
- }
223
- }
224
- }
225
 
226
  onLog('Calling bench_exit()...');
227
  await Module.ccall('bench_exit', null, [], [], { async: true });
228
 
229
- // Free the heap-resident model bytes now that llama.cpp has unmapped.
230
  if (result._modelPtr) {
231
  Module._free(result._modelPtr);
232
  delete result._modelPtr;
233
  }
234
 
235
  result.status = 'done';
236
- onStatus('done', `Done! Prefill: ${prefillTokS} tok/s | Decode: ${decodeTokS} tok/s`);
237
- onLog(
238
- `Prefill: ${prefillTokS} tok/s (${inferResult.n_p_eval} tokens in ` +
239
- `${inferResult.t_p_eval_ms.toFixed(0)} ms)`,
240
- );
241
- onLog(
242
- `Decode: ${decodeTokS} tok/s (${inferResult.n_eval} tokens in ` +
243
- `${inferResult.t_eval_ms.toFixed(0)} ms)`,
244
- );
245
- onLog(`Output: ${(inferResult.output || '').substring(0, 200)}`);
246
  return result;
247
  } catch (err) {
248
  result.error = err.message || String(err);
@@ -250,7 +362,6 @@ export async function runBenchmarkCore({
250
  onStatus('error', `Error: ${err.message}`);
251
  onLog(`ERROR: ${err.message}`);
252
  if (err.stack) onLog(err.stack);
253
- // Best-effort: release the model heap region so a re-run can reuse it.
254
  if (result._modelPtr && Module?._free) {
255
  try { Module._free(result._modelPtr); } catch { /* ignore */ }
256
  delete result._modelPtr;
 
1
  // Benchmark core: load GGUF via a source adapter, init llama.cpp WASM,
2
+ // then run a consistency phase (forced-decoding against a CPU baseline) and
3
+ // a perf phase (llama-bench-style pp/tg with warmup + n_reps timed reps).
4
+ // Used by harness.js (URL-param driven, for runner.js) and by the Run-tab
5
+ // controller (which runs the same logic in a Web Worker — see bench-worker.js).
6
 
7
  const DEFAULT_PROMPT = 'Hello, how are you?';
8
+ const DEFAULT_N_PREDICT = 128;
9
+ const DEFAULT_N_PROMPT = 512;
10
+ const DEFAULT_N_GEN = 128;
11
+ const DEFAULT_N_REPS = 5;
12
 
13
  async function loadBenchScriptOnce(buildType) {
14
  if (typeof globalThis.createBenchModule === 'function') return;
 
24
  }
25
  }
26
 
27
/**
 * Aggregates raw nanosecond timing samples into a llama-bench-shaped result.
 *
 * Each sample is converted to tokens/second as (n_tokens * 1e9) / t_ns, with
 * n_tokens = n_prompt + n_gen; a non-positive sample maps to 0 t/s. `avg_ts`
 * is the mean of those per-sample rates (NOT n_tokens * 1e9 / avg_ns), and
 * `stddev_ts` is their standard deviation, computed independently of
 * `stddev_ns` because the ns -> t/s mapping is not linear. Both standard
 * deviations use Bessel's correction (n - 1) and are 0 for a single sample.
 *
 * @param {string} name - Test label, e.g. "pp512" or "tg128".
 * @param {number} n_prompt - Prompt tokens processed per sample.
 * @param {number} n_gen - Tokens generated per sample.
 * @param {number[]} samples_ns - Elapsed time of each repetition, in ns.
 * @returns {{name: string, n_prompt: number, n_gen: number, avg_ns: number,
 *   stddev_ns: number, avg_ts: number, stddev_ts: number,
 *   samples_ns: number[], samples_ts: number[]}} ns values rounded to
 *   integers, t/s values rounded to two decimals; all-zero when there are no
 *   samples.
 */
function buildTest(name, n_prompt, n_gen, samples_ns) {
  const n = samples_ns.length;
  if (n === 0) {
    return {
      name,
      n_prompt,
      n_gen,
      avg_ns: 0,
      stddev_ns: 0,
      avg_ts: 0,
      stddev_ts: 0,
      samples_ns: [],
      samples_ts: [],
    };
  }

  const mean = (xs) => xs.reduce((a, b) => a + b, 0) / xs.length;
  // Sample standard deviation (Bessel's correction); 0 when only one sample.
  const sampleStddev = (xs, mu) => {
    if (xs.length <= 1) return 0;
    const sumSq = xs.reduce((acc, x) => acc + (x - mu) * (x - mu), 0);
    return Math.sqrt(sumSq / (xs.length - 1));
  };
  const round2 = (x) => Math.round(x * 100) / 100;

  const avg_ns = mean(samples_ns);
  const stddev_ns = sampleStddev(samples_ns, avg_ns);

  const n_tokens = n_prompt + n_gen;
  const samples_ts = samples_ns.map((t) => (t > 0 ? (1e9 * n_tokens) / t : 0));
  const avg_ts = mean(samples_ts);
  const stddev_ts = sampleStddev(samples_ts, avg_ts);

  return {
    name,
    n_prompt,
    n_gen,
    avg_ns: Math.round(avg_ns),
    stddev_ns: Math.round(stddev_ns),
    avg_ts: round2(avg_ts),
    stddev_ts: round2(stddev_ts),
    samples_ns: samples_ns.map((t) => Math.round(t)),
    samples_ts: samples_ts.map(round2),
  };
}
62
+
63
/**
 * Parses the JSON string returned by a bench_* C function.
 *
 * @param {string} label - Name of the call, used to prefix error messages.
 * @param {string} raw - JSON text returned from C.
 * @returns {*} The parsed value.
 * @throws {Error} If `raw` is not valid JSON (the parse error is attached as
 *   `cause`), if it parses to `null`, or if the parsed value carries a
 *   truthy `error` field.
 */
function parseBenchResult(label, raw) {
  let r;
  try {
    r = JSON.parse(raw);
  } catch (e) {
    throw new Error(`${label}: invalid JSON from C (${e.message})`, { cause: e });
  }
  // A literal `null` payload would otherwise surface as an unlabelled
  // TypeError on the `r.error` access below.
  if (r === null) throw new Error(`${label}: empty (null) result from C`);
  if (r.error) throw new Error(`${label}: ${r.error}`);
  return r;
}
73
+
74
// Run the consistency + perf phases against an already-loaded WASM Module.
// Returns { metrics: { tests, n_prompt, n_gen, n_reps }, consistency, output }.
// `metrics` stays null when the perf phase is skipped (nPrompt and nGen both
// <= 0); `consistency` stays null when consistencyPrompt is empty.
//
// Both the worker (bench-worker.js) and the main-thread path (this file) call
// into this. Keep the two implementations in sync.
//
// Any failure reported by a bench_* call (invalid JSON or an `error` field)
// propagates as an Error thrown by parseBenchResult.
async function runBenchActions(Module, {
  // Consistency phase
  consistencyPrompt,    // non-empty string ⇒ run consistency
  consistencyNPredict,  // tokens generated by bench_run during consistency
  refTokenIds,          // CSV of CPU-side token IDs ⇒ forced-decode against them
  // Perf phase
  nPrompt, nGen, nReps, noWarmup,
  // Reporting
  onStatus, onLog,
}) {
  const out = { metrics: null, consistency: null, output: '' };

  // ─── Consistency phase ───
  // Two sub-modes: (a) CPU baseline — generates token_ids via bench_run for a
  // future GPU verification pass; (b) GPU verification — runs bench_run then
  // bench_eval_tokens to compute the agreement rate against refTokenIds.
  if (consistencyPrompt) {
    onStatus?.('consistency', 'Running consistency check...');
    onLog?.(`bench_run("...", ${consistencyNPredict}) — consistency phase`);
    const raw = await Module.ccall(
      'bench_run', 'string',
      ['string', 'number'],
      [consistencyPrompt, consistencyNPredict],
      { async: true },
    );
    const r = parseBenchResult('bench_run', raw);
    out.output = r.output || '';
    out.consistency = { token_ids: r.token_ids || [] };

    if (refTokenIds) {
      onLog?.('bench_eval_tokens — forced-decode vs CPU baseline');
      const evalRaw = await Module.ccall(
        'bench_eval_tokens', 'string',
        ['string', 'string'],
        [consistencyPrompt, refTokenIds],
        { async: true },
      );
      const ev = parseBenchResult('bench_eval_tokens', evalRaw);
      out.consistency = { ...out.consistency, ...ev };
      onLog?.(
        `Consistency: ${(ev.agreement_rate * 100).toFixed(1)}% top-1 agreement (` +
        `${ev.n_agree}/${ev.n_tokens})` +
        (ev.first_disagreement >= 0 ? ` — first diverge @ ${ev.first_disagreement}` : '')
      );
    }
  }

  // ─── Perf phase (llama-bench style) ───
  // Synthetic random tokens; KV cleared per call inside bench_pp/bench_tg.
  // Warmup is one full pp + one tg(1) call before the timed reps, matching
  // tools/llama-bench/llama-bench.cpp.
  const wantPp = nPrompt > 0;
  const wantTg = nGen > 0;
  if (!wantPp && !wantTg) return out;

  // Runs one perf test: optional untimed warmup call, then nReps timed calls
  // of the same C entry point. pp and tg differ only in the function name,
  // token counts, and warmup size, so both go through this helper.
  const runPerfTest = async ({ fn, name, nTokens, testNPrompt, testNGen, warmupTokens, warmupStatus }) => {
    if (!noWarmup) {
      onStatus?.('perf', warmupStatus);
      onLog?.(`${fn}(${warmupTokens}) — warmup`);
      const warmRaw = await Module.ccall(fn, 'string', ['number'], [warmupTokens], { async: true });
      parseBenchResult(`${fn} warmup`, warmRaw);
    }
    const samples_ns = [];
    for (let i = 0; i < nReps; i++) {
      onStatus?.('perf', `${name} ${i + 1}/${nReps}`);
      // Wall-clock time around the whole call, converted ms → ns.
      const t0 = performance.now();
      const raw = await Module.ccall(fn, 'string', ['number'], [nTokens], { async: true });
      const t_ns = (performance.now() - t0) * 1e6;
      parseBenchResult(fn, raw);
      samples_ns.push(t_ns);
      // Guard against a zero elapsed reading (coarse timers) so the log never
      // prints "Infinity"; buildTest applies the same rule to its samples.
      const tps = t_ns > 0 ? (1e9 * nTokens) / t_ns : 0;
      onLog?.(`${name} run ${i + 1}/${nReps}: ${(t_ns / 1e6).toFixed(1)} ms (${tps.toFixed(1)} t/s)`);
    }
    return buildTest(name, testNPrompt, testNGen, samples_ns);
  };

  const tests = [];

  if (wantPp) {
    tests.push(await runPerfTest({
      fn: 'bench_pp',
      name: `pp${nPrompt}`,
      nTokens: nPrompt,
      testNPrompt: nPrompt,
      testNGen: 0,
      warmupTokens: nPrompt,
      warmupStatus: `warmup pp${nPrompt}`,
    }));
  }

  if (wantTg) {
    tests.push(await runPerfTest({
      fn: 'bench_tg',
      name: `tg${nGen}`,
      nTokens: nGen,
      testNPrompt: 0,
      testNGen: nGen,
      warmupTokens: 1,
      warmupStatus: `warmup tg`,
    }));
  }

  out.metrics = {
    tests,
    n_prompt: wantPp ? nPrompt : 0,
    n_gen: wantTg ? nGen : 0,
    n_reps: nReps,
  };

  return out;
}
185
+
186
+ // Public entry. Loads the WASM module + model, then dispatches to
187
+ // runBenchActions for the actual workload. Returns a flat record shape
188
+ // consumed by harness.js (window.__BENCH) and by controller.makeRecord.
189
  export async function runBenchmarkCore({
190
  source,
191
  modelFile,
192
  hfRepo,
193
+ // consistency phase
194
+ consistencyPrompt = DEFAULT_PROMPT,
195
+ consistencyNPredict = DEFAULT_N_PREDICT,
196
+ refTokenIds = null,
197
+ runConsistency = true, // false ⇒ skip consistency phase entirely
198
+ // perf phase
199
+ nPrompt = DEFAULT_N_PROMPT,
200
+ nGen = DEFAULT_N_GEN,
201
+ nReps = DEFAULT_N_REPS,
202
+ noWarmup = false,
203
+ // model load
204
  nCtx = 2048,
205
  nGpuLayers = 999,
206
+ // reporting
207
  onStatus = () => {},
208
  onProgress = () => {},
209
  onLog = () => {},
 
223
  webgpuAvailable: !!navigator.gpu,
224
  gpuAdapterInfo: null,
225
  metrics: null,
226
+ consistency: null,
227
  output: '',
228
  };
229
 
 
230
  let Module;
231
 
232
  try {
 
233
  if (navigator.gpu) {
234
  try {
235
  const adapter = await navigator.gpu.requestAdapter();
 
246
  onLog('WebGPU: not available in this browser');
247
  }
248
 
 
249
  onStatus('loading_wasm', `Loading WASM module (${buildType})...`);
250
  onLog(`JSPI supported: ${hasJspi} — using ${buildType} variant`);
251
  await loadBenchScriptOnce(buildType);
 
253
  Module = await globalThis.createBenchModule({
254
  print: (text) => onLog(`[wasm] ${text}`),
255
  printErr: (text) => onLog(`[wasm:err] ${text}`),
 
256
  onAbort: (reason) => {
257
  const msg = `WASM aborted: ${reason}`;
258
  result.error = msg;
 
262
  });
263
  onLog('WASM module loaded');
264
 
 
265
  onStatus('downloading', `Downloading ${modelFile}...`);
266
  onLog(`Fetching model via source: ${hfRepo}/${modelFile}`);
267
  const { stream, contentLength } = await source.fetchModel(hfRepo, modelFile);
 
269
  contentLength ? `${(contentLength / (1024 * 1024)).toFixed(1)} MB` : 'unknown'
270
  }`);
271
 
272
+ // Stream the GGUF directly into the WASM heap (HeapFS-style); see the worker
273
+ // (bench-worker.js) for the full explanation of why we override node.contents with a getter.
 
 
 
 
 
274
  if (!(contentLength > 0)) {
275
  throw new Error('content-length is required for streaming into WASM heap');
276
  }
 
305
  Module._free(modelPtr);
306
  throw err;
307
  }
 
308
  result._modelPtr = modelPtr;
309
 
 
310
  onStatus('initializing', 'Initializing llama.cpp backends...');
 
311
  const initResult = await Module.ccall('bench_init', 'number', [], [], { async: true });
312
  if (initResult !== 0) throw new Error(`bench_init failed: ${initResult}`);
313
  onLog('Backends initialized');
314
 
 
315
  onStatus('loading_model', `Loading model (ctx=${nCtx}, gpu_layers=${nGpuLayers})...`);
 
316
  const loadResult = await Module.ccall(
317
  'bench_load',
318
  'number',
 
323
  if (loadResult !== 0) throw new Error(`bench_load failed: ${loadResult}`);
324
  onLog('Model loaded');
325
 
 
 
 
326
  try {
327
  Module.FS.unlink('/model.gguf');
328
  } catch (e) {
329
  onLog(`Warning: could not remove model FS node: ${e.message}`);
330
  }
331
 
332
+ // ─── Consistency + perf phases ───
333
+ onStatus('running', 'Running benchmark...');
334
+ const actions = await runBenchActions(Module, {
335
+ consistencyPrompt: runConsistency ? consistencyPrompt : null,
336
+ consistencyNPredict,
337
+ refTokenIds,
338
+ nPrompt, nGen, nReps, noWarmup,
339
+ onStatus, onLog,
340
+ });
341
+ result.metrics = actions.metrics;
342
+ result.consistency = actions.consistency;
343
+ result.output = actions.output;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
344
 
345
  onLog('Calling bench_exit()...');
346
  await Module.ccall('bench_exit', null, [], [], { async: true });
347
 
 
348
  if (result._modelPtr) {
349
  Module._free(result._modelPtr);
350
  delete result._modelPtr;
351
  }
352
 
353
  result.status = 'done';
354
+ const summary = result.metrics?.tests
355
+ ?.map(t => `${t.name}: ${t.avg_ts.toFixed(2)} ± ${t.stddev_ts.toFixed(2)} t/s`)
356
+ .join(' | ') || 'no perf';
357
+ onStatus('done', `Done! ${summary}`);
 
 
 
 
 
 
358
  return result;
359
  } catch (err) {
360
  result.error = err.message || String(err);
 
362
  onStatus('error', `Error: ${err.message}`);
363
  onLog(`ERROR: ${err.message}`);
364
  if (err.stack) onLog(err.stack);
 
365
  if (result._modelPtr && Module?._free) {
366
  try { Module._free(result._modelPtr); } catch { /* ignore */ }
367
  delete result._modelPtr;
js/tables.js CHANGED
@@ -76,10 +76,10 @@ export function renderResultsTable(results) {
76
  { key: 'status', label: 'Status', priority: 1 },
77
  { key: 'buildType', label: 'Build', priority: 3 },
78
  { key: 'webgpuAvailable', label: 'WebGPU', priority: 3 },
79
- { key: 'decode_tok_s', label: 'Decode tok/s', priority: 1 },
80
- { key: 'prefill_tok_s', label: 'Prefill tok/s', priority: 3 },
81
- { key: 'cpu_baseline_decode_tok_s', label: 'CPU decode tok/s', priority: 2 },
82
- { key: 'cpu_baseline_prefill_tok_s', label: 'CPU prefill tok/s', priority: 3 },
83
  { key: 'n_eval', label: 'n_eval', priority: 3 },
84
  { key: 't_eval_ms', label: 't_eval (ms)', priority: 3 },
85
  { key: 'n_p_eval', label: 'n_p_eval', priority: 3 },
@@ -133,9 +133,29 @@ export function renderResultsTable(results) {
133
  case 'decode_tok_s':
134
  case 'prefill_tok_s':
135
  case 'cpu_baseline_decode_tok_s':
136
- case 'cpu_baseline_prefill_tok_s':
137
- html += `<span class="mono">${formatTokS(r[col.key])}</span>`;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
  break;
 
139
  case 't_eval_ms':
140
  case 't_p_eval_ms':
141
  html += `<span class="mono">${formatMs(r[col.key])}</span>`;
 
76
  { key: 'status', label: 'Status', priority: 1 },
77
  { key: 'buildType', label: 'Build', priority: 3 },
78
  { key: 'webgpuAvailable', label: 'WebGPU', priority: 3 },
79
+ { key: 'decode_tok_s', label: 'tg tok/s', priority: 1 },
80
+ { key: 'prefill_tok_s', label: 'pp tok/s', priority: 3 },
81
+ { key: 'cpu_baseline_decode_tok_s', label: 'CPU tg tok/s', priority: 2 },
82
+ { key: 'cpu_baseline_prefill_tok_s', label: 'CPU pp tok/s', priority: 3 },
83
  { key: 'n_eval', label: 'n_eval', priority: 3 },
84
  { key: 't_eval_ms', label: 't_eval (ms)', priority: 3 },
85
  { key: 'n_p_eval', label: 'n_p_eval', priority: 3 },
 
133
  case 'decode_tok_s':
134
  case 'prefill_tok_s':
135
  case 'cpu_baseline_decode_tok_s':
136
+ case 'cpu_baseline_prefill_tok_s': {
137
+ // llama-bench style "avg \u00b1 stddev" with the pp{N} / tg{N} test
138
+ // label as a tooltip when the new schema is present. Older records
139
+ // without stddev fall back to the bare avg from formatTokS.
140
+ const isDecode = col.key === 'decode_tok_s';
141
+ const isPrefill = col.key === 'prefill_tok_s';
142
+ const stddev = isDecode ? r.decode_stddev_ts
143
+ : isPrefill ? r.prefill_stddev_ts
144
+ : null;
145
+ const testName = isDecode ? r.tg_test_name
146
+ : isPrefill ? r.pp_test_name
147
+ : null;
148
+ const avg = r[col.key];
149
+ let cell;
150
+ if (avg != null && stddev != null) {
151
+ cell = `${formatTokS(avg)} \u00b1 ${formatTokS(stddev)}`;
152
+ } else {
153
+ cell = formatTokS(avg);
154
+ }
155
+ const titleAttr = testName ? ` title="${escapeHtml(testName)}"` : '';
156
+ html += `<span class="mono"${titleAttr}>${cell}</span>`;
157
  break;
158
+ }
159
  case 't_eval_ms':
160
  case 't_p_eval_ms':
161
  html += `<span class="mono">${formatMs(r[col.key])}</span>`;
run.html CHANGED
@@ -125,7 +125,15 @@
125
  </div>
126
  </div>
127
  <div class="filter-group">
128
- <label class="filter-label" for="iterations-input">Iterations</label>
 
 
 
 
 
 
 
 
129
  <input type="number" id="iterations-input" class="filter-select run-iter-input" value="5" min="1" max="50" step="1">
130
  </div>
131
  </div>
 
125
  </div>
126
  </div>
127
  <div class="filter-group">
128
+ <label class="filter-label" for="n-prompt-input">Prompt tokens (-p)</label>
129
+ <input type="number" id="n-prompt-input" class="filter-select run-iter-input" value="512" min="0" max="4096" step="1">
130
+ </div>
131
+ <div class="filter-group">
132
+ <label class="filter-label" for="n-gen-input">Gen tokens (-n)</label>
133
+ <input type="number" id="n-gen-input" class="filter-select run-iter-input" value="128" min="0" max="4096" step="1">
134
+ </div>
135
+ <div class="filter-group">
136
+ <label class="filter-label" for="iterations-input">Reps (-r)</label>
137
  <input type="number" id="iterations-input" class="filter-select run-iter-input" value="5" min="1" max="50" step="1">
138
  </div>
139
  </div>