GitHub Actions committed on
Commit
629e542
·
1 Parent(s): 1683f65

sync from abhijitramesh/webgpu-bench@1a0973fa5b

Browse files
Files changed (4) hide show
  1. js/run/bench-worker.js +84 -63
  2. js/run/controller.js +70 -31
  3. js/run/core.js +81 -64
  4. run.html +7 -0
js/run/bench-worker.js CHANGED
@@ -254,89 +254,110 @@ async function runOne({ params, stream, buffer }) {
254
  }
255
 
256
  // ─── Consistency phase ───
 
 
 
 
257
  if (consistencyPrompt) {
258
- status('consistency', 'Running consistency check...');
259
- log(`bench_run("...", ${consistencyNPredict}) — consistency phase`);
260
- const raw = await Module.ccall(
261
- 'bench_run', 'string',
262
- ['string', 'number'],
263
- [consistencyPrompt, consistencyNPredict],
264
- { async: true },
265
- );
266
- const r = parseBenchResult('bench_run', raw);
267
- result.output = r.output || '';
268
- result.consistency = { token_ids: r.token_ids || [] };
269
-
270
- if (refTokenIds) {
271
- log('bench_eval_tokens — forced-decode vs CPU baseline');
272
- const evalRaw = await Module.ccall(
273
- 'bench_eval_tokens', 'string',
274
- ['string', 'string'],
275
- [consistencyPrompt, refTokenIds],
276
  { async: true },
277
  );
278
- const ev = parseBenchResult('bench_eval_tokens', evalRaw);
279
- result.consistency = { ...result.consistency, ...ev };
280
- log(
281
- `Consistency: ${(ev.agreement_rate * 100).toFixed(1)}% top-1 agreement (` +
282
- `${ev.n_agree}/${ev.n_tokens})` +
283
- (ev.first_disagreement >= 0 ? ` — first diverge @ ${ev.first_disagreement}` : '')
284
- );
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
285
  }
286
  }
287
 
288
  // ─── Perf phase (llama-bench style) ───
 
 
 
289
  const wantPp = nPrompt > 0;
290
  const wantTg = nGen > 0;
291
  if (wantPp || wantTg) {
292
  const tests = [];
293
 
294
  if (wantPp) {
295
- if (!noWarmup) {
296
- status('perf', `warmup pp${nPrompt}`);
297
- log(`bench_pp(${nPrompt}) — warmup`);
298
- const raw = await Module.ccall('bench_pp', 'string', ['number'], [nPrompt], { async: true });
299
- parseBenchResult('bench_pp warmup', raw);
300
- }
301
- const samples_ns = [];
302
- for (let i = 0; i < nReps; i++) {
303
- status('perf', `pp${nPrompt} ${i + 1}/${nReps}`);
304
- const t0 = performance.now();
305
- const raw = await Module.ccall('bench_pp', 'string', ['number'], [nPrompt], { async: true });
306
- const t_ns = (performance.now() - t0) * 1e6;
307
- parseBenchResult('bench_pp', raw);
308
- samples_ns.push(t_ns);
309
- log(`pp${nPrompt} run ${i + 1}/${nReps}: ${(t_ns / 1e6).toFixed(1)} ms (${(1e9 * nPrompt / t_ns).toFixed(1)} t/s)`);
 
 
 
 
 
310
  }
311
- tests.push(buildTest(`pp${nPrompt}`, nPrompt, 0, samples_ns));
312
  }
313
 
314
  if (wantTg) {
315
- if (!noWarmup) {
316
- status('perf', `warmup tg`);
317
- log('bench_tg(1) — warmup');
318
- const raw = await Module.ccall('bench_tg', 'string', ['number'], [1], { async: true });
319
- parseBenchResult('bench_tg warmup', raw);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
320
  }
321
- const samples_ns = [];
322
- for (let i = 0; i < nReps; i++) {
323
- status('perf', `tg${nGen} ${i + 1}/${nReps}`);
324
- const t0 = performance.now();
325
- const raw = await Module.ccall('bench_tg', 'string', ['number'], [nGen], { async: true });
326
- const t_ns = (performance.now() - t0) * 1e6;
327
- parseBenchResult('bench_tg', raw);
328
- samples_ns.push(t_ns);
329
- log(`tg${nGen} run ${i + 1}/${nReps}: ${(t_ns / 1e6).toFixed(1)} ms (${(1e9 * nGen / t_ns).toFixed(1)} t/s)`);
330
- }
331
- tests.push(buildTest(`tg${nGen}`, 0, nGen, samples_ns));
332
  }
333
 
334
- result.metrics = {
335
- tests,
336
- n_prompt: wantPp ? nPrompt : 0,
337
- n_gen: wantTg ? nGen : 0,
338
- n_reps: nReps,
339
- };
 
 
340
  }
341
 
342
  await Module.ccall('bench_exit', null, [], [], { async: true });
 
254
  }
255
 
256
  // ─── Consistency phase ───
257
+ // Soft-fail: a failure here logs and falls through to the perf phase
258
+ // rather than aborting the whole run. Some devices/models can't survive
259
+ // bench_run (e.g. unsupported op, OOM mid-decode) but can still produce
260
+ // useful pp/tg numbers via synthetic-token paths.
261
  if (consistencyPrompt) {
262
+ try {
263
+ status('consistency', 'Running consistency check...');
264
+ log(`bench_run("...", ${consistencyNPredict}) — consistency phase`);
265
+ const raw = await Module.ccall(
266
+ 'bench_run', 'string',
267
+ ['string', 'number'],
268
+ [consistencyPrompt, consistencyNPredict],
 
 
 
 
 
 
 
 
 
 
 
269
  { async: true },
270
  );
271
+ const r = parseBenchResult('bench_run', raw);
272
+ result.output = r.output || '';
273
+ result.consistency = { token_ids: r.token_ids || [] };
274
+
275
+ if (refTokenIds) {
276
+ log('bench_eval_tokens — forced-decode vs CPU baseline');
277
+ const evalRaw = await Module.ccall(
278
+ 'bench_eval_tokens', 'string',
279
+ ['string', 'string'],
280
+ [consistencyPrompt, refTokenIds],
281
+ { async: true },
282
+ );
283
+ const ev = parseBenchResult('bench_eval_tokens', evalRaw);
284
+ result.consistency = { ...result.consistency, ...ev };
285
+ log(
286
+ `Consistency: ${(ev.agreement_rate * 100).toFixed(1)}% top-1 agreement (` +
287
+ `${ev.n_agree}/${ev.n_tokens})` +
288
+ (ev.first_disagreement >= 0 ? ` — first diverge @ ${ev.first_disagreement}` : '')
289
+ );
290
+ }
291
+ } catch (err) {
292
+ log(`Consistency phase failed: ${err.message} — continuing to perf phase`);
293
  }
294
  }
295
 
296
  // ─── Perf phase (llama-bench style) ───
297
+ // Each test (pp, tg) is wrapped independently so a failure in one doesn't
298
+ // skip the other. Empty samples_ns produces a buildTest with avg_ts=0,
299
+ // which the dashboard renders as a dash.
300
  const wantPp = nPrompt > 0;
301
  const wantTg = nGen > 0;
302
  if (wantPp || wantTg) {
303
  const tests = [];
304
 
305
  if (wantPp) {
306
+ try {
307
+ if (!noWarmup) {
308
+ status('perf', `warmup pp${nPrompt}`);
309
+ log(`bench_pp(${nPrompt}) — warmup`);
310
+ const raw = await Module.ccall('bench_pp', 'string', ['number'], [nPrompt], { async: true });
311
+ parseBenchResult('bench_pp warmup', raw);
312
+ }
313
+ const samples_ns = [];
314
+ for (let i = 0; i < nReps; i++) {
315
+ status('perf', `pp${nPrompt} ${i + 1}/${nReps}`);
316
+ const t0 = performance.now();
317
+ const raw = await Module.ccall('bench_pp', 'string', ['number'], [nPrompt], { async: true });
318
+ const t_ns = (performance.now() - t0) * 1e6;
319
+ parseBenchResult('bench_pp', raw);
320
+ samples_ns.push(t_ns);
321
+ log(`pp${nPrompt} run ${i + 1}/${nReps}: ${(t_ns / 1e6).toFixed(1)} ms (${(1e9 * nPrompt / t_ns).toFixed(1)} t/s)`);
322
+ }
323
+ tests.push(buildTest(`pp${nPrompt}`, nPrompt, 0, samples_ns));
324
+ } catch (err) {
325
+ log(`pp test failed: ${err.message}`);
326
  }
 
327
  }
328
 
329
  if (wantTg) {
330
+ try {
331
+ if (!noWarmup) {
332
+ status('perf', `warmup tg`);
333
+ log('bench_tg(1) — warmup');
334
+ const raw = await Module.ccall('bench_tg', 'string', ['number'], [1], { async: true });
335
+ parseBenchResult('bench_tg warmup', raw);
336
+ }
337
+ const samples_ns = [];
338
+ for (let i = 0; i < nReps; i++) {
339
+ status('perf', `tg${nGen} ${i + 1}/${nReps}`);
340
+ const t0 = performance.now();
341
+ const raw = await Module.ccall('bench_tg', 'string', ['number'], [nGen], { async: true });
342
+ const t_ns = (performance.now() - t0) * 1e6;
343
+ parseBenchResult('bench_tg', raw);
344
+ samples_ns.push(t_ns);
345
+ log(`tg${nGen} run ${i + 1}/${nReps}: ${(t_ns / 1e6).toFixed(1)} ms (${(1e9 * nGen / t_ns).toFixed(1)} t/s)`);
346
+ }
347
+ tests.push(buildTest(`tg${nGen}`, 0, nGen, samples_ns));
348
+ } catch (err) {
349
+ log(`tg test failed: ${err.message}`);
350
  }
 
 
 
 
 
 
 
 
 
 
 
351
  }
352
 
353
+ if (tests.length > 0) {
354
+ result.metrics = {
355
+ tests,
356
+ n_prompt: wantPp ? nPrompt : 0,
357
+ n_gen: wantTg ? nGen : 0,
358
+ n_reps: nReps,
359
+ };
360
+ }
361
  }
362
 
363
  await Module.ccall('bench_exit', null, [], [], { async: true });
js/run/controller.js CHANGED
@@ -43,6 +43,12 @@ const state = {
43
  iterations: DEFAULT_ITERATIONS,
44
  nPrompt: DEFAULT_N_PROMPT,
45
  nGen: DEFAULT_N_GEN,
 
 
 
 
 
 
46
  mounted: false,
47
  // Tracks variants the Run pipeline downloaded this session (as opposed to
48
  // the standalone Download button or pre-existing cache). Only these are
@@ -660,6 +666,20 @@ function wirePerfInputs() {
660
  ng.value = String(state.nGen);
661
  });
662
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
663
  }
664
 
665
  function submittableResults() {
@@ -1274,55 +1294,74 @@ async function runVariantWithIterations(v, row) {
1274
  const nReps = Math.max(1, state.iterations || DEFAULT_ITERATIONS);
1275
  const nPrompt = Math.max(0, state.nPrompt ?? DEFAULT_N_PROMPT);
1276
  const nGen = Math.max(0, state.nGen ?? DEFAULT_N_GEN);
 
 
 
 
 
 
 
 
1277
 
1278
  // ─── CPU baseline ───
1279
- // Consistency (token_ids) + a single warmup-then-1-rep perf measurement.
1280
- // The single rep gives us a CPU-vs-GPU speedup signal in the dashboard
1281
- // without paying for a full nReps sweep on CPU.
1282
- row.setStatus('cpu-baseline', 'reference tokens + 1-rep perf');
1283
  let cpuResult;
1284
- try {
1285
- cpuResult = await runBenchmarkInWorker(v, {
1286
- consistencyPrompt: DEFAULT_PROMPT,
1287
- consistencyNPredict: DEFAULT_N_PREDICT,
1288
- refTokenIds: null,
1289
- nPrompt,
1290
- nGen,
1291
- nReps: 1,
1292
- nCtx: DEFAULT_N_CTX,
1293
- nGpuLayers: 0,
1294
- }, {
1295
- onStatus: (status, msg) => row.setStatus(`cpu/${status}`, msg),
1296
- onProgress: (fr, downloaded, total) => row.setProgress(fr, downloaded, total),
1297
- onLog: logLine,
1298
- });
1299
- } catch (err) {
1300
- cpuResult = { status: 'error', error: err.message || String(err) };
 
 
 
 
 
 
 
 
1301
  }
1302
 
1303
- // CPU baseline is "best effort": if it fails (typically OOM on a tight
1304
- // tab), keep going with the GPU pass but skip consistency. Perf metrics
1305
- // are independent of consistency so they're still reported.
1306
  const cpuOk = cpuResult.status === 'done';
1307
- if (!cpuOk) {
1308
- logLine(
1309
- `CPU baseline failed (${cpuResult.error || 'unknown'}) — proceeding with GPU run, skipping consistency check.`
1310
- );
1311
  row.setStatus('cpu-skipped', 'continuing with GPU only');
1312
  }
1313
 
1314
- const refTokenIds = cpuOk ? (cpuResult.consistency?.token_ids || []).join(',') : '';
 
 
 
 
 
1315
 
1316
  if (state.aborted) {
1317
  return { status: 'error', error: 'aborted', cpu: cpuResult, gpu: null };
1318
  }
1319
 
1320
- // ─── GPU pass: consistency + perf in one model load ───
1321
  row.setStatus('gpu-run', 'loading model');
1322
  let gpuResult;
1323
  try {
1324
  gpuResult = await runBenchmarkInWorker(v, {
1325
- consistencyPrompt: DEFAULT_PROMPT,
1326
  consistencyNPredict: DEFAULT_N_PREDICT,
1327
  refTokenIds: refTokenIds || null,
1328
  nPrompt,
 
43
  iterations: DEFAULT_ITERATIONS,
44
  nPrompt: DEFAULT_N_PROMPT,
45
  nGen: DEFAULT_N_GEN,
46
+ // User-controlled phase toggles. Defaults match the previous behaviour:
47
+ // run consistency (CPU baseline + GPU forced-decode) AND run CPU perf
48
+ // baseline. Both checkable to skip — useful on devices where CPU is too
49
+ // slow / unreliable to be worth waiting for.
50
+ skipConsistency: false,
51
+ skipCpuPerf: false,
52
  mounted: false,
53
  // Tracks variants the Run pipeline downloaded this session (as opposed to
54
  // the standalone Download button or pre-existing cache). Only these are
 
666
  ng.value = String(state.nGen);
667
  });
668
  }
669
+ const skipCons = $('skip-consistency');
670
+ if (skipCons) {
671
+ skipCons.checked = state.skipConsistency;
672
+ skipCons.addEventListener('change', () => {
673
+ state.skipConsistency = skipCons.checked;
674
+ });
675
+ }
676
+ const skipCpu = $('skip-cpu-perf');
677
+ if (skipCpu) {
678
+ skipCpu.checked = state.skipCpuPerf;
679
+ skipCpu.addEventListener('change', () => {
680
+ state.skipCpuPerf = skipCpu.checked;
681
+ });
682
+ }
683
  }
684
 
685
  function submittableResults() {
 
1294
  const nReps = Math.max(1, state.iterations || DEFAULT_ITERATIONS);
1295
  const nPrompt = Math.max(0, state.nPrompt ?? DEFAULT_N_PROMPT);
1296
  const nGen = Math.max(0, state.nGen ?? DEFAULT_N_GEN);
1297
+ // Phase toggles from the run page. Combined effect:
1298
+ // skip both → only GPU perf, no CPU pass at all
1299
+ // skip consistency → CPU perf baseline + GPU perf, no token-id check
1300
+ // skip CPU perf → CPU consistency tokens + GPU consistency + GPU perf
1301
+ // skip neither → full default flow
1302
+ const runConsistency = !state.skipConsistency;
1303
+ const runCpuPerf = !state.skipCpuPerf;
1304
+ const needCpuPass = runConsistency || runCpuPerf;
1305
 
1306
  // ─── CPU baseline ───
1307
+ // Skipped entirely if both toggles disable it. Otherwise the pass mixes
1308
+ // and matches: consistency_run captures token_ids; perf phase runs at
1309
+ // nReps=1 (single warmup+timed rep — enough to populate the dashboard's
1310
+ // CPU/GPU comparison without doubling CPU runtime).
1311
  let cpuResult;
1312
+ if (needCpuPass) {
1313
+ const phaseLabel = runConsistency && runCpuPerf ? 'reference tokens + 1-rep perf'
1314
+ : runConsistency ? 'reference tokens'
1315
+ : '1-rep perf';
1316
+ row.setStatus('cpu-baseline', phaseLabel);
1317
+ try {
1318
+ cpuResult = await runBenchmarkInWorker(v, {
1319
+ consistencyPrompt: runConsistency ? DEFAULT_PROMPT : '',
1320
+ consistencyNPredict: DEFAULT_N_PREDICT,
1321
+ refTokenIds: null,
1322
+ nPrompt: runCpuPerf ? nPrompt : 0,
1323
+ nGen: runCpuPerf ? nGen : 0,
1324
+ nReps: 1,
1325
+ nCtx: DEFAULT_N_CTX,
1326
+ nGpuLayers: 0,
1327
+ }, {
1328
+ onStatus: (status, msg) => row.setStatus(`cpu/${status}`, msg),
1329
+ onProgress: (fr, downloaded, total) => row.setProgress(fr, downloaded, total),
1330
+ onLog: logLine,
1331
+ });
1332
+ } catch (err) {
1333
+ cpuResult = { status: 'error', error: err.message || String(err) };
1334
+ }
1335
+ } else {
1336
+ cpuResult = { status: 'skipped' };
1337
  }
1338
 
1339
+ // CPU pass is best-effort. Failures (OOM, slow device, missing op) don't
1340
+ // block the GPU run — the user opted into resilience implicitly by the
1341
+ // phase being best-effort, and explicitly via the skip checkboxes.
1342
  const cpuOk = cpuResult.status === 'done';
1343
+ if (cpuResult.status === 'error') {
1344
+ logLine(`CPU baseline failed (${cpuResult.error || 'unknown'}) — proceeding with GPU run.`);
 
 
1345
  row.setStatus('cpu-skipped', 'continuing with GPU only');
1346
  }
1347
 
1348
+ // refTokenIds is the GPU pass's input for forced-decode consistency. Only
1349
+ // pass when we actually have tokens (consistency was requested AND CPU
1350
+ // produced tokens).
1351
+ const refTokenIds = (cpuOk && runConsistency && cpuResult.consistency?.token_ids?.length)
1352
+ ? cpuResult.consistency.token_ids.join(',')
1353
+ : '';
1354
 
1355
  if (state.aborted) {
1356
  return { status: 'error', error: 'aborted', cpu: cpuResult, gpu: null };
1357
  }
1358
 
1359
+ // ─── GPU pass: consistency (when not skipped) + perf in one model load ───
1360
  row.setStatus('gpu-run', 'loading model');
1361
  let gpuResult;
1362
  try {
1363
  gpuResult = await runBenchmarkInWorker(v, {
1364
+ consistencyPrompt: runConsistency ? DEFAULT_PROMPT : '',
1365
  consistencyNPredict: DEFAULT_N_PREDICT,
1366
  refTokenIds: refTokenIds || null,
1367
  nPrompt,
js/run/core.js CHANGED
@@ -92,92 +92,109 @@ async function runBenchActions(Module, {
92
  // Two sub-modes: (a) CPU baseline — generates token_ids via bench_run for a
93
  // future GPU verification pass; (b) GPU verification — runs bench_run then
94
  // bench_eval_tokens to compute the agreement rate against refTokenIds.
 
 
95
  if (consistencyPrompt) {
96
- onStatus?.('consistency', 'Running consistency check...');
97
- onLog?.(`bench_run("...", ${consistencyNPredict}) — consistency phase`);
98
- const raw = await Module.ccall(
99
- 'bench_run', 'string',
100
- ['string', 'number'],
101
- [consistencyPrompt, consistencyNPredict],
102
- { async: true },
103
- );
104
- const r = parseBenchResult('bench_run', raw);
105
- out.output = r.output || '';
106
- out.consistency = { token_ids: r.token_ids || [] };
107
-
108
- if (refTokenIds) {
109
- onLog?.('bench_eval_tokens — forced-decode vs CPU baseline');
110
- const evalRaw = await Module.ccall(
111
- 'bench_eval_tokens', 'string',
112
- ['string', 'string'],
113
- [consistencyPrompt, refTokenIds],
114
  { async: true },
115
  );
116
- const ev = parseBenchResult('bench_eval_tokens', evalRaw);
117
- out.consistency = { ...out.consistency, ...ev };
118
- onLog?.(
119
- `Consistency: ${(ev.agreement_rate * 100).toFixed(1)}% top-1 agreement (` +
120
- `${ev.n_agree}/${ev.n_tokens})` +
121
- (ev.first_disagreement >= 0 ? ` — first diverge @ ${ev.first_disagreement}` : '')
122
- );
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
  }
124
  }
125
 
126
  // ─── Perf phase (llama-bench style) ───
127
  // Synthetic random tokens; KV cleared per call inside bench_pp/bench_tg.
128
  // Warmup is one full pp + one tg(1) call before the timed reps, matching
129
- // tools/llama-bench/llama-bench.cpp.
 
130
  const wantPp = nPrompt > 0;
131
  const wantTg = nGen > 0;
132
  if (wantPp || wantTg) {
133
  const tests = [];
134
 
135
  if (wantPp) {
136
- if (!noWarmup) {
137
- onStatus?.('perf', `warmup pp${nPrompt}`);
138
- onLog?.(`bench_pp(${nPrompt}) — warmup`);
139
- const raw = await Module.ccall('bench_pp', 'string', ['number'], [nPrompt], { async: true });
140
- parseBenchResult('bench_pp warmup', raw);
141
- }
142
- const samples_ns = [];
143
- for (let i = 0; i < nReps; i++) {
144
- onStatus?.('perf', `pp${nPrompt} ${i + 1}/${nReps}`);
145
- const t0 = performance.now();
146
- const raw = await Module.ccall('bench_pp', 'string', ['number'], [nPrompt], { async: true });
147
- const t_ns = (performance.now() - t0) * 1e6;
148
- parseBenchResult('bench_pp', raw);
149
- samples_ns.push(t_ns);
150
- onLog?.(`pp${nPrompt} run ${i + 1}/${nReps}: ${(t_ns / 1e6).toFixed(1)} ms (${(1e9 * nPrompt / t_ns).toFixed(1)} t/s)`);
 
 
 
 
 
151
  }
152
- tests.push(buildTest(`pp${nPrompt}`, nPrompt, 0, samples_ns));
153
  }
154
 
155
  if (wantTg) {
156
- if (!noWarmup) {
157
- onStatus?.('perf', `warmup tg`);
158
- onLog?.('bench_tg(1) — warmup');
159
- const raw = await Module.ccall('bench_tg', 'string', ['number'], [1], { async: true });
160
- parseBenchResult('bench_tg warmup', raw);
161
- }
162
- const samples_ns = [];
163
- for (let i = 0; i < nReps; i++) {
164
- onStatus?.('perf', `tg${nGen} ${i + 1}/${nReps}`);
165
- const t0 = performance.now();
166
- const raw = await Module.ccall('bench_tg', 'string', ['number'], [nGen], { async: true });
167
- const t_ns = (performance.now() - t0) * 1e6;
168
- parseBenchResult('bench_tg', raw);
169
- samples_ns.push(t_ns);
170
- onLog?.(`tg${nGen} run ${i + 1}/${nReps}: ${(t_ns / 1e6).toFixed(1)} ms (${(1e9 * nGen / t_ns).toFixed(1)} t/s)`);
 
 
 
 
 
171
  }
172
- tests.push(buildTest(`tg${nGen}`, 0, nGen, samples_ns));
173
  }
174
 
175
- out.metrics = {
176
- tests,
177
- n_prompt: wantPp ? nPrompt : 0,
178
- n_gen: wantTg ? nGen : 0,
179
- n_reps: nReps,
180
- };
 
 
181
  }
182
 
183
  return out;
 
92
  // Two sub-modes: (a) CPU baseline — generates token_ids via bench_run for a
93
  // future GPU verification pass; (b) GPU verification — runs bench_run then
94
  // bench_eval_tokens to compute the agreement rate against refTokenIds.
95
+ // Soft-fail: a failure here falls through to the perf phase rather than
96
+ // aborting the whole run.
97
  if (consistencyPrompt) {
98
+ try {
99
+ onStatus?.('consistency', 'Running consistency check...');
100
+ onLog?.(`bench_run("...", ${consistencyNPredict}) — consistency phase`);
101
+ const raw = await Module.ccall(
102
+ 'bench_run', 'string',
103
+ ['string', 'number'],
104
+ [consistencyPrompt, consistencyNPredict],
 
 
 
 
 
 
 
 
 
 
 
105
  { async: true },
106
  );
107
+ const r = parseBenchResult('bench_run', raw);
108
+ out.output = r.output || '';
109
+ out.consistency = { token_ids: r.token_ids || [] };
110
+
111
+ if (refTokenIds) {
112
+ onLog?.('bench_eval_tokens — forced-decode vs CPU baseline');
113
+ const evalRaw = await Module.ccall(
114
+ 'bench_eval_tokens', 'string',
115
+ ['string', 'string'],
116
+ [consistencyPrompt, refTokenIds],
117
+ { async: true },
118
+ );
119
+ const ev = parseBenchResult('bench_eval_tokens', evalRaw);
120
+ out.consistency = { ...out.consistency, ...ev };
121
+ onLog?.(
122
+ `Consistency: ${(ev.agreement_rate * 100).toFixed(1)}% top-1 agreement (` +
123
+ `${ev.n_agree}/${ev.n_tokens})` +
124
+ (ev.first_disagreement >= 0 ? ` — first diverge @ ${ev.first_disagreement}` : '')
125
+ );
126
+ }
127
+ } catch (err) {
128
+ onLog?.(`Consistency phase failed: ${err.message} — continuing to perf phase`);
129
  }
130
  }
131
 
132
  // ─── Perf phase (llama-bench style) ───
133
  // Synthetic random tokens; KV cleared per call inside bench_pp/bench_tg.
134
  // Warmup is one full pp + one tg(1) call before the timed reps, matching
135
+ // tools/llama-bench/llama-bench.cpp. pp and tg are wrapped independently
136
+ // so failure in one doesn't skip the other.
137
  const wantPp = nPrompt > 0;
138
  const wantTg = nGen > 0;
139
  if (wantPp || wantTg) {
140
  const tests = [];
141
 
142
  if (wantPp) {
143
+ try {
144
+ if (!noWarmup) {
145
+ onStatus?.('perf', `warmup pp${nPrompt}`);
146
+ onLog?.(`bench_pp(${nPrompt}) — warmup`);
147
+ const raw = await Module.ccall('bench_pp', 'string', ['number'], [nPrompt], { async: true });
148
+ parseBenchResult('bench_pp warmup', raw);
149
+ }
150
+ const samples_ns = [];
151
+ for (let i = 0; i < nReps; i++) {
152
+ onStatus?.('perf', `pp${nPrompt} ${i + 1}/${nReps}`);
153
+ const t0 = performance.now();
154
+ const raw = await Module.ccall('bench_pp', 'string', ['number'], [nPrompt], { async: true });
155
+ const t_ns = (performance.now() - t0) * 1e6;
156
+ parseBenchResult('bench_pp', raw);
157
+ samples_ns.push(t_ns);
158
+ onLog?.(`pp${nPrompt} run ${i + 1}/${nReps}: ${(t_ns / 1e6).toFixed(1)} ms (${(1e9 * nPrompt / t_ns).toFixed(1)} t/s)`);
159
+ }
160
+ tests.push(buildTest(`pp${nPrompt}`, nPrompt, 0, samples_ns));
161
+ } catch (err) {
162
+ onLog?.(`pp test failed: ${err.message}`);
163
  }
 
164
  }
165
 
166
  if (wantTg) {
167
+ try {
168
+ if (!noWarmup) {
169
+ onStatus?.('perf', `warmup tg`);
170
+ onLog?.('bench_tg(1) — warmup');
171
+ const raw = await Module.ccall('bench_tg', 'string', ['number'], [1], { async: true });
172
+ parseBenchResult('bench_tg warmup', raw);
173
+ }
174
+ const samples_ns = [];
175
+ for (let i = 0; i < nReps; i++) {
176
+ onStatus?.('perf', `tg${nGen} ${i + 1}/${nReps}`);
177
+ const t0 = performance.now();
178
+ const raw = await Module.ccall('bench_tg', 'string', ['number'], [nGen], { async: true });
179
+ const t_ns = (performance.now() - t0) * 1e6;
180
+ parseBenchResult('bench_tg', raw);
181
+ samples_ns.push(t_ns);
182
+ onLog?.(`tg${nGen} run ${i + 1}/${nReps}: ${(t_ns / 1e6).toFixed(1)} ms (${(1e9 * nGen / t_ns).toFixed(1)} t/s)`);
183
+ }
184
+ tests.push(buildTest(`tg${nGen}`, 0, nGen, samples_ns));
185
+ } catch (err) {
186
+ onLog?.(`tg test failed: ${err.message}`);
187
  }
 
188
  }
189
 
190
+ if (tests.length > 0) {
191
+ out.metrics = {
192
+ tests,
193
+ n_prompt: wantPp ? nPrompt : 0,
194
+ n_gen: wantTg ? nGen : 0,
195
+ n_reps: nReps,
196
+ };
197
+ }
198
  }
199
 
200
  return out;
run.html CHANGED
@@ -136,6 +136,13 @@
136
  <label class="filter-label" for="iterations-input">Reps (-r)</label>
137
  <input type="number" id="iterations-input" class="filter-select run-iter-input" value="5" min="1" max="50" step="1">
138
  </div>
 
 
 
 
 
 
 
139
  </div>
140
  </div>
141
 
 
136
  <label class="filter-label" for="iterations-input">Reps (-r)</label>
137
  <input type="number" id="iterations-input" class="filter-select run-iter-input" value="5" min="1" max="50" step="1">
138
  </div>
139
+ <div class="filter-group">
140
+ <span class="filter-label">Skip</span>
141
+ <div class="run-filters-checks">
142
+ <label class="run-hide-label" title="Skip the consistency check (CPU baseline + GPU forced-decode agreement). Useful when consistency is failing on a device or you only care about perf."><input type="checkbox" id="skip-consistency"> Consistency</label>
143
+ <label class="run-hide-label" title="Skip the single-rep CPU perf baseline. Useful when CPU runs are too slow or unstable on a device."><input type="checkbox" id="skip-cpu-perf"> CPU perf</label>
144
+ </div>
145
+ </div>
146
  </div>
147
  </div>
148