Spaces:

abhijitramesh
/

webgpu-bench

Running

App Files Files Community

GitHub Actions committed on Apr 29

Commit

629e542

1 Parent(s): 1683f65

sync from abhijitramesh/webgpu-bench@1a0973fa5b

Browse files

Files changed (4) hide show

js/run/bench-worker.js +84 -63
js/run/controller.js +70 -31
js/run/core.js +81 -64
run.html +7 -0

js/run/bench-worker.js CHANGED Viewed

@@ -254,89 +254,110 @@ async function runOne({ params, stream, buffer }) {
   }
   // ─── Consistency phase ───
   if (consistencyPrompt) {
-    status('consistency', 'Running consistency check...');
-    log(`bench_run("...", ${consistencyNPredict}) — consistency phase`);
-    const raw = await Module.ccall(
-      'bench_run', 'string',
-      ['string', 'number'],
-      [consistencyPrompt, consistencyNPredict],
-      { async: true },
-    );
-    const r = parseBenchResult('bench_run', raw);
-    result.output = r.output || '';
-    result.consistency = { token_ids: r.token_ids || [] };
-    if (refTokenIds) {
-      log('bench_eval_tokens — forced-decode vs CPU baseline');
-      const evalRaw = await Module.ccall(
-        'bench_eval_tokens', 'string',
-        ['string', 'string'],
-        [consistencyPrompt, refTokenIds],
         { async: true },
       );
-      const ev = parseBenchResult('bench_eval_tokens', evalRaw);
-      result.consistency = { ...result.consistency, ...ev };
-      log(
-        `Consistency: ${(ev.agreement_rate * 100).toFixed(1)}% top-1 agreement (` +
-        `${ev.n_agree}/${ev.n_tokens})` +
-        (ev.first_disagreement >= 0 ? ` — first diverge @ ${ev.first_disagreement}` : '')
-      );
     }
   }
   // ─── Perf phase (llama-bench style) ───
   const wantPp = nPrompt > 0;
   const wantTg = nGen > 0;
   if (wantPp || wantTg) {
     const tests = [];
     if (wantPp) {
-      if (!noWarmup) {
-        status('perf', `warmup pp${nPrompt}`);
-        log(`bench_pp(${nPrompt}) — warmup`);
-        const raw = await Module.ccall('bench_pp', 'string', ['number'], [nPrompt], { async: true });
-        parseBenchResult('bench_pp warmup', raw);
-      }
-      const samples_ns = [];
-      for (let i = 0; i < nReps; i++) {
-        status('perf', `pp${nPrompt} ${i + 1}/${nReps}`);
-        const t0 = performance.now();
-        const raw = await Module.ccall('bench_pp', 'string', ['number'], [nPrompt], { async: true });
-        const t_ns = (performance.now() - t0) * 1e6;
-        parseBenchResult('bench_pp', raw);
-        samples_ns.push(t_ns);
-        log(`pp${nPrompt} run ${i + 1}/${nReps}: ${(t_ns / 1e6).toFixed(1)} ms (${(1e9 * nPrompt / t_ns).toFixed(1)} t/s)`);
       }
-      tests.push(buildTest(`pp${nPrompt}`, nPrompt, 0, samples_ns));
     }
     if (wantTg) {
-      if (!noWarmup) {
-        status('perf', `warmup tg`);
-        log('bench_tg(1) — warmup');
-        const raw = await Module.ccall('bench_tg', 'string', ['number'], [1], { async: true });
-        parseBenchResult('bench_tg warmup', raw);
       }
-      const samples_ns = [];
-      for (let i = 0; i < nReps; i++) {
-        status('perf', `tg${nGen} ${i + 1}/${nReps}`);
-        const t0 = performance.now();
-        const raw = await Module.ccall('bench_tg', 'string', ['number'], [nGen], { async: true });
-        const t_ns = (performance.now() - t0) * 1e6;
-        parseBenchResult('bench_tg', raw);
-        samples_ns.push(t_ns);
-        log(`tg${nGen} run ${i + 1}/${nReps}: ${(t_ns / 1e6).toFixed(1)} ms (${(1e9 * nGen / t_ns).toFixed(1)} t/s)`);
-      }
-      tests.push(buildTest(`tg${nGen}`, 0, nGen, samples_ns));
     }
-    result.metrics = {
-      tests,
-      n_prompt: wantPp ? nPrompt : 0,
-      n_gen: wantTg ? nGen : 0,
-      n_reps: nReps,
-    };
   }
   await Module.ccall('bench_exit', null, [], [], { async: true });

   }
   // ─── Consistency phase ───
+  // Soft-fail: a failure here logs and falls through to the perf phase
+  // rather than aborting the whole run. Some devices/models can't survive
+  // bench_run (e.g. unsupported op, OOM mid-decode) but can still produce
+  // useful pp/tg numbers via synthetic-token paths.
   if (consistencyPrompt) {
+    try {
+      status('consistency', 'Running consistency check...');
+      log(`bench_run("...", ${consistencyNPredict}) — consistency phase`);
+      const raw = await Module.ccall(
+        'bench_run', 'string',
+        ['string', 'number'],
+        [consistencyPrompt, consistencyNPredict],
         { async: true },
       );
+      const r = parseBenchResult('bench_run', raw);
+      result.output = r.output || '';
+      result.consistency = { token_ids: r.token_ids || [] };
+      if (refTokenIds) {
+        log('bench_eval_tokens — forced-decode vs CPU baseline');
+        const evalRaw = await Module.ccall(
+          'bench_eval_tokens', 'string',
+          ['string', 'string'],
+          [consistencyPrompt, refTokenIds],
+          { async: true },
+        );
+        const ev = parseBenchResult('bench_eval_tokens', evalRaw);
+        result.consistency = { ...result.consistency, ...ev };
+        log(
+          `Consistency: ${(ev.agreement_rate * 100).toFixed(1)}% top-1 agreement (` +
+          `${ev.n_agree}/${ev.n_tokens})` +
+          (ev.first_disagreement >= 0 ? ` — first diverge @ ${ev.first_disagreement}` : '')
+        );
+      }
+    } catch (err) {
+      log(`Consistency phase failed: ${err.message} — continuing to perf phase`);
     }
   }
   // ─── Perf phase (llama-bench style) ───
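+  // Each test (pp, tg) is wrapped independently so a failure in one doesn't
+  // skip the other. A test that throws is logged and left out of `tests`
+  // (samples it had already collected are dropped). buildTest is only reached
+  // once every rep has succeeded, so it would receive an empty samples_ns only
+  // if nReps were 0 (expected to give avg_ts=0, which the dashboard renders as
+  // a dash — TODO confirm).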
   const wantPp = nPrompt > 0;
   const wantTg = nGen > 0;
   if (wantPp || wantTg) {
     const tests = [];
     if (wantPp) {
+      try {
+        if (!noWarmup) {
+          status('perf', `warmup pp${nPrompt}`);
+          log(`bench_pp(${nPrompt}) — warmup`);
+          const raw = await Module.ccall('bench_pp', 'string', ['number'], [nPrompt], { async: true });
+          parseBenchResult('bench_pp warmup', raw);
+        }
+        const samples_ns = [];
+        for (let i = 0; i < nReps; i++) {
+          status('perf', `pp${nPrompt} ${i + 1}/${nReps}`);
+          const t0 = performance.now();
+          const raw = await Module.ccall('bench_pp', 'string', ['number'], [nPrompt], { async: true });
+          const t_ns = (performance.now() - t0) * 1e6;
+          parseBenchResult('bench_pp', raw);
+          samples_ns.push(t_ns);
+          log(`pp${nPrompt} run ${i + 1}/${nReps}: ${(t_ns / 1e6).toFixed(1)} ms (${(1e9 * nPrompt / t_ns).toFixed(1)} t/s)`);
+        }
+        tests.push(buildTest(`pp${nPrompt}`, nPrompt, 0, samples_ns));
+      } catch (err) {
+        log(`pp test failed: ${err.message}`);
       }
     }
     if (wantTg) {
+      try {
+        if (!noWarmup) {
+          status('perf', `warmup tg`);
+          log('bench_tg(1) — warmup');
+          const raw = await Module.ccall('bench_tg', 'string', ['number'], [1], { async: true });
+          parseBenchResult('bench_tg warmup', raw);
+        }
+        const samples_ns = [];
+        for (let i = 0; i < nReps; i++) {
+          status('perf', `tg${nGen} ${i + 1}/${nReps}`);
+          const t0 = performance.now();
+          const raw = await Module.ccall('bench_tg', 'string', ['number'], [nGen], { async: true });
+          const t_ns = (performance.now() - t0) * 1e6;
+          parseBenchResult('bench_tg', raw);
+          samples_ns.push(t_ns);
+          log(`tg${nGen} run ${i + 1}/${nReps}: ${(t_ns / 1e6).toFixed(1)} ms (${(1e9 * nGen / t_ns).toFixed(1)} t/s)`);
+        }
+        tests.push(buildTest(`tg${nGen}`, 0, nGen, samples_ns));
+      } catch (err) {
+        log(`tg test failed: ${err.message}`);
       }
     }
+    if (tests.length > 0) {
+      result.metrics = {
+        tests,
+        n_prompt: wantPp ? nPrompt : 0,
+        n_gen: wantTg ? nGen : 0,
+        n_reps: nReps,
+      };
+    }
   }
   await Module.ccall('bench_exit', null, [], [], { async: true });

js/run/controller.js CHANGED Viewed

@@ -43,6 +43,12 @@ const state = {
   iterations: DEFAULT_ITERATIONS,
   nPrompt: DEFAULT_N_PROMPT,
   nGen: DEFAULT_N_GEN,
   mounted: false,
   // Tracks variants the Run pipeline downloaded this session (as opposed to
   // the standalone Download button or pre-existing cache). Only these are
@@ -660,6 +666,20 @@ function wirePerfInputs() {
       ng.value = String(state.nGen);
     });
   }
 }
 function submittableResults() {
@@ -1274,55 +1294,74 @@ async function runVariantWithIterations(v, row) {
   const nReps = Math.max(1, state.iterations || DEFAULT_ITERATIONS);
   const nPrompt = Math.max(0, state.nPrompt ?? DEFAULT_N_PROMPT);
   const nGen = Math.max(0, state.nGen ?? DEFAULT_N_GEN);
   // ─── CPU baseline ───
-  // Consistency (token_ids) + a single warmup-then-1-rep perf measurement.
-  // The single rep gives us a CPU-vs-GPU speedup signal in the dashboard
-  // without paying for a full nReps sweep on CPU.
-  row.setStatus('cpu-baseline', 'reference tokens + 1-rep perf');
   let cpuResult;
-  try {
-    cpuResult = await runBenchmarkInWorker(v, {
-      consistencyPrompt: DEFAULT_PROMPT,
-      consistencyNPredict: DEFAULT_N_PREDICT,
-      refTokenIds: null,
-      nPrompt,
-      nGen,
-      nReps: 1,
-      nCtx: DEFAULT_N_CTX,
-      nGpuLayers: 0,
-    }, {
-      onStatus: (status, msg) => row.setStatus(`cpu/${status}`, msg),
-      onProgress: (fr, downloaded, total) => row.setProgress(fr, downloaded, total),
-      onLog: logLine,
-    });
-  } catch (err) {
-    cpuResult = { status: 'error', error: err.message || String(err) };
   }
-  // CPU baseline is "best effort": if it fails (typically OOM on a tight
-  // tab), keep going with the GPU pass but skip consistency. Perf metrics
-  // are independent of consistency so they're still reported.
   const cpuOk = cpuResult.status === 'done';
-  if (!cpuOk) {
-    logLine(
-      `CPU baseline failed (${cpuResult.error || 'unknown'}) — proceeding with GPU run, skipping consistency check.`
-    );
     row.setStatus('cpu-skipped', 'continuing with GPU only');
   }
-  const refTokenIds = cpuOk ? (cpuResult.consistency?.token_ids || []).join(',') : '';
   if (state.aborted) {
     return { status: 'error', error: 'aborted', cpu: cpuResult, gpu: null };
   }
-  // ─── GPU pass: consistency + perf in one model load ───
   row.setStatus('gpu-run', 'loading model');
   let gpuResult;
   try {
     gpuResult = await runBenchmarkInWorker(v, {
-      consistencyPrompt: DEFAULT_PROMPT,
       consistencyNPredict: DEFAULT_N_PREDICT,
       refTokenIds: refTokenIds || null,
       nPrompt,

   iterations: DEFAULT_ITERATIONS,
   nPrompt: DEFAULT_N_PROMPT,
   nGen: DEFAULT_N_GEN,
+  // User-controlled phase toggles. The defaults match the previous behaviour:
+  // run the consistency check (CPU baseline + GPU forced-decode) AND the CPU
+  // perf baseline. Each can be skipped independently via its checkbox — useful
+  // on devices where the CPU is too slow or unreliable to be worth waiting for.
+  skipConsistency: false,
+  skipCpuPerf: false,
   mounted: false,
   // Tracks variants the Run pipeline downloaded this session (as opposed to
   // the standalone Download button or pre-existing cache). Only these are
       ng.value = String(state.nGen);
     });
   }
+  const skipCons = $('skip-consistency');
+  if (skipCons) {
+    skipCons.checked = state.skipConsistency;
+    skipCons.addEventListener('change', () => {
+      state.skipConsistency = skipCons.checked;
+    });
+  }
+  const skipCpu = $('skip-cpu-perf');
+  if (skipCpu) {
+    skipCpu.checked = state.skipCpuPerf;
+    skipCpu.addEventListener('change', () => {
+      state.skipCpuPerf = skipCpu.checked;
+    });
+  }
 }
 function submittableResults() {
   const nReps = Math.max(1, state.iterations || DEFAULT_ITERATIONS);
   const nPrompt = Math.max(0, state.nPrompt ?? DEFAULT_N_PROMPT);
   const nGen = Math.max(0, state.nGen ?? DEFAULT_N_GEN);
+  // Phase toggles from the run page. Combined effect:
+  //   skip both          → only GPU perf, no CPU pass at all
+  //   skip consistency   → CPU perf baseline + GPU perf, no token-id check
+  //   skip CPU perf      → CPU consistency tokens + GPU consistency + GPU perf
+  //   skip neither       → full default flow
+  const runConsistency = !state.skipConsistency;
+  const runCpuPerf = !state.skipCpuPerf;
+  const needCpuPass = runConsistency || runCpuPerf;
   // ─── CPU baseline ───
+  // Skipped entirely when both toggles are checked. Otherwise the pass runs
+  // whichever phases remain enabled: the consistency phase captures the
+  // reference token_ids, and the perf phase runs at nReps=1 (one warmup plus
+  // one timed rep — enough to populate the dashboard's CPU/GPU comparison
+  // without doubling CPU runtime).
   let cpuResult;
+  if (needCpuPass) {
+    const phaseLabel = runConsistency && runCpuPerf ? 'reference tokens + 1-rep perf'
+      : runConsistency ? 'reference tokens'
+      : '1-rep perf';
+    row.setStatus('cpu-baseline', phaseLabel);
+    try {
+      cpuResult = await runBenchmarkInWorker(v, {
+        consistencyPrompt: runConsistency ? DEFAULT_PROMPT : '',
+        consistencyNPredict: DEFAULT_N_PREDICT,
+        refTokenIds: null,
+        nPrompt: runCpuPerf ? nPrompt : 0,
+        nGen:    runCpuPerf ? nGen    : 0,
+        nReps: 1,
+        nCtx: DEFAULT_N_CTX,
+        nGpuLayers: 0,
+      }, {
+        onStatus: (status, msg) => row.setStatus(`cpu/${status}`, msg),
+        onProgress: (fr, downloaded, total) => row.setProgress(fr, downloaded, total),
+        onLog: logLine,
+      });
+    } catch (err) {
+      cpuResult = { status: 'error', error: err.message || String(err) };
+    }
+  } else {
+    cpuResult = { status: 'skipped' };
   }
+  // The CPU pass is best-effort: failures (OOM, slow device, missing op) are
+  // logged and never block the GPU run. Users who want to avoid the CPU work
+  // altogether can opt out explicitly via the skip checkboxes.
   const cpuOk = cpuResult.status === 'done';
+  if (cpuResult.status === 'error') {
+    logLine(`CPU baseline failed (${cpuResult.error || 'unknown'}) — proceeding with GPU run.`);
     row.setStatus('cpu-skipped', 'continuing with GPU only');
   }
+  // refTokenIds is the GPU pass's input for forced-decode consistency. Only
+  // pass when we actually have tokens (consistency was requested AND CPU
+  // produced tokens).
+  const refTokenIds = (cpuOk && runConsistency && cpuResult.consistency?.token_ids?.length)
+    ? cpuResult.consistency.token_ids.join(',')
+    : '';
   if (state.aborted) {
     return { status: 'error', error: 'aborted', cpu: cpuResult, gpu: null };
   }
+  // ─── GPU pass: consistency (when not skipped) + perf in one model load ───
   row.setStatus('gpu-run', 'loading model');
   let gpuResult;
   try {
     gpuResult = await runBenchmarkInWorker(v, {
+      consistencyPrompt: runConsistency ? DEFAULT_PROMPT : '',
       consistencyNPredict: DEFAULT_N_PREDICT,
       refTokenIds: refTokenIds || null,
       nPrompt,

js/run/core.js CHANGED Viewed

@@ -92,92 +92,109 @@ async function runBenchActions(Module, {
   // Two sub-modes: (a) CPU baseline — generates token_ids via bench_run for a
   // future GPU verification pass; (b) GPU verification — runs bench_run then
   // bench_eval_tokens to compute the agreement rate against refTokenIds.
   if (consistencyPrompt) {
-    onStatus?.('consistency', 'Running consistency check...');
-    onLog?.(`bench_run("...", ${consistencyNPredict}) — consistency phase`);
-    const raw = await Module.ccall(
-      'bench_run', 'string',
-      ['string', 'number'],
-      [consistencyPrompt, consistencyNPredict],
-      { async: true },
-    );
-    const r = parseBenchResult('bench_run', raw);
-    out.output = r.output || '';
-    out.consistency = { token_ids: r.token_ids || [] };
-    if (refTokenIds) {
-      onLog?.('bench_eval_tokens — forced-decode vs CPU baseline');
-      const evalRaw = await Module.ccall(
-        'bench_eval_tokens', 'string',
-        ['string', 'string'],
-        [consistencyPrompt, refTokenIds],
         { async: true },
       );
-      const ev = parseBenchResult('bench_eval_tokens', evalRaw);
-      out.consistency = { ...out.consistency, ...ev };
-      onLog?.(
-        `Consistency: ${(ev.agreement_rate * 100).toFixed(1)}% top-1 agreement (` +
-        `${ev.n_agree}/${ev.n_tokens})` +
-        (ev.first_disagreement >= 0 ? ` — first diverge @ ${ev.first_disagreement}` : '')
-      );
     }
   }
   // ─── Perf phase (llama-bench style) ───
   // Synthetic random tokens; KV cleared per call inside bench_pp/bench_tg.
   // Warmup is one full pp + one tg(1) call before the timed reps, matching
-  // tools/llama-bench/llama-bench.cpp.
   const wantPp = nPrompt > 0;
   const wantTg = nGen > 0;
   if (wantPp || wantTg) {
     const tests = [];
     if (wantPp) {
-      if (!noWarmup) {
-        onStatus?.('perf', `warmup pp${nPrompt}`);
-        onLog?.(`bench_pp(${nPrompt}) — warmup`);
-        const raw = await Module.ccall('bench_pp', 'string', ['number'], [nPrompt], { async: true });
-        parseBenchResult('bench_pp warmup', raw);
-      }
-      const samples_ns = [];
-      for (let i = 0; i < nReps; i++) {
-        onStatus?.('perf', `pp${nPrompt} ${i + 1}/${nReps}`);
-        const t0 = performance.now();
-        const raw = await Module.ccall('bench_pp', 'string', ['number'], [nPrompt], { async: true });
-        const t_ns = (performance.now() - t0) * 1e6;
-        parseBenchResult('bench_pp', raw);
-        samples_ns.push(t_ns);
-        onLog?.(`pp${nPrompt} run ${i + 1}/${nReps}: ${(t_ns / 1e6).toFixed(1)} ms (${(1e9 * nPrompt / t_ns).toFixed(1)} t/s)`);
       }
-      tests.push(buildTest(`pp${nPrompt}`, nPrompt, 0, samples_ns));
     }
     if (wantTg) {
-      if (!noWarmup) {
-        onStatus?.('perf', `warmup tg`);
-        onLog?.('bench_tg(1) — warmup');
-        const raw = await Module.ccall('bench_tg', 'string', ['number'], [1], { async: true });
-        parseBenchResult('bench_tg warmup', raw);
-      }
-      const samples_ns = [];
-      for (let i = 0; i < nReps; i++) {
-        onStatus?.('perf', `tg${nGen} ${i + 1}/${nReps}`);
-        const t0 = performance.now();
-        const raw = await Module.ccall('bench_tg', 'string', ['number'], [nGen], { async: true });
-        const t_ns = (performance.now() - t0) * 1e6;
-        parseBenchResult('bench_tg', raw);
-        samples_ns.push(t_ns);
-        onLog?.(`tg${nGen} run ${i + 1}/${nReps}: ${(t_ns / 1e6).toFixed(1)} ms (${(1e9 * nGen / t_ns).toFixed(1)} t/s)`);
       }
-      tests.push(buildTest(`tg${nGen}`, 0, nGen, samples_ns));
     }
-    out.metrics = {
-      tests,
-      n_prompt: wantPp ? nPrompt : 0,
-      n_gen: wantTg ? nGen : 0,
-      n_reps: nReps,
-    };
   }
   return out;

   // Two sub-modes: (a) CPU baseline — generates token_ids via bench_run for a
   // future GPU verification pass; (b) GPU verification — runs bench_run then
   // bench_eval_tokens to compute the agreement rate against refTokenIds.
+  // Soft-fail: a failure here falls through to the perf phase rather than
+  // aborting the whole run.
   if (consistencyPrompt) {
+    try {
+      onStatus?.('consistency', 'Running consistency check...');
+      onLog?.(`bench_run("...", ${consistencyNPredict}) — consistency phase`);
+      const raw = await Module.ccall(
+        'bench_run', 'string',
+        ['string', 'number'],
+        [consistencyPrompt, consistencyNPredict],
         { async: true },
       );
+      const r = parseBenchResult('bench_run', raw);
+      out.output = r.output || '';
+      out.consistency = { token_ids: r.token_ids || [] };
+      if (refTokenIds) {
+        onLog?.('bench_eval_tokens — forced-decode vs CPU baseline');
+        const evalRaw = await Module.ccall(
+          'bench_eval_tokens', 'string',
+          ['string', 'string'],
+          [consistencyPrompt, refTokenIds],
+          { async: true },
+        );
+        const ev = parseBenchResult('bench_eval_tokens', evalRaw);
+        out.consistency = { ...out.consistency, ...ev };
+        onLog?.(
+          `Consistency: ${(ev.agreement_rate * 100).toFixed(1)}% top-1 agreement (` +
+          `${ev.n_agree}/${ev.n_tokens})` +
+          (ev.first_disagreement >= 0 ? ` — first diverge @ ${ev.first_disagreement}` : '')
+        );
+      }
+    } catch (err) {
+      onLog?.(`Consistency phase failed: ${err.message} — continuing to perf phase`);
     }
   }
   // ─── Perf phase (llama-bench style) ───
   // Synthetic random tokens; KV cleared per call inside bench_pp/bench_tg.
   // Warmup is one full pp + one tg(1) call before the timed reps, matching
+  // tools/llama-bench/llama-bench.cpp. pp and tg are wrapped independently
+  // so failure in one doesn't skip the other.
   const wantPp = nPrompt > 0;
   const wantTg = nGen > 0;
   if (wantPp || wantTg) {
     const tests = [];
     if (wantPp) {
+      try {
+        if (!noWarmup) {
+          onStatus?.('perf', `warmup pp${nPrompt}`);
+          onLog?.(`bench_pp(${nPrompt}) — warmup`);
+          const raw = await Module.ccall('bench_pp', 'string', ['number'], [nPrompt], { async: true });
+          parseBenchResult('bench_pp warmup', raw);
+        }
+        const samples_ns = [];
+        for (let i = 0; i < nReps; i++) {
+          onStatus?.('perf', `pp${nPrompt} ${i + 1}/${nReps}`);
+          const t0 = performance.now();
+          const raw = await Module.ccall('bench_pp', 'string', ['number'], [nPrompt], { async: true });
+          const t_ns = (performance.now() - t0) * 1e6;
+          parseBenchResult('bench_pp', raw);
+          samples_ns.push(t_ns);
+          onLog?.(`pp${nPrompt} run ${i + 1}/${nReps}: ${(t_ns / 1e6).toFixed(1)} ms (${(1e9 * nPrompt / t_ns).toFixed(1)} t/s)`);
+        }
+        tests.push(buildTest(`pp${nPrompt}`, nPrompt, 0, samples_ns));
+      } catch (err) {
+        onLog?.(`pp test failed: ${err.message}`);
       }
     }
     if (wantTg) {
+      try {
+        if (!noWarmup) {
+          onStatus?.('perf', `warmup tg`);
+          onLog?.('bench_tg(1) — warmup');
+          const raw = await Module.ccall('bench_tg', 'string', ['number'], [1], { async: true });
+          parseBenchResult('bench_tg warmup', raw);
+        }
+        const samples_ns = [];
+        for (let i = 0; i < nReps; i++) {
+          onStatus?.('perf', `tg${nGen} ${i + 1}/${nReps}`);
+          const t0 = performance.now();
+          const raw = await Module.ccall('bench_tg', 'string', ['number'], [nGen], { async: true });
+          const t_ns = (performance.now() - t0) * 1e6;
+          parseBenchResult('bench_tg', raw);
+          samples_ns.push(t_ns);
+          onLog?.(`tg${nGen} run ${i + 1}/${nReps}: ${(t_ns / 1e6).toFixed(1)} ms (${(1e9 * nGen / t_ns).toFixed(1)} t/s)`);
+        }
+        tests.push(buildTest(`tg${nGen}`, 0, nGen, samples_ns));
+      } catch (err) {
+        onLog?.(`tg test failed: ${err.message}`);
       }
     }
+    if (tests.length > 0) {
+      out.metrics = {
+        tests,
+        n_prompt: wantPp ? nPrompt : 0,
+        n_gen: wantTg ? nGen : 0,
+        n_reps: nReps,
+      };
+    }
   }
   return out;

run.html CHANGED Viewed

@@ -136,6 +136,13 @@
               <label class="filter-label" for="iterations-input">Reps (-r)</label>
               <input type="number" id="iterations-input" class="filter-select run-iter-input" value="5" min="1" max="50" step="1">
             </div>
           </div>
         </div>

               <label class="filter-label" for="iterations-input">Reps (-r)</label>
               <input type="number" id="iterations-input" class="filter-select run-iter-input" value="5" min="1" max="50" step="1">
             </div>
+            <div class="filter-group">
+              <span class="filter-label">Skip</span>
+              <div class="run-filters-checks">
+                <label class="run-hide-label" title="Skip the consistency check (CPU baseline + GPU forced-decode agreement). Useful when consistency is failing on a device or you only care about perf."><input type="checkbox" id="skip-consistency"> Consistency</label>
+                <label class="run-hide-label" title="Skip the single-rep CPU perf baseline. Useful when CPU runs are too slow or unstable on a device."><input type="checkbox" id="skip-cpu-perf"> CPU perf</label>
+              </div>
+            </div>
           </div>
         </div>