GitHub Actions committed on
Commit
b0f367a
·
1 Parent(s): 44a16ab

sync from abhijitramesh/webgpu-bench@bf90a1fd89

Browse files
Files changed (2) hide show
  1. js/run/bench-worker.js +8 -0
  2. js/run/core.js +8 -0
js/run/bench-worker.js CHANGED
@@ -45,6 +45,12 @@ const status = (s, msg, sinceMs) => post({ type: 'status', status: s, msg, since
45
  // report 100%). Mirror of CONSISTENCY_MIN_TOKENS in core.js.
46
  const CONSISTENCY_MIN_TOKENS = 8;
47
 
 
 
 
 
 
 
48
  // llama.cpp/ggml emit info, warnings, AND errors all to stderr. Tag only the
49
  // actually-bad lines as :err so real failures stand out. Mirror in core.js.
50
  function classifyWasmStderr(text) {
@@ -486,6 +492,7 @@ async function runOne({ params, stream, buffer, opfsPath }) {
486
  parseBenchResult('bench_pp', raw);
487
  samples_ns.push(t_ns);
488
  log(`pp${nPrompt} run ${i + 1}/${nReps}: ${(t_ns / 1e6).toFixed(1)} ms (${(1e9 * nPrompt / t_ns).toFixed(1)} t/s)`);
 
489
  }
490
  tests.push(buildTest(`pp${nPrompt}`, nPrompt, 0, samples_ns));
491
  } catch (err) {
@@ -514,6 +521,7 @@ async function runOne({ params, stream, buffer, opfsPath }) {
514
  parseBenchResult('bench_tg', raw);
515
  samples_ns.push(t_ns);
516
  log(`tg${nGen} run ${i + 1}/${nReps}: ${(t_ns / 1e6).toFixed(1)} ms (${(1e9 * nGen / t_ns).toFixed(1)} t/s)`);
 
517
  }
518
  tests.push(buildTest(`tg${nGen}`, 0, nGen, samples_ns));
519
  } catch (err) {
 
45
  // report 100%). Mirror of CONSISTENCY_MIN_TOKENS in core.js.
46
  const CONSISTENCY_MIN_TOKENS = 8;
47
 
48
+ // Sleep between perf reps so the GPU clock state can recover. Without
49
+ // this, sustained tg decode reps showed monotonic decay (rep 1 fastest,
50
+ // rep N slowest) — looks like Apple's GPU power-state cooldown.
51
+ const REP_COOLDOWN_MS = 1000;
52
+ const sleep = (ms) => new Promise(r => setTimeout(r, ms));
53
+
54
  // llama.cpp/ggml emit info, warnings, AND errors all to stderr. Tag only the
55
  // actually-bad lines as :err so real failures stand out. Mirror in core.js.
56
  function classifyWasmStderr(text) {
 
492
  parseBenchResult('bench_pp', raw);
493
  samples_ns.push(t_ns);
494
  log(`pp${nPrompt} run ${i + 1}/${nReps}: ${(t_ns / 1e6).toFixed(1)} ms (${(1e9 * nPrompt / t_ns).toFixed(1)} t/s)`);
495
+ if (i + 1 < nReps) await sleep(REP_COOLDOWN_MS);
496
  }
497
  tests.push(buildTest(`pp${nPrompt}`, nPrompt, 0, samples_ns));
498
  } catch (err) {
 
521
  parseBenchResult('bench_tg', raw);
522
  samples_ns.push(t_ns);
523
  log(`tg${nGen} run ${i + 1}/${nReps}: ${(t_ns / 1e6).toFixed(1)} ms (${(1e9 * nGen / t_ns).toFixed(1)} t/s)`);
524
+ if (i + 1 < nReps) await sleep(REP_COOLDOWN_MS);
525
  }
526
  tests.push(buildTest(`tg${nGen}`, 0, nGen, samples_ns));
527
  } catch (err) {
js/run/core.js CHANGED
@@ -14,6 +14,12 @@ const DEFAULT_N_REPS = 5;
14
  // (e.g. early-EOS models that produce 1 token will always report 100%).
15
  const CONSISTENCY_MIN_TOKENS = 8;
16
 
 
 
 
 
 
 
17
  // llama.cpp/ggml emit info, warnings, AND errors all to stderr. Tag only the
18
  // actually-bad lines as :err so real failures stand out from routine output.
19
  function classifyWasmStderr(text) {
@@ -174,6 +180,7 @@ async function runBenchActions(Module, {
174
  parseBenchResult('bench_pp', raw);
175
  samples_ns.push(t_ns);
176
  onLog?.(`pp${nPrompt} run ${i + 1}/${nReps}: ${(t_ns / 1e6).toFixed(1)} ms (${(1e9 * nPrompt / t_ns).toFixed(1)} t/s)`);
 
177
  }
178
  tests.push(buildTest(`pp${nPrompt}`, nPrompt, 0, samples_ns));
179
  } catch (err) {
@@ -202,6 +209,7 @@ async function runBenchActions(Module, {
202
  parseBenchResult('bench_tg', raw);
203
  samples_ns.push(t_ns);
204
  onLog?.(`tg${nGen} run ${i + 1}/${nReps}: ${(t_ns / 1e6).toFixed(1)} ms (${(1e9 * nGen / t_ns).toFixed(1)} t/s)`);
 
205
  }
206
  tests.push(buildTest(`tg${nGen}`, 0, nGen, samples_ns));
207
  } catch (err) {
 
14
  // (e.g. early-EOS models that produce 1 token will always report 100%).
15
  const CONSISTENCY_MIN_TOKENS = 8;
16
 
17
+ // Sleep between perf reps so the GPU clock state can recover. Without
18
+ // this, sustained tg decode reps showed monotonic decay (rep 1 fastest,
19
+ // rep N slowest) — looks like Apple's GPU power-state cooldown.
20
+ const REP_COOLDOWN_MS = 1000;
21
+ const sleep = (ms) => new Promise(r => setTimeout(r, ms));
22
+
23
  // llama.cpp/ggml emit info, warnings, AND errors all to stderr. Tag only the
24
  // actually-bad lines as :err so real failures stand out from routine output.
25
  function classifyWasmStderr(text) {
 
180
  parseBenchResult('bench_pp', raw);
181
  samples_ns.push(t_ns);
182
  onLog?.(`pp${nPrompt} run ${i + 1}/${nReps}: ${(t_ns / 1e6).toFixed(1)} ms (${(1e9 * nPrompt / t_ns).toFixed(1)} t/s)`);
183
+ if (i + 1 < nReps) await sleep(REP_COOLDOWN_MS);
184
  }
185
  tests.push(buildTest(`pp${nPrompt}`, nPrompt, 0, samples_ns));
186
  } catch (err) {
 
209
  parseBenchResult('bench_tg', raw);
210
  samples_ns.push(t_ns);
211
  onLog?.(`tg${nGen} run ${i + 1}/${nReps}: ${(t_ns / 1e6).toFixed(1)} ms (${(1e9 * nGen / t_ns).toFixed(1)} t/s)`);
212
+ if (i + 1 < nReps) await sleep(REP_COOLDOWN_MS);
213
  }
214
  tests.push(buildTest(`tg${nGen}`, 0, nGen, samples_ns));
215
  } catch (err) {