Spaces:

abhijitramesh
/

webgpu-bench

Running

GitHub Actions committed 26 days ago

Commit

b0f367a

1 Parent(s): 44a16ab

sync from abhijitramesh/webgpu-bench@bf90a1fd89

Files changed (2) hide show

js/run/bench-worker.js CHANGED Viewed

@@ -45,6 +45,12 @@ const status = (s, msg, sinceMs) => post({ type: 'status', status: s, msg, since
 // report 100%). Mirror of CONSISTENCY_MIN_TOKENS in core.js.
 const CONSISTENCY_MIN_TOKENS = 8;
 // llama.cpp/ggml emit info, warnings, AND errors all to stderr. Tag only the
 // actually-bad lines as :err so real failures stand out. Mirror in core.js.
 function classifyWasmStderr(text) {
@@ -486,6 +492,7 @@ async function runOne({ params, stream, buffer, opfsPath }) {
           parseBenchResult('bench_pp', raw);
           samples_ns.push(t_ns);
           log(`pp${nPrompt} run ${i + 1}/${nReps}: ${(t_ns / 1e6).toFixed(1)} ms (${(1e9 * nPrompt / t_ns).toFixed(1)} t/s)`);
         }
         tests.push(buildTest(`pp${nPrompt}`, nPrompt, 0, samples_ns));
       } catch (err) {
@@ -514,6 +521,7 @@ async function runOne({ params, stream, buffer, opfsPath }) {
           parseBenchResult('bench_tg', raw);
           samples_ns.push(t_ns);
           log(`tg${nGen} run ${i + 1}/${nReps}: ${(t_ns / 1e6).toFixed(1)} ms (${(1e9 * nGen / t_ns).toFixed(1)} t/s)`);
         }
         tests.push(buildTest(`tg${nGen}`, 0, nGen, samples_ns));
       } catch (err) {

 // report 100%). Mirror of CONSISTENCY_MIN_TOKENS in core.js.
 const CONSISTENCY_MIN_TOKENS = 8;
+// Sleep between perf reps so the GPU clock state can recover. Without
+// this, sustained tg decode reps showed monotonic decay (rep 1 fastest,
+// rep N slowest) — looks like Apple's GPU power-state cooldown.
+const REP_COOLDOWN_MS = 1000;
+const sleep = (ms) => new Promise(r => setTimeout(r, ms));
 // llama.cpp/ggml emit info, warnings, AND errors all to stderr. Tag only the
 // actually-bad lines as :err so real failures stand out. Mirror in core.js.
 function classifyWasmStderr(text) {
           parseBenchResult('bench_pp', raw);
           samples_ns.push(t_ns);
           log(`pp${nPrompt} run ${i + 1}/${nReps}: ${(t_ns / 1e6).toFixed(1)} ms (${(1e9 * nPrompt / t_ns).toFixed(1)} t/s)`);
+          if (i + 1 < nReps) await sleep(REP_COOLDOWN_MS);
         }
         tests.push(buildTest(`pp${nPrompt}`, nPrompt, 0, samples_ns));
       } catch (err) {
           parseBenchResult('bench_tg', raw);
           samples_ns.push(t_ns);
           log(`tg${nGen} run ${i + 1}/${nReps}: ${(t_ns / 1e6).toFixed(1)} ms (${(1e9 * nGen / t_ns).toFixed(1)} t/s)`);
+          if (i + 1 < nReps) await sleep(REP_COOLDOWN_MS);
         }
         tests.push(buildTest(`tg${nGen}`, 0, nGen, samples_ns));
       } catch (err) {

js/run/core.js CHANGED Viewed

@@ -14,6 +14,12 @@ const DEFAULT_N_REPS = 5;
 // (e.g. early-EOS models that produce 1 token will always report 100%).
 const CONSISTENCY_MIN_TOKENS = 8;
 // llama.cpp/ggml emit info, warnings, AND errors all to stderr. Tag only the
 // actually-bad lines as :err so real failures stand out from routine output.
 function classifyWasmStderr(text) {
@@ -174,6 +180,7 @@ async function runBenchActions(Module, {
           parseBenchResult('bench_pp', raw);
           samples_ns.push(t_ns);
           onLog?.(`pp${nPrompt} run ${i + 1}/${nReps}: ${(t_ns / 1e6).toFixed(1)} ms (${(1e9 * nPrompt / t_ns).toFixed(1)} t/s)`);
         }
         tests.push(buildTest(`pp${nPrompt}`, nPrompt, 0, samples_ns));
       } catch (err) {
@@ -202,6 +209,7 @@ async function runBenchActions(Module, {
           parseBenchResult('bench_tg', raw);
           samples_ns.push(t_ns);
           onLog?.(`tg${nGen} run ${i + 1}/${nReps}: ${(t_ns / 1e6).toFixed(1)} ms (${(1e9 * nGen / t_ns).toFixed(1)} t/s)`);
         }
         tests.push(buildTest(`tg${nGen}`, 0, nGen, samples_ns));
       } catch (err) {

 // (e.g. early-EOS models that produce 1 token will always report 100%).
 const CONSISTENCY_MIN_TOKENS = 8;
+// Sleep between perf reps so the GPU clock state can recover. Without
+// this, sustained tg decode reps showed monotonic decay (rep 1 fastest,
+// rep N slowest) — looks like Apple's GPU power-state cooldown.
+const REP_COOLDOWN_MS = 1000;
+const sleep = (ms) => new Promise(r => setTimeout(r, ms));
 // llama.cpp/ggml emit info, warnings, AND errors all to stderr. Tag only the
 // actually-bad lines as :err so real failures stand out from routine output.
 function classifyWasmStderr(text) {
           parseBenchResult('bench_pp', raw);
           samples_ns.push(t_ns);
           onLog?.(`pp${nPrompt} run ${i + 1}/${nReps}: ${(t_ns / 1e6).toFixed(1)} ms (${(1e9 * nPrompt / t_ns).toFixed(1)} t/s)`);
+          if (i + 1 < nReps) await sleep(REP_COOLDOWN_MS);
         }
         tests.push(buildTest(`pp${nPrompt}`, nPrompt, 0, samples_ns));
       } catch (err) {
           parseBenchResult('bench_tg', raw);
           samples_ns.push(t_ns);
           onLog?.(`tg${nGen} run ${i + 1}/${nReps}: ${(t_ns / 1e6).toFixed(1)} ms (${(1e9 * nGen / t_ns).toFixed(1)} t/s)`);
+          if (i + 1 < nReps) await sleep(REP_COOLDOWN_MS);
         }
         tests.push(buildTest(`tg${nGen}`, 0, nGen, samples_ns));
       } catch (err) {