Spaces:
Running
Running
GitHub Actions committed on
Commit ·
b0f367a
1
Parent(s): 44a16ab
sync from abhijitramesh/webgpu-bench@bf90a1fd89
Browse files
- js/run/bench-worker.js +8 -0
- js/run/core.js +8 -0
js/run/bench-worker.js
CHANGED
|
@@ -45,6 +45,12 @@ const status = (s, msg, sinceMs) => post({ type: 'status', status: s, msg, since
|
|
| 45 |
// report 100%). Mirror of CONSISTENCY_MIN_TOKENS in core.js.
|
| 46 |
const CONSISTENCY_MIN_TOKENS = 8;
|
| 47 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
// llama.cpp/ggml emit info, warnings, AND errors all to stderr. Tag only the
|
| 49 |
// actually-bad lines as :err so real failures stand out. Mirror in core.js.
|
| 50 |
function classifyWasmStderr(text) {
|
|
@@ -486,6 +492,7 @@ async function runOne({ params, stream, buffer, opfsPath }) {
|
|
| 486 |
parseBenchResult('bench_pp', raw);
|
| 487 |
samples_ns.push(t_ns);
|
| 488 |
log(`pp${nPrompt} run ${i + 1}/${nReps}: ${(t_ns / 1e6).toFixed(1)} ms (${(1e9 * nPrompt / t_ns).toFixed(1)} t/s)`);
|
|
|
|
| 489 |
}
|
| 490 |
tests.push(buildTest(`pp${nPrompt}`, nPrompt, 0, samples_ns));
|
| 491 |
} catch (err) {
|
|
@@ -514,6 +521,7 @@ async function runOne({ params, stream, buffer, opfsPath }) {
|
|
| 514 |
parseBenchResult('bench_tg', raw);
|
| 515 |
samples_ns.push(t_ns);
|
| 516 |
log(`tg${nGen} run ${i + 1}/${nReps}: ${(t_ns / 1e6).toFixed(1)} ms (${(1e9 * nGen / t_ns).toFixed(1)} t/s)`);
|
|
|
|
| 517 |
}
|
| 518 |
tests.push(buildTest(`tg${nGen}`, 0, nGen, samples_ns));
|
| 519 |
} catch (err) {
|
|
|
|
| 45 |
// report 100%). Mirror of CONSISTENCY_MIN_TOKENS in core.js.
|
| 46 |
const CONSISTENCY_MIN_TOKENS = 8;
|
| 47 |
|
| 48 |
+
// Sleep between perf reps so the GPU clock state can recover. Without
|
| 49 |
+
// this, sustained tg decode reps showed monotonic decay (rep 1 fastest,
|
| 50 |
+
// rep N slowest) — looks like Apple's GPU power-state cooldown.
|
| 51 |
+
const REP_COOLDOWN_MS = 1000;
|
| 52 |
+
const sleep = (ms) => new Promise(r => setTimeout(r, ms));
|
| 53 |
+
|
| 54 |
// llama.cpp/ggml emit info, warnings, AND errors all to stderr. Tag only the
|
| 55 |
// actually-bad lines as :err so real failures stand out. Mirror in core.js.
|
| 56 |
function classifyWasmStderr(text) {
|
|
|
|
| 492 |
parseBenchResult('bench_pp', raw);
|
| 493 |
samples_ns.push(t_ns);
|
| 494 |
log(`pp${nPrompt} run ${i + 1}/${nReps}: ${(t_ns / 1e6).toFixed(1)} ms (${(1e9 * nPrompt / t_ns).toFixed(1)} t/s)`);
|
| 495 |
+
if (i + 1 < nReps) await sleep(REP_COOLDOWN_MS);
|
| 496 |
}
|
| 497 |
tests.push(buildTest(`pp${nPrompt}`, nPrompt, 0, samples_ns));
|
| 498 |
} catch (err) {
|
|
|
|
| 521 |
parseBenchResult('bench_tg', raw);
|
| 522 |
samples_ns.push(t_ns);
|
| 523 |
log(`tg${nGen} run ${i + 1}/${nReps}: ${(t_ns / 1e6).toFixed(1)} ms (${(1e9 * nGen / t_ns).toFixed(1)} t/s)`);
|
| 524 |
+
if (i + 1 < nReps) await sleep(REP_COOLDOWN_MS);
|
| 525 |
}
|
| 526 |
tests.push(buildTest(`tg${nGen}`, 0, nGen, samples_ns));
|
| 527 |
} catch (err) {
|
js/run/core.js
CHANGED
|
@@ -14,6 +14,12 @@ const DEFAULT_N_REPS = 5;
|
|
| 14 |
// (e.g. early-EOS models that produce 1 token will always report 100%).
|
| 15 |
const CONSISTENCY_MIN_TOKENS = 8;
|
| 16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
// llama.cpp/ggml emit info, warnings, AND errors all to stderr. Tag only the
|
| 18 |
// actually-bad lines as :err so real failures stand out from routine output.
|
| 19 |
function classifyWasmStderr(text) {
|
|
@@ -174,6 +180,7 @@ async function runBenchActions(Module, {
|
|
| 174 |
parseBenchResult('bench_pp', raw);
|
| 175 |
samples_ns.push(t_ns);
|
| 176 |
onLog?.(`pp${nPrompt} run ${i + 1}/${nReps}: ${(t_ns / 1e6).toFixed(1)} ms (${(1e9 * nPrompt / t_ns).toFixed(1)} t/s)`);
|
|
|
|
| 177 |
}
|
| 178 |
tests.push(buildTest(`pp${nPrompt}`, nPrompt, 0, samples_ns));
|
| 179 |
} catch (err) {
|
|
@@ -202,6 +209,7 @@ async function runBenchActions(Module, {
|
|
| 202 |
parseBenchResult('bench_tg', raw);
|
| 203 |
samples_ns.push(t_ns);
|
| 204 |
onLog?.(`tg${nGen} run ${i + 1}/${nReps}: ${(t_ns / 1e6).toFixed(1)} ms (${(1e9 * nGen / t_ns).toFixed(1)} t/s)`);
|
|
|
|
| 205 |
}
|
| 206 |
tests.push(buildTest(`tg${nGen}`, 0, nGen, samples_ns));
|
| 207 |
} catch (err) {
|
|
|
|
| 14 |
// (e.g. early-EOS models that produce 1 token will always report 100%).
|
| 15 |
const CONSISTENCY_MIN_TOKENS = 8;
|
| 16 |
|
| 17 |
+
// Sleep between perf reps so the GPU clock state can recover. Without
|
| 18 |
+
// this, sustained tg decode reps showed monotonic decay (rep 1 fastest,
|
| 19 |
+
// rep N slowest) — looks like Apple's GPU power-state cooldown.
|
| 20 |
+
const REP_COOLDOWN_MS = 1000;
|
| 21 |
+
const sleep = (ms) => new Promise(r => setTimeout(r, ms));
|
| 22 |
+
|
| 23 |
// llama.cpp/ggml emit info, warnings, AND errors all to stderr. Tag only the
|
| 24 |
// actually-bad lines as :err so real failures stand out from routine output.
|
| 25 |
function classifyWasmStderr(text) {
|
|
|
|
| 180 |
parseBenchResult('bench_pp', raw);
|
| 181 |
samples_ns.push(t_ns);
|
| 182 |
onLog?.(`pp${nPrompt} run ${i + 1}/${nReps}: ${(t_ns / 1e6).toFixed(1)} ms (${(1e9 * nPrompt / t_ns).toFixed(1)} t/s)`);
|
| 183 |
+
if (i + 1 < nReps) await sleep(REP_COOLDOWN_MS);
|
| 184 |
}
|
| 185 |
tests.push(buildTest(`pp${nPrompt}`, nPrompt, 0, samples_ns));
|
| 186 |
} catch (err) {
|
|
|
|
| 209 |
parseBenchResult('bench_tg', raw);
|
| 210 |
samples_ns.push(t_ns);
|
| 211 |
onLog?.(`tg${nGen} run ${i + 1}/${nReps}: ${(t_ns / 1e6).toFixed(1)} ms (${(1e9 * nGen / t_ns).toFixed(1)} t/s)`);
|
| 212 |
+
if (i + 1 < nReps) await sleep(REP_COOLDOWN_MS);
|
| 213 |
}
|
| 214 |
tests.push(buildTest(`tg${nGen}`, 0, nGen, samples_ns));
|
| 215 |
} catch (err) {
|