Spaces:
Running
Running
GitHub Actions committed on
Commit ·
ee944ff
1
Parent(s): bc8e1d3
sync from abhijitramesh/webgpu-bench@cfa77c10dc
Browse files- build/asyncify/bench.js +0 -0
- build/asyncify/bench.wasm +2 -2
- build/asyncify/build-info.json +1 -1
- build/jspi/bench.js +0 -0
- build/jspi/bench.wasm +2 -2
- build/jspi/build-info.json +1 -1
- harness.js +2 -0
- js/dataset.js +4 -0
- js/run/bench-worker.js +37 -13
- js/run/controller.js +24 -2
- run.html +4 -0
build/asyncify/bench.js
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
build/asyncify/bench.wasm
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:992c98b10bc138f1b92c12b9586a8ea0f925ea9e6789b2a8da3117c928ee5acc
|
| 3 |
+
size 5240204
|
build/asyncify/build-info.json
CHANGED
|
@@ -2,5 +2,5 @@
|
|
| 2 |
"llamaCppCommit": "f22c8021d213567942a3d0134692e70f02f28f3a",
|
| 3 |
"llamaCppDescribe": "b8981-3-gf22c8021d",
|
| 4 |
"dawnTag": "v20260317.182325",
|
| 5 |
-
"builtAt": "2026-05-
|
| 6 |
}
|
|
|
|
| 2 |
"llamaCppCommit": "f22c8021d213567942a3d0134692e70f02f28f3a",
|
| 3 |
"llamaCppDescribe": "b8981-3-gf22c8021d",
|
| 4 |
"dawnTag": "v20260317.182325",
|
| 5 |
+
"builtAt": "2026-05-01T09:06:07Z"
|
| 6 |
}
|
build/jspi/bench.js
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
build/jspi/bench.wasm
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a699aba49bfcc4e8cf4e78efa891cf1eb4b7a895345ea17d64916cc7c50c6df1
|
| 3 |
+
size 3616251
|
build/jspi/build-info.json
CHANGED
|
@@ -2,5 +2,5 @@
|
|
| 2 |
"llamaCppCommit": "f22c8021d213567942a3d0134692e70f02f28f3a",
|
| 3 |
"llamaCppDescribe": "b8981-3-gf22c8021d",
|
| 4 |
"dawnTag": "v20260317.182325",
|
| 5 |
-
"builtAt": "2026-05-
|
| 6 |
}
|
|
|
|
| 2 |
"llamaCppCommit": "f22c8021d213567942a3d0134692e70f02f28f3a",
|
| 3 |
"llamaCppDescribe": "b8981-3-gf22c8021d",
|
| 4 |
"dawnTag": "v20260317.182325",
|
| 5 |
+
"builtAt": "2026-05-01T09:02:12Z"
|
| 6 |
}
|
harness.js
CHANGED
|
@@ -30,6 +30,7 @@ window.addEventListener('unhandledrejection', (e) => {
|
|
| 30 |
const nPrompt = parseInt(params.get('nPrompt') || '512', 10);
|
| 31 |
const nGen = parseInt(params.get('nGen') || '128', 10);
|
| 32 |
const nReps = parseInt(params.get('nReps') || '5', 10);
|
|
|
|
| 33 |
const nCtx = parseInt(params.get('nCtx') || '2048', 10);
|
| 34 |
const nGpuLayers = parseInt(params.get('nGpuLayers') || '999', 10);
|
| 35 |
const refTokenIds = params.get('refTokenIds') || null;
|
|
@@ -145,6 +146,7 @@ window.addEventListener('unhandledrejection', (e) => {
|
|
| 145 |
nPrompt: runPerf ? nPrompt : 0,
|
| 146 |
nGen: runPerf ? nGen : 0,
|
| 147 |
nReps,
|
|
|
|
| 148 |
noWarmup: false,
|
| 149 |
},
|
| 150 |
opfsPath: { rootDir: OPFS_ROOT_NAME, repo: hfRepo, filename: modelFile },
|
|
|
|
| 30 |
const nPrompt = parseInt(params.get('nPrompt') || '512', 10);
|
| 31 |
const nGen = parseInt(params.get('nGen') || '128', 10);
|
| 32 |
const nReps = parseInt(params.get('nReps') || '5', 10);
|
| 33 |
+
const nDepth = parseInt(params.get('nDepth') || '0', 10);
|
| 34 |
const nCtx = parseInt(params.get('nCtx') || '2048', 10);
|
| 35 |
const nGpuLayers = parseInt(params.get('nGpuLayers') || '999', 10);
|
| 36 |
const refTokenIds = params.get('refTokenIds') || null;
|
|
|
|
| 146 |
nPrompt: runPerf ? nPrompt : 0,
|
| 147 |
nGen: runPerf ? nGen : 0,
|
| 148 |
nReps,
|
| 149 |
+
nDepth: runPerf ? nDepth : 0,
|
| 150 |
noWarmup: false,
|
| 151 |
},
|
| 152 |
opfsPath: { rootDir: OPFS_ROOT_NAME, repo: hfRepo, filename: modelFile },
|
js/dataset.js
CHANGED
|
@@ -151,6 +151,10 @@ function flattenForDashboard(r, slug) {
|
|
| 151 |
tg_test_name: tg?.name ?? null,
|
| 152 |
pp_n_prompt: pp?.n_prompt ?? r.nPrompt ?? null,
|
| 153 |
tg_n_gen: tg?.n_gen ?? r.nGen ?? null,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
n_p_eval: r.metrics?.n_p_eval ?? null,
|
| 155 |
t_p_eval_ms: r.metrics?.t_p_eval_ms ?? null,
|
| 156 |
n_eval: r.metrics?.n_eval ?? null,
|
|
|
|
| 151 |
tg_test_name: tg?.name ?? null,
|
| 152 |
pp_n_prompt: pp?.n_prompt ?? r.nPrompt ?? null,
|
| 153 |
tg_n_gen: tg?.n_gen ?? r.nGen ?? null,
|
| 154 |
+
// KV-cache depth the timed reps ran at. Mirrors llama-bench's `-d` and
|
| 155 |
+
// is per-test in metrics.tests; record-level r.nDepth is the
|
| 156 |
+
// study/runner-set value, used as a fallback for older exports.
|
| 157 |
+
n_depth: pp?.n_depth ?? tg?.n_depth ?? r.nDepth ?? 0,
|
| 158 |
n_p_eval: r.metrics?.n_p_eval ?? null,
|
| 159 |
t_p_eval_ms: r.metrics?.t_p_eval_ms ?? null,
|
| 160 |
n_eval: r.metrics?.n_eval ?? null,
|
js/run/bench-worker.js
CHANGED
|
@@ -13,7 +13,7 @@
|
|
| 13 |
// // consistency phase (set consistencyPrompt to '' to skip)
|
| 14 |
// consistencyPrompt, consistencyNPredict, refTokenIds,
|
| 15 |
// // perf phase
|
| 16 |
-
// nPrompt, nGen, nReps, noWarmup,
|
| 17 |
// },
|
| 18 |
// opfsPath: { rootDir, repo, filename }
|
| 19 |
// }
|
|
@@ -162,10 +162,14 @@ function opfsFreeAll(Module) {
|
|
| 162 |
// llama-bench reports avg_ts = (n_tokens * 1e9) / avg_ns and stddev_ts as
|
| 163 |
// the std of per-sample t/s, computed independently rather than propagated
|
| 164 |
// from stddev_ns (the mapping isn't linear).
|
| 165 |
-
function buildTest(name, n_prompt, n_gen, samples_ns) {
|
|
|
|
|
|
|
|
|
|
|
|
|
| 166 |
const n = samples_ns.length;
|
| 167 |
if (n === 0) {
|
| 168 |
-
return { name, n_prompt, n_gen, avg_ns: 0, stddev_ns: 0, avg_ts: 0, stddev_ts: 0, samples_ns: [], samples_ts: [] };
|
| 169 |
}
|
| 170 |
const avg_ns = samples_ns.reduce((a, b) => a + b, 0) / n;
|
| 171 |
const var_ns = n > 1
|
|
@@ -184,6 +188,7 @@ function buildTest(name, n_prompt, n_gen, samples_ns) {
|
|
| 184 |
name,
|
| 185 |
n_prompt,
|
| 186 |
n_gen,
|
|
|
|
| 187 |
avg_ns: Math.round(avg_ns),
|
| 188 |
stddev_ns: Math.round(stddev_ns),
|
| 189 |
avg_ts: round2(avg_ts),
|
|
@@ -236,6 +241,7 @@ async function runOne({ params, opfsPath }) {
|
|
| 236 |
nPrompt,
|
| 237 |
nGen,
|
| 238 |
nReps,
|
|
|
|
| 239 |
noWarmup,
|
| 240 |
} = params;
|
| 241 |
// The worker only loads via OPFS now: main thread downloads to OPFS,
|
|
@@ -409,29 +415,44 @@ async function runOne({ params, opfsPath }) {
|
|
| 409 |
// which the dashboard renders as a dash.
|
| 410 |
const wantPp = nPrompt > 0;
|
| 411 |
const wantTg = nGen > 0;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 412 |
if (wantPp || wantTg) {
|
| 413 |
const tests = [];
|
| 414 |
|
| 415 |
if (wantPp) {
|
| 416 |
try {
|
| 417 |
if (!noWarmup) {
|
| 418 |
-
status('perf', `warmup pp${nPrompt}`, Date.now());
|
| 419 |
-
|
|
|
|
| 420 |
const raw = await Module.ccall('bench_pp', 'string', ['number'], [nPrompt], { async: true });
|
| 421 |
parseBenchResult('bench_pp warmup', raw);
|
| 422 |
}
|
| 423 |
const samples_ns = [];
|
| 424 |
for (let i = 0; i < nReps; i++) {
|
| 425 |
-
status('perf', `pp${nPrompt} ${i + 1}/${nReps}`, Date.now());
|
|
|
|
| 426 |
const t0 = performance.now();
|
| 427 |
const raw = await Module.ccall('bench_pp', 'string', ['number'], [nPrompt], { async: true });
|
| 428 |
const t_ns = (performance.now() - t0) * 1e6;
|
| 429 |
parseBenchResult('bench_pp', raw);
|
| 430 |
samples_ns.push(t_ns);
|
| 431 |
-
log(`pp${nPrompt} run ${i + 1}/${nReps}: ${(t_ns / 1e6).toFixed(1)} ms (${(1e9 * nPrompt / t_ns).toFixed(1)} t/s)`);
|
| 432 |
if (i + 1 < nReps) await sleep(REP_COOLDOWN_MS);
|
| 433 |
}
|
| 434 |
-
tests.push(buildTest(`pp${nPrompt}`, nPrompt, 0, samples_ns));
|
| 435 |
} catch (err) {
|
| 436 |
log(`pp test failed: ${err.message}`);
|
| 437 |
}
|
|
@@ -444,23 +465,25 @@ async function runOne({ params, opfsPath }) {
|
|
| 444 |
// A 1-token warmup exercises the decode kernel once, which leaves
|
| 445 |
// the first timed rep absorbing pipeline-cache / shader-specialize
|
| 446 |
// cost on every subsequent step.
|
| 447 |
-
status('perf', `warmup tg${nGen}`, Date.now());
|
| 448 |
-
|
|
|
|
| 449 |
const raw = await Module.ccall('bench_tg', 'string', ['number'], [nGen], { async: true });
|
| 450 |
parseBenchResult('bench_tg warmup', raw);
|
| 451 |
}
|
| 452 |
const samples_ns = [];
|
| 453 |
for (let i = 0; i < nReps; i++) {
|
| 454 |
-
status('perf', `tg${nGen} ${i + 1}/${nReps}`, Date.now());
|
|
|
|
| 455 |
const t0 = performance.now();
|
| 456 |
const raw = await Module.ccall('bench_tg', 'string', ['number'], [nGen], { async: true });
|
| 457 |
const t_ns = (performance.now() - t0) * 1e6;
|
| 458 |
parseBenchResult('bench_tg', raw);
|
| 459 |
samples_ns.push(t_ns);
|
| 460 |
-
log(`tg${nGen} run ${i + 1}/${nReps}: ${(t_ns / 1e6).toFixed(1)} ms (${(1e9 * nGen / t_ns).toFixed(1)} t/s)`);
|
| 461 |
if (i + 1 < nReps) await sleep(REP_COOLDOWN_MS);
|
| 462 |
}
|
| 463 |
-
tests.push(buildTest(`tg${nGen}`, 0, nGen, samples_ns));
|
| 464 |
} catch (err) {
|
| 465 |
log(`tg test failed: ${err.message}`);
|
| 466 |
}
|
|
@@ -471,6 +494,7 @@ async function runOne({ params, opfsPath }) {
|
|
| 471 |
tests,
|
| 472 |
n_prompt: wantPp ? nPrompt : 0,
|
| 473 |
n_gen: wantTg ? nGen : 0,
|
|
|
|
| 474 |
n_reps: nReps,
|
| 475 |
};
|
| 476 |
}
|
|
|
|
| 13 |
// // consistency phase (set consistencyPrompt to '' to skip)
|
| 14 |
// consistencyPrompt, consistencyNPredict, refTokenIds,
|
| 15 |
// // perf phase
|
| 16 |
+
// nPrompt, nGen, nReps, nDepth, noWarmup,
|
| 17 |
// },
|
| 18 |
// opfsPath: { rootDir, repo, filename }
|
| 19 |
// }
|
|
|
|
| 162 |
// llama-bench reports avg_ts = (n_tokens * 1e9) / avg_ns and stddev_ts as
|
| 163 |
// the std of per-sample t/s, computed independently rather than propagated
|
| 164 |
// from stddev_ns (the mapping isn't linear).
|
| 165 |
+
//
|
| 166 |
+
// `n_depth` carries through unchanged so downstream consumers can label
|
| 167 |
+
// e.g. "pp512 @ d2048" the way llama-bench does (line 1984 of
|
| 168 |
+
// llama.cpp/tools/llama-bench/llama-bench.cpp).
|
| 169 |
+
function buildTest(name, n_prompt, n_gen, n_depth, samples_ns) {
|
| 170 |
const n = samples_ns.length;
|
| 171 |
if (n === 0) {
|
| 172 |
+
return { name, n_prompt, n_gen, n_depth, avg_ns: 0, stddev_ns: 0, avg_ts: 0, stddev_ts: 0, samples_ns: [], samples_ts: [] };
|
| 173 |
}
|
| 174 |
const avg_ns = samples_ns.reduce((a, b) => a + b, 0) / n;
|
| 175 |
const var_ns = n > 1
|
|
|
|
| 188 |
name,
|
| 189 |
n_prompt,
|
| 190 |
n_gen,
|
| 191 |
+
n_depth,
|
| 192 |
avg_ns: Math.round(avg_ns),
|
| 193 |
stddev_ns: Math.round(stddev_ns),
|
| 194 |
avg_ts: round2(avg_ts),
|
|
|
|
| 241 |
nPrompt,
|
| 242 |
nGen,
|
| 243 |
nReps,
|
| 244 |
+
nDepth = 0,
|
| 245 |
noWarmup,
|
| 246 |
} = params;
|
| 247 |
// The worker only loads via OPFS now: main thread downloads to OPFS,
|
|
|
|
| 415 |
// which the dashboard renders as a dash.
|
| 416 |
const wantPp = nPrompt > 0;
|
| 417 |
const wantTg = nGen > 0;
|
| 418 |
+
// Test name suffix mirroring llama-bench (e.g. "pp512 @ d2048").
|
| 419 |
+
const depthSuffix = nDepth > 0 ? ` @ d${nDepth}` : '';
|
| 420 |
+
// Each timed rep is preceded by an untimed bench_set_depth call so the KV
|
| 421 |
+
// cache is in a known state. The C side caches the post-prefill snapshot,
|
| 422 |
+
// so reps 2..N at the same depth restore from snapshot instead of
|
| 423 |
+
// re-running the prefill (mirroring llama-bench's `cstate` reuse).
|
| 424 |
+
const setDepth = async (label) => {
|
| 425 |
+
const raw = await Module.ccall('bench_set_depth', 'string', ['number'], [nDepth], { async: true });
|
| 426 |
+
const r = parseBenchResult(`bench_set_depth(${nDepth}) ${label}`, raw);
|
| 427 |
+
if (nDepth > 0) {
|
| 428 |
+
log(`bench_set_depth(${nDepth}) ${label}: ${r.cached ? 'restored snapshot' : 'prefilled'}`);
|
| 429 |
+
}
|
| 430 |
+
};
|
| 431 |
if (wantPp || wantTg) {
|
| 432 |
const tests = [];
|
| 433 |
|
| 434 |
if (wantPp) {
|
| 435 |
try {
|
| 436 |
if (!noWarmup) {
|
| 437 |
+
status('perf', `warmup pp${nPrompt}${depthSuffix}`, Date.now());
|
| 438 |
+
await setDepth('pp warmup');
|
| 439 |
+
log(`bench_pp(${nPrompt})${depthSuffix} — warmup`);
|
| 440 |
const raw = await Module.ccall('bench_pp', 'string', ['number'], [nPrompt], { async: true });
|
| 441 |
parseBenchResult('bench_pp warmup', raw);
|
| 442 |
}
|
| 443 |
const samples_ns = [];
|
| 444 |
for (let i = 0; i < nReps; i++) {
|
| 445 |
+
status('perf', `pp${nPrompt}${depthSuffix} ${i + 1}/${nReps}`, Date.now());
|
| 446 |
+
await setDepth(`pp rep ${i + 1}/${nReps}`);
|
| 447 |
const t0 = performance.now();
|
| 448 |
const raw = await Module.ccall('bench_pp', 'string', ['number'], [nPrompt], { async: true });
|
| 449 |
const t_ns = (performance.now() - t0) * 1e6;
|
| 450 |
parseBenchResult('bench_pp', raw);
|
| 451 |
samples_ns.push(t_ns);
|
| 452 |
+
log(`pp${nPrompt}${depthSuffix} run ${i + 1}/${nReps}: ${(t_ns / 1e6).toFixed(1)} ms (${(1e9 * nPrompt / t_ns).toFixed(1)} t/s)`);
|
| 453 |
if (i + 1 < nReps) await sleep(REP_COOLDOWN_MS);
|
| 454 |
}
|
| 455 |
+
tests.push(buildTest(`pp${nPrompt}${depthSuffix}`, nPrompt, 0, nDepth, samples_ns));
|
| 456 |
} catch (err) {
|
| 457 |
log(`pp test failed: ${err.message}`);
|
| 458 |
}
|
|
|
|
| 465 |
// A 1-token warmup exercises the decode kernel once, which leaves
|
| 466 |
// the first timed rep absorbing pipeline-cache / shader-specialize
|
| 467 |
// cost on every subsequent step.
|
| 468 |
+
status('perf', `warmup tg${nGen}${depthSuffix}`, Date.now());
|
| 469 |
+
await setDepth('tg warmup');
|
| 470 |
+
log(`bench_tg(${nGen})${depthSuffix} — warmup`);
|
| 471 |
const raw = await Module.ccall('bench_tg', 'string', ['number'], [nGen], { async: true });
|
| 472 |
parseBenchResult('bench_tg warmup', raw);
|
| 473 |
}
|
| 474 |
const samples_ns = [];
|
| 475 |
for (let i = 0; i < nReps; i++) {
|
| 476 |
+
status('perf', `tg${nGen}${depthSuffix} ${i + 1}/${nReps}`, Date.now());
|
| 477 |
+
await setDepth(`tg rep ${i + 1}/${nReps}`);
|
| 478 |
const t0 = performance.now();
|
| 479 |
const raw = await Module.ccall('bench_tg', 'string', ['number'], [nGen], { async: true });
|
| 480 |
const t_ns = (performance.now() - t0) * 1e6;
|
| 481 |
parseBenchResult('bench_tg', raw);
|
| 482 |
samples_ns.push(t_ns);
|
| 483 |
+
log(`tg${nGen}${depthSuffix} run ${i + 1}/${nReps}: ${(t_ns / 1e6).toFixed(1)} ms (${(1e9 * nGen / t_ns).toFixed(1)} t/s)`);
|
| 484 |
if (i + 1 < nReps) await sleep(REP_COOLDOWN_MS);
|
| 485 |
}
|
| 486 |
+
tests.push(buildTest(`tg${nGen}${depthSuffix}`, 0, nGen, nDepth, samples_ns));
|
| 487 |
} catch (err) {
|
| 488 |
log(`tg test failed: ${err.message}`);
|
| 489 |
}
|
|
|
|
| 494 |
tests,
|
| 495 |
n_prompt: wantPp ? nPrompt : 0,
|
| 496 |
n_gen: wantTg ? nGen : 0,
|
| 497 |
+
n_depth: nDepth,
|
| 498 |
n_reps: nReps,
|
| 499 |
};
|
| 500 |
}
|
js/run/controller.js
CHANGED
|
@@ -22,6 +22,7 @@ const YIELD_BETWEEN_RUNS_MS = 500;
|
|
| 22 |
// llama-bench defaults: -p 512 -n 128 -r 5
|
| 23 |
const DEFAULT_N_PROMPT = 512;
|
| 24 |
const DEFAULT_N_GEN = 128;
|
|
|
|
| 25 |
const DEFAULT_ITERATIONS = 5;
|
| 26 |
const MIN_ITERATIONS_FOR_SUBMIT = 5;
|
| 27 |
|
|
@@ -40,6 +41,7 @@ const state = {
|
|
| 40 |
iterations: DEFAULT_ITERATIONS,
|
| 41 |
nPrompt: DEFAULT_N_PROMPT,
|
| 42 |
nGen: DEFAULT_N_GEN,
|
|
|
|
| 43 |
// User-controlled phase toggles. Defaults match the previous behaviour:
|
| 44 |
// run consistency (CPU baseline + GPU forced-decode) AND run CPU perf
|
| 45 |
// baseline. Both checkable to skip — useful on devices where CPU is too
|
|
@@ -706,6 +708,15 @@ function wirePerfInputs() {
|
|
| 706 |
ng.value = String(state.nGen);
|
| 707 |
});
|
| 708 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 709 |
const skipCons = $('skip-consistency');
|
| 710 |
if (skipCons) {
|
| 711 |
skipCons.checked = state.skipConsistency;
|
|
@@ -1400,6 +1411,7 @@ async function runBenchmarkInWorker(v, params, callbacks) {
|
|
| 1400 |
nPrompt: params.nPrompt ?? 0,
|
| 1401 |
nGen: params.nGen ?? 0,
|
| 1402 |
nReps: params.nReps ?? DEFAULT_ITERATIONS,
|
|
|
|
| 1403 |
noWarmup: !!params.noWarmup,
|
| 1404 |
};
|
| 1405 |
|
|
@@ -1453,6 +1465,11 @@ async function runVariantWithIterations(v, row) {
|
|
| 1453 |
const nReps = Math.max(1, state.iterations || DEFAULT_ITERATIONS);
|
| 1454 |
const nPrompt = Math.max(0, state.nPrompt ?? DEFAULT_N_PROMPT);
|
| 1455 |
const nGen = Math.max(0, state.nGen ?? DEFAULT_N_GEN);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1456 |
// Phase toggles from the run page. Combined effect:
|
| 1457 |
// skip both → only GPU perf, no CPU pass at all
|
| 1458 |
// skip consistency → CPU perf baseline + GPU perf, no token-id check
|
|
@@ -1480,8 +1497,11 @@ async function runVariantWithIterations(v, row) {
|
|
| 1480 |
refTokenIds: null,
|
| 1481 |
nPrompt: runCpuPerf ? nPrompt : 0,
|
| 1482 |
nGen: runCpuPerf ? nGen : 0,
|
|
|
|
|
|
|
|
|
|
| 1483 |
nReps: 1,
|
| 1484 |
-
nCtx:
|
| 1485 |
nGpuLayers: 0,
|
| 1486 |
}, {
|
| 1487 |
onStatus: (status, msg, sinceMs) => row.setStatus(`cpu/${status}`, msg, sinceMs),
|
|
@@ -1525,8 +1545,9 @@ async function runVariantWithIterations(v, row) {
|
|
| 1525 |
refTokenIds: refTokenIds || null,
|
| 1526 |
nPrompt,
|
| 1527 |
nGen,
|
|
|
|
| 1528 |
nReps,
|
| 1529 |
-
nCtx:
|
| 1530 |
nGpuLayers: DEFAULT_N_GPU_LAYERS,
|
| 1531 |
}, {
|
| 1532 |
onStatus: (s, m, sinceMs) => row.setStatus(`gpu/${s}`, m, sinceMs),
|
|
@@ -1605,6 +1626,7 @@ function makeRecord(v, vr, machine, browser, wallTimeMs) {
|
|
| 1605 |
nPredict: DEFAULT_N_PREDICT,
|
| 1606 |
nPrompt: gpu?.metrics?.n_prompt ?? 0,
|
| 1607 |
nGen: gpu?.metrics?.n_gen ?? 0,
|
|
|
|
| 1608 |
nReps: gpu?.metrics?.n_reps ?? 0,
|
| 1609 |
nGpuLayers: DEFAULT_N_GPU_LAYERS,
|
| 1610 |
timestamp: new Date().toISOString(),
|
|
|
|
| 22 |
// llama-bench defaults: -p 512 -n 128 -r 5
|
| 23 |
const DEFAULT_N_PROMPT = 512;
|
| 24 |
const DEFAULT_N_GEN = 128;
|
| 25 |
+
const DEFAULT_N_DEPTH = 2048;
|
| 26 |
const DEFAULT_ITERATIONS = 5;
|
| 27 |
const MIN_ITERATIONS_FOR_SUBMIT = 5;
|
| 28 |
|
|
|
|
| 41 |
iterations: DEFAULT_ITERATIONS,
|
| 42 |
nPrompt: DEFAULT_N_PROMPT,
|
| 43 |
nGen: DEFAULT_N_GEN,
|
| 44 |
+
nDepth: DEFAULT_N_DEPTH,
|
| 45 |
// User-controlled phase toggles. Defaults match the previous behaviour:
|
| 46 |
// run consistency (CPU baseline + GPU forced-decode) AND run CPU perf
|
| 47 |
// baseline. Both checkable to skip — useful on devices where CPU is too
|
|
|
|
| 708 |
ng.value = String(state.nGen);
|
| 709 |
});
|
| 710 |
}
|
| 711 |
+
const nd = $('n-depth-input');
|
| 712 |
+
if (nd) {
|
| 713 |
+
nd.value = String(state.nDepth);
|
| 714 |
+
nd.addEventListener('change', () => {
|
| 715 |
+
const n = Math.max(0, Math.min(32768, parseInt(nd.value, 10)));
|
| 716 |
+
state.nDepth = Number.isFinite(n) ? n : DEFAULT_N_DEPTH;
|
| 717 |
+
nd.value = String(state.nDepth);
|
| 718 |
+
});
|
| 719 |
+
}
|
| 720 |
const skipCons = $('skip-consistency');
|
| 721 |
if (skipCons) {
|
| 722 |
skipCons.checked = state.skipConsistency;
|
|
|
|
| 1411 |
nPrompt: params.nPrompt ?? 0,
|
| 1412 |
nGen: params.nGen ?? 0,
|
| 1413 |
nReps: params.nReps ?? DEFAULT_ITERATIONS,
|
| 1414 |
+
nDepth: params.nDepth ?? 0,
|
| 1415 |
noWarmup: !!params.noWarmup,
|
| 1416 |
};
|
| 1417 |
|
|
|
|
| 1465 |
const nReps = Math.max(1, state.iterations || DEFAULT_ITERATIONS);
|
| 1466 |
const nPrompt = Math.max(0, state.nPrompt ?? DEFAULT_N_PROMPT);
|
| 1467 |
const nGen = Math.max(0, state.nGen ?? DEFAULT_N_GEN);
|
| 1468 |
+
const nDepth = Math.max(0, state.nDepth ?? DEFAULT_N_DEPTH);
|
| 1469 |
+
// Per-test n_ctx mirrors llama-bench (line 1211 of
|
| 1470 |
+
// tools/llama-bench/llama-bench.cpp): sized to fit prompt+gen+depth so a
|
| 1471 |
+
// raised depth doesn't silently overflow the cache.
|
| 1472 |
+
const nCtxFor = (depth) => Math.max(DEFAULT_N_CTX, nPrompt + nGen + depth);
|
| 1473 |
// Phase toggles from the run page. Combined effect:
|
| 1474 |
// skip both → only GPU perf, no CPU pass at all
|
| 1475 |
// skip consistency → CPU perf baseline + GPU perf, no token-id check
|
|
|
|
| 1497 |
refTokenIds: null,
|
| 1498 |
nPrompt: runCpuPerf ? nPrompt : 0,
|
| 1499 |
nGen: runCpuPerf ? nGen : 0,
|
| 1500 |
+
// CPU baseline keeps depth=0 — its job is reference-token capture
|
| 1501 |
+
// and a single-rep perf comparator, not depth-loaded sweeping.
|
| 1502 |
+
nDepth: 0,
|
| 1503 |
nReps: 1,
|
| 1504 |
+
nCtx: nCtxFor(0),
|
| 1505 |
nGpuLayers: 0,
|
| 1506 |
}, {
|
| 1507 |
onStatus: (status, msg, sinceMs) => row.setStatus(`cpu/${status}`, msg, sinceMs),
|
|
|
|
| 1545 |
refTokenIds: refTokenIds || null,
|
| 1546 |
nPrompt,
|
| 1547 |
nGen,
|
| 1548 |
+
nDepth,
|
| 1549 |
nReps,
|
| 1550 |
+
nCtx: nCtxFor(nDepth),
|
| 1551 |
nGpuLayers: DEFAULT_N_GPU_LAYERS,
|
| 1552 |
}, {
|
| 1553 |
onStatus: (s, m, sinceMs) => row.setStatus(`gpu/${s}`, m, sinceMs),
|
|
|
|
| 1626 |
nPredict: DEFAULT_N_PREDICT,
|
| 1627 |
nPrompt: gpu?.metrics?.n_prompt ?? 0,
|
| 1628 |
nGen: gpu?.metrics?.n_gen ?? 0,
|
| 1629 |
+
nDepth: gpu?.metrics?.n_depth ?? 0,
|
| 1630 |
nReps: gpu?.metrics?.n_reps ?? 0,
|
| 1631 |
nGpuLayers: DEFAULT_N_GPU_LAYERS,
|
| 1632 |
timestamp: new Date().toISOString(),
|
run.html
CHANGED
|
@@ -231,6 +231,10 @@
|
|
| 231 |
<label class="filter-label" for="n-gen-input">Gen tokens (-n)</label>
|
| 232 |
<input type="number" id="n-gen-input" class="filter-select run-iter-input" value="128" min="0" max="4096" step="1">
|
| 233 |
</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
| 234 |
<div class="filter-group">
|
| 235 |
<label class="filter-label" for="iterations-input">Reps (-r)</label>
|
| 236 |
<input type="number" id="iterations-input" class="filter-select run-iter-input" value="5" min="1" max="50" step="1">
|
|
|
|
| 231 |
<label class="filter-label" for="n-gen-input">Gen tokens (-n)</label>
|
| 232 |
<input type="number" id="n-gen-input" class="filter-select run-iter-input" value="128" min="0" max="4096" step="1">
|
| 233 |
</div>
|
| 234 |
+
<div class="filter-group">
|
| 235 |
+
<label class="filter-label" for="n-depth-input">KV depth (-d)</label>
|
| 236 |
+
<input type="number" id="n-depth-input" class="filter-select run-iter-input" value="2048" min="0" max="32768" step="1">
|
| 237 |
+
</div>
|
| 238 |
<div class="filter-group">
|
| 239 |
<label class="filter-label" for="iterations-input">Reps (-r)</label>
|
| 240 |
<input type="number" id="iterations-input" class="filter-select run-iter-input" value="5" min="1" max="50" step="1">
|