Spaces:
Running
Running
GitHub Actions committed on
Commit ·
629e542
1
Parent(s): 1683f65
sync from abhijitramesh/webgpu-bench@1a0973fa5b
Browse files- js/run/bench-worker.js +84 -63
- js/run/controller.js +70 -31
- js/run/core.js +81 -64
- run.html +7 -0
js/run/bench-worker.js
CHANGED
|
@@ -254,89 +254,110 @@ async function runOne({ params, stream, buffer }) {
|
|
| 254 |
}
|
| 255 |
|
| 256 |
// ─── Consistency phase ───
|
|
|
|
|
|
|
|
|
|
|
|
|
| 257 |
if (consistencyPrompt) {
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
);
|
| 266 |
-
const r = parseBenchResult('bench_run', raw);
|
| 267 |
-
result.output = r.output || '';
|
| 268 |
-
result.consistency = { token_ids: r.token_ids || [] };
|
| 269 |
-
|
| 270 |
-
if (refTokenIds) {
|
| 271 |
-
log('bench_eval_tokens β forced-decode vs CPU baseline');
|
| 272 |
-
const evalRaw = await Module.ccall(
|
| 273 |
-
'bench_eval_tokens', 'string',
|
| 274 |
-
['string', 'string'],
|
| 275 |
-
[consistencyPrompt, refTokenIds],
|
| 276 |
{ async: true },
|
| 277 |
);
|
| 278 |
-
const
|
| 279 |
-
result.
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
(
|
| 284 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 285 |
}
|
| 286 |
}
|
| 287 |
|
| 288 |
// ─── Perf phase (llama-bench style) ───
|
|
|
|
|
|
|
|
|
|
| 289 |
const wantPp = nPrompt > 0;
|
| 290 |
const wantTg = nGen > 0;
|
| 291 |
if (wantPp || wantTg) {
|
| 292 |
const tests = [];
|
| 293 |
|
| 294 |
if (wantPp) {
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 310 |
}
|
| 311 |
-
tests.push(buildTest(`pp${nPrompt}`, nPrompt, 0, samples_ns));
|
| 312 |
}
|
| 313 |
|
| 314 |
if (wantTg) {
|
| 315 |
-
|
| 316 |
-
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 320 |
}
|
| 321 |
-
const samples_ns = [];
|
| 322 |
-
for (let i = 0; i < nReps; i++) {
|
| 323 |
-
status('perf', `tg${nGen} ${i + 1}/${nReps}`);
|
| 324 |
-
const t0 = performance.now();
|
| 325 |
-
const raw = await Module.ccall('bench_tg', 'string', ['number'], [nGen], { async: true });
|
| 326 |
-
const t_ns = (performance.now() - t0) * 1e6;
|
| 327 |
-
parseBenchResult('bench_tg', raw);
|
| 328 |
-
samples_ns.push(t_ns);
|
| 329 |
-
log(`tg${nGen} run ${i + 1}/${nReps}: ${(t_ns / 1e6).toFixed(1)} ms (${(1e9 * nGen / t_ns).toFixed(1)} t/s)`);
|
| 330 |
-
}
|
| 331 |
-
tests.push(buildTest(`tg${nGen}`, 0, nGen, samples_ns));
|
| 332 |
}
|
| 333 |
|
| 334 |
-
|
| 335 |
-
|
| 336 |
-
|
| 337 |
-
|
| 338 |
-
|
| 339 |
-
|
|
|
|
|
|
|
| 340 |
}
|
| 341 |
|
| 342 |
await Module.ccall('bench_exit', null, [], [], { async: true });
|
|
|
|
| 254 |
}
|
| 255 |
|
| 256 |
// ─── Consistency phase ───
|
| 257 |
+
// Soft-fail: a failure here logs and falls through to the perf phase
|
| 258 |
+
// rather than aborting the whole run. Some devices/models can't survive
|
| 259 |
+
// bench_run (e.g. unsupported op, OOM mid-decode) but can still produce
|
| 260 |
+
// useful pp/tg numbers via synthetic-token paths.
|
| 261 |
if (consistencyPrompt) {
|
| 262 |
+
try {
|
| 263 |
+
status('consistency', 'Running consistency check...');
|
| 264 |
+
log(`bench_run("...", ${consistencyNPredict}) β consistency phase`);
|
| 265 |
+
const raw = await Module.ccall(
|
| 266 |
+
'bench_run', 'string',
|
| 267 |
+
['string', 'number'],
|
| 268 |
+
[consistencyPrompt, consistencyNPredict],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 269 |
{ async: true },
|
| 270 |
);
|
| 271 |
+
const r = parseBenchResult('bench_run', raw);
|
| 272 |
+
result.output = r.output || '';
|
| 273 |
+
result.consistency = { token_ids: r.token_ids || [] };
|
| 274 |
+
|
| 275 |
+
if (refTokenIds) {
|
| 276 |
+
log('bench_eval_tokens β forced-decode vs CPU baseline');
|
| 277 |
+
const evalRaw = await Module.ccall(
|
| 278 |
+
'bench_eval_tokens', 'string',
|
| 279 |
+
['string', 'string'],
|
| 280 |
+
[consistencyPrompt, refTokenIds],
|
| 281 |
+
{ async: true },
|
| 282 |
+
);
|
| 283 |
+
const ev = parseBenchResult('bench_eval_tokens', evalRaw);
|
| 284 |
+
result.consistency = { ...result.consistency, ...ev };
|
| 285 |
+
log(
|
| 286 |
+
`Consistency: ${(ev.agreement_rate * 100).toFixed(1)}% top-1 agreement (` +
|
| 287 |
+
`${ev.n_agree}/${ev.n_tokens})` +
|
| 288 |
+
(ev.first_disagreement >= 0 ? ` β first diverge @ ${ev.first_disagreement}` : '')
|
| 289 |
+
);
|
| 290 |
+
}
|
| 291 |
+
} catch (err) {
|
| 292 |
+
log(`Consistency phase failed: ${err.message} β continuing to perf phase`);
|
| 293 |
}
|
| 294 |
}
|
| 295 |
|
| 296 |
// ─── Perf phase (llama-bench style) ───
|
| 297 |
+
// Each test (pp, tg) is wrapped independently so a failure in one doesn't
|
| 298 |
+
// skip the other. Empty samples_ns produces a buildTest with avg_ts=0,
|
| 299 |
+
// which the dashboard renders as a dash.
|
| 300 |
const wantPp = nPrompt > 0;
|
| 301 |
const wantTg = nGen > 0;
|
| 302 |
if (wantPp || wantTg) {
|
| 303 |
const tests = [];
|
| 304 |
|
| 305 |
if (wantPp) {
|
| 306 |
+
try {
|
| 307 |
+
if (!noWarmup) {
|
| 308 |
+
status('perf', `warmup pp${nPrompt}`);
|
| 309 |
+
log(`bench_pp(${nPrompt}) β warmup`);
|
| 310 |
+
const raw = await Module.ccall('bench_pp', 'string', ['number'], [nPrompt], { async: true });
|
| 311 |
+
parseBenchResult('bench_pp warmup', raw);
|
| 312 |
+
}
|
| 313 |
+
const samples_ns = [];
|
| 314 |
+
for (let i = 0; i < nReps; i++) {
|
| 315 |
+
status('perf', `pp${nPrompt} ${i + 1}/${nReps}`);
|
| 316 |
+
const t0 = performance.now();
|
| 317 |
+
const raw = await Module.ccall('bench_pp', 'string', ['number'], [nPrompt], { async: true });
|
| 318 |
+
const t_ns = (performance.now() - t0) * 1e6;
|
| 319 |
+
parseBenchResult('bench_pp', raw);
|
| 320 |
+
samples_ns.push(t_ns);
|
| 321 |
+
log(`pp${nPrompt} run ${i + 1}/${nReps}: ${(t_ns / 1e6).toFixed(1)} ms (${(1e9 * nPrompt / t_ns).toFixed(1)} t/s)`);
|
| 322 |
+
}
|
| 323 |
+
tests.push(buildTest(`pp${nPrompt}`, nPrompt, 0, samples_ns));
|
| 324 |
+
} catch (err) {
|
| 325 |
+
log(`pp test failed: ${err.message}`);
|
| 326 |
}
|
|
|
|
| 327 |
}
|
| 328 |
|
| 329 |
if (wantTg) {
|
| 330 |
+
try {
|
| 331 |
+
if (!noWarmup) {
|
| 332 |
+
status('perf', `warmup tg`);
|
| 333 |
+
log('bench_tg(1) β warmup');
|
| 334 |
+
const raw = await Module.ccall('bench_tg', 'string', ['number'], [1], { async: true });
|
| 335 |
+
parseBenchResult('bench_tg warmup', raw);
|
| 336 |
+
}
|
| 337 |
+
const samples_ns = [];
|
| 338 |
+
for (let i = 0; i < nReps; i++) {
|
| 339 |
+
status('perf', `tg${nGen} ${i + 1}/${nReps}`);
|
| 340 |
+
const t0 = performance.now();
|
| 341 |
+
const raw = await Module.ccall('bench_tg', 'string', ['number'], [nGen], { async: true });
|
| 342 |
+
const t_ns = (performance.now() - t0) * 1e6;
|
| 343 |
+
parseBenchResult('bench_tg', raw);
|
| 344 |
+
samples_ns.push(t_ns);
|
| 345 |
+
log(`tg${nGen} run ${i + 1}/${nReps}: ${(t_ns / 1e6).toFixed(1)} ms (${(1e9 * nGen / t_ns).toFixed(1)} t/s)`);
|
| 346 |
+
}
|
| 347 |
+
tests.push(buildTest(`tg${nGen}`, 0, nGen, samples_ns));
|
| 348 |
+
} catch (err) {
|
| 349 |
+
log(`tg test failed: ${err.message}`);
|
| 350 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 351 |
}
|
| 352 |
|
| 353 |
+
if (tests.length > 0) {
|
| 354 |
+
result.metrics = {
|
| 355 |
+
tests,
|
| 356 |
+
n_prompt: wantPp ? nPrompt : 0,
|
| 357 |
+
n_gen: wantTg ? nGen : 0,
|
| 358 |
+
n_reps: nReps,
|
| 359 |
+
};
|
| 360 |
+
}
|
| 361 |
}
|
| 362 |
|
| 363 |
await Module.ccall('bench_exit', null, [], [], { async: true });
|
js/run/controller.js
CHANGED
|
@@ -43,6 +43,12 @@ const state = {
|
|
| 43 |
iterations: DEFAULT_ITERATIONS,
|
| 44 |
nPrompt: DEFAULT_N_PROMPT,
|
| 45 |
nGen: DEFAULT_N_GEN,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
mounted: false,
|
| 47 |
// Tracks variants the Run pipeline downloaded this session (as opposed to
|
| 48 |
// the standalone Download button or pre-existing cache). Only these are
|
|
@@ -660,6 +666,20 @@ function wirePerfInputs() {
|
|
| 660 |
ng.value = String(state.nGen);
|
| 661 |
});
|
| 662 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 663 |
}
|
| 664 |
|
| 665 |
function submittableResults() {
|
|
@@ -1274,55 +1294,74 @@ async function runVariantWithIterations(v, row) {
|
|
| 1274 |
const nReps = Math.max(1, state.iterations || DEFAULT_ITERATIONS);
|
| 1275 |
const nPrompt = Math.max(0, state.nPrompt ?? DEFAULT_N_PROMPT);
|
| 1276 |
const nGen = Math.max(0, state.nGen ?? DEFAULT_N_GEN);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1277 |
|
| 1278 |
// ─── CPU baseline ───
|
| 1279 |
-
//
|
| 1280 |
-
//
|
| 1281 |
-
//
|
| 1282 |
-
|
| 1283 |
let cpuResult;
|
| 1284 |
-
|
| 1285 |
-
|
| 1286 |
-
|
| 1287 |
-
|
| 1288 |
-
|
| 1289 |
-
|
| 1290 |
-
|
| 1291 |
-
|
| 1292 |
-
|
| 1293 |
-
|
| 1294 |
-
|
| 1295 |
-
|
| 1296 |
-
|
| 1297 |
-
|
| 1298 |
-
|
| 1299 |
-
|
| 1300 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1301 |
}
|
| 1302 |
|
| 1303 |
-
// CPU
|
| 1304 |
-
//
|
| 1305 |
-
//
|
| 1306 |
const cpuOk = cpuResult.status === 'done';
|
| 1307 |
-
if (
|
| 1308 |
-
logLine(
|
| 1309 |
-
`CPU baseline failed (${cpuResult.error || 'unknown'}) β proceeding with GPU run, skipping consistency check.`
|
| 1310 |
-
);
|
| 1311 |
row.setStatus('cpu-skipped', 'continuing with GPU only');
|
| 1312 |
}
|
| 1313 |
|
| 1314 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1315 |
|
| 1316 |
if (state.aborted) {
|
| 1317 |
return { status: 'error', error: 'aborted', cpu: cpuResult, gpu: null };
|
| 1318 |
}
|
| 1319 |
|
| 1320 |
-
// ─── GPU pass: consistency + perf in one model load ───
|
| 1321 |
row.setStatus('gpu-run', 'loading model');
|
| 1322 |
let gpuResult;
|
| 1323 |
try {
|
| 1324 |
gpuResult = await runBenchmarkInWorker(v, {
|
| 1325 |
-
consistencyPrompt: DEFAULT_PROMPT,
|
| 1326 |
consistencyNPredict: DEFAULT_N_PREDICT,
|
| 1327 |
refTokenIds: refTokenIds || null,
|
| 1328 |
nPrompt,
|
|
|
|
| 43 |
iterations: DEFAULT_ITERATIONS,
|
| 44 |
nPrompt: DEFAULT_N_PROMPT,
|
| 45 |
nGen: DEFAULT_N_GEN,
|
| 46 |
+
// User-controlled phase toggles. Defaults match the previous behaviour:
|
| 47 |
+
// run consistency (CPU baseline + GPU forced-decode) AND run CPU perf
|
| 48 |
+
// baseline. Both checkable to skip — useful on devices where CPU is too
|
| 49 |
+
// slow / unreliable to be worth waiting for.
|
| 50 |
+
skipConsistency: false,
|
| 51 |
+
skipCpuPerf: false,
|
| 52 |
mounted: false,
|
| 53 |
// Tracks variants the Run pipeline downloaded this session (as opposed to
|
| 54 |
// the standalone Download button or pre-existing cache). Only these are
|
|
|
|
| 666 |
ng.value = String(state.nGen);
|
| 667 |
});
|
| 668 |
}
|
| 669 |
+
const skipCons = $('skip-consistency');
|
| 670 |
+
if (skipCons) {
|
| 671 |
+
skipCons.checked = state.skipConsistency;
|
| 672 |
+
skipCons.addEventListener('change', () => {
|
| 673 |
+
state.skipConsistency = skipCons.checked;
|
| 674 |
+
});
|
| 675 |
+
}
|
| 676 |
+
const skipCpu = $('skip-cpu-perf');
|
| 677 |
+
if (skipCpu) {
|
| 678 |
+
skipCpu.checked = state.skipCpuPerf;
|
| 679 |
+
skipCpu.addEventListener('change', () => {
|
| 680 |
+
state.skipCpuPerf = skipCpu.checked;
|
| 681 |
+
});
|
| 682 |
+
}
|
| 683 |
}
|
| 684 |
|
| 685 |
function submittableResults() {
|
|
|
|
| 1294 |
const nReps = Math.max(1, state.iterations || DEFAULT_ITERATIONS);
|
| 1295 |
const nPrompt = Math.max(0, state.nPrompt ?? DEFAULT_N_PROMPT);
|
| 1296 |
const nGen = Math.max(0, state.nGen ?? DEFAULT_N_GEN);
|
| 1297 |
+
// Phase toggles from the run page. Combined effect:
|
| 1298 |
+
// skip both → only GPU perf, no CPU pass at all
|
| 1299 |
+
// skip consistency → CPU perf baseline + GPU perf, no token-id check
|
| 1300 |
+
// skip CPU perf → CPU consistency tokens + GPU consistency + GPU perf
|
| 1301 |
+
// skip neither → full default flow
|
| 1302 |
+
const runConsistency = !state.skipConsistency;
|
| 1303 |
+
const runCpuPerf = !state.skipCpuPerf;
|
| 1304 |
+
const needCpuPass = runConsistency || runCpuPerf;
|
| 1305 |
|
| 1306 |
// ─── CPU baseline ───
|
| 1307 |
+
// Skipped entirely if both toggles disable it. Otherwise the pass mixes
|
| 1308 |
+
// and matches: consistency_run captures token_ids; perf phase runs at
|
| 1309 |
+
// nReps=1 (single warmup+timed rep — enough to populate the dashboard's
|
| 1310 |
+
// CPU/GPU comparison without doubling CPU runtime).
|
| 1311 |
let cpuResult;
|
| 1312 |
+
if (needCpuPass) {
|
| 1313 |
+
const phaseLabel = runConsistency && runCpuPerf ? 'reference tokens + 1-rep perf'
|
| 1314 |
+
: runConsistency ? 'reference tokens'
|
| 1315 |
+
: '1-rep perf';
|
| 1316 |
+
row.setStatus('cpu-baseline', phaseLabel);
|
| 1317 |
+
try {
|
| 1318 |
+
cpuResult = await runBenchmarkInWorker(v, {
|
| 1319 |
+
consistencyPrompt: runConsistency ? DEFAULT_PROMPT : '',
|
| 1320 |
+
consistencyNPredict: DEFAULT_N_PREDICT,
|
| 1321 |
+
refTokenIds: null,
|
| 1322 |
+
nPrompt: runCpuPerf ? nPrompt : 0,
|
| 1323 |
+
nGen: runCpuPerf ? nGen : 0,
|
| 1324 |
+
nReps: 1,
|
| 1325 |
+
nCtx: DEFAULT_N_CTX,
|
| 1326 |
+
nGpuLayers: 0,
|
| 1327 |
+
}, {
|
| 1328 |
+
onStatus: (status, msg) => row.setStatus(`cpu/${status}`, msg),
|
| 1329 |
+
onProgress: (fr, downloaded, total) => row.setProgress(fr, downloaded, total),
|
| 1330 |
+
onLog: logLine,
|
| 1331 |
+
});
|
| 1332 |
+
} catch (err) {
|
| 1333 |
+
cpuResult = { status: 'error', error: err.message || String(err) };
|
| 1334 |
+
}
|
| 1335 |
+
} else {
|
| 1336 |
+
cpuResult = { status: 'skipped' };
|
| 1337 |
}
|
| 1338 |
|
| 1339 |
+
// CPU pass is best-effort. Failures (OOM, slow device, missing op) don't
|
| 1340 |
+
// block the GPU run — the user opted into resilience implicitly by the
|
| 1341 |
+
// phase being best-effort, and explicitly via the skip checkboxes.
|
| 1342 |
const cpuOk = cpuResult.status === 'done';
|
| 1343 |
+
if (cpuResult.status === 'error') {
|
| 1344 |
+
logLine(`CPU baseline failed (${cpuResult.error || 'unknown'}) β proceeding with GPU run.`);
|
|
|
|
|
|
|
| 1345 |
row.setStatus('cpu-skipped', 'continuing with GPU only');
|
| 1346 |
}
|
| 1347 |
|
| 1348 |
+
// refTokenIds is the GPU pass's input for forced-decode consistency. Only
|
| 1349 |
+
// pass when we actually have tokens (consistency was requested AND CPU
|
| 1350 |
+
// produced tokens).
|
| 1351 |
+
const refTokenIds = (cpuOk && runConsistency && cpuResult.consistency?.token_ids?.length)
|
| 1352 |
+
? cpuResult.consistency.token_ids.join(',')
|
| 1353 |
+
: '';
|
| 1354 |
|
| 1355 |
if (state.aborted) {
|
| 1356 |
return { status: 'error', error: 'aborted', cpu: cpuResult, gpu: null };
|
| 1357 |
}
|
| 1358 |
|
| 1359 |
+
// ─── GPU pass: consistency (when not skipped) + perf in one model load ───
|
| 1360 |
row.setStatus('gpu-run', 'loading model');
|
| 1361 |
let gpuResult;
|
| 1362 |
try {
|
| 1363 |
gpuResult = await runBenchmarkInWorker(v, {
|
| 1364 |
+
consistencyPrompt: runConsistency ? DEFAULT_PROMPT : '',
|
| 1365 |
consistencyNPredict: DEFAULT_N_PREDICT,
|
| 1366 |
refTokenIds: refTokenIds || null,
|
| 1367 |
nPrompt,
|
js/run/core.js
CHANGED
|
@@ -92,92 +92,109 @@ async function runBenchActions(Module, {
|
|
| 92 |
// Two sub-modes: (a) CPU baseline — generates token_ids via bench_run for a
|
| 93 |
// future GPU verification pass; (b) GPU verification — runs bench_run then
|
| 94 |
// bench_eval_tokens to compute the agreement rate against refTokenIds.
|
|
|
|
|
|
|
| 95 |
if (consistencyPrompt) {
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
);
|
| 104 |
-
const r = parseBenchResult('bench_run', raw);
|
| 105 |
-
out.output = r.output || '';
|
| 106 |
-
out.consistency = { token_ids: r.token_ids || [] };
|
| 107 |
-
|
| 108 |
-
if (refTokenIds) {
|
| 109 |
-
onLog?.('bench_eval_tokens β forced-decode vs CPU baseline');
|
| 110 |
-
const evalRaw = await Module.ccall(
|
| 111 |
-
'bench_eval_tokens', 'string',
|
| 112 |
-
['string', 'string'],
|
| 113 |
-
[consistencyPrompt, refTokenIds],
|
| 114 |
{ async: true },
|
| 115 |
);
|
| 116 |
-
const
|
| 117 |
-
out.
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
}
|
| 124 |
}
|
| 125 |
|
| 126 |
// ─── Perf phase (llama-bench style) ───
|
| 127 |
// Synthetic random tokens; KV cleared per call inside bench_pp/bench_tg.
|
| 128 |
// Warmup is one full pp + one tg(1) call before the timed reps, matching
|
| 129 |
-
// tools/llama-bench/llama-bench.cpp.
|
|
|
|
| 130 |
const wantPp = nPrompt > 0;
|
| 131 |
const wantTg = nGen > 0;
|
| 132 |
if (wantPp || wantTg) {
|
| 133 |
const tests = [];
|
| 134 |
|
| 135 |
if (wantPp) {
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 151 |
}
|
| 152 |
-
tests.push(buildTest(`pp${nPrompt}`, nPrompt, 0, samples_ns));
|
| 153 |
}
|
| 154 |
|
| 155 |
if (wantTg) {
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 171 |
}
|
| 172 |
-
tests.push(buildTest(`tg${nGen}`, 0, nGen, samples_ns));
|
| 173 |
}
|
| 174 |
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
|
|
|
|
|
|
| 181 |
}
|
| 182 |
|
| 183 |
return out;
|
|
|
|
| 92 |
// Two sub-modes: (a) CPU baseline — generates token_ids via bench_run for a
|
| 93 |
// future GPU verification pass; (b) GPU verification — runs bench_run then
|
| 94 |
// bench_eval_tokens to compute the agreement rate against refTokenIds.
|
| 95 |
+
// Soft-fail: a failure here falls through to the perf phase rather than
|
| 96 |
+
// aborting the whole run.
|
| 97 |
if (consistencyPrompt) {
|
| 98 |
+
try {
|
| 99 |
+
onStatus?.('consistency', 'Running consistency check...');
|
| 100 |
+
onLog?.(`bench_run("...", ${consistencyNPredict}) β consistency phase`);
|
| 101 |
+
const raw = await Module.ccall(
|
| 102 |
+
'bench_run', 'string',
|
| 103 |
+
['string', 'number'],
|
| 104 |
+
[consistencyPrompt, consistencyNPredict],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
{ async: true },
|
| 106 |
);
|
| 107 |
+
const r = parseBenchResult('bench_run', raw);
|
| 108 |
+
out.output = r.output || '';
|
| 109 |
+
out.consistency = { token_ids: r.token_ids || [] };
|
| 110 |
+
|
| 111 |
+
if (refTokenIds) {
|
| 112 |
+
onLog?.('bench_eval_tokens β forced-decode vs CPU baseline');
|
| 113 |
+
const evalRaw = await Module.ccall(
|
| 114 |
+
'bench_eval_tokens', 'string',
|
| 115 |
+
['string', 'string'],
|
| 116 |
+
[consistencyPrompt, refTokenIds],
|
| 117 |
+
{ async: true },
|
| 118 |
+
);
|
| 119 |
+
const ev = parseBenchResult('bench_eval_tokens', evalRaw);
|
| 120 |
+
out.consistency = { ...out.consistency, ...ev };
|
| 121 |
+
onLog?.(
|
| 122 |
+
`Consistency: ${(ev.agreement_rate * 100).toFixed(1)}% top-1 agreement (` +
|
| 123 |
+
`${ev.n_agree}/${ev.n_tokens})` +
|
| 124 |
+
(ev.first_disagreement >= 0 ? ` β first diverge @ ${ev.first_disagreement}` : '')
|
| 125 |
+
);
|
| 126 |
+
}
|
| 127 |
+
} catch (err) {
|
| 128 |
+
onLog?.(`Consistency phase failed: ${err.message} β continuing to perf phase`);
|
| 129 |
}
|
| 130 |
}
|
| 131 |
|
| 132 |
// ─── Perf phase (llama-bench style) ───
|
| 133 |
// Synthetic random tokens; KV cleared per call inside bench_pp/bench_tg.
|
| 134 |
// Warmup is one full pp + one tg(1) call before the timed reps, matching
|
| 135 |
+
// tools/llama-bench/llama-bench.cpp. pp and tg are wrapped independently
|
| 136 |
+
// so failure in one doesn't skip the other.
|
| 137 |
const wantPp = nPrompt > 0;
|
| 138 |
const wantTg = nGen > 0;
|
| 139 |
if (wantPp || wantTg) {
|
| 140 |
const tests = [];
|
| 141 |
|
| 142 |
if (wantPp) {
|
| 143 |
+
try {
|
| 144 |
+
if (!noWarmup) {
|
| 145 |
+
onStatus?.('perf', `warmup pp${nPrompt}`);
|
| 146 |
+
onLog?.(`bench_pp(${nPrompt}) β warmup`);
|
| 147 |
+
const raw = await Module.ccall('bench_pp', 'string', ['number'], [nPrompt], { async: true });
|
| 148 |
+
parseBenchResult('bench_pp warmup', raw);
|
| 149 |
+
}
|
| 150 |
+
const samples_ns = [];
|
| 151 |
+
for (let i = 0; i < nReps; i++) {
|
| 152 |
+
onStatus?.('perf', `pp${nPrompt} ${i + 1}/${nReps}`);
|
| 153 |
+
const t0 = performance.now();
|
| 154 |
+
const raw = await Module.ccall('bench_pp', 'string', ['number'], [nPrompt], { async: true });
|
| 155 |
+
const t_ns = (performance.now() - t0) * 1e6;
|
| 156 |
+
parseBenchResult('bench_pp', raw);
|
| 157 |
+
samples_ns.push(t_ns);
|
| 158 |
+
onLog?.(`pp${nPrompt} run ${i + 1}/${nReps}: ${(t_ns / 1e6).toFixed(1)} ms (${(1e9 * nPrompt / t_ns).toFixed(1)} t/s)`);
|
| 159 |
+
}
|
| 160 |
+
tests.push(buildTest(`pp${nPrompt}`, nPrompt, 0, samples_ns));
|
| 161 |
+
} catch (err) {
|
| 162 |
+
onLog?.(`pp test failed: ${err.message}`);
|
| 163 |
}
|
|
|
|
| 164 |
}
|
| 165 |
|
| 166 |
if (wantTg) {
|
| 167 |
+
try {
|
| 168 |
+
if (!noWarmup) {
|
| 169 |
+
onStatus?.('perf', `warmup tg`);
|
| 170 |
+
onLog?.('bench_tg(1) β warmup');
|
| 171 |
+
const raw = await Module.ccall('bench_tg', 'string', ['number'], [1], { async: true });
|
| 172 |
+
parseBenchResult('bench_tg warmup', raw);
|
| 173 |
+
}
|
| 174 |
+
const samples_ns = [];
|
| 175 |
+
for (let i = 0; i < nReps; i++) {
|
| 176 |
+
onStatus?.('perf', `tg${nGen} ${i + 1}/${nReps}`);
|
| 177 |
+
const t0 = performance.now();
|
| 178 |
+
const raw = await Module.ccall('bench_tg', 'string', ['number'], [nGen], { async: true });
|
| 179 |
+
const t_ns = (performance.now() - t0) * 1e6;
|
| 180 |
+
parseBenchResult('bench_tg', raw);
|
| 181 |
+
samples_ns.push(t_ns);
|
| 182 |
+
onLog?.(`tg${nGen} run ${i + 1}/${nReps}: ${(t_ns / 1e6).toFixed(1)} ms (${(1e9 * nGen / t_ns).toFixed(1)} t/s)`);
|
| 183 |
+
}
|
| 184 |
+
tests.push(buildTest(`tg${nGen}`, 0, nGen, samples_ns));
|
| 185 |
+
} catch (err) {
|
| 186 |
+
onLog?.(`tg test failed: ${err.message}`);
|
| 187 |
}
|
|
|
|
| 188 |
}
|
| 189 |
|
| 190 |
+
if (tests.length > 0) {
|
| 191 |
+
out.metrics = {
|
| 192 |
+
tests,
|
| 193 |
+
n_prompt: wantPp ? nPrompt : 0,
|
| 194 |
+
n_gen: wantTg ? nGen : 0,
|
| 195 |
+
n_reps: nReps,
|
| 196 |
+
};
|
| 197 |
+
}
|
| 198 |
}
|
| 199 |
|
| 200 |
return out;
|
run.html
CHANGED
|
@@ -136,6 +136,13 @@
|
|
| 136 |
<label class="filter-label" for="iterations-input">Reps (-r)</label>
|
| 137 |
<input type="number" id="iterations-input" class="filter-select run-iter-input" value="5" min="1" max="50" step="1">
|
| 138 |
</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 139 |
</div>
|
| 140 |
</div>
|
| 141 |
|
|
|
|
| 136 |
<label class="filter-label" for="iterations-input">Reps (-r)</label>
|
| 137 |
<input type="number" id="iterations-input" class="filter-select run-iter-input" value="5" min="1" max="50" step="1">
|
| 138 |
</div>
|
| 139 |
+
<div class="filter-group">
|
| 140 |
+
<span class="filter-label">Skip</span>
|
| 141 |
+
<div class="run-filters-checks">
|
| 142 |
+
<label class="run-hide-label" title="Skip the consistency check (CPU baseline + GPU forced-decode agreement). Useful when consistency is failing on a device or you only care about perf."><input type="checkbox" id="skip-consistency"> Consistency</label>
|
| 143 |
+
<label class="run-hide-label" title="Skip the single-rep CPU perf baseline. Useful when CPU runs are too slow or unstable on a device."><input type="checkbox" id="skip-cpu-perf"> CPU perf</label>
|
| 144 |
+
</div>
|
| 145 |
+
</div>
|
| 146 |
</div>
|
| 147 |
</div>
|
| 148 |
|