GitHub Actions committed on
Commit
ee944ff
·
1 Parent(s): bc8e1d3

sync from abhijitramesh/webgpu-bench@cfa77c10dc

Browse files
build/asyncify/bench.js CHANGED
The diff for this file is too large to render. See raw diff
 
build/asyncify/bench.wasm CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1ba4381b1a8c3a34d003bae0837e684515174b8c1e24b470b7013eedafc359e4
3
- size 5235854
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:992c98b10bc138f1b92c12b9586a8ea0f925ea9e6789b2a8da3117c928ee5acc
3
+ size 5240204
build/asyncify/build-info.json CHANGED
@@ -2,5 +2,5 @@
2
  "llamaCppCommit": "f22c8021d213567942a3d0134692e70f02f28f3a",
3
  "llamaCppDescribe": "b8981-3-gf22c8021d",
4
  "dawnTag": "v20260317.182325",
5
- "builtAt": "2026-05-01T08:36:58Z"
6
  }
 
2
  "llamaCppCommit": "f22c8021d213567942a3d0134692e70f02f28f3a",
3
  "llamaCppDescribe": "b8981-3-gf22c8021d",
4
  "dawnTag": "v20260317.182325",
5
+ "builtAt": "2026-05-01T09:06:07Z"
6
  }
build/jspi/bench.js CHANGED
The diff for this file is too large to render. See raw diff
 
build/jspi/bench.wasm CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:39eea46a9d02b044c9d143cf8243c3e05f8bf89d94bb5bcd804b6e43755b958d
3
- size 3614207
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a699aba49bfcc4e8cf4e78efa891cf1eb4b7a895345ea17d64916cc7c50c6df1
3
+ size 3616251
build/jspi/build-info.json CHANGED
@@ -2,5 +2,5 @@
2
  "llamaCppCommit": "f22c8021d213567942a3d0134692e70f02f28f3a",
3
  "llamaCppDescribe": "b8981-3-gf22c8021d",
4
  "dawnTag": "v20260317.182325",
5
- "builtAt": "2026-05-01T08:33:19Z"
6
  }
 
2
  "llamaCppCommit": "f22c8021d213567942a3d0134692e70f02f28f3a",
3
  "llamaCppDescribe": "b8981-3-gf22c8021d",
4
  "dawnTag": "v20260317.182325",
5
+ "builtAt": "2026-05-01T09:02:12Z"
6
  }
harness.js CHANGED
@@ -30,6 +30,7 @@ window.addEventListener('unhandledrejection', (e) => {
30
  const nPrompt = parseInt(params.get('nPrompt') || '512', 10);
31
  const nGen = parseInt(params.get('nGen') || '128', 10);
32
  const nReps = parseInt(params.get('nReps') || '5', 10);
 
33
  const nCtx = parseInt(params.get('nCtx') || '2048', 10);
34
  const nGpuLayers = parseInt(params.get('nGpuLayers') || '999', 10);
35
  const refTokenIds = params.get('refTokenIds') || null;
@@ -145,6 +146,7 @@ window.addEventListener('unhandledrejection', (e) => {
145
  nPrompt: runPerf ? nPrompt : 0,
146
  nGen: runPerf ? nGen : 0,
147
  nReps,
 
148
  noWarmup: false,
149
  },
150
  opfsPath: { rootDir: OPFS_ROOT_NAME, repo: hfRepo, filename: modelFile },
 
30
  const nPrompt = parseInt(params.get('nPrompt') || '512', 10);
31
  const nGen = parseInt(params.get('nGen') || '128', 10);
32
  const nReps = parseInt(params.get('nReps') || '5', 10);
33
+ const nDepth = parseInt(params.get('nDepth') || '0', 10);
34
  const nCtx = parseInt(params.get('nCtx') || '2048', 10);
35
  const nGpuLayers = parseInt(params.get('nGpuLayers') || '999', 10);
36
  const refTokenIds = params.get('refTokenIds') || null;
 
146
  nPrompt: runPerf ? nPrompt : 0,
147
  nGen: runPerf ? nGen : 0,
148
  nReps,
149
+ nDepth: runPerf ? nDepth : 0,
150
  noWarmup: false,
151
  },
152
  opfsPath: { rootDir: OPFS_ROOT_NAME, repo: hfRepo, filename: modelFile },
js/dataset.js CHANGED
@@ -151,6 +151,10 @@ function flattenForDashboard(r, slug) {
151
  tg_test_name: tg?.name ?? null,
152
  pp_n_prompt: pp?.n_prompt ?? r.nPrompt ?? null,
153
  tg_n_gen: tg?.n_gen ?? r.nGen ?? null,
 
 
 
 
154
  n_p_eval: r.metrics?.n_p_eval ?? null,
155
  t_p_eval_ms: r.metrics?.t_p_eval_ms ?? null,
156
  n_eval: r.metrics?.n_eval ?? null,
 
151
  tg_test_name: tg?.name ?? null,
152
  pp_n_prompt: pp?.n_prompt ?? r.nPrompt ?? null,
153
  tg_n_gen: tg?.n_gen ?? r.nGen ?? null,
154
+ // KV-cache depth the timed reps ran at. Mirrors llama-bench's `-d` and
155
+ // is per-test in metrics.tests; record-level r.nDepth is the
156
+ // study/runner-set value, used as a fallback for older exports.
157
+ n_depth: pp?.n_depth ?? tg?.n_depth ?? r.nDepth ?? 0,
158
  n_p_eval: r.metrics?.n_p_eval ?? null,
159
  t_p_eval_ms: r.metrics?.t_p_eval_ms ?? null,
160
  n_eval: r.metrics?.n_eval ?? null,
js/run/bench-worker.js CHANGED
@@ -13,7 +13,7 @@
13
  // // consistency phase (set consistencyPrompt to '' to skip)
14
  // consistencyPrompt, consistencyNPredict, refTokenIds,
15
  // // perf phase
16
- // nPrompt, nGen, nReps, noWarmup,
17
  // },
18
  // opfsPath: { rootDir, repo, filename }
19
  // }
@@ -162,10 +162,14 @@ function opfsFreeAll(Module) {
162
  // llama-bench reports avg_ts = (n_tokens * 1e9) / avg_ns and stddev_ts as
163
  // the std of per-sample t/s, computed independently rather than propagated
164
  // from stddev_ns (the mapping isn't linear).
165
- function buildTest(name, n_prompt, n_gen, samples_ns) {
 
 
 
 
166
  const n = samples_ns.length;
167
  if (n === 0) {
168
- return { name, n_prompt, n_gen, avg_ns: 0, stddev_ns: 0, avg_ts: 0, stddev_ts: 0, samples_ns: [], samples_ts: [] };
169
  }
170
  const avg_ns = samples_ns.reduce((a, b) => a + b, 0) / n;
171
  const var_ns = n > 1
@@ -184,6 +188,7 @@ function buildTest(name, n_prompt, n_gen, samples_ns) {
184
  name,
185
  n_prompt,
186
  n_gen,
 
187
  avg_ns: Math.round(avg_ns),
188
  stddev_ns: Math.round(stddev_ns),
189
  avg_ts: round2(avg_ts),
@@ -236,6 +241,7 @@ async function runOne({ params, opfsPath }) {
236
  nPrompt,
237
  nGen,
238
  nReps,
 
239
  noWarmup,
240
  } = params;
241
  // The worker only loads via OPFS now: main thread downloads to OPFS,
@@ -409,29 +415,44 @@ async function runOne({ params, opfsPath }) {
409
  // which the dashboard renders as a dash.
410
  const wantPp = nPrompt > 0;
411
  const wantTg = nGen > 0;
 
 
 
 
 
 
 
 
 
 
 
 
 
412
  if (wantPp || wantTg) {
413
  const tests = [];
414
 
415
  if (wantPp) {
416
  try {
417
  if (!noWarmup) {
418
- status('perf', `warmup pp${nPrompt}`, Date.now());
419
- log(`bench_pp(${nPrompt}) warmup`);
 
420
  const raw = await Module.ccall('bench_pp', 'string', ['number'], [nPrompt], { async: true });
421
  parseBenchResult('bench_pp warmup', raw);
422
  }
423
  const samples_ns = [];
424
  for (let i = 0; i < nReps; i++) {
425
- status('perf', `pp${nPrompt} ${i + 1}/${nReps}`, Date.now());
 
426
  const t0 = performance.now();
427
  const raw = await Module.ccall('bench_pp', 'string', ['number'], [nPrompt], { async: true });
428
  const t_ns = (performance.now() - t0) * 1e6;
429
  parseBenchResult('bench_pp', raw);
430
  samples_ns.push(t_ns);
431
- log(`pp${nPrompt} run ${i + 1}/${nReps}: ${(t_ns / 1e6).toFixed(1)} ms (${(1e9 * nPrompt / t_ns).toFixed(1)} t/s)`);
432
  if (i + 1 < nReps) await sleep(REP_COOLDOWN_MS);
433
  }
434
- tests.push(buildTest(`pp${nPrompt}`, nPrompt, 0, samples_ns));
435
  } catch (err) {
436
  log(`pp test failed: ${err.message}`);
437
  }
@@ -444,23 +465,25 @@ async function runOne({ params, opfsPath }) {
444
  // A 1-token warmup exercises the decode kernel once, which leaves
445
  // the first timed rep absorbing pipeline-cache / shader-specialize
446
  // cost on every subsequent step.
447
- status('perf', `warmup tg${nGen}`, Date.now());
448
- log(`bench_tg(${nGen}) warmup`);
 
449
  const raw = await Module.ccall('bench_tg', 'string', ['number'], [nGen], { async: true });
450
  parseBenchResult('bench_tg warmup', raw);
451
  }
452
  const samples_ns = [];
453
  for (let i = 0; i < nReps; i++) {
454
- status('perf', `tg${nGen} ${i + 1}/${nReps}`, Date.now());
 
455
  const t0 = performance.now();
456
  const raw = await Module.ccall('bench_tg', 'string', ['number'], [nGen], { async: true });
457
  const t_ns = (performance.now() - t0) * 1e6;
458
  parseBenchResult('bench_tg', raw);
459
  samples_ns.push(t_ns);
460
- log(`tg${nGen} run ${i + 1}/${nReps}: ${(t_ns / 1e6).toFixed(1)} ms (${(1e9 * nGen / t_ns).toFixed(1)} t/s)`);
461
  if (i + 1 < nReps) await sleep(REP_COOLDOWN_MS);
462
  }
463
- tests.push(buildTest(`tg${nGen}`, 0, nGen, samples_ns));
464
  } catch (err) {
465
  log(`tg test failed: ${err.message}`);
466
  }
@@ -471,6 +494,7 @@ async function runOne({ params, opfsPath }) {
471
  tests,
472
  n_prompt: wantPp ? nPrompt : 0,
473
  n_gen: wantTg ? nGen : 0,
 
474
  n_reps: nReps,
475
  };
476
  }
 
13
  // // consistency phase (set consistencyPrompt to '' to skip)
14
  // consistencyPrompt, consistencyNPredict, refTokenIds,
15
  // // perf phase
16
+ // nPrompt, nGen, nReps, nDepth, noWarmup,
17
  // },
18
  // opfsPath: { rootDir, repo, filename }
19
  // }
 
162
  // llama-bench reports avg_ts = (n_tokens * 1e9) / avg_ns and stddev_ts as
163
  // the std of per-sample t/s, computed independently rather than propagated
164
  // from stddev_ns (the mapping isn't linear).
165
+ //
166
+ // `n_depth` carries through unchanged so downstream consumers can label
167
+ // e.g. "pp512 @ d2048" the way llama-bench does (line 1984 of
168
+ // llama.cpp/tools/llama-bench/llama-bench.cpp).
169
+ function buildTest(name, n_prompt, n_gen, n_depth, samples_ns) {
170
  const n = samples_ns.length;
171
  if (n === 0) {
172
+ return { name, n_prompt, n_gen, n_depth, avg_ns: 0, stddev_ns: 0, avg_ts: 0, stddev_ts: 0, samples_ns: [], samples_ts: [] };
173
  }
174
  const avg_ns = samples_ns.reduce((a, b) => a + b, 0) / n;
175
  const var_ns = n > 1
 
188
  name,
189
  n_prompt,
190
  n_gen,
191
+ n_depth,
192
  avg_ns: Math.round(avg_ns),
193
  stddev_ns: Math.round(stddev_ns),
194
  avg_ts: round2(avg_ts),
 
241
  nPrompt,
242
  nGen,
243
  nReps,
244
+ nDepth = 0,
245
  noWarmup,
246
  } = params;
247
  // The worker only loads via OPFS now: main thread downloads to OPFS,
 
415
  // which the dashboard renders as a dash.
416
  const wantPp = nPrompt > 0;
417
  const wantTg = nGen > 0;
418
+ // Test name suffix mirroring llama-bench (e.g. "pp512 @ d2048").
419
+ const depthSuffix = nDepth > 0 ? ` @ d${nDepth}` : '';
420
+ // Each timed rep is preceded by an untimed bench_set_depth call so the KV
421
+ // cache is in a known state. The C side caches the post-prefill snapshot,
422
+ // so reps 2..N at the same depth restore from snapshot instead of
423
+ // re-running the prefill (mirroring llama-bench's `cstate` reuse).
424
+ const setDepth = async (label) => {
425
+ const raw = await Module.ccall('bench_set_depth', 'string', ['number'], [nDepth], { async: true });
426
+ const r = parseBenchResult(`bench_set_depth(${nDepth}) ${label}`, raw);
427
+ if (nDepth > 0) {
428
+ log(`bench_set_depth(${nDepth}) ${label}: ${r.cached ? 'restored snapshot' : 'prefilled'}`);
429
+ }
430
+ };
431
  if (wantPp || wantTg) {
432
  const tests = [];
433
 
434
  if (wantPp) {
435
  try {
436
  if (!noWarmup) {
437
+ status('perf', `warmup pp${nPrompt}${depthSuffix}`, Date.now());
438
+ await setDepth('pp warmup');
439
+ log(`bench_pp(${nPrompt})${depthSuffix} — warmup`);
440
  const raw = await Module.ccall('bench_pp', 'string', ['number'], [nPrompt], { async: true });
441
  parseBenchResult('bench_pp warmup', raw);
442
  }
443
  const samples_ns = [];
444
  for (let i = 0; i < nReps; i++) {
445
+ status('perf', `pp${nPrompt}${depthSuffix} ${i + 1}/${nReps}`, Date.now());
446
+ await setDepth(`pp rep ${i + 1}/${nReps}`);
447
  const t0 = performance.now();
448
  const raw = await Module.ccall('bench_pp', 'string', ['number'], [nPrompt], { async: true });
449
  const t_ns = (performance.now() - t0) * 1e6;
450
  parseBenchResult('bench_pp', raw);
451
  samples_ns.push(t_ns);
452
+ log(`pp${nPrompt}${depthSuffix} run ${i + 1}/${nReps}: ${(t_ns / 1e6).toFixed(1)} ms (${(1e9 * nPrompt / t_ns).toFixed(1)} t/s)`);
453
  if (i + 1 < nReps) await sleep(REP_COOLDOWN_MS);
454
  }
455
+ tests.push(buildTest(`pp${nPrompt}${depthSuffix}`, nPrompt, 0, nDepth, samples_ns));
456
  } catch (err) {
457
  log(`pp test failed: ${err.message}`);
458
  }
 
465
  // A 1-token warmup exercises the decode kernel once, which leaves
466
  // the first timed rep absorbing pipeline-cache / shader-specialize
467
  // cost on every subsequent step.
468
+ status('perf', `warmup tg${nGen}${depthSuffix}`, Date.now());
469
+ await setDepth('tg warmup');
470
+ log(`bench_tg(${nGen})${depthSuffix} — warmup`);
471
  const raw = await Module.ccall('bench_tg', 'string', ['number'], [nGen], { async: true });
472
  parseBenchResult('bench_tg warmup', raw);
473
  }
474
  const samples_ns = [];
475
  for (let i = 0; i < nReps; i++) {
476
+ status('perf', `tg${nGen}${depthSuffix} ${i + 1}/${nReps}`, Date.now());
477
+ await setDepth(`tg rep ${i + 1}/${nReps}`);
478
  const t0 = performance.now();
479
  const raw = await Module.ccall('bench_tg', 'string', ['number'], [nGen], { async: true });
480
  const t_ns = (performance.now() - t0) * 1e6;
481
  parseBenchResult('bench_tg', raw);
482
  samples_ns.push(t_ns);
483
+ log(`tg${nGen}${depthSuffix} run ${i + 1}/${nReps}: ${(t_ns / 1e6).toFixed(1)} ms (${(1e9 * nGen / t_ns).toFixed(1)} t/s)`);
484
  if (i + 1 < nReps) await sleep(REP_COOLDOWN_MS);
485
  }
486
+ tests.push(buildTest(`tg${nGen}${depthSuffix}`, 0, nGen, nDepth, samples_ns));
487
  } catch (err) {
488
  log(`tg test failed: ${err.message}`);
489
  }
 
494
  tests,
495
  n_prompt: wantPp ? nPrompt : 0,
496
  n_gen: wantTg ? nGen : 0,
497
+ n_depth: nDepth,
498
  n_reps: nReps,
499
  };
500
  }
js/run/controller.js CHANGED
@@ -22,6 +22,7 @@ const YIELD_BETWEEN_RUNS_MS = 500;
22
  // llama-bench defaults: -p 512 -n 128 -r 5
23
  const DEFAULT_N_PROMPT = 512;
24
  const DEFAULT_N_GEN = 128;
 
25
  const DEFAULT_ITERATIONS = 5;
26
  const MIN_ITERATIONS_FOR_SUBMIT = 5;
27
 
@@ -40,6 +41,7 @@ const state = {
40
  iterations: DEFAULT_ITERATIONS,
41
  nPrompt: DEFAULT_N_PROMPT,
42
  nGen: DEFAULT_N_GEN,
 
43
  // User-controlled phase toggles. Defaults match the previous behaviour:
44
  // run consistency (CPU baseline + GPU forced-decode) AND run CPU perf
45
  // baseline. Both checkable to skip — useful on devices where CPU is too
@@ -706,6 +708,15 @@ function wirePerfInputs() {
706
  ng.value = String(state.nGen);
707
  });
708
  }
 
 
 
 
 
 
 
 
 
709
  const skipCons = $('skip-consistency');
710
  if (skipCons) {
711
  skipCons.checked = state.skipConsistency;
@@ -1400,6 +1411,7 @@ async function runBenchmarkInWorker(v, params, callbacks) {
1400
  nPrompt: params.nPrompt ?? 0,
1401
  nGen: params.nGen ?? 0,
1402
  nReps: params.nReps ?? DEFAULT_ITERATIONS,
 
1403
  noWarmup: !!params.noWarmup,
1404
  };
1405
 
@@ -1453,6 +1465,11 @@ async function runVariantWithIterations(v, row) {
1453
  const nReps = Math.max(1, state.iterations || DEFAULT_ITERATIONS);
1454
  const nPrompt = Math.max(0, state.nPrompt ?? DEFAULT_N_PROMPT);
1455
  const nGen = Math.max(0, state.nGen ?? DEFAULT_N_GEN);
 
 
 
 
 
1456
  // Phase toggles from the run page. Combined effect:
1457
  // skip both → only GPU perf, no CPU pass at all
1458
  // skip consistency → CPU perf baseline + GPU perf, no token-id check
@@ -1480,8 +1497,11 @@ async function runVariantWithIterations(v, row) {
1480
  refTokenIds: null,
1481
  nPrompt: runCpuPerf ? nPrompt : 0,
1482
  nGen: runCpuPerf ? nGen : 0,
 
 
 
1483
  nReps: 1,
1484
- nCtx: DEFAULT_N_CTX,
1485
  nGpuLayers: 0,
1486
  }, {
1487
  onStatus: (status, msg, sinceMs) => row.setStatus(`cpu/${status}`, msg, sinceMs),
@@ -1525,8 +1545,9 @@ async function runVariantWithIterations(v, row) {
1525
  refTokenIds: refTokenIds || null,
1526
  nPrompt,
1527
  nGen,
 
1528
  nReps,
1529
- nCtx: DEFAULT_N_CTX,
1530
  nGpuLayers: DEFAULT_N_GPU_LAYERS,
1531
  }, {
1532
  onStatus: (s, m, sinceMs) => row.setStatus(`gpu/${s}`, m, sinceMs),
@@ -1605,6 +1626,7 @@ function makeRecord(v, vr, machine, browser, wallTimeMs) {
1605
  nPredict: DEFAULT_N_PREDICT,
1606
  nPrompt: gpu?.metrics?.n_prompt ?? 0,
1607
  nGen: gpu?.metrics?.n_gen ?? 0,
 
1608
  nReps: gpu?.metrics?.n_reps ?? 0,
1609
  nGpuLayers: DEFAULT_N_GPU_LAYERS,
1610
  timestamp: new Date().toISOString(),
 
22
  // llama-bench defaults: -p 512 -n 128 -r 5
23
  const DEFAULT_N_PROMPT = 512;
24
  const DEFAULT_N_GEN = 128;
25
+ const DEFAULT_N_DEPTH = 2048;
26
  const DEFAULT_ITERATIONS = 5;
27
  const MIN_ITERATIONS_FOR_SUBMIT = 5;
28
 
 
41
  iterations: DEFAULT_ITERATIONS,
42
  nPrompt: DEFAULT_N_PROMPT,
43
  nGen: DEFAULT_N_GEN,
44
+ nDepth: DEFAULT_N_DEPTH,
45
  // User-controlled phase toggles. Defaults match the previous behaviour:
46
  // run consistency (CPU baseline + GPU forced-decode) AND run CPU perf
47
  // baseline. Both checkable to skip — useful on devices where CPU is too
 
708
  ng.value = String(state.nGen);
709
  });
710
  }
711
+ const nd = $('n-depth-input');
712
+ if (nd) {
713
+ nd.value = String(state.nDepth);
714
+ nd.addEventListener('change', () => {
715
+ const n = Math.max(0, Math.min(32768, parseInt(nd.value, 10)));
716
+ state.nDepth = Number.isFinite(n) ? n : DEFAULT_N_DEPTH;
717
+ nd.value = String(state.nDepth);
718
+ });
719
+ }
720
  const skipCons = $('skip-consistency');
721
  if (skipCons) {
722
  skipCons.checked = state.skipConsistency;
 
1411
  nPrompt: params.nPrompt ?? 0,
1412
  nGen: params.nGen ?? 0,
1413
  nReps: params.nReps ?? DEFAULT_ITERATIONS,
1414
+ nDepth: params.nDepth ?? 0,
1415
  noWarmup: !!params.noWarmup,
1416
  };
1417
 
 
1465
  const nReps = Math.max(1, state.iterations || DEFAULT_ITERATIONS);
1466
  const nPrompt = Math.max(0, state.nPrompt ?? DEFAULT_N_PROMPT);
1467
  const nGen = Math.max(0, state.nGen ?? DEFAULT_N_GEN);
1468
+ const nDepth = Math.max(0, state.nDepth ?? DEFAULT_N_DEPTH);
1469
+ // Per-test n_ctx mirrors llama-bench (line 1211 of
1470
+ // tools/llama-bench/llama-bench.cpp): sized to fit prompt+gen+depth so a
1471
+ // raised depth doesn't silently overflow the cache.
1472
+ const nCtxFor = (depth) => Math.max(DEFAULT_N_CTX, nPrompt + nGen + depth);
1473
  // Phase toggles from the run page. Combined effect:
1474
  // skip both → only GPU perf, no CPU pass at all
1475
  // skip consistency → CPU perf baseline + GPU perf, no token-id check
 
1497
  refTokenIds: null,
1498
  nPrompt: runCpuPerf ? nPrompt : 0,
1499
  nGen: runCpuPerf ? nGen : 0,
1500
+ // CPU baseline keeps depth=0 — its job is reference-token capture
1501
+ // and a single-rep perf comparator, not depth-loaded sweeping.
1502
+ nDepth: 0,
1503
  nReps: 1,
1504
+ nCtx: nCtxFor(0),
1505
  nGpuLayers: 0,
1506
  }, {
1507
  onStatus: (status, msg, sinceMs) => row.setStatus(`cpu/${status}`, msg, sinceMs),
 
1545
  refTokenIds: refTokenIds || null,
1546
  nPrompt,
1547
  nGen,
1548
+ nDepth,
1549
  nReps,
1550
+ nCtx: nCtxFor(nDepth),
1551
  nGpuLayers: DEFAULT_N_GPU_LAYERS,
1552
  }, {
1553
  onStatus: (s, m, sinceMs) => row.setStatus(`gpu/${s}`, m, sinceMs),
 
1626
  nPredict: DEFAULT_N_PREDICT,
1627
  nPrompt: gpu?.metrics?.n_prompt ?? 0,
1628
  nGen: gpu?.metrics?.n_gen ?? 0,
1629
+ nDepth: gpu?.metrics?.n_depth ?? 0,
1630
  nReps: gpu?.metrics?.n_reps ?? 0,
1631
  nGpuLayers: DEFAULT_N_GPU_LAYERS,
1632
  timestamp: new Date().toISOString(),
run.html CHANGED
@@ -231,6 +231,10 @@
231
  <label class="filter-label" for="n-gen-input">Gen tokens (-n)</label>
232
  <input type="number" id="n-gen-input" class="filter-select run-iter-input" value="128" min="0" max="4096" step="1">
233
  </div>
 
 
 
 
234
  <div class="filter-group">
235
  <label class="filter-label" for="iterations-input">Reps (-r)</label>
236
  <input type="number" id="iterations-input" class="filter-select run-iter-input" value="5" min="1" max="50" step="1">
 
231
  <label class="filter-label" for="n-gen-input">Gen tokens (-n)</label>
232
  <input type="number" id="n-gen-input" class="filter-select run-iter-input" value="128" min="0" max="4096" step="1">
233
  </div>
234
+ <div class="filter-group">
235
+ <label class="filter-label" for="n-depth-input">KV depth (-d)</label>
236
+ <input type="number" id="n-depth-input" class="filter-select run-iter-input" value="2048" min="0" max="32768" step="1">
237
+ </div>
238
  <div class="filter-group">
239
  <label class="filter-label" for="iterations-input">Reps (-r)</label>
240
  <input type="number" id="iterations-input" class="filter-select run-iter-input" value="5" min="1" max="50" step="1">