GitHub Actions committed on
Commit
e2ac5c3
·
1 Parent(s): f8195e9

sync from abhijitramesh/webgpu-bench@d35922fe12

Browse files
Files changed (4) hide show
  1. js/app.js +11 -7
  2. js/charts.js +14 -6
  3. js/data.js +76 -0
  4. js/tables.js +54 -17
js/app.js CHANGED
@@ -1,4 +1,4 @@
1
- import { loadData, filterResults, selectBestResults, expandCpuRows, attachCpuBaselineFromCpuRecords } from './data.js';
2
  import { initFilters, populateQuantOptions, getFilters, resetFilters } from './filters.js';
3
  import { renderDecodeChart, renderPrefillChart, renderSizeChart, renderMachineChart, renderCpuGpuChart, renderSpeedupChart } from './charts.js';
4
  import { renderResultsTable, renderErrorTable, renderMachineInfo, renderCpuGpuTable } from './tables.js';
@@ -78,11 +78,14 @@ function render() {
78
  const filters = getFilters();
79
  // Filter, attach CPU baseline values (folds CLI-flow CPU records onto
80
  // their GPU sibling so both submission paths produce one row per cell),
81
- // collapse to one canonical row per (machine, browser, model, variant,
82
- // backend), then drop the now-redundant CPU rows. The CPU numbers stay
83
- // visible via the cpu_baseline_* columns on each GPU row.
 
84
  const filtered = selectBestResults(
85
- attachCpuBaselineFromCpuRecords(filterResults(appData.results, filters)),
 
 
86
  ).filter(r => r.nGpuLayers !== 0);
87
 
88
  // Summary cards — counts tween from previous value to new on filter changes
@@ -203,8 +206,9 @@ function renderHeroMeta(data) {
203
 
204
  // Hero stat: top decode tok/s with machine + model context. Uses the
205
  // canonical set (best per cell) so a noisy 1-iteration outlier can't
206
- // hijack the headline number.
207
- const canonical = selectBestResults(data?.results || []);
 
208
  const passed = canonical.filter(r => r.status === 'done' && r.decode_tok_s != null);
209
  const heroStatEl = document.getElementById('hero-stat');
210
  const heroNumEl = document.getElementById('hero-top-decode');
 
1
+ import { loadData, filterResults, selectBestResults, expandCpuRows, attachCpuBaselineFromCpuRecords, mergeDepthPairs } from './data.js';
2
  import { initFilters, populateQuantOptions, getFilters, resetFilters } from './filters.js';
3
  import { renderDecodeChart, renderPrefillChart, renderSizeChart, renderMachineChart, renderCpuGpuChart, renderSpeedupChart } from './charts.js';
4
  import { renderResultsTable, renderErrorTable, renderMachineInfo, renderCpuGpuTable } from './tables.js';
 
78
  const filters = getFilters();
79
  // Filter, attach CPU baseline values (folds CLI-flow CPU records onto
80
  // their GPU sibling so both submission paths produce one row per cell),
81
+ // fold the (d=0, d=N) study pair into a single GPU row carrying both
82
+ // depths, collapse to one canonical row per (machine, browser, model,
83
+ // variant, backend), then drop the now-redundant CPU rows. CPU numbers
84
+ // stay visible via the cpu_baseline_* columns on each GPU row.
85
  const filtered = selectBestResults(
86
+ mergeDepthPairs(
87
+ attachCpuBaselineFromCpuRecords(filterResults(appData.results, filters)),
88
+ ),
89
  ).filter(r => r.nGpuLayers !== 0);
90
 
91
  // Summary cards — counts tween from previous value to new on filter changes
 
206
 
207
  // Hero stat: top decode tok/s with machine + model context. Uses the
208
  // canonical set (best per cell) so a noisy 1-iteration outlier can't
209
+ // hijack the headline number. Depth-merge first so a Study cell counts
210
+ // once at its d=N number, not twice.
211
+ const canonical = selectBestResults(mergeDepthPairs(data?.results || []));
212
  const passed = canonical.filter(r => r.status === 'done' && r.decode_tok_s != null);
213
  const heroStatEl = document.getElementById('hero-stat');
214
  const heroNumEl = document.getElementById('hero-top-decode');
js/charts.js CHANGED
@@ -250,16 +250,23 @@ function avgBy(items, field) {
250
  return vals.length ? vals.reduce((a, b) => a + b, 0) / vals.length : null;
251
  }
252
 
 
 
 
 
 
 
253
  export function renderCpuGpuChart(results, metric = 'decode_tok_s') {
254
  const canvasId = 'chart-cpu-gpu';
255
  destroyChart(canvasId);
256
  const canvas = document.getElementById(canvasId);
257
  if (!canvas) return;
258
 
 
259
  const passed = results.filter(r => r.status === 'done');
260
  // expandCpuRows folds in cpu_baseline_* from browser-flow GPU records.
261
  const cpuResults = expandCpuRows(passed).filter(r => r[metric] != null);
262
- const gpuResults = passed.filter(r => r.nGpuLayers !== 0 && r[metric] != null);
263
 
264
  if (cpuResults.length === 0 || gpuResults.length === 0) {
265
  showEmptyState(canvas, cpuResults.length === 0 ? 'No CPU baseline data in current filter' : 'No GPU data in current filter');
@@ -286,7 +293,7 @@ export function renderCpuGpuChart(results, metric = 'decode_tok_s') {
286
  return {
287
  label: browser,
288
  backgroundColor: BROWSER_COLORS[browser] || '#888',
289
- data: allQuants.map(q => avgBy(byVariant[q] || [], metric)),
290
  };
291
  });
292
 
@@ -298,7 +305,7 @@ export function renderCpuGpuChart(results, metric = 'decode_tok_s') {
298
  responsive: true,
299
  maintainAspectRatio: false,
300
  plugins: {
301
- title: titleConfig(`CPU vs WebGPU: ${metricLabel}`),
302
  legend: darkLegend(),
303
  tooltip: { callbacks: { label: ctx => `${ctx.dataset.label}: ${formatTokS(ctx.raw)} tok/s` } },
304
  },
@@ -313,9 +320,10 @@ export function renderSpeedupChart(results, metric = 'decode_tok_s') {
313
  const canvas = document.getElementById(canvasId);
314
  if (!canvas) return;
315
 
 
316
  const passed = results.filter(r => r.status === 'done');
317
  const cpuResults = expandCpuRows(passed).filter(r => r[metric] != null);
318
- const gpuResults = passed.filter(r => r.nGpuLayers !== 0 && r[metric] != null);
319
 
320
  if (cpuResults.length === 0 || gpuResults.length === 0) {
321
  showEmptyState(canvas, cpuResults.length === 0 ? 'No CPU baseline data in current filter' : 'No GPU data in current filter');
@@ -345,7 +353,7 @@ export function renderSpeedupChart(results, metric = 'decode_tok_s') {
345
  backgroundColor: BROWSER_COLORS[browser] || '#888',
346
  data: allQuants.map(q => {
347
  const cpuAvg = cpuAvgByVariant[q];
348
- const gpuAvg = avgBy(byVariant[q] || [], metric);
349
  return cpuAvg && gpuAvg ? gpuAvg / cpuAvg : null;
350
  }),
351
  };
@@ -371,7 +379,7 @@ export function renderSpeedupChart(results, metric = 'decode_tok_s') {
371
  responsive: true,
372
  maintainAspectRatio: false,
373
  plugins: {
374
- title: titleConfig(`WebGPU Speedup over CPU (${metricLabel})`),
375
  legend: {
376
  ...darkLegend(),
377
  labels: { ...darkLegend().labels, filter: item => item.text !== '1\u00d7' },
 
250
  return vals.length ? vals.reduce((a, b) => a + b, 0) / vals.length : null;
251
  }
252
 
253
+ // CPU is pinned to d=0 by the runner, so apples-to-apples means reading
254
+ // GPU's d=0 number. The CPU side keeps its bare metric (CPU records are
255
+ // depth-pinned to 0 either way); GPU reads `<metric>_d0`. Plain-Run
256
+ // records that only measured d=N have null `_d0` and silently drop out.
257
// Returns the `_d0`-suffixed field name for `metric` (e.g. 'decode_tok_s' -> 'decode_tok_s_d0').
+ function gpuDepthField(metric) { return `${metric}_d0`; }
258
+
259
  export function renderCpuGpuChart(results, metric = 'decode_tok_s') {
260
  const canvasId = 'chart-cpu-gpu';
261
  destroyChart(canvasId);
262
  const canvas = document.getElementById(canvasId);
263
  if (!canvas) return;
264
 
265
+ const gpuMetric = gpuDepthField(metric);
266
  const passed = results.filter(r => r.status === 'done');
267
  // expandCpuRows folds in cpu_baseline_* from browser-flow GPU records.
268
  const cpuResults = expandCpuRows(passed).filter(r => r[metric] != null);
269
+ const gpuResults = passed.filter(r => r.nGpuLayers !== 0 && r[gpuMetric] != null);
270
 
271
  if (cpuResults.length === 0 || gpuResults.length === 0) {
272
  showEmptyState(canvas, cpuResults.length === 0 ? 'No CPU baseline data in current filter' : 'No GPU data in current filter');
 
293
  return {
294
  label: browser,
295
  backgroundColor: BROWSER_COLORS[browser] || '#888',
296
+ data: allQuants.map(q => avgBy(byVariant[q] || [], gpuMetric)),
297
  };
298
  });
299
 
 
305
  responsive: true,
306
  maintainAspectRatio: false,
307
  plugins: {
308
+ title: titleConfig(`CPU vs WebGPU: ${metricLabel} @ d0`),
309
  legend: darkLegend(),
310
  tooltip: { callbacks: { label: ctx => `${ctx.dataset.label}: ${formatTokS(ctx.raw)} tok/s` } },
311
  },
 
320
  const canvas = document.getElementById(canvasId);
321
  if (!canvas) return;
322
 
323
+ const gpuMetric = gpuDepthField(metric);
324
  const passed = results.filter(r => r.status === 'done');
325
  const cpuResults = expandCpuRows(passed).filter(r => r[metric] != null);
326
+ const gpuResults = passed.filter(r => r.nGpuLayers !== 0 && r[gpuMetric] != null);
327
 
328
  if (cpuResults.length === 0 || gpuResults.length === 0) {
329
  showEmptyState(canvas, cpuResults.length === 0 ? 'No CPU baseline data in current filter' : 'No GPU data in current filter');
 
353
  backgroundColor: BROWSER_COLORS[browser] || '#888',
354
  data: allQuants.map(q => {
355
  const cpuAvg = cpuAvgByVariant[q];
356
+ const gpuAvg = avgBy(byVariant[q] || [], gpuMetric);
357
  return cpuAvg && gpuAvg ? gpuAvg / cpuAvg : null;
358
  }),
359
  };
 
379
  responsive: true,
380
  maintainAspectRatio: false,
381
  plugins: {
382
+ title: titleConfig(`WebGPU Speedup over CPU (${metricLabel} @ d0)`),
383
  legend: {
384
  ...darkLegend(),
385
  labels: { ...darkLegend().labels, filter: item => item.text !== '1\u00d7' },
js/data.js CHANGED
@@ -128,6 +128,82 @@ function writeSessionCache(data) {
128
  } catch { /* quota or disabled */ }
129
  }
130
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
  /* Reduce a flat result set down to one canonical row per
132
  (machineSlug, browser, model, variant, backend) cell. Picks the row with
133
  the most iterations; ties break on latest timestamp. This is the
 
128
  } catch { /* quota or disabled */ }
129
  }
130
 
131
/**
 * Fold the (d=0, d=N) GPU record pair that Run Study emits per variant into
 * a single dashboard row.
 *
 * The d=N record stays canonical (`decode_tok_s` / `prefill_tok_s` keep the
 * depth-loaded numbers) so existing chart/table consumers keep working
 * unchanged; a new pair of `_d0` / `_dN` suffix fields lets depth-aware code
 * pick a specific pass.
 *
 * CPU records (`nGpuLayers === 0`) are pinned to d=0 by the runner, so they
 * pass through untouched and are emitted first, in input order. Cells with
 * only one half of the pair (plain Run, pre-study data, or a partial study)
 * lift their values into the suffix field on the side that exists, leaving
 * the other side null — so consumers can render `—` without having to know
 * the record's history.
 *
 * Within each cell, duplicate records per depth bucket are tie-broken: the
 * record with the most iterations wins, and equal iteration counts fall back
 * to the latest timestamp (mirroring selectBestResults), so multiple study
 * runs of the same variant collapse cleanly.
 *
 * Run AFTER attachCpuBaselineFromCpuRecords (which keys on the
 * depth-independent (machine, browser, model, variant) tuple) and BEFORE
 * selectBestResults (CPU rows still need cell-dedup; GPU rows are already
 * deduped here).
 *
 * @param {Array<object>|null|undefined} records - Flat result records; a
 *   nullish value is treated as an empty list.
 * @returns {Array<object>} CPU rows followed by one merged row per GPU cell.
 *   Input records are never mutated; GPU rows are shallow copies.
 */
export function mergeDepthPairs(records) {
  const cells = new Map();
  const cpuRows = [];
  for (const r of records ?? []) {
    if (r.nGpuLayers === 0) {
      cpuRows.push(r);
      continue;
    }
    // Serialise the tuple instead of joining with a delimiter, so a field
    // that itself contains the delimiter cannot make two distinct cells
    // collide. String() matches the coercion a template literal would apply.
    const cellKey = JSON.stringify(
      [r.machineSlug, r.browser, r.model, r.variant].map(String),
    );
    // A missing n_depth counts as the cold-cache (d=0) pass.
    const bucket = (r.n_depth ?? 0) === 0 ? 'd0' : 'dN';
    let slot = cells.get(cellKey);
    if (!slot) {
      slot = { d0: null, dN: null };
      cells.set(cellKey, slot);
    }
    if (!slot[bucket] || isStrongerRecord(r, slot[bucket])) slot[bucket] = r;
  }
  const merged = [...cpuRows];
  for (const { d0, dN } of cells.values()) {
    if (d0 && dN) merged.push(joinDepthPair(d0, dN));
    else if (dN) merged.push(liftSingleDepth(dN, 'dN'));
    else if (d0) merged.push(liftSingleDepth(d0, 'd0'));
  }
  return merged;
}

/**
 * Decide whether record `a` should replace record `b` in a depth bucket.
 * More iterations wins; on equal iteration counts the greater timestamp
 * wins. Missing iterations count as 0 and missing timestamps as ''.
 *
 * @param {object} a - Candidate record.
 * @param {object} b - Record currently held in the bucket.
 * @returns {boolean} True when `a` is strictly preferred over `b`.
 */
function isStrongerRecord(a, b) {
  const ai = a.iterations ?? 0;
  const bi = b.iterations ?? 0;
  if (ai !== bi) return ai > bi;
  return (a.timestamp || '') > (b.timestamp || '');
}

/* Per-pass fields that get mirrored into `<field>_d0` / `<field>_dN`. */
const DEPTH_PERF_FIELDS = Object.freeze([
  'decode_tok_s', 'prefill_tok_s',
  'decode_stddev_ts', 'prefill_stddev_ts',
  'pp_test_name', 'tg_test_name',
]);

/**
 * Build the merged row for a cell that has both passes. The result is a
 * shallow copy of the d=N record with `_d0` / `_dN` variants of every
 * DEPTH_PERF_FIELDS entry (null when a side lacks the field) and
 * `n_depth_dN` set to the d=N record's depth.
 *
 * @param {object} d0 - The chosen d=0 record.
 * @param {object} dN - The chosen d=N record.
 * @returns {object} New merged row.
 */
function joinDepthPair(d0, dN) {
  const out = { ...dN };
  for (const f of DEPTH_PERF_FIELDS) {
    out[`${f}_d0`] = d0[f] ?? null;
    out[`${f}_dN`] = dN[f] ?? null;
  }
  out.n_depth_dN = dN.n_depth ?? null;
  return out;
}

/**
 * Build the row for a cell that has only one pass. The result is a shallow
 * copy of `r` whose suffix fields are filled on the side named by `bucket`
 * and null on the other; `n_depth_dN` is only set for the 'dN' side.
 *
 * @param {object} r - The single record available for the cell.
 * @param {'d0'|'dN'} bucket - Which pass `r` represents.
 * @returns {object} New row with `_d0` / `_dN` suffix fields.
 */
function liftSingleDepth(r, bucket) {
  const out = { ...r };
  for (const f of DEPTH_PERF_FIELDS) {
    out[`${f}_d0`] = bucket === 'd0' ? (r[f] ?? null) : null;
    out[`${f}_dN`] = bucket === 'dN' ? (r[f] ?? null) : null;
  }
  out.n_depth_dN = bucket === 'dN' ? (r.n_depth ?? null) : null;
  return out;
}
206
+
207
  /* Reduce a flat result set down to one canonical row per
208
  (machineSlug, browser, model, variant, backend) cell. Picks the row with
209
  the most iterations; ties break on latest timestamp. This is the
js/tables.js CHANGED
@@ -5,7 +5,10 @@ let lastResults = [];
5
  let sortState = { key: null, dir: 'asc' };
6
 
7
  const NUM_KEYS = new Set([
8
- 'sizeMB', 'decode_tok_s', 'prefill_tok_s',
 
 
 
9
  'cpu_baseline_decode_tok_s', 'cpu_baseline_prefill_tok_s',
10
  'n_eval', 't_eval_ms',
11
  'n_p_eval', 't_p_eval_ms', 'wallTimeMs', 'consistency_rate',
@@ -64,6 +67,18 @@ export function renderResultsTable(results) {
64
 
65
  const sorted = sortState.key ? sortResults(results, sortState.key, sortState.dir) : results;
66
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  /* priority: 1 = always show; 2 = hide below 640px; 3 = hide below 900px */
68
  const cols = [
69
  { key: 'machineSlug', label: 'Machine', priority: 1 },
@@ -75,8 +90,14 @@ export function renderResultsTable(results) {
75
  { key: 'status', label: 'Status', priority: 1 },
76
  { key: 'buildType', label: 'Build', priority: 3 },
77
  { key: 'webgpuAvailable', label: 'WebGPU', priority: 3 },
78
- { key: 'decode_tok_s', label: 'tg tok/s', priority: 1 },
79
- { key: 'prefill_tok_s', label: 'pp tok/s', priority: 3 },
 
 
 
 
 
 
80
  { key: 'cpu_baseline_decode_tok_s', label: 'CPU tg tok/s', priority: 2 },
81
  { key: 'cpu_baseline_prefill_tok_s', label: 'CPU pp tok/s', priority: 3 },
82
  { key: 'n_eval', label: 'n_eval', priority: 3 },
@@ -97,7 +118,8 @@ export function renderResultsTable(results) {
97
  const pin = i === 0 ? ' col-pin col-pin-1' : (i === 1 ? ' col-pin col-pin-2' : '');
98
  const prio = col.priority >= 3 ? ' col-p3' : (col.priority === 2 ? ' col-p2' : '');
99
  const cls = `sortable${isActive ? ' sorted' : ''}${pin}${prio}`;
100
- html += `<th data-key="${col.key}" class="${cls}" aria-sort="${ariaSort}" scope="col" tabindex="0"><span class="th-label">${col.label}</span><span class="th-sort-indicator" aria-hidden="true">${arrowChar}</span></th>`;
 
101
  });
102
  html += '</tr></thead><tbody>';
103
 
@@ -123,19 +145,27 @@ export function renderResultsTable(results) {
123
  break;
124
  case 'decode_tok_s':
125
  case 'prefill_tok_s':
 
 
 
 
126
  case 'cpu_baseline_decode_tok_s':
127
  case 'cpu_baseline_prefill_tok_s': {
128
  // llama-bench style "avg \u00b1 stddev" with the pp{N} / tg{N} test
129
  // label as a tooltip when the new schema is present. Older records
130
  // without stddev fall back to the bare avg from formatTokS.
131
- const isDecode = col.key === 'decode_tok_s';
132
- const isPrefill = col.key === 'prefill_tok_s';
133
- const stddev = isDecode ? r.decode_stddev_ts
134
- : isPrefill ? r.prefill_stddev_ts
135
- : null;
136
- const testName = isDecode ? r.tg_test_name
137
- : isPrefill ? r.pp_test_name
138
- : null;
 
 
 
 
139
  const avg = r[col.key];
140
  let cell;
141
  if (avg != null && stddev != null) {
@@ -308,9 +338,14 @@ export function renderCpuGpuTable(results) {
308
  const container = document.getElementById('cpu-gpu-table');
309
  if (!container) return;
310
 
 
 
 
 
 
311
  const METRICS = [
312
- { field: 'decode_tok_s', label: 'Decode tok/s' },
313
- { field: 'prefill_tok_s', label: 'Prefill tok/s' },
314
  ];
315
 
316
  const passed = results.filter(r => r.status === 'done');
@@ -353,6 +388,8 @@ export function renderCpuGpuTable(results) {
353
  // Two-row grouped header: row1 = group labels (CPU, Chromium, …), row2 = metric sub-labels
354
  // CPU gets colspan = METRICS.length, each GPU browser gets colspan = METRICS.length * 2 (value + speedup per metric)
355
  const gpuColspan = METRICS.length * 2;
 
 
356
  let html = '<div class="table-card"><div class="results-wrapper"><table class="results-table"><thead>';
357
 
358
  // Row 1: group headers
@@ -387,7 +424,7 @@ export function renderCpuGpuTable(results) {
387
 
388
  // CPU columns
389
  for (const m of METRICS) {
390
- const val = avg(cpuItems, m.field);
391
  html += `<td><span class="mono">${formatTokS(val)}</span></td>`;
392
  }
393
 
@@ -395,8 +432,8 @@ export function renderCpuGpuTable(results) {
395
  for (const b of gpuBrowsers) {
396
  const gpuItems = gpuByBrowser[b] || [];
397
  for (const m of METRICS) {
398
- const cpuVal = avg(cpuItems, m.field);
399
- const gpuVal = avg(gpuItems, m.field);
400
  const speedup = cpuVal && gpuVal ? gpuVal / cpuVal : null;
401
  const cls = speedup == null ? '' : speedup >= 3 ? 'text-success' : speedup >= 1.5 ? '' : speedup >= 1 ? 'text-muted' : 'text-error';
402
  html += `<td><span class="mono">${formatTokS(gpuVal)}</span></td>`;
 
5
  let sortState = { key: null, dir: 'asc' };
6
 
7
  const NUM_KEYS = new Set([
8
+ 'sizeMB',
9
+ 'decode_tok_s', 'prefill_tok_s',
10
+ 'decode_tok_s_d0', 'decode_tok_s_dN',
11
+ 'prefill_tok_s_d0', 'prefill_tok_s_dN',
12
  'cpu_baseline_decode_tok_s', 'cpu_baseline_prefill_tok_s',
13
  'n_eval', 't_eval_ms',
14
  'n_p_eval', 't_p_eval_ms', 'wallTimeMs', 'consistency_rate',
 
67
 
68
  const sorted = sortState.key ? sortResults(results, sortState.key, sortState.dir) : results;
69
 
70
+ // Resolve the depth-loaded column label from the data: when every visible
71
+ // row shares one N (the typical leaderboard case), show that concrete
72
+ // value (e.g., "@ d2048"). When rows mix depths (someone experimenting
73
+ // with d=4096 vs d=2048), fall back to the abstract "@ dN" with a tooltip
74
+ // listing the values present so the user knows the column is mixed.
75
+ const depthNValues = [...new Set(results.map(r => r.n_depth_dN).filter(v => v != null))]
76
+ .sort((a, b) => a - b);
77
+ const dnLabel = depthNValues.length === 1 ? `d${depthNValues[0]}` : 'dN';
78
+ const dnHeaderTitle = depthNValues.length > 1
79
+ ? `Mixed depths in view: ${depthNValues.map(v => `d${v}`).join(', ')}`
80
+ : '';
81
+
82
  /* priority: 1 = always show; 2 = hide below 640px; 3 = hide below 900px */
83
  const cols = [
84
  { key: 'machineSlug', label: 'Machine', priority: 1 },
 
90
  { key: 'status', label: 'Status', priority: 1 },
91
  { key: 'buildType', label: 'Build', priority: 3 },
92
  { key: 'webgpuAvailable', label: 'WebGPU', priority: 3 },
93
+ // tg / pp split into cold-cache (d=0) and depth-loaded (d=N) columns
94
+ // so Run Study's depth-pair shows as side-by-side numbers instead of
95
+ // overwriting one with the other. Pre-study and plain-Run records
96
+ // populate only the side they actually measured; the other reads `—`.
97
+ { key: 'decode_tok_s_d0', label: 'tg @ d0', priority: 1 },
98
+ { key: 'decode_tok_s_dN', label: `tg @ ${dnLabel}`, priority: 1, headerTitle: dnHeaderTitle },
99
+ { key: 'prefill_tok_s_d0', label: 'pp @ d0', priority: 3 },
100
+ { key: 'prefill_tok_s_dN', label: `pp @ ${dnLabel}`, priority: 3, headerTitle: dnHeaderTitle },
101
  { key: 'cpu_baseline_decode_tok_s', label: 'CPU tg tok/s', priority: 2 },
102
  { key: 'cpu_baseline_prefill_tok_s', label: 'CPU pp tok/s', priority: 3 },
103
  { key: 'n_eval', label: 'n_eval', priority: 3 },
 
118
  const pin = i === 0 ? ' col-pin col-pin-1' : (i === 1 ? ' col-pin col-pin-2' : '');
119
  const prio = col.priority >= 3 ? ' col-p3' : (col.priority === 2 ? ' col-p2' : '');
120
  const cls = `sortable${isActive ? ' sorted' : ''}${pin}${prio}`;
121
+ const titleAttr = col.headerTitle ? ` title="${escapeHtml(col.headerTitle)}"` : '';
122
+ html += `<th data-key="${col.key}" class="${cls}" aria-sort="${ariaSort}" scope="col" tabindex="0"${titleAttr}><span class="th-label">${col.label}</span><span class="th-sort-indicator" aria-hidden="true">${arrowChar}</span></th>`;
123
  });
124
  html += '</tr></thead><tbody>';
125
 
 
145
  break;
146
  case 'decode_tok_s':
147
  case 'prefill_tok_s':
148
+ case 'decode_tok_s_d0':
149
+ case 'decode_tok_s_dN':
150
+ case 'prefill_tok_s_d0':
151
+ case 'prefill_tok_s_dN':
152
  case 'cpu_baseline_decode_tok_s':
153
  case 'cpu_baseline_prefill_tok_s': {
154
  // llama-bench style "avg \u00b1 stddev" with the pp{N} / tg{N} test
155
  // label as a tooltip when the new schema is present. Older records
156
  // without stddev fall back to the bare avg from formatTokS.
157
+ // Depth-suffixed keys read from the matching `_d0` / `_dN`
158
+ // stddev + test_name fields produced by mergeDepthPairs.
159
+ let stddev = null;
160
+ let testName = null;
161
+ switch (col.key) {
162
+ case 'decode_tok_s': stddev = r.decode_stddev_ts; testName = r.tg_test_name; break;
163
+ case 'prefill_tok_s': stddev = r.prefill_stddev_ts; testName = r.pp_test_name; break;
164
+ case 'decode_tok_s_d0': stddev = r.decode_stddev_ts_d0; testName = r.tg_test_name_d0; break;
165
+ case 'decode_tok_s_dN': stddev = r.decode_stddev_ts_dN; testName = r.tg_test_name_dN; break;
166
+ case 'prefill_tok_s_d0': stddev = r.prefill_stddev_ts_d0; testName = r.pp_test_name_d0; break;
167
+ case 'prefill_tok_s_dN': stddev = r.prefill_stddev_ts_dN; testName = r.pp_test_name_dN; break;
168
+ }
169
  const avg = r[col.key];
170
  let cell;
171
  if (avg != null && stddev != null) {
 
338
  const container = document.getElementById('cpu-gpu-table');
339
  if (!container) return;
340
 
341
+ // CPU is pinned to d=0 by the runner, so the comparison must read GPU's
342
+ // d=0 number for an apples-to-apples ratio. Plain-Run records that only
343
+ // measured d=N have null `_d0` and silently drop out of the comparison
344
+ // — that's the right call: without a cold-cache GPU sample the speedup
345
+ // ratio would be measuring different workloads.
346
  const METRICS = [
347
+ { cpuField: 'decode_tok_s', gpuField: 'decode_tok_s_d0', label: 'Decode tok/s @ d0' },
348
+ { cpuField: 'prefill_tok_s', gpuField: 'prefill_tok_s_d0', label: 'Prefill tok/s @ d0' },
349
  ];
350
 
351
  const passed = results.filter(r => r.status === 'done');
 
388
  // Two-row grouped header: row1 = group labels (CPU, Chromium, …), row2 = metric sub-labels
389
  // CPU gets colspan = METRICS.length, each GPU browser gets colspan = METRICS.length * 2 (value + speedup per metric)
390
  const gpuColspan = METRICS.length * 2;
391
+ // CPU side reads cpuField; GPU side reads gpuField (_d0 for apples-to-
392
+ // apples). Both labels match the metric's display label.
393
  let html = '<div class="table-card"><div class="results-wrapper"><table class="results-table"><thead>';
394
 
395
  // Row 1: group headers
 
424
 
425
  // CPU columns
426
  for (const m of METRICS) {
427
+ const val = avg(cpuItems, m.cpuField);
428
  html += `<td><span class="mono">${formatTokS(val)}</span></td>`;
429
  }
430
 
 
432
  for (const b of gpuBrowsers) {
433
  const gpuItems = gpuByBrowser[b] || [];
434
  for (const m of METRICS) {
435
+ const cpuVal = avg(cpuItems, m.cpuField);
436
+ const gpuVal = avg(gpuItems, m.gpuField);
437
  const speedup = cpuVal && gpuVal ? gpuVal / cpuVal : null;
438
  const cls = speedup == null ? '' : speedup >= 3 ? 'text-success' : speedup >= 1.5 ? '' : speedup >= 1 ? 'text-muted' : 'text-error';
439
  html += `<td><span class="mono">${formatTokS(gpuVal)}</span></td>`;