Spaces:

abhijitramesh
/

webgpu-bench

Running

App Files Files Community

GitHub Actions committed 28 days ago

Commit

e2ac5c3

1 Parent(s): f8195e9

sync from abhijitramesh/webgpu-bench@d35922fe12

Browse files

Files changed (4) hide show

js/app.js +11 -7
js/charts.js +14 -6
js/data.js +76 -0
js/tables.js +54 -17

js/app.js CHANGED Viewed

@@ -1,4 +1,4 @@
-import { loadData, filterResults, selectBestResults, expandCpuRows, attachCpuBaselineFromCpuRecords } from './data.js';
 import { initFilters, populateQuantOptions, getFilters, resetFilters } from './filters.js';
 import { renderDecodeChart, renderPrefillChart, renderSizeChart, renderMachineChart, renderCpuGpuChart, renderSpeedupChart } from './charts.js';
 import { renderResultsTable, renderErrorTable, renderMachineInfo, renderCpuGpuTable } from './tables.js';
@@ -78,11 +78,14 @@ function render() {
   const filters = getFilters();
   // Filter, attach CPU baseline values (folds CLI-flow CPU records onto
   // their GPU sibling so both submission paths produce one row per cell),
-  // collapse to one canonical row per (machine, browser, model, variant,
-  // backend), then drop the now-redundant CPU rows. The CPU numbers stay
-  // visible via the cpu_baseline_* columns on each GPU row.
   const filtered = selectBestResults(
-    attachCpuBaselineFromCpuRecords(filterResults(appData.results, filters)),
   ).filter(r => r.nGpuLayers !== 0);
   // Summary cards — counts tween from previous value to new on filter changes
@@ -203,8 +206,9 @@ function renderHeroMeta(data) {
   // Hero stat: top decode tok/s with machine + model context. Uses the
   // canonical set (best per cell) so a noisy 1-iteration outlier can't
-  // hijack the headline number.
-  const canonical = selectBestResults(data?.results || []);
   const passed = canonical.filter(r => r.status === 'done' && r.decode_tok_s != null);
   const heroStatEl = document.getElementById('hero-stat');
   const heroNumEl = document.getElementById('hero-top-decode');

+import { loadData, filterResults, selectBestResults, expandCpuRows, attachCpuBaselineFromCpuRecords, mergeDepthPairs } from './data.js';
 import { initFilters, populateQuantOptions, getFilters, resetFilters } from './filters.js';
 import { renderDecodeChart, renderPrefillChart, renderSizeChart, renderMachineChart, renderCpuGpuChart, renderSpeedupChart } from './charts.js';
 import { renderResultsTable, renderErrorTable, renderMachineInfo, renderCpuGpuTable } from './tables.js';
   const filters = getFilters();
   // Filter, attach CPU baseline values (folds CLI-flow CPU records onto
   // their GPU sibling so both submission paths produce one row per cell),
+  // fold the (d=0, d=N) study pair into a single GPU row carrying both
+  // depths, collapse to one canonical row per (machine, browser, model,
+  // variant, backend), then drop the now-redundant CPU rows. CPU numbers
+  // stay visible via the cpu_baseline_* columns on each GPU row.
   const filtered = selectBestResults(
+    mergeDepthPairs(
+      attachCpuBaselineFromCpuRecords(filterResults(appData.results, filters)),
+    ),
   ).filter(r => r.nGpuLayers !== 0);
   // Summary cards — counts tween from previous value to new on filter changes
   // Hero stat: top decode tok/s with machine + model context. Uses the
   // canonical set (best per cell) so a noisy 1-iteration outlier can't
+  // hijack the headline number. Depth-merge first so a Study cell counts
+  // once at its d=N number, not twice.
+  const canonical = selectBestResults(mergeDepthPairs(data?.results || []));
   const passed = canonical.filter(r => r.status === 'done' && r.decode_tok_s != null);
   const heroStatEl = document.getElementById('hero-stat');
   const heroNumEl = document.getElementById('hero-top-decode');

js/charts.js CHANGED Viewed

@@ -250,16 +250,23 @@ function avgBy(items, field) {
   return vals.length ? vals.reduce((a, b) => a + b, 0) / vals.length : null;
 }
 export function renderCpuGpuChart(results, metric = 'decode_tok_s') {
   const canvasId = 'chart-cpu-gpu';
   destroyChart(canvasId);
   const canvas = document.getElementById(canvasId);
   if (!canvas) return;
   const passed = results.filter(r => r.status === 'done');
   // expandCpuRows folds in cpu_baseline_* from browser-flow GPU records.
   const cpuResults = expandCpuRows(passed).filter(r => r[metric] != null);
-  const gpuResults = passed.filter(r => r.nGpuLayers !== 0 && r[metric] != null);
   if (cpuResults.length === 0 || gpuResults.length === 0) {
     showEmptyState(canvas, cpuResults.length === 0 ? 'No CPU baseline data in current filter' : 'No GPU data in current filter');
@@ -286,7 +293,7 @@ export function renderCpuGpuChart(results, metric = 'decode_tok_s') {
     return {
       label: browser,
       backgroundColor: BROWSER_COLORS[browser] || '#888',
-      data: allQuants.map(q => avgBy(byVariant[q] || [], metric)),
     };
   });
@@ -298,7 +305,7 @@ export function renderCpuGpuChart(results, metric = 'decode_tok_s') {
       responsive: true,
       maintainAspectRatio: false,
       plugins: {
-        title: titleConfig(`CPU vs WebGPU: ${metricLabel}`),
         legend: darkLegend(),
         tooltip: { callbacks: { label: ctx => `${ctx.dataset.label}: ${formatTokS(ctx.raw)} tok/s` } },
       },
@@ -313,9 +320,10 @@ export function renderSpeedupChart(results, metric = 'decode_tok_s') {
   const canvas = document.getElementById(canvasId);
   if (!canvas) return;
   const passed = results.filter(r => r.status === 'done');
   const cpuResults = expandCpuRows(passed).filter(r => r[metric] != null);
-  const gpuResults = passed.filter(r => r.nGpuLayers !== 0 && r[metric] != null);
   if (cpuResults.length === 0 || gpuResults.length === 0) {
     showEmptyState(canvas, cpuResults.length === 0 ? 'No CPU baseline data in current filter' : 'No GPU data in current filter');
@@ -345,7 +353,7 @@ export function renderSpeedupChart(results, metric = 'decode_tok_s') {
       backgroundColor: BROWSER_COLORS[browser] || '#888',
       data: allQuants.map(q => {
         const cpuAvg = cpuAvgByVariant[q];
-        const gpuAvg = avgBy(byVariant[q] || [], metric);
         return cpuAvg && gpuAvg ? gpuAvg / cpuAvg : null;
       }),
     };
@@ -371,7 +379,7 @@ export function renderSpeedupChart(results, metric = 'decode_tok_s') {
       responsive: true,
       maintainAspectRatio: false,
       plugins: {
-        title: titleConfig(`WebGPU Speedup over CPU (${metricLabel})`),
         legend: {
           ...darkLegend(),
           labels: { ...darkLegend().labels, filter: item => item.text !== '1\u00d7' },

   return vals.length ? vals.reduce((a, b) => a + b, 0) / vals.length : null;
 }
+// CPU is pinned to d=0 by the runner, so apples-to-apples means reading
+// GPU's d=0 number. The CPU side keeps its bare metric (CPU records are
+// depth-pinned to 0 either way); GPU reads `<metric>_d0`. Plain-Run
+// records that only measured d=N have null `_d0` and silently drop out.
+function gpuDepthField(metric) { return `${metric}_d0`; } // maps a bare metric key to its d=0 variant, e.g. decode_tok_s -> decode_tok_s_d0
 export function renderCpuGpuChart(results, metric = 'decode_tok_s') {
   const canvasId = 'chart-cpu-gpu';
   destroyChart(canvasId);
   const canvas = document.getElementById(canvasId);
   if (!canvas) return;
+  const gpuMetric = gpuDepthField(metric);
   const passed = results.filter(r => r.status === 'done');
   // expandCpuRows folds in cpu_baseline_* from browser-flow GPU records.
   const cpuResults = expandCpuRows(passed).filter(r => r[metric] != null);
+  const gpuResults = passed.filter(r => r.nGpuLayers !== 0 && r[gpuMetric] != null);
   if (cpuResults.length === 0 || gpuResults.length === 0) {
     showEmptyState(canvas, cpuResults.length === 0 ? 'No CPU baseline data in current filter' : 'No GPU data in current filter');
     return {
       label: browser,
       backgroundColor: BROWSER_COLORS[browser] || '#888',
+      data: allQuants.map(q => avgBy(byVariant[q] || [], gpuMetric)),
     };
   });
       responsive: true,
       maintainAspectRatio: false,
       plugins: {
+        title: titleConfig(`CPU vs WebGPU: ${metricLabel} @ d0`),
         legend: darkLegend(),
         tooltip: { callbacks: { label: ctx => `${ctx.dataset.label}: ${formatTokS(ctx.raw)} tok/s` } },
       },
   const canvas = document.getElementById(canvasId);
   if (!canvas) return;
+  const gpuMetric = gpuDepthField(metric);
   const passed = results.filter(r => r.status === 'done');
   const cpuResults = expandCpuRows(passed).filter(r => r[metric] != null);
+  const gpuResults = passed.filter(r => r.nGpuLayers !== 0 && r[gpuMetric] != null);
   if (cpuResults.length === 0 || gpuResults.length === 0) {
     showEmptyState(canvas, cpuResults.length === 0 ? 'No CPU baseline data in current filter' : 'No GPU data in current filter');
       backgroundColor: BROWSER_COLORS[browser] || '#888',
       data: allQuants.map(q => {
         const cpuAvg = cpuAvgByVariant[q];
+        const gpuAvg = avgBy(byVariant[q] || [], gpuMetric);
         return cpuAvg && gpuAvg ? gpuAvg / cpuAvg : null;
       }),
     };
       responsive: true,
       maintainAspectRatio: false,
       plugins: {
+        title: titleConfig(`WebGPU Speedup over CPU (${metricLabel} @ d0)`),
         legend: {
           ...darkLegend(),
           labels: { ...darkLegend().labels, filter: item => item.text !== '1\u00d7' },

js/data.js CHANGED Viewed

@@ -128,6 +128,82 @@ function writeSessionCache(data) {
   } catch { /* quota or disabled */ }
 }
 /* Reduce a flat result set down to one canonical row per
    (machineSlug, browser, model, variant, backend) cell. Picks the row with
    the most iterations; ties break on latest timestamp. This is the

   } catch { /* quota or disabled */ }
 }
+/* Fold the (d=0, d=N) GPU record pair that Run Study emits per variant
+   into a single dashboard row. The d=N record stays canonical
+   (`decode_tok_s` / `prefill_tok_s` keep the depth-loaded numbers) so
+   existing chart/table consumers keep working unchanged; a new pair of
+   `_d0` / `_dN` suffix fields lets depth-aware code pick a specific pass.
+   CPU records (nGpuLayers === 0) are pinned to d=0 by the runner, so
+   they pass through untouched. Cells with only one half of the pair
+   (plain Run, pre-study data, or a partial study) lift their values into
+   the suffix field on the side that exists, leaving the other side null
+   — so consumers can render `—` without having to know the record's
+   history.
+   Bucketing: a record whose `n_depth` is missing or 0 lands in the d0
+   bucket; every other depth shares the single dN bucket, so a cell that
+   was measured at several non-zero depths keeps only one of them.
+   Within each cell, duplicate records per depth bucket are resolved by
+   isStrongerRecord: the record with more iterations wins, and ties go
+   to the latest timestamp (mirroring selectBestResults), so multiple
+   study runs of the same variant collapse cleanly.
+   The cell key is (machineSlug, browser, model, variant).
+   Returns a new array: all CPU rows first (in input order), followed by
+   one merged row per GPU cell. Input records are not mutated; merged
+   rows are shallow copies.
+   Run AFTER attachCpuBaselineFromCpuRecords (which keys on the
+   depth-independent (machine, browser, model, variant) tuple) and
+   BEFORE selectBestResults (CPU rows still need cell-dedup; GPU rows
+   are already deduped here). */
+export function mergeDepthPairs(records) {
+  const cells = new Map(); // cellKey -> { d0, dN } holding the strongest record seen per depth bucket
+  const cpuRows = [];
+  for (const r of records) {
+    if (r.nGpuLayers === 0) { // CPU record: no depth pair to fold, pass through as-is
+      cpuRows.push(r);
+      continue;
+    }
+    const cellKey = `${r.machineSlug}|${r.browser}|${r.model}|${r.variant}`;
+    const bucket = (r.n_depth ?? 0) === 0 ? 'd0' : 'dN'; // missing n_depth counts as depth 0
+    const slot = cells.get(cellKey) || { d0: null, dN: null };
+    if (!slot[bucket] || isStrongerRecord(r, slot[bucket])) slot[bucket] = r; // keep the incumbent unless r is strictly stronger
+    cells.set(cellKey, slot);
+  }
+  const merged = [...cpuRows];
+  for (const { d0, dN } of cells.values()) {
+    if (d0 && dN) merged.push(joinDepthPair(d0, dN)); // full study pair
+    else if (dN) merged.push(liftSingleDepth(dN, 'dN')); // only the depth-loaded pass exists
+    else if (d0) merged.push(liftSingleDepth(d0, 'd0')); // only the d=0 pass exists
+  }
+  return merged;
+}
+function isStrongerRecord(a, b) { // returns true when record a should replace record b within a depth bucket
+  const ai = a.iterations ?? 0; // a missing iterations count is treated as 0
+  const bi = b.iterations ?? 0;
+  if (ai !== bi) return ai > bi; // more iterations wins outright
+  return (a.timestamp || '') > (b.timestamp || ''); // tie-break: strictly later timestamp wins, equal timestamps keep b; assumes timestamps order correctly under > (e.g. ISO-8601 strings) — TODO confirm
+}
+const DEPTH_PERF_FIELDS = [ // per-pass fields that joinDepthPair and liftSingleDepth copy into <field>_d0 / <field>_dN
+  'decode_tok_s', 'prefill_tok_s', // tok/s values shown in the tg / pp table columns
+  'decode_stddev_ts', 'prefill_stddev_ts', // stddev shown next to the average in the table cells
+  'pp_test_name', 'tg_test_name', // test labels used as cell tooltips in the results table
+];
+function joinDepthPair(d0, dN) { // merges the d=0 and d=N records of one cell into a single new row
+  const out = { ...dN }; // shallow copy: the d=N record supplies every unsuffixed field, inputs stay unmodified
+  for (const f of DEPTH_PERF_FIELDS) {
+    out[`${f}_d0`] = d0[f] ?? null; // missing or undefined values are normalized to null
+    out[`${f}_dN`] = dN[f] ?? null;
+  }
+  out.n_depth_dN = dN.n_depth ?? null; // the concrete depth N of the depth-loaded pass
+  return out;
+}
+function liftSingleDepth(r, bucket) { // builds a merged-shape row from a cell that has only one depth pass; bucket is the string d0 or dN
+  const out = { ...r }; // shallow copy: unsuffixed fields keep the values of the one record that exists
+  for (const f of DEPTH_PERF_FIELDS) {
+    out[`${f}_d0`] = bucket === 'd0' ? (r[f] ?? null) : null; // only the measured side is populated
+    out[`${f}_dN`] = bucket === 'dN' ? (r[f] ?? null) : null; // the unmeasured side is always null
+  }
+  out.n_depth_dN = bucket === 'dN' ? (r.n_depth ?? null) : null; // null for d0-only rows, so they add no depth value
+  return out;
+}
 /* Reduce a flat result set down to one canonical row per
    (machineSlug, browser, model, variant, backend) cell. Picks the row with
    the most iterations; ties break on latest timestamp. This is the

js/tables.js CHANGED Viewed

@@ -5,7 +5,10 @@ let lastResults = [];
 let sortState = { key: null, dir: 'asc' };
 const NUM_KEYS = new Set([
-  'sizeMB', 'decode_tok_s', 'prefill_tok_s',
   'cpu_baseline_decode_tok_s', 'cpu_baseline_prefill_tok_s',
   'n_eval', 't_eval_ms',
   'n_p_eval', 't_p_eval_ms', 'wallTimeMs', 'consistency_rate',
@@ -64,6 +67,18 @@ export function renderResultsTable(results) {
   const sorted = sortState.key ? sortResults(results, sortState.key, sortState.dir) : results;
   /* priority: 1 = always show; 2 = hide below 640px; 3 = hide below 900px */
   const cols = [
     { key: 'machineSlug', label: 'Machine', priority: 1 },
@@ -75,8 +90,14 @@ export function renderResultsTable(results) {
     { key: 'status', label: 'Status', priority: 1 },
     { key: 'buildType', label: 'Build', priority: 3 },
     { key: 'webgpuAvailable', label: 'WebGPU', priority: 3 },
-    { key: 'decode_tok_s', label: 'tg tok/s', priority: 1 },
-    { key: 'prefill_tok_s', label: 'pp tok/s', priority: 3 },
     { key: 'cpu_baseline_decode_tok_s', label: 'CPU tg tok/s', priority: 2 },
     { key: 'cpu_baseline_prefill_tok_s', label: 'CPU pp tok/s', priority: 3 },
     { key: 'n_eval', label: 'n_eval', priority: 3 },
@@ -97,7 +118,8 @@ export function renderResultsTable(results) {
     const pin = i === 0 ? ' col-pin col-pin-1' : (i === 1 ? ' col-pin col-pin-2' : '');
     const prio = col.priority >= 3 ? ' col-p3' : (col.priority === 2 ? ' col-p2' : '');
     const cls = `sortable${isActive ? ' sorted' : ''}${pin}${prio}`;
-    html += `<th data-key="${col.key}" class="${cls}" aria-sort="${ariaSort}" scope="col" tabindex="0"><span class="th-label">${col.label}</span><span class="th-sort-indicator" aria-hidden="true">${arrowChar}</span></th>`;
   });
   html += '</tr></thead><tbody>';
@@ -123,19 +145,27 @@ export function renderResultsTable(results) {
           break;
         case 'decode_tok_s':
         case 'prefill_tok_s':
         case 'cpu_baseline_decode_tok_s':
         case 'cpu_baseline_prefill_tok_s': {
           // llama-bench style "avg \u00b1 stddev" with the pp{N} / tg{N} test
           // label as a tooltip when the new schema is present. Older records
           // without stddev fall back to the bare avg from formatTokS.
-          const isDecode = col.key === 'decode_tok_s';
-          const isPrefill = col.key === 'prefill_tok_s';
-          const stddev = isDecode ? r.decode_stddev_ts
-            : isPrefill ? r.prefill_stddev_ts
-            : null;
-          const testName = isDecode ? r.tg_test_name
-            : isPrefill ? r.pp_test_name
-            : null;
           const avg = r[col.key];
           let cell;
           if (avg != null && stddev != null) {
@@ -308,9 +338,14 @@ export function renderCpuGpuTable(results) {
   const container = document.getElementById('cpu-gpu-table');
   if (!container) return;
   const METRICS = [
-    { field: 'decode_tok_s', label: 'Decode tok/s' },
-    { field: 'prefill_tok_s', label: 'Prefill tok/s' },
   ];
   const passed = results.filter(r => r.status === 'done');
@@ -353,6 +388,8 @@ export function renderCpuGpuTable(results) {
   // Two-row grouped header: row1 = group labels (CPU, Chromium, …), row2 = metric sub-labels
   // CPU gets colspan = METRICS.length, each GPU browser gets colspan = METRICS.length * 2 (value + speedup per metric)
   const gpuColspan = METRICS.length * 2;
   let html = '<div class="table-card"><div class="results-wrapper"><table class="results-table"><thead>';
   // Row 1: group headers
@@ -387,7 +424,7 @@ export function renderCpuGpuTable(results) {
     // CPU columns
     for (const m of METRICS) {
-      const val = avg(cpuItems, m.field);
       html += `<td><span class="mono">${formatTokS(val)}</span></td>`;
     }
@@ -395,8 +432,8 @@ export function renderCpuGpuTable(results) {
     for (const b of gpuBrowsers) {
       const gpuItems = gpuByBrowser[b] || [];
       for (const m of METRICS) {
-        const cpuVal = avg(cpuItems, m.field);
-        const gpuVal = avg(gpuItems, m.field);
         const speedup = cpuVal && gpuVal ? gpuVal / cpuVal : null;
         const cls = speedup == null ? '' : speedup >= 3 ? 'text-success' : speedup >= 1.5 ? '' : speedup >= 1 ? 'text-muted' : 'text-error';
         html += `<td><span class="mono">${formatTokS(gpuVal)}</span></td>`;

 let sortState = { key: null, dir: 'asc' };
 const NUM_KEYS = new Set([
+  'sizeMB',
+  'decode_tok_s', 'prefill_tok_s',
+  'decode_tok_s_d0', 'decode_tok_s_dN',
+  'prefill_tok_s_d0', 'prefill_tok_s_dN',
   'cpu_baseline_decode_tok_s', 'cpu_baseline_prefill_tok_s',
   'n_eval', 't_eval_ms',
   'n_p_eval', 't_p_eval_ms', 'wallTimeMs', 'consistency_rate',
   const sorted = sortState.key ? sortResults(results, sortState.key, sortState.dir) : results;
+  // Resolve the depth-loaded column label from the data: when every visible
+  // row shares one N (the typical leaderboard case), show that concrete
+  // value (e.g., "@ d2048"). When rows mix depths (someone experimenting
+  // with d=4096 vs d=2048), fall back to the abstract "@ dN" with a tooltip
+  // listing the values present so the user knows the column is mixed.
+  const depthNValues = [...new Set(results.map(r => r.n_depth_dN).filter(v => v != null))]
+    .sort((a, b) => a - b);
+  const dnLabel = depthNValues.length === 1 ? `d${depthNValues[0]}` : 'dN';
+  const dnHeaderTitle = depthNValues.length > 1
+    ? `Mixed depths in view: ${depthNValues.map(v => `d${v}`).join(', ')}`
+    : '';
   /* priority: 1 = always show; 2 = hide below 640px; 3 = hide below 900px */
   const cols = [
     { key: 'machineSlug', label: 'Machine', priority: 1 },
     { key: 'status', label: 'Status', priority: 1 },
     { key: 'buildType', label: 'Build', priority: 3 },
     { key: 'webgpuAvailable', label: 'WebGPU', priority: 3 },
+    // tg / pp split into cold-cache (d=0) and depth-loaded (d=N) columns
+    // so Run Study's depth-pair shows as side-by-side numbers instead of
+    // overwriting one with the other. Pre-study and plain-Run records
+    // populate only the side they actually measured; the other reads `—`.
+    { key: 'decode_tok_s_d0', label: 'tg @ d0', priority: 1 },
+    { key: 'decode_tok_s_dN', label: `tg @ ${dnLabel}`, priority: 1, headerTitle: dnHeaderTitle },
+    { key: 'prefill_tok_s_d0', label: 'pp @ d0', priority: 3 },
+    { key: 'prefill_tok_s_dN', label: `pp @ ${dnLabel}`, priority: 3, headerTitle: dnHeaderTitle },
     { key: 'cpu_baseline_decode_tok_s', label: 'CPU tg tok/s', priority: 2 },
     { key: 'cpu_baseline_prefill_tok_s', label: 'CPU pp tok/s', priority: 3 },
     { key: 'n_eval', label: 'n_eval', priority: 3 },
     const pin = i === 0 ? ' col-pin col-pin-1' : (i === 1 ? ' col-pin col-pin-2' : '');
     const prio = col.priority >= 3 ? ' col-p3' : (col.priority === 2 ? ' col-p2' : '');
     const cls = `sortable${isActive ? ' sorted' : ''}${pin}${prio}`;
+    const titleAttr = col.headerTitle ? ` title="${escapeHtml(col.headerTitle)}"` : '';
+    html += `<th data-key="${col.key}" class="${cls}" aria-sort="${ariaSort}" scope="col" tabindex="0"${titleAttr}><span class="th-label">${col.label}</span><span class="th-sort-indicator" aria-hidden="true">${arrowChar}</span></th>`;
   });
   html += '</tr></thead><tbody>';
           break;
         case 'decode_tok_s':
         case 'prefill_tok_s':
+        case 'decode_tok_s_d0':
+        case 'decode_tok_s_dN':
+        case 'prefill_tok_s_d0':
+        case 'prefill_tok_s_dN':
         case 'cpu_baseline_decode_tok_s':
         case 'cpu_baseline_prefill_tok_s': {
           // llama-bench style "avg \u00b1 stddev" with the pp{N} / tg{N} test
           // label as a tooltip when the new schema is present. Older records
           // without stddev fall back to the bare avg from formatTokS.
+          // Depth-suffixed keys read from the matching `_d0` / `_dN`
+          // stddev + test_name fields produced by mergeDepthPairs.
+          let stddev = null;
+          let testName = null;
+          switch (col.key) {
+            case 'decode_tok_s':       stddev = r.decode_stddev_ts;     testName = r.tg_test_name;       break;
+            case 'prefill_tok_s':      stddev = r.prefill_stddev_ts;    testName = r.pp_test_name;       break;
+            case 'decode_tok_s_d0':    stddev = r.decode_stddev_ts_d0;  testName = r.tg_test_name_d0;    break;
+            case 'decode_tok_s_dN':    stddev = r.decode_stddev_ts_dN;  testName = r.tg_test_name_dN;    break;
+            case 'prefill_tok_s_d0':   stddev = r.prefill_stddev_ts_d0; testName = r.pp_test_name_d0;    break;
+            case 'prefill_tok_s_dN':   stddev = r.prefill_stddev_ts_dN; testName = r.pp_test_name_dN;    break;
+          }
           const avg = r[col.key];
           let cell;
           if (avg != null && stddev != null) {
   const container = document.getElementById('cpu-gpu-table');
   if (!container) return;
+  // CPU is pinned to d=0 by the runner, so the comparison must read GPU's
+  // d=0 number for an apples-to-apples ratio. Plain-Run records that only
+  // measured d=N have null `_d0` and silently drop out of the comparison
+  // — that's the right call: without a cold-cache GPU sample the speedup
+  // ratio would be measuring different workloads.
   const METRICS = [
+    { cpuField: 'decode_tok_s',  gpuField: 'decode_tok_s_d0',  label: 'Decode tok/s @ d0' },
+    { cpuField: 'prefill_tok_s', gpuField: 'prefill_tok_s_d0', label: 'Prefill tok/s @ d0' },
   ];
   const passed = results.filter(r => r.status === 'done');
   // Two-row grouped header: row1 = group labels (CPU, Chromium, …), row2 = metric sub-labels
   // CPU gets colspan = METRICS.length, each GPU browser gets colspan = METRICS.length * 2 (value + speedup per metric)
   const gpuColspan = METRICS.length * 2;
+  // CPU side reads cpuField; GPU side reads gpuField (_d0 for apples-to-
+  // apples). Both labels match the metric's display label.
   let html = '<div class="table-card"><div class="results-wrapper"><table class="results-table"><thead>';
   // Row 1: group headers
     // CPU columns
     for (const m of METRICS) {
+      const val = avg(cpuItems, m.cpuField);
       html += `<td><span class="mono">${formatTokS(val)}</span></td>`;
     }
     for (const b of gpuBrowsers) {
       const gpuItems = gpuByBrowser[b] || [];
       for (const m of METRICS) {
+        const cpuVal = avg(cpuItems, m.cpuField);
+        const gpuVal = avg(gpuItems, m.gpuField);
         const speedup = cpuVal && gpuVal ? gpuVal / cpuVal : null;
         const cls = speedup == null ? '' : speedup >= 3 ? 'text-success' : speedup >= 1.5 ? '' : speedup >= 1 ? 'text-muted' : 'text-error';
         html += `<td><span class="mono">${formatTokS(gpuVal)}</span></td>`;