Spaces:
Running
Running
GitHub Actions committed on
Commit ·
e2ac5c3
1
Parent(s): f8195e9
sync from abhijitramesh/webgpu-bench@d35922fe12
Browse files- js/app.js +11 -7
- js/charts.js +14 -6
- js/data.js +76 -0
- js/tables.js +54 -17
js/app.js
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
import { loadData, filterResults, selectBestResults, expandCpuRows, attachCpuBaselineFromCpuRecords } from './data.js';
|
| 2 |
import { initFilters, populateQuantOptions, getFilters, resetFilters } from './filters.js';
|
| 3 |
import { renderDecodeChart, renderPrefillChart, renderSizeChart, renderMachineChart, renderCpuGpuChart, renderSpeedupChart } from './charts.js';
|
| 4 |
import { renderResultsTable, renderErrorTable, renderMachineInfo, renderCpuGpuTable } from './tables.js';
|
|
@@ -78,11 +78,14 @@ function render() {
|
|
| 78 |
const filters = getFilters();
|
| 79 |
// Filter, attach CPU baseline values (folds CLI-flow CPU records onto
|
| 80 |
// their GPU sibling so both submission paths produce one row per cell),
|
| 81 |
-
//
|
| 82 |
-
//
|
| 83 |
-
//
|
|
|
|
| 84 |
const filtered = selectBestResults(
|
| 85 |
-
|
|
|
|
|
|
|
| 86 |
).filter(r => r.nGpuLayers !== 0);
|
| 87 |
|
| 88 |
// Summary cards — counts tween from previous value to new on filter changes
|
|
@@ -203,8 +206,9 @@ function renderHeroMeta(data) {
|
|
| 203 |
|
| 204 |
// Hero stat: top decode tok/s with machine + model context. Uses the
|
| 205 |
// canonical set (best per cell) so a noisy 1-iteration outlier can't
|
| 206 |
-
// hijack the headline number.
|
| 207 |
-
|
|
|
|
| 208 |
const passed = canonical.filter(r => r.status === 'done' && r.decode_tok_s != null);
|
| 209 |
const heroStatEl = document.getElementById('hero-stat');
|
| 210 |
const heroNumEl = document.getElementById('hero-top-decode');
|
|
|
|
| 1 |
+
import { loadData, filterResults, selectBestResults, expandCpuRows, attachCpuBaselineFromCpuRecords, mergeDepthPairs } from './data.js';
|
| 2 |
import { initFilters, populateQuantOptions, getFilters, resetFilters } from './filters.js';
|
| 3 |
import { renderDecodeChart, renderPrefillChart, renderSizeChart, renderMachineChart, renderCpuGpuChart, renderSpeedupChart } from './charts.js';
|
| 4 |
import { renderResultsTable, renderErrorTable, renderMachineInfo, renderCpuGpuTable } from './tables.js';
|
|
|
|
| 78 |
const filters = getFilters();
|
| 79 |
// Filter, attach CPU baseline values (folds CLI-flow CPU records onto
|
| 80 |
// their GPU sibling so both submission paths produce one row per cell),
|
| 81 |
+
// fold the (d=0, d=N) study pair into a single GPU row carrying both
|
| 82 |
+
// depths, collapse to one canonical row per (machine, browser, model,
|
| 83 |
+
// variant, backend), then drop the now-redundant CPU rows. CPU numbers
|
| 84 |
+
// stay visible via the cpu_baseline_* columns on each GPU row.
|
| 85 |
const filtered = selectBestResults(
|
| 86 |
+
mergeDepthPairs(
|
| 87 |
+
attachCpuBaselineFromCpuRecords(filterResults(appData.results, filters)),
|
| 88 |
+
),
|
| 89 |
).filter(r => r.nGpuLayers !== 0);
|
| 90 |
|
| 91 |
// Summary cards — counts tween from previous value to new on filter changes
|
|
|
|
| 206 |
|
| 207 |
// Hero stat: top decode tok/s with machine + model context. Uses the
|
| 208 |
// canonical set (best per cell) so a noisy 1-iteration outlier can't
|
| 209 |
+
// hijack the headline number. Depth-merge first so a Study cell counts
|
| 210 |
+
// once at its d=N number, not twice.
|
| 211 |
+
const canonical = selectBestResults(mergeDepthPairs(data?.results || []));
|
| 212 |
const passed = canonical.filter(r => r.status === 'done' && r.decode_tok_s != null);
|
| 213 |
const heroStatEl = document.getElementById('hero-stat');
|
| 214 |
const heroNumEl = document.getElementById('hero-top-decode');
|
js/charts.js
CHANGED
|
@@ -250,16 +250,23 @@ function avgBy(items, field) {
|
|
| 250 |
return vals.length ? vals.reduce((a, b) => a + b, 0) / vals.length : null;
|
| 251 |
}
|
| 252 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 253 |
export function renderCpuGpuChart(results, metric = 'decode_tok_s') {
|
| 254 |
const canvasId = 'chart-cpu-gpu';
|
| 255 |
destroyChart(canvasId);
|
| 256 |
const canvas = document.getElementById(canvasId);
|
| 257 |
if (!canvas) return;
|
| 258 |
|
|
|
|
| 259 |
const passed = results.filter(r => r.status === 'done');
|
| 260 |
// expandCpuRows folds in cpu_baseline_* from browser-flow GPU records.
|
| 261 |
const cpuResults = expandCpuRows(passed).filter(r => r[metric] != null);
|
| 262 |
-
const gpuResults = passed.filter(r => r.nGpuLayers !== 0 && r[
|
| 263 |
|
| 264 |
if (cpuResults.length === 0 || gpuResults.length === 0) {
|
| 265 |
showEmptyState(canvas, cpuResults.length === 0 ? 'No CPU baseline data in current filter' : 'No GPU data in current filter');
|
|
@@ -286,7 +293,7 @@ export function renderCpuGpuChart(results, metric = 'decode_tok_s') {
|
|
| 286 |
return {
|
| 287 |
label: browser,
|
| 288 |
backgroundColor: BROWSER_COLORS[browser] || '#888',
|
| 289 |
-
data: allQuants.map(q => avgBy(byVariant[q] || [],
|
| 290 |
};
|
| 291 |
});
|
| 292 |
|
|
@@ -298,7 +305,7 @@ export function renderCpuGpuChart(results, metric = 'decode_tok_s') {
|
|
| 298 |
responsive: true,
|
| 299 |
maintainAspectRatio: false,
|
| 300 |
plugins: {
|
| 301 |
-
title: titleConfig(`CPU vs WebGPU: ${metricLabel}`),
|
| 302 |
legend: darkLegend(),
|
| 303 |
tooltip: { callbacks: { label: ctx => `${ctx.dataset.label}: ${formatTokS(ctx.raw)} tok/s` } },
|
| 304 |
},
|
|
@@ -313,9 +320,10 @@ export function renderSpeedupChart(results, metric = 'decode_tok_s') {
|
|
| 313 |
const canvas = document.getElementById(canvasId);
|
| 314 |
if (!canvas) return;
|
| 315 |
|
|
|
|
| 316 |
const passed = results.filter(r => r.status === 'done');
|
| 317 |
const cpuResults = expandCpuRows(passed).filter(r => r[metric] != null);
|
| 318 |
-
const gpuResults = passed.filter(r => r.nGpuLayers !== 0 && r[
|
| 319 |
|
| 320 |
if (cpuResults.length === 0 || gpuResults.length === 0) {
|
| 321 |
showEmptyState(canvas, cpuResults.length === 0 ? 'No CPU baseline data in current filter' : 'No GPU data in current filter');
|
|
@@ -345,7 +353,7 @@ export function renderSpeedupChart(results, metric = 'decode_tok_s') {
|
|
| 345 |
backgroundColor: BROWSER_COLORS[browser] || '#888',
|
| 346 |
data: allQuants.map(q => {
|
| 347 |
const cpuAvg = cpuAvgByVariant[q];
|
| 348 |
-
const gpuAvg = avgBy(byVariant[q] || [],
|
| 349 |
return cpuAvg && gpuAvg ? gpuAvg / cpuAvg : null;
|
| 350 |
}),
|
| 351 |
};
|
|
@@ -371,7 +379,7 @@ export function renderSpeedupChart(results, metric = 'decode_tok_s') {
|
|
| 371 |
responsive: true,
|
| 372 |
maintainAspectRatio: false,
|
| 373 |
plugins: {
|
| 374 |
-
title: titleConfig(`WebGPU Speedup over CPU (${metricLabel})`),
|
| 375 |
legend: {
|
| 376 |
...darkLegend(),
|
| 377 |
labels: { ...darkLegend().labels, filter: item => item.text !== '1\u00d7' },
|
|
|
|
| 250 |
return vals.length ? vals.reduce((a, b) => a + b, 0) / vals.length : null;
|
| 251 |
}
|
| 252 |
|
| 253 |
+
// CPU is pinned to d=0 by the runner, so apples-to-apples means reading
|
| 254 |
+
// GPU's d=0 number. The CPU side keeps its bare metric (CPU records are
|
| 255 |
+
// depth-pinned to 0 either way); GPU reads `<metric>_d0`. Plain-Run
|
| 256 |
+
// records that only measured d=N have null `_d0` and silently drop out.
|
| 257 |
+
function gpuDepthField(metric) {
  // Name of the field holding the d=0 sample for `metric`:
  // the metric name with a `_d0` suffix appended.
  const depthZeroSuffix = '_d0';
  return `${metric}${depthZeroSuffix}`;
}
|
| 258 |
+
|
| 259 |
export function renderCpuGpuChart(results, metric = 'decode_tok_s') {
|
| 260 |
const canvasId = 'chart-cpu-gpu';
|
| 261 |
destroyChart(canvasId);
|
| 262 |
const canvas = document.getElementById(canvasId);
|
| 263 |
if (!canvas) return;
|
| 264 |
|
| 265 |
+
const gpuMetric = gpuDepthField(metric);
|
| 266 |
const passed = results.filter(r => r.status === 'done');
|
| 267 |
// expandCpuRows folds in cpu_baseline_* from browser-flow GPU records.
|
| 268 |
const cpuResults = expandCpuRows(passed).filter(r => r[metric] != null);
|
| 269 |
+
const gpuResults = passed.filter(r => r.nGpuLayers !== 0 && r[gpuMetric] != null);
|
| 270 |
|
| 271 |
if (cpuResults.length === 0 || gpuResults.length === 0) {
|
| 272 |
showEmptyState(canvas, cpuResults.length === 0 ? 'No CPU baseline data in current filter' : 'No GPU data in current filter');
|
|
|
|
| 293 |
return {
|
| 294 |
label: browser,
|
| 295 |
backgroundColor: BROWSER_COLORS[browser] || '#888',
|
| 296 |
+
data: allQuants.map(q => avgBy(byVariant[q] || [], gpuMetric)),
|
| 297 |
};
|
| 298 |
});
|
| 299 |
|
|
|
|
| 305 |
responsive: true,
|
| 306 |
maintainAspectRatio: false,
|
| 307 |
plugins: {
|
| 308 |
+
title: titleConfig(`CPU vs WebGPU: ${metricLabel} @ d0`),
|
| 309 |
legend: darkLegend(),
|
| 310 |
tooltip: { callbacks: { label: ctx => `${ctx.dataset.label}: ${formatTokS(ctx.raw)} tok/s` } },
|
| 311 |
},
|
|
|
|
| 320 |
const canvas = document.getElementById(canvasId);
|
| 321 |
if (!canvas) return;
|
| 322 |
|
| 323 |
+
const gpuMetric = gpuDepthField(metric);
|
| 324 |
const passed = results.filter(r => r.status === 'done');
|
| 325 |
const cpuResults = expandCpuRows(passed).filter(r => r[metric] != null);
|
| 326 |
+
const gpuResults = passed.filter(r => r.nGpuLayers !== 0 && r[gpuMetric] != null);
|
| 327 |
|
| 328 |
if (cpuResults.length === 0 || gpuResults.length === 0) {
|
| 329 |
showEmptyState(canvas, cpuResults.length === 0 ? 'No CPU baseline data in current filter' : 'No GPU data in current filter');
|
|
|
|
| 353 |
backgroundColor: BROWSER_COLORS[browser] || '#888',
|
| 354 |
data: allQuants.map(q => {
|
| 355 |
const cpuAvg = cpuAvgByVariant[q];
|
| 356 |
+
const gpuAvg = avgBy(byVariant[q] || [], gpuMetric);
|
| 357 |
return cpuAvg && gpuAvg ? gpuAvg / cpuAvg : null;
|
| 358 |
}),
|
| 359 |
};
|
|
|
|
| 379 |
responsive: true,
|
| 380 |
maintainAspectRatio: false,
|
| 381 |
plugins: {
|
| 382 |
+
title: titleConfig(`WebGPU Speedup over CPU (${metricLabel} @ d0)`),
|
| 383 |
legend: {
|
| 384 |
...darkLegend(),
|
| 385 |
labels: { ...darkLegend().labels, filter: item => item.text !== '1\u00d7' },
|
js/data.js
CHANGED
|
@@ -128,6 +128,82 @@ function writeSessionCache(data) {
|
|
| 128 |
} catch { /* quota or disabled */ }
|
| 129 |
}
|
| 130 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 131 |
/* Reduce a flat result set down to one canonical row per
|
| 132 |
(machineSlug, browser, model, variant, backend) cell. Picks the row with
|
| 133 |
the most iterations; ties break on latest timestamp. This is the
|
|
|
|
| 128 |
} catch { /* quota or disabled */ }
|
| 129 |
}
|
| 130 |
|
| 131 |
+
/* Fold the (d=0, d=N) GPU record pair that Run Study emits per variant
   into a single dashboard row. The d=N record stays canonical
   (`decode_tok_s` / `prefill_tok_s` keep the depth-loaded numbers) so
   existing chart/table consumers keep working unchanged; a new pair of
   `_d0` / `_dN` suffix fields lets depth-aware code pick a specific pass.

   CPU records (nGpuLayers === 0) pass through untouched, as the same
   object references, and are emitted ahead of the GPU rows. Cells with
   only one half of the pair (plain Run, pre-study data, or a partial
   study) lift their values into the suffix field on the side that
   exists, leaving the other side null — so consumers can render `—`
   without having to know the record's history.

   Within each cell, duplicate records in the same depth bucket are
   tie-broken by isStrongerRecord: the record with the most iterations
   wins, and equal iteration counts fall back to the latest timestamp, so
   multiple study runs of the same variant collapse cleanly.

   Run AFTER attachCpuBaselineFromCpuRecords (which keys on the
   depth-independent (machine, browser, model, variant) tuple) and
   BEFORE selectBestResults (CPU rows still need cell-dedup; GPU rows
   are already deduped here).

   records: array of result records; null/undefined is treated as empty.
   Returns a new array; input records are never mutated (GPU rows are
   shallow copies carrying the extra suffix fields). */
export function mergeDepthPairs(records) {
  const cells = new Map();
  const cpuRows = [];
  for (const r of records ?? []) {
    if (r.nGpuLayers === 0) {
      cpuRows.push(r);
      continue;
    }
    const cellKey = `${r.machineSlug}|${r.browser}|${r.model}|${r.variant}`;
    // A missing n_depth counts as depth 0.
    const bucket = (r.n_depth ?? 0) === 0 ? 'd0' : 'dN';
    let slot = cells.get(cellKey);
    if (!slot) {
      // Register the slot once; later records for this cell mutate it in place.
      slot = { d0: null, dN: null };
      cells.set(cellKey, slot);
    }
    if (!slot[bucket] || isStrongerRecord(r, slot[bucket])) slot[bucket] = r;
  }
  const merged = [...cpuRows];
  for (const { d0, dN } of cells.values()) {
    if (d0 && dN) merged.push(joinDepthPair(d0, dN));
    else if (dN) merged.push(liftSingleDepth(dN, 'dN'));
    else if (d0) merged.push(liftSingleDepth(d0, 'd0'));
  }
  return merged;
}

/* True when record `a` should replace record `b` in a depth bucket:
   more iterations wins (a missing count is read as 0); on equal counts
   the greater timestamp wins (a missing timestamp is read as ''). */
function isStrongerRecord(a, b) {
  const ai = a.iterations ?? 0;
  const bi = b.iterations ?? 0;
  if (ai !== bi) return ai > bi;
  return (a.timestamp || '') > (b.timestamp || '');
}

/* Per-depth fields that get mirrored into `<field>_d0` / `<field>_dN`. */
const DEPTH_PERF_FIELDS = [
  'decode_tok_s', 'prefill_tok_s',
  'decode_stddev_ts', 'prefill_stddev_ts',
  'pp_test_name', 'tg_test_name',
];

/* Shallow-copy `base` and attach the `_d0` / `_dN` suffix fields read from
   the given d0 / dN source records. A null source (or a missing field on
   it) yields null for that side. `n_depth_dN` records the dN source's
   depth, or null when there is no dN source. */
function withDepthFields(base, d0, dN) {
  const out = { ...base };
  for (const f of DEPTH_PERF_FIELDS) {
    out[`${f}_d0`] = d0?.[f] ?? null;
    out[`${f}_dN`] = dN?.[f] ?? null;
  }
  out.n_depth_dN = dN?.n_depth ?? null;
  return out;
}

/* Merge a full (d0, dN) pair: the dN record is the base row, and both
   sides' perf fields are exposed under their suffixes. */
function joinDepthPair(d0, dN) {
  return withDepthFields(dN, d0, dN);
}

/* Lift a lone record into the suffix fields of the side named by `bucket`
   ('d0' or 'dN'); the opposite side's suffix fields are null. */
function liftSingleDepth(r, bucket) {
  return withDepthFields(
    r,
    bucket === 'd0' ? r : null,
    bucket === 'dN' ? r : null,
  );
}
|
| 206 |
+
|
| 207 |
/* Reduce a flat result set down to one canonical row per
|
| 208 |
(machineSlug, browser, model, variant, backend) cell. Picks the row with
|
| 209 |
the most iterations; ties break on latest timestamp. This is the
|
js/tables.js
CHANGED
|
@@ -5,7 +5,10 @@ let lastResults = [];
|
|
| 5 |
let sortState = { key: null, dir: 'asc' };
|
| 6 |
|
| 7 |
const NUM_KEYS = new Set([
|
| 8 |
-
'sizeMB',
|
|
|
|
|
|
|
|
|
|
| 9 |
'cpu_baseline_decode_tok_s', 'cpu_baseline_prefill_tok_s',
|
| 10 |
'n_eval', 't_eval_ms',
|
| 11 |
'n_p_eval', 't_p_eval_ms', 'wallTimeMs', 'consistency_rate',
|
|
@@ -64,6 +67,18 @@ export function renderResultsTable(results) {
|
|
| 64 |
|
| 65 |
const sorted = sortState.key ? sortResults(results, sortState.key, sortState.dir) : results;
|
| 66 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
/* priority: 1 = always show; 2 = hide below 640px; 3 = hide below 900px */
|
| 68 |
const cols = [
|
| 69 |
{ key: 'machineSlug', label: 'Machine', priority: 1 },
|
|
@@ -75,8 +90,14 @@ export function renderResultsTable(results) {
|
|
| 75 |
{ key: 'status', label: 'Status', priority: 1 },
|
| 76 |
{ key: 'buildType', label: 'Build', priority: 3 },
|
| 77 |
{ key: 'webgpuAvailable', label: 'WebGPU', priority: 3 },
|
| 78 |
-
|
| 79 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
{ key: 'cpu_baseline_decode_tok_s', label: 'CPU tg tok/s', priority: 2 },
|
| 81 |
{ key: 'cpu_baseline_prefill_tok_s', label: 'CPU pp tok/s', priority: 3 },
|
| 82 |
{ key: 'n_eval', label: 'n_eval', priority: 3 },
|
|
@@ -97,7 +118,8 @@ export function renderResultsTable(results) {
|
|
| 97 |
const pin = i === 0 ? ' col-pin col-pin-1' : (i === 1 ? ' col-pin col-pin-2' : '');
|
| 98 |
const prio = col.priority >= 3 ? ' col-p3' : (col.priority === 2 ? ' col-p2' : '');
|
| 99 |
const cls = `sortable${isActive ? ' sorted' : ''}${pin}${prio}`;
|
| 100 |
-
|
|
|
|
| 101 |
});
|
| 102 |
html += '</tr></thead><tbody>';
|
| 103 |
|
|
@@ -123,19 +145,27 @@ export function renderResultsTable(results) {
|
|
| 123 |
break;
|
| 124 |
case 'decode_tok_s':
|
| 125 |
case 'prefill_tok_s':
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
case 'cpu_baseline_decode_tok_s':
|
| 127 |
case 'cpu_baseline_prefill_tok_s': {
|
| 128 |
// llama-bench style "avg \u00b1 stddev" with the pp{N} / tg{N} test
|
| 129 |
// label as a tooltip when the new schema is present. Older records
|
| 130 |
// without stddev fall back to the bare avg from formatTokS.
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
:
|
| 138 |
-
:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 139 |
const avg = r[col.key];
|
| 140 |
let cell;
|
| 141 |
if (avg != null && stddev != null) {
|
|
@@ -308,9 +338,14 @@ export function renderCpuGpuTable(results) {
|
|
| 308 |
const container = document.getElementById('cpu-gpu-table');
|
| 309 |
if (!container) return;
|
| 310 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 311 |
const METRICS = [
|
| 312 |
-
{
|
| 313 |
-
{
|
| 314 |
];
|
| 315 |
|
| 316 |
const passed = results.filter(r => r.status === 'done');
|
|
@@ -353,6 +388,8 @@ export function renderCpuGpuTable(results) {
|
|
| 353 |
// Two-row grouped header: row1 = group labels (CPU, Chromium, …), row2 = metric sub-labels
|
| 354 |
// CPU gets colspan = METRICS.length, each GPU browser gets colspan = METRICS.length * 2 (value + speedup per metric)
|
| 355 |
const gpuColspan = METRICS.length * 2;
|
|
|
|
|
|
|
| 356 |
let html = '<div class="table-card"><div class="results-wrapper"><table class="results-table"><thead>';
|
| 357 |
|
| 358 |
// Row 1: group headers
|
|
@@ -387,7 +424,7 @@ export function renderCpuGpuTable(results) {
|
|
| 387 |
|
| 388 |
// CPU columns
|
| 389 |
for (const m of METRICS) {
|
| 390 |
-
const val = avg(cpuItems, m.
|
| 391 |
html += `<td><span class="mono">${formatTokS(val)}</span></td>`;
|
| 392 |
}
|
| 393 |
|
|
@@ -395,8 +432,8 @@ export function renderCpuGpuTable(results) {
|
|
| 395 |
for (const b of gpuBrowsers) {
|
| 396 |
const gpuItems = gpuByBrowser[b] || [];
|
| 397 |
for (const m of METRICS) {
|
| 398 |
-
const cpuVal = avg(cpuItems, m.
|
| 399 |
-
const gpuVal = avg(gpuItems, m.
|
| 400 |
const speedup = cpuVal && gpuVal ? gpuVal / cpuVal : null;
|
| 401 |
const cls = speedup == null ? '' : speedup >= 3 ? 'text-success' : speedup >= 1.5 ? '' : speedup >= 1 ? 'text-muted' : 'text-error';
|
| 402 |
html += `<td><span class="mono">${formatTokS(gpuVal)}</span></td>`;
|
|
|
|
| 5 |
let sortState = { key: null, dir: 'asc' };
|
| 6 |
|
| 7 |
const NUM_KEYS = new Set([
|
| 8 |
+
'sizeMB',
|
| 9 |
+
'decode_tok_s', 'prefill_tok_s',
|
| 10 |
+
'decode_tok_s_d0', 'decode_tok_s_dN',
|
| 11 |
+
'prefill_tok_s_d0', 'prefill_tok_s_dN',
|
| 12 |
'cpu_baseline_decode_tok_s', 'cpu_baseline_prefill_tok_s',
|
| 13 |
'n_eval', 't_eval_ms',
|
| 14 |
'n_p_eval', 't_p_eval_ms', 'wallTimeMs', 'consistency_rate',
|
|
|
|
| 67 |
|
| 68 |
const sorted = sortState.key ? sortResults(results, sortState.key, sortState.dir) : results;
|
| 69 |
|
| 70 |
+
// Resolve the depth-loaded column label from the data: when every visible
|
| 71 |
+
// row shares one N (the typical leaderboard case), show that concrete
|
| 72 |
+
// value (e.g., "@ d2048"). When rows mix depths (someone experimenting
|
| 73 |
+
// with d=4096 vs d=2048), fall back to the abstract "@ dN" with a tooltip
|
| 74 |
+
// listing the values present so the user knows the column is mixed.
|
| 75 |
+
const depthNValues = [...new Set(results.map(r => r.n_depth_dN).filter(v => v != null))]
|
| 76 |
+
.sort((a, b) => a - b);
|
| 77 |
+
const dnLabel = depthNValues.length === 1 ? `d${depthNValues[0]}` : 'dN';
|
| 78 |
+
const dnHeaderTitle = depthNValues.length > 1
|
| 79 |
+
? `Mixed depths in view: ${depthNValues.map(v => `d${v}`).join(', ')}`
|
| 80 |
+
: '';
|
| 81 |
+
|
| 82 |
/* priority: 1 = always show; 2 = hide below 640px; 3 = hide below 900px */
|
| 83 |
const cols = [
|
| 84 |
{ key: 'machineSlug', label: 'Machine', priority: 1 },
|
|
|
|
| 90 |
{ key: 'status', label: 'Status', priority: 1 },
|
| 91 |
{ key: 'buildType', label: 'Build', priority: 3 },
|
| 92 |
{ key: 'webgpuAvailable', label: 'WebGPU', priority: 3 },
|
| 93 |
+
// tg / pp split into cold-cache (d=0) and depth-loaded (d=N) columns
|
| 94 |
+
// so Run Study's depth-pair shows as side-by-side numbers instead of
|
| 95 |
+
// overwriting one with the other. Pre-study and plain-Run records
|
| 96 |
+
// populate only the side they actually measured; the other reads `—`.
|
| 97 |
+
{ key: 'decode_tok_s_d0', label: 'tg @ d0', priority: 1 },
|
| 98 |
+
{ key: 'decode_tok_s_dN', label: `tg @ ${dnLabel}`, priority: 1, headerTitle: dnHeaderTitle },
|
| 99 |
+
{ key: 'prefill_tok_s_d0', label: 'pp @ d0', priority: 3 },
|
| 100 |
+
{ key: 'prefill_tok_s_dN', label: `pp @ ${dnLabel}`, priority: 3, headerTitle: dnHeaderTitle },
|
| 101 |
{ key: 'cpu_baseline_decode_tok_s', label: 'CPU tg tok/s', priority: 2 },
|
| 102 |
{ key: 'cpu_baseline_prefill_tok_s', label: 'CPU pp tok/s', priority: 3 },
|
| 103 |
{ key: 'n_eval', label: 'n_eval', priority: 3 },
|
|
|
|
| 118 |
const pin = i === 0 ? ' col-pin col-pin-1' : (i === 1 ? ' col-pin col-pin-2' : '');
|
| 119 |
const prio = col.priority >= 3 ? ' col-p3' : (col.priority === 2 ? ' col-p2' : '');
|
| 120 |
const cls = `sortable${isActive ? ' sorted' : ''}${pin}${prio}`;
|
| 121 |
+
const titleAttr = col.headerTitle ? ` title="${escapeHtml(col.headerTitle)}"` : '';
|
| 122 |
+
html += `<th data-key="${col.key}" class="${cls}" aria-sort="${ariaSort}" scope="col" tabindex="0"${titleAttr}><span class="th-label">${col.label}</span><span class="th-sort-indicator" aria-hidden="true">${arrowChar}</span></th>`;
|
| 123 |
});
|
| 124 |
html += '</tr></thead><tbody>';
|
| 125 |
|
|
|
|
| 145 |
break;
|
| 146 |
case 'decode_tok_s':
|
| 147 |
case 'prefill_tok_s':
|
| 148 |
+
case 'decode_tok_s_d0':
|
| 149 |
+
case 'decode_tok_s_dN':
|
| 150 |
+
case 'prefill_tok_s_d0':
|
| 151 |
+
case 'prefill_tok_s_dN':
|
| 152 |
case 'cpu_baseline_decode_tok_s':
|
| 153 |
case 'cpu_baseline_prefill_tok_s': {
|
| 154 |
// llama-bench style "avg \u00b1 stddev" with the pp{N} / tg{N} test
|
| 155 |
// label as a tooltip when the new schema is present. Older records
|
| 156 |
// without stddev fall back to the bare avg from formatTokS.
|
| 157 |
+
// Depth-suffixed keys read from the matching `_d0` / `_dN`
|
| 158 |
+
// stddev + test_name fields produced by mergeDepthPairs.
|
| 159 |
+
let stddev = null;
|
| 160 |
+
let testName = null;
|
| 161 |
+
switch (col.key) {
|
| 162 |
+
case 'decode_tok_s': stddev = r.decode_stddev_ts; testName = r.tg_test_name; break;
|
| 163 |
+
case 'prefill_tok_s': stddev = r.prefill_stddev_ts; testName = r.pp_test_name; break;
|
| 164 |
+
case 'decode_tok_s_d0': stddev = r.decode_stddev_ts_d0; testName = r.tg_test_name_d0; break;
|
| 165 |
+
case 'decode_tok_s_dN': stddev = r.decode_stddev_ts_dN; testName = r.tg_test_name_dN; break;
|
| 166 |
+
case 'prefill_tok_s_d0': stddev = r.prefill_stddev_ts_d0; testName = r.pp_test_name_d0; break;
|
| 167 |
+
case 'prefill_tok_s_dN': stddev = r.prefill_stddev_ts_dN; testName = r.pp_test_name_dN; break;
|
| 168 |
+
}
|
| 169 |
const avg = r[col.key];
|
| 170 |
let cell;
|
| 171 |
if (avg != null && stddev != null) {
|
|
|
|
| 338 |
const container = document.getElementById('cpu-gpu-table');
|
| 339 |
if (!container) return;
|
| 340 |
|
| 341 |
+
// CPU is pinned to d=0 by the runner, so the comparison must read GPU's
|
| 342 |
+
// d=0 number for an apples-to-apples ratio. Plain-Run records that only
|
| 343 |
+
// measured d=N have null `_d0` and silently drop out of the comparison
|
| 344 |
+
// — that's the right call: without a cold-cache GPU sample the speedup
|
| 345 |
+
// ratio would be measuring different workloads.
|
| 346 |
const METRICS = [
|
| 347 |
+
{ cpuField: 'decode_tok_s', gpuField: 'decode_tok_s_d0', label: 'Decode tok/s @ d0' },
|
| 348 |
+
{ cpuField: 'prefill_tok_s', gpuField: 'prefill_tok_s_d0', label: 'Prefill tok/s @ d0' },
|
| 349 |
];
|
| 350 |
|
| 351 |
const passed = results.filter(r => r.status === 'done');
|
|
|
|
| 388 |
// Two-row grouped header: row1 = group labels (CPU, Chromium, …), row2 = metric sub-labels
|
| 389 |
// CPU gets colspan = METRICS.length, each GPU browser gets colspan = METRICS.length * 2 (value + speedup per metric)
|
| 390 |
const gpuColspan = METRICS.length * 2;
|
| 391 |
+
// CPU side reads cpuField; GPU side reads gpuField (_d0 for apples-to-
|
| 392 |
+
// apples). Both labels match the metric's display label.
|
| 393 |
let html = '<div class="table-card"><div class="results-wrapper"><table class="results-table"><thead>';
|
| 394 |
|
| 395 |
// Row 1: group headers
|
|
|
|
| 424 |
|
| 425 |
// CPU columns
|
| 426 |
for (const m of METRICS) {
|
| 427 |
+
const val = avg(cpuItems, m.cpuField);
|
| 428 |
html += `<td><span class="mono">${formatTokS(val)}</span></td>`;
|
| 429 |
}
|
| 430 |
|
|
|
|
| 432 |
for (const b of gpuBrowsers) {
|
| 433 |
const gpuItems = gpuByBrowser[b] || [];
|
| 434 |
for (const m of METRICS) {
|
| 435 |
+
const cpuVal = avg(cpuItems, m.cpuField);
|
| 436 |
+
const gpuVal = avg(gpuItems, m.gpuField);
|
| 437 |
const speedup = cpuVal && gpuVal ? gpuVal / cpuVal : null;
|
| 438 |
const cls = speedup == null ? '' : speedup >= 3 ? 'text-success' : speedup >= 1.5 ? '' : speedup >= 1 ? 'text-muted' : 'text-error';
|
| 439 |
html += `<td><span class="mono">${formatTokS(gpuVal)}</span></td>`;
|