Spaces:

abhijitramesh
/

webgpu-bench

Running

App Files Files Community

webgpu-bench / js /data.js

GitHub Actions

sync from abhijitramesh/webgpu-bench@d35922fe12

e2ac5c3 9 days ago

history blame contribute delete

12.8 kB

	import { fetchAllRuns } from './dataset.js';
	import { HF_DATASET_REPO } from './run/config.js';

	// sessionStorage-backed cache so a refresh within the TTL doesn't re-fetch
	// the entire dataset. The TTL is short on purpose: submissions land
	// continuously and the dashboard is where freshness actually matters.
	const SESSION_CACHE_TTL_MS = 1000 * 60;
	const SESSION_CACHE_KEY = 'webgpu-bench:dashboard-data';
	// Dataset memoized in memory for the current page session.
	let cachedData = null;

	/* Resolve the dashboard dataset, trying the cheapest source first:
	   the in-memory copy, then the sessionStorage snapshot, then a live fetch
	   of the HF dataset repo. A failed fetch is logged and resolves to an
	   empty dataset (kept in memory only, never written to sessionStorage).
	   Returns the dataset payload ({ meta, results }). */
	export async function loadData() {
	  // Already resolved during this page session.
	  if (cachedData) return cachedData;

	  // A still-fresh snapshot left in sessionStorage by an earlier load.
	  const sessionCopy = readSessionCache();
	  if (sessionCopy) {
	    cachedData = sessionCopy;
	    return cachedData;
	  }

	  // The HF dataset repo is the single source of truth; there is no static
	  // baseline. With zero submissions the dashboard shows an empty state.
	  const dataset = makeEmptyDataset();
	  try {
	    const live = await fetchAllRuns(HF_DATASET_REPO);
	    const { records, machines, fileCount } = live;
	    if (fileCount > 0) {
	      mergeRecords(dataset, records, machines);
	    }
	    cachedData = dataset;
	    writeSessionCache(cachedData);
	  } catch (failure) {
	    console.warn(`Live dataset load failed: ${failure.message}`);
	    cachedData = dataset;
	  }
	  return cachedData;
	}

	/* Build a fresh dataset payload with no results and empty meta lookups.
	   `generatedAt` is stamped with the current time as an ISO-8601 string.
	   Every call returns new objects/arrays, so callers may mutate freely. */
	function makeEmptyDataset() {
	  const meta = {
	    machines: [],
	    models: [],
	    browsers: [],
	    generatedAt: new Date().toISOString(),
	  };
	  return { meta, results: [] };
	}

	/* Append records to a dataset payload and recompute its meta lookups
	   (models, browsers, machines). Mutates `payload` in place and returns
	   nothing. The result has the same shape the old combined.json had, so
	   all downstream consumers (charts, tables, machine cards) work unchanged.

	   payload  - { meta: { machines, models, browsers, generatedAt }, results }
	   records  - result rows to append; an empty array makes this a no-op
	   machines - machine descriptors identified by `slug`; entries already in
	              payload.meta.machines are kept, new ones are shallow-copied

	   Per-machine `resultCount` / `passCount` are recomputed over ALL of
	   payload.results (not just the newly appended rows); rows whose
	   machineSlug matches no known machine are not counted. */
	function mergeRecords(payload, records, machines) {
	  if (records.length === 0) return;

	  // Append one at a time. `push(...records)` passes every record as a
	  // separate call argument and throws RangeError once the dataset grows
	  // past the engine's argument limit.
	  for (const r of records) payload.results.push(r);

	  const modelsSet = new Set(payload.meta.models || []);
	  const browsersSet = new Set(payload.meta.browsers || []);
	  for (const r of records) {
	    if (r.model) modelsSet.add(r.model);
	    if (r.browser) browsersSet.add(r.browser);
	  }
	  payload.meta.models = [...modelsSet].sort();
	  payload.meta.browsers = [...browsersSet].sort();

	  const machineMap = new Map((payload.meta.machines || []).map(m => [m.slug, m]));
	  for (const m of machines) {
	    if (!machineMap.has(m.slug)) machineMap.set(m.slug, { ...m });
	  }
	  // Counters are rebuilt from scratch below, so reset them first.
	  for (const m of machineMap.values()) {
	    m.resultCount = 0;
	    m.passCount = 0;
	  }

	  // Per-machine submitter aggregation — counts contributions and tracks the
	  // most-recent submission so the machine card can render a stacked-avatar
	  // row sorted by activity.
	  const submitterAccumulator = new Map(); // slug → Map(key → {profile, count, latestAt})
	  for (const r of payload.results) {
	    const m = machineMap.get(r.machineSlug);
	    if (!m) continue;
	    m.resultCount += 1;
	    if (r.status === 'done') m.passCount += 1;
	    const sb = r.submittedBy;
	    if (!sb?.name) continue;
	    // Prefer hubId as the identity; fall back to the display name.
	    const key = sb.hubId || sb.name;
	    if (!submitterAccumulator.has(r.machineSlug)) submitterAccumulator.set(r.machineSlug, new Map());
	    const inner = submitterAccumulator.get(r.machineSlug);
	    const cur = inner.get(key);
	    if (!cur) {
	      inner.set(key, { profile: sb, count: 1, latestAt: r.timestamp || '' });
	    } else {
	      cur.count += 1;
	      // Keep the profile from the submitter's most recent record.
	      if (r.timestamp && r.timestamp > cur.latestAt) {
	        cur.profile = sb;
	        cur.latestAt = r.timestamp;
	      }
	    }
	  }
	  for (const [slug, inner] of submitterAccumulator) {
	    const m = machineMap.get(slug);
	    if (!m) continue;
	    // Most contributions first; ties go to the most recent submitter.
	    m.submitters = [...inner.values()]
	      .map(({ profile, count, latestAt }) => ({ ...profile, count, latestAt }))
	      .sort((a, b) => b.count - a.count || (b.latestAt || '').localeCompare(a.latestAt || ''));
	  }
	  payload.meta.machines = [...machineMap.values()];
	  payload.meta.generatedAt = new Date().toISOString();
	}

	/* Read the dataset snapshot stored under SESSION_CACHE_KEY in
	   sessionStorage. Returns the cached data when the entry exists, carries
	   a numeric timestamp, and is no older than SESSION_CACHE_TTL_MS;
	   otherwise returns null. Never throws: unavailable storage or a
	   malformed entry is treated as a cache miss. */
	function readSessionCache() {
	  try {
	    const raw = sessionStorage.getItem(SESSION_CACHE_KEY);
	    if (!raw) return null;
	    const { ts, data } = JSON.parse(raw);
	    if (typeof ts !== 'number' || (Date.now() - ts) > SESSION_CACHE_TTL_MS) return null;
	    // Normalize a missing `data` field to the same null a miss returns.
	    return data ?? null;
	  } catch {
	    // Best-effort cache: storage disabled/absent or unparsable JSON.
	    return null;
	  }
	}

	/* Store `data` in sessionStorage under SESSION_CACHE_KEY, wrapped as
	   { ts, data } where `ts` is the current epoch time in milliseconds.
	   Best-effort: any failure while serializing or storing is ignored. */
	function writeSessionCache(data) {
	  const envelope = { ts: Date.now(), data };
	  try {
	    const serialized = JSON.stringify(envelope);
	    sessionStorage.setItem(SESSION_CACHE_KEY, serialized);
	  } catch {
	    /* quota or disabled */
	  }
	}

	/* Fold the (d=0, d=N) GPU record pair that Run Study emits per variant
	   into a single dashboard row. The d=N record stays canonical
	   (`decode_tok_s` / `prefill_tok_s` keep the depth-loaded numbers) so
	   existing chart/table consumers keep working unchanged; a pair of
	   `_d0` / `_dN` suffix fields lets depth-aware code pick a specific pass.

	   CPU records (nGpuLayers === 0) are pinned to d=0 by the runner, so they
	   pass through untouched. Cells with only one half of the pair (plain
	   Run, pre-study data, or a partial study) lift their values into the
	   suffix field on the side that exists, leaving the other side null — so
	   consumers can render `—` without having to know the record's history.

	   Within each cell, duplicate records per depth bucket are tie-broken
	   (more iterations wins, then latest timestamp, mirroring
	   selectBestResults) so multiple study runs of the same variant collapse
	   cleanly. A record with no `n_depth` counts as d=0.

	   Run AFTER attachCpuBaselineFromCpuRecords (which keys on the
	   depth-independent (machine, browser, model, variant) tuple) and
	   BEFORE selectBestResults (CPU rows still need cell-dedup; GPU rows
	   are already deduped here).

	   Returns a new array: all CPU rows in input order, followed by one row
	   per GPU cell in order of first appearance. */
	export function mergeDepthPairs(records) {
	  const cells = new Map();
	  const cpuRows = [];
	  for (const r of records) {
	    if (r.nGpuLayers === 0) {
	      cpuRows.push(r);
	      continue;
	    }
	    const cellKey = `${r.machineSlug}|${r.browser}|${r.model}|${r.variant}`;
	    const bucket = (r.n_depth ?? 0) === 0 ? 'd0' : 'dN';
	    let slot = cells.get(cellKey);
	    if (!slot) {
	      slot = { d0: null, dN: null };
	      cells.set(cellKey, slot);
	    }
	    if (!slot[bucket] || isStrongerRecord(r, slot[bucket])) slot[bucket] = r;
	  }
	  const merged = [...cpuRows];
	  for (const { d0, dN } of cells.values()) {
	    if (d0 && dN) merged.push(joinDepthPair(d0, dN));
	    else if (dN) merged.push(liftSingleDepth(dN, 'dN'));
	    else if (d0) merged.push(liftSingleDepth(d0, 'd0'));
	  }
	  return merged;
	}

	/* Decide whether record `a` should replace record `b` as the
	   representative of a cell. More `iterations` wins (a missing count is
	   treated as 0); on equal iterations the strictly later `timestamp`
	   string wins (a missing timestamp is treated as ''). Returns false when
	   the two are fully tied, so the incumbent is kept. */
	function isStrongerRecord(a, b) {
	  const ai = a.iterations ?? 0;
	  const bi = b.iterations ?? 0;
	  if (ai !== bi) return ai > bi;
	  return (a.timestamp || '') > (b.timestamp || '');
	}

	// Record fields that get `_d0` / `_dN` suffixed copies when depth passes
	// are merged into a single row.
	const DEPTH_PERF_FIELDS = [
	  'decode_tok_s',
	  'prefill_tok_s',
	  'decode_stddev_ts',
	  'prefill_stddev_ts',
	  'pp_test_name',
	  'tg_test_name',
	];

	/* Combine the d=0 and d=N records of one cell into a single row.
	   The result is a shallow copy of `dN` plus, for every field in
	   DEPTH_PERF_FIELDS, a `<field>_d0` value taken from `d0` and a
	   `<field>_dN` value taken from `dN` (null when absent), and
	   `n_depth_dN` holding dN's depth (or null). Neither input is mutated. */
	function joinDepthPair(d0, dN) {
	  const suffixed = DEPTH_PERF_FIELDS.flatMap((field) => [
	    [`${field}_d0`, d0[field] ?? null],
	    [`${field}_dN`, dN[field] ?? null],
	  ]);
	  return {
	    ...dN,
	    ...Object.fromEntries(suffixed),
	    n_depth_dN: dN.n_depth ?? null,
	  };
	}

	/* Build a merged-shape row from a record that has no partner at the
	   other depth. The result is a shallow copy of `r` where, for every
	   field in DEPTH_PERF_FIELDS, the suffix matching `bucket` ('d0' or
	   'dN') carries the record's value (null when absent) and the other
	   suffix is null. `n_depth_dN` is the record's depth only for 'dN'. */
	function liftSingleDepth(r, bucket) {
	  const isD0 = bucket === 'd0';
	  const isDN = bucket === 'dN';
	  const lifted = { ...r };
	  DEPTH_PERF_FIELDS.forEach((field) => {
	    const value = r[field] ?? null;
	    lifted[`${field}_d0`] = isD0 ? value : null;
	    lifted[`${field}_dN`] = isDN ? value : null;
	  });
	  lifted.n_depth_dN = isDN ? (r.n_depth ?? null) : null;
	  return lifted;
	}

	/* Reduce a flat result set down to one canonical row per
	   (machineSlug, browser, model, variant, backend) cell. Picks the row with
	   the most iterations (missing counts as 0); ties break on the strictly
	   latest timestamp, so on a full tie the earliest-seen row is kept. This
	   is the leaderboard view — "best representative number per cell" — and
	   is what the dashboard renders in the table, charts, and stat cards.

	   `backend` (CPU vs GPU, derived from nGpuLayers === 0) is part of the key
	   so CLI CPU+GPU pairs and browser-flow synthetic CPU rows don't collapse
	   into the GPU row.

	   Returns a new array of the winning input rows (not copies), ordered by
	   each cell's first appearance in `records`. */
	export function selectBestResults(records) {
	  const bestByCell = new Map();
	  for (const r of records) {
	    const backend = r.nGpuLayers === 0 ? 'cpu' : 'gpu';
	    const key = `${r.machineSlug}|${r.browser}|${r.model}|${r.variant}|${backend}`;
	    const cur = bestByCell.get(key);
	    if (!cur) {
	      bestByCell.set(key, r);
	      continue;
	    }
	    const curIter = cur.iterations ?? 0;
	    const newIter = r.iterations ?? 0;
	    if (newIter > curIter) {
	      bestByCell.set(key, r);
	    } else if (newIter === curIter) {
	      const curTs = cur.timestamp || '';
	      const newTs = r.timestamp || '';
	      if (newTs > curTs) bestByCell.set(key, r);
	    }
	  }
	  return [...bestByCell.values()];
	}

	/* For CLI-flow records that ship CPU and GPU as separate dataset entries,
	   look up each GPU record's matching CPU companion (same machine, browser,
	   model, variant) and copy its perf into cpu_baseline_*. After this pass,
	   GPU records from both submission paths (browser, CLI) carry their CPU
	   baseline inline, so the main table can render a single row per cell with
	   both numbers side-by-side.

	   Only CPU records (nGpuLayers === 0) with status 'done' and at least one
	   of decode_tok_s / prefill_tok_s are eligible companions; when a cell has
	   several, the one with the latest timestamp is used. No-op on records
	   that already have cpu_baseline_* (e.g. browser-flow records, where
	   controller.makeRecord embeds it at write time).

	   Returns a new array of the same length and order. Untouched records are
	   returned by reference; enriched GPU records are shallow copies, so the
	   input rows are never mutated. */
	export function attachCpuBaselineFromCpuRecords(results) {
	  const cellKeyOf = r => `${r.machineSlug}|${r.browser}|${r.model}|${r.variant}`;

	  const cpuByCell = new Map();
	  for (const r of results) {
	    if (r.nGpuLayers === 0 && r.status === 'done' && (r.decode_tok_s != null || r.prefill_tok_s != null)) {
	      const key = cellKeyOf(r);
	      const cur = cpuByCell.get(key);
	      // Most-recent wins when a cell has several usable CPU records.
	      if (!cur || (r.timestamp || '') > (cur.timestamp || '')) {
	        cpuByCell.set(key, r);
	      }
	    }
	  }
	  return results.map(r => {
	    if (r.nGpuLayers === 0) return r;
	    if (r.cpu_baseline_decode_tok_s != null || r.cpu_baseline_prefill_tok_s != null) return r;
	    const cpu = cpuByCell.get(cellKeyOf(r));
	    if (!cpu) return r;
	    return {
	      ...r,
	      cpu_baseline_decode_tok_s: cpu.decode_tok_s ?? null,
	      cpu_baseline_prefill_tok_s: cpu.prefill_tok_s ?? null,
	    };
	  });
	}

	/* Return the CPU-only view of a result set: every real CPU record
	   (nGpuLayers === 0), in input order, followed by the CPU rows that
	   synthesizeCpuRowsFromBaseline derives from GPU records carrying an
	   embedded cpu_baseline_* measurement (the in-page bench stamps one CPU
	   pass per variant onto the GPU record). Used by the CPU-vs-GPU views,
	   which want the CPU subset only. */
	export function expandCpuRows(results) {
	  const cpuOnly = [];
	  for (const row of results) {
	    if (row.nGpuLayers === 0) cpuOnly.push(row);
	  }
	  return cpuOnly.concat(synthesizeCpuRowsFromBaseline(results));
	}

	/* Return every original row followed by the CPU rows that
	   synthesizeCpuRowsFromBaseline derives from them — for the main results
	   table, where both GPU and CPU rows should be visible. The input is not
	   modified; a new array is returned. */
	export function withSyntheticCpuRows(results) {
	  const combined = Array.from(results);
	  for (const cpuRow of synthesizeCpuRowsFromBaseline(results)) {
	    combined.push(cpuRow);
	  }
	  return combined;
	}

	/* Derive a CPU row from every non-CPU record (nGpuLayers !== 0) that
	   carries an embedded CPU baseline (cpu_baseline_decode_tok_s or
	   cpu_baseline_prefill_tok_s is non-null). Each derived row is a shallow
	   copy of its source with the baseline numbers promoted to
	   decode_tok_s / prefill_tok_s, GPU-only measurements nulled out, and
	   nGpuLayers forced to 0. Returns only the derived rows; the input
	   records are not mutated. */
	function synthesizeCpuRowsFromBaseline(results) {
	  return results
	    .filter(r => r.nGpuLayers !== 0
	      && (r.cpu_baseline_decode_tok_s != null || r.cpu_baseline_prefill_tok_s != null))
	    .map(r => ({
	      ...r,
	      decode_tok_s: r.cpu_baseline_decode_tok_s,
	      prefill_tok_s: r.cpu_baseline_prefill_tok_s,
	      // The CPU baseline is a single-rep measurement (warmup + 1 timed),
	      // so it has no stddev. Null out the stddev fields the spread above
	      // inherited from the GPU row — otherwise the table renders the
	      // CPU avg with the GPU's stddev attached, which is nonsensical.
	      decode_stddev_ts: null,
	      prefill_stddev_ts: null,
	      // CPU baseline runs have no t_eval / n_eval breakdowns — null those
	      // out so the table doesn't show stale GPU numbers in CPU rows.
	      n_eval: null,
	      t_eval_ms: null,
	      n_p_eval: null,
	      t_p_eval_ms: null,
	      // Strip the embedded baseline from synthetic CPU rows so the
	      // "CPU decode tok/s" column doesn't duplicate the row's own metric.
	      cpu_baseline_decode_tok_s: null,
	      cpu_baseline_prefill_tok_s: null,
	      cpu_baseline: null,
	      nGpuLayers: 0,
	    }));
	}

	/* Return the rows of `results` that satisfy every active filter.
	   A filter is inactive when it is unset/falsy or the string 'all'.

	   filters.machine / browser / model - exact match on machineSlug /
	                                       browser / model
	   filters.backend - 'cpu' keeps rows with nGpuLayers === 0, 'webgpu'
	                     keeps the rest; any other value filters nothing
	   filters.status  - 'pass' keeps rows with status 'done', 'fail' keeps
	                     the rest; any other value filters nothing
	   filters.quants  - Set of variants to keep; ignored when empty */
	export function filterResults(results, filters) {
	  const isActive = (value) => Boolean(value) && value !== 'all';
	  return results.filter((row) => {
	    const { machine, browser, model, backend, status, quants } = filters;
	    if (isActive(machine) && row.machineSlug !== machine) return false;
	    if (isActive(browser) && row.browser !== browser) return false;
	    if (isActive(model) && row.model !== model) return false;
	    if (isActive(backend)) {
	      const onCpu = row.nGpuLayers === 0;
	      if (backend === 'cpu' && !onCpu) return false;
	      if (backend === 'webgpu' && onCpu) return false;
	    }
	    if (isActive(status)) {
	      const passed = row.status === 'done';
	      if (status === 'pass' && !passed) return false;
	      if (status === 'fail' && passed) return false;
	    }
	    const quantSelected = !quants || quants.size <= 0 || quants.has(row.variant);
	    return quantSelected;
	  });
	}