Spaces:
Running
Running
| import { fetchAllRuns } from './dataset.js'; | |
| import { HF_DATASET_REPO } from './run/config.js'; | |
// sessionStorage snapshot settings. The TTL is deliberately short: a reload
// within the same minute should not re-fetch the entire dataset, but
// submissions land continuously and the dashboard is where freshness matters.
const SESSION_CACHE_TTL_MS = 60 * 1000; // one minute
const SESSION_CACHE_KEY = 'webgpu-bench:dashboard-data';

// Dataset memo for the current page session; filled in by loadData().
let cachedData = null;
/* Load the dashboard dataset, trying the cheapest source first: the
   in-memory copy from an earlier call, then the sessionStorage snapshot,
   then a live fetch from the HF dataset repo.

   The HF dataset repo is the single source of truth; there is no static
   baseline. With zero submissions, or when the live fetch fails, the
   result is an empty dataset. Only a successful fetch is written back to
   sessionStorage. */
export async function loadData() {
  if (cachedData) {
    return cachedData;
  }

  const sessionCopy = readSessionCache();
  if (sessionCopy) {
    cachedData = sessionCopy;
    return cachedData;
  }

  const dataset = makeEmptyDataset();
  try {
    const { records, machines, fileCount } = await fetchAllRuns(HF_DATASET_REPO);
    const hasSubmissions = fileCount > 0;
    if (hasSubmissions) {
      mergeRecords(dataset, records, machines);
    }
    cachedData = dataset;
    writeSessionCache(cachedData);
  } catch (err) {
    // Degrade to the empty state rather than failing the whole page.
    console.warn(`Live dataset load failed: ${err.message}`);
    cachedData = dataset;
  }
  return cachedData;
}
/* Build a fresh dataset payload with no results and empty meta lookups.
   Every call returns new arrays and stamps `generatedAt` with the current
   time as an ISO string. */
function makeEmptyDataset() {
  const meta = {
    machines: [],
    models: [],
    browsers: [],
    generatedAt: new Date().toISOString(),
  };
  return { meta, results: [] };
}
/* Append records into a dataset payload and recompute the meta lookups
   (models, browsers, and machines with their result/pass counts and
   submitter lists). Mutates `payload` in place and returns nothing. Same
   shape the old combined.json had, so all downstream consumers (charts,
   tables, machine cards) work unchanged.

   payload  - { meta: { machines, models, browsers, generatedAt }, results }
   records  - result rows to append; an empty or missing list is a no-op
   machines - machine descriptors identified by `slug`; a missing list is
              treated as "no new machines" */
function mergeRecords(payload, records, machines) {
  if (!records || records.length === 0) return;

  const modelsSet = new Set(payload.meta.models || []);
  const browsersSet = new Set(payload.meta.browsers || []);
  // Append one record at a time. `push(...records)` hands every element to
  // the call as a separate argument and throws a RangeError once the dataset
  // outgrows the engine's argument limit.
  for (const r of records) {
    payload.results.push(r);
    if (r.model) modelsSet.add(r.model);
    if (r.browser) browsersSet.add(r.browser);
  }
  payload.meta.models = [...modelsSet].sort();
  payload.meta.browsers = [...browsersSet].sort();

  const machineMap = new Map((payload.meta.machines || []).map((m) => [m.slug, m]));
  // Incoming descriptors are copied so the counters written below never leak
  // into the caller's objects. Machines already in the payload win.
  for (const m of machines ?? []) {
    if (!machineMap.has(m.slug)) machineMap.set(m.slug, { ...m });
  }
  // Counts are recomputed from scratch over ALL results, not just the new ones.
  for (const m of machineMap.values()) {
    m.resultCount = 0;
    m.passCount = 0;
  }

  // Per-machine submitter aggregation — counts contributions and tracks the
  // most-recent submission so the machine card can render a stacked-avatar
  // row sorted by activity.
  const submitterAccumulator = new Map(); // slug → Map(key → {profile, count, latestAt})
  for (const r of payload.results) {
    const m = machineMap.get(r.machineSlug);
    if (!m) continue; // result for a machine we have no descriptor for
    m.resultCount += 1;
    if (r.status === 'done') m.passCount += 1;

    const sb = r.submittedBy;
    if (!sb?.name) continue;
    // Prefer the hub id as identity; fall back to the display name.
    const key = sb.hubId || sb.name;
    let inner = submitterAccumulator.get(r.machineSlug);
    if (!inner) {
      inner = new Map();
      submitterAccumulator.set(r.machineSlug, inner);
    }
    const cur = inner.get(key);
    if (!cur) {
      inner.set(key, { profile: sb, count: 1, latestAt: r.timestamp || '' });
      continue;
    }
    cur.count += 1;
    // Keep the profile from the submitter's most recent record.
    if (r.timestamp && r.timestamp > cur.latestAt) {
      cur.profile = sb;
      cur.latestAt = r.timestamp;
    }
  }

  for (const [slug, inner] of submitterAccumulator) {
    const m = machineMap.get(slug);
    if (!m) continue;
    // Most contributions first; ties go to the most recent submitter.
    m.submitters = [...inner.values()]
      .map(({ profile, count, latestAt }) => ({ ...profile, count, latestAt }))
      .sort((a, b) => b.count - a.count || (b.latestAt || '').localeCompare(a.latestAt || ''));
  }

  payload.meta.machines = [...machineMap.values()];
  payload.meta.generatedAt = new Date().toISOString();
}
/* Return the dataset stored in sessionStorage, or null when there is no
   entry, the entry has no numeric timestamp, the entry is older than
   SESSION_CACHE_TTL_MS, or anything goes wrong while reading or parsing
   it. */
function readSessionCache() {
  try {
    const serialized = sessionStorage.getItem(SESSION_CACHE_KEY);
    if (!serialized) {
      return null;
    }
    const { ts, data } = JSON.parse(serialized);
    if (typeof ts !== 'number') {
      return null;
    }
    const ageMs = Date.now() - ts;
    return ageMs > SESSION_CACHE_TTL_MS ? null : data;
  } catch {
    // Unavailable storage or malformed JSON both count as a cache miss.
    return null;
  }
}
/* Store `data` in sessionStorage together with the current time, so
   readSessionCache() can judge its age. Best effort: failures are ignored. */
function writeSessionCache(data) {
  const entry = { ts: Date.now(), data };
  try {
    sessionStorage.setItem(SESSION_CACHE_KEY, JSON.stringify(entry));
  } catch {
    /* quota or disabled */
  }
}
/* Collapse the (d=0, d=N) GPU record pair that Run Study emits per variant
   into one dashboard row.

   The d=N record is the canonical one (`decode_tok_s` / `prefill_tok_s`
   keep the depth-loaded numbers), so existing chart/table consumers keep
   working unchanged; `_d0` / `_dN` suffix fields are added so depth-aware
   code can pick a specific pass. CPU records (nGpuLayers === 0) are pinned
   to d=0 by the runner and pass through untouched.

   A cell with only one half of the pair (plain Run, pre-study data, or a
   partial study) gets its values lifted into the suffix fields on the side
   that exists and null on the other side — so consumers can render `—`
   without knowing the record's history.

   Duplicate records within a depth bucket are resolved with
   isStrongerRecord (more iterations, then latest timestamp — mirroring
   selectBestResults), so multiple study runs of one variant collapse
   cleanly.

   Run AFTER attachCpuBaselineFromCpuRecords (which keys on the
   depth-independent (machine, browser, model, variant) tuple) and BEFORE
   selectBestResults (CPU rows still need cell-dedup; GPU rows are already
   deduped here). */
export function mergeDepthPairs(records) {
  const cpuRows = [];
  const gpuCells = new Map();

  for (const record of records) {
    if (record.nGpuLayers === 0) {
      cpuRows.push(record);
      continue;
    }

    const key = `${record.machineSlug}|${record.browser}|${record.model}|${record.variant}`;
    let cell = gpuCells.get(key);
    if (!cell) {
      cell = { d0: null, dN: null };
      gpuCells.set(key, cell);
    }

    // A missing n_depth counts as depth 0.
    const side = (record.n_depth ?? 0) === 0 ? 'd0' : 'dN';
    const incumbent = cell[side];
    if (!incumbent || isStrongerRecord(record, incumbent)) {
      cell[side] = record;
    }
  }

  const gpuRows = [];
  for (const { d0, dN } of gpuCells.values()) {
    if (d0 && dN) {
      gpuRows.push(joinDepthPair(d0, dN));
    } else if (dN) {
      gpuRows.push(liftSingleDepth(dN, 'dN'));
    } else if (d0) {
      gpuRows.push(liftSingleDepth(d0, 'd0'));
    }
  }

  // CPU rows first, then one row per GPU cell in first-seen order.
  return cpuRows.concat(gpuRows);
}
/* True when record `a` should replace record `b`: more iterations wins
   (missing counts as 0); on equal iterations the later timestamp wins
   (missing counts as ''). Exact ties return false. */
function isStrongerRecord(a, b) {
  const itersA = a.iterations ?? 0;
  const itersB = b.iterations ?? 0;
  if (itersA === itersB) {
    const stampA = a.timestamp || '';
    const stampB = b.timestamp || '';
    return stampA > stampB;
  }
  return itersA > itersB;
}
// Record fields that get a `_d0` and a `_dN` copy when depth passes are
// folded into a single row.
const DEPTH_PERF_FIELDS = [
  'decode_tok_s',
  'prefill_tok_s',
  'decode_stddev_ts',
  'prefill_stddev_ts',
  'pp_test_name',
  'tg_test_name',
];
/* Merge a d=0 record and a d=N record into one row. The row is a copy of
   `dN`; each DEPTH_PERF_FIELDS entry gains a `_d0` copy taken from `d0`
   and a `_dN` copy taken from `dN` (null when absent), and `n_depth_dN`
   carries dN's depth. Neither input is mutated. */
function joinDepthPair(d0, dN) {
  const perDepthEntries = DEPTH_PERF_FIELDS.flatMap((field) => [
    [`${field}_d0`, d0[field] ?? null],
    [`${field}_dN`, dN[field] ?? null],
  ]);
  return {
    ...dN,
    ...Object.fromEntries(perDepthEntries),
    n_depth_dN: dN.n_depth ?? null,
  };
}
/* Give a lone depth record the same `_d0` / `_dN` suffix fields a joined
   pair has. The side named by `bucket` ('d0' or 'dN') receives the
   record's own values (null when absent); the other side is null.
   `n_depth_dN` is the record's depth only for the 'dN' bucket. Returns a
   copy; `r` is not mutated. */
function liftSingleDepth(r, bucket) {
  const isShallow = bucket === 'd0';
  const isDeep = bucket === 'dN';
  const lifted = { ...r };
  DEPTH_PERF_FIELDS.forEach((field) => {
    const own = r[field] ?? null;
    lifted[`${field}_d0`] = isShallow ? own : null;
    lifted[`${field}_dN`] = isDeep ? own : null;
  });
  lifted.n_depth_dN = isDeep ? (r.n_depth ?? null) : null;
  return lifted;
}
/* Reduce a flat result set to one canonical row per
   (machineSlug, browser, model, variant, backend) cell: the row with the
   most iterations, ties broken by latest timestamp. This is the
   leaderboard view — "best representative number per cell" — and is what
   the dashboard renders in the table, charts, and stat cards.

   `backend` (CPU vs GPU, derived from nGpuLayers) is part of the key so
   CLI CPU+GPU pairs and browser-flow synthetic CPU rows don't collapse
   into the GPU row. Rows come back in first-seen cell order. */
export function selectBestResults(records) {
  const winners = new Map();

  for (const candidate of records) {
    const backend = candidate.nGpuLayers === 0 ? 'cpu' : 'gpu';
    const cellKey = `${candidate.machineSlug}|${candidate.browser}|${candidate.model}|${candidate.variant}|${backend}`;

    const incumbent = winners.get(cellKey);
    if (!incumbent) {
      winners.set(cellKey, candidate);
      continue;
    }

    const incumbentIters = incumbent.iterations ?? 0;
    const candidateIters = candidate.iterations ?? 0;
    const incumbentStamp = incumbent.timestamp || '';
    const candidateStamp = candidate.timestamp || '';

    const candidateWins = candidateIters > incumbentIters
      || (candidateIters === incumbentIters && candidateStamp > incumbentStamp);
    if (candidateWins) {
      winners.set(cellKey, candidate);
    }
  }

  return Array.from(winners.values());
}
/* For CLI-flow records that ship CPU and GPU as separate dataset entries,
   look up each GPU record's matching CPU companion (same machine, browser,
   model, variant) and copy its perf into cpu_baseline_*. After this pass,
   GPU records from both submission paths (browser, CLI) carry their CPU
   baseline inline, so the main table can render a single row per cell with
   both numbers side-by-side.

   Only CPU records with status 'done' and at least one perf number are
   eligible companions. When a cell has several, the one with the most
   iterations wins and ties go to the latest timestamp — the same ranking
   selectBestResults() uses, so the baseline shown on a GPU row agrees with
   the CPU row picked for that cell.

   Returns a new array. CPU records, GPU records that already have
   cpu_baseline_* (e.g. browser-flow records, where controller.makeRecord
   embeds it at write time), and GPU records with no companion are returned
   as the same objects; the rest are shallow copies with the two
   cpu_baseline_* fields set (null for a missing metric). */
export function attachCpuBaselineFromCpuRecords(results) {
  const cellKeyOf = (r) => `${r.machineSlug}|${r.browser}|${r.model}|${r.variant}`;

  // Ranking shared with selectBestResults(): iterations first, then recency.
  const outranks = (candidate, incumbent) => {
    const candidateIters = candidate.iterations ?? 0;
    const incumbentIters = incumbent.iterations ?? 0;
    if (candidateIters !== incumbentIters) return candidateIters > incumbentIters;
    return (candidate.timestamp || '') > (incumbent.timestamp || '');
  };

  const cpuByCell = new Map();
  for (const r of results) {
    const isUsableCpuRecord = r.nGpuLayers === 0
      && r.status === 'done'
      && (r.decode_tok_s != null || r.prefill_tok_s != null);
    if (!isUsableCpuRecord) continue;

    const key = cellKeyOf(r);
    const cur = cpuByCell.get(key);
    if (!cur || outranks(r, cur)) {
      cpuByCell.set(key, r);
    }
  }

  return results.map((r) => {
    if (r.nGpuLayers === 0) return r;
    // An embedded baseline always takes precedence over a separate CPU record.
    if (r.cpu_baseline_decode_tok_s != null || r.cpu_baseline_prefill_tok_s != null) return r;
    const cpu = cpuByCell.get(cellKeyOf(r));
    if (!cpu) return r;
    return {
      ...r,
      cpu_baseline_decode_tok_s: cpu.decode_tok_s ?? null,
      cpu_baseline_prefill_tok_s: cpu.prefill_tok_s ?? null,
    };
  });
}
/* Return only CPU rows: the real ones (nGpuLayers === 0) followed by a
   synthetic one for every GPU record that carries an embedded
   cpu_baseline_* (the in-page bench measures one CPU pass per variant
   alongside the GPU iterations and stamps it on the same record).
   Used by the CPU-vs-GPU views, which want the CPU subset only. */
export function expandCpuRows(results) {
  const realCpuRows = results.filter((row) => row.nGpuLayers === 0);
  const syntheticCpuRows = synthesizeCpuRowsFromBaseline(results);
  return realCpuRows.concat(syntheticCpuRows);
}
/* Same synthesis as expandCpuRows, but keeps every original row and
   appends the synthesized CPU rows after them — for the main results
   table, where both GPU and CPU rows should be visible. */
export function withSyntheticCpuRows(results) {
  const syntheticCpuRows = synthesizeCpuRowsFromBaseline(results);
  return [...results, ...syntheticCpuRows];
}
/* Build one synthetic CPU row for each non-CPU record that carries an
   embedded cpu_baseline_decode_tok_s or cpu_baseline_prefill_tok_s. Each
   row is a copy of the GPU record with the baseline promoted to the main
   perf fields, GPU-only fields nulled, and nGpuLayers set to 0. Input
   records are not mutated. */
function synthesizeCpuRowsFromBaseline(results) {
  const carriesBaseline = (row) =>
    row.cpu_baseline_decode_tok_s != null || row.cpu_baseline_prefill_tok_s != null;

  const toCpuRow = (gpuRow) => ({
    ...gpuRow,
    decode_tok_s: gpuRow.cpu_baseline_decode_tok_s,
    prefill_tok_s: gpuRow.cpu_baseline_prefill_tok_s,
    // The CPU baseline is a single-rep measurement (warmup + 1 timed), so
    // it has no stddev. Without this the table would show the CPU average
    // with the GPU row's stddev attached.
    decode_stddev_ts: null,
    prefill_stddev_ts: null,
    // CPU baseline runs have no t_eval / n_eval breakdowns; clear the
    // values copied from the GPU row so they don't show up as stale.
    n_eval: null,
    t_eval_ms: null,
    n_p_eval: null,
    t_p_eval_ms: null,
    // Drop the embedded baseline so the "CPU decode tok/s" column doesn't
    // repeat the row's own metric.
    cpu_baseline_decode_tok_s: null,
    cpu_baseline_prefill_tok_s: null,
    cpu_baseline: null,
    nGpuLayers: 0,
  });

  const gpuRowsWithBaseline = results.filter(
    (row) => row.nGpuLayers !== 0 && carriesBaseline(row),
  );
  return gpuRowsWithBaseline.map((row) => toCpuRow(row));
}
/* Return the rows that satisfy every active filter. A filter is inactive
   when it is missing, empty, or 'all' (for `quants`: missing or an empty
   set).

   filters.machine / browser / model - exact match on machineSlug / browser / model
   filters.backend - 'cpu' keeps nGpuLayers === 0, 'webgpu' keeps the rest
   filters.status  - 'pass' keeps status 'done', 'fail' keeps everything else
   filters.quants  - set of allowed `variant` values */
export function filterResults(results, filters) {
  const isActive = (choice) => Boolean(choice) && choice !== 'all';

  return results.filter((row) => {
    const { machine, browser, model, backend, status, quants } = filters;

    if (isActive(machine) && row.machineSlug !== machine) return false;
    if (isActive(browser) && row.browser !== browser) return false;
    if (isActive(model) && row.model !== model) return false;

    if (isActive(backend)) {
      const isCpuRow = row.nGpuLayers === 0;
      if (backend === 'cpu' && !isCpuRow) return false;
      if (backend === 'webgpu' && isCpuRow) return false;
    }

    if (isActive(status)) {
      const passed = row.status === 'done';
      if (status === 'pass' && !passed) return false;
      if (status === 'fail' && passed) return false;
    }

    const quantsActive = quants && quants.size > 0;
    if (quantsActive && !quants.has(row.variant)) return false;

    return true;
  });
}