File size: 12,712 Bytes
aa61dca
5047636
 
aa61dca
4721a6e
aa61dca
 
 
 
 
4721a6e
 
 
5047636
aa61dca
 
 
 
 
5047636
aa61dca
 
 
 
5047636
aa61dca
5047636
aa61dca
5047636
aa61dca
 
86d8a2e
aa61dca
5047636
4721a6e
 
 
aa61dca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5047636
aa61dca
5047636
aa61dca
 
 
5047636
 
 
aa61dca
 
5047636
aa61dca
5047636
 
 
 
 
 
 
aa61dca
5047636
aa61dca
 
5047636
aa61dca
5047636
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
aa61dca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5047636
 
e2ac5c3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5047636
e6a49d5
 
 
 
 
 
 
 
5047636
 
 
e6a49d5
 
5047636
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1683f65
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e6a49d5
 
 
 
 
ecef386
 
e6a49d5
 
 
 
 
 
 
 
 
 
 
 
 
ecef386
 
 
 
 
 
92ad589
 
 
 
 
 
e6a49d5
 
 
 
 
 
 
 
 
 
 
ecef386
 
 
 
4721a6e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
import { fetchAllRuns } from './dataset.js';
import { HF_DATASET_REPO } from './run/config.js';

// In-memory cache for the current page session. Populated by loadData();
// once set, later loadData() calls return it without touching storage or
// the network.
let cachedData = null;
// sessionStorage cache so a refresh-within-a-minute doesn't re-fetch the
// entire dataset. Short TTL — submissions land continuously and the
// dashboard is the surface where we actually want freshness.
// Key under which the `{ ts, data }` envelope is stored.
const SESSION_CACHE_KEY = 'webgpu-bench:dashboard-data';
// Maximum age of a stored envelope, in milliseconds (60 seconds).
const SESSION_CACHE_TTL_MS = 60 * 1000;

/**
 * Load the dashboard dataset, preferring cached copies over the network.
 *
 * Lookup order: the in-memory `cachedData`, then the sessionStorage cache,
 * then a full fetch via `fetchAllRuns(HF_DATASET_REPO)`. A successful fetch
 * is stored in both caches. A failed fetch (or merge) is logged with
 * `console.warn` and yields the empty dataset, which is kept in memory only
 * and is not written to sessionStorage.
 *
 * @returns {Promise<{meta: object, results: object[]}>} The dataset payload;
 *   fetch/merge failures resolve to the empty dataset instead of rejecting.
 */
export async function loadData() {
  if (cachedData) return cachedData;

  const fromSession = readSessionCache();
  if (fromSession) {
    cachedData = fromSession;
    return cachedData;
  }

  // Single source of truth: the HF dataset repo. No static baseline. A new
  // dashboard with zero submissions shows an empty state until something is
  // submitted.
  const empty = makeEmptyDataset();
  try {
    const { records, machines, fileCount } = await fetchAllRuns(HF_DATASET_REPO);
    if (fileCount > 0) {
      mergeRecords(empty, records, machines);
    }
    cachedData = empty;
    writeSessionCache(cachedData);
  } catch (err) {
    // Deliberately best-effort: the dashboard still renders its empty state.
    // Log the failure so a broken load can be told apart from a dataset that
    // genuinely has no submissions yet.
    console.warn('[dashboard] failed to load benchmark dataset; showing empty state', err);
    cachedData = empty;
  }
  return cachedData;
}

/**
 * Build a fresh dataset payload with no results and empty meta lookups.
 *
 * @returns {{meta: {machines: Array, models: Array, browsers: Array, generatedAt: string}, results: Array}}
 *   A new object on every call; `generatedAt` is the current time as an ISO string.
 */
function makeEmptyDataset() {
  const generatedAt = new Date().toISOString();
  const meta = { machines: [], models: [], browsers: [], generatedAt };
  return { meta, results: [] };
}

/* Append records into an empty payload and recompute the meta lookups. Same
   shape the old combined.json had, so all downstream consumers (charts,
   tables, machine cards) work unchanged. */
/**
 * Merge fetched run records into `payload` in place and rebuild its `meta`.
 *
 * @param {{meta: object, results: object[]}} payload - Dataset to extend; mutated.
 * @param {object[]} records - Run records to append. An empty array makes the
 *   call a no-op (meta is left untouched).
 * @param {object[]} machines - Machine descriptors identified by `slug`. New
 *   ones are shallow-copied before counters are attached, so the caller's
 *   objects are not modified.
 * @returns {void}
 */
function mergeRecords(payload, records, machines) {
  if (records.length === 0) return;

  const modelsSet = new Set(payload.meta.models || []);
  const browsersSet = new Set(payload.meta.browsers || []);
  for (const r of records) {
    // Append one record at a time instead of `push(...records)`: spreading
    // passes every element as a call argument, which throws a RangeError
    // once the dataset grows past the engine's argument/stack limit.
    payload.results.push(r);
    if (r.model) modelsSet.add(r.model);
    if (r.browser) browsersSet.add(r.browser);
  }
  payload.meta.models = [...modelsSet].sort();
  payload.meta.browsers = [...browsersSet].sort();

  const machineMap = new Map((payload.meta.machines || []).map(m => [m.slug, m]));
  for (const m of machines) {
    if (!machineMap.has(m.slug)) machineMap.set(m.slug, { ...m });
  }
  // Counters are recomputed from the full result list below, so start at zero.
  for (const m of machineMap.values()) {
    m.resultCount = 0;
    m.passCount = 0;
  }

  // Per-machine submitter aggregation — counts contributions and tracks the
  // most-recent submission so the machine card can render a stacked-avatar
  // row sorted by activity.
  const submitterAccumulator = new Map(); // slug → Map(key → {profile, count, latestAt})
  for (const r of payload.results) {
    const m = machineMap.get(r.machineSlug);
    if (!m) continue; // result for a machine we have no descriptor for
    m.resultCount += 1;
    if (r.status === 'done') m.passCount += 1;
    const sb = r.submittedBy;
    if (!sb?.name) continue; // anonymous result: counted above, no submitter entry
    const key = sb.hubId || sb.name;
    if (!submitterAccumulator.has(r.machineSlug)) submitterAccumulator.set(r.machineSlug, new Map());
    const inner = submitterAccumulator.get(r.machineSlug);
    const cur = inner.get(key);
    if (!cur) {
      inner.set(key, { profile: sb, count: 1, latestAt: r.timestamp || '' });
    } else {
      cur.count += 1;
      // Keep the profile from the submitter's most recent record.
      if (r.timestamp && r.timestamp > cur.latestAt) {
        cur.profile = sb;
        cur.latestAt = r.timestamp;
      }
    }
  }
  for (const [slug, inner] of submitterAccumulator) {
    const m = machineMap.get(slug);
    if (!m) continue;
    // Most contributions first; ties go to the most recently active submitter.
    m.submitters = [...inner.values()]
      .map(({ profile, count, latestAt }) => ({ ...profile, count, latestAt }))
      .sort((a, b) => b.count - a.count || (b.latestAt || '').localeCompare(a.latestAt || ''));
  }
  payload.meta.machines = [...machineMap.values()];
  payload.meta.generatedAt = new Date().toISOString();
}

/**
 * Read the dataset envelope from sessionStorage, honouring the TTL.
 *
 * @returns {*} The stored `data` when an envelope exists, has a numeric `ts`
 *   and is no older than SESSION_CACHE_TTL_MS; otherwise `null`. Any storage
 *   or parse error also yields `null`.
 */
function readSessionCache() {
  try {
    const serialized = sessionStorage.getItem(SESSION_CACHE_KEY);
    if (!serialized) {
      return null;
    }
    const envelope = JSON.parse(serialized);
    // A `null` envelope throws on property access and lands in the catch below.
    const storedAt = envelope.ts;
    const isFresh = typeof storedAt === 'number'
      && !((Date.now() - storedAt) > SESSION_CACHE_TTL_MS);
    return isFresh ? envelope.data : null;
  } catch {
    return null;
  }
}

/**
 * Persist the dataset to sessionStorage wrapped in a `{ ts, data }` envelope
 * stamped with the current time. Best-effort: serialization and storage
 * failures are ignored.
 *
 * @param {*} data - Dataset payload to store.
 * @returns {void}
 */
function writeSessionCache(data) {
  const envelope = { ts: Date.now(), data };
  try {
    const serialized = JSON.stringify(envelope);
    sessionStorage.setItem(SESSION_CACHE_KEY, serialized);
  } catch {
    // quota or disabled
  }
}

/* Fold the (d=0, d=N) GPU record pair that Run Study emits per variant
   into a single dashboard row. The d=N record stays canonical
   (`decode_tok_s` / `prefill_tok_s` keep the depth-loaded numbers) so
   existing chart/table consumers keep working unchanged; a new pair of
   `_d0` / `_dN` suffix fields lets depth-aware code pick a specific pass.

   CPU records are pinned to d=0 by the runner, so they pass through
   untouched. Cells with only one half of the pair (plain Run, pre-study
   data, or a partial study) lift their values into the suffix field on
   the side that exists, leaving the other side null — so consumers can
   render `—` without having to know the record's history.

   Within each cell we also tie-break duplicate records per depth bucket
   (most iterations wins, then latest timestamp, mirroring selectBestResults)
   so multiple study runs of the same variant collapse cleanly.

   Run AFTER attachCpuBaselineFromCpuRecords (which keys on the
   depth-independent (machine, browser, model, variant) tuple) and
   BEFORE selectBestResults (CPU rows still need cell-dedup; GPU rows
   are already deduped here). */
/**
 * @param {object[]} records - Flat run records (CPU and GPU mixed).
 * @returns {object[]} A new array: CPU records first (same object references,
 *   original order), then one folded row per GPU cell in first-seen order.
 */
export function mergeDepthPairs(records) {
  const passthrough = []; // CPU rows, emitted unmodified
  const pairsByCell = new Map(); // cell key -> strongest record per depth bucket

  for (const rec of records) {
    if (rec.nGpuLayers === 0) {
      passthrough.push(rec);
    } else {
      const key = `${rec.machineSlug}|${rec.browser}|${rec.model}|${rec.variant}`;
      let pair = pairsByCell.get(key);
      if (!pair) {
        pair = { d0: null, dN: null };
        pairsByCell.set(key, pair);
      }
      // A missing n_depth counts as depth 0.
      const side = (rec.n_depth ?? 0) === 0 ? 'd0' : 'dN';
      const incumbent = pair[side];
      if (!incumbent || isStrongerRecord(rec, incumbent)) {
        pair[side] = rec;
      }
    }
  }

  const folded = [];
  pairsByCell.forEach(({ d0, dN }) => {
    if (d0 && dN) {
      folded.push(joinDepthPair(d0, dN));
    } else if (dN) {
      folded.push(liftSingleDepth(dN, 'dN'));
    } else if (d0) {
      folded.push(liftSingleDepth(d0, 'd0'));
    }
  });
  return passthrough.concat(folded);
}

/**
 * Decide whether record `a` should replace record `b` as a cell's
 * representative: more iterations wins; on equal iterations the strictly
 * later timestamp wins. Missing iterations count as 0 and a missing
 * timestamp as the empty string.
 *
 * @param {object} a - Candidate record.
 * @param {object} b - Record currently held.
 * @returns {boolean} True when `a` outranks `b`.
 */
function isStrongerRecord(a, b) {
  const iterA = a.iterations ?? 0;
  const iterB = b.iterations ?? 0;
  return iterA === iterB
    ? (a.timestamp || '') > (b.timestamp || '')
    : iterA > iterB;
}

// Per-pass fields that joinDepthPair / liftSingleDepth copy into
// `<field>_d0` and `<field>_dN` suffixed properties on the merged row.
const DEPTH_PERF_FIELDS = [
  'decode_tok_s', 'prefill_tok_s',
  'decode_stddev_ts', 'prefill_stddev_ts',
  'pp_test_name', 'tg_test_name',
];

/**
 * Fold a (d=0, d=N) record pair into one row. The result is a shallow copy of
 * `dN` (so the unsuffixed fields keep the d=N values) extended with `_d0` /
 * `_dN` copies of every DEPTH_PERF_FIELDS entry and with `n_depth_dN`.
 * Values missing on either side become `null`. Neither input is mutated.
 *
 * @param {object} d0 - Record measured at depth 0.
 * @param {object} dN - Record measured at a non-zero depth.
 * @returns {object} The merged row.
 */
function joinDepthPair(d0, dN) {
  const suffixed = {};
  DEPTH_PERF_FIELDS.forEach((field) => {
    suffixed[`${field}_d0`] = d0[field] ?? null;
    suffixed[`${field}_dN`] = dN[field] ?? null;
  });
  return { ...dN, ...suffixed, n_depth_dN: dN.n_depth ?? null };
}

/**
 * Build the merged-row shape for a cell that has only one depth pass. The
 * result is a shallow copy of `r` with every DEPTH_PERF_FIELDS entry copied
 * to the suffix matching `bucket`; the opposite suffix is `null`.
 * `n_depth_dN` is filled only for the 'dN' bucket. `r` is not mutated.
 *
 * @param {object} r - The only record available for the cell.
 * @param {'d0'|'dN'} bucket - Which depth pass `r` represents.
 * @returns {object} The lifted row.
 */
function liftSingleDepth(r, bucket) {
  const isShallow = bucket === 'd0';
  const isDeep = bucket === 'dN';
  const lifted = { ...r };
  for (const field of DEPTH_PERF_FIELDS) {
    const value = r[field] ?? null;
    lifted[`${field}_d0`] = isShallow ? value : null;
    lifted[`${field}_dN`] = isDeep ? value : null;
  }
  lifted.n_depth_dN = isDeep ? (r.n_depth ?? null) : null;
  return lifted;
}

/* Reduce a flat result set down to one canonical row per
   (machineSlug, browser, model, variant, backend) cell. Picks the row with
   the most iterations; ties break on latest timestamp. This is the
   leaderboard view — "best representative number per cell" — and is what
   the dashboard renders in the table, charts, and stat cards.

   `backend` (CPU vs GPU, derived from nGpuLayers) is part of the key so
   CLI CPU+GPU pairs and browser-flow synthetic CPU rows don't collapse
   into the GPU row. */
/**
 * @param {object[]} records - Flat result rows.
 * @returns {object[]} One row (same object reference) per cell, ordered by
 *   when each cell was first encountered.
 */
export function selectBestResults(records) {
  // True when `challenger` should displace `holder`: more iterations, or the
  // same iteration count with a strictly later timestamp.
  const outranks = (challenger, holder) => {
    const challengerIters = challenger.iterations ?? 0;
    const holderIters = holder.iterations ?? 0;
    if (challengerIters > holderIters) return true;
    if (challengerIters !== holderIters) return false;
    return (challenger.timestamp || '') > (holder.timestamp || '');
  };

  const winners = new Map();
  for (const row of records) {
    const backend = row.nGpuLayers === 0 ? 'cpu' : 'gpu';
    const cell = `${row.machineSlug}|${row.browser}|${row.model}|${row.variant}|${backend}`;
    const holder = winners.get(cell);
    if (!holder || outranks(row, holder)) {
      winners.set(cell, row);
    }
  }
  return Array.from(winners.values());
}

/* For CLI-flow records that ship CPU and GPU as separate dataset entries,
   look up each GPU record's matching CPU companion (same machine, browser,
   model, variant) and copy its perf into cpu_baseline_*. After this pass,
   GPU records from both submission paths (browser, CLI) carry their CPU
   baseline inline, so the main table can render a single row per cell with
   both numbers side-by-side. No-op on records that already have
   cpu_baseline_* (e.g. browser-flow records, where controller.makeRecord
   embeds it at write time). */
/**
 * @param {object[]} results - Flat result rows (CPU and GPU mixed).
 * @returns {object[]} A new array of the same length and order. GPU rows that
 *   gain a baseline are shallow copies; every other row is returned as the
 *   same object reference.
 */
export function attachCpuBaselineFromCpuRecords(results) {
  const cellKeyOf = (r) => `${r.machineSlug}|${r.browser}|${r.model}|${r.variant}`;

  const cpuByCell = new Map();
  for (const r of results) {
    // Only successful CPU runs that actually carry a perf number can serve
    // as a baseline.
    const isUsableCpuRun = r.nGpuLayers === 0
      && r.status === 'done'
      && (r.decode_tok_s != null || r.prefill_tok_s != null);
    if (!isUsableCpuRun) continue;

    const key = cellKeyOf(r);
    const cur = cpuByCell.get(key);
    // Rank exactly like selectBestResults() / mergeDepthPairs(): most
    // iterations wins, ties break on latest timestamp. Ranking by timestamp
    // alone could attach a baseline from a different CPU run than the one
    // the leaderboard shows for the same cell.
    if (!cur || isStrongerRecord(r, cur)) {
      cpuByCell.set(key, r);
    }
  }

  return results.map(r => {
    if (r.nGpuLayers === 0) return r;
    // Already carries an inline baseline: leave it alone.
    if (r.cpu_baseline_decode_tok_s != null || r.cpu_baseline_prefill_tok_s != null) return r;
    const cpu = cpuByCell.get(cellKeyOf(r));
    if (!cpu) return r;
    return {
      ...r,
      cpu_baseline_decode_tok_s: cpu.decode_tok_s ?? null,
      cpu_baseline_prefill_tok_s: cpu.prefill_tok_s ?? null,
    };
  });
}

/* Synthesize a CPU row for every browser-flow GPU record (the in-page
   bench measures one CPU pass per variant alongside the GPU iterations
   and stamps the result on the same record via cpu_baseline_*). Returns
   only CPU rows — the real (nGpuLayers === 0) ones followed by the
   synthetic ones. Used by the CPU-vs-GPU views which want the CPU subset
   only. */
/**
 * @param {object[]} results - Flat result rows.
 * @returns {object[]} A new array: the rows with `nGpuLayers === 0` followed
 *   by the rows produced by synthesizeCpuRowsFromBaseline(results).
 */
export function expandCpuRows(results) {
  const isCpuRow = (row) => row.nGpuLayers === 0;
  const measured = results.filter(isCpuRow);
  return measured.concat(synthesizeCpuRowsFromBaseline(results));
}

/* Same synthesis as expandCpuRows but returns the originals plus the
   synthesized CPU rows — for the main results table where we want both
   GPU and CPU rows visible. */
/**
 * @param {object[]} results - Flat result rows.
 * @returns {object[]} A new array: every input row (same references, same
 *   order) followed by the rows from synthesizeCpuRowsFromBaseline(results).
 */
export function withSyntheticCpuRows(results) {
  const syntheticRows = synthesizeCpuRowsFromBaseline(results);
  return [...results, ...syntheticRows];
}

/**
 * Derive a CPU row from every non-CPU row that carries an embedded
 * `cpu_baseline_*` measurement. Input rows are not mutated.
 *
 * @param {object[]} results - Flat result rows.
 * @returns {object[]} One synthetic row (with `nGpuLayers: 0`) per qualifying
 *   input row, in input order.
 */
function synthesizeCpuRowsFromBaseline(results) {
  const carriesBaseline = (row) =>
    row.cpu_baseline_decode_tok_s != null || row.cpu_baseline_prefill_tok_s != null;

  const toCpuRow = (gpuRow) => {
    const cpuRow = { ...gpuRow };
    // The embedded baseline becomes the row's own headline metric.
    cpuRow.decode_tok_s = gpuRow.cpu_baseline_decode_tok_s;
    cpuRow.prefill_tok_s = gpuRow.cpu_baseline_prefill_tok_s;
    // The CPU baseline is a single-rep measurement (warmup + 1 timed), so it
    // has no stddev. Clear the values copied from the GPU row — otherwise
    // the table would show the CPU average with the GPU's stddev attached.
    cpuRow.decode_stddev_ts = null;
    cpuRow.prefill_stddev_ts = null;
    // CPU baseline runs have no t_eval / n_eval breakdowns — clear those too
    // so CPU rows don't display stale GPU numbers.
    cpuRow.n_eval = null;
    cpuRow.t_eval_ms = null;
    cpuRow.n_p_eval = null;
    cpuRow.t_p_eval_ms = null;
    // Drop the embedded baseline so the "CPU decode tok/s" column doesn't
    // duplicate the synthetic row's own metric.
    cpuRow.cpu_baseline_decode_tok_s = null;
    cpuRow.cpu_baseline_prefill_tok_s = null;
    cpuRow.cpu_baseline = null;
    cpuRow.nGpuLayers = 0;
    return cpuRow;
  };

  return results
    .filter((row) => row.nGpuLayers !== 0 && carriesBaseline(row))
    .map((row) => toCpuRow(row));
}

/**
 * Keep only the rows matching every active filter.
 *
 * @param {object[]} results - Result rows to filter.
 * @param {object} filters - Optional selectors. `machine`, `browser` and
 *   `model` match exactly unless unset or 'all'. `backend` is 'cpu' or
 *   'webgpu' (split on `nGpuLayers === 0`). `status` is 'pass' or 'fail'
 *   (split on `status === 'done'`). `quants` is a Set of allowed variants and
 *   is ignored when empty. Unrecognised backend/status values filter nothing.
 * @returns {object[]} A new array of the matching rows, in input order.
 */
export function filterResults(results, filters) {
  // A selector is active when it is set to something other than 'all'.
  const isActive = (choice) => Boolean(choice) && choice !== 'all';

  const matches = (row) => {
    if (isActive(filters.machine) && filters.machine !== row.machineSlug) return false;
    if (isActive(filters.browser) && filters.browser !== row.browser) return false;
    if (isActive(filters.model) && filters.model !== row.model) return false;

    switch (filters.backend) {
      case 'cpu':
        if (row.nGpuLayers !== 0) return false;
        break;
      case 'webgpu':
        if (row.nGpuLayers === 0) return false;
        break;
      default:
        break; // unset, 'all', or an unknown backend: no restriction
    }

    switch (filters.status) {
      case 'pass':
        if (row.status !== 'done') return false;
        break;
      case 'fail':
        if (row.status === 'done') return false;
        break;
      default:
        break; // unset, 'all', or an unknown status: no restriction
    }

    const allowedQuants = filters.quants;
    if (allowedQuants && allowedQuants.size > 0 && !allowedQuants.has(row.variant)) {
      return false;
    }
    return true;
  };

  return results.filter((row) => matches(row));
}