// Source: Hugging Face Space "abhijitramesh/webgpu-bench" (status: Running).
// File size: 12,712 bytes.

import { fetchAllRuns } from './dataset.js';
import { HF_DATASET_REPO } from './run/config.js';

// In-memory cache for the current page session. Starts as null and is
// assigned by loadData(), which returns it directly on later calls.
let cachedData = null;
// sessionStorage cache so a refresh-within-a-minute doesn't re-fetch the
// entire dataset. Short TTL — submissions land continuously and the
// dashboard is the surface where we actually want freshness.
// Key under which readSessionCache() / writeSessionCache() store the
// JSON-serialised `{ ts, data }` entry.
const SESSION_CACHE_KEY = 'webgpu-bench:dashboard-data';
// Maximum age of a session-cache entry, in milliseconds (60 s).
const SESSION_CACHE_TTL_MS = 60 * 1000;

// Promise for a load that is currently running. Concurrent loadData()
// callers share it so the dataset is fetched once and every caller ends
// up holding the same object.
let inFlightLoad = null;

/* Resolve the dashboard dataset.

   Lookup order: the in-memory cache, then the sessionStorage cache, then
   a fetch of the HF dataset repo. The resolved object has the shape
   produced by makeEmptyDataset() ({ meta, results }).

   A failed fetch does not reject: the failure is logged and an empty
   dataset is returned (and kept in memory for this page session) so the
   dashboard renders its empty state. Failures are not written to
   sessionStorage, so a page refresh retries. */
export async function loadData() {
  if (cachedData) return cachedData;
  if (!inFlightLoad) {
    inFlightLoad = fetchDashboardData().finally(() => {
      inFlightLoad = null;
    });
  }
  return inFlightLoad;
}

/* Populate `cachedData` from sessionStorage or the network and return it. */
async function fetchDashboardData() {
  const fromSession = readSessionCache();
  if (fromSession) {
    cachedData = fromSession;
    return cachedData;
  }

  // Single source of truth: the HF dataset repo. No static baseline. A new
  // dashboard with zero submissions shows an empty state until something is
  // submitted.
  const dataset = makeEmptyDataset();
  try {
    const { records, machines, fileCount } = await fetchAllRuns(HF_DATASET_REPO);
    if (fileCount > 0) {
      mergeRecords(dataset, records, machines);
    }
    writeSessionCache(dataset);
  } catch (err) {
    // Best-effort: fall back to whatever is in `dataset`, but leave a
    // trace so an empty dashboard can be told apart from a failed load.
    console.warn('[webgpu-bench] failed to load dashboard dataset; showing empty state', err);
  }
  cachedData = dataset;
  return cachedData;
}

/* Build a fresh dataset payload with no results and empty meta lookups.
   Every call returns new arrays/objects; `generatedAt` is the current
   time as an ISO-8601 string. */
function makeEmptyDataset() {
  const meta = {
    machines: [],
    models: [],
    browsers: [],
    generatedAt: new Date().toISOString(),
  };
  return { meta, results: [] };
}

/* Append records into a payload and recompute the meta lookups. Same
   shape the old combined.json had, so all downstream consumers (charts,
   tables, machine cards) work unchanged.

   payload  — dataset object ({ meta, results }); mutated in place.
   records  — run records to append. An empty array is a no-op (meta is
              left untouched, including `generatedAt`).
   machines — machine descriptors identified by `slug`; ones not already
              in payload.meta.machines are shallow-copied in. A nullish
              value is treated as an empty list.

   After the call, payload.meta.models / browsers are sorted, de-duplicated
   lists; every machine carries resultCount / passCount recomputed over
   ALL payload.results; machines with attributed submissions carry a
   `submitters` list; and `generatedAt` is refreshed. */
function mergeRecords(payload, records, machines) {
  if (records.length === 0) return;

  // Append one at a time. `push(...records)` would pass every record as a
  // separate call argument and throw a RangeError once the dataset grows
  // past the engine's argument-count limit.
  for (const record of records) payload.results.push(record);

  const modelsSet = new Set(payload.meta.models || []);
  const browsersSet = new Set(payload.meta.browsers || []);
  for (const r of records) {
    if (r.model) modelsSet.add(r.model);
    if (r.browser) browsersSet.add(r.browser);
  }
  payload.meta.models = [...modelsSet].sort();
  payload.meta.browsers = [...browsersSet].sort();

  const machineMap = new Map((payload.meta.machines || []).map(m => [m.slug, m]));
  for (const m of machines ?? []) {
    // Copy so the caller's descriptor is not mutated by the counters below.
    if (!machineMap.has(m.slug)) machineMap.set(m.slug, { ...m });
  }
  // Counters are rebuilt from scratch over the full result list below.
  for (const m of machineMap.values()) {
    m.resultCount = 0;
    m.passCount = 0;
  }

  // Per-machine submitter aggregation — counts contributions and tracks the
  // most-recent submission so the machine card can render a stacked-avatar
  // row sorted by activity.
  const submitterAccumulator = new Map(); // slug → Map(key → {profile, count, latestAt})
  for (const r of payload.results) {
    const m = machineMap.get(r.machineSlug);
    if (!m) continue; // result for a machine we have no descriptor for
    m.resultCount += 1;
    if (r.status === 'done') m.passCount += 1;
    const sb = r.submittedBy;
    if (!sb?.name) continue; // unattributed run: counted, but no submitter entry
    const key = sb.hubId || sb.name;
    if (!submitterAccumulator.has(r.machineSlug)) submitterAccumulator.set(r.machineSlug, new Map());
    const inner = submitterAccumulator.get(r.machineSlug);
    const cur = inner.get(key);
    if (!cur) {
      inner.set(key, { profile: sb, count: 1, latestAt: r.timestamp || '' });
    } else {
      cur.count += 1;
      // Keep the profile from the newest submission (timestamps compare
      // as strings).
      if (r.timestamp && r.timestamp > cur.latestAt) {
        cur.profile = sb;
        cur.latestAt = r.timestamp;
      }
    }
  }
  for (const [slug, inner] of submitterAccumulator) {
    const m = machineMap.get(slug);
    if (!m) continue;
    // Most contributions first; ties go to the most recent submitter.
    m.submitters = [...inner.values()]
      .map(({ profile, count, latestAt }) => ({ ...profile, count, latestAt }))
      .sort((a, b) => b.count - a.count || (b.latestAt || '').localeCompare(a.latestAt || ''));
  }
  payload.meta.machines = [...machineMap.values()];
  payload.meta.generatedAt = new Date().toISOString();
}

/* Return the dataset stored in sessionStorage, or null when there is no
   entry, the entry has no numeric `ts`, the entry is older than
   SESSION_CACHE_TTL_MS, or anything throws while reading/parsing it. */
function readSessionCache() {
  try {
    const serialized = sessionStorage.getItem(SESSION_CACHE_KEY);
    if (!serialized) return null;

    const entry = JSON.parse(serialized);
    const writtenAt = entry.ts;
    if (typeof writtenAt !== 'number') return null;

    const ageMs = Date.now() - writtenAt;
    return ageMs > SESSION_CACHE_TTL_MS ? null : entry.data;
  } catch {
    return null;
  }
}

/* Store `data` in sessionStorage together with the current time (`ts`).
   Best-effort: any error while serialising or writing is ignored. */
function writeSessionCache(data) {
  const entry = { ts: Date.now(), data };
  try {
    sessionStorage.setItem(SESSION_CACHE_KEY, JSON.stringify(entry));
  } catch {
    // quota or disabled
  }
}

/* Collapse the two GPU records Run Study emits per variant — one at
   depth 0, one at depth N — into a single dashboard row.

   The d=N record is the canonical one: `decode_tok_s` / `prefill_tok_s`
   keep the depth-loaded numbers so existing chart/table consumers are
   unaffected, while `_d0` / `_dN` suffixed fields let depth-aware code
   pick a specific pass.

   CPU records (nGpuLayers === 0) are pinned to d=0 by the runner and are
   returned untouched, ahead of the GPU rows. A cell that has only one
   half of the pair (plain Run, pre-study data, or a partial study) has
   its values lifted into the suffix fields for the side that exists and
   null on the other side, so consumers can render `—` without knowing
   the record's history.

   Duplicate records within a depth bucket are tie-broken here (more
   iterations, then latest timestamp — mirroring selectBestResults) so
   repeated study runs of one variant collapse cleanly.

   Ordering in the pipeline: run AFTER attachCpuBaselineFromCpuRecords
   (which keys on the depth-independent (machine, browser, model, variant)
   tuple) and BEFORE selectBestResults (CPU rows still need cell-dedup;
   GPU rows are already deduped here). */
export function mergeDepthPairs(records) {
  const cpuPassthrough = [];
  const gpuCells = new Map();

  for (const record of records) {
    if (record.nGpuLayers === 0) {
      cpuPassthrough.push(record);
      continue;
    }

    const { machineSlug, browser, model, variant } = record;
    const key = `${machineSlug}|${browser}|${model}|${variant}`;
    let slot = gpuCells.get(key);
    if (!slot) {
      slot = { d0: null, dN: null };
      gpuCells.set(key, slot);
    }

    // A missing n_depth counts as depth 0.
    const side = (record.n_depth ?? 0) === 0 ? 'd0' : 'dN';
    const incumbent = slot[side];
    if (!incumbent || isStrongerRecord(record, incumbent)) {
      slot[side] = record;
    }
  }

  const gpuRows = [];
  gpuCells.forEach(({ d0, dN }) => {
    if (d0 && dN) {
      gpuRows.push(joinDepthPair(d0, dN));
    } else if (dN) {
      gpuRows.push(liftSingleDepth(dN, 'dN'));
    } else if (d0) {
      gpuRows.push(liftSingleDepth(d0, 'd0'));
    }
  });

  return cpuPassthrough.concat(gpuRows);
}

/* True when record `a` should replace record `b`: more iterations wins;
   on equal iterations the later timestamp wins. Missing iterations count
   as 0 and missing timestamps as ''. */
function isStrongerRecord(a, b) {
  const iterationsA = a.iterations ?? 0;
  const iterationsB = b.iterations ?? 0;
  if (iterationsA === iterationsB) {
    const stampA = a.timestamp || '';
    const stampB = b.timestamp || '';
    return stampA > stampB;
  }
  return iterationsA > iterationsB;
}

/* Per-pass performance fields that get `_d0` / `_dN` suffixed copies when
   depth records are merged or lifted. */
const DEPTH_PERF_FIELDS = [
  'decode_tok_s',
  'prefill_tok_s',
  'decode_stddev_ts',
  'prefill_stddev_ts',
  'pp_test_name',
  'tg_test_name',
];

/* Merge a depth-0 record and a depth-N record into one row. The row is a
   copy of `dN` plus, for every field in DEPTH_PERF_FIELDS, a `<field>_d0`
   value taken from `d0` and a `<field>_dN` value taken from `dN` (null
   when absent), and `n_depth_dN` set to dN's depth. Inputs are not
   mutated. */
function joinDepthPair(d0, dN) {
  const suffixed = {};
  DEPTH_PERF_FIELDS.forEach((field) => {
    suffixed[`${field}_d0`] = d0[field] ?? null;
    suffixed[`${field}_dN`] = dN[field] ?? null;
  });
  return { ...dN, ...suffixed, n_depth_dN: dN.n_depth ?? null };
}

/* Produce the merged-row shape from a record that has no partner. `bucket`
   names the side the record belongs to ('d0' or 'dN'): that side's suffix
   fields receive the record's values, the other side's are null.
   `n_depth_dN` is only populated for the 'dN' side. The input is not
   mutated. */
function liftSingleDepth(r, bucket) {
  const isShallow = bucket === 'd0';
  const isDeep = bucket === 'dN';
  const lifted = { ...r };
  for (const field of DEPTH_PERF_FIELDS) {
    const value = r[field] ?? null;
    lifted[`${field}_d0`] = isShallow ? value : null;
    lifted[`${field}_dN`] = isDeep ? value : null;
  }
  lifted.n_depth_dN = isDeep ? (r.n_depth ?? null) : null;
  return lifted;
}

/* Collapse a flat result list to one canonical row per
   (machineSlug, browser, model, variant, backend) cell — the leaderboard
   view ("best representative number per cell") that the dashboard renders
   in the table, charts, and stat cards.

   Within a cell the row with the most iterations wins; equal iterations
   fall back to the latest timestamp. Missing iterations count as 0 and
   missing timestamps as ''.

   `backend` is 'cpu' when nGpuLayers === 0 and 'gpu' otherwise. It is
   part of the key so CLI CPU+GPU pairs and browser-flow synthetic CPU
   rows don't collapse into the GPU row.

   Returns the winning input rows (not copies), ordered by first
   appearance of their cell. */
export function selectBestResults(records) {
  const iterationsOf = (row) => row.iterations ?? 0;
  const stampOf = (row) => row.timestamp || '';
  const winners = new Map();

  for (const candidate of records) {
    const backend = candidate.nGpuLayers === 0 ? 'cpu' : 'gpu';
    const { machineSlug, browser, model, variant } = candidate;
    const cellKey = `${machineSlug}|${browser}|${model}|${variant}|${backend}`;

    const incumbent = winners.get(cellKey);
    if (!incumbent) {
      winners.set(cellKey, candidate);
      continue;
    }

    const incumbentIterations = iterationsOf(incumbent);
    const candidateIterations = iterationsOf(candidate);
    const hasMoreIterations = candidateIterations > incumbentIterations;
    const isTiedButNewer = candidateIterations === incumbentIterations
      && stampOf(candidate) > stampOf(incumbent);
    if (hasMoreIterations || isTiedButNewer) {
      winners.set(cellKey, candidate);
    }
  }

  return Array.from(winners.values());
}

/* For CLI-flow records that ship CPU and GPU as separate dataset entries,
   look up each GPU record's matching CPU companion (same machine, browser,
   model, variant) and copy its perf into cpu_baseline_*. After this pass,
   GPU records from both submission paths (browser, CLI) carry their CPU
   baseline inline, so the main table can render a single row per cell with
   both numbers side-by-side.

   Only CPU records (nGpuLayers === 0) with status 'done' and at least one
   of decode_tok_s / prefill_tok_s are eligible companions. When a cell has
   several, the one with the most iterations wins and the latest timestamp
   breaks ties — the same ranking selectBestResults() uses, so the inline
   baseline agrees with the CPU row the leaderboard shows for that cell.

   Returns a new array. CPU rows, GPU rows that already have
   cpu_baseline_* (e.g. browser-flow records, where controller.makeRecord
   embeds it at write time), and GPU rows without a companion are returned
   as the same objects; enriched GPU rows are shallow copies. */
export function attachCpuBaselineFromCpuRecords(results) {
  const cellKeyOf = (r) => `${r.machineSlug}|${r.browser}|${r.model}|${r.variant}`;
  // More iterations first, then most recent — mirrors selectBestResults().
  const isBetterCpuRun = (candidate, incumbent) => {
    const candidateIterations = candidate.iterations ?? 0;
    const incumbentIterations = incumbent.iterations ?? 0;
    if (candidateIterations !== incumbentIterations) {
      return candidateIterations > incumbentIterations;
    }
    return (candidate.timestamp || '') > (incumbent.timestamp || '');
  };

  const cpuByCell = new Map();
  for (const r of results) {
    const isUsableCpuRun = r.nGpuLayers === 0
      && r.status === 'done'
      && (r.decode_tok_s != null || r.prefill_tok_s != null);
    if (!isUsableCpuRun) continue;
    const key = cellKeyOf(r);
    const cur = cpuByCell.get(key);
    if (!cur || isBetterCpuRun(r, cur)) {
      cpuByCell.set(key, r);
    }
  }

  return results.map(r => {
    if (r.nGpuLayers === 0) return r;
    // Never overwrite a baseline that was embedded at write time.
    if (r.cpu_baseline_decode_tok_s != null || r.cpu_baseline_prefill_tok_s != null) return r;
    const cpu = cpuByCell.get(cellKeyOf(r));
    if (!cpu) return r;
    return {
      ...r,
      cpu_baseline_decode_tok_s: cpu.decode_tok_s ?? null,
      cpu_baseline_prefill_tok_s: cpu.prefill_tok_s ?? null,
    };
  });
}

/* Return only CPU rows: the real ones already in `results`
   (nGpuLayers === 0) followed by one synthetic CPU row for every GPU
   record that carries cpu_baseline_* values (the in-page bench measures
   one CPU pass per variant alongside the GPU iterations and stamps it on
   the same record). Used by the CPU-vs-GPU views, which want the CPU
   subset only. */
export function expandCpuRows(results) {
  const measuredCpuRows = results.filter((row) => row.nGpuLayers === 0);
  return measuredCpuRows.concat(synthesizeCpuRowsFromBaseline(results));
}

/* Return every original row followed by the synthetic CPU rows derived
   from cpu_baseline_* — the same synthesis expandCpuRows uses, but keeping
   the GPU rows too, for the main results table where both should be
   visible. */
export function withSyntheticCpuRows(results) {
  const combined = [...results];
  return combined.concat(synthesizeCpuRowsFromBaseline(results));
}

/* Build one synthetic CPU row for every non-CPU record that carries at
   least one cpu_baseline_* metric. Each synthetic row is a shallow copy of
   its GPU record with the baseline promoted to the primary metrics and
   nGpuLayers forced to 0; the source records are not mutated. */
function synthesizeCpuRowsFromBaseline(results) {
  return results.flatMap((row) => {
    if (row.nGpuLayers === 0) return [];

    const cpuDecode = row.cpu_baseline_decode_tok_s;
    const cpuPrefill = row.cpu_baseline_prefill_tok_s;
    if (cpuDecode == null && cpuPrefill == null) return [];

    const syntheticRow = {
      ...row,
      decode_tok_s: cpuDecode,
      prefill_tok_s: cpuPrefill,
      // The CPU baseline is a single-rep measurement (warmup + 1 timed)
      // and so has no stddev. Clear the values copied from the GPU row,
      // otherwise the CPU average would be shown with the GPU's stddev.
      decode_stddev_ts: null,
      prefill_stddev_ts: null,
      // Likewise there is no t_eval / n_eval breakdown for the baseline;
      // clear it so CPU rows don't display stale GPU numbers.
      n_eval: null,
      t_eval_ms: null,
      n_p_eval: null,
      t_p_eval_ms: null,
      // Drop the embedded baseline so the "CPU decode tok/s" column does
      // not repeat the row's own metric.
      cpu_baseline_decode_tok_s: null,
      cpu_baseline_prefill_tok_s: null,
      cpu_baseline: null,
      nGpuLayers: 0,
    };
    return [syntheticRow];
  });
}

/* Return the rows of `results` that satisfy every active filter.

   filters.machine / browser / model — exact match on machineSlug /
     browser / model; unset or 'all' disables the filter.
   filters.backend — 'cpu' keeps rows with nGpuLayers === 0, 'webgpu'
     keeps the rest; unset, 'all', or any other value keeps everything.
   filters.status  — 'pass' keeps rows whose status is 'done', 'fail'
     keeps the rest; unset, 'all', or any other value keeps everything.
   filters.quants  — when it has a non-zero `size`, rows must have a
     `variant` for which `quants.has(...)` is true. */
export function filterResults(results, filters) {
  const isActive = (value) => Boolean(value) && value !== 'all';

  return results.filter((row) => {
    if (isActive(filters.machine) && row.machineSlug !== filters.machine) return false;
    if (isActive(filters.browser) && row.browser !== filters.browser) return false;
    if (isActive(filters.model) && row.model !== filters.model) return false;

    if (isActive(filters.backend)) {
      const isCpuRow = row.nGpuLayers === 0;
      if (filters.backend === 'cpu' && !isCpuRow) return false;
      if (filters.backend === 'webgpu' && isCpuRow) return false;
    }

    if (isActive(filters.status)) {
      const passed = row.status === 'done';
      if (filters.status === 'pass' && !passed) return false;
      if (filters.status === 'fail' && passed) return false;
    }

    const { quants } = filters;
    if (quants && quants.size > 0 && !quants.has(row.variant)) return false;

    return true;
  });
}