// webgpu-bench / js /dataset.js
// GitHub Actions
// sync from abhijitramesh/webgpu-bench@ef7e64472d
// ed5d4b6
// Live read of recent submissions from the HF leaderboard dataset.
//
// The dashboard's static `data/combined.json` is rebuilt only when CI runs,
// so freshly-submitted results don't appear until the next code push. This
// module fetches files written to the dataset since `combined.json` was
// last generated so they can be merged into the dashboard at load time.
// NOTE: the exported entry point, `fetchAllRuns`, currently passes no cutoff,
// so it downloads every run file (up to MAX_FETCH) rather than only the
// recent ones.
//
// HF endpoints used (no auth, public dataset):
// GET /api/datasets/<repo>/tree/main/runs?recursive=1 → file listing
// GET /datasets/<repo>/resolve/main/<path> → file content
//
// Both endpoints support CORS for public datasets so we can call them
// directly from the dashboard.
// Origin of the Hugging Face Hub. Both the tree-listing URL and the file
// download URL in this module are built from it.
const HF = 'https://huggingface.co';
// Safety window for clock skew between the dataset commit timestamps and
// the caller-supplied "since" timestamp (e.g. `meta.generatedAt`) that
// `listRecentRunFiles` compares against. It is subtracted from the cutoff,
// so a larger value only admits more files. 10 minutes should be more than
// enough — the cost of overshooting is just a few extra files that dedupe
// out anyway.
const CLOCK_SKEW_MS = 10 * 60 * 1000;
// Cap on parallel/total file fetches per dashboard load. The listing is
// truncated to this many entries and every surviving entry is then fetched
// concurrently, so this is the upper bound on how many run files the page
// will download at once. The dashboard now pulls the entire dataset live
// (no static baseline). 1000 is conservative — actual bench submissions are
// typically ≤ 1 KB each so the bandwidth ceiling is well under a megabyte
// even at the cap.
const MAX_FETCH = 1000;
/**
 * List the `.json` run files under `runs/` in the dataset.
 *
 * When `sinceISO` is given, only entries whose `lastCommit.date` is later
 * than `sinceISO - CLOCK_SKEW_MS` are kept; entries without a usable commit
 * date always pass. With a falsy (or unparseable) `sinceISO` no time filter
 * is applied. The result is truncated to the first MAX_FETCH entries in the
 * order the API returned them.
 *
 * NOTE(review): the time filter only has an effect if the tree endpoint
 * actually returns `lastCommit` for this query — confirm against the HF API
 * (it may require an extra `expand` parameter).
 *
 * @param {string} datasetRepo - `<owner>/<name>` of the public dataset.
 * @param {?string} sinceISO - ISO timestamp cutoff, or null/'' for "all".
 * @returns {Promise<Array<object>>} Tree entries (each has a string `path`).
 *   Empty when `datasetRepo` is falsy or the response body is not an array.
 * @throws {Error} When the listing responds with a non-OK status. Network
 *   and JSON-parse failures from `fetch` also propagate; callers that want
 *   a silent fallback must catch them.
 */
async function listRecentRunFiles(datasetRepo, sinceISO) {
  if (!datasetRepo) return [];

  // Cache-bust the listing — HF's CDN can serve a stale tree response, and
  // we specifically care about reading-our-own-write after a submit.
  const url = `${HF}/api/datasets/${datasetRepo}/tree/main/runs?recursive=1&_=${Date.now()}`;
  const resp = await fetch(url, { cache: 'no-store' });
  if (!resp.ok) {
    throw new Error(`tree listing ${resp.status} ${resp.statusText}`);
  }

  const tree = await resp.json();
  if (!Array.isArray(tree)) return [];

  const cutoff = sinceISO ? new Date(sinceISO).getTime() - CLOCK_SKEW_MS : 0;

  const files = tree.filter((entry) => {
    // Skip malformed entries (null, or no string path) instead of letting
    // one bad row abort the whole listing with a TypeError.
    if (entry?.type !== 'file' || typeof entry.path !== 'string') return false;
    if (!entry.path.endsWith('.json')) return false;
    // A zero or NaN cutoff means "no time filter".
    if (!cutoff) return true;
    const committedAt = entry.lastCommit?.date
      ? new Date(entry.lastCommit.date).getTime()
      : NaN;
    // Files with no commit timestamp pass through — better to over-include
    // than miss the user's own freshly-pushed submission.
    return Number.isNaN(committedAt) || committedAt > cutoff;
  });

  return files.slice(0, MAX_FETCH);
}
/**
 * Download one file from the dataset's `main` revision and parse it as JSON.
 *
 * @param {string} datasetRepo - `<owner>/<name>` of the public dataset.
 * @param {string} filePath - Repo-relative path, e.g. `runs/foo.json`.
 * @returns {Promise<*>} The parsed JSON body.
 * @throws {Error} When the response status is not OK. Network and JSON-parse
 *   failures from `fetch` propagate as well.
 */
async function fetchRunFile(datasetRepo, filePath) {
  // Percent-encode each path segment so characters such as `#`, `?` or `%`
  // in a file name cannot be misread as URL syntax. The `/` separators are
  // kept so nested paths still resolve.
  const encodedPath = String(filePath)
    .split('/')
    .map((segment) => encodeURIComponent(segment))
    .join('/');
  const url = `${HF}/datasets/${datasetRepo}/resolve/main/${encodedPath}`;
  const resp = await fetch(url, { cache: 'no-store' });
  if (!resp.ok) throw new Error(`fetch ${filePath}: ${resp.status}`);
  return resp.json();
}
/**
 * List the dataset's `runs/` tree with no time cutoff and download the
 * listed files. Rate-limiting and caching are left to the caller.
 *
 * @param {string} datasetRepo - `<owner>/<name>` of the public dataset.
 * @returns {Promise<{records: Array, machines: Array, fileCount: number}>}
 */
export async function fetchAllRuns(datasetRepo) {
  // A null cutoff disables the "newer than" filter in the listing step.
  const runFiles = await listRecentRunFiles(datasetRepo, null);
  return fetchRunsBatch(datasetRepo, runFiles);
}
/**
 * Download the given run files in parallel and fold their records into
 * dashboard rows plus one summary entry per machine slug.
 *
 * Best-effort by design: a file whose download fails, or whose body is not
 * an array, is skipped. Individual array items that are not objects are
 * skipped too, so one malformed submission cannot abort the whole load.
 *
 * @param {string} datasetRepo - `<owner>/<name>` of the public dataset.
 * @param {Array<{path: string}>} files - Tree entries to download.
 * @returns {Promise<{records: Array<object>, machines: Array<object>, fileCount: number}>}
 *   `fileCount` is the number of files requested, not the number that
 *   downloaded successfully.
 */
async function fetchRunsBatch(datasetRepo, files) {
  if (files.length === 0) return { records: [], machines: [], fileCount: 0 };

  const records = [];
  const machinesBySlug = new Map();
  // Most-recent userReported.machineName per slug — the same machine can be
  // submitted by multiple people who'd label it differently.
  const userNameBySlug = new Map(); // slug → { name, ts }

  // Fetch in parallel — HF's CDN handles concurrent reads fine.
  const results = await Promise.allSettled(
    files.map((f) => fetchRunFile(datasetRepo, f.path)),
  );

  for (const res of results) {
    if (res.status !== 'fulfilled' || !Array.isArray(res.value)) continue;

    for (const r of res.value) {
      // The dataset is user-submitted: ignore null / primitive entries
      // rather than throwing on `r.machine`.
      if (r === null || typeof r !== 'object') continue;

      const slug = generateSlug(r.machine);
      records.push(flattenForDashboard(r, slug));

      // The first record seen for a slug defines the machine summary.
      if (!machinesBySlug.has(slug) && r.machine) {
        machinesBySlug.set(slug, {
          slug,
          cpus: r.machine.cpus || 'unknown',
          platform: r.machine.platform || 'unknown',
          arch: r.machine.arch || 'unknown',
          totalMemoryGB: r.machine.totalMemoryGB || 0,
          submittedAt: r.timestamp || new Date().toISOString(),
          // Per-machine resultCount/passCount get computed by the caller
          // after the merge — leaving them as 0 here is a placeholder.
          resultCount: 0,
          passCount: 0,
          userMachineName: null,
          llamaCppCommit: r.llamaCppCommit ?? null,
          llamaCppDescribe: r.llamaCppDescribe ?? null,
        });
      }

      // Only string names are usable; anything else is treated as absent.
      const rawName = r.userReported?.machineName;
      const userName = typeof rawName === 'string' ? rawName.trim() : '';
      if (userName) {
        const ts = r.timestamp || '';
        const cur = userNameBySlug.get(slug);
        // Plain string comparison of the timestamps; a missing timestamp
        // ('') never replaces an existing name.
        if (!cur || ts > cur.ts) userNameBySlug.set(slug, { name: userName, ts });
      }
    }
  }

  // Attach the latest user-supplied label to each machine summary.
  for (const [slug, { name }] of userNameBySlug) {
    const m = machinesBySlug.get(slug);
    if (m) m.userMachineName = name;
  }

  return { records, machines: [...machinesBySlug.values()], fileCount: files.length };
}
/**
 * Flatten a raw dataset record into the same shape `scripts/build-site.js`
 * produces. Keep field-for-field aligned with build-site.js so the merged
 * results are indistinguishable from the baseline.
 *
 * Malformed optional parts of a record (null entries in `metrics.tests`,
 * non-string test names, a non-string `userReported.machineName`) are
 * treated as absent instead of throwing.
 *
 * @param {object} r - Raw record from a run file (must be a non-null object).
 * @param {string} slug - Machine slug to stamp on the row as `machineSlug`.
 * @returns {object} Flat row; missing optional values are `null`.
 */
function flattenForDashboard(r, slug) {
  const metrics = r.metrics;

  // New-format records have metrics.tests = [{name:'pp512',...},{name:'tg128',...}].
  // Old-format records have flat metrics.prefill_tok_s / decode_tok_s only.
  // Surface both shapes so the table can render llama-bench-style "avg \u00b1 stddev"
  // when stddev is available without breaking on older rows.
  const tests = Array.isArray(metrics?.tests) ? metrics.tests : null;
  // First test whose name starts with the prefix; tolerates junk entries.
  const findTest = (prefix) =>
    tests?.find((t) => typeof t?.name === 'string' && t.name.startsWith(prefix)) ?? null;
  const pp = findTest('pp');
  const tg = findTest('tg');

  // Blank or non-string names collapse to null.
  const rawMachineName = r.userReported?.machineName;
  const userMachineName =
    typeof rawMachineName === 'string' ? rawMachineName.trim() || null : null;

  return {
    machineSlug: slug,
    timestamp: r.timestamp,
    browser: r.browser,
    model: r.model,
    repo: r.repo,
    variant: r.variant,
    filename: r.filename,
    sizeMB: r.sizeMB,
    status: r.status,
    error: r.error,
    buildType: r.buildType,
    webgpuAvailable: r.webgpuAvailable,
    nGpuLayers: r.nGpuLayers ?? null,
    wallTimeMs: r.wallTimeMs,
    prefill_tok_s: metrics?.prefill_tok_s ?? null,
    decode_tok_s: metrics?.decode_tok_s ?? null,
    // llama-bench shape: per-test stddev + the test labels (pp{N} / tg{N})
    prefill_stddev_ts: pp?.stddev_ts ?? metrics?.prefill_tok_s_stdev ?? null,
    decode_stddev_ts: tg?.stddev_ts ?? metrics?.decode_tok_s_stdev ?? null,
    pp_test_name: pp?.name ?? null,
    tg_test_name: tg?.name ?? null,
    pp_n_prompt: pp?.n_prompt ?? r.nPrompt ?? null,
    tg_n_gen: tg?.n_gen ?? r.nGen ?? null,
    // KV-cache depth the timed reps ran at. Mirrors llama-bench's `-d` and
    // is per-test in metrics.tests; record-level r.nDepth is the
    // study/runner-set value, used as a fallback for older exports.
    n_depth: pp?.n_depth ?? tg?.n_depth ?? r.nDepth ?? 0,
    n_p_eval: metrics?.n_p_eval ?? null,
    t_p_eval_ms: metrics?.t_p_eval_ms ?? null,
    n_eval: metrics?.n_eval ?? null,
    t_eval_ms: metrics?.t_eval_ms ?? null,
    consistency_rate: r.consistency?.agreement_rate ?? null,
    consistency_first_disagree: r.consistency?.first_disagreement ?? null,
    // Keep these in sync with scripts/build-site.js — the dashboard merges
    // baseline (combined.json) and live (here) records into one table.
    cpu_baseline_prefill_tok_s: r.cpu_baseline?.prefill_tok_s ?? null,
    cpu_baseline_decode_tok_s: r.cpu_baseline?.decode_tok_s ?? null,
    llamaCppCommit: r.llamaCppCommit ?? null,
    llamaCppDescribe: r.llamaCppDescribe ?? null,
    dawnTag: r.dawnTag ?? null,
    submittedBy: r.submittedBy ?? null,
    userMachineName,
    iterations: metrics?.iterations ?? null,
  };
}
// Mirror of scripts/_hub.mjs:generateSlug — keep in sync.
/**
 * Derive the machine slug for a record's `machine` object.
 *
 * A truthy `machine.slug` wins outright. Otherwise the slug is assembled as
 * `<slugified cpus>-<totalMemoryGB>gb-<platform>`, with 'unknown' / 0
 * standing in for missing or falsy parts.
 *
 * @param {?object} machine - Machine descriptor; may be null/undefined.
 * @returns {string}
 */
function generateSlug(machine) {
  const { slug, cpus, totalMemoryGB, platform } = machine ?? {};
  if (slug) return slug;

  const cpuPart = slugify(cpus || 'unknown');
  const ramPart = totalMemoryGB || 0;
  const platformPart = platform || 'unknown';
  return `${cpuPart}-${ramPart}gb-${platformPart}`;
}
/**
 * Turn any value into a lowercase, dash-separated token: the value is
 * stringified and lowercased, each run of characters outside `a-z0-9`
 * becomes a single `-`, and one leading / trailing dash is removed.
 *
 * @param {*} s - Value to slugify.
 * @returns {string}
 */
function slugify(s) {
  const lowered = String(s).toLowerCase();
  const dashed = lowered.replace(/[^a-z0-9]+/g, '-');
  return dashed.replace(/^-|-$/g, '');
}