// Live read of recent submissions from the HF leaderboard dataset.
//
// The dashboard's static `data/combined.json` is rebuilt only when CI runs,
// so freshly-submitted results don't appear until the next code push. This
// module fetches files written to the dataset since `combined.json` was
// last generated and merges them into the dashboard at load time.
//
// NOTE(review): the exported `fetchAllRuns` below passes no cutoff and so
// downloads every run file; the "since combined.json" filtering only applies
// when a `sinceISO` is handed to `listRecentRunFiles`.
//
// HF endpoints used (no auth, public dataset):
//   GET /api/datasets/<repo>/tree/main/runs?recursive=1  → file listing
//   GET /datasets/<repo>/resolve/main/<path>             → file content
//
// Both endpoints support CORS for public datasets so we can call them
// directly from the dashboard.

const HF = 'https://huggingface.co';

// Safety window for clock skew between the dataset commit timestamps and
// the `meta.generatedAt` we compare against. 10 minutes should be more
// than enough — the cost of overshooting is just a few extra files that
// dedupe out anyway.
const CLOCK_SKEW_MS = 10 * 60 * 1000;

// Cap on parallel/total file fetches per dashboard load. The dashboard now
// pulls the entire dataset live (no static baseline), so this cap is the
// upper bound on how many run files the page will download at once. 1000
// is conservative — actual bench submissions are typically ≤ 1 KB each so
// the bandwidth ceiling is well under a megabyte even at the cap.
const MAX_FETCH = 1000;

/**
 * Fetch the `runs/` tree from the dataset and return the `.json` file entries
 * that look newer than `sinceISO` (with a clock-skew buffer applied).
 *
 * @param {string} datasetRepo - Dataset id interpolated into the HF URL.
 *   An empty/missing value short-circuits to `[]` without any request.
 * @param {?string} sinceISO - Timestamp to filter against; falsy (or
 *   unparseable) means "no cutoff, keep every file".
 * @returns {Promise<Array<object>>} At most MAX_FETCH tree entries. A
 *   successful response whose body is not a JSON array yields `[]`.
 * @throws {Error} When the request itself rejects, the response is not OK
 *   (`tree listing <status> <statusText>`), or the body is not valid JSON.
 *   Callers that want a silent fallback must catch this themselves.
 */
async function listRecentRunFiles(datasetRepo, sinceISO) {
  if (!datasetRepo) return [];

  // Cache-bust the listing — HF's CDN can serve a stale tree response, and
  // we specifically care about reading-our-own-write after a submit.
  const url = `${HF}/api/datasets/${datasetRepo}/tree/main/runs?recursive=1&_=${Date.now()}`;
  const resp = await fetch(url, { cache: 'no-store' });
  if (!resp.ok) {
    throw new Error(`tree listing ${resp.status} ${resp.statusText}`);
  }

  const tree = await resp.json();
  if (!Array.isArray(tree)) return [];

  // 0 (no sinceISO) and NaN (unparseable sinceISO) both mean "no cutoff".
  const cutoff = sinceISO ? new Date(sinceISO).getTime() - CLOCK_SKEW_MS : 0;

  const files = tree
    // Tolerate malformed entries (null, missing path) instead of throwing.
    .filter(it => it?.type === 'file'
      && typeof it.path === 'string'
      && it.path.endsWith('.json'))
    .filter(it => {
      if (!cutoff) return true;
      const t = it.lastCommit?.date ? new Date(it.lastCommit.date).getTime() : NaN;
      // Files with no commit timestamp pass through — better to over-include
      // than miss the user's own freshly-pushed submission.
      return Number.isNaN(t) ? true : t > cutoff;
    });

  return files.slice(0, MAX_FETCH);
}

// Percent-encode each segment of a repo-relative path while keeping the `/`
// separators, so characters like `#`, `?` or spaces survive the request URL.
function encodeRepoPath(filePath) {
  return String(filePath)
    .split('/')
    .map(segment => encodeURIComponent(segment))
    .join('/');
}

/**
 * Download and JSON-parse a single file from the dataset.
 *
 * @param {string} datasetRepo - Dataset id interpolated into the HF URL.
 * @param {string} filePath - Repo-relative path, e.g. `runs/foo.json`.
 * @returns {Promise<*>} The parsed JSON body.
 * @throws {Error} On a non-OK response (`fetch <path>: <status>`), a
 *   rejected request, or an unparseable body.
 */
async function fetchRunFile(datasetRepo, filePath) {
  const url = `${HF}/datasets/${datasetRepo}/resolve/main/${encodeRepoPath(filePath)}`;
  const resp = await fetch(url, { cache: 'no-store' });
  if (!resp.ok) throw new Error(`fetch ${filePath}: ${resp.status}`);
  return resp.json();
}

/**
 * List the dataset tree and download every file in `runs/` (up to
 * MAX_FETCH). Caller is responsible for rate-limiting/caching.
 *
 * @param {string} datasetRepo - Dataset id interpolated into the HF URLs.
 * @returns {Promise<{records: object[], machines: object[], fileCount: number}>}
 * @throws {Error} If the tree listing fails; individual file failures are
 *   skipped rather than thrown.
 */
export async function fetchAllRuns(datasetRepo) {
  return fetchRunsBatch(datasetRepo, await listRecentRunFiles(datasetRepo, null));
}

/**
 * Download the given tree entries in parallel and fold their records into
 * dashboard rows plus one machine summary per slug.
 *
 * @param {string} datasetRepo - Dataset id interpolated into the HF URLs.
 * @param {Array<{path: string}>} files - Tree entries to download.
 * @returns {Promise<{records: object[], machines: object[], fileCount: number}>}
 *   `fileCount` is the number of files requested, not the number that
 *   downloaded successfully.
 */
async function fetchRunsBatch(datasetRepo, files) {
  if (files.length === 0) return { records: [], machines: [], fileCount: 0 };

  const records = [];
  const machinesBySlug = new Map();
  // Most-recent userReported.machineName per slug — the same machine can be
  // submitted by multiple people who'd label it differently.
  const userNameBySlug = new Map(); // slug → { name, ts }

  // Fetch in parallel — HF's CDN handles concurrent reads fine.
  const results = await Promise.allSettled(
    files.map(f => fetchRunFile(datasetRepo, f.path)),
  );

  for (const res of results) {
    // Failed downloads and files whose body is not an array are skipped.
    if (res.status !== 'fulfilled' || !Array.isArray(res.value)) continue;

    for (const r of res.value) {
      // A single malformed element (null, string, number) must not abort the
      // whole merge — skip it and keep the good records.
      if (r === null || typeof r !== 'object') continue;

      const slug = generateSlug(r.machine);
      records.push(flattenForDashboard(r, slug));

      // The first record seen for a slug defines the machine summary.
      if (!machinesBySlug.has(slug) && r.machine) {
        machinesBySlug.set(slug, {
          slug,
          cpus: r.machine.cpus || 'unknown',
          platform: r.machine.platform || 'unknown',
          arch: r.machine.arch || 'unknown',
          totalMemoryGB: r.machine.totalMemoryGB || 0,
          submittedAt: r.timestamp || new Date().toISOString(),
          // Per-machine resultCount/passCount get computed by the caller
          // after the merge — leaving them as 0 here is a placeholder.
          resultCount: 0,
          passCount: 0,
          userMachineName: null,
          llamaCppCommit: r.llamaCppCommit ?? null,
          llamaCppDescribe: r.llamaCppDescribe ?? null,
        });
      }

      const userName = r.userReported?.machineName?.trim();
      if (userName) {
        // Timestamps are compared as strings; a missing one sorts first.
        const ts = r.timestamp || '';
        const cur = userNameBySlug.get(slug);
        if (!cur || ts > cur.ts) userNameBySlug.set(slug, { name: userName, ts });
      }
    }
  }

  for (const [slug, { name }] of userNameBySlug) {
    const m = machinesBySlug.get(slug);
    if (m) m.userMachineName = name;
  }

  return { records, machines: [...machinesBySlug.values()], fileCount: files.length };
}

// First entry of `tests` whose string `name` starts with `prefix`, or null.
// Non-object entries and non-string names are ignored rather than thrown on.
function findTestByPrefix(tests, prefix) {
  if (!tests) return null;
  const hit = tests.find(t => typeof t?.name === 'string' && t.name.startsWith(prefix));
  return hit || null;
}

/**
 * Flatten a raw dataset record into the same shape `scripts/build-site.js`
 * produces. Keep field-for-field aligned with build-site.js so the merged
 * results are indistinguishable from the baseline.
 *
 * @param {object} r - Raw record from a run file.
 * @param {string} slug - Machine slug stored as `machineSlug` on the row.
 * @returns {object} Flat row; optional fields default to `null`
 *   (`n_depth` defaults to 0).
 */
function flattenForDashboard(r, slug) {
  // New-format records have metrics.tests = [{name:'pp512',...},{name:'tg128',...}].
  // Old-format records have flat metrics.prefill_tok_s / decode_tok_s only.
  // Surface both shapes so the table can render llama-bench-style "avg ± stddev"
  // when stddev is available without breaking on older rows.
  const tests = Array.isArray(r.metrics?.tests) ? r.metrics.tests : null;
  const pp = findTestByPrefix(tests, 'pp');
  const tg = findTestByPrefix(tests, 'tg');

  return {
    machineSlug: slug,
    timestamp: r.timestamp,
    browser: r.browser,
    model: r.model,
    repo: r.repo,
    variant: r.variant,
    filename: r.filename,
    sizeMB: r.sizeMB,
    status: r.status,
    error: r.error,
    buildType: r.buildType,
    webgpuAvailable: r.webgpuAvailable,
    nGpuLayers: r.nGpuLayers ?? null,
    wallTimeMs: r.wallTimeMs,
    prefill_tok_s: r.metrics?.prefill_tok_s ?? null,
    decode_tok_s: r.metrics?.decode_tok_s ?? null,
    // llama-bench shape: per-test stddev + the test labels (pp{N} / tg{N})
    prefill_stddev_ts: pp?.stddev_ts ?? r.metrics?.prefill_tok_s_stdev ?? null,
    decode_stddev_ts: tg?.stddev_ts ?? r.metrics?.decode_tok_s_stdev ?? null,
    pp_test_name: pp?.name ?? null,
    tg_test_name: tg?.name ?? null,
    pp_n_prompt: pp?.n_prompt ?? r.nPrompt ?? null,
    tg_n_gen: tg?.n_gen ?? r.nGen ?? null,
    // KV-cache depth the timed reps ran at. Mirrors llama-bench's `-d` and
    // is per-test in metrics.tests; record-level r.nDepth is the
    // study/runner-set value, used as a fallback for older exports.
    n_depth: pp?.n_depth ?? tg?.n_depth ?? r.nDepth ?? 0,
    n_p_eval: r.metrics?.n_p_eval ?? null,
    t_p_eval_ms: r.metrics?.t_p_eval_ms ?? null,
    n_eval: r.metrics?.n_eval ?? null,
    t_eval_ms: r.metrics?.t_eval_ms ?? null,
    consistency_rate: r.consistency?.agreement_rate ?? null,
    consistency_first_disagree: r.consistency?.first_disagreement ?? null,
    // Keep these in sync with scripts/build-site.js — the dashboard merges
    // baseline (combined.json) and live (here) records into one table.
    cpu_baseline_prefill_tok_s: r.cpu_baseline?.prefill_tok_s ?? null,
    cpu_baseline_decode_tok_s: r.cpu_baseline?.decode_tok_s ?? null,
    llamaCppCommit: r.llamaCppCommit ?? null,
    llamaCppDescribe: r.llamaCppDescribe ?? null,
    dawnTag: r.dawnTag ?? null,
    submittedBy: r.submittedBy ?? null,
    userMachineName: r.userReported?.machineName?.trim() || null,
    iterations: r.metrics?.iterations ?? null,
  };
}

// Mirror of scripts/_hub.mjs:generateSlug — keep in sync.
// An explicit `machine.slug` wins; otherwise the slug is built as
// `<slugified cpus>-<totalMemoryGB>gb-<platform>` with 'unknown'/0 defaults.
function generateSlug(machine) {
  if (machine?.slug) return machine.slug;
  const cpu = slugify(machine?.cpus || 'unknown');
  const ram = machine?.totalMemoryGB || 0;
  const platform = machine?.platform || 'unknown';
  return `${cpu}-${ram}gb-${platform}`;
}

// Lower-case `s`, collapse every run of non-[a-z0-9] characters into a single
// '-', and strip one leading/trailing '-'.
function slugify(s) {
  return String(s).toLowerCase().replace(/[^a-z0-9]+/g, '-').replace(/^-|-$/g, '');
}