// Live read of recent submissions from the HF leaderboard dataset.
//
// The dashboard's static `data/combined.json` is rebuilt only when CI runs,
// so freshly-submitted results don't appear until the next code push. This
// module fetches files written to the dataset since `combined.json` was
// last generated and merges them into the dashboard at load time.
//
// HF endpoints used (no auth, public dataset):
// GET /api/datasets/<repo>/tree/main/runs?recursive=1 → file listing
// GET /datasets/<repo>/resolve/main/<path> → file content
//
// Both endpoints support CORS for public datasets so we can call them
// directly from the dashboard.
// Origin of the Hugging Face Hub; prefixed to the tree-listing and file URLs
// built by the fetch helpers in this module.
const HF = 'https://huggingface.co';
// Safety window for clock skew between the dataset commit timestamps and
// the `meta.generatedAt` we compare against. 10 minutes should be more
// than enough — the cost of overshooting is just a few extra files that
// dedupe out anyway.
const CLOCK_SKEW_MS = 10 * 60 * 1000;
// Cap on parallel/total file fetches per dashboard load. The dashboard now
// pulls the entire dataset live (no static baseline), so this cap is the
// upper bound on how many run files the page will download at once. 1000
// is conservative — actual bench submissions are typically ≤ 1 KB each so
// the bandwidth ceiling is well under a megabyte even at the cap.
const MAX_FETCH = 1000;
/* Fetch the runs/ tree from the dataset and return the `.json` file entries
   that look newer than `sinceISO` (with a clock-skew buffer applied).

   - datasetRepo: dataset id such as `owner/name`; a falsy value yields [].
   - sinceISO: ISO timestamp lower bound. A falsy or unparseable value
     disables the date filter, so every `.json` file is returned.

   Resolves to at most MAX_FETCH tree entries, in listing order. A response
   body that is not an array resolves to []. Malformed entries (null, or
   without a string `path`) are skipped rather than aborting the listing.

   Throws an Error when the listing request answers with a non-OK status;
   rejections from fetch() or resp.json() propagate unchanged, so the caller
   decides how to surface a failed load. */
async function listRecentRunFiles(datasetRepo, sinceISO) {
  if (!datasetRepo) return [];

  // Cache-bust the listing — HF's CDN can serve a stale tree response, and
  // we specifically care about reading-our-own-write after a submit.
  const url = `${HF}/api/datasets/${datasetRepo}/tree/main/runs?recursive=1&_=${Date.now()}`;
  const resp = await fetch(url, { cache: 'no-store' });
  if (!resp.ok) {
    throw new Error(`tree listing ${resp.status} ${resp.statusText}`);
  }

  const tree = await resp.json();
  if (!Array.isArray(tree)) return [];

  // An unparseable `sinceISO` gives NaN; treat it like "no cutoff" explicitly
  // instead of relying on NaN being falsy further down.
  const sinceMs = sinceISO ? new Date(sinceISO).getTime() : NaN;
  const cutoff = Number.isNaN(sinceMs) ? 0 : sinceMs - CLOCK_SKEW_MS;

  const isRunJson = (entry) =>
    entry?.type === 'file' &&
    typeof entry.path === 'string' &&
    entry.path.endsWith('.json');

  const isRecentEnough = (entry) => {
    if (!cutoff) return true;
    const committedMs = entry.lastCommit?.date
      ? new Date(entry.lastCommit.date).getTime()
      : NaN;
    // Files with no usable commit timestamp pass through — better to
    // over-include than miss the user's own freshly-pushed submission.
    return Number.isNaN(committedMs) || committedMs > cutoff;
  };

  return tree
    .filter((entry) => isRunJson(entry) && isRecentEnough(entry))
    .slice(0, MAX_FETCH);
}
/* Download and JSON-parse one file from the dataset's `main` revision.

   - datasetRepo: dataset id such as `owner/name`.
   - filePath: repo-relative path as reported by the tree listing.

   Each path segment is percent-encoded so names containing characters such
   as `#`, `?` or spaces resolve to the intended file instead of truncating
   or rewriting the URL. The `/` separators are kept as-is.

   Throws an Error carrying the path and HTTP status on a non-OK response;
   rejections from fetch() or resp.json() propagate unchanged. */
async function fetchRunFile(datasetRepo, filePath) {
  const encodedPath = String(filePath)
    .split('/')
    .map((segment) => encodeURIComponent(segment))
    .join('/');
  const url = `${HF}/datasets/${datasetRepo}/resolve/main/${encodedPath}`;
  const resp = await fetch(url, { cache: 'no-store' });
  if (!resp.ok) throw new Error(`fetch ${filePath}: ${resp.status}`);
  return resp.json();
}
/* Entry point for a full live load: list the `runs/` tree with no date
   cutoff (null), then download whatever the listing returned. Rate-limiting
   and caching are left to the caller. */
export async function fetchAllRuns(datasetRepo) {
  const runFiles = await listRecentRunFiles(datasetRepo, null);
  return fetchRunsBatch(datasetRepo, runFiles);
}
/* Download every listed run file and fold the contents into dashboard rows.

   - datasetRepo: dataset id such as `owner/name`.
   - files: tree entries; only `path` is read from each.

   Resolves to `{ records, machines, fileCount }`:
   - records: one flattened row per usable record, in file/record order.
   - machines: one entry per slug, built from the first record seen for that
     slug that carries a `machine` object.
   - fileCount: number of files requested (not the number that succeeded).

   Files that fail to download or do not parse to an array are dropped.
   Records that are not objects are skipped so one malformed entry in a
   public submission cannot discard the whole batch. */
async function fetchRunsBatch(datasetRepo, files) {
  if (files.length === 0) return { records: [], machines: [], fileCount: 0 };

  const records = [];
  const machinesBySlug = new Map();
  // Most-recent userReported.machineName per slug — the same machine can be
  // submitted by multiple people who'd label it differently.
  const userNameBySlug = new Map(); // slug → { name, ts }

  // Fetch in parallel — HF's CDN handles concurrent reads fine.
  const results = await Promise.allSettled(
    files.map((f) => fetchRunFile(datasetRepo, f.path)),
  );

  for (const res of results) {
    if (res.status !== 'fulfilled' || !Array.isArray(res.value)) continue;

    for (const r of res.value) {
      // Guard against null / primitive array elements before dereferencing.
      if (r === null || typeof r !== 'object') continue;

      const slug = generateSlug(r.machine);
      records.push(flattenForDashboard(r, slug));

      if (!machinesBySlug.has(slug) && r.machine) {
        machinesBySlug.set(slug, {
          slug,
          cpus: r.machine.cpus || 'unknown',
          platform: r.machine.platform || 'unknown',
          arch: r.machine.arch || 'unknown',
          totalMemoryGB: r.machine.totalMemoryGB || 0,
          submittedAt: r.timestamp || new Date().toISOString(),
          // Per-machine resultCount/passCount get computed by the caller
          // after the merge — leaving them as 0 here is a placeholder.
          resultCount: 0,
          passCount: 0,
          userMachineName: null,
          llamaCppCommit: r.llamaCppCommit ?? null,
          llamaCppDescribe: r.llamaCppDescribe ?? null,
        });
      }

      // Only strings can be trimmed; anything else counts as "no name".
      const rawName = r.userReported?.machineName;
      const userName = typeof rawName === 'string' ? rawName.trim() : '';
      if (userName) {
        const ts = r.timestamp || '';
        const cur = userNameBySlug.get(slug);
        // Timestamps are compared as strings, as in the record itself.
        if (!cur || ts > cur.ts) userNameBySlug.set(slug, { name: userName, ts });
      }
    }
  }

  for (const [slug, { name }] of userNameBySlug) {
    const m = machinesBySlug.get(slug);
    if (m) m.userMachineName = name;
  }

  return { records, machines: [...machinesBySlug.values()], fileCount: files.length };
}
/* Flatten a raw dataset record into the same shape `scripts/build-site.js`
   produces. Keep field-for-field aligned with build-site.js so the merged
   results are indistinguishable from the baseline.

   - r: raw record object from a run file.
   - slug: machine slug already computed for this record.

   Returns a flat row; absent optional values become null, except the
   directly copied record fields (timestamp, browser, model, ...) which stay
   undefined when the record lacks them.

   Malformed input is tolerated rather than thrown on: null entries or
   non-string `name`s inside `metrics.tests` are ignored, and a non-string
   `userReported.machineName` yields null. */
function flattenForDashboard(r, slug) {
  // New-format records have metrics.tests = [{name:'pp512',...},{name:'tg128',...}].
  // Old-format records have flat metrics.prefill_tok_s / decode_tok_s only.
  // Surface both shapes so the table can render llama-bench-style "avg \u00b1 stddev"
  // when stddev is available without breaking on older rows.
  const tests = Array.isArray(r.metrics?.tests) ? r.metrics.tests : null;
  const findTest = (prefix) =>
    tests?.find((t) => typeof t?.name === 'string' && t.name.startsWith(prefix)) || null;
  const pp = findTest('pp');
  const tg = findTest('tg');

  const rawMachineName = r.userReported?.machineName;
  const userMachineName =
    (typeof rawMachineName === 'string' && rawMachineName.trim()) || null;

  return {
    machineSlug: slug,
    timestamp: r.timestamp,
    browser: r.browser,
    model: r.model,
    repo: r.repo,
    variant: r.variant,
    filename: r.filename,
    sizeMB: r.sizeMB,
    status: r.status,
    error: r.error,
    buildType: r.buildType,
    webgpuAvailable: r.webgpuAvailable,
    nGpuLayers: r.nGpuLayers ?? null,
    wallTimeMs: r.wallTimeMs,
    prefill_tok_s: r.metrics?.prefill_tok_s ?? null,
    decode_tok_s: r.metrics?.decode_tok_s ?? null,
    // llama-bench shape: per-test stddev + the test labels (pp{N} / tg{N})
    prefill_stddev_ts: pp?.stddev_ts ?? r.metrics?.prefill_tok_s_stdev ?? null,
    decode_stddev_ts: tg?.stddev_ts ?? r.metrics?.decode_tok_s_stdev ?? null,
    pp_test_name: pp?.name ?? null,
    tg_test_name: tg?.name ?? null,
    pp_n_prompt: pp?.n_prompt ?? r.nPrompt ?? null,
    tg_n_gen: tg?.n_gen ?? r.nGen ?? null,
    // KV-cache depth the timed reps ran at. Mirrors llama-bench's `-d` and
    // is per-test in metrics.tests; record-level r.nDepth is the
    // study/runner-set value, used as a fallback for older exports.
    n_depth: pp?.n_depth ?? tg?.n_depth ?? r.nDepth ?? 0,
    n_p_eval: r.metrics?.n_p_eval ?? null,
    t_p_eval_ms: r.metrics?.t_p_eval_ms ?? null,
    n_eval: r.metrics?.n_eval ?? null,
    t_eval_ms: r.metrics?.t_eval_ms ?? null,
    consistency_rate: r.consistency?.agreement_rate ?? null,
    consistency_first_disagree: r.consistency?.first_disagreement ?? null,
    // Keep these in sync with scripts/build-site.js — the dashboard merges
    // baseline (combined.json) and live (here) records into one table.
    cpu_baseline_prefill_tok_s: r.cpu_baseline?.prefill_tok_s ?? null,
    cpu_baseline_decode_tok_s: r.cpu_baseline?.decode_tok_s ?? null,
    llamaCppCommit: r.llamaCppCommit ?? null,
    llamaCppDescribe: r.llamaCppDescribe ?? null,
    dawnTag: r.dawnTag ?? null,
    submittedBy: r.submittedBy ?? null,
    userMachineName,
    iterations: r.metrics?.iterations ?? null,
  };
}
// Mirror of scripts/_hub.mjs:generateSlug — keep in sync.
// A machine that already carries a truthy `slug` keeps it; otherwise the slug
// is composed as "<slugified cpus>-<totalMemoryGB>gb-<platform>", substituting
// 'unknown' / 0 for missing or falsy parts.
function generateSlug(machine) {
  const presetSlug = machine?.slug;
  if (presetSlug) return presetSlug;

  const cpuPart = slugify(machine?.cpus || 'unknown');
  const ramGb = machine?.totalMemoryGB || 0;
  const platformPart = machine?.platform || 'unknown';
  return `${cpuPart}-${ramGb}gb-${platformPart}`;
}
// Lower-case the stringified input and join its runs of [a-z0-9] with single
// dashes, so the result never starts or ends with a dash.
function slugify(s) {
  const lowered = String(s).toLowerCase();
  const words = lowered.split(/[^a-z0-9]+/).filter((word) => word !== '');
  return words.join('-');
}
|