// Spaces: abhijitramesh / webgpu-bench — Running
// File size: 8,576 Bytes

// Live read of recent submissions from the HF leaderboard dataset.
//
// The dashboard's static `data/combined.json` is rebuilt only when CI runs,
// so freshly-submitted results don't appear until the next code push. This
// module fetches files written to the dataset since `combined.json` was
// last generated and merges them into the dashboard at load time.
//
// HF endpoints used (no auth, public dataset):
//   GET /api/datasets/<repo>/tree/main/runs?recursive=1   → file listing
//   GET /datasets/<repo>/resolve/main/<path>              → file content
//
// Both endpoints support CORS for public datasets so we can call them
// directly from the dashboard.

// Origin that both the tree-listing and the file-download URLs in this
// module are built on.
const HF = 'https://huggingface.co';

// Safety window for clock skew between the dataset commit timestamps and
// the `sinceISO` cutoff they are compared against. It is subtracted from
// the cutoff, so it can only widen the set of files considered "recent".
// 10 minutes should be more than enough — the cost of overshooting is just
// a few extra files that dedupe out anyway.
const CLOCK_SKEW_MS = 10 * 60 * 1000;

// Cap on parallel/total file fetches per dashboard load. The file listing
// is truncated to this many entries and every remaining entry is downloaded
// in one parallel batch, so this is the upper bound on how many run files
// the page will request at once. The dashboard now pulls the entire dataset
// live (no static baseline). 1000 is conservative — actual bench
// submissions are typically ≤ 1 KB each, so the bandwidth ceiling is on the
// order of a megabyte even at the cap.
const MAX_FETCH = 1000;

/* Fetch the runs/ tree from the dataset and return its `.json` file
   entries, optionally restricted to those committed after `sinceISO`.

   datasetRepo  dataset repo id as it appears in the URL path. A falsy
                value short-circuits to [] without touching the network.
   sinceISO     timestamp string, or null/'' for "no cutoff". CLOCK_SKEW_MS
                is subtracted before comparing. A value that does not parse
                as a date also disables the cutoff.

   Resolves to at most MAX_FETCH tree entries, in the order the listing
   returned them (the cap keeps the first entries, not the newest). A
   response body that is not an array yields [].

   Rejects when the listing answers with a non-2xx status; network and
   JSON-parse errors from fetch()/resp.json() propagate unchanged. Nothing
   is caught here, so any fallback behaviour is the caller's job. */
async function listRecentRunFiles(datasetRepo, sinceISO) {
  if (!datasetRepo) return [];
  // Cache-bust the listing — HF's CDN can serve a stale tree response, and
  // we specifically care about reading-our-own-write after a submit.
  const url = `${HF}/api/datasets/${datasetRepo}/tree/main/runs?recursive=1&_=${Date.now()}`;
  const resp = await fetch(url, { cache: 'no-store' });
  if (!resp.ok) {
    throw new Error(`tree listing ${resp.status} ${resp.statusText}`);
  }
  const tree = await resp.json();
  if (!Array.isArray(tree)) return [];

  // 0 (and NaN, from an unparseable `sinceISO`) both mean "no cutoff".
  const cutoff = sinceISO ? new Date(sinceISO).getTime() - CLOCK_SKEW_MS : 0;
  const files = tree.filter(it => {
    // Skip anything that is not a well-formed JSON file entry. A null item
    // or one without a string `path` is dropped rather than allowed to throw
    // and take the whole listing down with it.
    if (it?.type !== 'file' || typeof it.path !== 'string') return false;
    if (!it.path.endsWith('.json')) return false;
    if (!cutoff) return true;
    // NOTE(review): confirm the tree endpoint includes `lastCommit` without
    // extra query parameters (e.g. `expand`); if it does not, every file
    // takes the "no timestamp" path below and the cutoff never filters.
    const t = it.lastCommit?.date ? new Date(it.lastCommit.date).getTime() : NaN;
    // Files with no commit timestamp pass through — better to over-include
    // than miss the user's own freshly-pushed submission.
    return Number.isNaN(t) || t > cutoff;
  });

  return files.slice(0, MAX_FETCH);
}

/* Download one file from the dataset's `main` revision and parse it as
   JSON.

   datasetRepo  dataset repo id as it appears in the URL path.
   filePath     repo-relative path of the file, '/'-separated.

   Resolves to whatever the file's JSON decodes to (no shape check here).
   Rejects on a non-2xx status with `fetch <filePath>: <status>`; network
   and JSON-parse errors propagate unchanged. */
async function fetchRunFile(datasetRepo, filePath) {
  // Encode each segment on its own so the '/' separators survive while
  // characters such as '#', '?' or '%' inside a name are sent as part of
  // the path instead of being parsed as fragment/query/escape syntax.
  const encodedPath = String(filePath)
    .split('/')
    .map(segment => encodeURIComponent(segment))
    .join('/');
  const url = `${HF}/datasets/${datasetRepo}/resolve/main/${encodedPath}`;
  const resp = await fetch(url, { cache: 'no-store' });
  if (!resp.ok) throw new Error(`fetch ${filePath}: ${resp.status}`);
  return resp.json();
}

/* Enumerate the dataset's `runs/` tree with no recency cutoff, then
   download and aggregate the listed files. The listing step truncates its
   result to MAX_FETCH entries, so "all" is bounded by that cap. Resolves to
   whatever fetchRunsBatch resolves to; rejections from either step
   propagate. Caller is responsible for rate-limiting/caching. */
export async function fetchAllRuns(datasetRepo) {
  // A null `sinceISO` asks the listing helper for every run file.
  const runFiles = await listRecentRunFiles(datasetRepo, null);
  return fetchRunsBatch(datasetRepo, runFiles);
}

/* Download every entry in `files` in parallel and fold the results into
   dashboard rows plus one summary object per machine.

   datasetRepo  dataset repo id, forwarded to fetchRunFile.
   files        tree entries; only `.path` is read.

   Resolves to { records, machines, fileCount }:
     records    one flattened row per usable record, in file order.
     machines   one entry per distinct slug, built from the first record
                seen for that slug that carries a `machine` object.
     fileCount  number of files requested (not the number that succeeded).

   Best-effort by design: a file that fails to download, a payload that is
   not an array, and an array element that is not an object are all skipped
   without failing the batch. */
async function fetchRunsBatch(datasetRepo, files) {
  if (files.length === 0) return { records: [], machines: [], fileCount: 0 };

  const records = [];
  const machinesBySlug = new Map();
  // Most-recent userReported.machineName per slug — the same machine can be
  // submitted by multiple people who'd label it differently.
  const userNameBySlug = new Map(); // slug → { name, ts }

  // Fetch in parallel — HF's CDN handles concurrent reads fine.
  const results = await Promise.allSettled(
    files.map(f => fetchRunFile(datasetRepo, f.path)),
  );

  for (const res of results) {
    if (res.status !== 'fulfilled' || !Array.isArray(res.value)) continue;
    for (const r of res.value) {
      // A stray null/primitive inside an otherwise valid file must not
      // reject the whole batch; treat it like any other unusable input.
      if (r === null || typeof r !== 'object') continue;

      const slug = generateSlug(r.machine);
      records.push(flattenForDashboard(r, slug));
      if (!machinesBySlug.has(slug) && r.machine) {
        machinesBySlug.set(slug, {
          slug,
          cpus: r.machine.cpus || 'unknown',
          platform: r.machine.platform || 'unknown',
          arch: r.machine.arch || 'unknown',
          totalMemoryGB: r.machine.totalMemoryGB || 0,
          submittedAt: r.timestamp || new Date().toISOString(),
          // Per-machine resultCount/passCount get computed by the caller
          // after the merge — leaving them as 0 here is a placeholder.
          resultCount: 0,
          passCount: 0,
          userMachineName: null,
          llamaCppCommit: r.llamaCppCommit ?? null,
          llamaCppDescribe: r.llamaCppDescribe ?? null,
        });
      }

      // Only string labels are considered; anything else is ignored rather
      // than crashing on `.trim()`.
      const rawName = r.userReported?.machineName;
      const userName = typeof rawName === 'string' ? rawName.trim() : '';
      if (userName) {
        // Timestamps are compared as strings; a missing one sorts first.
        const ts = r.timestamp || '';
        const cur = userNameBySlug.get(slug);
        if (!cur || ts > cur.ts) userNameBySlug.set(slug, { name: userName, ts });
      }
    }
  }

  // Attach the winning user-supplied label to each machine summary.
  for (const [slug, { name }] of userNameBySlug) {
    const m = machinesBySlug.get(slug);
    if (m) m.userMachineName = name;
  }

  return { records, machines: [...machinesBySlug.values()], fileCount: files.length };
}

/* Flatten a raw dataset record into the same shape `scripts/build-site.js`
   produces. Keep field-for-field aligned with build-site.js so the merged
   results are indistinguishable from the baseline.

   r     raw record object (must be non-null; nested parts may be absent).
   slug  machine slug to stamp on the row as `machineSlug`.

   Returns a flat object. Most optional fields fall back to null, `n_depth`
   falls back to 0, and the plain pass-through fields (timestamp, browser,
   model, …) are copied as-is and may therefore be undefined.

   Malformed optional parts — a null entry or non-string `name` inside
   metrics.tests, a non-string userReported.machineName — are treated as
   absent instead of throwing. */
function flattenForDashboard(r, slug) {
  // New-format records have metrics.tests = [{name:'pp512',...},{name:'tg128',...}].
  // Old-format records have flat metrics.prefill_tok_s / decode_tok_s only.
  // Surface both shapes so the table can render llama-bench-style "avg ± stddev"
  // when stddev is available without breaking on older rows.
  const tests = Array.isArray(r.metrics?.tests) ? r.metrics.tests : null;
  // First test whose name is a string starting with `prefix`, else null.
  const firstTestNamed = (prefix) =>
    tests?.find(t => typeof t?.name === 'string' && t.name.startsWith(prefix)) ?? null;
  const pp = firstTestNamed('pp');
  const tg = firstTestNamed('tg');

  const rawMachineName = r.userReported?.machineName;
  const machineName = typeof rawMachineName === 'string' ? rawMachineName.trim() : '';

  return {
    machineSlug: slug,
    timestamp: r.timestamp,
    browser: r.browser,
    model: r.model,
    repo: r.repo,
    variant: r.variant,
    filename: r.filename,
    sizeMB: r.sizeMB,
    status: r.status,
    error: r.error,
    buildType: r.buildType,
    webgpuAvailable: r.webgpuAvailable,
    nGpuLayers: r.nGpuLayers ?? null,
    wallTimeMs: r.wallTimeMs,
    prefill_tok_s: r.metrics?.prefill_tok_s ?? null,
    decode_tok_s: r.metrics?.decode_tok_s ?? null,
    // llama-bench shape: per-test stddev + the test labels (pp{N} / tg{N})
    prefill_stddev_ts: pp?.stddev_ts ?? r.metrics?.prefill_tok_s_stdev ?? null,
    decode_stddev_ts:  tg?.stddev_ts ?? r.metrics?.decode_tok_s_stdev  ?? null,
    pp_test_name: pp?.name ?? null,
    tg_test_name: tg?.name ?? null,
    pp_n_prompt: pp?.n_prompt ?? r.nPrompt ?? null,
    tg_n_gen:    tg?.n_gen    ?? r.nGen    ?? null,
    // KV-cache depth the timed reps ran at. Mirrors llama-bench's `-d` and
    // is per-test in metrics.tests; record-level r.nDepth is the
    // study/runner-set value, used as a fallback for older exports.
    n_depth: pp?.n_depth ?? tg?.n_depth ?? r.nDepth ?? 0,
    n_p_eval: r.metrics?.n_p_eval ?? null,
    t_p_eval_ms: r.metrics?.t_p_eval_ms ?? null,
    n_eval: r.metrics?.n_eval ?? null,
    t_eval_ms: r.metrics?.t_eval_ms ?? null,
    consistency_rate: r.consistency?.agreement_rate ?? null,
    consistency_first_disagree: r.consistency?.first_disagreement ?? null,
    // Keep these in sync with scripts/build-site.js — the dashboard merges
    // baseline (combined.json) and live (here) records into one table.
    cpu_baseline_prefill_tok_s: r.cpu_baseline?.prefill_tok_s ?? null,
    cpu_baseline_decode_tok_s: r.cpu_baseline?.decode_tok_s ?? null,
    llamaCppCommit: r.llamaCppCommit ?? null,
    llamaCppDescribe: r.llamaCppDescribe ?? null,
    dawnTag: r.dawnTag ?? null,
    submittedBy: r.submittedBy ?? null,
    // Blank or whitespace-only labels collapse to null.
    userMachineName: machineName || null,
    iterations: r.metrics?.iterations ?? null,
  };
}

// Mirror of scripts/_hub.mjs:generateSlug — keep in sync.
// Returns the machine's own `slug` when it has a truthy one; otherwise
// builds "<slugified cpus>-<totalMemoryGB>gb-<platform>", substituting
// 'unknown' / 0 / 'unknown' for missing or falsy parts (including when
// `machine` itself is null or undefined).
function generateSlug(machine) {
  const { slug, cpus, totalMemoryGB, platform } = machine ?? {};
  if (slug) return slug;

  const cpuPart = slugify(cpus || 'unknown');
  const memoryGB = totalMemoryGB || 0;
  const platformPart = platform || 'unknown';
  return `${cpuPart}-${memoryGB}gb-${platformPart}`;
}

// Lower-cases the string form of `s`, collapses every run of characters
// outside a-z / 0-9 into a single '-', then drops one leading and one
// trailing '-' if present.
function slugify(s) {
  const lowered = String(s).toLowerCase();
  const dashed = lowered.replace(/[^a-z0-9]+/g, '-');
  return dashed.replace(/^-|-$/g, '');
}