Spaces:
Running
Running
| // Precompute per-evaluation multi-metric and per-slice score matrices and | |
| // emit them as one JSON map at data/eval-matrices.json. | |
| // | |
| // The runtime eval-summary endpoint currently only returns one | |
| // (metric, model) row per model — sourced from the primary metric of | |
| // `eval_results_view`. The other declared `leaderboard_metrics` (and | |
| // per-slice subtask scores hiding inside `fact_results`) are dropped on | |
| // the floor, which is why the eval page can't render a multi-metric | |
| // leaderboard or a slice dropdown. | |
| // | |
| // Both views are reconstructable from the warehouse parquet files; this | |
| // script does the join once at build time so the runtime path is a flat | |
| // O(1) lookup against the precomputed map. | |
| // | |
| // Output schema: | |
| // { | |
| // snapshot_id, // pinned for cache busting | |
| // generated_at, | |
| // evals: { | |
| // "<evaluation_id>": { | |
| // // Per-(model, metric) values across the eval's full | |
| // // leaderboard_metrics list. Drives the multi-metric matrix. | |
| // leaderboard_rows: [ | |
| // { model_route_id, values: { "<column_key>": score | null } } | |
| // ], | |
| // // Subtask-scope metric entries to *append* to the eval's | |
| // // leaderboard_metrics. Each carries column_key | |
| // // "<metric_id>::<slice_key>" so it slots into values{} above. | |
| // subtask_metrics: [BenchmarkLeaderboardMetric] | |
| // } | |
| // } | |
| // } | |
| // | |
| // Run via the build chain (`pnpm build`) or standalone: | |
| // node scripts/build-eval-matrices.mjs | |
| import { DuckDBInstance } from "@duckdb/node-api" | |
| import fs from "node:fs/promises" | |
| import path from "node:path" | |
| import { fileURLToPath } from "node:url" | |
// Values handed back by DuckDB-node arrive as wrapper class instances
// (lists, structs, maps, decimals, ...) rather than plain JSON. This walks a
// value recursively and unwraps it so downstream code can treat the result
// as ordinary numbers, strings, arrays and objects.
//
// Conversions, in the order they are tried:
//   - null / undefined          -> returned untouched
//   - bigint                    -> Number (may lose precision beyond 2^53)
//   - arrays                    -> element-wise normalised copy
//   - DuckDBListValue / Array   -> normalised copy of `.items` ([] if absent)
//   - DuckDBStructValue         -> normalised `.entries`
//   - DuckDBMapValue            -> plain object keyed by String(entry.key)
//   - DuckDBDecimalValue        -> Number(value.toString())
//   - any other DuckDB* wrapper -> value.toString()
//   - any other object          -> shallow-enumerated, values normalised
//   - everything else           -> returned untouched
function normalizeDuck(value) {
  if (value == null) return value
  if (typeof value === "bigint") return Number(value)
  if (Array.isArray(value)) return value.map(normalizeDuck)
  if (typeof value !== "object") return value

  const typeName = value.constructor?.name ?? ""
  // Evaluated lazily so wrappers handled earlier never touch `.toString`.
  const canStringify = () => typeof value.toString === "function"

  switch (typeName) {
    case "DuckDBListValue":
    case "DuckDBArrayValue":
      return (value.items ?? []).map(normalizeDuck)
    case "DuckDBStructValue":
      return normalizeDuck(value.entries)
    case "DuckDBMapValue": {
      // A map whose entries are not an array falls through to the generic
      // DuckDB* stringification below.
      if (!Array.isArray(value.entries)) break
      const mapped = {}
      value.entries.forEach((entry) => {
        mapped[String(entry.key)] = normalizeDuck(entry.value)
      })
      return mapped
    }
    case "DuckDBDecimalValue":
      if (canStringify()) return Number(value.toString())
      break
    default:
      break
  }

  // Remaining DuckDB wrappers are represented by their string form.
  if (typeName.startsWith("DuckDB") && canStringify()) {
    return value.toString()
  }

  // Plain (or unknown) object: rebuild it with normalised property values.
  const plain = {}
  Object.entries(value).forEach(([key, inner]) => {
    plain[key] = normalizeDuck(inner)
  })
  return plain
}
/**
 * Materialise a DuckDB result reader into plain JSON-friendly row objects.
 *
 * Each row object from `reader.getRowObjects()` is passed through
 * `normalizeDuck` exactly once; a second pass would only re-copy values that
 * are already plain, so it is not performed.
 *
 * @param {{ getRowObjects: () => object[] }} reader - result reader exposing
 *   `getRowObjects()`.
 * @returns {object[]} one normalised object per result row.
 */
function readDuckRows(reader) {
  // Wrap the callback so only the row is forwarded, not map's (index, array).
  return reader.getRowObjects().map((row) => normalizeDuck(row))
}
// Directory containing this script; the repository root is its parent.
const SCRIPT_DIR = path.dirname(fileURLToPath(import.meta.url))
const ROOT = path.resolve(SCRIPT_DIR, "..")
// Local cache location of the warehouse parquet files.
const WAREHOUSE = path.join(ROOT, ".cache/hf-data/warehouse/latest")
// Destination of the generated JSON map.
const OUT_PATH = path.join(ROOT, "data/eval-matrices.json")
/**
 * Decide whether the warehouse parquet files are read from the local cache
 * or from SNAPSHOT_URL.
 *
 * The local cache wins whenever `eval_results_view.parquet` is accessible
 * under WAREHOUSE. Otherwise the SNAPSHOT_URL environment variable (with
 * trailing slashes stripped) becomes the base. When neither is available an
 * error is logged and the process exits with status 1.
 *
 * @returns {Promise<{ base: string, useRemote: boolean }>}
 */
async function resolveWarehouseSource() {
  const snapshotUrl = process.env.SNAPSHOT_URL?.replace(/\/+$/, "")
  try {
    await fs.access(path.join(WAREHOUSE, "eval_results_view.parquet"))
    return { base: WAREHOUSE, useRemote: false }
  } catch {
    if (!snapshotUrl) {
      console.error(
        "[build-eval-matrices] no local warehouse cache and no SNAPSHOT_URL — abort.",
      )
      process.exit(1)
    }
    return { base: snapshotUrl, useRemote: true }
  }
}

/**
 * Read `snapshot_id` from `snapshot_meta.json` so the output can be matched
 * against the pinned build snapshot. Best-effort: any failure (missing file,
 * non-2xx response, malformed JSON) is logged as a warning and yields
 * "unknown".
 *
 * @param {string} base - local directory or remote URL prefix.
 * @param {boolean} useRemote - true when `base` is a URL to fetch from.
 * @returns {Promise<string>}
 */
async function readSnapshotId(base, useRemote) {
  try {
    let metaText
    if (useRemote) {
      const response = await fetch(`${base}/snapshot_meta.json`)
      // Surface the HTTP status instead of a confusing JSON parse error on
      // an error page body.
      if (!response.ok) {
        throw new Error(`HTTP ${response.status} fetching snapshot_meta.json`)
      }
      metaText = await response.text()
    } else {
      metaText = await fs.readFile(path.join(base, "snapshot_meta.json"), "utf8")
    }
    return JSON.parse(metaText).snapshot_id ?? "unknown"
  } catch (err) {
    console.warn(
      `[build-eval-matrices] couldn't resolve snapshot_id: ${err instanceof Error ? err.message : String(err)}`,
    )
    return "unknown"
  }
}

/**
 * Run the four warehouse queries and return their rows as plain objects.
 * The connection is released in a `finally` so a failing query does not
 * leave it open.
 *
 * @param {(name: string) => string} fileRef - maps a parquet file name to a
 *   single-quoted SQL string literal pointing at that file.
 * @returns {Promise<{ metricRows: object[], sliceRows: object[],
 *   evalRows: object[], modelKeyRows: object[] }>}
 */
async function queryWarehouseRows(fileRef) {
  const db = await DuckDBInstance.create()
  const con = await db.connect()
  try {
    // 1. All (eval, model, metric, score) rows. Includes non-primary
    //    metrics that getEvalSummaryById currently filters out.
    const metricReader = await con.runAndReadAll(`
      SELECT
      r.evaluation_id,
      r.metric_id,
      r.model_route_id,
      r.score
      FROM read_parquet(${fileRef("eval_results_view.parquet")}) r
      WHERE r.score IS NOT NULL
      AND r.model_route_id IS NOT NULL
    `)
    // 2. Per-slice (composite_slug, benchmark, model, metric, slice_key,
    //    score) rows. The upstream pipeline parks slice scores in
    //    fact_results rather than threading them through eval_results_view,
    //    so we have to reach in here. We carry composite_slug because some
    //    benchmarks (e.g. `gpqa`) appear under multiple composites and
    //    fact_results emits a per-source pseudo-slice (slice_key =
    //    "artificial analysis", "llm stats", "openeval gpqa", ...) for
    //    each source family. Joining slices on (composite_slug,
    //    benchmark_id) keeps each composite's slices in its own lane,
    //    so HF Open LLM v2's GPQA doesn't inherit Artificial Analysis's
    //    pseudo-slice, etc. Self-rollups are dropped since they duplicate
    //    the eval's overall score. AVG collapses the rare duplicate
    //    (model, slice) pairs.
    const sliceReader = await con.runAndReadAll(`
      SELECT
      f.composite_slug,
      f.benchmark_id,
      f.parent_benchmark_id,
      f.metric_id,
      f.slice_key,
      f.slice_name,
      f.model_id,
      AVG(f.score) AS score
      FROM read_parquet(${fileRef("fact_results.parquet")}) f
      WHERE f.score IS NOT NULL
      AND f.slice_key IS NOT NULL
      AND f.metric_id IS NOT NULL
      AND f.composite_slug IS NOT NULL
      -- Drop any slice that's a self-rollup of the eval — slice_key
      -- equals the benchmark, the composite, or the parent benchmark
      -- after normalising separators (so "global mmlu lite" filters
      -- against benchmark_id "global-mmlu-lite", "fibble_arena"
      -- against "fibble-arena", "artificial analysis" against
      -- composite "artificial-analysis-llms", etc.).
      AND regexp_replace(lower(f.slice_key), '[^a-z0-9]+', '', 'g')
      != regexp_replace(lower(f.benchmark_id), '[^a-z0-9]+', '', 'g')
      AND regexp_replace(lower(f.slice_key), '[^a-z0-9]+', '', 'g')
      != regexp_replace(lower(f.composite_slug), '[^a-z0-9]+', '', 'g')
      -- Also drop slices whose slug is a strict prefix of the
      -- composite_slug (e.g. "artificial analysis" vs
      -- composite "artificial-analysis-llms" — the slice is just
      -- the source family naming itself, not a real subtask).
      AND NOT regexp_replace(lower(f.composite_slug), '[^a-z0-9]+', '', 'g')
      LIKE regexp_replace(lower(f.slice_key), '[^a-z0-9]+', '', 'g') || '%'
      AND (
      f.parent_benchmark_id IS NULL
      OR regexp_replace(lower(f.slice_key), '[^a-z0-9]+', '', 'g')
      != regexp_replace(lower(f.parent_benchmark_id), '[^a-z0-9]+', '', 'g')
      )
      GROUP BY 1,2,3,4,5,6,7
    `)
    // 3. eval → (composite_slug, benchmark_id) mapping so slice rows can be
    //    joined back to the right evaluation_id. composite_slug is what
    //    disambiguates HF Open LLM v2's GPQA from Artificial Analysis's
    //    GPQA — both share benchmark_id `gpqa`. leaderboard_metrics is
    //    pulled so each metric's metric_summary_id / unit / lower_is_better
    //    is known when synthesising subtask-scope entries.
    const evalReader = await con.runAndReadAll(`
      SELECT
      evaluation_id,
      benchmark_id,
      parent_benchmark_id,
      composite_slug,
      leaderboard_metrics
      FROM read_parquet(${fileRef("evals_view.parquet")})
    `)
    // 4. Map model_id → model_route_id so per-slice rows (which carry
    //    model_id) can land alongside per-metric rows (model_route_id).
    const modelKeyReader = await con.runAndReadAll(`
      SELECT DISTINCT model_id, model_route_id
      FROM read_parquet(${fileRef("eval_results_view.parquet")})
      WHERE model_route_id IS NOT NULL
    `)
    return {
      metricRows: readDuckRows(metricReader),
      sliceRows: readDuckRows(sliceReader),
      evalRows: readDuckRows(evalReader),
      modelKeyRows: readDuckRows(modelKeyReader),
    }
  } finally {
    // Synchronous API — nothing to await.
    con.disconnectSync()
  }
}

/**
 * Combine the query results into the per-evaluation output map. Pure: it
 * only reads its arguments.
 *
 * @param {object} rows
 * @param {object[]} rows.metricRows - (evaluation_id, metric_id,
 *   model_route_id, score) rows.
 * @param {object[]} rows.sliceRows - per-slice score rows keyed by
 *   composite_slug / benchmark_id / model_id.
 * @param {object[]} rows.evalRows - evaluation registry rows.
 * @param {object[]} rows.modelKeyRows - (model_id, model_route_id) pairs.
 * @returns {Record<string, { leaderboard_rows: object[], subtask_metrics: object[] }>}
 */
function assembleEvalMatrices({ metricRows, sliceRows, evalRows, modelKeyRows }) {
  // model_id → route_id; the first route seen for a model wins.
  const modelIdToRoute = new Map()
  for (const row of modelKeyRows) {
    if (!modelIdToRoute.has(row.model_id)) {
      modelIdToRoute.set(row.model_id, row.model_route_id)
    }
  }

  // Index evals by id and by (composite_slug, benchmark_id). Two evals can
  // share a benchmark_id across composites, so the composite component is
  // what keeps them separated in the slice join.
  const evalsById = new Map()
  const evalsByCompositeBench = new Map()
  const compositeBenchKey = (composite, bench) =>
    `${composite ?? ""}|${bench ?? ""}`
  for (const row of evalRows) {
    evalsById.set(row.evaluation_id, row)
    const bid = row.benchmark_id ?? null
    const composite = row.composite_slug ?? null
    if (bid && composite) {
      const key = compositeBenchKey(composite, bid)
      if (!evalsByCompositeBench.has(key)) evalsByCompositeBench.set(key, [])
      evalsByCompositeBench.get(key).push(row.evaluation_id)
    }
  }

  // Accumulators per evaluation. A Map (keyed by the stringified id, which
  // is how the ids end up as JSON keys anyway) avoids colliding with
  // inherited Object members such as "constructor".
  const buckets = new Map()
  const ensureEval = (evalId) => {
    const key = String(evalId)
    let bucket = buckets.get(key)
    if (!bucket) {
      bucket = {
        leaderboard_rows: new Map(), // route_id → values
        subtask_metric_keys: new Set(), // subtask columns already emitted
        subtask_metrics: [],
      }
      buckets.set(key, bucket)
    }
    return bucket
  }
  const ensureModelEntry = (bucket, route) => {
    let entry = bucket.leaderboard_rows.get(route)
    if (!entry) {
      entry = {}
      bucket.leaderboard_rows.set(route, entry)
    }
    return entry
  }

  // Root metrics: values[metric_id] = score.
  for (const row of metricRows) {
    const bucket = ensureEval(row.evaluation_id)
    ensureModelEntry(bucket, row.model_route_id)[row.metric_id] = Number(row.score)
  }

  // Slice scores. Each (metric_id, slice_key) becomes a column keyed
  // "<metric_id>::<slice_key>" so it slots into values{} alongside root
  // metrics; matching metadata goes into subtask_metrics for the runtime to
  // splice into the eval's leaderboard_metrics array.
  for (const row of sliceRows) {
    const evalIds = evalsByCompositeBench.get(
      compositeBenchKey(row.composite_slug, row.benchmark_id),
    )
    if (!evalIds) continue
    const route = modelIdToRoute.get(row.model_id)
    if (!route) continue
    const sliceKey = String(row.slice_key)
    const sliceName = row.slice_name ? String(row.slice_name) : sliceKey
    const metricId = String(row.metric_id)
    const columnKey = `${metricId}::${sliceKey}`
    const score = Number(row.score)
    if (!Number.isFinite(score)) continue
    for (const evalId of evalIds) {
      const bucket = ensureEval(evalId)
      ensureModelEntry(bucket, route)[columnKey] = score
      if (bucket.subtask_metric_keys.has(columnKey)) continue
      bucket.subtask_metric_keys.add(columnKey)
      // Inherit unit / lower_is_better etc. from the parent eval's metric
      // when it is declared; fall back to defaults otherwise. Anything that
      // is not an array of metric objects is treated as "not declared".
      const declared = evalsById.get(evalId)?.leaderboard_metrics
      const rootMetric = Array.isArray(declared)
        ? declared.find((m) => m?.metric_id === metricId)
        : undefined
      bucket.subtask_metrics.push({
        column_key: columnKey,
        metric_summary_id: rootMetric?.metric_summary_id ?? `${evalId}%3A${metricId}`,
        metric_id: metricId,
        metric_name: rootMetric?.metric_name ?? metricId,
        display_name: rootMetric?.display_name ?? metricId,
        canonical_display_name: rootMetric?.canonical_display_name ?? null,
        lower_is_better: rootMetric?.lower_is_better ?? false,
        unit: rootMetric?.unit ?? null,
        scope: "subtask",
        subtask_key: sliceKey,
        subtask_name: sliceName,
      })
    }
  }

  // Materialise Maps into JSON-friendly arrays. Evals where every model has
  // at most one value and there is no subtask data are skipped — they add
  // nothing beyond the existing single-metric summary.
  const finalEvals = {}
  for (const [evalId, bucket] of buckets) {
    const rows = Array.from(bucket.leaderboard_rows, ([routeId, values]) => ({
      model_route_id: routeId,
      values,
    }))
    const hasMultiMetric = rows.some((r) => Object.keys(r.values).length > 1)
    if (!hasMultiMetric && bucket.subtask_metrics.length === 0) continue
    finalEvals[evalId] = {
      leaderboard_rows: rows,
      subtask_metrics: bucket.subtask_metrics,
    }
  }
  return finalEvals
}

/**
 * Build `data/eval-matrices.json`: resolve the warehouse location, query the
 * parquet files through DuckDB, assemble the per-evaluation matrices and
 * write them (with snapshot_id and generated_at) to OUT_PATH.
 *
 * Exits the process with status 1 when no warehouse source is available;
 * other failures reject the returned promise.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const { base, useRemote } = await resolveWarehouseSource()
  const t0 = Date.now()
  // Render a parquet file name as a SQL string literal (quotes doubled).
  const fileRef = (name) => {
    const url = useRemote ? `${base}/${name}` : path.join(base, name)
    return `'${url.replace(/'/g, "''")}'`
  }
  const snapshotId = await readSnapshotId(base, useRemote)
  const warehouseRows = await queryWarehouseRows(fileRef)
  const finalEvals = assembleEvalMatrices(warehouseRows)
  const payload = {
    snapshot_id: snapshotId,
    generated_at: new Date().toISOString(),
    evals: finalEvals,
  }
  // Serialise once; the same string is written and measured.
  const serialized = JSON.stringify(payload)
  await fs.mkdir(path.dirname(OUT_PATH), { recursive: true })
  await fs.writeFile(OUT_PATH, serialized)
  const sizeMb = (Buffer.byteLength(serialized, "utf8") / 1024 / 1024).toFixed(2)
  console.log(
    `[build-eval-matrices] wrote ${Object.keys(finalEvals).length} evals to ${path.relative(ROOT, OUT_PATH)} (${sizeMb} MB) in ${Date.now() - t0}ms`,
  )
}
// Script entry point: run the build and turn any rejection into a logged
// error plus a non-zero exit status.
const reportFailureAndExit = (failure) => {
  console.error("[build-eval-matrices] failed:", failure)
  process.exit(1)
}

main().catch(reportFailureAndExit)