#!/usr/bin/env node
// Precompute per-evaluation multi-metric and per-slice score matrices and
// emit them as one JSON map at data/eval-matrices.json.
//
// The runtime eval-summary endpoint currently only returns one
// (metric, model) row per model — sourced from the primary metric of
// `eval_results_view`. The other declared `leaderboard_metrics` (and
// per-slice subtask scores hiding inside `fact_results`) are dropped on
// the floor, which is why the eval page can't render a multi-metric
// leaderboard or a slice dropdown.
//
// Both views are reconstructable from the warehouse parquet files; this
// script does the join once at build time so the runtime path is a flat
// O(1) lookup against the precomputed map.
//
// Output schema:
//   {
//     snapshot_id,   // pinned for cache busting
//     generated_at,
//     evals: {
//       "<evaluation_id>": {
//         // Per-(model, metric) values across the eval's full
//         // leaderboard_metrics list. Drives the multi-metric matrix.
//         // A column key is either a root metric_id or a subtask key
//         // of the form "<metric_id>::<slice_key>".
//         leaderboard_rows: [
//           { model_route_id, values: { "<column_key>": score | null } }
//         ],
//         // Subtask-scope metric entries to *append* to the eval's
//         // leaderboard_metrics. Each carries column_key
//         // "<metric_id>::<slice_key>" so it slots into values{} above.
//         subtask_metrics: [BenchmarkLeaderboardMetric]
//       }
//     }
//   }
//
// Run via the build chain (`pnpm build`) or standalone:
//   node scripts/build-eval-matrices.mjs

import { DuckDBInstance } from "@duckdb/node-api"
import fs from "node:fs/promises"
import path from "node:path"
import { fileURLToPath } from "node:url"

/**
 * Recursively convert a DuckDB-node result value into plain JSON-friendly data.
 *
 * DuckDB-node returns its list / struct / map values as opaque wrapper
 * classes whose payload lives behind `.items` / `.entries`. Walk through
 * these so downstream code can treat the result as plain JSON.
 *
 * Wrapper classes are recognised by constructor name:
 *  - DuckDBListValue / DuckDBArrayValue → array of normalised items
 *  - DuckDBStructValue                  → normalised `.entries`
 *  - DuckDBMapValue                     → plain object keyed by String(key)
 *  - DuckDBDecimalValue                 → Number parsed from toString()
 *  - any other `DuckDB*` class          → its toString() representation
 *
 * @param {unknown} value - Raw value from a DuckDB row.
 * @returns {unknown} null/undefined unchanged, bigint as Number (may lose
 *   precision beyond Number.MAX_SAFE_INTEGER), arrays and objects deep-copied
 *   with normalised members, other primitives unchanged.
 */
function normalizeDuck(value) {
  if (value == null) return value
  if (typeof value === "bigint") return Number(value)
  if (Array.isArray(value)) return value.map((item) => normalizeDuck(item))
  if (typeof value !== "object") return value

  const ctor = value.constructor?.name ?? ""
  if (ctor === "DuckDBListValue" || ctor === "DuckDBArrayValue") {
    return (value.items ?? []).map((item) => normalizeDuck(item))
  }
  if (ctor === "DuckDBStructValue") {
    return normalizeDuck(value.entries)
  }
  if (ctor === "DuckDBMapValue" && Array.isArray(value.entries)) {
    const out = {}
    for (const e of value.entries) out[String(e.key)] = normalizeDuck(e.value)
    return out
  }
  if (ctor === "DuckDBDecimalValue" && typeof value.toString === "function") {
    return Number(value.toString())
  }
  if (ctor.startsWith("DuckDB") && typeof value.toString === "function") {
    return value.toString()
  }
  const out = {}
  for (const [k, v] of Object.entries(value)) out[k] = normalizeDuck(v)
  return out
}

/**
 * Materialise a DuckDB result reader as an array of plain row objects.
 *
 * @param {{ getRowObjects(): object[] }} reader - Result of `runAndReadAll`.
 * @returns {object[]} One normalised plain object per row.
 */
function readDuckRows(reader) {
  // A single normalisation pass is enough: normalizeDuck already recurses
  // into every nested value.
  return reader.getRowObjects().map((row) => normalizeDuck(row))
}

const ROOT = path.resolve(path.dirname(fileURLToPath(import.meta.url)), "..")
const WAREHOUSE = path.join(ROOT, ".cache/hf-data/warehouse/latest")
const OUT_PATH = path.join(ROOT, "data/eval-matrices.json")

/**
 * Read snapshot_id from snapshot_meta.json so the output can be matched
 * against the pinned build snapshot. Best-effort: any failure is logged as
 * a warning and "unknown" is returned instead of aborting the build.
 *
 * @param {string} base - Local warehouse directory or remote snapshot URL.
 * @param {boolean} useRemote - True when `base` is a URL to fetch from.
 * @returns {Promise<string>} The snapshot id, or "unknown".
 */
async function readSnapshotId(base, useRemote) {
  try {
    let metaText
    if (useRemote) {
      const res = await fetch(`${base}/snapshot_meta.json`)
      // Surface the HTTP status rather than a confusing JSON parse error
      // on an HTML/text error body.
      if (!res.ok) throw new Error(`HTTP ${res.status} fetching snapshot_meta.json`)
      metaText = await res.text()
    } else {
      metaText = await fs.readFile(path.join(base, "snapshot_meta.json"), "utf8")
    }
    return JSON.parse(metaText).snapshot_id ?? "unknown"
  } catch (err) {
    console.warn(
      `[build-eval-matrices] couldn't resolve snapshot_id: ${err instanceof Error ? err.message : String(err)}`,
    )
    return "unknown"
  }
}

/**
 * Run the four warehouse queries and return their rows as plain objects.
 * The connection is always released, even when a query throws.
 *
 * @param {DuckDBInstance} db - Open DuckDB instance.
 * @param {(name: string) => string} fileRef - Maps a parquet file name to a
 *   single-quoted SQL string literal pointing at the local or remote file.
 * @returns {Promise<{ metricRows: object[], sliceRows: object[], evalRows: object[], modelKeyRows: object[] }>}
 */
async function queryWarehouse(db, fileRef) {
  const con = await db.connect()
  try {
    // 1. All (eval, model, metric, score) rows. Includes non-primary
    //    metrics that getEvalSummaryById currently filters out.
    const metricRows = readDuckRows(
      await con.runAndReadAll(`
        SELECT
          r.evaluation_id,
          r.metric_id,
          r.model_route_id,
          r.score
        FROM read_parquet(${fileRef("eval_results_view.parquet")}) r
        WHERE r.score IS NOT NULL
          AND r.model_route_id IS NOT NULL
      `),
    )

    // 2. Per-slice (composite_slug, benchmark, model, metric, slice_key,
    //    score) rows. The upstream pipeline parks slice scores in
    //    fact_results rather than threading them through eval_results_view,
    //    so we have to reach in here. We carry composite_slug because some
    //    benchmarks (e.g. `gpqa`) appear under multiple composites and
    //    fact_results emits a per-source pseudo-slice (slice_key =
    //    "artificial analysis", "llm stats", "openeval gpqa", ...) for
    //    each source family. Joining slices on (composite_slug,
    //    benchmark_id) keeps each composite's slices in its own lane,
    //    so HF Open LLM v2's GPQA doesn't inherit Artificial Analysis's
    //    pseudo-slice, etc. Also drop the self-rollup (slice_key ==
    //    benchmark_id) since that duplicates the eval's overall score.
    //    AVG collapses the rare duplicate (model, slice) pairs.
    const sliceRows = readDuckRows(
      await con.runAndReadAll(`
        SELECT
          f.composite_slug,
          f.benchmark_id,
          f.parent_benchmark_id,
          f.metric_id,
          f.slice_key,
          f.slice_name,
          f.model_id,
          AVG(f.score) AS score
        FROM read_parquet(${fileRef("fact_results.parquet")}) f
        WHERE f.score IS NOT NULL
          AND f.slice_key IS NOT NULL
          AND f.metric_id IS NOT NULL
          AND f.composite_slug IS NOT NULL
          -- Drop any slice that's a self-rollup of the eval — slice_key
          -- equals the benchmark, the composite, or the parent benchmark
          -- after normalising separators (so "global mmlu lite" filters
          -- against benchmark_id "global-mmlu-lite", "fibble_arena"
          -- against "fibble-arena", "artificial analysis" against
          -- composite "artificial-analysis-llms", etc.).
          AND regexp_replace(lower(f.slice_key), '[^a-z0-9]+', '', 'g')
              != regexp_replace(lower(f.benchmark_id), '[^a-z0-9]+', '', 'g')
          AND regexp_replace(lower(f.slice_key), '[^a-z0-9]+', '', 'g')
              != regexp_replace(lower(f.composite_slug), '[^a-z0-9]+', '', 'g')
          -- Also drop slices whose slug is a strict prefix of the
          -- composite_slug (e.g. "artificial analysis" vs
          -- composite "artificial-analysis-llms" — the slice is just
          -- the source family naming itself, not a real subtask).
          AND NOT regexp_replace(lower(f.composite_slug), '[^a-z0-9]+', '', 'g')
                  LIKE regexp_replace(lower(f.slice_key), '[^a-z0-9]+', '', 'g') || '%'
          AND (
            f.parent_benchmark_id IS NULL
            OR regexp_replace(lower(f.slice_key), '[^a-z0-9]+', '', 'g')
               != regexp_replace(lower(f.parent_benchmark_id), '[^a-z0-9]+', '', 'g')
          )
        GROUP BY 1,2,3,4,5,6,7
      `),
    )

    // 3. eval → (composite_slug, benchmark_id) mapping so we can join
    //    slice rows back to the right evaluation_id. composite_slug is
    //    what disambiguates HF Open LLM v2's GPQA from Artificial
    //    Analysis's GPQA — both share benchmark_id `gpqa`. Also pull
    //    leaderboard_metrics so we know each metric's metric_summary_id /
    //    unit / lower_is_better when synthesising subtask-scope entries.
    const evalRows = readDuckRows(
      await con.runAndReadAll(`
        SELECT
          evaluation_id,
          benchmark_id,
          parent_benchmark_id,
          composite_slug,
          leaderboard_metrics
        FROM read_parquet(${fileRef("evals_view.parquet")})
      `),
    )

    // 4. Map model_id → model_route_id so per-slice rows (which carry
    //    model_id) can land alongside per-metric rows (model_route_id).
    const modelKeyRows = readDuckRows(
      await con.runAndReadAll(`
        SELECT DISTINCT model_id, model_route_id
        FROM read_parquet(${fileRef("eval_results_view.parquet")})
        WHERE model_route_id IS NOT NULL
      `),
    )

    return { metricRows, sliceRows, evalRows, modelKeyRows }
  } finally {
    // disconnectSync is synchronous; running it in `finally` releases the
    // connection even if one of the queries above throws.
    con.disconnectSync()
  }
}

/**
 * Join the warehouse rows into the per-evaluation matrix map (pure, no I/O).
 *
 * @param {object} tables
 * @param {object[]} tables.metricRows - (evaluation_id, metric_id, model_route_id, score).
 * @param {object[]} tables.sliceRows - Per-slice rows carrying composite_slug,
 *   benchmark_id, metric_id, slice_key, slice_name, model_id, score.
 * @param {object[]} tables.evalRows - (evaluation_id, benchmark_id,
 *   composite_slug, leaderboard_metrics, ...).
 * @param {object[]} tables.modelKeyRows - (model_id, model_route_id) pairs.
 * @returns {Record<string, { leaderboard_rows: object[], subtask_metrics: object[] }>}
 *   The `evals` section of the output payload.
 */
function buildEvalMatrices({ metricRows, sliceRows, evalRows, modelKeyRows }) {
  // Index the model_id → route_id map so slice lookups are O(1). When a
  // model_id appears with several routes, the first row seen wins.
  const modelIdToRoute = new Map()
  for (const row of modelKeyRows) {
    if (!modelIdToRoute.has(row.model_id)) {
      modelIdToRoute.set(row.model_id, row.model_route_id)
    }
  }

  // Group eval rows by evaluation_id, indexed by (composite_slug,
  // benchmark_id) for the slice join. Two evals can share a benchmark_id
  // across composites (gpqa under both hfopenllm-v2 and
  // artificial-analysis-llms), so the composite_slug component is what
  // keeps them separated.
  const evalsByCompositeBench = new Map()
  const evalsById = new Map()
  const compositeBenchKey = (composite, bench) => `${composite ?? ""}|${bench ?? ""}`
  for (const row of evalRows) {
    evalsById.set(row.evaluation_id, row)
    const bid = row.benchmark_id ?? null
    const composite = row.composite_slug ?? null
    if (bid && composite) {
      const key = compositeBenchKey(composite, bid)
      if (!evalsByCompositeBench.has(key)) evalsByCompositeBench.set(key, [])
      evalsByCompositeBench.get(key).push(row.evaluation_id)
    }
  }

  // Bucket metric rows by evaluation_id and within that by model:
  //   out[evalId].leaderboard_rows.get(modelRoute) = { column_key: score }
  // Null-prototype object so an id such as "constructor" can never collide
  // with an inherited Object.prototype member.
  const out = Object.create(null)
  const ensureEval = (evalId) => {
    if (!out[evalId]) {
      out[evalId] = {
        leaderboard_rows: new Map(), // route_id → values
        subtask_metric_keys: new Set(), // tracks which subtask cols we've seen
        subtask_metrics: [],
      }
    }
    return out[evalId]
  }
  // Get-or-create the values object for one model within an eval bucket.
  const ensureModelValues = (bucket, routeId) => {
    let values = bucket.leaderboard_rows.get(routeId)
    if (!values) {
      values = {}
      bucket.leaderboard_rows.set(routeId, values)
    }
    return values
  }

  for (const row of metricRows) {
    const values = ensureModelValues(ensureEval(row.evaluation_id), row.model_route_id)
    values[row.metric_id] = Number(row.score)
  }

  // Plant slice scores. Each (metric_id, slice_key) becomes a column
  // keyed "<metric_id>::<slice_key>" so it slots into values{} alongside
  // root metrics. The matching subtask leaderboard metric metadata is
  // emitted in subtask_metrics for the runtime to splice into the eval's
  // leaderboard_metrics array.
  for (const row of sliceRows) {
    const evalIds = evalsByCompositeBench.get(
      compositeBenchKey(row.composite_slug, row.benchmark_id),
    )
    if (!evalIds) continue
    const route = modelIdToRoute.get(row.model_id)
    if (!route) continue
    const score = Number(row.score)
    if (!Number.isFinite(score)) continue

    const sliceKey = String(row.slice_key)
    const sliceName = row.slice_name ? String(row.slice_name) : sliceKey
    const metricId = String(row.metric_id)
    const columnKey = `${metricId}::${sliceKey}`

    for (const evalId of evalIds) {
      const bucket = ensureEval(evalId)
      ensureModelValues(bucket, route)[columnKey] = score
      if (bucket.subtask_metric_keys.has(columnKey)) continue
      bucket.subtask_metric_keys.add(columnKey)

      // Look up the parent eval's metric metadata so the subtask-scope
      // entry inherits unit / lower_is_better. Fall back to defaults
      // when the registry doesn't carry the metric (or when
      // leaderboard_metrics didn't come through as an array).
      const declared = evalsById.get(evalId)?.leaderboard_metrics
      const rootMetric = Array.isArray(declared)
        ? declared.find((m) => m?.metric_id === metricId)
        : undefined
      bucket.subtask_metrics.push({
        column_key: columnKey,
        metric_summary_id: rootMetric?.metric_summary_id ?? `${evalId}%3A${metricId}`,
        metric_id: metricId,
        metric_name: rootMetric?.metric_name ?? metricId,
        display_name: rootMetric?.display_name ?? metricId,
        canonical_display_name: rootMetric?.canonical_display_name ?? null,
        lower_is_better: rootMetric?.lower_is_better ?? false,
        unit: rootMetric?.unit ?? null,
        scope: "subtask",
        subtask_key: sliceKey,
        subtask_name: sliceName,
      })
    }
  }

  // Materialise: convert internal Maps to JSON-friendly arrays. Drop
  // evals that ended up with a single root metric and no subtasks since
  // the runtime can already render those through the existing path.
  const finalEvals = {}
  for (const [evalId, bucket] of Object.entries(out)) {
    const rows = []
    for (const [routeId, values] of bucket.leaderboard_rows) {
      rows.push({ model_route_id: routeId, values })
    }
    // Skip evals where every model has at most one metric and no
    // subtask data — adds no information beyond the existing summary.
    const hasMultiMetric = rows.some((r) => Object.keys(r.values).length > 1)
    if (!hasMultiMetric && bucket.subtask_metrics.length === 0) continue
    finalEvals[evalId] = {
      leaderboard_rows: rows,
      subtask_metrics: bucket.subtask_metrics,
    }
  }
  return finalEvals
}

/**
 * Entry point: locate the warehouse (local cache or SNAPSHOT_URL), query
 * it, build the matrices and write them to OUT_PATH. Exits the process
 * with status 1 when neither a local cache nor SNAPSHOT_URL is available.
 *
 * @returns {Promise<void>}
 */
async function main() {
  // Sanity check: the warehouse parquet files must exist locally.
  // `pnpm cache-hf-data` (legacy) or a manual download populates them; the
  // v2 build-time path streams them via duckdb's HTTPS reader, so when we
  // can't find them locally we point duckdb at SNAPSHOT_URL instead.
  const snapshotUrl = process.env.SNAPSHOT_URL?.replace(/\/+$/, "")
  let base = WAREHOUSE
  let useRemote = false
  try {
    await fs.access(path.join(WAREHOUSE, "eval_results_view.parquet"))
  } catch {
    if (!snapshotUrl) {
      console.error(
        "[build-eval-matrices] no local warehouse cache and no SNAPSHOT_URL — abort.",
      )
      process.exit(1)
    }
    base = snapshotUrl
    useRemote = true
  }

  const t0 = Date.now()
  const db = await DuckDBInstance.create()

  // Render a parquet location as a single-quoted SQL string literal,
  // doubling any embedded single quotes.
  const fileRef = (name) => {
    const url = useRemote ? `${base}/${name}` : path.join(base, name)
    return `'${url.replace(/'/g, "''")}'`
  }

  const snapshotId = await readSnapshotId(base, useRemote)
  const tables = await queryWarehouse(db, fileRef)
  const finalEvals = buildEvalMatrices(tables)

  const payload = {
    snapshot_id: snapshotId,
    generated_at: new Date().toISOString(),
    evals: finalEvals,
  }

  // Serialise once and reuse the string for both the write and the size log.
  const json = JSON.stringify(payload)
  await fs.mkdir(path.dirname(OUT_PATH), { recursive: true })
  await fs.writeFile(OUT_PATH, json)

  const sizeMb = (Buffer.byteLength(json, "utf8") / 1024 / 1024).toFixed(2)
  console.log(
    `[build-eval-matrices] wrote ${Object.keys(finalEvals).length} evals to ${path.relative(ROOT, OUT_PATH)} (${sizeMb} MB) in ${Date.now() - t0}ms`,
  )
}

main().catch((err) => {
  console.error("[build-eval-matrices] failed:", err)
  process.exit(1)
})