Spaces:

evaleval
/

general-eval-card

Running

App Files Files Community

general-eval-card / scripts /build-eval-matrices.mjs

evijit's picture

evijit HF Staff

Live snapshot date, hide empty Updated col, clean slice contamination

cb0ce7c 16 days ago

history blame contribute delete

14.6 kB

	#!/usr/bin/env node
	// Precompute per-evaluation multi-metric and per-slice score matrices and
	// emit them as one JSON map at data/eval-matrices.json.
	//
	// The runtime eval-summary endpoint currently only returns one
	// (metric, model) row per model — sourced from the primary metric of
	// `eval_results_view`. The other declared `leaderboard_metrics` (and
	// per-slice subtask scores hiding inside `fact_results`) are dropped on
	// the floor, which is why the eval page can't render a multi-metric
	// leaderboard or a slice dropdown.
	//
	// Both views are reconstructable from the warehouse parquet files; this
	// script does the join once at build time so the runtime path is a flat
	// O(1) lookup against the precomputed map.
	//
	// Output schema:
	// {
	// snapshot_id, // pinned for cache busting
	// generated_at,
	// evals: {
	// "<evaluation_id>": {
	// // Per-(model, metric) values across the eval's full
	// // leaderboard_metrics list. Drives the multi-metric matrix.
	// leaderboard_rows: [
	// { model_route_id, values: { "<column_key>": score | null } }
	// ],
	// // Subtask-scope metric entries to append to the eval's
	// // leaderboard_metrics. Each carries column_key
	// // "<metric_id>::<slice_key>" so it slots into values{} above.
	// subtask_metrics: [BenchmarkLeaderboardMetric]
	// }
	// }
	// }
	//
	// Run via the build chain (`pnpm build`) or standalone:
	// node scripts/build-eval-matrices.mjs

	import { DuckDBInstance } from "@duckdb/node-api"
	import fs from "node:fs/promises"
	import path from "node:path"
	import { fileURLToPath } from "node:url"

	// Recursively convert a value read from DuckDB-node into plain JSON-friendly
	// data (numbers, strings, arrays, plain objects).
	//
	// DuckDB-node returns its list / struct / map values as opaque wrapper
	// classes whose payload lives behind `.items` / `.entries`. The wrappers are
	// recognised by constructor name, so downstream code can treat the result as
	// plain JSON without importing the wrapper classes.
	//
	// @param {*} value - any value taken from a DuckDB row.
	// @returns {*} the plain equivalent; null / undefined and non-object
	//   primitives other than bigint are returned unchanged.
	function normalizeDuck(value) {
	  if (value == null) return value
	  // NOTE: Number() is lossy for bigints beyond Number.MAX_SAFE_INTEGER.
	  if (typeof value === "bigint") return Number(value)
	  if (Array.isArray(value)) return value.map(normalizeDuck)
	  if (typeof value !== "object") return value

	  const ctor = value.constructor?.name ?? ""

	  // LIST / ARRAY wrappers keep their elements in `.items`.
	  if (ctor === "DuckDBListValue" || ctor === "DuckDBArrayValue") {
	    return (value.items ?? []).map(normalizeDuck)
	  }

	  // STRUCT wrappers keep their fields in `.entries`; recurse into it so it
	  // is handled by the plain-object branch at the bottom.
	  if (ctor === "DuckDBStructValue") {
	    return normalizeDuck(value.entries)
	  }

	  // MAP wrappers expose an array of { key, value } pairs; keys are
	  // stringified so the result is a plain object.
	  if (ctor === "DuckDBMapValue" && Array.isArray(value.entries)) {
	    const mapped = {}
	    for (const entry of value.entries) {
	      mapped[String(entry.key)] = normalizeDuck(entry.value)
	    }
	    return mapped
	  }

	  // DECIMAL wrappers are converted to a number through their string form.
	  if (ctor === "DuckDBDecimalValue" && typeof value.toString === "function") {
	    return Number(value.toString())
	  }

	  // Any other DuckDB wrapper is represented by its string form.
	  if (ctor.startsWith("DuckDB") && typeof value.toString === "function") {
	    return value.toString()
	  }

	  // Plain object: normalise each own enumerable property.
	  const plain = {}
	  for (const [key, inner] of Object.entries(value)) {
	    plain[key] = normalizeDuck(inner)
	  }
	  return plain
	}

	// Materialise every row of a DuckDB result reader as plain JSON-friendly
	// objects.
	//
	// Each row is passed through normalizeDuck exactly once: a second pass over
	// already-normalised rows cannot change them and only doubles the work.
	//
	// @param {{ getRowObjects: () => object[] }} reader - result reader exposing
	//   getRowObjects().
	// @returns {object[]} one normalised object per row.
	function readDuckRows(reader) {
	  return reader.getRowObjects().map((row) => normalizeDuck(row))
	}

	// Repository root: two levels up from this script file (the file's own
	// directory, then that directory's parent).
	const ROOT = path.resolve(fileURLToPath(import.meta.url), "..", "..")
	// Directory the warehouse parquet files are read from when cached locally.
	const WAREHOUSE = path.join(ROOT, ".cache/hf-data/warehouse/latest")
	// Destination of the generated JSON map.
	const OUT_PATH = path.join(ROOT, "data/eval-matrices.json")

	// Entry point: precompute the per-evaluation score matrices and write them to
	// OUT_PATH as a single JSON document ({ snapshot_id, generated_at, evals }).
	//
	// Pipeline:
	//   1. Pick the parquet source: the local WAREHOUSE cache when
	//      eval_results_view.parquet is present there, otherwise SNAPSHOT_URL.
	//      With neither available the process exits with status 1.
	//   2. Run four DuckDB queries (metric rows, slice rows, eval metadata,
	//      model_id → model_route_id pairs). The connection is released even
	//      when a query fails.
	//   3. Bucket scores per evaluation and per model route, synthesising a
	//      subtask-scope metric entry for every "<metric_id>::<slice_key>" column.
	//   4. Serialise the payload once and write it to disk.
	//
	// Query and filesystem errors reject the returned promise.
	async function main() {
	  // Sanity check: the warehouse parquet files must exist locally.
	  // `pnpm cache-hf-data` (legacy) or a manual download populates them; the
	  // v2 build-time path streams them via duckdb's HTTPS reader, so when we
	  // can't find them locally we point duckdb at SNAPSHOT_URL instead.
	  const snapshotUrl = process.env.SNAPSHOT_URL?.replace(/\/+$/, "")
	  let base = WAREHOUSE
	  let useRemote = false
	  try {
	    await fs.access(path.join(WAREHOUSE, "eval_results_view.parquet"))
	  } catch {
	    if (!snapshotUrl) {
	      console.error(
	        "[build-eval-matrices] no local warehouse cache and no SNAPSHOT_URL — abort.",
	      )
	      process.exit(1)
	    }
	    base = snapshotUrl
	    useRemote = true
	  }

	  const t0 = Date.now()
	  const db = await DuckDBInstance.create()
	  const con = await db.connect()

	  // Render a parquet file name as a single-quoted SQL string literal,
	  // doubling embedded quotes so the path cannot break out of the literal.
	  const fileRef = (name) => {
	    const url = useRemote ? `${base}/${name}` : path.join(base, name)
	    return `'${url.replace(/'/g, "''")}'`
	  }

	  // Reading snapshot_id from snapshot_meta.json so the output can be
	  // matched against the pinned build snapshot. Best effort: any failure
	  // only produces a warning and leaves the id as "unknown".
	  let snapshotId = "unknown"
	  try {
	    let metaText
	    if (useRemote) {
	      const res = await fetch(`${base}/snapshot_meta.json`)
	      // Surface the HTTP status instead of a confusing JSON parse error
	      // when the server answers with an error page.
	      if (!res.ok) {
	        throw new Error(`HTTP ${res.status} fetching snapshot_meta.json`)
	      }
	      metaText = await res.text()
	    } else {
	      metaText = await fs.readFile(path.join(WAREHOUSE, "snapshot_meta.json"), "utf8")
	    }
	    snapshotId = JSON.parse(metaText).snapshot_id ?? "unknown"
	  } catch (err) {
	    console.warn(
	      `[build-eval-matrices] couldn't resolve snapshot_id: ${err instanceof Error ? err.message : String(err)}`,
	    )
	  }

	  let metricRows
	  let sliceRows
	  let evalRows
	  let modelKeyRows
	  try {
	    // 1. All (eval, model, metric, score) rows. Includes non-primary
	    //    metrics that getEvalSummaryById currently filters out.
	    metricRows = await con.runAndReadAll(`
	      SELECT
	        r.evaluation_id,
	        r.metric_id,
	        r.model_route_id,
	        r.score
	      FROM read_parquet(${fileRef("eval_results_view.parquet")}) r
	      WHERE r.score IS NOT NULL
	        AND r.model_route_id IS NOT NULL
	    `)

	    // 2. Per-slice (composite_slug, benchmark, model, metric, slice_key,
	    //    score) rows. The upstream pipeline parks slice scores in
	    //    fact_results rather than threading them through eval_results_view,
	    //    so we have to reach in here. We carry composite_slug because some
	    //    benchmarks (e.g. `gpqa`) appear under multiple composites and
	    //    fact_results emits a per-source pseudo-slice (slice_key =
	    //    "artificial analysis", "llm stats", "openeval gpqa", ...) for
	    //    each source family. Joining slices on (composite_slug,
	    //    benchmark_id) keeps each composite's slices in its own lane,
	    //    so HF Open LLM v2's GPQA doesn't inherit Artificial Analysis's
	    //    pseudo-slice, etc. Also drop the self-rollup (slice_key ==
	    //    benchmark_id) since that duplicates the eval's overall score.
	    //    AVG collapses the rare duplicate (model, slice) pairs.
	    sliceRows = await con.runAndReadAll(`
	      SELECT
	        f.composite_slug,
	        f.benchmark_id,
	        f.parent_benchmark_id,
	        f.metric_id,
	        f.slice_key,
	        f.slice_name,
	        f.model_id,
	        AVG(f.score) AS score
	      FROM read_parquet(${fileRef("fact_results.parquet")}) f
	      WHERE f.score IS NOT NULL
	        AND f.slice_key IS NOT NULL
	        AND f.metric_id IS NOT NULL
	        AND f.composite_slug IS NOT NULL
	        -- Drop any slice that's a self-rollup of the eval — slice_key
	        -- equals the benchmark, the composite, or the parent benchmark
	        -- after normalising separators (so "global mmlu lite" filters
	        -- against benchmark_id "global-mmlu-lite", "fibble_arena"
	        -- against "fibble-arena", "artificial analysis" against
	        -- composite "artificial-analysis-llms", etc.).
	        AND regexp_replace(lower(f.slice_key), '[^a-z0-9]+', '', 'g')
	            != regexp_replace(lower(f.benchmark_id), '[^a-z0-9]+', '', 'g')
	        AND regexp_replace(lower(f.slice_key), '[^a-z0-9]+', '', 'g')
	            != regexp_replace(lower(f.composite_slug), '[^a-z0-9]+', '', 'g')
	        -- Also drop slices whose slug is a strict prefix of the
	        -- composite_slug (e.g. "artificial analysis" vs
	        -- composite "artificial-analysis-llms" — the slice is just
	        -- the source family naming itself, not a real subtask).
	        AND NOT regexp_replace(lower(f.composite_slug), '[^a-z0-9]+', '', 'g')
	            LIKE regexp_replace(lower(f.slice_key), '[^a-z0-9]+', '', 'g') || '%'
	        AND (
	          f.parent_benchmark_id IS NULL
	          OR regexp_replace(lower(f.slice_key), '[^a-z0-9]+', '', 'g')
	             != regexp_replace(lower(f.parent_benchmark_id), '[^a-z0-9]+', '', 'g')
	        )
	      GROUP BY 1,2,3,4,5,6,7
	    `)

	    // 3. eval → (composite_slug, benchmark_id) mapping so we can join
	    //    slice rows back to the right evaluation_id. composite_slug is
	    //    what disambiguates HF Open LLM v2's GPQA from Artificial
	    //    Analysis's GPQA — both share benchmark_id `gpqa`. Also pull
	    //    leaderboard_metrics so we know each metric's metric_summary_id /
	    //    unit / lower_is_better when synthesising subtask-scope entries.
	    evalRows = await con.runAndReadAll(`
	      SELECT
	        evaluation_id,
	        benchmark_id,
	        parent_benchmark_id,
	        composite_slug,
	        leaderboard_metrics
	      FROM read_parquet(${fileRef("evals_view.parquet")})
	    `)

	    // 4. Map model_id → model_route_id so per-slice rows (which carry
	    //    model_id) can land alongside per-metric rows (model_route_id).
	    modelKeyRows = await con.runAndReadAll(`
	      SELECT DISTINCT model_id, model_route_id
	      FROM read_parquet(${fileRef("eval_results_view.parquet")})
	      WHERE model_route_id IS NOT NULL
	    `)
	  } finally {
	    // Release the connection whether or not the queries succeeded. The
	    // call is synchronous, so there is nothing to await.
	    con.disconnectSync()
	  }

	  // Index the model_id → route_id map so slice lookups are O(1). The first
	  // route seen for a model_id wins.
	  const modelIdToRoute = new Map()
	  for (const row of modelKeyRows.getRowObjects().map(normalizeDuck)) {
	    // A NULL model_id identifies no model; indexing it would attribute
	    // every slice row lacking a model_id to an arbitrary route.
	    if (row.model_id == null) continue
	    if (!modelIdToRoute.has(row.model_id)) {
	      modelIdToRoute.set(row.model_id, row.model_route_id)
	    }
	  }

	  // Group eval rows by evaluation_id, indexed by (composite_slug,
	  // benchmark_id) for the slice join. Two evals can share a benchmark_id
	  // across composites (gpqa under both hfopenllm-v2 and
	  // artificial-analysis-llms), so the composite_slug component is what
	  // keeps them separated.
	  const evalsByCompositeBench = new Map()
	  const evalsById = new Map()
	  const compositeBenchKey = (composite, bench) =>
	    `${composite ?? ""}|${bench ?? ""}`
	  for (const row of evalRows.getRowObjects().map(normalizeDuck)) {
	    evalsById.set(row.evaluation_id, row)
	    const bid = row.benchmark_id ?? null
	    const composite = row.composite_slug ?? null
	    if (bid && composite) {
	      const key = compositeBenchKey(composite, bid)
	      if (!evalsByCompositeBench.has(key)) evalsByCompositeBench.set(key, [])
	      evalsByCompositeBench.get(key).push(row.evaluation_id)
	    }
	  }

	  // Bucket metric rows by evaluation_id and within that by model.
	  // out[evalId].leaderboard_rows: route_id → { column_key: score }
	  const out = {}
	  const ensureEval = (evalId) => {
	    if (!out[evalId]) {
	      out[evalId] = {
	        leaderboard_rows: new Map(), // route_id → values
	        subtask_metric_keys: new Set(), // tracks which subtask cols we've seen
	        subtask_metrics: [],
	      }
	    }
	    return out[evalId]
	  }

	  for (const row of metricRows.getRowObjects().map(normalizeDuck)) {
	    const bucket = ensureEval(row.evaluation_id)
	    let modelEntry = bucket.leaderboard_rows.get(row.model_route_id)
	    if (!modelEntry) {
	      modelEntry = {}
	      bucket.leaderboard_rows.set(row.model_route_id, modelEntry)
	    }
	    modelEntry[row.metric_id] = Number(row.score)
	  }

	  // Plant slice scores. Each (metric_id, slice_key) becomes a column
	  // keyed "<metric_id>::<slice_key>" so it slots into values{} alongside
	  // root metrics. The matching subtask leaderboard metric metadata is
	  // emitted in subtask_metrics for the runtime to splice into the eval's
	  // leaderboard_metrics array.
	  for (const row of sliceRows.getRowObjects().map(normalizeDuck)) {
	    const evalIds = evalsByCompositeBench.get(
	      compositeBenchKey(row.composite_slug, row.benchmark_id),
	    )
	    if (!evalIds) continue
	    const route = modelIdToRoute.get(row.model_id)
	    if (!route) continue
	    const sliceKey = String(row.slice_key)
	    const sliceName = row.slice_name ? String(row.slice_name) : sliceKey
	    const metricId = String(row.metric_id)
	    const columnKey = `${metricId}::${sliceKey}`
	    const score = Number(row.score)
	    if (!Number.isFinite(score)) continue

	    for (const evalId of evalIds) {
	      const bucket = ensureEval(evalId)
	      let modelEntry = bucket.leaderboard_rows.get(route)
	      if (!modelEntry) {
	        modelEntry = {}
	        bucket.leaderboard_rows.set(route, modelEntry)
	      }
	      modelEntry[columnKey] = score

	      if (!bucket.subtask_metric_keys.has(columnKey)) {
	        bucket.subtask_metric_keys.add(columnKey)
	        // Look up the parent eval's metric metadata so the subtask-scope
	        // entry inherits unit / lower_is_better. Fall back to defaults
	        // when the registry doesn't carry the metric.
	        const evalMeta = evalsById.get(evalId)
	        const rootMetric = (evalMeta?.leaderboard_metrics ?? []).find(
	          (m) => m.metric_id === metricId,
	        )
	        bucket.subtask_metrics.push({
	          column_key: columnKey,
	          metric_summary_id: rootMetric?.metric_summary_id ?? `${evalId}%3A${metricId}`,
	          metric_id: metricId,
	          metric_name: rootMetric?.metric_name ?? metricId,
	          display_name: rootMetric?.display_name ?? metricId,
	          canonical_display_name: rootMetric?.canonical_display_name ?? null,
	          lower_is_better: rootMetric?.lower_is_better ?? false,
	          unit: rootMetric?.unit ?? null,
	          scope: "subtask",
	          subtask_key: sliceKey,
	          subtask_name: sliceName,
	        })
	      }
	    }
	  }

	  // Materialise: convert internal Maps to JSON-friendly arrays. Drop
	  // evals that ended up with a single root metric and no subtasks since
	  // the runtime can already render those through the existing path.
	  const finalEvals = {}
	  for (const [evalId, bucket] of Object.entries(out)) {
	    const rows = []
	    for (const [routeId, values] of bucket.leaderboard_rows) {
	      rows.push({ model_route_id: routeId, values })
	    }
	    // Skip evals where every model has at most one metric and no
	    // subtask data — adds no information beyond the existing summary.
	    const hasMultiMetric = rows.some((r) => Object.keys(r.values).length > 1)
	    if (!hasMultiMetric && bucket.subtask_metrics.length === 0) continue
	    finalEvals[evalId] = {
	      leaderboard_rows: rows,
	      subtask_metrics: bucket.subtask_metrics,
	    }
	  }

	  const payload = {
	    snapshot_id: snapshotId,
	    generated_at: new Date().toISOString(),
	    evals: finalEvals,
	  }

	  // Serialise once: the same string is written to disk and measured for
	  // the log line, instead of stringifying the whole payload twice.
	  const json = JSON.stringify(payload)

	  await fs.mkdir(path.dirname(OUT_PATH), { recursive: true })
	  await fs.writeFile(OUT_PATH, json)

	  const sizeMb = (Buffer.byteLength(json, "utf8") / 1024 / 1024).toFixed(2)
	  console.log(
	    `[build-eval-matrices] wrote ${Object.keys(finalEvals).length} evals to ${path.relative(ROOT, OUT_PATH)} (${sizeMb} MB) in ${Date.now() - t0}ms`,
	  )
	}

	// Kick off the build. Any rejection that escapes main() is logged with the
	// script's prefix and turned into a non-zero exit status.
	main().catch((failure) => {
	  console.error("[build-eval-matrices] failed:", failure)
	  process.exit(1)
	})