Spaces:

evaleval
/

general-eval-card

Running

File size: 14,607 Bytes

#!/usr/bin/env node
// Precompute per-evaluation multi-metric and per-slice score matrices and
// emit them as one JSON map at data/eval-matrices.json.
//
// The runtime eval-summary endpoint currently only returns one
// (metric, model) row per model — sourced from the primary metric of
// `eval_results_view`. The other declared `leaderboard_metrics` (and
// per-slice subtask scores hiding inside `fact_results`) are dropped on
// the floor, which is why the eval page can't render a multi-metric
// leaderboard or a slice dropdown.
//
// Both views are reconstructable from the warehouse parquet files; this
// script does the join once at build time so the runtime path is a flat
// O(1) lookup against the precomputed map.
//
// Output schema:
//   {
//     snapshot_id,                     // pinned for cache busting
//     generated_at,
//     evals: {
//       "<evaluation_id>": {
//         // Per-(model, metric) values across the eval's full
//         // leaderboard_metrics list. Drives the multi-metric matrix.
//         leaderboard_rows: [
//           { model_route_id, values: { "<column_key>": score | null } }
//         ],
//         // Subtask-scope metric entries to *append* to the eval's
//         // leaderboard_metrics. Each carries column_key
//         // "<metric_id>::<slice_key>" so it slots into values{} above.
//         subtask_metrics: [BenchmarkLeaderboardMetric]
//       }
//     }
//   }
//
// Run via the build chain (`pnpm build`) or standalone:
//   node scripts/build-eval-matrices.mjs

import { DuckDBInstance } from "@duckdb/node-api"
import fs from "node:fs/promises"
import path from "node:path"
import { fileURLToPath } from "node:url"

// Values coming back from DuckDB-node for list / struct / map columns are
// wrapper instances (payload under `.items` / `.entries`) rather than plain
// JSON. This recursively unwraps them, dispatching on the wrapper's
// constructor name, so callers can treat every row as plain data.
//
//   - null / undefined and non-object primitives pass through unchanged
//   - bigint becomes Number (precision is lost beyond 2^53)
//   - list / array wrappers become arrays, struct / map wrappers objects
//   - decimal wrappers become Number via their string form
//   - any other "DuckDB*" wrapper is reduced to its toString() text
//   - every other object is rebuilt key by key
function normalizeDuck(value) {
  if (value === null || value === undefined) return value
  if (typeof value === "bigint") return Number(value)
  if (typeof value !== "object") return value
  if (Array.isArray(value)) return value.map((item) => normalizeDuck(item))

  const kind = value.constructor?.name ?? ""
  // Evaluated lazily: only the decimal and generic-wrapper branches care.
  const hasToString = () => typeof value.toString === "function"

  switch (kind) {
    case "DuckDBListValue":
    case "DuckDBArrayValue":
      return (value.items ?? []).map((item) => normalizeDuck(item))
    case "DuckDBStructValue":
      return normalizeDuck(value.entries)
    case "DuckDBMapValue":
      if (Array.isArray(value.entries)) {
        const mapped = {}
        for (const { key, value: inner } of value.entries) {
          mapped[String(key)] = normalizeDuck(inner)
        }
        return mapped
      }
      // A map wrapper without an entries array falls through to the
      // generic wrapper handling below.
      break
    case "DuckDBDecimalValue":
      if (hasToString()) return Number(value.toString())
      break
    default:
      break
  }

  if (kind.startsWith("DuckDB") && hasToString()) return value.toString()

  const plain = {}
  for (const key of Object.keys(value)) plain[key] = normalizeDuck(value[key])
  return plain
}

/**
 * Materialise a DuckDB result reader as an array of plain row objects.
 *
 * Each row returned by `reader.getRowObjects()` is passed through
 * `normalizeDuck` exactly once. A second pass would see only already-plain
 * data and change nothing, so it is not performed.
 *
 * @param {{ getRowObjects: () => object[] }} reader - result reader exposing `getRowObjects()`
 * @returns {object[]} one normalised object per result row
 */
function readDuckRows(reader) {
  return reader.getRowObjects().map((row) => normalizeDuck(row))
}

// Directory containing this script; the repository root is its parent.
const scriptDir = path.dirname(fileURLToPath(import.meta.url))
const ROOT = path.resolve(scriptDir, "..")
// Local warehouse parquet cache, and the JSON file this script writes.
const WAREHOUSE = path.join(ROOT, ".cache/hf-data/warehouse/latest")
const OUT_PATH = path.join(ROOT, "data/eval-matrices.json")

/**
 * Best-effort lookup of the snapshot id recorded in snapshot_meta.json, so
 * the output can be matched against the pinned build snapshot.
 *
 * Never throws: any failure (missing file, HTTP error, malformed JSON) is
 * logged as a warning and "unknown" is returned instead.
 *
 * @param {string} base - local warehouse directory or remote snapshot URL
 * @param {boolean} useRemote - true when `base` is a URL to fetch from
 * @returns {Promise<string>} the snapshot id, or "unknown"
 */
async function resolveSnapshotId(base, useRemote) {
  try {
    let metaText
    if (useRemote) {
      const res = await fetch(`${base}/snapshot_meta.json`)
      // Without this check an HTTP error page would surface as a confusing
      // JSON parse error (or silently as "unknown").
      if (!res.ok) {
        throw new Error(`HTTP ${res.status} fetching snapshot_meta.json`)
      }
      metaText = await res.text()
    } else {
      metaText = await fs.readFile(path.join(base, "snapshot_meta.json"), "utf8")
    }
    return JSON.parse(metaText).snapshot_id ?? "unknown"
  } catch (err) {
    console.warn(
      `[build-eval-matrices] couldn't resolve snapshot_id: ${err instanceof Error ? err.message : String(err)}`,
    )
    return "unknown"
  }
}

/**
 * Run the four warehouse queries and return their rows as plain objects.
 *
 * The DuckDB connection is released in a `finally`, so a failing query does
 * not leave it open.
 *
 * @param {(name: string) => string} fileRef - maps a parquet file name to a
 *   single-quoted SQL string literal pointing at that file
 * @returns {Promise<{metricRows: object[], sliceRows: object[], evalRows: object[], modelKeyRows: object[]}>}
 */
async function queryWarehouse(fileRef) {
  // 1. All (eval, model, metric, score) rows. Includes non-primary
  //    metrics that getEvalSummaryById currently filters out.
  const metricSql = `
    SELECT
      r.evaluation_id,
      r.metric_id,
      r.model_route_id,
      r.score
    FROM read_parquet(${fileRef("eval_results_view.parquet")}) r
    WHERE r.score IS NOT NULL
      AND r.model_route_id IS NOT NULL
  `

  // 2. Per-slice (composite_slug, benchmark, model, metric, slice_key,
  //    score) rows. The upstream pipeline parks slice scores in
  //    fact_results rather than threading them through eval_results_view,
  //    so we have to reach in here. We carry composite_slug because some
  //    benchmarks (e.g. `gpqa`) appear under multiple composites and
  //    fact_results emits a per-source pseudo-slice (slice_key =
  //    "artificial analysis", "llm stats", "openeval gpqa", ...) for
  //    each source family. Joining slices on (composite_slug,
  //    benchmark_id) keeps each composite's slices in its own lane,
  //    so HF Open LLM v2's GPQA doesn't inherit Artificial Analysis's
  //    pseudo-slice, etc. Also drop the self-rollup (slice_key ==
  //    benchmark_id) since that duplicates the eval's overall score.
  //    AVG collapses the rare duplicate (model, slice) pairs.
  const sliceSql = `
    SELECT
      f.composite_slug,
      f.benchmark_id,
      f.parent_benchmark_id,
      f.metric_id,
      f.slice_key,
      f.slice_name,
      f.model_id,
      AVG(f.score) AS score
    FROM read_parquet(${fileRef("fact_results.parquet")}) f
    WHERE f.score IS NOT NULL
      AND f.slice_key IS NOT NULL
      AND f.metric_id IS NOT NULL
      AND f.composite_slug IS NOT NULL
      -- Drop any slice that's a self-rollup of the eval — slice_key
      -- equals the benchmark, the composite, or the parent benchmark
      -- after normalising separators (so "global mmlu lite" filters
      -- against benchmark_id "global-mmlu-lite", "fibble_arena"
      -- against "fibble-arena", "artificial analysis" against
      -- composite "artificial-analysis-llms", etc.).
      AND regexp_replace(lower(f.slice_key), '[^a-z0-9]+', '', 'g')
          != regexp_replace(lower(f.benchmark_id), '[^a-z0-9]+', '', 'g')
      AND regexp_replace(lower(f.slice_key), '[^a-z0-9]+', '', 'g')
          != regexp_replace(lower(f.composite_slug), '[^a-z0-9]+', '', 'g')
      -- Also drop slices whose slug is a strict prefix of the
      -- composite_slug (e.g. "artificial analysis" vs
      -- composite "artificial-analysis-llms" — the slice is just
      -- the source family naming itself, not a real subtask).
      AND NOT regexp_replace(lower(f.composite_slug), '[^a-z0-9]+', '', 'g')
          LIKE regexp_replace(lower(f.slice_key), '[^a-z0-9]+', '', 'g') || '%'
      AND (
        f.parent_benchmark_id IS NULL
        OR regexp_replace(lower(f.slice_key), '[^a-z0-9]+', '', 'g')
           != regexp_replace(lower(f.parent_benchmark_id), '[^a-z0-9]+', '', 'g')
      )
    GROUP BY 1,2,3,4,5,6,7
  `

  // 3. eval → (composite_slug, benchmark_id) mapping so we can join
  //    slice rows back to the right evaluation_id. composite_slug is
  //    what disambiguates HF Open LLM v2's GPQA from Artificial
  //    Analysis's GPQA — both share benchmark_id `gpqa`. Also pull
  //    leaderboard_metrics so we know each metric's metric_summary_id /
  //    unit / lower_is_better when synthesising subtask-scope entries.
  const evalSql = `
    SELECT
      evaluation_id,
      benchmark_id,
      parent_benchmark_id,
      composite_slug,
      leaderboard_metrics
    FROM read_parquet(${fileRef("evals_view.parquet")})
  `

  // 4. Map model_id → model_route_id so per-slice rows (which carry
  //    model_id) can land alongside per-metric rows (model_route_id).
  const modelKeySql = `
    SELECT DISTINCT model_id, model_route_id
    FROM read_parquet(${fileRef("eval_results_view.parquet")})
    WHERE model_route_id IS NOT NULL
  `

  const db = await DuckDBInstance.create()
  const con = await db.connect()
  try {
    const metricRows = readDuckRows(await con.runAndReadAll(metricSql))
    const sliceRows = readDuckRows(await con.runAndReadAll(sliceSql))
    const evalRows = readDuckRows(await con.runAndReadAll(evalSql))
    const modelKeyRows = readDuckRows(await con.runAndReadAll(modelKeySql))
    return { metricRows, sliceRows, evalRows, modelKeyRows }
  } finally {
    // Synchronous call; runs whether or not a query above threw.
    con.disconnectSync()
  }
}

/**
 * Join the query results in memory into the per-evaluation matrices.
 *
 * Pure function: no I/O, only reads its arguments.
 *
 * @param {object} tables
 * @param {object[]} tables.metricRows - (evaluation_id, metric_id, model_route_id, score)
 * @param {object[]} tables.sliceRows - per-slice rows keyed by model_id
 * @param {object[]} tables.evalRows - eval metadata incl. leaderboard_metrics
 * @param {object[]} tables.modelKeyRows - (model_id, model_route_id) pairs
 * @returns {Object<string, {leaderboard_rows: object[], subtask_metrics: object[]}>}
 *   evaluation_id → matrix; evals with at most one metric per model and no
 *   subtask columns are omitted.
 */
function assembleEvalMatrices({ metricRows, sliceRows, evalRows, modelKeyRows }) {
  // Index the model_id → route_id map so slice lookups are O(1). The first
  // route seen for a model_id wins.
  const modelIdToRoute = new Map()
  for (const row of modelKeyRows) {
    if (!modelIdToRoute.has(row.model_id)) {
      modelIdToRoute.set(row.model_id, row.model_route_id)
    }
  }

  // Index eval rows by evaluation_id, and by (composite_slug, benchmark_id)
  // for the slice join. Two evals can share a benchmark_id across
  // composites (gpqa under both hfopenllm-v2 and artificial-analysis-llms),
  // so the composite_slug component is what keeps them separated.
  const evalsByCompositeBench = new Map()
  const evalsById = new Map()
  const compositeBenchKey = (composite, bench) =>
    `${composite ?? ""}|${bench ?? ""}`
  for (const row of evalRows) {
    evalsById.set(row.evaluation_id, row)
    const bid = row.benchmark_id ?? null
    const composite = row.composite_slug ?? null
    if (bid && composite) {
      const key = compositeBenchKey(composite, bid)
      if (!evalsByCompositeBench.has(key)) evalsByCompositeBench.set(key, [])
      evalsByCompositeBench.get(key).push(row.evaluation_id)
    }
  }

  // Working state per evaluation:
  //   out[evalId].leaderboard_rows.get(modelRoute) = { column_key: score }
  const out = {}
  const ensureEval = (evalId) => {
    if (!out[evalId]) {
      out[evalId] = {
        leaderboard_rows: new Map(), // route_id → values
        subtask_metric_keys: new Set(), // tracks which subtask cols we've seen
        subtask_metrics: [],
      }
    }
    return out[evalId]
  }
  // Fetch (creating on first use) the values object for one model in a bucket.
  const valuesFor = (bucket, routeId) => {
    let values = bucket.leaderboard_rows.get(routeId)
    if (!values) {
      values = {}
      bucket.leaderboard_rows.set(routeId, values)
    }
    return values
  }

  // Root metrics: one column per metric_id.
  for (const row of metricRows) {
    const bucket = ensureEval(row.evaluation_id)
    valuesFor(bucket, row.model_route_id)[row.metric_id] = Number(row.score)
  }

  // Plant slice scores. Each (metric_id, slice_key) becomes a column
  // keyed "<metric_id>::<slice_key>" so it slots into values{} alongside
  // root metrics. The matching subtask leaderboard metric metadata is
  // emitted in subtask_metrics for the runtime to splice into the eval's
  // leaderboard_metrics array.
  for (const row of sliceRows) {
    const evalIds = evalsByCompositeBench.get(
      compositeBenchKey(row.composite_slug, row.benchmark_id),
    )
    if (!evalIds) continue
    const route = modelIdToRoute.get(row.model_id)
    if (!route) continue
    const sliceKey = String(row.slice_key)
    const sliceName = row.slice_name ? String(row.slice_name) : sliceKey
    const metricId = String(row.metric_id)
    const columnKey = `${metricId}::${sliceKey}`
    const score = Number(row.score)
    if (!Number.isFinite(score)) continue

    for (const evalId of evalIds) {
      const bucket = ensureEval(evalId)
      valuesFor(bucket, route)[columnKey] = score

      if (bucket.subtask_metric_keys.has(columnKey)) continue
      bucket.subtask_metric_keys.add(columnKey)
      // Look up the parent eval's metric metadata so the subtask-scope
      // entry inherits unit / lower_is_better. Fall back to defaults
      // when the registry doesn't carry the metric (or when
      // leaderboard_metrics did not decode to an array).
      const declared = evalsById.get(evalId)?.leaderboard_metrics
      const rootMetric = Array.isArray(declared)
        ? declared.find((m) => m?.metric_id === metricId)
        : undefined
      bucket.subtask_metrics.push({
        column_key: columnKey,
        metric_summary_id: rootMetric?.metric_summary_id ?? `${evalId}%3A${metricId}`,
        metric_id: metricId,
        metric_name: rootMetric?.metric_name ?? metricId,
        display_name: rootMetric?.display_name ?? metricId,
        canonical_display_name: rootMetric?.canonical_display_name ?? null,
        lower_is_better: rootMetric?.lower_is_better ?? false,
        unit: rootMetric?.unit ?? null,
        scope: "subtask",
        subtask_key: sliceKey,
        subtask_name: sliceName,
      })
    }
  }

  // Materialise: convert internal Maps to JSON-friendly arrays.
  const finalEvals = {}
  for (const [evalId, bucket] of Object.entries(out)) {
    const rows = []
    for (const [routeId, values] of bucket.leaderboard_rows) {
      rows.push({ model_route_id: routeId, values })
    }
    // Skip evals where every model has at most one metric and no
    // subtask data — adds no information beyond the existing summary.
    const hasMultiMetric = rows.some((r) => Object.keys(r.values).length > 1)
    if (!hasMultiMetric && bucket.subtask_metrics.length === 0) continue
    finalEvals[evalId] = {
      leaderboard_rows: rows,
      subtask_metrics: bucket.subtask_metrics,
    }
  }
  return finalEvals
}

/**
 * Entry point: locate the warehouse (local cache, else SNAPSHOT_URL), query
 * it, build the per-evaluation matrices and write them to OUT_PATH.
 *
 * Exits the process with status 1 when neither a local cache nor
 * SNAPSHOT_URL is available; other failures reject the returned promise.
 *
 * @returns {Promise<void>}
 */
async function main() {
  // Sanity check: the warehouse parquet files must exist locally.
  // `pnpm cache-hf-data` (legacy) or a manual download populates them; the
  // v2 build-time path streams them via duckdb's HTTPS reader, so when we
  // can't find them locally we point duckdb at SNAPSHOT_URL instead.
  const snapshotUrl = process.env.SNAPSHOT_URL?.replace(/\/+$/, "")
  let base = WAREHOUSE
  let useRemote = false
  try {
    await fs.access(path.join(WAREHOUSE, "eval_results_view.parquet"))
  } catch {
    if (!snapshotUrl) {
      console.error(
        "[build-eval-matrices] no local warehouse cache and no SNAPSHOT_URL — abort.",
      )
      process.exit(1)
    }
    base = snapshotUrl
    useRemote = true
  }

  const t0 = Date.now()

  // Render a parquet location as a SQL string literal, doubling any single
  // quote so the path cannot terminate the literal early.
  const fileRef = (name) => {
    const url = useRemote ? `${base}/${name}` : path.join(base, name)
    return `'${url.replace(/'/g, "''")}'`
  }

  const snapshotId = await resolveSnapshotId(base, useRemote)
  const finalEvals = assembleEvalMatrices(await queryWarehouse(fileRef))

  const payload = {
    snapshot_id: snapshotId,
    generated_at: new Date().toISOString(),
    evals: finalEvals,
  }

  // Serialise once and reuse the string for both the write and the size
  // report.
  const json = JSON.stringify(payload)
  await fs.mkdir(path.dirname(OUT_PATH), { recursive: true })
  await fs.writeFile(OUT_PATH, json)

  const sizeMb = (Buffer.byteLength(json, "utf8") / 1024 / 1024).toFixed(2)
  console.log(
    `[build-eval-matrices] wrote ${Object.keys(finalEvals).length} evals to ${path.relative(ROOT, OUT_PATH)} (${sizeMb} MB) in ${Date.now() - t0}ms`,
  )
}

// Script entry: run the build; on any rejection log the error and exit
// with a non-zero status so the surrounding build chain fails.
function reportFailure(err) {
  console.error("[build-eval-matrices] failed:", err)
  process.exit(1)
}

main().catch(reportFailure)