Spaces:

evaleval
/

general-eval-card

Running on CPU Spr

Jenny Chim Claude Opus 4.7 (1M context) committed on Apr 28

Commit

d3cbe09

1 Parent(s): 2fcae3f

Add three-tier test infrastructure for migration safety

Tier A — pipeline contract tests (tests/pipeline-contract.test.ts, ~14
tests against pinned tests/fixtures/) assert that every field the TS code
depends on is present in upstream artifacts. tests/upstream-drift.test.ts
runs the same shape against the live .cache/hf-data/ (5830 models, 587
evals); opt-in via `pnpm test:drift` (RUN_DRIFT=1 env gate) so it doesn't
flap on every upstream refresh.

Tier B — adapter snapshot tests (tests/adapters.test.ts) snapshot the
output of hfModelCardToEvaluationCardData, hfEvalDetailToSummary,
flattenModelEvaluations, and hfDeveloperDetailToSummary against ~17
hand-curated fixtures. Large outputs use a digest (count + distinct sets +
sha256 of full output) so the snapshot stays reviewable.

Tier C — full-cache differential audit (scripts/audit-adapters.mjs) runs
every adapter against either pinned fixtures or the full live cache,
produces a deterministic JSON digest, and supports `--diff` mode for
before/after comparison of a code change. Catches distribution shifts that
fixture-based tests can't surface.

Fixture management: tests/fixtures/manifest.json catalogs each fixture
with a `why` annotation explaining the code path it exercises (multi-
variant model, first-party Mercor, third-party Artificial Analysis,
Safety regression-bait, coding hierarchy key, etc). `pnpm refresh-fixtures`
re-pins from .cache/hf-data/. Each fixture is small enough to review in a
PR diff. The fixture-vs-manifest consistency test catches both orphan
files and missing files.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

Files changed (26) hide show

scripts/audit-adapters.mjs +334 -0
scripts/refresh-fixtures.mjs +110 -0
scripts/server-only-shim.mjs +15 -0
tests/__snapshots__/adapters.test.ts.snap +0 -0
tests/adapters.test.ts +122 -0
tests/fixtures/developers/01-ai.json +2669 -0
tests/fixtures/developers/anthropic.json +0 -0
tests/fixtures/developers/openai.json +0 -0
tests/fixtures/evals/apex_v1.json +1929 -0
tests/fixtures/evals/appworld.json +0 -0
tests/fixtures/evals/artificial_analysis_llms_artificial_analysis_aime.json +0 -0
tests/fixtures/evals/helm_capabilities.json +0 -0
tests/fixtures/evals/helm_classic_truthfulqa.json +0 -0
tests/fixtures/evals/helm_lite_narrativeqa.json +0 -0
tests/fixtures/evals/helm_safety_simplesafetytests.json +0 -0
tests/fixtures/loader.ts +92 -0
tests/fixtures/manifest.json +81 -0
tests/fixtures/model-cards/01-ai__yi-34b.json +213 -0
tests/fixtures/model-cards/anthropic__claude-opus-4.5.json +241 -0
tests/fixtures/model-cards/openai__gpt-5.json +307 -0
tests/fixtures/models/ai21__j1-grande-v1-17b.json +0 -0
tests/fixtures/models/bytedance__seed-2-0-lite.json +2086 -0
tests/fixtures/models/google__gemini-3-flash.json +0 -0
tests/fixtures/models/openai__gpt-5-2-pro.json +0 -0
tests/pipeline-contract.test.ts +284 -0
tests/upstream-drift.test.ts +129 -0

scripts/audit-adapters.mjs ADDED Viewed

	@@ -0,0 +1,334 @@

+#!/usr/bin/env node
+// Tier C — full-cache differential audit.
+//
+// Runs every major adapter against either pinned fixtures or the live HF
+// cache, produces a deterministic JSON digest (per-adapter outputs_count,
+// outputs_hash, field distributions, invariant violation counts), and
+// supports a diff mode to compare two digests side-by-side.
+//
+// Usage:
+//   node scripts/audit-adapters.mjs --output baseline.json   # capture digest
+//   node scripts/audit-adapters.mjs --output candidate.json  # after a change
+//   node scripts/audit-adapters.mjs --diff baseline.json candidate.json
+//   node scripts/audit-adapters.mjs --against tests/fixtures # use pinned set
+//   node scripts/audit-adapters.mjs --against .cache/hf-data --output live.json
+//
+// Default --against is .cache/hf-data (the full production cache snapshot).
+// `--against tests/fixtures` falls back to manifest-listed IDs only.
+//
+// The script imports the same adapter functions the runtime uses, so output
+// changes when either adapter logic changes OR input data changes. Use diff
+// mode to separate the two: re-run with the same --against before and after a
+// code change, diff the digests.
+import "./server-only-shim.mjs"
+import { promises as fs } from "fs"
+import { createHash } from "crypto"
+import path from "path"
// Repository root; this script lives one directory below it.
const ROOT = path.resolve(import.meta.dirname, "..")
const args = parseArgs(process.argv.slice(2))

// Diff mode only compares two digest files written earlier, so it exits
// before any source directory is checked or any adapter module is imported.
if (args.diff) {
  const [baselinePath, candidatePath] = args.diff
  await runDiff(baselinePath, candidatePath)
  process.exit(0)
}

const sourceDir = path.resolve(ROOT, args.against ?? ".cache/hf-data")
await ensureDir(sourceDir)
console.log(`[audit] reading from ${sourceDir}`)

// Pick the loader from the RESOLVED directory rather than the raw flag text.
// Equivalent spellings ("tests/fixtures/", "./tests/fixtures", an absolute
// path) must all select the manifest-driven loader; the cache loader expects a
// different layout (a single model-cards.json instead of model-cards/<id>.json).
const usePinnedFixtures = sourceDir === path.resolve(ROOT, "tests/fixtures")

// Adapters are imported dynamically, after the diff-mode early exit.
// NOTE(review): the .ts specifiers presumably need a TypeScript-aware loader
// (the original comment mentions tsx) — confirm how this script is launched.
const { flattenModelEvaluations } = await import("../lib/hf-data.ts")
const {
  hfModelCardToEvaluationCardData,
  hfEvalDetailToSummary,
  hfDeveloperDetailToSummary,
} = await import("../lib/model-data.ts")

const { evals, models, developers, modelCards } = await loadInputs(sourceDir, usePinnedFixtures)
console.log(`[audit] inputs: ${evals.length} evals, ${models.length} models, ${developers.length} developers, ${modelCards.length} model cards`)

const digest = {
  version: 1,
  source: args.against ?? ".cache/hf-data",
  generated_at: new Date().toISOString(),
  inputs: {
    evals: evals.length,
    models: models.length,
    developers: developers.length,
    model_cards: modelCards.length,
  },
  adapters: {
    hfModelCardToEvaluationCardData: auditAdapter(modelCards, (entry) => entry.model_route_id, hfModelCardToEvaluationCardData, {
      categorical: ["developer"],
      numeric: ["evaluations_count", "benchmarks_count", "variant_count", "evaluator_count"],
    }),
    hfEvalDetailToSummary: auditAdapter(evals, (entry) => entry.eval_summary_id, hfEvalDetailToSummary, {
      categorical: ["category"],
      numeric: ["models_count", "metrics_count", "subtasks_count"],
    }),
    // Every flattened evaluation is hashed in full by auditAdapter, so any
    // leaf-value change is detected; only a few fields are tracked as
    // distributions to keep the per-field histograms readable.
    flattenModelEvaluations: auditAdapter(models, (entry) => entry.model_route_id, flattenModelEvaluations, {
      categorical: ["category"],
      numeric: [],
      arrayOutput: true,
      // Getters feed distribution tracking only; they do not affect the hash
      // because the hash is computed over the items themselves.
      categoricalGetters: {
        evaluator_relationship: (e) => e.source_metadata?.evaluator_relationship,
        benchmark_family_key: (e) => e.benchmark_family_key,
      },
    }),
    hfDeveloperDetailToSummary: auditAdapter(developers, (entry) => entry.developer, hfDeveloperDetailToSummary, {
      categorical: ["developer"],
      numeric: ["model_count", "benchmark_count", "evaluation_count"],
    }),
  },
}

// Write the digest to --output when given, otherwise print it to stdout.
if (args.output) {
  await fs.writeFile(args.output, `${JSON.stringify(digest, null, 2)}\n`)
  console.log(`[audit] wrote ${args.output}`)
} else {
  console.log(JSON.stringify(digest, null, 2))
}
+// -----------------------------------------------------------------------------
/**
 * Runs `adapter` over every input and summarises the results as one digest entry.
 *
 * @param inputs  Array of raw inputs handed to the adapter one at a time.
 * @param getId   Returns an identifier for an input; used only in error examples
 *                ("<no-id>" when it returns null/undefined).
 * @param adapter Function under audit, called as `adapter(input)`.
 * @param opts    `categorical`: output fields counted per distinct value.
 *                `numeric`: output fields summarised as count/sum/min/max/median.
 *                `arrayOutput`: when true the adapter returns an array and each
 *                element is treated as one output item.
 *                `categoricalGetters`: optional map of name → function(item)
 *                whose result is counted like a categorical field.
 * @returns `{ inputs_count, outputs_count, outputs_hash, throws,
 *            throws_examples, field_distributions }`. `outputs_hash` is the first
 *            16 hex characters of a SHA-256 over every item's key-sorted JSON.
 */
function auditAdapter(inputs, getId, adapter, opts) {
  const getters = opts.categoricalGetters ?? {}
  const fieldValues = {}
  for (const field of opts.categorical) fieldValues[field] = new Map()
  for (const field of opts.numeric) fieldValues[field] = []
  for (const field of Object.keys(getters)) fieldValues[field] = new Map()

  const outputsHash = createHash("sha256")
  const throwsExamples = []
  let throws = 0
  let outputsCount = 0

  const bump = (counts, key) => counts.set(key, (counts.get(key) ?? 0) + 1)

  for (const input of inputs) {
    const id = getId(input) ?? "<no-id>"
    let items
    try {
      const output = adapter(input)
      // A non-array result in array mode is an adapter failure, not a reason
      // to abort the whole audit; record it alongside thrown errors.
      if (opts.arrayOutput && !Array.isArray(output)) {
        throw new TypeError(`expected an array output, got ${output === null ? "null" : typeof output}`)
      }
      items = opts.arrayOutput ? output : [output]
    } catch (err) {
      throws += 1
      // Keep only the first five failures so the digest stays small.
      if (throwsExamples.length < 5) {
        throwsExamples.push({ id, error: err instanceof Error ? err.message : String(err) })
      }
      continue
    }

    outputsCount += items.length
    for (const item of items) {
      // Hash the full item so every leaf value contributes. JSON.stringify
      // yields undefined for an undefined item, which hash.update() rejects,
      // so a fixed token stands in for it.
      outputsHash.update(JSON.stringify(stableSort(item)) ?? "undefined")
      for (const field of opts.categorical) {
        bump(fieldValues[field], String(item?.[field] ?? "<missing>"))
      }
      for (const field of opts.numeric) {
        const v = item?.[field]
        // Non-finite and non-numeric values are left out of the statistics.
        if (typeof v === "number" && Number.isFinite(v)) fieldValues[field].push(v)
      }
      for (const [field, getter] of Object.entries(getters)) {
        bump(fieldValues[field], String(getter(item) ?? "<missing>"))
      }
    }
  }

  // Order distribution keys by UTF-16 code unit instead of localeCompare, so
  // the serialised digest does not depend on the machine's locale data.
  const byKey = ([a], [b]) => (a < b ? -1 : a > b ? 1 : 0)
  const distributions = {}
  for (const field of [...opts.categorical, ...Object.keys(getters)]) {
    distributions[field] = Object.fromEntries([...fieldValues[field].entries()].sort(byKey))
  }
  for (const field of opts.numeric) {
    const arr = fieldValues[field]
    if (arr.length === 0) {
      distributions[field] = { count: 0 }
      continue
    }
    const sorted = [...arr].sort((a, b) => a - b)
    distributions[field] = {
      count: arr.length,
      sum: sorted.reduce((a, b) => a + b, 0),
      min: sorted[0],
      max: sorted[sorted.length - 1],
      // For an even count this picks the upper of the two middle values.
      median: sorted[Math.floor(sorted.length / 2)],
    }
  }

  return {
    inputs_count: inputs.length,
    outputs_count: outputsCount,
    outputs_hash: `sha256:${outputsHash.digest("hex").slice(0, 16)}`,
    throws,
    throws_examples: throwsExamples,
    field_distributions: distributions,
  }
}
/**
 * Returns a deep copy of `value` in which every plain object's keys are
 * inserted in ascending UTF-16 code-unit order, so JSON.stringify of the
 * result does not depend on the original key insertion order.
 *
 * Keys are ordered with the default Array.prototype.sort comparison rather
 * than localeCompare: locale-aware collation can differ between machines,
 * which would make the digest hash non-reproducible.
 *
 * Arrays keep their element order; primitives and null are returned as-is.
 */
function stableSort(value) {
  if (Array.isArray(value)) return value.map((element) => stableSort(element))
  if (value && typeof value === "object") {
    const ordered = {}
    for (const key of Object.keys(value).sort()) {
      ordered[key] = stableSort(value[key])
    }
    return ordered
  }
  return value
}
// Chooses the input loader: the manifest-driven fixture loader when
// `isPinnedFixtures` is truthy, otherwise the directory-scanning cache loader.
// Resolves to whatever the selected loader returns.
async function loadInputs(sourceDir, isPinnedFixtures) {
  const loader = isPinnedFixtures ? loadFromFixtures : loadFromCache
  return loader(sourceDir)
}
// Loads only the inputs listed in <sourceDir>/manifest.json. Each manifest
// entry's `id` names a JSON file inside the group's subdirectory. Files are
// read one at a time, in manifest order; a missing or malformed file rejects.
async function loadFromFixtures(sourceDir) {
  const readJson = async (...segments) =>
    JSON.parse(await fs.readFile(path.join(sourceDir, ...segments), "utf8"))

  const manifest = await readJson("manifest.json")

  // [key in the returned object, manifest key, subdirectory name]
  const layout = [
    ["evals", "evals", "evals"],
    ["models", "models", "models"],
    ["developers", "developers", "developers"],
    ["modelCards", "model_cards", "model-cards"],
  ]

  const groups = { evals: [], models: [], developers: [], modelCards: [] }
  for (const [groupKey, manifestKey, dirName] of layout) {
    for (const entry of manifest[manifestKey] ?? []) {
      groups[groupKey].push(await readJson(dirName, `${entry.id}.json`))
    }
  }
  return groups
}
/**
 * Loads every *.json file under <sourceDir>/{evals,models,developers} plus the
 * flat <sourceDir>/model-cards.json list.
 *
 * File names are sorted before reading. Directory listing order is not
 * guaranteed, and auditAdapter hashes outputs in input order, so reading in
 * listing order could give different digests for identical data.
 *
 * A subdirectory that cannot be listed contributes no inputs, and an unreadable
 * model-cards.json is treated as an empty list (best-effort, as before).
 * Malformed JSON still rejects.
 *
 * @returns `{ evals, models, developers, modelCards }`
 */
async function loadFromCache(sourceDir) {
  const groups = { evals: [], models: [], developers: [], modelCards: [] }

  for (const dirName of ["evals", "models", "developers"]) {
    const dir = path.join(sourceDir, dirName)
    const listing = await fs.readdir(dir).catch(() => [])
    const jsonFiles = listing.filter((file) => file.endsWith(".json")).sort()
    for (const file of jsonFiles) {
      groups[dirName].push(JSON.parse(await fs.readFile(path.join(dir, file), "utf8")))
    }
  }

  const modelCardsRaw = await fs.readFile(path.join(sourceDir, "model-cards.json"), "utf8").catch(() => "[]")
  groups.modelCards = JSON.parse(modelCardsRaw)
  return groups
}
// Prints a human-readable comparison of two digest files written by this
// script. For each adapter present in both, it reports changes to the output
// hash, output count, throw count and each field distribution. Adapters
// present on only one side are reported as "added" or "removed".
async function runDiff(baselinePath, candidatePath) {
  const readDigest = async (file) => JSON.parse(await fs.readFile(file, "utf8"))
  const baseline = await readDigest(baselinePath)
  const candidate = await readDigest(candidatePath)

  console.log(`baseline:  ${baseline.source} @ ${baseline.generated_at}`)
  console.log(`candidate: ${candidate.source} @ ${candidate.generated_at}`)
  console.log()

  const unionOfKeys = (left, right) => new Set([...Object.keys(left ?? {}), ...Object.keys(right ?? {})])

  for (const name of [...unionOfKeys(baseline.adapters, candidate.adapters)].sort()) {
    const before = baseline.adapters?.[name]
    const after = candidate.adapters?.[name]
    if (!before || !after) {
      console.log(`${name}: ${before ? "removed" : "added"}`)
      continue
    }

    const report = []
    if (before.outputs_hash !== after.outputs_hash) {
      report.push(`  hash:        ${before.outputs_hash} → ${after.outputs_hash}`)
    }
    if (before.outputs_count !== after.outputs_count) {
      report.push(`  outputs:     ${before.outputs_count} → ${after.outputs_count}`)
    }
    if (before.throws !== after.throws) {
      report.push(`  throws:      ${before.throws} → ${after.throws}`)
    }
    // Error examples are shown only when the throw count went up.
    if (after.throws > before.throws && after.throws_examples?.length > 0) {
      const shown = after.throws_examples.slice(0, 3).map((e) => `${e.id}: ${e.error}`)
      report.push(`  new errors:  ${shown.join("; ")}`)
    }

    for (const field of unionOfKeys(before.field_distributions, after.field_distributions)) {
      const prev = before.field_distributions?.[field] ?? {}
      const next = after.field_distributions?.[field] ?? {}
      if (JSON.stringify(prev) === JSON.stringify(next)) continue

      report.push(`  ${field}:`)
      // A baseline distribution without a "count" key is treated as
      // categorical (value → occurrences); anything else as numeric stats.
      const isCategorical = prev && typeof prev === "object" && !("count" in prev)
      if (isCategorical) {
        const valueKeys = [...new Set([...Object.keys(prev), ...Object.keys(next)])].sort()
        for (const k of valueKeys) {
          const was = prev[k]
          const now = next[k]
          if (was !== now) report.push(`    ${k}: ${was ?? "—"} → ${now ?? "—"}`)
        }
      } else {
        for (const stat of ["count", "min", "median", "max", "sum"]) {
          if (prev[stat] === next[stat]) continue
          report.push(`    ${stat}: ${prev[stat]} → ${next[stat]}`)
        }
      }
    }

    if (report.length > 0) {
      console.log(`${name}:`)
      report.forEach((line) => console.log(line))
    } else {
      console.log(`${name}: no change`)
    }
    console.log()
  }
}
// Verifies that `dir` is accessible. Despite the name it never creates
// anything: any fs.access failure is rethrown as a "not found" error.
async function ensureDir(dir) {
  try {
    await fs.access(dir)
  } catch {
    throw new Error(`Source directory ${dir} not found.`)
  }
}
/**
 * Parses the script's command-line flags.
 *
 * Recognised: `--output FILE`, `--against PATH`, `--live` (shorthand for
 * `--against .cache/hf-data`), `--diff A B`, and `--help` / `-h` (prints usage
 * and exits 0). Unrecognised arguments are ignored, as before.
 *
 * @param argv Arguments after the script name.
 * @returns Object with any of `output`, `against`, `diff` ([baseline, candidate]).
 * @throws Error when a flag that takes a value is followed by nothing or by
 *         another `--flag`. Previously the value silently became undefined and
 *         failed later with an unrelated error.
 */
function parseArgs(argv) {
  const out = {}

  // Returns the argument at `index`, or throws if it is absent or looks like
  // the next flag rather than a value.
  const takeValue = (flag, index, what) => {
    const value = argv[index]
    if (value === undefined || value.startsWith("--")) {
      throw new Error(`${flag} requires ${what}`)
    }
    return value
  }

  for (let i = 0; i < argv.length; i++) {
    const a = argv[i]
    if (a === "--output") out.output = takeValue(a, ++i, "a file path")
    else if (a === "--against") out.against = takeValue(a, ++i, "a source directory")
    else if (a === "--diff") {
      const baseline = takeValue(a, ++i, "two digest paths (baseline and candidate)")
      const candidate = takeValue(a, ++i, "two digest paths (baseline and candidate)")
      out.diff = [baseline, candidate]
    } else if (a === "--live") out.against = ".cache/hf-data"
    else if (a === "--help" || a === "-h") {
      console.log(`Usage: node scripts/audit-adapters.mjs [options]
  --output FILE         write digest as JSON
  --against PATH        source dir (default: .cache/hf-data); pinned: tests/fixtures
  --live                shorthand for --against .cache/hf-data
  --diff A B            diff two previously-written digests`)
      process.exit(0)
    }
  }
  return out
}

scripts/refresh-fixtures.mjs ADDED Viewed

	@@ -0,0 +1,110 @@

+#!/usr/bin/env node
+// Refresh tests/fixtures/{evals,models,developers,model-cards}/ from .cache/hf-data/.
+// Reads tests/fixtures/manifest.json for the curated ID list, copies each
+// referenced file from the live cache, and bumps manifest.snapshot_ts.
+//
+// Workflow: pnpm refresh-fixtures → git diff tests/fixtures/ → review what
+// upstream changed → pnpm test → if snapshots diff, decide intent → commit.
+//
+// Always re-pin everything (no incremental) so the snapshot is internally
+// consistent. If the cache lacks a referenced file, fail loudly — most likely
+// the manifest references a stale ID and either it should be updated or the
+// cache is incomplete.
+import { promises as fs } from "fs"
+import path from "path"
// Repository root; this script lives one directory below it.
const ROOT = path.resolve(import.meta.dirname, "..")
const CACHE = path.join(ROOT, ".cache", "hf-data")
const FIXTURES = path.join(ROOT, "tests", "fixtures")
const MANIFEST = path.join(FIXTURES, "manifest.json")

const manifest = JSON.parse(await fs.readFile(MANIFEST, "utf8"))
// The manifest may point at a non-default source; mention it so a refresh
// from an unexpected location does not go unnoticed.
const sourceDir = path.resolve(ROOT, manifest.snapshot_source ?? ".cache/hf-data")
if (sourceDir !== CACHE) {
  console.warn(`Note: manifest.snapshot_source = ${manifest.snapshot_source}, resolving to ${sourceDir}`)
}
await fs.access(sourceDir).catch(() => {
  throw new Error(`Cache directory ${sourceDir} not found. Run \`pnpm cache-hf-data\` first.`)
})

let copied = 0
let removed = 0
const errors = []

// Deletes every entry of `targetDir` whose name is not in `wanted` and returns
// how many were removed. A directory that cannot be listed counts as empty.
async function pruneStale(targetDir, wanted) {
  const existing = await fs.readdir(targetDir).catch(() => [])
  let count = 0
  for (const file of existing) {
    if (wanted.has(file)) continue
    await fs.unlink(path.join(targetDir, file))
    count += 1
  }
  return count
}

const fileNamesOf = (entries) => new Set(entries.map((entry) => `${entry.id}.json`))

// Detail files: copy the whole file from the matching cache subdirectory.
for (const [groupName, dirName] of [["evals", "evals"], ["models", "models"], ["developers", "developers"]]) {
  const entries = manifest[groupName] ?? []
  const targetDir = path.join(FIXTURES, dirName)
  await fs.mkdir(targetDir, { recursive: true })
  // Pin: only files in the manifest survive in tests/fixtures/<dir>/
  removed += await pruneStale(targetDir, fileNamesOf(entries))
  for (const entry of entries) {
    const fileName = `${entry.id}.json`
    const src = path.join(sourceDir, dirName, fileName)
    const dst = path.join(targetDir, fileName)
    try {
      await fs.copyFile(src, dst)
      copied += 1
    } catch (err) {
      // Collect every failure so one run reports all missing fixtures.
      errors.push({ group: groupName, id: entry.id, error: err instanceof Error ? err.message : String(err) })
    }
  }
}

// model_cards: extract individual entries from model-cards.json (the flat list).
const modelCardsManifest = manifest.model_cards ?? []
const modelCardsDir = path.join(FIXTURES, "model-cards")
// Prune even when the manifest lists no model cards; otherwise removing the
// last entry would leave its fixture file behind as an orphan, unlike the
// other groups.
removed += await pruneStale(modelCardsDir, fileNamesOf(modelCardsManifest))
if (modelCardsManifest.length > 0) {
  await fs.mkdir(modelCardsDir, { recursive: true })
  const allCards = JSON.parse(await fs.readFile(path.join(sourceDir, "model-cards.json"), "utf8"))
  const byRouteId = new Map(allCards.map((card) => [card.model_route_id, card]))
  for (const entry of modelCardsManifest) {
    const card = byRouteId.get(entry.id)
    if (!card) {
      errors.push({ group: "model_cards", id: entry.id, error: "model_route_id not found in model-cards.json" })
      continue
    }
    const dst = path.join(modelCardsDir, `${entry.id}.json`)
    await fs.writeFile(dst, `${JSON.stringify(card, null, 2)}\n`)
    copied += 1
  }
}

// Fail loudly and leave snapshot_ts untouched if anything could not be pinned.
if (errors.length > 0) {
  console.error("Failed to copy the following fixtures (likely missing from local cache):")
  for (const { group, id, error } of errors) {
    console.error(`  ${group}/${id}: ${error}`)
  }
  process.exit(1)
}

const updatedManifest = {
  ...manifest,
  snapshot_ts: new Date().toISOString(),
}
await fs.writeFile(MANIFEST, `${JSON.stringify(updatedManifest, null, 2)}\n`)
console.log(`Refreshed ${copied} fixture(s) from ${sourceDir} (removed ${removed} stale).`)
console.log(`snapshot_ts → ${updatedManifest.snapshot_ts}`)
console.log("\nNext: review `git diff tests/fixtures/`, run `pnpm test`, update snapshots with `pnpm test -- -u` if intentional.")

scripts/server-only-shim.mjs ADDED Viewed

	@@ -0,0 +1,15 @@

+// Monkey-patch require/import to no-op the `server-only` package, which
+// throws when imported outside a Next.js Server Component context. This lets
+// the audit script + adapter tests import lib/hf-data.ts and lib/model-data.ts
+// directly. Same trick as tests/server-only-stub.ts but at the require/import
+// resolution level instead of via vitest's alias config.
+import { createRequire } from "node:module"
+import Module from "node:module"
// Keep a handle on the stock implementation so every specifier other than the
// stubbed one is delegated to it unchanged.
const original = Module.prototype.require

// Replace Module.prototype.require so that requiring "server-only" yields an
// empty object instead of running the real package.
// NOTE(review): this hooks CommonJS `require` only; whether an ESM
// `import "server-only"` is routed through it depends on the loader in use —
// confirm against how lib/*.ts is actually loaded.
Module.prototype.require = function patchedRequire(specifier, ...rest) {
  if (specifier === "server-only") return {}
  return original.call(this, specifier, ...rest)
}

tests/__snapshots__/adapters.test.ts.snap ADDED Viewed

The diff for this file is too large to render. See raw diff

tests/adapters.test.ts ADDED Viewed

	@@ -0,0 +1,122 @@

+import { describe, expect, it } from "vitest"
+import type { HFEvalDetail, HFModelDetail, HFModelCardEntry } from "../lib/hf-data"
+import { flattenModelEvaluations } from "../lib/hf-data"
+import {
+  hfEvalDetailToSummary,
+  hfModelCardToEvaluationCardData,
+  hfDeveloperDetailToSummary,
+} from "../lib/model-data"
+import { fixtureEntries, loadFixture } from "./fixtures/loader"
+// Tier B — adapter snapshot tests.
+//
+// Each adapter is run against every relevant fixture, and the output is
+// snapshotted via vitest's toMatchSnapshot(). Initial snapshots are committed.
+//
+// Workflow:
+//   - Edit code → `pnpm test`
+//   - If snapshots match → no behavior change. Safe.
+//   - If snapshots differ → review the snap diff in tests/__snapshots__/
+//     alongside the code diff. If intentional behavior change, run
+//     `pnpm test -- -u` to update snapshots and commit them in the same PR.
+//     If unintentional, the test caught a regression — fix the code.
+//
+// New fixtures: add to tests/fixtures/manifest.json, run `pnpm refresh-fixtures`,
+// run `pnpm test -- -u` to capture initial snapshots, commit fixtures + snaps
+// together.
// One snapshot test per model-card fixture listed in the manifest. The test
// title is "<id> — <why>", which is also what keys the stored snapshot.
describe("hfModelCardToEvaluationCardData", () => {
  for (const { id, why } of fixtureEntries("model_cards")) {
    it(`${id} — ${why}`, () => {
      const card = loadFixture<HFModelCardEntry>("model_cards", id)
      const cardData = hfModelCardToEvaluationCardData(card)
      expect(cardData).toMatchSnapshot()
    })
  }
})
// One snapshot test per eval fixture listed in the manifest.
describe("hfEvalDetailToSummary", () => {
  for (const { id, why } of fixtureEntries("evals")) {
    it(`${id} — ${why}`, () => {
      const detail = loadFixture<HFEvalDetail>("evals", id)
      // Only the synchronous core is snapshotted here.
      // attachBenchmarkCardToSummary is async and I/O bound; per the original
      // note it is covered separately by the parity harness.
      const summary = hfEvalDetailToSummary(detail)
      expect(summary).toMatchSnapshot()
    })
  }
})
// One snapshot test per model fixture listed in the manifest.
describe("flattenModelEvaluations", () => {
  for (const { id, why } of fixtureEntries("models")) {
    it(`${id} — ${why}`, () => {
      const detail = loadFixture<HFModelDetail>("models", id)
      // The full output can run to 10k+ lines for large models, so a digest is
      // snapshotted instead: item count, distinct categories and evaluator
      // relationships, counts of distinct benchmark family keys and evaluation
      // ids, and a hash of the full output. The hash flags any change; the
      // structured fields keep the snapshot diff readable.
      const summary = digestEvaluations(flattenModelEvaluations(detail))
      expect(summary).toMatchSnapshot()
    })
  }
})
// One snapshot test per developer fixture listed in the manifest.
describe("hfDeveloperDetailToSummary", () => {
  for (const { id, why } of fixtureEntries("developers")) {
    it(`${id} — ${why}`, () => {
      const detail = loadFixture<{ developer: string; models: HFModelCardEntry[] }>("developers", id)
      // Developer fixtures can be large (the original note cites anthropic.json
      // at 389KB), so only a digest is snapshotted: scalar fields plus a count
      // and hash of the models array. Per-card detail is exercised by the
      // hfModelCardToEvaluationCardData suite.
      const summary = hfDeveloperDetailToSummary(detail)
      expect(digestDeveloperSummary(summary)).toMatchSnapshot()
    })
  }
})
+import { createHash } from "crypto"
+import type { BenchmarkEvaluation } from "../lib/benchmark-schema"
/**
 * Short fingerprint of a value: the first 12 hex characters of the SHA-256 of
 * its JSON serialisation. Keys are serialised in insertion order, so two
 * objects that differ only in key order produce different hashes.
 */
function stableHash(value: unknown): string {
  const serialized = JSON.stringify(value)
  const hexDigest = createHash("sha256").update(serialized).digest("hex")
  return hexDigest.slice(0, 12)
}
/**
 * Condenses a list of evaluations into a small, reviewable summary: total
 * count, distinct-value information for a few fields, how many items lack
 * `source_metadata`, and a hash of the full list. Falsy field values are not
 * counted as distinct values.
 */
function digestEvaluations(evaluations: BenchmarkEvaluation[]) {
  const categorySet = new Set<string>()
  const familyKeySet = new Set<string>()
  const relationshipSet = new Set<string>()
  const evaluationIdSet = new Set<string>()
  let withoutSourceMetadata = 0

  evaluations.forEach((evaluation) => {
    if (evaluation.category) categorySet.add(evaluation.category)
    if (evaluation.benchmark_family_key) familyKeySet.add(evaluation.benchmark_family_key)
    const relationship = evaluation.source_metadata?.evaluator_relationship
    if (relationship) relationshipSet.add(relationship)
    if (evaluation.evaluation_id) evaluationIdSet.add(evaluation.evaluation_id)
    if (!evaluation.source_metadata) withoutSourceMetadata += 1
  })

  return {
    count: evaluations.length,
    distinct_evaluation_ids: evaluationIdSet.size,
    distinct_categories: Array.from(categorySet).sort(),
    distinct_benchmark_family_keys: familyKeySet.size,
    distinct_evaluator_relationships: Array.from(relationshipSet).sort(),
    missing_source_metadata: withoutSourceMetadata,
    full_output_hash: stableHash(evaluations),
  }
}
/**
 * Reduces a developer summary to its scalar fields and `popular_evals`, and
 * replaces the `models` array with its length and a hash.
 */
function digestDeveloperSummary(summary: ReturnType<typeof hfDeveloperDetailToSummary>) {
  const {
    developer,
    route_id,
    model_count,
    benchmark_count,
    evaluation_count,
    popular_evals,
    models,
  } = summary
  return {
    developer,
    route_id,
    model_count,
    benchmark_count,
    evaluation_count,
    popular_evals,
    models_hash: stableHash(models),
    models_count: models.length,
  }
}

tests/fixtures/developers/01-ai.json ADDED Viewed

	@@ -0,0 +1,2669 @@

+{
+  "developer": "01-ai",
+  "models": [
+    {
+      "model_family_id": "01-ai/yi-34b",
+      "model_route_id": "01-ai__yi-34b",
+      "model_family_name": "Yi 34B",
+      "developer": "01-ai",
+      "params_billions": 34.0,
+      "total_evaluations": 3,
+      "benchmark_count": 3,
+      "benchmark_family_count": 3,
+      "categories_covered": [
+        "general",
+        "knowledge",
+        "reasoning"
+      ],
+      "last_updated": "2026-03-21T12:31:52.005480Z",
+      "variants": [
+        {
+          "variant_key": "default",
+          "variant_label": "Default",
+          "evaluation_count": 3,
+          "raw_model_ids": [
+            "01-ai/Yi-34B",
+            "01-ai/yi-34b"
+          ],
+          "last_updated": "2026-03-21T12:31:52.005480Z"
+        }
+      ],
+      "score_summary": {
+        "count": 52,
+        "min": 0.0514,
+        "max": 0.936,
+        "average": 0.6793153846153845
+      },
+      "reproducibility_summary": {
+        "results_total": 52,
+        "has_reproducibility_gap_count": 52,
+        "populated_ratio_avg": 0.0
+      },
+      "provenance_summary": {
+        "total_results": 52,
+        "total_groups": 52,
+        "multi_source_groups": 0,
+        "first_party_only_groups": 0,
+        "source_type_distribution": {
+          "first_party": 0,
+          "third_party": 52,
+          "collaborative": 0,
+          "unspecified": 0
+        }
+      },
+      "comparability_summary": {
+        "total_groups": 52,
+        "groups_with_variant_check": 0,
+        "groups_with_cross_party_check": 0,
+        "variant_divergent_count": 0,
+        "cross_party_divergent_count": 0
+      },
+      "benchmark_names": [
+        "BBH",
+        "GPQA",
+        "GSM8K",
+        "Helm lite",
+        "IFEval",
+        "LegalBench",
+        "MATH",
+        "MATH Level 5",
+        "MMLU",
+        "MMLU-PRO",
+        "MUSR",
+        "MedQA",
+        "NarrativeQA",
+        "NaturalQuestions (closed-book)",
+        "OpenbookQA",
+        "WMT 2014"
+      ],
+      "top_benchmark_scores": [
+        {
+          "benchmark": "MMLU",
+          "benchmarkKey": "helm_mmlu",
+          "canonical_display_name": "Mmlu / Marketing / Exact Match",
+          "evaluation_name": "Marketing",
+          "score": 0.936,
+          "metric": "EM on Marketing",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "OpenbookQA",
+          "benchmarkKey": "helm_lite_openbookqa",
+          "canonical_display_name": "OpenbookQA / Exact Match",
+          "evaluation_name": "OpenbookQA",
+          "score": 0.92,
+          "metric": "EM on OpenbookQA",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "NarrativeQA",
+          "benchmarkKey": "helm_lite_narrativeqa",
+          "canonical_display_name": "NarrativeQA / F1",
+          "evaluation_name": "NarrativeQA",
+          "score": 0.782,
+          "metric": "F1 on NarrativeQA",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "MedQA",
+          "benchmarkKey": "helm_lite_medqa",
+          "canonical_display_name": "MedQA / Exact Match",
+          "evaluation_name": "MedQA",
+          "score": 0.656,
+          "metric": "EM on MedQA",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "MMLU",
+          "benchmarkKey": "helm_lite_mmlu",
+          "canonical_display_name": "MMLU / Exact Match",
+          "evaluation_name": "MMLU",
+          "score": 0.65,
+          "metric": "EM on MMLU",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "GSM8K",
+          "benchmarkKey": "helm_lite_gsm8k",
+          "canonical_display_name": "GSM8K / Exact Match",
+          "evaluation_name": "GSM8K",
+          "score": 0.648,
+          "metric": "EM on GSM8K",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "LegalBench",
+          "benchmarkKey": "helm_lite_legalbench",
+          "canonical_display_name": "LegalBench / Exact Match",
+          "evaluation_name": "LegalBench",
+          "score": 0.618,
+          "metric": "EM on LegalBench",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "Helm lite",
+          "benchmarkKey": "helm_lite",
+          "canonical_display_name": "Helm lite / Win Rate",
+          "evaluation_name": "helm_lite",
+          "score": 0.57,
+          "metric": "How many models this model outperforms on average (over columns).",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "BBH",
+          "benchmarkKey": "hfopenllm_v2_bbh",
+          "canonical_display_name": "BBH / Accuracy",
+          "evaluation_name": "BBH",
+          "score": 0.5457,
+          "metric": "Accuracy on BBH",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "NaturalQuestions (closed-book)",
+          "benchmarkKey": "helm_lite_naturalquestions_closed_book",
+          "canonical_display_name": "NaturalQuestions (closed-book) / F1",
+          "evaluation_name": "NaturalQuestions (closed-book)",
+          "score": 0.443,
+          "metric": "F1 on NaturalQuestions (closed-book)",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "MMLU-PRO",
+          "benchmarkKey": "hfopenllm_v2_mmlu_pro",
+          "canonical_display_name": "MMLU-PRO / Accuracy",
+          "evaluation_name": "MMLU-PRO",
+          "score": 0.4412,
+          "metric": "Accuracy on MMLU-PRO",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "MUSR",
+          "benchmarkKey": "hfopenllm_v2_musr",
+          "canonical_display_name": "MUSR / Accuracy",
+          "evaluation_name": "MUSR",
+          "score": 0.4119,
+          "metric": "Accuracy on MUSR",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "MATH",
+          "benchmarkKey": "helm_lite_math",
+          "canonical_display_name": "MATH / Equivalent (CoT)",
+          "evaluation_name": "MATH",
+          "score": 0.375,
+          "metric": "Equivalent (CoT) on MATH",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "GPQA",
+          "benchmarkKey": "hfopenllm_v2_gpqa",
+          "canonical_display_name": "GPQA / Accuracy",
+          "evaluation_name": "GPQA",
+          "score": 0.3666,
+          "metric": "Accuracy on GPQA",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "IFEval",
+          "benchmarkKey": "hfopenllm_v2_ifeval",
+          "canonical_display_name": "IFEval / Accuracy",
+          "evaluation_name": "IFEval",
+          "score": 0.3046,
+          "metric": "Accuracy on IFEval",
+          "lower_is_better": false
+        }
+      ]
+    },
+    {
+      "model_family_id": "01-ai/yi-6b",
+      "model_route_id": "01-ai__yi-6b",
+      "model_family_name": "Yi 6B",
+      "developer": "01-ai",
+      "params_billions": 6.0,
+      "total_evaluations": 3,
+      "benchmark_count": 3,
+      "benchmark_family_count": 3,
+      "categories_covered": [
+        "general",
+        "knowledge",
+        "reasoning"
+      ],
+      "last_updated": "2026-03-21T12:31:52.005480Z",
+      "variants": [
+        {
+          "variant_key": "default",
+          "variant_label": "Default",
+          "evaluation_count": 3,
+          "raw_model_ids": [
+            "01-ai/Yi-6B",
+            "01-ai/yi-6b"
+          ],
+          "last_updated": "2026-03-21T12:31:52.005480Z"
+        }
+      ],
+      "score_summary": {
+        "count": 52,
+        "min": 0.0159,
+        "max": 0.893,
+        "average": 0.5652923076923078
+      },
+      "reproducibility_summary": {
+        "results_total": 52,
+        "has_reproducibility_gap_count": 52,
+        "populated_ratio_avg": 0.0
+      },
+      "provenance_summary": {
+        "total_results": 52,
+        "total_groups": 52,
+        "multi_source_groups": 0,
+        "first_party_only_groups": 0,
+        "source_type_distribution": {
+          "first_party": 0,
+          "third_party": 52,
+          "collaborative": 0,
+          "unspecified": 0
+        }
+      },
+      "comparability_summary": {
+        "total_groups": 52,
+        "groups_with_variant_check": 0,
+        "groups_with_cross_party_check": 0,
+        "variant_divergent_count": 0,
+        "cross_party_divergent_count": 0
+      },
+      "benchmark_names": [
+        "BBH",
+        "GPQA",
+        "GSM8K",
+        "Helm lite",
+        "IFEval",
+        "LegalBench",
+        "MATH",
+        "MATH Level 5",
+        "MMLU",
+        "MMLU-PRO",
+        "MUSR",
+        "MedQA",
+        "NarrativeQA",
+        "NaturalQuestions (closed-book)",
+        "OpenbookQA",
+        "WMT 2014"
+      ],
+      "top_benchmark_scores": [
+        {
+          "benchmark": "MMLU",
+          "benchmarkKey": "helm_mmlu",
+          "canonical_display_name": "Mmlu / Marketing / Exact Match",
+          "evaluation_name": "Marketing",
+          "score": 0.893,
+          "metric": "EM on Marketing",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "OpenbookQA",
+          "benchmarkKey": "helm_lite_openbookqa",
+          "canonical_display_name": "OpenbookQA / Exact Match",
+          "evaluation_name": "OpenbookQA",
+          "score": 0.8,
+          "metric": "EM on OpenbookQA",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "NarrativeQA",
+          "benchmarkKey": "helm_lite_narrativeqa",
+          "canonical_display_name": "NarrativeQA / F1",
+          "evaluation_name": "NarrativeQA",
+          "score": 0.702,
+          "metric": "F1 on NarrativeQA",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "MMLU",
+          "benchmarkKey": "helm_lite_mmlu",
+          "canonical_display_name": "MMLU / Exact Match",
+          "evaluation_name": "MMLU",
+          "score": 0.53,
+          "metric": "EM on MMLU",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "LegalBench",
+          "benchmarkKey": "helm_lite_legalbench",
+          "canonical_display_name": "LegalBench / Exact Match",
+          "evaluation_name": "LegalBench",
+          "score": 0.519,
+          "metric": "EM on LegalBench",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "MedQA",
+          "benchmarkKey": "helm_lite_medqa",
+          "canonical_display_name": "MedQA / Exact Match",
+          "evaluation_name": "MedQA",
+          "score": 0.497,
+          "metric": "EM on MedQA",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "BBH",
+          "benchmarkKey": "hfopenllm_v2_bbh",
+          "canonical_display_name": "BBH / Accuracy",
+          "evaluation_name": "BBH",
+          "score": 0.4309,
+          "metric": "Accuracy on BBH",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "MUSR",
+          "benchmarkKey": "hfopenllm_v2_musr",
+          "canonical_display_name": "MUSR / Accuracy",
+          "evaluation_name": "MUSR",
+          "score": 0.3937,
+          "metric": "Accuracy on MUSR",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "GSM8K",
+          "benchmarkKey": "helm_lite_gsm8k",
+          "canonical_display_name": "GSM8K / Exact Match",
+          "evaluation_name": "GSM8K",
+          "score": 0.375,
+          "metric": "EM on GSM8K",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "NaturalQuestions (closed-book)",
+          "benchmarkKey": "helm_lite_naturalquestions_closed_book",
+          "canonical_display_name": "NaturalQuestions (closed-book) / F1",
+          "evaluation_name": "NaturalQuestions (closed-book)",
+          "score": 0.31,
+          "metric": "F1 on NaturalQuestions (closed-book)",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "MMLU-PRO",
+          "benchmarkKey": "hfopenllm_v2_mmlu_pro",
+          "canonical_display_name": "MMLU-PRO / Accuracy",
+          "evaluation_name": "MMLU-PRO",
+          "score": 0.2991,
+          "metric": "Accuracy on MMLU-PRO",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "IFEval",
+          "benchmarkKey": "hfopenllm_v2_ifeval",
+          "canonical_display_name": "IFEval / Accuracy",
+          "evaluation_name": "IFEval",
+          "score": 0.2893,
+          "metric": "Accuracy on IFEval",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "GPQA",
+          "benchmarkKey": "hfopenllm_v2_gpqa",
+          "canonical_display_name": "GPQA / Accuracy",
+          "evaluation_name": "GPQA",
+          "score": 0.2693,
+          "metric": "Accuracy on GPQA",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "Helm lite",
+          "benchmarkKey": "helm_lite",
+          "canonical_display_name": "Helm lite / Win Rate",
+          "evaluation_name": "helm_lite",
+          "score": 0.253,
+          "metric": "How many models this model outperforms on average (over columns).",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "MATH",
+          "benchmarkKey": "helm_lite_math",
+          "canonical_display_name": "MATH / Equivalent (CoT)",
+          "evaluation_name": "MATH",
+          "score": 0.126,
+          "metric": "Equivalent (CoT) on MATH",
+          "lower_is_better": false
+        }
+      ]
+    },
+    {
+      "model_family_id": "01-ai/yi-34b-chat",
+      "model_route_id": "01-ai__yi-34b-chat",
+      "model_family_name": "Yi Chat 34B",
+      "developer": "01-ai",
+      "params_billions": 34.0,
+      "total_evaluations": 2,
+      "benchmark_count": 2,
+      "benchmark_family_count": 2,
+      "categories_covered": [
+        "general",
+        "knowledge",
+        "reasoning"
+      ],
+      "last_updated": "2026-04-20T22:14:39.271662Z",
+      "variants": [
+        {
+          "variant_key": "default",
+          "variant_label": "Default",
+          "evaluation_count": 2,
+          "raw_model_ids": [
+            "01-ai/Yi-34B-Chat",
+            "01-ai/yi-34b-chat"
+          ],
+          "last_updated": "2026-04-20T22:14:39.271662Z"
+        }
+      ],
+      "score_summary": {
+        "count": 380,
+        "min": 0.0,
+        "max": 1.0,
+        "average": 0.528668157894737
+      },
+      "reproducibility_summary": {
+        "results_total": 380,
+        "has_reproducibility_gap_count": 380,
+        "populated_ratio_avg": 0.0
+      },
+      "provenance_summary": {
+        "total_results": 380,
+        "total_groups": 380,
+        "multi_source_groups": 0,
+        "first_party_only_groups": 0,
+        "source_type_distribution": {
+          "first_party": 0,
+          "third_party": 380,
+          "collaborative": 0,
+          "unspecified": 0
+        }
+      },
+      "comparability_summary": {
+        "total_groups": 380,
+        "groups_with_variant_check": 0,
+        "groups_with_cross_party_check": 0,
+        "variant_divergent_count": 0,
+        "cross_party_divergent_count": 0
+      },
+      "benchmark_names": [
+        "BBH",
+        "GPQA",
+        "Helm air bench",
+        "IFEval",
+        "MATH Level 5",
+        "MMLU-PRO",
+        "MUSR"
+      ],
+      "top_benchmark_scores": [
+        {
+          "benchmark": "Helm air bench",
+          "benchmarkKey": "helm_air_bench",
+          "canonical_display_name": "Air bench / AIRBench 2024 - #39-40.34: Characterization of identity - Predicted Personality / Refusal Rate",
+          "evaluation_name": "AIRBench 2024 - #39-40.34: Characterization of identity - Predicted Personality",
+          "score": 1.0,
+          "metric": "Refusal Rate on AIRBench 2024 - #39-40.34: Characterization of identity - Predicted Personality",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "BBH",
+          "benchmarkKey": "hfopenllm_v2_bbh",
+          "canonical_display_name": "BBH / Accuracy",
+          "evaluation_name": "BBH",
+          "score": 0.5561,
+          "metric": "Accuracy on BBH",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "IFEval",
+          "benchmarkKey": "hfopenllm_v2_ifeval",
+          "canonical_display_name": "IFEval / Accuracy",
+          "evaluation_name": "IFEval",
+          "score": 0.4699,
+          "metric": "Accuracy on IFEval",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "MMLU-PRO",
+          "benchmarkKey": "hfopenllm_v2_mmlu_pro",
+          "canonical_display_name": "MMLU-PRO / Accuracy",
+          "evaluation_name": "MMLU-PRO",
+          "score": 0.4093,
+          "metric": "Accuracy on MMLU-PRO",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "MUSR",
+          "benchmarkKey": "hfopenllm_v2_musr",
+          "canonical_display_name": "MUSR / Accuracy",
+          "evaluation_name": "MUSR",
+          "score": 0.3978,
+          "metric": "Accuracy on MUSR",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "GPQA",
+          "benchmarkKey": "hfopenllm_v2_gpqa",
+          "canonical_display_name": "GPQA / Accuracy",
+          "evaluation_name": "GPQA",
+          "score": 0.3381,
+          "metric": "Accuracy on GPQA",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "MATH Level 5",
+          "benchmarkKey": "hfopenllm_v2_math_level_5",
+          "canonical_display_name": "MATH Level 5 / Exact Match",
+          "evaluation_name": "MATH Level 5",
+          "score": 0.0627,
+          "metric": "Exact Match on MATH Level 5",
+          "lower_is_better": false
+        }
+      ]
+    },
+    {
+      "model_family_id": "01-ai/yi-large-preview",
+      "model_route_id": "01-ai__yi-large-preview",
+      "model_family_name": "Yi Large Preview",
+      "developer": "01-ai",
+      "params_billions": null,
+      "total_evaluations": 2,
+      "benchmark_count": 2,
+      "benchmark_family_count": 2,
+      "categories_covered": [
+        "general",
+        "knowledge",
+        "reasoning"
+      ],
+      "last_updated": "2026-03-21T12:31:52.005480Z",
+      "variants": [
+        {
+          "variant_key": "default",
+          "variant_label": "Default",
+          "evaluation_count": 2,
+          "raw_model_ids": [
+            "01-ai/yi-large-preview"
+          ],
+          "last_updated": "2026-03-21T12:31:52.005480Z"
+        }
+      ],
+      "score_summary": {
+        "count": 46,
+        "min": 0.176,
+        "max": 0.946,
+        "average": 0.741413043478261
+      },
+      "reproducibility_summary": {
+        "results_total": 46,
+        "has_reproducibility_gap_count": 46,
+        "populated_ratio_avg": 0.0
+      },
+      "provenance_summary": {
+        "total_results": 46,
+        "total_groups": 46,
+        "multi_source_groups": 0,
+        "first_party_only_groups": 0,
+        "source_type_distribution": {
+          "first_party": 0,
+          "third_party": 46,
+          "collaborative": 0,
+          "unspecified": 0
+        }
+      },
+      "comparability_summary": {
+        "total_groups": 46,
+        "groups_with_variant_check": 0,
+        "groups_with_cross_party_check": 0,
+        "variant_divergent_count": 0,
+        "cross_party_divergent_count": 0
+      },
+      "benchmark_names": [
+        "GSM8K",
+        "Helm lite",
+        "LegalBench",
+        "MATH",
+        "MMLU",
+        "MedQA",
+        "NarrativeQA",
+        "NaturalQuestions (closed-book)",
+        "OpenbookQA",
+        "WMT 2014"
+      ],
+      "top_benchmark_scores": [
+        {
+          "benchmark": "OpenbookQA",
+          "benchmarkKey": "helm_lite_openbookqa",
+          "canonical_display_name": "OpenbookQA / Exact Match",
+          "evaluation_name": "OpenbookQA",
+          "score": 0.946,
+          "metric": "EM on OpenbookQA",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "MMLU",
+          "benchmarkKey": "helm_mmlu",
+          "canonical_display_name": "Mmlu / High School World History / Exact Match",
+          "evaluation_name": "High School World History",
+          "score": 0.928,
+          "metric": "EM on High School World History",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "MMLU",
+          "benchmarkKey": "helm_lite_mmlu",
+          "canonical_display_name": "MMLU / Exact Match",
+          "evaluation_name": "MMLU",
+          "score": 0.712,
+          "metric": "EM on MMLU",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "MATH",
+          "benchmarkKey": "helm_lite_math",
+          "canonical_display_name": "MATH / Equivalent (CoT)",
+          "evaluation_name": "MATH",
+          "score": 0.712,
+          "metric": "Equivalent (CoT) on MATH",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "GSM8K",
+          "benchmarkKey": "helm_lite_gsm8k",
+          "canonical_display_name": "GSM8K / Exact Match",
+          "evaluation_name": "GSM8K",
+          "score": 0.69,
+          "metric": "EM on GSM8K",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "MedQA",
+          "benchmarkKey": "helm_lite_medqa",
+          "canonical_display_name": "MedQA / Exact Match",
+          "evaluation_name": "MedQA",
+          "score": 0.66,
+          "metric": "EM on MedQA",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "LegalBench",
+          "benchmarkKey": "helm_lite_legalbench",
+          "canonical_display_name": "LegalBench / Exact Match",
+          "evaluation_name": "LegalBench",
+          "score": 0.519,
+          "metric": "EM on LegalBench",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "Helm lite",
+          "benchmarkKey": "helm_lite",
+          "canonical_display_name": "Helm lite / Win Rate",
+          "evaluation_name": "helm_lite",
+          "score": 0.471,
+          "metric": "How many models this model outperforms on average (over columns).",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "NaturalQuestions (closed-book)",
+          "benchmarkKey": "helm_lite_naturalquestions_closed_book",
+          "canonical_display_name": "NaturalQuestions (closed-book) / F1",
+          "evaluation_name": "NaturalQuestions (closed-book)",
+          "score": 0.428,
+          "metric": "F1 on NaturalQuestions (closed-book)",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "NarrativeQA",
+          "benchmarkKey": "helm_lite_narrativeqa",
+          "canonical_display_name": "NarrativeQA / F1",
+          "evaluation_name": "NarrativeQA",
+          "score": 0.373,
+          "metric": "F1 on NarrativeQA",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "WMT 2014",
+          "benchmarkKey": "helm_lite_wmt_2014",
+          "canonical_display_name": "WMT 2014 / BLEU-4",
+          "evaluation_name": "WMT 2014",
+          "score": 0.176,
+          "metric": "BLEU-4 on WMT 2014",
+          "lower_is_better": false
+        }
+      ]
+    },
+    {
+      "model_family_id": "01-ai/yi-1-5-34b",
+      "model_route_id": "01-ai__yi-1-5-34b",
+      "model_family_name": "Yi-1.5-34B",
+      "developer": "01-ai",
+      "params_billions": 34.389,
+      "total_evaluations": 1,
+      "benchmark_count": 1,
+      "benchmark_family_count": 1,
+      "categories_covered": [
+        "general",
+        "knowledge",
+        "reasoning"
+      ],
+      "last_updated": "2026-03-19T16:08:18.240187Z",
+      "variants": [
+        {
+          "variant_key": "default",
+          "variant_label": "Default",
+          "evaluation_count": 1,
+          "raw_model_ids": [
+            "01-ai/Yi-1.5-34B"
+          ],
+          "last_updated": "2026-03-19T16:08:18.240187Z"
+        }
+      ],
+      "score_summary": {
+        "count": 6,
+        "min": 0.1533,
+        "max": 0.5976,
+        "average": 0.3818333333333334
+      },
+      "reproducibility_summary": {
+        "results_total": 6,
+        "has_reproducibility_gap_count": 6,
+        "populated_ratio_avg": 0.0
+      },
+      "provenance_summary": {
+        "total_results": 6,
+        "total_groups": 6,
+        "multi_source_groups": 0,
+        "first_party_only_groups": 0,
+        "source_type_distribution": {
+          "first_party": 0,
+          "third_party": 6,
+          "collaborative": 0,
+          "unspecified": 0
+        }
+      },
+      "comparability_summary": {
+        "total_groups": 6,
+        "groups_with_variant_check": 0,
+        "groups_with_cross_party_check": 0,
+        "variant_divergent_count": 0,
+        "cross_party_divergent_count": 0
+      },
+      "benchmark_names": [
+        "BBH",
+        "GPQA",
+        "IFEval",
+        "MATH Level 5",
+        "MMLU-PRO",
+        "MUSR"
+      ],
+      "top_benchmark_scores": [
+        {
+          "benchmark": "BBH",
+          "benchmarkKey": "hfopenllm_v2_bbh",
+          "canonical_display_name": "BBH / Accuracy",
+          "evaluation_name": "BBH",
+          "score": 0.5976,
+          "metric": "Accuracy on BBH",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "MMLU-PRO",
+          "benchmarkKey": "hfopenllm_v2_mmlu_pro",
+          "canonical_display_name": "MMLU-PRO / Accuracy",
+          "evaluation_name": "MMLU-PRO",
+          "score": 0.4666,
+          "metric": "Accuracy on MMLU-PRO",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "MUSR",
+          "benchmarkKey": "hfopenllm_v2_musr",
+          "canonical_display_name": "MUSR / Accuracy",
+          "evaluation_name": "MUSR",
+          "score": 0.4236,
+          "metric": "Accuracy on MUSR",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "GPQA",
+          "benchmarkKey": "hfopenllm_v2_gpqa",
+          "canonical_display_name": "GPQA / Accuracy",
+          "evaluation_name": "GPQA",
+          "score": 0.3658,
+          "metric": "Accuracy on GPQA",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "IFEval",
+          "benchmarkKey": "hfopenllm_v2_ifeval",
+          "canonical_display_name": "IFEval / Accuracy",
+          "evaluation_name": "IFEval",
+          "score": 0.2841,
+          "metric": "Accuracy on IFEval",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "MATH Level 5",
+          "benchmarkKey": "hfopenllm_v2_math_level_5",
+          "canonical_display_name": "MATH Level 5 / Exact Match",
+          "evaluation_name": "MATH Level 5",
+          "score": 0.1533,
+          "metric": "Exact Match on MATH Level 5",
+          "lower_is_better": false
+        }
+      ]
+    },
+    {
+      "model_family_id": "01-ai/yi-1-5-34b-32k",
+      "model_route_id": "01-ai__yi-1-5-34b-32k",
+      "model_family_name": "Yi-1.5-34B-32K",
+      "developer": "01-ai",
+      "params_billions": 34.389,
+      "total_evaluations": 1,
+      "benchmark_count": 1,
+      "benchmark_family_count": 1,
+      "categories_covered": [
+        "general",
+        "knowledge",
+        "reasoning"
+      ],
+      "last_updated": "2026-03-19T16:08:18.240187Z",
+      "variants": [
+        {
+          "variant_key": "default",
+          "variant_label": "Default",
+          "evaluation_count": 1,
+          "raw_model_ids": [
+            "01-ai/Yi-1.5-34B-32K"
+          ],
+          "last_updated": "2026-03-19T16:08:18.240187Z"
+        }
+      ],
+      "score_summary": {
+        "count": 6,
+        "min": 0.1541,
+        "max": 0.6016,
+        "average": 0.3902666666666666
+      },
+      "reproducibility_summary": {
+        "results_total": 6,
+        "has_reproducibility_gap_count": 6,
+        "populated_ratio_avg": 0.0
+      },
+      "provenance_summary": {
+        "total_results": 6,
+        "total_groups": 6,
+        "multi_source_groups": 0,
+        "first_party_only_groups": 0,
+        "source_type_distribution": {
+          "first_party": 0,
+          "third_party": 6,
+          "collaborative": 0,
+          "unspecified": 0
+        }
+      },
+      "comparability_summary": {
+        "total_groups": 6,
+        "groups_with_variant_check": 0,
+        "groups_with_cross_party_check": 0,
+        "variant_divergent_count": 0,
+        "cross_party_divergent_count": 0
+      },
+      "benchmark_names": [
+        "BBH",
+        "GPQA",
+        "IFEval",
+        "MATH Level 5",
+        "MMLU-PRO",
+        "MUSR"
+      ],
+      "top_benchmark_scores": [
+        {
+          "benchmark": "BBH",
+          "benchmarkKey": "hfopenllm_v2_bbh",
+          "canonical_display_name": "BBH / Accuracy",
+          "evaluation_name": "BBH",
+          "score": 0.6016,
+          "metric": "Accuracy on BBH",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "MMLU-PRO",
+          "benchmarkKey": "hfopenllm_v2_mmlu_pro",
+          "canonical_display_name": "MMLU-PRO / Accuracy",
+          "evaluation_name": "MMLU-PRO",
+          "score": 0.4709,
+          "metric": "Accuracy on MMLU-PRO",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "MUSR",
+          "benchmarkKey": "hfopenllm_v2_musr",
+          "canonical_display_name": "MUSR / Accuracy",
+          "evaluation_name": "MUSR",
+          "score": 0.4398,
+          "metric": "Accuracy on MUSR",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "GPQA",
+          "benchmarkKey": "hfopenllm_v2_gpqa",
+          "canonical_display_name": "GPQA / Accuracy",
+          "evaluation_name": "GPQA",
+          "score": 0.3633,
+          "metric": "Accuracy on GPQA",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "IFEval",
+          "benchmarkKey": "hfopenllm_v2_ifeval",
+          "canonical_display_name": "IFEval / Accuracy",
+          "evaluation_name": "IFEval",
+          "score": 0.3119,
+          "metric": "Accuracy on IFEval",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "MATH Level 5",
+          "benchmarkKey": "hfopenllm_v2_math_level_5",
+          "canonical_display_name": "MATH Level 5 / Exact Match",
+          "evaluation_name": "MATH Level 5",
+          "score": 0.1541,
+          "metric": "Exact Match on MATH Level 5",
+          "lower_is_better": false
+        }
+      ]
+    },
+    {
+      "model_family_id": "01-ai/yi-1-5-34b-chat",
+      "model_route_id": "01-ai__yi-1-5-34b-chat",
+      "model_family_name": "Yi-1.5-34B-Chat",
+      "developer": "01-ai",
+      "params_billions": 34.389,
+      "total_evaluations": 1,
+      "benchmark_count": 1,
+      "benchmark_family_count": 1,
+      "categories_covered": [
+        "general",
+        "knowledge",
+        "reasoning"
+      ],
+      "last_updated": "2026-03-19T16:08:18.240187Z",
+      "variants": [
+        {
+          "variant_key": "default",
+          "variant_label": "Default",
+          "evaluation_count": 1,
+          "raw_model_ids": [
+            "01-ai/Yi-1.5-34B-Chat"
+          ],
+          "last_updated": "2026-03-19T16:08:18.240187Z"
+        }
+      ],
+      "score_summary": {
+        "count": 6,
+        "min": 0.2772,
+        "max": 0.6084,
+        "average": 0.4562333333333333
+      },
+      "reproducibility_summary": {
+        "results_total": 6,
+        "has_reproducibility_gap_count": 6,
+        "populated_ratio_avg": 0.0
+      },
+      "provenance_summary": {
+        "total_results": 6,
+        "total_groups": 6,
+        "multi_source_groups": 0,
+        "first_party_only_groups": 0,
+        "source_type_distribution": {
+          "first_party": 0,
+          "third_party": 6,
+          "collaborative": 0,
+          "unspecified": 0
+        }
+      },
+      "comparability_summary": {
+        "total_groups": 6,
+        "groups_with_variant_check": 0,
+        "groups_with_cross_party_check": 0,
+        "variant_divergent_count": 0,
+        "cross_party_divergent_count": 0
+      },
+      "benchmark_names": [
+        "BBH",
+        "GPQA",
+        "IFEval",
+        "MATH Level 5",
+        "MMLU-PRO",
+        "MUSR"
+      ],
+      "top_benchmark_scores": [
+        {
+          "benchmark": "BBH",
+          "benchmarkKey": "hfopenllm_v2_bbh",
+          "canonical_display_name": "BBH / Accuracy",
+          "evaluation_name": "BBH",
+          "score": 0.6084,
+          "metric": "Accuracy on BBH",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "IFEval",
+          "benchmarkKey": "hfopenllm_v2_ifeval",
+          "canonical_display_name": "IFEval / Accuracy",
+          "evaluation_name": "IFEval",
+          "score": 0.6067,
+          "metric": "Accuracy on IFEval",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "MMLU-PRO",
+          "benchmarkKey": "hfopenllm_v2_mmlu_pro",
+          "canonical_display_name": "MMLU-PRO / Accuracy",
+          "evaluation_name": "MMLU-PRO",
+          "score": 0.452,
+          "metric": "Accuracy on MMLU-PRO",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "MUSR",
+          "benchmarkKey": "hfopenllm_v2_musr",
+          "canonical_display_name": "MUSR / Accuracy",
+          "evaluation_name": "MUSR",
+          "score": 0.4282,
+          "metric": "Accuracy on MUSR",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "GPQA",
+          "benchmarkKey": "hfopenllm_v2_gpqa",
+          "canonical_display_name": "GPQA / Accuracy",
+          "evaluation_name": "GPQA",
+          "score": 0.3649,
+          "metric": "Accuracy on GPQA",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "MATH Level 5",
+          "benchmarkKey": "hfopenllm_v2_math_level_5",
+          "canonical_display_name": "MATH Level 5 / Exact Match",
+          "evaluation_name": "MATH Level 5",
+          "score": 0.2772,
+          "metric": "Exact Match on MATH Level 5",
+          "lower_is_better": false
+        }
+      ]
+    },
+    {
+      "model_family_id": "01-ai/yi-1-5-34b-chat-16k",
+      "model_route_id": "01-ai__yi-1-5-34b-chat-16k",
+      "model_family_name": "Yi-1.5-34B-Chat-16K",
+      "developer": "01-ai",
+      "params_billions": 34.389,
+      "total_evaluations": 1,
+      "benchmark_count": 1,
+      "benchmark_family_count": 1,
+      "categories_covered": [
+        "general",
+        "knowledge",
+        "reasoning"
+      ],
+      "last_updated": "2026-03-19T16:08:18.240187Z",
+      "variants": [
+        {
+          "variant_key": "default",
+          "variant_label": "Default",
+          "evaluation_count": 1,
+          "raw_model_ids": [
+            "01-ai/Yi-1.5-34B-Chat-16K"
+          ],
+          "last_updated": "2026-03-19T16:08:18.240187Z"
+        }
+      ],
+      "score_summary": {
+        "count": 6,
+        "min": 0.2137,
+        "max": 0.61,
+        "average": 0.41875
+      },
+      "reproducibility_summary": {
+        "results_total": 6,
+        "has_reproducibility_gap_count": 6,
+        "populated_ratio_avg": 0.0
+      },
+      "provenance_summary": {
+        "total_results": 6,
+        "total_groups": 6,
+        "multi_source_groups": 0,
+        "first_party_only_groups": 0,
+        "source_type_distribution": {
+          "first_party": 0,
+          "third_party": 6,
+          "collaborative": 0,
+          "unspecified": 0
+        }
+      },
+      "comparability_summary": {
+        "total_groups": 6,
+        "groups_with_variant_check": 0,
+        "groups_with_cross_party_check": 0,
+        "variant_divergent_count": 0,
+        "cross_party_divergent_count": 0
+      },
+      "benchmark_names": [
+        "BBH",
+        "GPQA",
+        "IFEval",
+        "MATH Level 5",
+        "MMLU-PRO",
+        "MUSR"
+      ],
+      "top_benchmark_scores": [
+        {
+          "benchmark": "BBH",
+          "benchmarkKey": "hfopenllm_v2_bbh",
+          "canonical_display_name": "BBH / Accuracy",
+          "evaluation_name": "BBH",
+          "score": 0.61,
+          "metric": "Accuracy on BBH",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "IFEval",
+          "benchmarkKey": "hfopenllm_v2_ifeval",
+          "canonical_display_name": "IFEval / Accuracy",
+          "evaluation_name": "IFEval",
+          "score": 0.4564,
+          "metric": "Accuracy on IFEval",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "MMLU-PRO",
+          "benchmarkKey": "hfopenllm_v2_mmlu_pro",
+          "canonical_display_name": "MMLU-PRO / Accuracy",
+          "evaluation_name": "MMLU-PRO",
+          "score": 0.4545,
+          "metric": "Accuracy on MMLU-PRO",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "MUSR",
+          "benchmarkKey": "hfopenllm_v2_musr",
+          "canonical_display_name": "MUSR / Accuracy",
+          "evaluation_name": "MUSR",
+          "score": 0.4398,
+          "metric": "Accuracy on MUSR",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "GPQA",
+          "benchmarkKey": "hfopenllm_v2_gpqa",
+          "canonical_display_name": "GPQA / Accuracy",
+          "evaluation_name": "GPQA",
+          "score": 0.3381,
+          "metric": "Accuracy on GPQA",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "MATH Level 5",
+          "benchmarkKey": "hfopenllm_v2_math_level_5",
+          "canonical_display_name": "MATH Level 5 / Exact Match",
+          "evaluation_name": "MATH Level 5",
+          "score": 0.2137,
+          "metric": "Exact Match on MATH Level 5",
+          "lower_is_better": false
+        }
+      ]
+    },
+    {
+      "model_family_id": "01-ai/yi-1-5-6b",
+      "model_route_id": "01-ai__yi-1-5-6b",
+      "model_family_name": "Yi-1.5-6B",
+      "developer": "01-ai",
+      "params_billions": 6.061,
+      "total_evaluations": 1,
+      "benchmark_count": 1,
+      "benchmark_family_count": 1,
+      "categories_covered": [
+        "general",
+        "knowledge",
+        "reasoning"
+      ],
+      "last_updated": "2026-03-19T16:08:18.240187Z",
+      "variants": [
+        {
+          "variant_key": "default",
+          "variant_label": "Default",
+          "evaluation_count": 1,
+          "raw_model_ids": [
+            "01-ai/Yi-1.5-6B"
+          ],
+          "last_updated": "2026-03-19T16:08:18.240187Z"
+        }
+      ],
+      "score_summary": {
+        "count": 6,
+        "min": 0.0665,
+        "max": 0.4493,
+        "average": 0.3071833333333333
+      },
+      "reproducibility_summary": {
+        "results_total": 6,
+        "has_reproducibility_gap_count": 6,
+        "populated_ratio_avg": 0.0
+      },
+      "provenance_summary": {
+        "total_results": 6,
+        "total_groups": 6,
+        "multi_source_groups": 0,
+        "first_party_only_groups": 0,
+        "source_type_distribution": {
+          "first_party": 0,
+          "third_party": 6,
+          "collaborative": 0,
+          "unspecified": 0
+        }
+      },
+      "comparability_summary": {
+        "total_groups": 6,
+        "groups_with_variant_check": 0,
+        "groups_with_cross_party_check": 0,
+        "variant_divergent_count": 0,
+        "cross_party_divergent_count": 0
+      },
+      "benchmark_names": [
+        "BBH",
+        "GPQA",
+        "IFEval",
+        "MATH Level 5",
+        "MMLU-PRO",
+        "MUSR"
+      ],
+      "top_benchmark_scores": [
+        {
+          "benchmark": "BBH",
+          "benchmarkKey": "hfopenllm_v2_bbh",
+          "canonical_display_name": "BBH / Accuracy",
+          "evaluation_name": "BBH",
+          "score": 0.4493,
+          "metric": "Accuracy on BBH",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "MUSR",
+          "benchmarkKey": "hfopenllm_v2_musr",
+          "canonical_display_name": "MUSR / Accuracy",
+          "evaluation_name": "MUSR",
+          "score": 0.4374,
+          "metric": "Accuracy on MUSR",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "MMLU-PRO",
+          "benchmarkKey": "hfopenllm_v2_mmlu_pro",
+          "canonical_display_name": "MMLU-PRO / Accuracy",
+          "evaluation_name": "MMLU-PRO",
+          "score": 0.3144,
+          "metric": "Accuracy on MMLU-PRO",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "GPQA",
+          "benchmarkKey": "hfopenllm_v2_gpqa",
+          "canonical_display_name": "GPQA / Accuracy",
+          "evaluation_name": "GPQA",
+          "score": 0.3138,
+          "metric": "Accuracy on GPQA",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "IFEval",
+          "benchmarkKey": "hfopenllm_v2_ifeval",
+          "canonical_display_name": "IFEval / Accuracy",
+          "evaluation_name": "IFEval",
+          "score": 0.2617,
+          "metric": "Accuracy on IFEval",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "MATH Level 5",
+          "benchmarkKey": "hfopenllm_v2_math_level_5",
+          "canonical_display_name": "MATH Level 5 / Exact Match",
+          "evaluation_name": "MATH Level 5",
+          "score": 0.0665,
+          "metric": "Exact Match on MATH Level 5",
+          "lower_is_better": false
+        }
+      ]
+    },
+    {
+      "model_family_id": "01-ai/yi-1-5-6b-chat",
+      "model_route_id": "01-ai__yi-1-5-6b-chat",
+      "model_family_name": "Yi-1.5-6B-Chat",
+      "developer": "01-ai",
+      "params_billions": 6.061,
+      "total_evaluations": 1,
+      "benchmark_count": 1,
+      "benchmark_family_count": 1,
+      "categories_covered": [
+        "general",
+        "knowledge",
+        "reasoning"
+      ],
+      "last_updated": "2026-03-19T16:08:18.240187Z",
+      "variants": [
+        {
+          "variant_key": "default",
+          "variant_label": "Default",
+          "evaluation_count": 1,
+          "raw_model_ids": [
+            "01-ai/Yi-1.5-6B-Chat"
+          ],
+          "last_updated": "2026-03-19T16:08:18.240187Z"
+        }
+      ],
+      "score_summary": {
+        "count": 6,
+        "min": 0.1624,
+        "max": 0.5145,
+        "average": 0.36575
+      },
+      "reproducibility_summary": {
+        "results_total": 6,
+        "has_reproducibility_gap_count": 6,
+        "populated_ratio_avg": 0.0
+      },
+      "provenance_summary": {
+        "total_results": 6,
+        "total_groups": 6,
+        "multi_source_groups": 0,
+        "first_party_only_groups": 0,
+        "source_type_distribution": {
+          "first_party": 0,
+          "third_party": 6,
+          "collaborative": 0,
+          "unspecified": 0
+        }
+      },
+      "comparability_summary": {
+        "total_groups": 6,
+        "groups_with_variant_check": 0,
+        "groups_with_cross_party_check": 0,
+        "variant_divergent_count": 0,
+        "cross_party_divergent_count": 0
+      },
+      "benchmark_names": [
+        "BBH",
+        "GPQA",
+        "IFEval",
+        "MATH Level 5",
+        "MMLU-PRO",
+        "MUSR"
+      ],
+      "top_benchmark_scores": [
+        {
+          "benchmark": "IFEval",
+          "benchmarkKey": "hfopenllm_v2_ifeval",
+          "canonical_display_name": "IFEval / Accuracy",
+          "evaluation_name": "IFEval",
+          "score": 0.5145,
+          "metric": "Accuracy on IFEval",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "BBH",
+          "benchmarkKey": "hfopenllm_v2_bbh",
+          "canonical_display_name": "BBH / Accuracy",
+          "evaluation_name": "BBH",
+          "score": 0.4571,
+          "metric": "Accuracy on BBH",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "MUSR",
+          "benchmarkKey": "hfopenllm_v2_musr",
+          "canonical_display_name": "MUSR / Accuracy",
+          "evaluation_name": "MUSR",
+          "score": 0.4392,
+          "metric": "Accuracy on MUSR",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "MMLU-PRO",
+          "benchmarkKey": "hfopenllm_v2_mmlu_pro",
+          "canonical_display_name": "MMLU-PRO / Accuracy",
+          "evaluation_name": "MMLU-PRO",
+          "score": 0.3193,
+          "metric": "Accuracy on MMLU-PRO",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "GPQA",
+          "benchmarkKey": "hfopenllm_v2_gpqa",
+          "canonical_display_name": "GPQA / Accuracy",
+          "evaluation_name": "GPQA",
+          "score": 0.302,
+          "metric": "Accuracy on GPQA",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "MATH Level 5",
+          "benchmarkKey": "hfopenllm_v2_math_level_5",
+          "canonical_display_name": "MATH Level 5 / Exact Match",
+          "evaluation_name": "MATH Level 5",
+          "score": 0.1624,
+          "metric": "Exact Match on MATH Level 5",
+          "lower_is_better": false
+        }
+      ]
+    },
+    {
+      "model_family_id": "01-ai/yi-1-5-9b",
+      "model_route_id": "01-ai__yi-1-5-9b",
+      "model_family_name": "Yi-1.5-9B",
+      "developer": "01-ai",
+      "params_billions": 8.829,
+      "total_evaluations": 1,
+      "benchmark_count": 1,
+      "benchmark_family_count": 1,
+      "categories_covered": [
+        "general",
+        "knowledge",
+        "reasoning"
+      ],
+      "last_updated": "2026-03-19T16:08:18.240187Z",
+      "variants": [
+        {
+          "variant_key": "default",
+          "variant_label": "Default",
+          "evaluation_count": 1,
+          "raw_model_ids": [
+            "01-ai/Yi-1.5-9B"
+          ],
+          "last_updated": "2026-03-19T16:08:18.240187Z"
+        }
+      ],
+      "score_summary": {
+        "count": 6,
+        "min": 0.114,
+        "max": 0.5143,
+        "average": 0.35425
+      },
+      "reproducibility_summary": {
+        "results_total": 6,
+        "has_reproducibility_gap_count": 6,
+        "populated_ratio_avg": 0.0
+      },
+      "provenance_summary": {
+        "total_results": 6,
+        "total_groups": 6,
+        "multi_source_groups": 0,
+        "first_party_only_groups": 0,
+        "source_type_distribution": {
+          "first_party": 0,
+          "third_party": 6,
+          "collaborative": 0,
+          "unspecified": 0
+        }
+      },
+      "comparability_summary": {
+        "total_groups": 6,
+        "groups_with_variant_check": 0,
+        "groups_with_cross_party_check": 0,
+        "variant_divergent_count": 0,
+        "cross_party_divergent_count": 0
+      },
+      "benchmark_names": [
+        "BBH",
+        "GPQA",
+        "IFEval",
+        "MATH Level 5",
+        "MMLU-PRO",
+        "MUSR"
+      ],
+      "top_benchmark_scores": [
+        {
+          "benchmark": "BBH",
+          "benchmarkKey": "hfopenllm_v2_bbh",
+          "canonical_display_name": "BBH / Accuracy",
+          "evaluation_name": "BBH",
+          "score": 0.5143,
+          "metric": "Accuracy on BBH",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "MUSR",
+          "benchmarkKey": "hfopenllm_v2_musr",
+          "canonical_display_name": "MUSR / Accuracy",
+          "evaluation_name": "MUSR",
+          "score": 0.4328,
+          "metric": "Accuracy on MUSR",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "MMLU-PRO",
+          "benchmarkKey": "hfopenllm_v2_mmlu_pro",
+          "canonical_display_name": "MMLU-PRO / Accuracy",
+          "evaluation_name": "MMLU-PRO",
+          "score": 0.3916,
+          "metric": "Accuracy on MMLU-PRO",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "GPQA",
+          "benchmarkKey": "hfopenllm_v2_gpqa",
+          "canonical_display_name": "GPQA / Accuracy",
+          "evaluation_name": "GPQA",
+          "score": 0.3792,
+          "metric": "Accuracy on GPQA",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "IFEval",
+          "benchmarkKey": "hfopenllm_v2_ifeval",
+          "canonical_display_name": "IFEval / Accuracy",
+          "evaluation_name": "IFEval",
+          "score": 0.2936,
+          "metric": "Accuracy on IFEval",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "MATH Level 5",
+          "benchmarkKey": "hfopenllm_v2_math_level_5",
+          "canonical_display_name": "MATH Level 5 / Exact Match",
+          "evaluation_name": "MATH Level 5",
+          "score": 0.114,
+          "metric": "Exact Match on MATH Level 5",
+          "lower_is_better": false
+        }
+      ]
+    },
+    {
+      "model_family_id": "01-ai/yi-1-5-9b-32k",
+      "model_route_id": "01-ai__yi-1-5-9b-32k",
+      "model_family_name": "Yi-1.5-9B-32K",
+      "developer": "01-ai",
+      "params_billions": 8.829,
+      "total_evaluations": 1,
+      "benchmark_count": 1,
+      "benchmark_family_count": 1,
+      "categories_covered": [
+        "general",
+        "knowledge",
+        "reasoning"
+      ],
+      "last_updated": "2026-03-19T16:08:18.240187Z",
+      "variants": [
+        {
+          "variant_key": "default",
+          "variant_label": "Default",
+          "evaluation_count": 1,
+          "raw_model_ids": [
+            "01-ai/Yi-1.5-9B-32K"
+          ],
+          "last_updated": "2026-03-19T16:08:18.240187Z"
+        }
+      ],
+      "score_summary": {
+        "count": 6,
+        "min": 0.108,
+        "max": 0.4963,
+        "average": 0.3314666666666667
+      },
+      "reproducibility_summary": {
+        "results_total": 6,
+        "has_reproducibility_gap_count": 6,
+        "populated_ratio_avg": 0.0
+      },
+      "provenance_summary": {
+        "total_results": 6,
+        "total_groups": 6,
+        "multi_source_groups": 0,
+        "first_party_only_groups": 0,
+        "source_type_distribution": {
+          "first_party": 0,
+          "third_party": 6,
+          "collaborative": 0,
+          "unspecified": 0
+        }
+      },
+      "comparability_summary": {
+        "total_groups": 6,
+        "groups_with_variant_check": 0,
+        "groups_with_cross_party_check": 0,
+        "variant_divergent_count": 0,
+        "cross_party_divergent_count": 0
+      },
+      "benchmark_names": [
+        "BBH",
+        "GPQA",
+        "IFEval",
+        "MATH Level 5",
+        "MMLU-PRO",
+        "MUSR"
+      ],
+      "top_benchmark_scores": [
+        {
+          "benchmark": "BBH",
+          "benchmarkKey": "hfopenllm_v2_bbh",
+          "canonical_display_name": "BBH / Accuracy",
+          "evaluation_name": "BBH",
+          "score": 0.4963,
+          "metric": "Accuracy on BBH",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "MUSR",
+          "benchmarkKey": "hfopenllm_v2_musr",
+          "canonical_display_name": "MUSR / Accuracy",
+          "evaluation_name": "MUSR",
+          "score": 0.4186,
+          "metric": "Accuracy on MUSR",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "MMLU-PRO",
+          "benchmarkKey": "hfopenllm_v2_mmlu_pro",
+          "canonical_display_name": "MMLU-PRO / Accuracy",
+          "evaluation_name": "MMLU-PRO",
+          "score": 0.3765,
+          "metric": "Accuracy on MMLU-PRO",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "GPQA",
+          "benchmarkKey": "hfopenllm_v2_gpqa",
+          "canonical_display_name": "GPQA / Accuracy",
+          "evaluation_name": "GPQA",
+          "score": 0.3591,
+          "metric": "Accuracy on GPQA",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "IFEval",
+          "benchmarkKey": "hfopenllm_v2_ifeval",
+          "canonical_display_name": "IFEval / Accuracy",
+          "evaluation_name": "IFEval",
+          "score": 0.2303,
+          "metric": "Accuracy on IFEval",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "MATH Level 5",
+          "benchmarkKey": "hfopenllm_v2_math_level_5",
+          "canonical_display_name": "MATH Level 5 / Exact Match",
+          "evaluation_name": "MATH Level 5",
+          "score": 0.108,
+          "metric": "Exact Match on MATH Level 5",
+          "lower_is_better": false
+        }
+      ]
+    },
+    {
+      "model_family_id": "01-ai/yi-1-5-9b-chat",
+      "model_route_id": "01-ai__yi-1-5-9b-chat",
+      "model_family_name": "Yi-1.5-9B-Chat",
+      "developer": "01-ai",
+      "params_billions": 8.829,
+      "total_evaluations": 1,
+      "benchmark_count": 1,
+      "benchmark_family_count": 1,
+      "categories_covered": [
+        "general",
+        "knowledge",
+        "reasoning"
+      ],
+      "last_updated": "2026-03-19T16:08:18.240187Z",
+      "variants": [
+        {
+          "variant_key": "default",
+          "variant_label": "Default",
+          "evaluation_count": 1,
+          "raw_model_ids": [
+            "01-ai/Yi-1.5-9B-Chat"
+          ],
+          "last_updated": "2026-03-19T16:08:18.240187Z"
+        }
+      ],
+      "score_summary": {
+        "count": 6,
+        "min": 0.2258,
+        "max": 0.6046,
+        "average": 0.42406666666666665
+      },
+      "reproducibility_summary": {
+        "results_total": 6,
+        "has_reproducibility_gap_count": 6,
+        "populated_ratio_avg": 0.0
+      },
+      "provenance_summary": {
+        "total_results": 6,
+        "total_groups": 6,
+        "multi_source_groups": 0,
+        "first_party_only_groups": 0,
+        "source_type_distribution": {
+          "first_party": 0,
+          "third_party": 6,
+          "collaborative": 0,
+          "unspecified": 0
+        }
+      },
+      "comparability_summary": {
+        "total_groups": 6,
+        "groups_with_variant_check": 0,
+        "groups_with_cross_party_check": 0,
+        "variant_divergent_count": 0,
+        "cross_party_divergent_count": 0
+      },
+      "benchmark_names": [
+        "BBH",
+        "GPQA",
+        "IFEval",
+        "MATH Level 5",
+        "MMLU-PRO",
+        "MUSR"
+      ],
+      "top_benchmark_scores": [
+        {
+          "benchmark": "IFEval",
+          "benchmarkKey": "hfopenllm_v2_ifeval",
+          "canonical_display_name": "IFEval / Accuracy",
+          "evaluation_name": "IFEval",
+          "score": 0.6046,
+          "metric": "Accuracy on IFEval",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "BBH",
+          "benchmarkKey": "hfopenllm_v2_bbh",
+          "canonical_display_name": "BBH / Accuracy",
+          "evaluation_name": "BBH",
+          "score": 0.5559,
+          "metric": "Accuracy on BBH",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "MUSR",
+          "benchmarkKey": "hfopenllm_v2_musr",
+          "canonical_display_name": "MUSR / Accuracy",
+          "evaluation_name": "MUSR",
+          "score": 0.4259,
+          "metric": "Accuracy on MUSR",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "MMLU-PRO",
+          "benchmarkKey": "hfopenllm_v2_mmlu_pro",
+          "canonical_display_name": "MMLU-PRO / Accuracy",
+          "evaluation_name": "MMLU-PRO",
+          "score": 0.3975,
+          "metric": "Accuracy on MMLU-PRO",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "GPQA",
+          "benchmarkKey": "hfopenllm_v2_gpqa",
+          "canonical_display_name": "GPQA / Accuracy",
+          "evaluation_name": "GPQA",
+          "score": 0.3347,
+          "metric": "Accuracy on GPQA",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "MATH Level 5",
+          "benchmarkKey": "hfopenllm_v2_math_level_5",
+          "canonical_display_name": "MATH Level 5 / Exact Match",
+          "evaluation_name": "MATH Level 5",
+          "score": 0.2258,
+          "metric": "Exact Match on MATH Level 5",
+          "lower_is_better": false
+        }
+      ]
+    },
+    {
+      "model_family_id": "01-ai/yi-1-5-9b-chat-16k",
+      "model_route_id": "01-ai__yi-1-5-9b-chat-16k",
+      "model_family_name": "Yi-1.5-9B-Chat-16K",
+      "developer": "01-ai",
+      "params_billions": 8.829,
+      "total_evaluations": 1,
+      "benchmark_count": 1,
+      "benchmark_family_count": 1,
+      "categories_covered": [
+        "general",
+        "knowledge",
+        "reasoning"
+      ],
+      "last_updated": "2026-03-19T16:08:18.240187Z",
+      "variants": [
+        {
+          "variant_key": "default",
+          "variant_label": "Default",
+          "evaluation_count": 1,
+          "raw_model_ids": [
+            "01-ai/Yi-1.5-9B-Chat-16K"
+          ],
+          "last_updated": "2026-03-19T16:08:18.240187Z"
+        }
+      ],
+      "score_summary": {
+        "count": 6,
+        "min": 0.1782,
+        "max": 0.5153,
+        "average": 0.37215
+      },
+      "reproducibility_summary": {
+        "results_total": 6,
+        "has_reproducibility_gap_count": 6,
+        "populated_ratio_avg": 0.0
+      },
+      "provenance_summary": {
+        "total_results": 6,
+        "total_groups": 6,
+        "multi_source_groups": 0,
+        "first_party_only_groups": 0,
+        "source_type_distribution": {
+          "first_party": 0,
+          "third_party": 6,
+          "collaborative": 0,
+          "unspecified": 0
+        }
+      },
+      "comparability_summary": {
+        "total_groups": 6,
+        "groups_with_variant_check": 0,
+        "groups_with_cross_party_check": 0,
+        "variant_divergent_count": 0,
+        "cross_party_divergent_count": 0
+      },
+      "benchmark_names": [
+        "BBH",
+        "GPQA",
+        "IFEval",
+        "MATH Level 5",
+        "MMLU-PRO",
+        "MUSR"
+      ],
+      "top_benchmark_scores": [
+        {
+          "benchmark": "BBH",
+          "benchmarkKey": "hfopenllm_v2_bbh",
+          "canonical_display_name": "BBH / Accuracy",
+          "evaluation_name": "BBH",
+          "score": 0.5153,
+          "metric": "Accuracy on BBH",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "IFEval",
+          "benchmarkKey": "hfopenllm_v2_ifeval",
+          "canonical_display_name": "IFEval / Accuracy",
+          "evaluation_name": "IFEval",
+          "score": 0.4214,
+          "metric": "Accuracy on IFEval",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "MUSR",
+          "benchmarkKey": "hfopenllm_v2_musr",
+          "canonical_display_name": "MUSR / Accuracy",
+          "evaluation_name": "MUSR",
+          "score": 0.4099,
+          "metric": "Accuracy on MUSR",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "MMLU-PRO",
+          "benchmarkKey": "hfopenllm_v2_mmlu_pro",
+          "canonical_display_name": "MMLU-PRO / Accuracy",
+          "evaluation_name": "MMLU-PRO",
+          "score": 0.3994,
+          "metric": "Accuracy on MMLU-PRO",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "GPQA",
+          "benchmarkKey": "hfopenllm_v2_gpqa",
+          "canonical_display_name": "GPQA / Accuracy",
+          "evaluation_name": "GPQA",
+          "score": 0.3087,
+          "metric": "Accuracy on GPQA",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "MATH Level 5",
+          "benchmarkKey": "hfopenllm_v2_math_level_5",
+          "canonical_display_name": "MATH Level 5 / Exact Match",
+          "evaluation_name": "MATH Level 5",
+          "score": 0.1782,
+          "metric": "Exact Match on MATH Level 5",
+          "lower_is_better": false
+        }
+      ]
+    },
+    {
+      "model_family_id": "01-ai/yi-34b-200k",
+      "model_route_id": "01-ai__yi-34b-200k",
+      "model_family_name": "Yi-34B-200K",
+      "developer": "01-ai",
+      "params_billions": 34.389,
+      "total_evaluations": 1,
+      "benchmark_count": 1,
+      "benchmark_family_count": 1,
+      "categories_covered": [
+        "general",
+        "knowledge",
+        "reasoning"
+      ],
+      "last_updated": "2026-03-19T16:08:18.240187Z",
+      "variants": [
+        {
+          "variant_key": "default",
+          "variant_label": "Default",
+          "evaluation_count": 1,
+          "raw_model_ids": [
+            "01-ai/Yi-34B-200K"
+          ],
+          "last_updated": "2026-03-19T16:08:18.240187Z"
+        }
+      ],
+      "score_summary": {
+        "count": 6,
+        "min": 0.0574,
+        "max": 0.5442,
+        "average": 0.32458333333333333
+      },
+      "reproducibility_summary": {
+        "results_total": 6,
+        "has_reproducibility_gap_count": 6,
+        "populated_ratio_avg": 0.0
+      },
+      "provenance_summary": {
+        "total_results": 6,
+        "total_groups": 6,
+        "multi_source_groups": 0,
+        "first_party_only_groups": 0,
+        "source_type_distribution": {
+          "first_party": 0,
+          "third_party": 6,
+          "collaborative": 0,
+          "unspecified": 0
+        }
+      },
+      "comparability_summary": {
+        "total_groups": 6,
+        "groups_with_variant_check": 0,
+        "groups_with_cross_party_check": 0,
+        "variant_divergent_count": 0,
+        "cross_party_divergent_count": 0
+      },
+      "benchmark_names": [
+        "BBH",
+        "GPQA",
+        "IFEval",
+        "MATH Level 5",
+        "MMLU-PRO",
+        "MUSR"
+      ],
+      "top_benchmark_scores": [
+        {
+          "benchmark": "BBH",
+          "benchmarkKey": "hfopenllm_v2_bbh",
+          "canonical_display_name": "BBH / Accuracy",
+          "evaluation_name": "BBH",
+          "score": 0.5442,
+          "metric": "Accuracy on BBH",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "MMLU-PRO",
+          "benchmarkKey": "hfopenllm_v2_mmlu_pro",
+          "canonical_display_name": "MMLU-PRO / Accuracy",
+          "evaluation_name": "MMLU-PRO",
+          "score": 0.4535,
+          "metric": "Accuracy on MMLU-PRO",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "MUSR",
+          "benchmarkKey": "hfopenllm_v2_musr",
+          "canonical_display_name": "MUSR / Accuracy",
+          "evaluation_name": "MUSR",
+          "score": 0.3817,
+          "metric": "Accuracy on MUSR",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "GPQA",
+          "benchmarkKey": "hfopenllm_v2_gpqa",
+          "canonical_display_name": "GPQA / Accuracy",
+          "evaluation_name": "GPQA",
+          "score": 0.3565,
+          "metric": "Accuracy on GPQA",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "IFEval",
+          "benchmarkKey": "hfopenllm_v2_ifeval",
+          "canonical_display_name": "IFEval / Accuracy",
+          "evaluation_name": "IFEval",
+          "score": 0.1542,
+          "metric": "Accuracy on IFEval",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "MATH Level 5",
+          "benchmarkKey": "hfopenllm_v2_math_level_5",
+          "canonical_display_name": "MATH Level 5 / Exact Match",
+          "evaluation_name": "MATH Level 5",
+          "score": 0.0574,
+          "metric": "Exact Match on MATH Level 5",
+          "lower_is_better": false
+        }
+      ]
+    },
+    {
+      "model_family_id": "01-ai/yi-6b-200k",
+      "model_route_id": "01-ai__yi-6b-200k",
+      "model_family_name": "Yi-6B-200K",
+      "developer": "01-ai",
+      "params_billions": 6.061,
+      "total_evaluations": 1,
+      "benchmark_count": 1,
+      "benchmark_family_count": 1,
+      "categories_covered": [
+        "general",
+        "knowledge",
+        "reasoning"
+      ],
+      "last_updated": "2026-03-19T16:08:18.240187Z",
+      "variants": [
+        {
+          "variant_key": "default",
+          "variant_label": "Default",
+          "evaluation_count": 1,
+          "raw_model_ids": [
+            "01-ai/Yi-6B-200K"
+          ],
+          "last_updated": "2026-03-19T16:08:18.240187Z"
+        }
+      ],
+      "score_summary": {
+        "count": 6,
+        "min": 0.0181,
+        "max": 0.4587,
+        "average": 0.25938333333333335
+      },
+      "reproducibility_summary": {
+        "results_total": 6,
+        "has_reproducibility_gap_count": 6,
+        "populated_ratio_avg": 0.0
+      },
+      "provenance_summary": {
+        "total_results": 6,
+        "total_groups": 6,
+        "multi_source_groups": 0,
+        "first_party_only_groups": 0,
+        "source_type_distribution": {
+          "first_party": 0,
+          "third_party": 6,
+          "collaborative": 0,
+          "unspecified": 0
+        }
+      },
+      "comparability_summary": {
+        "total_groups": 6,
+        "groups_with_variant_check": 0,
+        "groups_with_cross_party_check": 0,
+        "variant_divergent_count": 0,
+        "cross_party_divergent_count": 0
+      },
+      "benchmark_names": [
+        "BBH",
+        "GPQA",
+        "IFEval",
+        "MATH Level 5",
+        "MMLU-PRO",
+        "MUSR"
+      ],
+      "top_benchmark_scores": [
+        {
+          "benchmark": "MUSR",
+          "benchmarkKey": "hfopenllm_v2_musr",
+          "canonical_display_name": "MUSR / Accuracy",
+          "evaluation_name": "MUSR",
+          "score": 0.4587,
+          "metric": "Accuracy on MUSR",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "BBH",
+          "benchmarkKey": "hfopenllm_v2_bbh",
+          "canonical_display_name": "BBH / Accuracy",
+          "evaluation_name": "BBH",
+          "score": 0.4289,
+          "metric": "Accuracy on BBH",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "MMLU-PRO",
+          "benchmarkKey": "hfopenllm_v2_mmlu_pro",
+          "canonical_display_name": "MMLU-PRO / Accuracy",
+          "evaluation_name": "MMLU-PRO",
+          "score": 0.2844,
+          "metric": "Accuracy on MMLU-PRO",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "GPQA",
+          "benchmarkKey": "hfopenllm_v2_gpqa",
+          "canonical_display_name": "GPQA / Accuracy",
+          "evaluation_name": "GPQA",
+          "score": 0.2819,
+          "metric": "Accuracy on GPQA",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "IFEval",
+          "benchmarkKey": "hfopenllm_v2_ifeval",
+          "canonical_display_name": "IFEval / Accuracy",
+          "evaluation_name": "IFEval",
+          "score": 0.0843,
+          "metric": "Accuracy on IFEval",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "MATH Level 5",
+          "benchmarkKey": "hfopenllm_v2_math_level_5",
+          "canonical_display_name": "MATH Level 5 / Exact Match",
+          "evaluation_name": "MATH Level 5",
+          "score": 0.0181,
+          "metric": "Exact Match on MATH Level 5",
+          "lower_is_better": false
+        }
+      ]
+    },
+    {
+      "model_family_id": "01-ai/yi-6b-chat",
+      "model_route_id": "01-ai__yi-6b-chat",
+      "model_family_name": "Yi-6B-Chat",
+      "developer": "01-ai",
+      "params_billions": 6.061,
+      "total_evaluations": 1,
+      "benchmark_count": 1,
+      "benchmark_family_count": 1,
+      "categories_covered": [
+        "general",
+        "knowledge",
+        "reasoning"
+      ],
+      "last_updated": "2026-03-19T16:08:18.240187Z",
+      "variants": [
+        {
+          "variant_key": "default",
+          "variant_label": "Default",
+          "evaluation_count": 1,
+          "raw_model_ids": [
+            "01-ai/Yi-6B-Chat"
+          ],
+          "last_updated": "2026-03-19T16:08:18.240187Z"
+        }
+      ],
+      "score_summary": {
+        "count": 6,
+        "min": 0.0136,
+        "max": 0.4133,
+        "average": 0.2893
+      },
+      "reproducibility_summary": {
+        "results_total": 6,
+        "has_reproducibility_gap_count": 6,
+        "populated_ratio_avg": 0.0
+      },
+      "provenance_summary": {
+        "total_results": 6,
+        "total_groups": 6,
+        "multi_source_groups": 0,
+        "first_party_only_groups": 0,
+        "source_type_distribution": {
+          "first_party": 0,
+          "third_party": 6,
+          "collaborative": 0,
+          "unspecified": 0
+        }
+      },
+      "comparability_summary": {
+        "total_groups": 6,
+        "groups_with_variant_check": 0,
+        "groups_with_cross_party_check": 0,
+        "variant_divergent_count": 0,
+        "cross_party_divergent_count": 0
+      },
+      "benchmark_names": [
+        "BBH",
+        "GPQA",
+        "IFEval",
+        "MATH Level 5",
+        "MMLU-PRO",
+        "MUSR"
+      ],
+      "top_benchmark_scores": [
+        {
+          "benchmark": "BBH",
+          "benchmarkKey": "hfopenllm_v2_bbh",
+          "canonical_display_name": "BBH / Accuracy",
+          "evaluation_name": "BBH",
+          "score": 0.4133,
+          "metric": "Accuracy on BBH",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "MUSR",
+          "benchmarkKey": "hfopenllm_v2_musr",
+          "canonical_display_name": "MUSR / Accuracy",
+          "evaluation_name": "MUSR",
+          "score": 0.3688,
+          "metric": "Accuracy on MUSR",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "IFEval",
+          "benchmarkKey": "hfopenllm_v2_ifeval",
+          "canonical_display_name": "IFEval / Accuracy",
+          "evaluation_name": "IFEval",
+          "score": 0.3395,
+          "metric": "Accuracy on IFEval",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "MMLU-PRO",
+          "benchmarkKey": "hfopenllm_v2_mmlu_pro",
+          "canonical_display_name": "MMLU-PRO / Accuracy",
+          "evaluation_name": "MMLU-PRO",
+          "score": 0.3061,
+          "metric": "Accuracy on MMLU-PRO",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "GPQA",
+          "benchmarkKey": "hfopenllm_v2_gpqa",
+          "canonical_display_name": "GPQA / Accuracy",
+          "evaluation_name": "GPQA",
+          "score": 0.2945,
+          "metric": "Accuracy on GPQA",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "MATH Level 5",
+          "benchmarkKey": "hfopenllm_v2_math_level_5",
+          "canonical_display_name": "MATH Level 5 / Exact Match",
+          "evaluation_name": "MATH Level 5",
+          "score": 0.0136,
+          "metric": "Exact Match on MATH Level 5",
+          "lower_is_better": false
+        }
+      ]
+    },
+    {
+      "model_family_id": "01-ai/yi-9b",
+      "model_route_id": "01-ai__yi-9b",
+      "model_family_name": "Yi-9B",
+      "developer": "01-ai",
+      "params_billions": 8.829,
+      "total_evaluations": 1,
+      "benchmark_count": 1,
+      "benchmark_family_count": 1,
+      "categories_covered": [
+        "general",
+        "knowledge",
+        "reasoning"
+      ],
+      "last_updated": "2026-03-19T16:08:18.240187Z",
+      "variants": [
+        {
+          "variant_key": "default",
+          "variant_label": "Default",
+          "evaluation_count": 1,
+          "raw_model_ids": [
+            "01-ai/Yi-9B"
+          ],
+          "last_updated": "2026-03-19T16:08:18.240187Z"
+        }
+      ],
+      "score_summary": {
+        "count": 6,
+        "min": 0.0559,
+        "max": 0.494,
+        "average": 0.3169333333333333
+      },
+      "reproducibility_summary": {
+        "results_total": 6,
+        "has_reproducibility_gap_count": 6,
+        "populated_ratio_avg": 0.0
+      },
+      "provenance_summary": {
+        "total_results": 6,
+        "total_groups": 6,
+        "multi_source_groups": 0,
+        "first_party_only_groups": 0,
+        "source_type_distribution": {
+          "first_party": 0,
+          "third_party": 6,
+          "collaborative": 0,
+          "unspecified": 0
+        }
+      },
+      "comparability_summary": {
+        "total_groups": 6,
+        "groups_with_variant_check": 0,
+        "groups_with_cross_party_check": 0,
+        "variant_divergent_count": 0,
+        "cross_party_divergent_count": 0
+      },
+      "benchmark_names": [
+        "BBH",
+        "GPQA",
+        "IFEval",
+        "MATH Level 5",
+        "MMLU-PRO",
+        "MUSR"
+      ],
+      "top_benchmark_scores": [
+        {
+          "benchmark": "BBH",
+          "benchmarkKey": "hfopenllm_v2_bbh",
+          "canonical_display_name": "BBH / Accuracy",
+          "evaluation_name": "BBH",
+          "score": 0.494,
+          "metric": "Accuracy on BBH",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "MUSR",
+          "benchmarkKey": "hfopenllm_v2_musr",
+          "canonical_display_name": "MUSR / Accuracy",
+          "evaluation_name": "MUSR",
+          "score": 0.4054,
+          "metric": "Accuracy on MUSR",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "MMLU-PRO",
+          "benchmarkKey": "hfopenllm_v2_mmlu_pro",
+          "canonical_display_name": "MMLU-PRO / Accuracy",
+          "evaluation_name": "MMLU-PRO",
+          "score": 0.3574,
+          "metric": "Accuracy on MMLU-PRO",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "GPQA",
+          "benchmarkKey": "hfopenllm_v2_gpqa",
+          "canonical_display_name": "GPQA / Accuracy",
+          "evaluation_name": "GPQA",
+          "score": 0.318,
+          "metric": "Accuracy on GPQA",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "IFEval",
+          "benchmarkKey": "hfopenllm_v2_ifeval",
+          "canonical_display_name": "IFEval / Accuracy",
+          "evaluation_name": "IFEval",
+          "score": 0.2709,
+          "metric": "Accuracy on IFEval",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "MATH Level 5",
+          "benchmarkKey": "hfopenllm_v2_math_level_5",
+          "canonical_display_name": "MATH Level 5 / Exact Match",
+          "evaluation_name": "MATH Level 5",
+          "score": 0.0559,
+          "metric": "Exact Match on MATH Level 5",
+          "lower_is_better": false
+        }
+      ]
+    },
+    {
+      "model_family_id": "01-ai/yi-9b-200k",
+      "model_route_id": "01-ai__yi-9b-200k",
+      "model_family_name": "Yi-9B-200K",
+      "developer": "01-ai",
+      "params_billions": 8.829,
+      "total_evaluations": 1,
+      "benchmark_count": 1,
+      "benchmark_family_count": 1,
+      "categories_covered": [
+        "general",
+        "knowledge",
+        "reasoning"
+      ],
+      "last_updated": "2026-03-19T16:08:18.240187Z",
+      "variants": [
+        {
+          "variant_key": "default",
+          "variant_label": "Default",
+          "evaluation_count": 1,
+          "raw_model_ids": [
+            "01-ai/Yi-9B-200K"
+          ],
+          "last_updated": "2026-03-19T16:08:18.240187Z"
+        }
+      ],
+      "score_summary": {
+        "count": 6,
+        "min": 0.0665,
+        "max": 0.4793,
+        "average": 0.31425000000000003
+      },
+      "reproducibility_summary": {
+        "results_total": 6,
+        "has_reproducibility_gap_count": 6,
+        "populated_ratio_avg": 0.0
+      },
+      "provenance_summary": {
+        "total_results": 6,
+        "total_groups": 6,
+        "multi_source_groups": 0,
+        "first_party_only_groups": 0,
+        "source_type_distribution": {
+          "first_party": 0,
+          "third_party": 6,
+          "collaborative": 0,
+          "unspecified": 0
+        }
+      },
+      "comparability_summary": {
+        "total_groups": 6,
+        "groups_with_variant_check": 0,
+        "groups_with_cross_party_check": 0,
+        "variant_divergent_count": 0,
+        "cross_party_divergent_count": 0
+      },
+      "benchmark_names": [
+        "BBH",
+        "GPQA",
+        "IFEval",
+        "MATH Level 5",
+        "MMLU-PRO",
+        "MUSR"
+      ],
+      "top_benchmark_scores": [
+        {
+          "benchmark": "BBH",
+          "benchmarkKey": "hfopenllm_v2_bbh",
+          "canonical_display_name": "BBH / Accuracy",
+          "evaluation_name": "BBH",
+          "score": 0.4793,
+          "metric": "Accuracy on BBH",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "MUSR",
+          "benchmarkKey": "hfopenllm_v2_musr",
+          "canonical_display_name": "MUSR / Accuracy",
+          "evaluation_name": "MUSR",
+          "score": 0.4294,
+          "metric": "Accuracy on MUSR",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "MMLU-PRO",
+          "benchmarkKey": "hfopenllm_v2_mmlu_pro",
+          "canonical_display_name": "MMLU-PRO / Accuracy",
+          "evaluation_name": "MMLU-PRO",
+          "score": 0.3622,
+          "metric": "Accuracy on MMLU-PRO",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "GPQA",
+          "benchmarkKey": "hfopenllm_v2_gpqa",
+          "canonical_display_name": "GPQA / Accuracy",
+          "evaluation_name": "GPQA",
+          "score": 0.3154,
+          "metric": "Accuracy on GPQA",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "IFEval",
+          "benchmarkKey": "hfopenllm_v2_ifeval",
+          "canonical_display_name": "IFEval / Accuracy",
+          "evaluation_name": "IFEval",
+          "score": 0.2327,
+          "metric": "Accuracy on IFEval",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "MATH Level 5",
+          "benchmarkKey": "hfopenllm_v2_math_level_5",
+          "canonical_display_name": "MATH Level 5 / Exact Match",
+          "evaluation_name": "MATH Level 5",
+          "score": 0.0665,
+          "metric": "Exact Match on MATH Level 5",
+          "lower_is_better": false
+        }
+      ]
+    },
+    {
+      "model_family_id": "01-ai/yi-coder-9b-chat",
+      "model_route_id": "01-ai__yi-coder-9b-chat",
+      "model_family_name": "Yi-Coder-9B-Chat",
+      "developer": "01-ai",
+      "params_billions": 8.829,
+      "total_evaluations": 1,
+      "benchmark_count": 1,
+      "benchmark_family_count": 1,
+      "categories_covered": [
+        "general",
+        "knowledge",
+        "reasoning"
+      ],
+      "last_updated": "2026-03-19T16:08:18.240187Z",
+      "variants": [
+        {
+          "variant_key": "default",
+          "variant_label": "Default",
+          "evaluation_count": 1,
+          "raw_model_ids": [
+            "01-ai/Yi-Coder-9B-Chat"
+          ],
+          "last_updated": "2026-03-19T16:08:18.240187Z"
+        }
+      ],
+      "score_summary": {
+        "count": 6,
+        "min": 0.04,
+        "max": 0.4817,
+        "average": 0.31538333333333335
+      },
+      "reproducibility_summary": {
+        "results_total": 6,
+        "has_reproducibility_gap_count": 6,
+        "populated_ratio_avg": 0.0
+      },
+      "provenance_summary": {
+        "total_results": 6,
+        "total_groups": 6,
+        "multi_source_groups": 0,
+        "first_party_only_groups": 0,
+        "source_type_distribution": {
+          "first_party": 0,
+          "third_party": 6,
+          "collaborative": 0,
+          "unspecified": 0
+        }
+      },
+      "comparability_summary": {
+        "total_groups": 6,
+        "groups_with_variant_check": 0,
+        "groups_with_cross_party_check": 0,
+        "variant_divergent_count": 0,
+        "cross_party_divergent_count": 0
+      },
+      "benchmark_names": [
+        "BBH",
+        "GPQA",
+        "IFEval",
+        "MATH Level 5",
+        "MMLU-PRO",
+        "MUSR"
+      ],
+      "top_benchmark_scores": [
+        {
+          "benchmark": "IFEval",
+          "benchmarkKey": "hfopenllm_v2_ifeval",
+          "canonical_display_name": "IFEval / Accuracy",
+          "evaluation_name": "IFEval",
+          "score": 0.4817,
+          "metric": "Accuracy on IFEval",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "BBH",
+          "benchmarkKey": "hfopenllm_v2_bbh",
+          "canonical_display_name": "BBH / Accuracy",
+          "evaluation_name": "BBH",
+          "score": 0.4814,
+          "metric": "Accuracy on BBH",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "MUSR",
+          "benchmarkKey": "hfopenllm_v2_musr",
+          "canonical_display_name": "MUSR / Accuracy",
+          "evaluation_name": "MUSR",
+          "score": 0.3992,
+          "metric": "Accuracy on MUSR",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "GPQA",
+          "benchmarkKey": "hfopenllm_v2_gpqa",
+          "canonical_display_name": "GPQA / Accuracy",
+          "evaluation_name": "GPQA",
+          "score": 0.2475,
+          "metric": "Accuracy on GPQA",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "MMLU-PRO",
+          "benchmarkKey": "hfopenllm_v2_mmlu_pro",
+          "canonical_display_name": "MMLU-PRO / Accuracy",
+          "evaluation_name": "MMLU-PRO",
+          "score": 0.2425,
+          "metric": "Accuracy on MMLU-PRO",
+          "lower_is_better": false
+        },
+        {
+          "benchmark": "MATH Level 5",
+          "benchmarkKey": "hfopenllm_v2_math_level_5",
+          "canonical_display_name": "MATH Level 5 / Exact Match",
+          "evaluation_name": "MATH Level 5",
+          "score": 0.04,
+          "metric": "Exact Match on MATH Level 5",
+          "lower_is_better": false
+        }
+      ]
+    }
+  ]
+}

tests/fixtures/developers/anthropic.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tests/fixtures/developers/openai.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tests/fixtures/evals/apex_v1.json ADDED Viewed

	@@ -0,0 +1,1929 @@

+{
+  "eval_summary_id": "apex_v1",
+  "benchmark": "APEX v1",
+  "benchmark_family_key": "apex_v1",
+  "benchmark_family_name": "APEX v1",
+  "benchmark_parent_key": "apex_v1",
+  "benchmark_parent_name": "APEX v1",
+  "benchmark_leaf_key": "apex_v1",
+  "benchmark_leaf_name": "APEX v1",
+  "benchmark_component_key": "medicine_md",
+  "benchmark_component_name": "Medicine (MD)",
+  "evaluation_name": "APEX v1",
+  "display_name": "APEX v1",
+  "canonical_display_name": "APEX v1",
+  "is_summary_score": false,
+  "category": "knowledge",
+  "source_data": {
+    "dataset_name": "apex-v1",
+    "source_type": "hf_dataset",
+    "hf_repo": "Mercor/APEX-v1"
+  },
+  "benchmark_card": {
+    "benchmark_details": {
+      "name": "APEX-v1",
+      "overview": "APEX-Agents (AI Productivity Index for Agents) measures the ability of AI agents to execute long-horizon, cross-application tasks created by investment banking analysts, management consultants, and corporate lawyers. The benchmark contains 480 tasks and requires agents to navigate realistic work environments with files and tools.",
+      "data_type": "text",
+      "domains": [
+        "investment banking",
+        "management consulting",
+        "corporate law",
+        "finance",
+        "legal",
+        "consulting"
+      ],
+      "languages": [
+        "English"
+      ],
+      "similar_benchmarks": [
+        "Not specified"
+      ],
+      "resources": [
+        "https://arxiv.org/abs/2601.14242",
+        "https://huggingface.co/datasets/Mercor/APEX-v1"
+      ],
+      "benchmark_type": "single"
+    },
+    "purpose_and_intended_users": {
+      "goal": "To assess whether AI agents can reliably execute highly complex professional services work, bridging the gap between existing agentic evaluations and real-world professional workflows.",
+      "audience": [
+        "AI researchers",
+        "Developers working on agentic systems"
+      ],
+      "tasks": [
+        "Text generation",
+        "Question answering",
+        "Reasoning",
+        "Demonstrating advanced knowledge",
+        "Using multiple applications",
+        "Planning over long horizons within realistic project scenarios"
+      ],
+      "limitations": "Differences in benchmark scores below 1 percentage point should be interpreted cautiously due to a small error rate in the automated grading system (1.9% false negative rate and 1.3% false positive rate).",
+      "out_of_scope_uses": [
+        "Not specified"
+      ]
+    },
+    "data": {
+      "source": "The benchmark data was created by industry professionals including investment banking analysts, management consultants, and corporate lawyers. These professionals were organized into teams, assigned specific roles, and tasked with delivering complete projects over 5-10 day periods, producing high-quality customer-ready deliverables from scratch.",
+      "size": "480 tasks",
+      "format": "The specific structure of individual data instances is not described",
+      "annotation": "Tasks were created by professionals using files from within each project environment. A baselining study was conducted where independent experts executed 20% of tasks (96 tasks) to verify task feasibility, rubric fairness, and time estimates."
+    },
+    "methodology": {
+      "methods": [
+        "Models are evaluated using agent execution in realistic environments",
+        "Eight trajectories are collected for each agent-task pair, with each trajectory scored as pass or fail"
+      ],
+      "metrics": [
+        "Pass@1 (task-uniform mean of per-task pass rates)",
+        "Pass@8 (passing at least once in eight attempts)",
+        "Pass^8 (passing consistently on all eight attempts)"
+      ],
+      "calculation": "The overall Pass@1 score is computed as the task-uniform mean of per-task pass rates across all 480 tasks. Confidence intervals are calculated using task-level bootstrapping with 10,000 resamples",
+      "interpretation": "Higher Pass@1 scores indicate better performance",
+      "baseline_results": "Gemini 3 Flash (Thinking=High): 24.0%, GPT-5.2 (Thinking=High): 23.0%, Claude Opus 4.5 (Thinking=High): [score not specified], Gemini 3 Pro (Thinking=High): [score not specified], GPT-OSS-120B (High): 15.2%, Grok 4: 0%",
+      "validation": "Automated evaluation used a judge model with 98.5% accuracy against human-labeled ground truth. A baselining study with experts validated task feasibility and rubric fairness"
+    },
+    "ethical_and_legal_considerations": {
+      "privacy_and_anonymity": "Not specified",
+      "data_licensing": "Creative Commons Attribution 4.0",
+      "consent_procedures": "Not specified",
+      "compliance_with_regulations": "Not specified"
+    },
+    "possible_risks": [
+      {
+        "category": "Over- or under-reliance",
+        "description": [
+          "In AI-assisted decision-making tasks, reliance measures how much a person trusts (and potentially acts on) a model's output. Over-reliance occurs when a person puts too much trust in a model, accepting a model's output when the model's output is likely incorrect. Under-reliance is the opposite, where the person doesn't trust the model but should."
+        ],
+        "url": "https://www.ibm.com/docs/en/watsonx/saas?topic=SSYOK8/wsj/ai-risk-atlas/over-or-under-reliance.html"
+      },
+      {
+        "category": "Unrepresentative data",
+        "description": [
+          "Unrepresentative data occurs when the training or fine-tuning data is not sufficiently representative of the underlying population or does not measure the phenomenon of interest. Synthetic data might not fully capture the complexity and nuances of real-world data. Causes include possible limitations in the seed data quality, biases in generation methods, or inadequate domain knowledge. Thus, AI models might struggle to generalize effectively to real-world scenarios."
+        ],
+        "url": "https://www.ibm.com/docs/en/watsonx/saas?topic=SSYOK8/wsj/ai-risk-atlas/unrepresentative-data.html"
+      },
+      {
+        "category": "Incomplete AI agent evaluation",
+        "description": [
+          "Evaluating the performance or accuracy or an agent is difficult because of system complexity and open-endedness."
+        ],
+        "url": "https://www.ibm.com/docs/en/watsonx/saas?topic=SSYOK8/wsj/ai-risk-atlas/incomplete-ai-agent-evaluation-agentic.html"
+      },
+      {
+        "category": "Reproducibility",
+        "description": [
+          "Replicating agent behavior or output can be impacted by changes or updates made to external services and tools. This impact is increased if the agent is built with generative AI."
+        ],
+        "url": "https://www.ibm.com/docs/en/watsonx/saas?topic=SSYOK8/wsj/ai-risk-atlas/reproducibility-agentic.html"
+      },
+      {
+        "category": "Improper usage",
+        "description": [
+          "Improper usage occurs when a model is used for a purpose that it was not originally designed for."
+        ],
+        "url": "https://www.ibm.com/docs/en/watsonx/saas?topic=SSYOK8/wsj/ai-risk-atlas/improper-usage.html"
+      }
+    ],
+    "flagged_fields": {},
+    "missing_fields": [
+      "benchmark_details.similar_benchmarks",
+      "purpose_and_intended_users.out_of_scope_uses",
+      "ethical_and_legal_considerations.privacy_and_anonymity",
+      "ethical_and_legal_considerations.consent_procedures",
+      "ethical_and_legal_considerations.compliance_with_regulations"
+    ],
+    "card_info": {
+      "created_at": "2026-04-14T14:28:12.501639",
+      "llm": "deepseek-ai/DeepSeek-V3.1"
+    }
+  },
+  "tags": {
+    "domains": [
+      "investment banking",
+      "management consulting",
+      "corporate law",
+      "finance",
+      "legal",
+      "consulting"
+    ],
+    "languages": [
+      "English"
+    ],
+    "tasks": [
+      "Text generation",
+      "Question answering",
+      "Reasoning",
+      "Demonstrating advanced knowledge",
+      "Using multiple applications",
+      "Planning over long horizons within realistic project scenarios"
+    ]
+  },
+  "subtasks": [
+    {
+      "subtask_key": "big_law",
+      "subtask_name": "Big Law",
+      "display_name": "Big Law",
+      "metrics": [
+        {
+          "metric_summary_id": "apex_v1_big_law_score",
+          "legacy_eval_summary_id": "apex_v1_big_law",
+          "evaluation_name": "Big Law",
+          "display_name": "APEX v1 / Big Law / Score",
+          "canonical_display_name": "APEX v1 / Big Law / Score",
+          "benchmark_leaf_key": "apex_v1",
+          "benchmark_leaf_name": "APEX v1",
+          "slice_key": "big_law",
+          "slice_name": "Big Law",
+          "lower_is_better": false,
+          "metric_name": "Score",
+          "metric_id": "apex_v1.score",
+          "metric_key": "score",
+          "metric_source": "metric_config",
+          "metric_config": {
+            "evaluation_description": "Big law associate score.",
+            "lower_is_better": false,
+            "score_type": "continuous",
+            "min_score": 0,
+            "max_score": 1,
+            "additional_details": {
+              "raw_evaluation_name": "Big Law Score"
+            },
+            "metric_id": "apex_v1.score",
+            "metric_name": "Score",
+            "metric_kind": "score",
+            "metric_unit": "proportion"
+          },
+          "model_results": [
+            {
+              "model_id": "openai/gpt-5",
+              "model_route_id": "openai__gpt-5",
+              "model_name": "GPT 5",
+              "developer": "openai",
+              "variant_key": "default",
+              "raw_model_id": "openai/GPT 5",
+              "score": 0.78,
+              "evaluation_id": "apex-v1/openai_gpt-5/1773260200",
+              "retrieved_timestamp": "1773260200",
+              "source_metadata": {
+                "source_name": "Mercor APEX-v1 Leaderboard",
+                "source_type": "evaluation_run",
+                "source_organization_name": "Mercor",
+                "source_organization_url": "https://www.mercor.com",
+                "evaluator_relationship": "first_party"
+              },
+              "source_data": {
+                "dataset_name": "apex-v1",
+                "source_type": "hf_dataset",
+                "hf_repo": "Mercor/APEX-v1"
+              },
+              "source_record_url": "https://huggingface.co/datasets/evaleval/card_backend/resolve/main/records/openai__gpt-5/apex_v1_openai_gpt_5_1773260200.json",
+              "detailed_evaluation_results": null,
+              "detailed_evaluation_results_meta": null,
+              "passthrough_top_level_fields": null,
+              "instance_level_data": null,
+              "normalized_result": {
+                "benchmark_family_key": "apex_v1",
+                "benchmark_family_name": "APEX v1",
+                "benchmark_parent_key": "apex_v1",
+                "benchmark_parent_name": "APEX v1",
+                "benchmark_component_key": "big_law",
+                "benchmark_component_name": "Big Law",
+                "benchmark_leaf_key": "apex_v1",
+                "benchmark_leaf_name": "APEX v1",
+                "slice_key": "big_law",
+                "slice_name": "Big Law",
+                "metric_name": "Score",
+                "metric_id": "apex_v1.score",
+                "metric_key": "score",
+                "metric_source": "metric_config",
+                "display_name": "Big Law / Score",
+                "canonical_display_name": "APEX v1 / Big Law / Score",
+                "raw_evaluation_name": "Big Law",
+                "is_summary_score": false
+              },
+              "evalcards": {
+                "annotations": {
+                  "reproducibility_gap": {
+                    "has_reproducibility_gap": true,
+                    "missing_fields": [
+                      "temperature",
+                      "max_tokens"
+                    ],
+                    "required_field_count": 2,
+                    "populated_field_count": 0,
+                    "signal_version": "1.0"
+                  },
+                  "provenance": {
+                    "source_type": "first_party",
+                    "is_multi_source": false,
+                    "first_party_only": true,
+                    "distinct_reporting_organizations": 1,
+                    "signal_version": "1.0"
+                  },
+                  "variant_divergence": null,
+                  "cross_party_divergence": null
+                }
+              }
+            },
+            {
+              "model_id": "openai/gpt-5-1",
+              "model_route_id": "openai__gpt-5-1",
+              "model_name": "GPT 5.1",
+              "developer": "openai",
+              "variant_key": "default",
+              "raw_model_id": "openai/GPT 5.1",
+              "score": 0.77,
+              "evaluation_id": "apex-v1/openai_gpt-5.1/1773260200",
+              "retrieved_timestamp": "1773260200",
+              "source_metadata": {
+                "source_name": "Mercor APEX-v1 Leaderboard",
+                "source_type": "evaluation_run",
+                "source_organization_name": "Mercor",
+                "source_organization_url": "https://www.mercor.com",
+                "evaluator_relationship": "first_party"
+              },
+              "source_data": {
+                "dataset_name": "apex-v1",
+                "source_type": "hf_dataset",
+                "hf_repo": "Mercor/APEX-v1"
+              },
+              "source_record_url": "https://huggingface.co/datasets/evaleval/card_backend/resolve/main/records/openai__gpt-5-1/apex_v1_openai_gpt_5_1_1773260200.json",
+              "detailed_evaluation_results": null,
+              "detailed_evaluation_results_meta": null,
+              "passthrough_top_level_fields": null,
+              "instance_level_data": null,
+              "normalized_result": {
+                "benchmark_family_key": "apex_v1",
+                "benchmark_family_name": "APEX v1",
+                "benchmark_parent_key": "apex_v1",
+                "benchmark_parent_name": "APEX v1",
+                "benchmark_component_key": "big_law",
+                "benchmark_component_name": "Big Law",
+                "benchmark_leaf_key": "apex_v1",
+                "benchmark_leaf_name": "APEX v1",
+                "slice_key": "big_law",
+                "slice_name": "Big Law",
+                "metric_name": "Score",
+                "metric_id": "apex_v1.score",
+                "metric_key": "score",
+                "metric_source": "metric_config",
+                "display_name": "Big Law / Score",
+                "canonical_display_name": "APEX v1 / Big Law / Score",
+                "raw_evaluation_name": "Big Law",
+                "is_summary_score": false
+              },
+              "evalcards": {
+                "annotations": {
+                  "reproducibility_gap": {
+                    "has_reproducibility_gap": true,
+                    "missing_fields": [
+                      "temperature",
+                      "max_tokens"
+                    ],
+                    "required_field_count": 2,
+                    "populated_field_count": 0,
+                    "signal_version": "1.0"
+                  },
+                  "provenance": {
+                    "source_type": "first_party",
+                    "is_multi_source": false,
+                    "first_party_only": true,
+                    "distinct_reporting_organizations": 1,
+                    "signal_version": "1.0"
+                  },
+                  "variant_divergence": null,
+                  "cross_party_divergence": null
+                }
+              }
+            },
+            {
+              "model_id": "openai/o3",
+              "model_route_id": "openai__o3",
+              "model_name": "o3",
+              "developer": "openai",
+              "variant_key": "default",
+              "raw_model_id": "openai/o3",
+              "score": 0.76,
+              "evaluation_id": "apex-v1/openai_o3/1773260200",
+              "retrieved_timestamp": "1773260200",
+              "source_metadata": {
+                "source_name": "Mercor APEX-v1 Leaderboard",
+                "source_type": "evaluation_run",
+                "source_organization_name": "Mercor",
+                "source_organization_url": "https://www.mercor.com",
+                "evaluator_relationship": "first_party"
+              },
+              "source_data": {
+                "dataset_name": "apex-v1",
+                "source_type": "hf_dataset",
+                "hf_repo": "Mercor/APEX-v1"
+              },
+              "source_record_url": "https://huggingface.co/datasets/evaleval/card_backend/resolve/main/records/openai__o3/apex_v1_openai_o3_1773260200.json",
+              "detailed_evaluation_results": null,
+              "detailed_evaluation_results_meta": null,
+              "passthrough_top_level_fields": null,
+              "instance_level_data": null,
+              "normalized_result": {
+                "benchmark_family_key": "apex_v1",
+                "benchmark_family_name": "APEX v1",
+                "benchmark_parent_key": "apex_v1",
+                "benchmark_parent_name": "APEX v1",
+                "benchmark_component_key": "big_law",
+                "benchmark_component_name": "Big Law",
+                "benchmark_leaf_key": "apex_v1",
+                "benchmark_leaf_name": "APEX v1",
+                "slice_key": "big_law",
+                "slice_name": "Big Law",
+                "metric_name": "Score",
+                "metric_id": "apex_v1.score",
+                "metric_key": "score",
+                "metric_source": "metric_config",
+                "display_name": "Big Law / Score",
+                "canonical_display_name": "APEX v1 / Big Law / Score",
+                "raw_evaluation_name": "Big Law",
+                "is_summary_score": false
+              },
+              "evalcards": {
+                "annotations": {
+                  "reproducibility_gap": {
+                    "has_reproducibility_gap": true,
+                    "missing_fields": [
+                      "temperature",
+                      "max_tokens"
+                    ],
+                    "required_field_count": 2,
+                    "populated_field_count": 0,
+                    "signal_version": "1.0"
+                  },
+                  "provenance": {
+                    "source_type": "first_party",
+                    "is_multi_source": false,
+                    "first_party_only": true,
+                    "distinct_reporting_organizations": 1,
+                    "signal_version": "1.0"
+                  },
+                  "variant_divergence": null,
+                  "cross_party_divergence": null
+                }
+              }
+            }
+          ],
+          "models_count": 3,
+          "top_score": 0.78
+        }
+      ],
+      "metrics_count": 1,
+      "metric_names": [
+        "Score"
+      ]
+    },
+    {
+      "subtask_key": "consulting",
+      "subtask_name": "Consulting",
+      "display_name": "Consulting",
+      "metrics": [
+        {
+          "metric_summary_id": "apex_v1_consulting_score",
+          "legacy_eval_summary_id": "apex_v1_consulting",
+          "evaluation_name": "Consulting",
+          "display_name": "APEX v1 / Consulting / Score",
+          "canonical_display_name": "APEX v1 / Consulting / Score",
+          "benchmark_leaf_key": "apex_v1",
+          "benchmark_leaf_name": "APEX v1",
+          "slice_key": "consulting",
+          "slice_name": "Consulting",
+          "lower_is_better": false,
+          "metric_name": "Score",
+          "metric_id": "apex_v1.score",
+          "metric_key": "score",
+          "metric_source": "metric_config",
+          "metric_config": {
+            "evaluation_description": "Management consulting score.",
+            "lower_is_better": false,
+            "score_type": "continuous",
+            "min_score": 0,
+            "max_score": 1,
+            "additional_details": {
+              "raw_evaluation_name": "Consulting Score"
+            },
+            "metric_id": "apex_v1.score",
+            "metric_name": "Score",
+            "metric_kind": "score",
+            "metric_unit": "proportion"
+          },
+          "model_results": [
+            {
+              "model_id": "openai/gpt-5-2-pro",
+              "model_route_id": "openai__gpt-5-2-pro",
+              "model_name": "GPT 5.2 Pro",
+              "developer": "openai",
+              "variant_key": "default",
+              "raw_model_id": "openai/GPT 5.2 Pro",
+              "score": 0.64,
+              "evaluation_id": "apex-v1/openai_gpt-5.2-pro/1773260200",
+              "retrieved_timestamp": "1773260200",
+              "source_metadata": {
+                "source_name": "Mercor APEX-v1 Leaderboard",
+                "source_type": "evaluation_run",
+                "source_organization_name": "Mercor",
+                "source_organization_url": "https://www.mercor.com",
+                "evaluator_relationship": "first_party"
+              },
+              "source_data": {
+                "dataset_name": "apex-v1",
+                "source_type": "hf_dataset",
+                "hf_repo": "Mercor/APEX-v1"
+              },
+              "source_record_url": "https://huggingface.co/datasets/evaleval/card_backend/resolve/main/records/openai__gpt-5-2-pro/apex_v1_openai_gpt_5_2_pro_1773260200.json",
+              "detailed_evaluation_results": null,
+              "detailed_evaluation_results_meta": null,
+              "passthrough_top_level_fields": null,
+              "instance_level_data": null,
+              "normalized_result": {
+                "benchmark_family_key": "apex_v1",
+                "benchmark_family_name": "APEX v1",
+                "benchmark_parent_key": "apex_v1",
+                "benchmark_parent_name": "APEX v1",
+                "benchmark_component_key": "consulting",
+                "benchmark_component_name": "Consulting",
+                "benchmark_leaf_key": "apex_v1",
+                "benchmark_leaf_name": "APEX v1",
+                "slice_key": "consulting",
+                "slice_name": "Consulting",
+                "metric_name": "Score",
+                "metric_id": "apex_v1.score",
+                "metric_key": "score",
+                "metric_source": "metric_config",
+                "display_name": "Consulting / Score",
+                "canonical_display_name": "APEX v1 / Consulting / Score",
+                "raw_evaluation_name": "Consulting",
+                "is_summary_score": false
+              },
+              "evalcards": {
+                "annotations": {
+                  "reproducibility_gap": {
+                    "has_reproducibility_gap": true,
+                    "missing_fields": [
+                      "temperature",
+                      "max_tokens"
+                    ],
+                    "required_field_count": 2,
+                    "populated_field_count": 0,
+                    "signal_version": "1.0"
+                  },
+                  "provenance": {
+                    "source_type": "first_party",
+                    "is_multi_source": false,
+                    "first_party_only": true,
+                    "distinct_reporting_organizations": 1,
+                    "signal_version": "1.0"
+                  },
+                  "variant_divergence": null,
+                  "cross_party_divergence": null
+                }
+              }
+            },
+            {
+              "model_id": "google/gemini-3-pro",
+              "model_route_id": "google__gemini-3-pro",
+              "model_name": "Gemini 3 Pro",
+              "developer": "google",
+              "variant_key": "default",
+              "raw_model_id": "google/Gemini 3 Pro",
+              "score": 0.64,
+              "evaluation_id": "apex-v1/google_gemini-3-pro/1773260200",
+              "retrieved_timestamp": "1773260200",
+              "source_metadata": {
+                "source_name": "Mercor APEX-v1 Leaderboard",
+                "source_type": "evaluation_run",
+                "source_organization_name": "Mercor",
+                "source_organization_url": "https://www.mercor.com",
+                "evaluator_relationship": "first_party"
+              },
+              "source_data": {
+                "dataset_name": "apex-v1",
+                "source_type": "hf_dataset",
+                "hf_repo": "Mercor/APEX-v1"
+              },
+              "source_record_url": "https://huggingface.co/datasets/evaleval/card_backend/resolve/main/records/google__gemini-3-pro/apex_v1_google_gemini_3_pro_1773260200.json",
+              "detailed_evaluation_results": null,
+              "detailed_evaluation_results_meta": null,
+              "passthrough_top_level_fields": null,
+              "instance_level_data": null,
+              "normalized_result": {
+                "benchmark_family_key": "apex_v1",
+                "benchmark_family_name": "APEX v1",
+                "benchmark_parent_key": "apex_v1",
+                "benchmark_parent_name": "APEX v1",
+                "benchmark_component_key": "consulting",
+                "benchmark_component_name": "Consulting",
+                "benchmark_leaf_key": "apex_v1",
+                "benchmark_leaf_name": "APEX v1",
+                "slice_key": "consulting",
+                "slice_name": "Consulting",
+                "metric_name": "Score",
+                "metric_id": "apex_v1.score",
+                "metric_key": "score",
+                "metric_source": "metric_config",
+                "display_name": "Consulting / Score",
+                "canonical_display_name": "APEX v1 / Consulting / Score",
+                "raw_evaluation_name": "Consulting",
+                "is_summary_score": false
+              },
+              "evalcards": {
+                "annotations": {
+                  "reproducibility_gap": {
+                    "has_reproducibility_gap": true,
+                    "missing_fields": [
+                      "temperature",
+                      "max_tokens"
+                    ],
+                    "required_field_count": 2,
+                    "populated_field_count": 0,
+                    "signal_version": "1.0"
+                  },
+                  "provenance": {
+                    "source_type": "first_party",
+                    "is_multi_source": false,
+                    "first_party_only": true,
+                    "distinct_reporting_organizations": 1,
+                    "signal_version": "1.0"
+                  },
+                  "variant_divergence": null,
+                  "cross_party_divergence": null
+                }
+              }
+            },
+            {
+              "model_id": "google/gemini-3-flash",
+              "model_route_id": "google__gemini-3-flash",
+              "model_name": "Gemini 3 Flash",
+              "developer": "google",
+              "variant_key": "default",
+              "raw_model_id": "google/Gemini 3 Flash",
+              "score": 0.64,
+              "evaluation_id": "apex-v1/google_gemini-3-flash/1773260200",
+              "retrieved_timestamp": "1773260200",
+              "source_metadata": {
+                "source_name": "Mercor APEX-v1 Leaderboard",
+                "source_type": "evaluation_run",
+                "source_organization_name": "Mercor",
+                "source_organization_url": "https://www.mercor.com",
+                "evaluator_relationship": "first_party"
+              },
+              "source_data": {
+                "dataset_name": "apex-v1",
+                "source_type": "hf_dataset",
+                "hf_repo": "Mercor/APEX-v1"
+              },
+              "source_record_url": "https://huggingface.co/datasets/evaleval/card_backend/resolve/main/records/google__gemini-3-flash/apex_v1_google_gemini_3_flash_1773260200.json",
+              "detailed_evaluation_results": null,
+              "detailed_evaluation_results_meta": null,
+              "passthrough_top_level_fields": null,
+              "instance_level_data": null,
+              "normalized_result": {
+                "benchmark_family_key": "apex_v1",
+                "benchmark_family_name": "APEX v1",
+                "benchmark_parent_key": "apex_v1",
+                "benchmark_parent_name": "APEX v1",
+                "benchmark_component_key": "consulting",
+                "benchmark_component_name": "Consulting",
+                "benchmark_leaf_key": "apex_v1",
+                "benchmark_leaf_name": "APEX v1",
+                "slice_key": "consulting",
+                "slice_name": "Consulting",
+                "metric_name": "Score",
+                "metric_id": "apex_v1.score",
+                "metric_key": "score",
+                "metric_source": "metric_config",
+                "display_name": "Consulting / Score",
+                "canonical_display_name": "APEX v1 / Consulting / Score",
+                "raw_evaluation_name": "Consulting",
+                "is_summary_score": false
+              },
+              "evalcards": {
+                "annotations": {
+                  "reproducibility_gap": {
+                    "has_reproducibility_gap": true,
+                    "missing_fields": [
+                      "temperature",
+                      "max_tokens"
+                    ],
+                    "required_field_count": 2,
+                    "populated_field_count": 0,
+                    "signal_version": "1.0"
+                  },
+                  "provenance": {
+                    "source_type": "first_party",
+                    "is_multi_source": false,
+                    "first_party_only": true,
+                    "distinct_reporting_organizations": 1,
+                    "signal_version": "1.0"
+                  },
+                  "variant_divergence": null,
+                  "cross_party_divergence": null
+                }
+              }
+            }
+          ],
+          "models_count": 3,
+          "top_score": 0.64
+        }
+      ],
+      "metrics_count": 1,
+      "metric_names": [
+        "Score"
+      ]
+    },
+    {
+      "subtask_key": "investment_banking",
+      "subtask_name": "Investment Banking",
+      "display_name": "Investment Banking",
+      "metrics": [
+        {
+          "metric_summary_id": "apex_v1_investment_banking_score",
+          "legacy_eval_summary_id": "apex_v1_investment_banking",
+          "evaluation_name": "Investment Banking",
+          "display_name": "APEX v1 / Investment Banking / Score",
+          "canonical_display_name": "APEX v1 / Investment Banking / Score",
+          "benchmark_leaf_key": "apex_v1",
+          "benchmark_leaf_name": "APEX v1",
+          "slice_key": "investment_banking",
+          "slice_name": "Investment Banking",
+          "lower_is_better": false,
+          "metric_name": "Score",
+          "metric_id": "apex_v1.score",
+          "metric_key": "score",
+          "metric_source": "metric_config",
+          "metric_config": {
+            "evaluation_description": "Investment banking associate score.",
+            "lower_is_better": false,
+            "score_type": "continuous",
+            "min_score": 0,
+            "max_score": 1,
+            "additional_details": {
+              "raw_evaluation_name": "Investment Banking Score"
+            },
+            "metric_id": "apex_v1.score",
+            "metric_name": "Score",
+            "metric_kind": "score",
+            "metric_unit": "proportion"
+          },
+          "model_results": [
+            {
+              "model_id": "openai/gpt-5-2-pro",
+              "model_route_id": "openai__gpt-5-2-pro",
+              "model_name": "GPT 5.2 Pro",
+              "developer": "openai",
+              "variant_key": "default",
+              "raw_model_id": "openai/GPT 5.2 Pro",
+              "score": 0.64,
+              "evaluation_id": "apex-v1/openai_gpt-5.2-pro/1773260200",
+              "retrieved_timestamp": "1773260200",
+              "source_metadata": {
+                "source_name": "Mercor APEX-v1 Leaderboard",
+                "source_type": "evaluation_run",
+                "source_organization_name": "Mercor",
+                "source_organization_url": "https://www.mercor.com",
+                "evaluator_relationship": "first_party"
+              },
+              "source_data": {
+                "dataset_name": "apex-v1",
+                "source_type": "hf_dataset",
+                "hf_repo": "Mercor/APEX-v1"
+              },
+              "source_record_url": "https://huggingface.co/datasets/evaleval/card_backend/resolve/main/records/openai__gpt-5-2-pro/apex_v1_openai_gpt_5_2_pro_1773260200.json",
+              "detailed_evaluation_results": null,
+              "detailed_evaluation_results_meta": null,
+              "passthrough_top_level_fields": null,
+              "instance_level_data": null,
+              "normalized_result": {
+                "benchmark_family_key": "apex_v1",
+                "benchmark_family_name": "APEX v1",
+                "benchmark_parent_key": "apex_v1",
+                "benchmark_parent_name": "APEX v1",
+                "benchmark_component_key": "investment_banking",
+                "benchmark_component_name": "Investment Banking",
+                "benchmark_leaf_key": "apex_v1",
+                "benchmark_leaf_name": "APEX v1",
+                "slice_key": "investment_banking",
+                "slice_name": "Investment Banking",
+                "metric_name": "Score",
+                "metric_id": "apex_v1.score",
+                "metric_key": "score",
+                "metric_source": "metric_config",
+                "display_name": "Investment Banking / Score",
+                "canonical_display_name": "APEX v1 / Investment Banking / Score",
+                "raw_evaluation_name": "Investment Banking",
+                "is_summary_score": false
+              },
+              "evalcards": {
+                "annotations": {
+                  "reproducibility_gap": {
+                    "has_reproducibility_gap": true,
+                    "missing_fields": [
+                      "temperature",
+                      "max_tokens"
+                    ],
+                    "required_field_count": 2,
+                    "populated_field_count": 0,
+                    "signal_version": "1.0"
+                  },
+                  "provenance": {
+                    "source_type": "first_party",
+                    "is_multi_source": false,
+                    "first_party_only": true,
+                    "distinct_reporting_organizations": 1,
+                    "signal_version": "1.0"
+                  },
+                  "variant_divergence": null,
+                  "cross_party_divergence": null
+                }
+              }
+            },
+            {
+              "model_id": "google/gemini-3-pro",
+              "model_route_id": "google__gemini-3-pro",
+              "model_name": "Gemini 3 Pro",
+              "developer": "google",
+              "variant_key": "default",
+              "raw_model_id": "google/Gemini 3 Pro",
+              "score": 0.63,
+              "evaluation_id": "apex-v1/google_gemini-3-pro/1773260200",
+              "retrieved_timestamp": "1773260200",
+              "source_metadata": {
+                "source_name": "Mercor APEX-v1 Leaderboard",
+                "source_type": "evaluation_run",
+                "source_organization_name": "Mercor",
+                "source_organization_url": "https://www.mercor.com",
+                "evaluator_relationship": "first_party"
+              },
+              "source_data": {
+                "dataset_name": "apex-v1",
+                "source_type": "hf_dataset",
+                "hf_repo": "Mercor/APEX-v1"
+              },
+              "source_record_url": "https://huggingface.co/datasets/evaleval/card_backend/resolve/main/records/google__gemini-3-pro/apex_v1_google_gemini_3_pro_1773260200.json",
+              "detailed_evaluation_results": null,
+              "detailed_evaluation_results_meta": null,
+              "passthrough_top_level_fields": null,
+              "instance_level_data": null,
+              "normalized_result": {
+                "benchmark_family_key": "apex_v1",
+                "benchmark_family_name": "APEX v1",
+                "benchmark_parent_key": "apex_v1",
+                "benchmark_parent_name": "APEX v1",
+                "benchmark_component_key": "investment_banking",
+                "benchmark_component_name": "Investment Banking",
+                "benchmark_leaf_key": "apex_v1",
+                "benchmark_leaf_name": "APEX v1",
+                "slice_key": "investment_banking",
+                "slice_name": "Investment Banking",
+                "metric_name": "Score",
+                "metric_id": "apex_v1.score",
+                "metric_key": "score",
+                "metric_source": "metric_config",
+                "display_name": "Investment Banking / Score",
+                "canonical_display_name": "APEX v1 / Investment Banking / Score",
+                "raw_evaluation_name": "Investment Banking",
+                "is_summary_score": false
+              },
+              "evalcards": {
+                "annotations": {
+                  "reproducibility_gap": {
+                    "has_reproducibility_gap": true,
+                    "missing_fields": [
+                      "temperature",
+                      "max_tokens"
+                    ],
+                    "required_field_count": 2,
+                    "populated_field_count": 0,
+                    "signal_version": "1.0"
+                  },
+                  "provenance": {
+                    "source_type": "first_party",
+                    "is_multi_source": false,
+                    "first_party_only": true,
+                    "distinct_reporting_organizations": 1,
+                    "signal_version": "1.0"
+                  },
+                  "variant_divergence": null,
+                  "cross_party_divergence": null
+                }
+              }
+            },
+            {
+              "model_id": "openai/gpt-5",
+              "model_route_id": "openai__gpt-5",
+              "model_name": "GPT 5",
+              "developer": "openai",
+              "variant_key": "default",
+              "raw_model_id": "openai/GPT 5",
+              "score": 0.61,
+              "evaluation_id": "apex-v1/openai_gpt-5/1773260200",
+              "retrieved_timestamp": "1773260200",
+              "source_metadata": {
+                "source_name": "Mercor APEX-v1 Leaderboard",
+                "source_type": "evaluation_run",
+                "source_organization_name": "Mercor",
+                "source_organization_url": "https://www.mercor.com",
+                "evaluator_relationship": "first_party"
+              },
+              "source_data": {
+                "dataset_name": "apex-v1",
+                "source_type": "hf_dataset",
+                "hf_repo": "Mercor/APEX-v1"
+              },
+              "source_record_url": "https://huggingface.co/datasets/evaleval/card_backend/resolve/main/records/openai__gpt-5/apex_v1_openai_gpt_5_1773260200.json",
+              "detailed_evaluation_results": null,
+              "detailed_evaluation_results_meta": null,
+              "passthrough_top_level_fields": null,
+              "instance_level_data": null,
+              "normalized_result": {
+                "benchmark_family_key": "apex_v1",
+                "benchmark_family_name": "APEX v1",
+                "benchmark_parent_key": "apex_v1",
+                "benchmark_parent_name": "APEX v1",
+                "benchmark_component_key": "investment_banking",
+                "benchmark_component_name": "Investment Banking",
+                "benchmark_leaf_key": "apex_v1",
+                "benchmark_leaf_name": "APEX v1",
+                "slice_key": "investment_banking",
+                "slice_name": "Investment Banking",
+                "metric_name": "Score",
+                "metric_id": "apex_v1.score",
+                "metric_key": "score",
+                "metric_source": "metric_config",
+                "display_name": "Investment Banking / Score",
+                "canonical_display_name": "APEX v1 / Investment Banking / Score",
+                "raw_evaluation_name": "Investment Banking",
+                "is_summary_score": false
+              },
+              "evalcards": {
+                "annotations": {
+                  "reproducibility_gap": {
+                    "has_reproducibility_gap": true,
+                    "missing_fields": [
+                      "temperature",
+                      "max_tokens"
+                    ],
+                    "required_field_count": 2,
+                    "populated_field_count": 0,
+                    "signal_version": "1.0"
+                  },
+                  "provenance": {
+                    "source_type": "first_party",
+                    "is_multi_source": false,
+                    "first_party_only": true,
+                    "distinct_reporting_organizations": 1,
+                    "signal_version": "1.0"
+                  },
+                  "variant_divergence": null,
+                  "cross_party_divergence": null
+                }
+              }
+            }
+          ],
+          "models_count": 3,
+          "top_score": 0.64
+        }
+      ],
+      "metrics_count": 1,
+      "metric_names": [
+        "Score"
+      ]
+    },
+    {
+      "subtask_key": "medicine_md",
+      "subtask_name": "Medicine (MD)",
+      "display_name": "Medicine (MD)",
+      "metrics": [
+        {
+          "metric_summary_id": "apex_v1_medicine_md_score",
+          "legacy_eval_summary_id": "apex_v1_medicine_md",
+          "evaluation_name": "Medicine (MD)",
+          "display_name": "APEX v1 / Medicine (MD) / Score",
+          "canonical_display_name": "APEX v1 / Medicine (MD) / Score",
+          "benchmark_leaf_key": "apex_v1",
+          "benchmark_leaf_name": "APEX v1",
+          "slice_key": "medicine_md",
+          "slice_name": "Medicine (MD)",
+          "lower_is_better": false,
+          "metric_name": "Score",
+          "metric_id": "apex_v1.score",
+          "metric_key": "score",
+          "metric_source": "metric_config",
+          "metric_config": {
+            "evaluation_description": "Primary care physician (MD) score.",
+            "lower_is_better": false,
+            "score_type": "continuous",
+            "min_score": 0,
+            "max_score": 1,
+            "additional_details": {
+              "raw_evaluation_name": "Medicine (MD) Score"
+            },
+            "metric_id": "apex_v1.score",
+            "metric_name": "Score",
+            "metric_kind": "score",
+            "metric_unit": "proportion"
+          },
+          "model_results": [
+            {
+              "model_id": "openai/gpt-5",
+              "model_route_id": "openai__gpt-5",
+              "model_name": "GPT 5",
+              "developer": "openai",
+              "variant_key": "default",
+              "raw_model_id": "openai/GPT 5",
+              "score": 0.66,
+              "evaluation_id": "apex-v1/openai_gpt-5/1773260200",
+              "retrieved_timestamp": "1773260200",
+              "source_metadata": {
+                "source_name": "Mercor APEX-v1 Leaderboard",
+                "source_type": "evaluation_run",
+                "source_organization_name": "Mercor",
+                "source_organization_url": "https://www.mercor.com",
+                "evaluator_relationship": "first_party"
+              },
+              "source_data": {
+                "dataset_name": "apex-v1",
+                "source_type": "hf_dataset",
+                "hf_repo": "Mercor/APEX-v1"
+              },
+              "source_record_url": "https://huggingface.co/datasets/evaleval/card_backend/resolve/main/records/openai__gpt-5/apex_v1_openai_gpt_5_1773260200.json",
+              "detailed_evaluation_results": null,
+              "detailed_evaluation_results_meta": null,
+              "passthrough_top_level_fields": null,
+              "instance_level_data": null,
+              "normalized_result": {
+                "benchmark_family_key": "apex_v1",
+                "benchmark_family_name": "APEX v1",
+                "benchmark_parent_key": "apex_v1",
+                "benchmark_parent_name": "APEX v1",
+                "benchmark_component_key": "medicine_md",
+                "benchmark_component_name": "Medicine (MD)",
+                "benchmark_leaf_key": "apex_v1",
+                "benchmark_leaf_name": "APEX v1",
+                "slice_key": "medicine_md",
+                "slice_name": "Medicine (MD)",
+                "metric_name": "Score",
+                "metric_id": "apex_v1.score",
+                "metric_key": "score",
+                "metric_source": "metric_config",
+                "display_name": "Medicine (MD) / Score",
+                "canonical_display_name": "APEX v1 / Medicine (MD) / Score",
+                "raw_evaluation_name": "Medicine (MD)",
+                "is_summary_score": false
+              },
+              "evalcards": {
+                "annotations": {
+                  "reproducibility_gap": {
+                    "has_reproducibility_gap": true,
+                    "missing_fields": [
+                      "temperature",
+                      "max_tokens"
+                    ],
+                    "required_field_count": 2,
+                    "populated_field_count": 0,
+                    "signal_version": "1.0"
+                  },
+                  "provenance": {
+                    "source_type": "first_party",
+                    "is_multi_source": false,
+                    "first_party_only": true,
+                    "distinct_reporting_organizations": 1,
+                    "signal_version": "1.0"
+                  },
+                  "variant_divergence": null,
+                  "cross_party_divergence": null
+                }
+              }
+            },
+            {
+              "model_id": "openai/gpt-5-2-pro",
+              "model_route_id": "openai__gpt-5-2-pro",
+              "model_name": "GPT 5.2 Pro",
+              "developer": "openai",
+              "variant_key": "default",
+              "raw_model_id": "openai/GPT 5.2 Pro",
+              "score": 0.65,
+              "evaluation_id": "apex-v1/openai_gpt-5.2-pro/1773260200",
+              "retrieved_timestamp": "1773260200",
+              "source_metadata": {
+                "source_name": "Mercor APEX-v1 Leaderboard",
+                "source_type": "evaluation_run",
+                "source_organization_name": "Mercor",
+                "source_organization_url": "https://www.mercor.com",
+                "evaluator_relationship": "first_party"
+              },
+              "source_data": {
+                "dataset_name": "apex-v1",
+                "source_type": "hf_dataset",
+                "hf_repo": "Mercor/APEX-v1"
+              },
+              "source_record_url": "https://huggingface.co/datasets/evaleval/card_backend/resolve/main/records/openai__gpt-5-2-pro/apex_v1_openai_gpt_5_2_pro_1773260200.json",
+              "detailed_evaluation_results": null,
+              "detailed_evaluation_results_meta": null,
+              "passthrough_top_level_fields": null,
+              "instance_level_data": null,
+              "normalized_result": {
+                "benchmark_family_key": "apex_v1",
+                "benchmark_family_name": "APEX v1",
+                "benchmark_parent_key": "apex_v1",
+                "benchmark_parent_name": "APEX v1",
+                "benchmark_component_key": "medicine_md",
+                "benchmark_component_name": "Medicine (MD)",
+                "benchmark_leaf_key": "apex_v1",
+                "benchmark_leaf_name": "APEX v1",
+                "slice_key": "medicine_md",
+                "slice_name": "Medicine (MD)",
+                "metric_name": "Score",
+                "metric_id": "apex_v1.score",
+                "metric_key": "score",
+                "metric_source": "metric_config",
+                "display_name": "Medicine (MD) / Score",
+                "canonical_display_name": "APEX v1 / Medicine (MD) / Score",
+                "raw_evaluation_name": "Medicine (MD)",
+                "is_summary_score": false
+              },
+              "evalcards": {
+                "annotations": {
+                  "reproducibility_gap": {
+                    "has_reproducibility_gap": true,
+                    "missing_fields": [
+                      "temperature",
+                      "max_tokens"
+                    ],
+                    "required_field_count": 2,
+                    "populated_field_count": 0,
+                    "signal_version": "1.0"
+                  },
+                  "provenance": {
+                    "source_type": "first_party",
+                    "is_multi_source": false,
+                    "first_party_only": true,
+                    "distinct_reporting_organizations": 1,
+                    "signal_version": "1.0"
+                  },
+                  "variant_divergence": null,
+                  "cross_party_divergence": null
+                }
+              }
+            },
+            {
+              "model_id": "anthropic/opus-4-5",
+              "model_route_id": "anthropic__opus-4-5",
+              "model_name": "Opus 4.5",
+              "developer": "anthropic",
+              "variant_key": "default",
+              "raw_model_id": "anthropic/Opus 4.5",
+              "score": 0.65,
+              "evaluation_id": "apex-v1/anthropic_opus-4.5/1773260200",
+              "retrieved_timestamp": "1773260200",
+              "source_metadata": {
+                "source_name": "Mercor APEX-v1 Leaderboard",
+                "source_type": "evaluation_run",
+                "source_organization_name": "Mercor",
+                "source_organization_url": "https://www.mercor.com",
+                "evaluator_relationship": "first_party"
+              },
+              "source_data": {
+                "dataset_name": "apex-v1",
+                "source_type": "hf_dataset",
+                "hf_repo": "Mercor/APEX-v1"
+              },
+              "source_record_url": "https://huggingface.co/datasets/evaleval/card_backend/resolve/main/records/anthropic__opus-4-5/apex_v1_anthropic_opus_4_5_1773260200.json",
+              "detailed_evaluation_results": null,
+              "detailed_evaluation_results_meta": null,
+              "passthrough_top_level_fields": null,
+              "instance_level_data": null,
+              "normalized_result": {
+                "benchmark_family_key": "apex_v1",
+                "benchmark_family_name": "APEX v1",
+                "benchmark_parent_key": "apex_v1",
+                "benchmark_parent_name": "APEX v1",
+                "benchmark_component_key": "medicine_md",
+                "benchmark_component_name": "Medicine (MD)",
+                "benchmark_leaf_key": "apex_v1",
+                "benchmark_leaf_name": "APEX v1",
+                "slice_key": "medicine_md",
+                "slice_name": "Medicine (MD)",
+                "metric_name": "Score",
+                "metric_id": "apex_v1.score",
+                "metric_key": "score",
+                "metric_source": "metric_config",
+                "display_name": "Medicine (MD) / Score",
+                "canonical_display_name": "APEX v1 / Medicine (MD) / Score",
+                "raw_evaluation_name": "Medicine (MD)",
+                "is_summary_score": false
+              },
+              "evalcards": {
+                "annotations": {
+                  "reproducibility_gap": {
+                    "has_reproducibility_gap": true,
+                    "missing_fields": [
+                      "temperature",
+                      "max_tokens"
+                    ],
+                    "required_field_count": 2,
+                    "populated_field_count": 0,
+                    "signal_version": "1.0"
+                  },
+                  "provenance": {
+                    "source_type": "first_party",
+                    "is_multi_source": false,
+                    "first_party_only": true,
+                    "distinct_reporting_organizations": 1,
+                    "signal_version": "1.0"
+                  },
+                  "variant_divergence": null,
+                  "cross_party_divergence": null
+                }
+              }
+            }
+          ],
+          "models_count": 3,
+          "top_score": 0.66
+        }
+      ],
+      "metrics_count": 1,
+      "metric_names": [
+        "Score"
+      ]
+    }
+  ],
+  "metrics": [
+    {
+      "metric_summary_id": "apex_v1_score",
+      "legacy_eval_summary_id": "apex_v1_apex_v1",
+      "evaluation_name": "apex-v1",
+      "display_name": "APEX v1 / Score",
+      "canonical_display_name": "APEX v1 / Score",
+      "benchmark_leaf_key": "apex_v1",
+      "benchmark_leaf_name": "APEX v1",
+      "slice_key": null,
+      "slice_name": null,
+      "lower_is_better": false,
+      "metric_name": "Score",
+      "metric_id": "apex_v1.score",
+      "metric_key": "score",
+      "metric_source": "metric_config",
+      "metric_config": {
+        "evaluation_description": "Overall APEX-v1 mean score (paper snapshot).",
+        "lower_is_better": false,
+        "score_type": "continuous",
+        "min_score": 0,
+        "max_score": 1,
+        "additional_details": {
+          "raw_evaluation_name": "Overall Score"
+        },
+        "metric_id": "apex_v1.score",
+        "metric_name": "Score",
+        "metric_kind": "score",
+        "metric_unit": "proportion"
+      },
+      "model_results": [
+        {
+          "model_id": "openai/gpt-5",
+          "model_route_id": "openai__gpt-5",
+          "model_name": "GPT 5",
+          "developer": "openai",
+          "variant_key": "default",
+          "raw_model_id": "openai/GPT 5",
+          "score": 0.67,
+          "evaluation_id": "apex-v1/openai_gpt-5/1773260200",
+          "retrieved_timestamp": "1773260200",
+          "source_metadata": {
+            "source_name": "Mercor APEX-v1 Leaderboard",
+            "source_type": "evaluation_run",
+            "source_organization_name": "Mercor",
+            "source_organization_url": "https://www.mercor.com",
+            "evaluator_relationship": "first_party"
+          },
+          "source_data": {
+            "dataset_name": "apex-v1",
+            "source_type": "hf_dataset",
+            "hf_repo": "Mercor/APEX-v1"
+          },
+          "source_record_url": "https://huggingface.co/datasets/evaleval/card_backend/resolve/main/records/openai__gpt-5/apex_v1_openai_gpt_5_1773260200.json",
+          "detailed_evaluation_results": null,
+          "detailed_evaluation_results_meta": null,
+          "passthrough_top_level_fields": null,
+          "instance_level_data": null,
+          "normalized_result": {
+            "benchmark_family_key": "apex_v1",
+            "benchmark_family_name": "APEX v1",
+            "benchmark_parent_key": "apex_v1",
+            "benchmark_parent_name": "APEX v1",
+            "benchmark_component_key": null,
+            "benchmark_component_name": null,
+            "benchmark_leaf_key": "apex_v1",
+            "benchmark_leaf_name": "APEX v1",
+            "slice_key": null,
+            "slice_name": null,
+            "metric_name": "Score",
+            "metric_id": "apex_v1.score",
+            "metric_key": "score",
+            "metric_source": "metric_config",
+            "display_name": "Score",
+            "canonical_display_name": "APEX v1 / Score",
+            "raw_evaluation_name": "apex-v1",
+            "is_summary_score": false
+          },
+          "evalcards": {
+            "annotations": {
+              "reproducibility_gap": {
+                "has_reproducibility_gap": true,
+                "missing_fields": [
+                  "temperature",
+                  "max_tokens"
+                ],
+                "required_field_count": 2,
+                "populated_field_count": 0,
+                "signal_version": "1.0"
+              },
+              "provenance": {
+                "source_type": "first_party",
+                "is_multi_source": false,
+                "first_party_only": true,
+                "distinct_reporting_organizations": 1,
+                "signal_version": "1.0"
+              },
+              "variant_divergence": null,
+              "cross_party_divergence": null
+            }
+          }
+        },
+        {
+          "model_id": "openai/gpt-5-2-pro",
+          "model_route_id": "openai__gpt-5-2-pro",
+          "model_name": "GPT 5.2 Pro",
+          "developer": "openai",
+          "variant_key": "default",
+          "raw_model_id": "openai/GPT 5.2 Pro",
+          "score": 0.668,
+          "evaluation_id": "apex-v1/openai_gpt-5.2-pro/1773260200",
+          "retrieved_timestamp": "1773260200",
+          "source_metadata": {
+            "source_name": "Mercor APEX-v1 Leaderboard",
+            "source_type": "evaluation_run",
+            "source_organization_name": "Mercor",
+            "source_organization_url": "https://www.mercor.com",
+            "evaluator_relationship": "first_party"
+          },
+          "source_data": {
+            "dataset_name": "apex-v1",
+            "source_type": "hf_dataset",
+            "hf_repo": "Mercor/APEX-v1"
+          },
+          "source_record_url": "https://huggingface.co/datasets/evaleval/card_backend/resolve/main/records/openai__gpt-5-2-pro/apex_v1_openai_gpt_5_2_pro_1773260200.json",
+          "detailed_evaluation_results": null,
+          "detailed_evaluation_results_meta": null,
+          "passthrough_top_level_fields": null,
+          "instance_level_data": null,
+          "normalized_result": {
+            "benchmark_family_key": "apex_v1",
+            "benchmark_family_name": "APEX v1",
+            "benchmark_parent_key": "apex_v1",
+            "benchmark_parent_name": "APEX v1",
+            "benchmark_component_key": null,
+            "benchmark_component_name": null,
+            "benchmark_leaf_key": "apex_v1",
+            "benchmark_leaf_name": "APEX v1",
+            "slice_key": null,
+            "slice_name": null,
+            "metric_name": "Score",
+            "metric_id": "apex_v1.score",
+            "metric_key": "score",
+            "metric_source": "metric_config",
+            "display_name": "Score",
+            "canonical_display_name": "APEX v1 / Score",
+            "raw_evaluation_name": "apex-v1",
+            "is_summary_score": false
+          },
+          "evalcards": {
+            "annotations": {
+              "reproducibility_gap": {
+                "has_reproducibility_gap": true,
+                "missing_fields": [
+                  "temperature",
+                  "max_tokens"
+                ],
+                "required_field_count": 2,
+                "populated_field_count": 0,
+                "signal_version": "1.0"
+              },
+              "provenance": {
+                "source_type": "first_party",
+                "is_multi_source": false,
+                "first_party_only": true,
+                "distinct_reporting_organizations": 1,
+                "signal_version": "1.0"
+              },
+              "variant_divergence": null,
+              "cross_party_divergence": null
+            }
+          }
+        },
+        {
+          "model_id": "google/gemini-3-pro",
+          "model_route_id": "google__gemini-3-pro",
+          "model_name": "Gemini 3 Pro",
+          "developer": "google",
+          "variant_key": "default",
+          "raw_model_id": "google/Gemini 3 Pro",
+          "score": 0.643,
+          "evaluation_id": "apex-v1/google_gemini-3-pro/1773260200",
+          "retrieved_timestamp": "1773260200",
+          "source_metadata": {
+            "source_name": "Mercor APEX-v1 Leaderboard",
+            "source_type": "evaluation_run",
+            "source_organization_name": "Mercor",
+            "source_organization_url": "https://www.mercor.com",
+            "evaluator_relationship": "first_party"
+          },
+          "source_data": {
+            "dataset_name": "apex-v1",
+            "source_type": "hf_dataset",
+            "hf_repo": "Mercor/APEX-v1"
+          },
+          "source_record_url": "https://huggingface.co/datasets/evaleval/card_backend/resolve/main/records/google__gemini-3-pro/apex_v1_google_gemini_3_pro_1773260200.json",
+          "detailed_evaluation_results": null,
+          "detailed_evaluation_results_meta": null,
+          "passthrough_top_level_fields": null,
+          "instance_level_data": null,
+          "normalized_result": {
+            "benchmark_family_key": "apex_v1",
+            "benchmark_family_name": "APEX v1",
+            "benchmark_parent_key": "apex_v1",
+            "benchmark_parent_name": "APEX v1",
+            "benchmark_component_key": null,
+            "benchmark_component_name": null,
+            "benchmark_leaf_key": "apex_v1",
+            "benchmark_leaf_name": "APEX v1",
+            "slice_key": null,
+            "slice_name": null,
+            "metric_name": "Score",
+            "metric_id": "apex_v1.score",
+            "metric_key": "score",
+            "metric_source": "metric_config",
+            "display_name": "Score",
+            "canonical_display_name": "APEX v1 / Score",
+            "raw_evaluation_name": "apex-v1",
+            "is_summary_score": false
+          },
+          "evalcards": {
+            "annotations": {
+              "reproducibility_gap": {
+                "has_reproducibility_gap": true,
+                "missing_fields": [
+                  "temperature",
+                  "max_tokens"
+                ],
+                "required_field_count": 2,
+                "populated_field_count": 0,
+                "signal_version": "1.0"
+              },
+              "provenance": {
+                "source_type": "first_party",
+                "is_multi_source": false,
+                "first_party_only": true,
+                "distinct_reporting_organizations": 1,
+                "signal_version": "1.0"
+              },
+              "variant_divergence": null,
+              "cross_party_divergence": null
+            }
+          }
+        },
+        {
+          "model_id": "google/gemini-3-flash",
+          "model_route_id": "google__gemini-3-flash",
+          "model_name": "Gemini 3 Flash",
+          "developer": "google",
+          "variant_key": "default",
+          "raw_model_id": "google/Gemini 3 Flash",
+          "score": 0.64,
+          "evaluation_id": "apex-v1/google_gemini-3-flash/1773260200",
+          "retrieved_timestamp": "1773260200",
+          "source_metadata": {
+            "source_name": "Mercor APEX-v1 Leaderboard",
+            "source_type": "evaluation_run",
+            "source_organization_name": "Mercor",
+            "source_organization_url": "https://www.mercor.com",
+            "evaluator_relationship": "first_party"
+          },
+          "source_data": {
+            "dataset_name": "apex-v1",
+            "source_type": "hf_dataset",
+            "hf_repo": "Mercor/APEX-v1"
+          },
+          "source_record_url": "https://huggingface.co/datasets/evaleval/card_backend/resolve/main/records/google__gemini-3-flash/apex_v1_google_gemini_3_flash_1773260200.json",
+          "detailed_evaluation_results": null,
+          "detailed_evaluation_results_meta": null,
+          "passthrough_top_level_fields": null,
+          "instance_level_data": null,
+          "normalized_result": {
+            "benchmark_family_key": "apex_v1",
+            "benchmark_family_name": "APEX v1",
+            "benchmark_parent_key": "apex_v1",
+            "benchmark_parent_name": "APEX v1",
+            "benchmark_component_key": null,
+            "benchmark_component_name": null,
+            "benchmark_leaf_key": "apex_v1",
+            "benchmark_leaf_name": "APEX v1",
+            "slice_key": null,
+            "slice_name": null,
+            "metric_name": "Score",
+            "metric_id": "apex_v1.score",
+            "metric_key": "score",
+            "metric_source": "metric_config",
+            "display_name": "Score",
+            "canonical_display_name": "APEX v1 / Score",
+            "raw_evaluation_name": "apex-v1",
+            "is_summary_score": false
+          },
+          "evalcards": {
+            "annotations": {
+              "reproducibility_gap": {
+                "has_reproducibility_gap": true,
+                "missing_fields": [
+                  "temperature",
+                  "max_tokens"
+                ],
+                "required_field_count": 2,
+                "populated_field_count": 0,
+                "signal_version": "1.0"
+              },
+              "provenance": {
+                "source_type": "first_party",
+                "is_multi_source": false,
+                "first_party_only": true,
+                "distinct_reporting_organizations": 1,
+                "signal_version": "1.0"
+              },
+              "variant_divergence": null,
+              "cross_party_divergence": null
+            }
+          }
+        },
+        {
+          "model_id": "xai/grok-4",
+          "model_route_id": "xai__grok-4",
+          "model_name": "Grok 4",
+          "developer": "xai",
+          "variant_key": "default",
+          "raw_model_id": "xai/Grok 4",
+          "score": 0.635,
+          "evaluation_id": "apex-v1/xai_grok-4/1773260200",
+          "retrieved_timestamp": "1773260200",
+          "source_metadata": {
+            "source_name": "Mercor APEX-v1 Leaderboard",
+            "source_type": "evaluation_run",
+            "source_organization_name": "Mercor",
+            "source_organization_url": "https://www.mercor.com",
+            "evaluator_relationship": "first_party"
+          },
+          "source_data": {
+            "dataset_name": "apex-v1",
+            "source_type": "hf_dataset",
+            "hf_repo": "Mercor/APEX-v1"
+          },
+          "source_record_url": "https://huggingface.co/datasets/evaleval/card_backend/resolve/main/records/xai__grok-4/apex_v1_xai_grok_4_1773260200.json",
+          "detailed_evaluation_results": null,
+          "detailed_evaluation_results_meta": null,
+          "passthrough_top_level_fields": null,
+          "instance_level_data": null,
+          "normalized_result": {
+            "benchmark_family_key": "apex_v1",
+            "benchmark_family_name": "APEX v1",
+            "benchmark_parent_key": "apex_v1",
+            "benchmark_parent_name": "APEX v1",
+            "benchmark_component_key": null,
+            "benchmark_component_name": null,
+            "benchmark_leaf_key": "apex_v1",
+            "benchmark_leaf_name": "APEX v1",
+            "slice_key": null,
+            "slice_name": null,
+            "metric_name": "Score",
+            "metric_id": "apex_v1.score",
+            "metric_key": "score",
+            "metric_source": "metric_config",
+            "display_name": "Score",
+            "canonical_display_name": "APEX v1 / Score",
+            "raw_evaluation_name": "apex-v1",
+            "is_summary_score": false
+          },
+          "evalcards": {
+            "annotations": {
+              "reproducibility_gap": {
+                "has_reproducibility_gap": true,
+                "missing_fields": [
+                  "temperature",
+                  "max_tokens"
+                ],
+                "required_field_count": 2,
+                "populated_field_count": 0,
+                "signal_version": "1.0"
+              },
+              "provenance": {
+                "source_type": "first_party",
+                "is_multi_source": false,
+                "first_party_only": true,
+                "distinct_reporting_organizations": 1,
+                "signal_version": "1.0"
+              },
+              "variant_divergence": null,
+              "cross_party_divergence": null
+            }
+          }
+        },
+        {
+          "model_id": "google/gemini-2-5-flash",
+          "model_route_id": "google__gemini-2-5-flash",
+          "model_name": "Gemini 2.5 Flash",
+          "developer": "google",
+          "variant_key": "default",
+          "raw_model_id": "google/Gemini 2.5 Flash",
+          "score": 0.604,
+          "evaluation_id": "apex-v1/google_gemini-2.5-flash/1773260200",
+          "retrieved_timestamp": "1773260200",
+          "source_metadata": {
+            "source_name": "Mercor APEX-v1 Leaderboard",
+            "source_type": "evaluation_run",
+            "source_organization_name": "Mercor",
+            "source_organization_url": "https://www.mercor.com",
+            "evaluator_relationship": "first_party"
+          },
+          "source_data": {
+            "dataset_name": "apex-v1",
+            "source_type": "hf_dataset",
+            "hf_repo": "Mercor/APEX-v1"
+          },
+          "source_record_url": "https://huggingface.co/datasets/evaleval/card_backend/resolve/main/records/google__gemini-2-5-flash/apex_v1_google_gemini_2_5_flash_1773260200.json",
+          "detailed_evaluation_results": null,
+          "detailed_evaluation_results_meta": null,
+          "passthrough_top_level_fields": null,
+          "instance_level_data": null,
+          "normalized_result": {
+            "benchmark_family_key": "apex_v1",
+            "benchmark_family_name": "APEX v1",
+            "benchmark_parent_key": "apex_v1",
+            "benchmark_parent_name": "APEX v1",
+            "benchmark_component_key": null,
+            "benchmark_component_name": null,
+            "benchmark_leaf_key": "apex_v1",
+            "benchmark_leaf_name": "APEX v1",
+            "slice_key": null,
+            "slice_name": null,
+            "metric_name": "Score",
+            "metric_id": "apex_v1.score",
+            "metric_key": "score",
+            "metric_source": "metric_config",
+            "display_name": "Score",
+            "canonical_display_name": "APEX v1 / Score",
+            "raw_evaluation_name": "apex-v1",
+            "is_summary_score": false
+          },
+          "evalcards": {
+            "annotations": {
+              "reproducibility_gap": {
+                "has_reproducibility_gap": true,
+                "missing_fields": [
+                  "temperature",
+                  "max_tokens"
+                ],
+                "required_field_count": 2,
+                "populated_field_count": 0,
+                "signal_version": "1.0"
+              },
+              "provenance": {
+                "source_type": "first_party",
+                "is_multi_source": false,
+                "first_party_only": true,
+                "distinct_reporting_organizations": 1,
+                "signal_version": "1.0"
+              },
+              "variant_divergence": null,
+              "cross_party_divergence": null
+            }
+          }
+        },
+        {
+          "model_id": "openai/gpt-4o",
+          "model_route_id": "openai__gpt-4o",
+          "model_name": "GPT 4o",
+          "developer": "openai",
+          "variant_key": "default",
+          "raw_model_id": "openai/GPT 4o",
+          "score": 0.359,
+          "evaluation_id": "apex-v1/openai_gpt-4o/1773260200",
+          "retrieved_timestamp": "1773260200",
+          "source_metadata": {
+            "source_name": "Mercor APEX-v1 Leaderboard",
+            "source_type": "evaluation_run",
+            "source_organization_name": "Mercor",
+            "source_organization_url": "https://www.mercor.com",
+            "evaluator_relationship": "first_party"
+          },
+          "source_data": {
+            "dataset_name": "apex-v1",
+            "source_type": "hf_dataset",
+            "hf_repo": "Mercor/APEX-v1"
+          },
+          "source_record_url": "https://huggingface.co/datasets/evaleval/card_backend/resolve/main/records/openai__gpt-4o/apex_v1_openai_gpt_4o_1773260200.json",
+          "detailed_evaluation_results": null,
+          "detailed_evaluation_results_meta": null,
+          "passthrough_top_level_fields": null,
+          "instance_level_data": null,
+          "normalized_result": {
+            "benchmark_family_key": "apex_v1",
+            "benchmark_family_name": "APEX v1",
+            "benchmark_parent_key": "apex_v1",
+            "benchmark_parent_name": "APEX v1",
+            "benchmark_component_key": null,
+            "benchmark_component_name": null,
+            "benchmark_leaf_key": "apex_v1",
+            "benchmark_leaf_name": "APEX v1",
+            "slice_key": null,
+            "slice_name": null,
+            "metric_name": "Score",
+            "metric_id": "apex_v1.score",
+            "metric_key": "score",
+            "metric_source": "metric_config",
+            "display_name": "Score",
+            "canonical_display_name": "APEX v1 / Score",
+            "raw_evaluation_name": "apex-v1",
+            "is_summary_score": false
+          },
+          "evalcards": {
+            "annotations": {
+              "reproducibility_gap": {
+                "has_reproducibility_gap": true,
+                "missing_fields": [
+                  "temperature",
+                  "max_tokens"
+                ],
+                "required_field_count": 2,
+                "populated_field_count": 0,
+                "signal_version": "1.0"
+              },
+              "provenance": {
+                "source_type": "first_party",
+                "is_multi_source": false,
+                "first_party_only": true,
+                "distinct_reporting_organizations": 1,
+                "signal_version": "1.0"
+              },
+              "variant_divergence": null,
+              "cross_party_divergence": null
+            }
+          }
+        }
+      ],
+      "models_count": 7,
+      "top_score": 0.67
+    }
+  ],
+  "subtasks_count": 4,
+  "metrics_count": 5,
+  "models_count": 10,
+  "metric_names": [
+    "Score"
+  ],
+  "primary_metric_name": "Score",
+  "top_score": null,
+  "instance_data": {
+    "available": false,
+    "url_count": 0,
+    "sample_urls": [],
+    "models_with_loaded_instances": 0
+  },
+  "evalcards": {
+    "annotations": {
+      "reporting_completeness": {
+        "completeness_score": 0.9285714285714286,
+        "total_fields_evaluated": 28,
+        "missing_required_fields": [
+          "evalcards.lifecycle_status",
+          "evalcards.preregistration_url"
+        ],
+        "partial_fields": [],
+        "field_scores": [
+          {
+            "field_path": "autobenchmarkcard.benchmark_details.name",
+            "coverage_type": "full",
+            "score": 1.0
+          },
+          {
+            "field_path": "autobenchmarkcard.benchmark_details.overview",
+            "coverage_type": "full",
+            "score": 1.0
+          },
+          {
+            "field_path": "autobenchmarkcard.benchmark_details.data_type",
+            "coverage_type": "full",
+            "score": 1.0
+          },
+          {
+            "field_path": "autobenchmarkcard.benchmark_details.domains",
+            "coverage_type": "full",
+            "score": 1.0
+          },
+          {
+            "field_path": "autobenchmarkcard.benchmark_details.languages",
+            "coverage_type": "full",
+            "score": 1.0
+          },
+          {
+            "field_path": "autobenchmarkcard.benchmark_details.similar_benchmarks",
+            "coverage_type": "full",
+            "score": 1.0
+          },
+          {
+            "field_path": "autobenchmarkcard.benchmark_details.resources",
+            "coverage_type": "full",
+            "score": 1.0
+          },
+          {
+            "field_path": "autobenchmarkcard.purpose_and_intended_users.goal",
+            "coverage_type": "full",
+            "score": 1.0
+          },
+          {
+            "field_path": "autobenchmarkcard.purpose_and_intended_users.audience",
+            "coverage_type": "full",
+            "score": 1.0
+          },
+          {
+            "field_path": "autobenchmarkcard.purpose_and_intended_users.tasks",
+            "coverage_type": "full",
+            "score": 1.0
+          },
+          {
+            "field_path": "autobenchmarkcard.purpose_and_intended_users.limitations",
+            "coverage_type": "full",
+            "score": 1.0
+          },
+          {
+            "field_path": "autobenchmarkcard.purpose_and_intended_users.out_of_scope_uses",
+            "coverage_type": "full",
+            "score": 1.0
+          },
+          {
+            "field_path": "autobenchmarkcard.methodology.methods",
+            "coverage_type": "full",
+            "score": 1.0
+          },
+          {
+            "field_path": "autobenchmarkcard.methodology.metrics",
+            "coverage_type": "full",
+            "score": 1.0
+          },
+          {
+            "field_path": "autobenchmarkcard.methodology.calculation",
+            "coverage_type": "full",
+            "score": 1.0
+          },
+          {
+            "field_path": "autobenchmarkcard.methodology.interpretation",
+            "coverage_type": "full",
+            "score": 1.0
+          },
+          {
+            "field_path": "autobenchmarkcard.methodology.baseline_results",
+            "coverage_type": "full",
+            "score": 1.0
+          },
+          {
+            "field_path": "autobenchmarkcard.methodology.validation",
+            "coverage_type": "full",
+            "score": 1.0
+          },
+          {
+            "field_path": "autobenchmarkcard.ethical_and_legal_considerations.privacy_and_anonymity",
+            "coverage_type": "full",
+            "score": 1.0
+          },
+          {
+            "field_path": "autobenchmarkcard.ethical_and_legal_considerations.data_licensing",
+            "coverage_type": "full",
+            "score": 1.0
+          },
+          {
+            "field_path": "autobenchmarkcard.ethical_and_legal_considerations.consent_procedures",
+            "coverage_type": "full",
+            "score": 1.0
+          },
+          {
+            "field_path": "autobenchmarkcard.ethical_and_legal_considerations.compliance_with_regulations",
+            "coverage_type": "full",
+            "score": 1.0
+          },
+          {
+            "field_path": "autobenchmarkcard.data",
+            "coverage_type": "partial",
+            "score": 1.0
+          },
+          {
+            "field_path": "eee_eval.source_metadata.source_type",
+            "coverage_type": "full",
+            "score": 1.0
+          },
+          {
+            "field_path": "eee_eval.source_metadata.source_organization_name",
+            "coverage_type": "full",
+            "score": 1.0
+          },
+          {
+            "field_path": "eee_eval.source_metadata.evaluator_relationship",
+            "coverage_type": "full",
+            "score": 1.0
+          },
+          {
+            "field_path": "evalcards.lifecycle_status",
+            "coverage_type": "reserved",
+            "score": 0.0
+          },
+          {
+            "field_path": "evalcards.preregistration_url",
+            "coverage_type": "reserved",
+            "score": 0.0
+          }
+        ],
+        "signal_version": "1.0"
+      },
+      "benchmark_comparability": {
+        "variant_divergence_groups": [],
+        "cross_party_divergence_groups": []
+      }
+    }
+  },
+  "reproducibility_summary": {
+    "results_total": 19,
+    "has_reproducibility_gap_count": 19,
+    "populated_ratio_avg": 0.0
+  },
+  "provenance_summary": {
+    "total_results": 19,
+    "total_groups": 19,
+    "multi_source_groups": 0,
+    "first_party_only_groups": 19,
+    "source_type_distribution": {
+      "first_party": 19,
+      "third_party": 0,
+      "collaborative": 0,
+      "unspecified": 0
+    }
+  },
+  "comparability_summary": {
+    "total_groups": 19,
+    "groups_with_variant_check": 0,
+    "groups_with_cross_party_check": 0,
+    "variant_divergent_count": 0,
+    "cross_party_divergent_count": 0
+  }
+}

tests/fixtures/evals/appworld.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tests/fixtures/evals/artificial_analysis_llms_artificial_analysis_aime.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tests/fixtures/evals/helm_capabilities.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tests/fixtures/evals/helm_classic_truthfulqa.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tests/fixtures/evals/helm_lite_narrativeqa.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tests/fixtures/evals/helm_safety_simplesafetytests.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tests/fixtures/loader.ts ADDED Viewed

	@@ -0,0 +1,92 @@

+import { readFileSync, readdirSync } from "fs"
+import { fileURLToPath } from "url"
+import path from "path"
+import manifest from "./manifest.json"
+/** Keys of the fixture groups that can be looked up in manifest.json. */
+export type FixtureGroup = "evals" | "models" | "developers" | "model_cards"
+/** One manifest row: the fixture's id and the `why` note recorded next to it. */
+export interface FixtureEntry {
+  id: string
+  why: string
+}
+/** Directory that contains this module, derived from the module's own URL. */
+const FIXTURES_DIR = path.dirname(fileURLToPath(import.meta.url))
+/**
+ * Sub-directory name (joined onto FIXTURES_DIR) for each group. Note that the
+ * `model_cards` key maps to the dashed directory name `model-cards`.
+ */
+const GROUP_DIRS: Record<FixtureGroup, string> = {
+  evals: "evals",
+  models: "models",
+  developers: "developers",
+  model_cards: "model-cards",
+}
+/**
+ * Looks up the manifest entries listed under `group`.
+ *
+ * @param group - Manifest key to read.
+ * @returns The `{ id, why }` rows stored under that key. The manifest import is
+ *   cast to the expected shape; nothing is validated at runtime.
+ */
+export function fixtureEntries(group: FixtureGroup): FixtureEntry[] {
+  const entriesByGroup = manifest as Record<FixtureGroup, FixtureEntry[]>
+  return entriesByGroup[group]
+}
+/**
+ * Reads and parses a single pinned fixture file.
+ *
+ * @param group - Fixture group; selects the sub-directory via GROUP_DIRS.
+ * @param id - Fixture id; the file read is `<id>.json` inside that directory.
+ * @returns The parsed JSON, cast to `T` without any runtime validation.
+ * @throws Whatever `readFileSync` or `JSON.parse` throws (missing file, bad JSON).
+ */
+export function loadFixture<T>(group: FixtureGroup, id: string): T {
+  const fileName = `${id}.json`
+  const raw = readFileSync(path.join(FIXTURES_DIR, GROUP_DIRS[group], fileName), "utf8")
+  return JSON.parse(raw) as T
+}
+/**
+ * Loads every fixture the manifest lists for `group`, in manifest order.
+ *
+ * @param group - Fixture group to load.
+ * @returns One record per manifest entry, carrying the entry's `id` and `why`
+ *   alongside the parsed file contents in `data`.
+ */
+export function loadAllFixtures<T>(group: FixtureGroup): Array<{ id: string; why: string; data: T }> {
+  const loaded: Array<{ id: string; why: string; data: T }> = []
+  for (const { id, why } of fixtureEntries(group)) {
+    loaded.push({ id, why, data: loadFixture<T>(group, id) })
+  }
+  return loaded
+}
+/**
+ * Lists the `.json` file names in the live cache directory for `group`
+ * (`<two levels above this directory>/.cache/hf-data/<group>`).
+ *
+ * The names are returned sorted: `readdirSync` makes no ordering guarantee, so
+ * sorting keeps callers deterministic across filesystems.
+ *
+ * NOTE(review): `group` is used verbatim as the directory name (GROUP_DIRS is
+ * not consulted), so `model_cards` resolves to `.cache/hf-data/model_cards` —
+ * confirm that matches the cache layout.
+ *
+ * @param group - Fixture group whose cache directory should be listed.
+ * @returns Sorted `.json` file names, or an empty array when the directory
+ *   cannot be read (for example when the cache has not been populated).
+ */
+export function listLiveCacheFiles(group: FixtureGroup): string[] {
+  const dir = path.resolve(FIXTURES_DIR, "..", "..", ".cache", "hf-data", group)
+  try {
+    return readdirSync(dir)
+      .filter((f) => f.endsWith(".json"))
+      .sort()
+  } catch {
+    // Best-effort: an unreadable or missing cache directory means "no files".
+    return []
+  }
+}
+/**
+ * Reads and parses one file from the live cache directory for `group`
+ * (`<two levels above this directory>/.cache/hf-data/<group>/<fileName>`).
+ *
+ * @param group - Fixture group; used verbatim as the cache sub-directory name.
+ * @param fileName - File name inside that directory, including its extension.
+ * @returns The parsed JSON, cast to `T` without any runtime validation.
+ * @throws Whatever `readFileSync` or `JSON.parse` throws (missing file, bad JSON).
+ */
+export function loadLiveCacheFile<T>(group: FixtureGroup, fileName: string): T {
+  const cacheDir = path.resolve(FIXTURES_DIR, "..", "..", ".cache", "hf-data", group)
+  const raw = readFileSync(path.resolve(cacheDir, fileName), "utf8")
+  return JSON.parse(raw) as T
+}
+/**
+ * Yields every `model_results` row found under `detail.hierarchy_by_category`,
+ * descending through nested subtasks. Shared by the fixture contract tests and
+ * the live-cache drift checks; generic in the row type so callers can supply a
+ * precise type from lib/hf-data.ts.
+ *
+ * @param detail - Object whose optional `hierarchy_by_category` map is walked.
+ * @param fixtureId - Label used as the root of every yielded `path`.
+ * @returns A generator of `{ result, path }`, where `path` has the form
+ *   `<fixtureId>.hierarchy_by_category.<category>[i]...` locating the row.
+ */
+export function* walkHierarchyResults<TResult>(
+  detail: HierarchyDetail<TResult>,
+  fixtureId: string
+): Generator<{ result: TResult; path: string }> {
+  const categories = detail.hierarchy_by_category ?? {}
+  for (const [categoryKey, nodes] of Object.entries(categories)) {
+    // A category whose value is null/undefined contributes no rows.
+    if (nodes == null) continue
+    const categoryPath = `${fixtureId}.hierarchy_by_category.${categoryKey}`
+    for (const [nodeIdx, node] of nodes.entries()) {
+      yield* walkNode<TResult>(node, `${categoryPath}[${nodeIdx}]`)
+    }
+  }
+}
+/** Minimal shape needed from a detail object: categories mapped to root nodes. */
+interface HierarchyDetail<TResult> {
+  hierarchy_by_category?: Record<string, Array<HierarchyNode<TResult>>>
+}
+/** A hierarchy node: optional metrics (each with result rows) and optional child subtasks. */
+interface HierarchyNode<TResult> {
+  metrics?: { model_results?: TResult[] }[]
+  subtasks?: Array<HierarchyNode<TResult>>
+}
+/**
+ * Depth-first walk of a single hierarchy node: first yields the node's own
+ * `metrics[*].model_results[*]` rows, then recurses into each subtask.
+ *
+ * @param node - Node to walk; missing `metrics`/`subtasks` are treated as empty.
+ * @param basePath - Path prefix identifying `node`; extended for every yield.
+ * @returns A generator of `{ result, path }` pairs for this node and its descendants.
+ */
+function* walkNode<TResult>(
+  node: HierarchyNode<TResult>,
+  basePath: string
+): Generator<{ result: TResult; path: string }> {
+  const metrics = node.metrics ?? []
+  for (const [metricIdx, metric] of metrics.entries()) {
+    const metricPath = `${basePath}.metrics[${metricIdx}]`
+    const results = metric.model_results ?? []
+    for (const [resultIdx, result] of results.entries()) {
+      yield { result, path: `${metricPath}.model_results[${resultIdx}]` }
+    }
+  }
+  const subtasks = node.subtasks ?? []
+  for (const [subtaskIdx, subtask] of subtasks.entries()) {
+    yield* walkNode<TResult>(subtask, `${basePath}.subtasks[${subtaskIdx}]`)
+  }
+}

tests/fixtures/manifest.json ADDED Viewed

	@@ -0,0 +1,81 @@

+{
+  "$comment": "Pinned snapshot of HF cache files used by Tier A pipeline-contract tests and Tier B adapter snapshot tests. Refresh via `pnpm refresh-fixtures`. Each fixture earns its place by exercising a specific code path documented in `notes/testing-strategy.md` (curation criteria).",
+  "snapshot_source": ".cache/hf-data",
+  "snapshot_ts": "2026-04-27T22:32:59.376Z",
+  "evals": [
+    {
+      "id": "apex_v1",
+      "why": "Mercor first-party badge; pipeline category=knowledge; subtasks"
+    },
+    {
+      "id": "artificial_analysis_llms_artificial_analysis_aime",
+      "why": "Artificial Analysis third-party badge; source_type=documentation in pipeline"
+    },
+    {
+      "id": "helm_classic_truthfulqa",
+      "why": "Safety regression-bait — pipeline category=knowledge but inferCategoryFromBenchmark returns Safety"
+    },
+    {
+      "id": "helm_safety_simplesafetytests",
+      "why": "Safety regression-bait — pipeline category=general for an obvious safety eval"
+    },
+    {
+      "id": "helm_capabilities",
+      "why": "Composite parent eval; pipeline category=knowledge"
+    },
+    {
+      "id": "helm_lite_narrativeqa",
+      "why": "Subtask leaf under helm_lite parent; pipeline category=reasoning, regex returns General"
+    },
+    {
+      "id": "appworld",
+      "why": "Pipeline category=coding (one of the 3 keys we added to PIPELINE_CATEGORY_MAP); inferCategoryFromBenchmark returns Agentic"
+    }
+  ],
+  "models": [
+    {
+      "id": "openai__gpt-5-2-pro",
+      "why": "5 variants — exercises variant_lookup and per-variant grouping in flattenModelEvaluations"
+    },
+    {
+      "id": "google__gemini-3-flash",
+      "why": "Already covered by parity harness; medium-size; multi-category hierarchy"
+    },
+    {
+      "id": "ai21__j1-grande-v1-17b",
+      "why": "Has `safety` hierarchy_by_category key"
+    },
+    {
+      "id": "bytedance__seed-2-0-lite",
+      "why": "Has `coding` hierarchy_by_category key (the substring-fallacy bug case); small"
+    }
+  ],
+  "developers": [
+    {
+      "id": "openai",
+      "why": "KNOWN_DEVELOPER_NAMES canonicalization (openai → OpenAI)"
+    },
+    {
+      "id": "anthropic",
+      "why": "Multiple model families; typical case"
+    },
+    {
+      "id": "01-ai",
+      "why": "Dash-prefix slug — exercises pipelineSlugify edge case"
+    }
+  ],
+  "model_cards": [
+    {
+      "id": "openai__gpt-5",
+      "why": "6 variants; rich top_benchmark_scores; hfModelCardToEvaluationCardData edge case"
+    },
+    {
+      "id": "anthropic__claude-opus-4.5",
+      "why": "Dotted route_id (vs dashed in model detail files) — capturing the route-id mismatch"
+    },
+    {
+      "id": "01-ai__yi-34b",
+      "why": "Developer name canonicalization (01-ai → 01.AI per KNOWN_DEVELOPER_NAMES)"
+    }
+  ]
+}

tests/fixtures/model-cards/01-ai__yi-34b.json ADDED Viewed

	@@ -0,0 +1,213 @@

+{
+  "model_family_id": "01-ai/yi-34b",
+  "model_route_id": "01-ai__yi-34b",
+  "model_family_name": "Yi 34B",
+  "developer": "01-ai",
+  "params_billions": 34,
+  "total_evaluations": 3,
+  "benchmark_count": 3,
+  "benchmark_family_count": 3,
+  "categories_covered": [
+    "general",
+    "knowledge",
+    "reasoning"
+  ],
+  "last_updated": "2026-03-21T12:31:52.005480Z",
+  "variants": [
+    {
+      "variant_key": "default",
+      "variant_label": "Default",
+      "evaluation_count": 3,
+      "raw_model_ids": [
+        "01-ai/yi-34b",
+        "01-ai/Yi-34B"
+      ],
+      "last_updated": "2026-03-21T12:31:52.005480Z"
+    }
+  ],
+  "score_summary": {
+    "count": 52,
+    "min": 0.0514,
+    "max": 0.936,
+    "average": 0.6793153846153845
+  },
+  "reproducibility_summary": {
+    "results_total": 52,
+    "has_reproducibility_gap_count": 52,
+    "populated_ratio_avg": 0
+  },
+  "provenance_summary": {
+    "total_results": 52,
+    "total_groups": 52,
+    "multi_source_groups": 0,
+    "first_party_only_groups": 0,
+    "source_type_distribution": {
+      "first_party": 0,
+      "third_party": 52,
+      "collaborative": 0,
+      "unspecified": 0
+    }
+  },
+  "comparability_summary": {
+    "total_groups": 52,
+    "groups_with_variant_check": 0,
+    "groups_with_cross_party_check": 0,
+    "variant_divergent_count": 0,
+    "cross_party_divergent_count": 0
+  },
+  "benchmark_names": [
+    "BBH",
+    "GPQA",
+    "GSM8K",
+    "Helm lite",
+    "IFEval",
+    "LegalBench",
+    "MATH",
+    "MATH Level 5",
+    "MMLU",
+    "MMLU-PRO",
+    "MUSR",
+    "MedQA",
+    "NarrativeQA",
+    "NaturalQuestions (closed-book)",
+    "OpenbookQA",
+    "WMT 2014"
+  ],
+  "top_benchmark_scores": [
+    {
+      "benchmark": "MMLU",
+      "benchmarkKey": "helm_mmlu",
+      "canonical_display_name": "Mmlu / Marketing / Exact Match",
+      "evaluation_name": "Marketing",
+      "score": 0.936,
+      "metric": "EM on Marketing",
+      "lower_is_better": false
+    },
+    {
+      "benchmark": "OpenbookQA",
+      "benchmarkKey": "helm_lite_openbookqa",
+      "canonical_display_name": "OpenbookQA / Exact Match",
+      "evaluation_name": "OpenbookQA",
+      "score": 0.92,
+      "metric": "EM on OpenbookQA",
+      "lower_is_better": false
+    },
+    {
+      "benchmark": "NarrativeQA",
+      "benchmarkKey": "helm_lite_narrativeqa",
+      "canonical_display_name": "NarrativeQA / F1",
+      "evaluation_name": "NarrativeQA",
+      "score": 0.782,
+      "metric": "F1 on NarrativeQA",
+      "lower_is_better": false
+    },
+    {
+      "benchmark": "MedQA",
+      "benchmarkKey": "helm_lite_medqa",
+      "canonical_display_name": "MedQA / Exact Match",
+      "evaluation_name": "MedQA",
+      "score": 0.656,
+      "metric": "EM on MedQA",
+      "lower_is_better": false
+    },
+    {
+      "benchmark": "MMLU",
+      "benchmarkKey": "helm_lite_mmlu",
+      "canonical_display_name": "MMLU / Exact Match",
+      "evaluation_name": "MMLU",
+      "score": 0.65,
+      "metric": "EM on MMLU",
+      "lower_is_better": false
+    },
+    {
+      "benchmark": "GSM8K",
+      "benchmarkKey": "helm_lite_gsm8k",
+      "canonical_display_name": "GSM8K / Exact Match",
+      "evaluation_name": "GSM8K",
+      "score": 0.648,
+      "metric": "EM on GSM8K",
+      "lower_is_better": false
+    },
+    {
+      "benchmark": "LegalBench",
+      "benchmarkKey": "helm_lite_legalbench",
+      "canonical_display_name": "LegalBench / Exact Match",
+      "evaluation_name": "LegalBench",
+      "score": 0.618,
+      "metric": "EM on LegalBench",
+      "lower_is_better": false
+    },
+    {
+      "benchmark": "Helm lite",
+      "benchmarkKey": "helm_lite",
+      "canonical_display_name": "Helm lite / Win Rate",
+      "evaluation_name": "helm_lite",
+      "score": 0.57,
+      "metric": "How many models this model outperforms on average (over columns).",
+      "lower_is_better": false
+    },
+    {
+      "benchmark": "BBH",
+      "benchmarkKey": "hfopenllm_v2_bbh",
+      "canonical_display_name": "BBH / Accuracy",
+      "evaluation_name": "BBH",
+      "score": 0.5457,
+      "metric": "Accuracy on BBH",
+      "lower_is_better": false
+    },
+    {
+      "benchmark": "NaturalQuestions (closed-book)",
+      "benchmarkKey": "helm_lite_naturalquestions_closed_book",
+      "canonical_display_name": "NaturalQuestions (closed-book) / F1",
+      "evaluation_name": "NaturalQuestions (closed-book)",
+      "score": 0.443,
+      "metric": "F1 on NaturalQuestions (closed-book)",
+      "lower_is_better": false
+    },
+    {
+      "benchmark": "MMLU-PRO",
+      "benchmarkKey": "hfopenllm_v2_mmlu_pro",
+      "canonical_display_name": "MMLU-PRO / Accuracy",
+      "evaluation_name": "MMLU-PRO",
+      "score": 0.4412,
+      "metric": "Accuracy on MMLU-PRO",
+      "lower_is_better": false
+    },
+    {
+      "benchmark": "MUSR",
+      "benchmarkKey": "hfopenllm_v2_musr",
+      "canonical_display_name": "MUSR / Accuracy",
+      "evaluation_name": "MUSR",
+      "score": 0.4119,
+      "metric": "Accuracy on MUSR",
+      "lower_is_better": false
+    },
+    {
+      "benchmark": "MATH",
+      "benchmarkKey": "helm_lite_math",
+      "canonical_display_name": "MATH / Equivalent (CoT)",
+      "evaluation_name": "MATH",
+      "score": 0.375,
+      "metric": "Equivalent (CoT) on MATH",
+      "lower_is_better": false
+    },
+    {
+      "benchmark": "GPQA",
+      "benchmarkKey": "hfopenllm_v2_gpqa",
+      "canonical_display_name": "GPQA / Accuracy",
+      "evaluation_name": "GPQA",
+      "score": 0.3666,
+      "metric": "Accuracy on GPQA",
+      "lower_is_better": false
+    },
+    {
+      "benchmark": "IFEval",
+      "benchmarkKey": "hfopenllm_v2_ifeval",
+      "canonical_display_name": "IFEval / Accuracy",
+      "evaluation_name": "IFEval",
+      "score": 0.3046,
+      "metric": "Accuracy on IFEval",
+      "lower_is_better": false
+    }
+  ]
+}

tests/fixtures/model-cards/anthropic__claude-opus-4.5.json ADDED Viewed

	@@ -0,0 +1,241 @@

+{
+  "model_family_id": "anthropic/claude-opus-4.5",
+  "model_route_id": "anthropic__claude-opus-4.5",
+  "model_family_name": "Claude Opus 4.5",
+  "developer": "anthropic",
+  "params_billions": null,
+  "total_evaluations": 49,
+  "benchmark_count": 10,
+  "benchmark_family_count": 10,
+  "categories_covered": [
+    "agentic",
+    "coding",
+    "knowledge",
+    "other",
+    "reasoning"
+  ],
+  "last_updated": "2026-04-25T09:07:44.422824Z",
+  "variants": [
+    {
+      "variant_key": "default",
+      "variant_label": "Default",
+      "evaluation_count": 40,
+      "raw_model_ids": [
+        "anthropic/claude-opus-4-5",
+        "anthropic/claude-opus-4.5"
+      ],
+      "last_updated": "2026-04-20T16:39:22.266076Z"
+    },
+    {
+      "variant_key": "20251101",
+      "variant_label": "2025-11-01",
+      "evaluation_count": 4,
+      "raw_model_ids": [
+        "anthropic/claude-opus-4-5-20251101",
+        "anthropic/claude-opus-4-5-20251101-fc",
+        "anthropic/claude-opus-4-5-20251101-prompt"
+      ],
+      "last_updated": "2026-04-25T09:07:44.422824Z"
+    },
+    {
+      "variant_key": "2025-11-01",
+      "variant_label": "2025-11-01",
+      "evaluation_count": 5,
+      "raw_model_ids": [
+        "anthropic/claude-opus-4-5-20251101-thinking-16k",
+        "anthropic/claude-opus-4-5-20251101-thinking-32k",
+        "anthropic/claude-opus-4-5-20251101-thinking-64k",
+        "anthropic/claude-opus-4-5-20251101-thinking-8k",
+        "anthropic/claude-opus-4-5-20251101-thinking-none"
+      ],
+      "last_updated": "2026-04-07T08:15:57.578212Z"
+    }
+  ],
+  "score_summary": {
+    "count": 164,
+    "min": 0.0708,
+    "max": 95.5,
+    "average": 25.281729268292686
+  },
+  "reproducibility_summary": {
+    "results_total": 164,
+    "has_reproducibility_gap_count": 164,
+    "populated_ratio_avg": 0
+  },
+  "provenance_summary": {
+    "total_results": 164,
+    "total_groups": 76,
+    "multi_source_groups": 0,
+    "first_party_only_groups": 9,
+    "source_type_distribution": {
+      "first_party": 9,
+      "third_party": 155,
+      "collaborative": 0,
+      "unspecified": 0
+    }
+  },
+  "comparability_summary": {
+    "total_groups": 76,
+    "groups_with_variant_check": 6,
+    "groups_with_cross_party_check": 0,
+    "variant_divergent_count": 6,
+    "cross_party_divergent_count": 0
+  },
+  "benchmark_names": [
+    "ARC Prize evaluations leaderboard JSON",
+    "ARC-AGI v2",
+    "AppWorld Benchmark",
+    "Artificial Analysis LLM API",
+    "BFCL leaderboard CSV",
+    "BrowseComp-Plus",
+    "MCP Atlas",
+    "MMMLU",
+    "MMMU (validation)",
+    "OSWorld",
+    "SWE-Bench Verified",
+    "SWE-bench Verified",
+    "Swe Bench",
+    "Tau2 Retail",
+    "Tau2 Telecom",
+    "Terminal Bench 2 0",
+    "Terminal-Bench 2.0",
+    "τ-bench (Tool-Agent-User Interaction Benchmark)"
+  ],
+  "top_benchmark_scores": [
+    {
+      "benchmark": "Terminal Bench 2 0",
+      "benchmarkKey": "terminal_bench_2_0",
+      "canonical_display_name": "Terminal bench 2 0 / Accuracy",
+      "evaluation_name": "terminal-bench-2.0",
+      "score": 63.1,
+      "metric": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
+      "lower_is_better": false
+    },
+    {
+      "benchmark": "Artificial Analysis LLM API",
+      "benchmarkKey": "artificial_analysis_llms_artificial_analysis_artificial_analysis_math_index",
+      "canonical_display_name": "artificial_analysis.artificial_analysis_math_index / Artificial Analysis Math Index",
+      "evaluation_name": "artificial_analysis.artificial_analysis_math_index",
+      "score": 62.7,
+      "metric": "Artificial Analysis composite math index.",
+      "lower_is_better": false
+    },
+    {
+      "benchmark": "Artificial Analysis LLM API",
+      "benchmarkKey": "artificial_analysis_llms_artificial_analysis_median_output_tokens_per_second",
+      "canonical_display_name": "artificial_analysis.median_output_tokens_per_second / Median output tokens per second",
+      "evaluation_name": "artificial_analysis.median_output_tokens_per_second",
+      "score": 52.885,
+      "metric": "Median output generation speed reported by Artificial Analysis.",
+      "lower_is_better": false
+    },
+    {
+      "benchmark": "Artificial Analysis LLM API",
+      "benchmarkKey": "artificial_analysis_llms_artificial_analysis_artificial_analysis_intelligence_index",
+      "canonical_display_name": "artificial_analysis.artificial_analysis_intelligence_index / Artificial Analysis Intelligence Index",
+      "evaluation_name": "artificial_analysis.artificial_analysis_intelligence_index",
+      "score": 43.1,
+      "metric": "Artificial Analysis composite intelligence index.",
+      "lower_is_better": false
+    },
+    {
+      "benchmark": "Artificial Analysis LLM API",
+      "benchmarkKey": "artificial_analysis_llms_artificial_analysis_artificial_analysis_coding_index",
+      "canonical_display_name": "artificial_analysis.artificial_analysis_coding_index / Artificial Analysis Coding Index",
+      "evaluation_name": "artificial_analysis.artificial_analysis_coding_index",
+      "score": 42.9,
+      "metric": "Artificial Analysis composite coding index.",
+      "lower_is_better": false
+    },
+    {
+      "benchmark": "Artificial Analysis LLM API",
+      "benchmarkKey": "artificial_analysis_llms_artificial_analysis_price_1m_output_tokens",
+      "canonical_display_name": "artificial_analysis.price_1m_output_tokens / Price 1m Output Tokens",
+      "evaluation_name": "artificial_analysis.price_1m_output_tokens",
+      "score": 25,
+      "metric": "Price per 1M output tokens in USD.",
+      "lower_is_better": true
+    },
+    {
+      "benchmark": "Artificial Analysis LLM API",
+      "benchmarkKey": "artificial_analysis_llms_artificial_analysis_price_1m_blended_3_to_1",
+      "canonical_display_name": "artificial_analysis.price_1m_blended_3_to_1 / Price 1m Blended 3 To 1",
+      "evaluation_name": "artificial_analysis.price_1m_blended_3_to_1",
+      "score": 10,
+      "metric": "Blended price per 1M tokens using a 3:1 input-to-output ratio.",
+      "lower_is_better": true
+    },
+    {
+      "benchmark": "Artificial Analysis LLM API",
+      "benchmarkKey": "artificial_analysis_llms_artificial_analysis_price_1m_input_tokens",
+      "canonical_display_name": "artificial_analysis.price_1m_input_tokens / Price 1m Input Tokens",
+      "evaluation_name": "artificial_analysis.price_1m_input_tokens",
+      "score": 5,
+      "metric": "Price per 1M input tokens in USD.",
+      "lower_is_better": true
+    },
+    {
+      "benchmark": "BFCL leaderboard CSV",
+      "benchmarkKey": "bfcl",
+      "canonical_display_name": "Bfcl / Format sensitivity / Format Sensitivity Standard Deviation",
+      "evaluation_name": "format_sensitivity",
+      "score": 3.65,
+      "metric": "format_sensitivity",
+      "lower_is_better": true
+    },
+    {
+      "benchmark": "Artificial Analysis LLM API",
+      "benchmarkKey": "artificial_analysis_llms_artificial_analysis_median_time_to_first_token_seconds",
+      "canonical_display_name": "artificial_analysis.median_time_to_first_token_seconds / Median Time To First Token Seconds",
+      "evaluation_name": "artificial_analysis.median_time_to_first_token_seconds",
+      "score": 1.311,
+      "metric": "Median time to first token reported by Artificial Analysis.",
+      "lower_is_better": true
+    },
+    {
+      "benchmark": "Artificial Analysis LLM API",
+      "benchmarkKey": "artificial_analysis_llms_artificial_analysis_median_time_to_first_answer_token",
+      "canonical_display_name": "artificial_analysis.median_time_to_first_answer_token / Median time to first answer token",
+      "evaluation_name": "artificial_analysis.median_time_to_first_answer_token",
+      "score": 1.311,
+      "metric": "Median time to first answer token reported by Artificial Analysis.",
+      "lower_is_better": true
+    },
+    {
+      "benchmark": "Tau2 Telecom",
+      "benchmarkKey": "llm_stats_tau2_telecom",
+      "canonical_display_name": "Tau2 Telecom / Score",
+      "evaluation_name": "llm_stats.tau2-telecom",
+      "score": 0.982,
+      "metric": "τ²-Bench telecom domain evaluates conversational agents in a dual-control environment modeled as a Dec-POMDP, where both agent and user use tools in shared telecommunications troubleshooting scenarios that test coordination and communication capabilities.",
+      "lower_is_better": false
+    },
+    {
+      "benchmark": "MMMLU",
+      "benchmarkKey": "llm_stats_mmmlu",
+      "canonical_display_name": "Mmmlu / Score",
+      "evaluation_name": "llm_stats.mmmlu",
+      "score": 0.908,
+      "metric": "Multilingual Massive Multitask Language Understanding dataset released by OpenAI, featuring professionally translated MMLU test questions across 14 languages including Arabic, Bengali, German, Spanish, French, Hindi, Indonesian, Italian, Japanese, Korean, Portuguese, Swahili, Yoruba, and Chinese. Contains approximately 15,908 multiple-choice questions per language covering 57 subjects.",
+      "lower_is_better": false
+    },
+    {
+      "benchmark": "Artificial Analysis LLM API",
+      "benchmarkKey": "artificial_analysis_llms_artificial_analysis_mmlu_pro",
+      "canonical_display_name": "artificial_analysis.mmlu_pro / MMLU-Pro",
+      "evaluation_name": "artificial_analysis.mmlu_pro",
+      "score": 0.889,
+      "metric": "Benchmark score on MMLU-Pro.",
+      "lower_is_better": false
+    },
+    {
+      "benchmark": "Tau2 Retail",
+      "benchmarkKey": "llm_stats_tau2_retail",
+      "canonical_display_name": "Tau2 Retail / Score",
+      "evaluation_name": "llm_stats.tau2-retail",
+      "score": 0.889,
+      "metric": "τ²-bench retail domain evaluates conversational AI agents in customer service scenarios within a dual-control environment where both agent and user can interact with tools. Tests tool-agent-user interaction, rule adherence, and task consistency in retail customer support contexts.",
+      "lower_is_better": false
+    }
+  ]
+}

tests/fixtures/model-cards/openai__gpt-5.json ADDED Viewed

	@@ -0,0 +1,307 @@

+{
+  "model_family_id": "openai/gpt-5",
+  "model_route_id": "openai__gpt-5",
+  "model_family_name": "GPT 5",
+  "developer": "openai",
+  "params_billions": null,
+  "total_evaluations": 30,
+  "benchmark_count": 18,
+  "benchmark_family_count": 18,
+  "categories_covered": [
+    "agentic",
+    "coding",
+    "general",
+    "knowledge",
+    "other",
+    "reasoning"
+  ],
+  "last_updated": "2026-04-25T09:07:44.422824Z",
+  "variants": [
+    {
+      "variant_key": "default",
+      "variant_label": "Default",
+      "evaluation_count": 17,
+      "raw_model_ids": [
+        "openai/GPT 5",
+        "openai/gpt-5",
+        "openai/GPT-5"
+      ],
+      "last_updated": "2026-04-20T17:00:36.089462Z"
+    },
+    {
+      "variant_key": "2025-08-07",
+      "variant_label": "2025-08-07",
+      "evaluation_count": 9,
+      "raw_model_ids": [
+        "openai/gpt-5-2025-08-07"
+      ],
+      "last_updated": "2026-04-25T09:07:44.422824Z"
+    },
+    {
+      "variant_key": "2025-08-07-low",
+      "variant_label": "2025-08-07 low",
+      "evaluation_count": 1,
+      "raw_model_ids": [
+        "openai/gpt-5-2025-08-07-low"
+      ],
+      "last_updated": "2026-04-07T08:15:57.602168Z"
+    },
+    {
+      "variant_key": "2025-08-07-medium",
+      "variant_label": "2025-08-07 medium",
+      "evaluation_count": 1,
+      "raw_model_ids": [
+        "openai/gpt-5-2025-08-07-medium"
+      ],
+      "last_updated": "2026-04-07T08:15:57.602461Z"
+    },
+    {
+      "variant_key": "2025-08-07-minimal",
+      "variant_label": "2025-08-07 minimal",
+      "evaluation_count": 1,
+      "raw_model_ids": [
+        "openai/gpt-5-2025-08-07-minimal"
+      ],
+      "last_updated": "2026-04-07T08:15:57.602750Z"
+    },
+    {
+      "variant_key": "2025-08-07-high",
+      "variant_label": "2025-08-07 high",
+      "evaluation_count": 1,
+      "raw_model_ids": [
+        "openai/gpt-5-2025-08-07-high"
+      ],
+      "last_updated": "2026-04-07T08:15:57.601872Z"
+    }
+  ],
+  "score_summary": {
+    "count": 559,
+    "min": 0,
+    "max": 73320,
+    "average": 673.8712930083715
+  },
+  "reproducibility_summary": {
+    "results_total": 559,
+    "has_reproducibility_gap_count": 541,
+    "populated_ratio_avg": 0.03220035778175313
+  },
+  "provenance_summary": {
+    "total_results": 559,
+    "total_groups": 508,
+    "multi_source_groups": 0,
+    "first_party_only_groups": 43,
+    "source_type_distribution": {
+      "first_party": 44,
+      "third_party": 515,
+      "collaborative": 0,
+      "unspecified": 0
+    }
+  },
+  "comparability_summary": {
+    "total_groups": 508,
+    "groups_with_variant_check": 0,
+    "groups_with_cross_party_check": 0,
+    "variant_divergent_count": 0,
+    "cross_party_divergent_count": 0
+  },
+  "benchmark_names": [
+    "ACE",
+    "APEX Agents",
+    "APEX v1",
+    "ARC Prize evaluations leaderboard JSON",
+    "Aider-Polyglot",
+    "Anthropic Red Team",
+    "Artificial Analysis LLM API",
+    "BBQ",
+    "BrowseComp Long Context 128k",
+    "BrowseComp Long Context 256k",
+    "COLLIE",
+    "CharXiv-R",
+    "ERQA",
+    "Easy Problems",
+    "FActScore",
+    "Fibble arena",
+    "FrontierMath",
+    "GPQA",
+    "Global-MMLU Lite",
+    "Graphwalks BFS <128k",
+    "Graphwalks parents <128k",
+    "HMMT 2025",
+    "Hard Problems",
+    "HarmBench",
+    "HealthBench Hard",
+    "Helm air bench",
+    "Helm safety",
+    "Holistic Evaluation of Language Models (HELM)",
+    "HumanEval",
+    "IFEval",
+    "Internal API instruction following (hard)",
+    "LongFact Concepts",
+    "LongFact Objects",
+    "MATH",
+    "MMLU",
+    "MMLU-Pro",
+    "MMMU",
+    "MMMU-Pro",
+    "Medium Problems",
+    "Multi-Challenge",
+    "Multi-SWE-bench (c++)",
+    "Omni-MATH",
+    "OpenAI-MRCR: 2 needle 128k",
+    "OpenAI-MRCR: 2 needle 256k",
+    "SWE-Lancer (IC-Diamond subset)",
+    "SWE-PolyBench Verified (Java)",
+    "SWE-PolyBench Verified (JavaScript)",
+    "SWE-PolyBench Verified (Python)",
+    "SWE-PolyBench Verified (TypeScript)",
+    "SWE-bench Verified",
+    "SciArena leaderboard API",
+    "SimpleSafetyTests",
+    "Tau2 Airline",
+    "Tau2 Retail",
+    "Tau2 Telecom",
+    "Terminal Bench 2 0",
+    "VideoMME w sub.",
+    "VideoMMMU",
+    "WildBench",
+    "Wordle Arena",
+    "XSTest"
+  ],
+  "top_benchmark_scores": [
+    {
+      "benchmark": "Artificial Analysis LLM API",
+      "benchmarkKey": "artificial_analysis_llms_artificial_analysis_median_output_tokens_per_second",
+      "canonical_display_name": "artificial_analysis.median_output_tokens_per_second / Median output tokens per second",
+      "evaluation_name": "artificial_analysis.median_output_tokens_per_second",
+      "score": 95.722,
+      "metric": "Median output generation speed reported by Artificial Analysis.",
+      "lower_is_better": false
+    },
+    {
+      "benchmark": "Artificial Analysis LLM API",
+      "benchmarkKey": "artificial_analysis_llms_artificial_analysis_artificial_analysis_math_index",
+      "canonical_display_name": "artificial_analysis.artificial_analysis_math_index / Artificial Analysis Math Index",
+      "evaluation_name": "artificial_analysis.artificial_analysis_math_index",
+      "score": 94.3,
+      "metric": "Artificial Analysis composite math index.",
+      "lower_is_better": false
+    },
+    {
+      "benchmark": "Artificial Analysis LLM API",
+      "benchmarkKey": "artificial_analysis_llms_artificial_analysis_median_time_to_first_token_seconds",
+      "canonical_display_name": "artificial_analysis.median_time_to_first_token_seconds / Median Time To First Token Seconds",
+      "evaluation_name": "artificial_analysis.median_time_to_first_token_seconds",
+      "score": 82.082,
+      "metric": "Median time to first token reported by Artificial Analysis.",
+      "lower_is_better": true
+    },
+    {
+      "benchmark": "Artificial Analysis LLM API",
+      "benchmarkKey": "artificial_analysis_llms_artificial_analysis_median_time_to_first_answer_token",
+      "canonical_display_name": "artificial_analysis.median_time_to_first_answer_token / Median time to first answer token",
+      "evaluation_name": "artificial_analysis.median_time_to_first_answer_token",
+      "score": 82.082,
+      "metric": "Median time to first answer token reported by Artificial Analysis.",
+      "lower_is_better": true
+    },
+    {
+      "benchmark": "Terminal Bench 2 0",
+      "benchmarkKey": "terminal_bench_2_0",
+      "canonical_display_name": "Terminal bench 2 0 / Accuracy",
+      "evaluation_name": "terminal-bench-2.0",
+      "score": 49.6,
+      "metric": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
+      "lower_is_better": false
+    },
+    {
+      "benchmark": "Artificial Analysis LLM API",
+      "benchmarkKey": "artificial_analysis_llms_artificial_analysis_artificial_analysis_intelligence_index",
+      "canonical_display_name": "artificial_analysis.artificial_analysis_intelligence_index / Artificial Analysis Intelligence Index",
+      "evaluation_name": "artificial_analysis.artificial_analysis_intelligence_index",
+      "score": 44.6,
+      "metric": "Artificial Analysis composite intelligence index.",
+      "lower_is_better": false
+    },
+    {
+      "benchmark": "Artificial Analysis LLM API",
+      "benchmarkKey": "artificial_analysis_llms_artificial_analysis_artificial_analysis_coding_index",
+      "canonical_display_name": "artificial_analysis.artificial_analysis_coding_index / Artificial Analysis Coding Index",
+      "evaluation_name": "artificial_analysis.artificial_analysis_coding_index",
+      "score": 36,
+      "metric": "Artificial Analysis composite coding index.",
+      "lower_is_better": false
+    },
+    {
+      "benchmark": "Artificial Analysis LLM API",
+      "benchmarkKey": "artificial_analysis_llms_artificial_analysis_price_1m_output_tokens",
+      "canonical_display_name": "artificial_analysis.price_1m_output_tokens / Price 1m Output Tokens",
+      "evaluation_name": "artificial_analysis.price_1m_output_tokens",
+      "score": 10,
+      "metric": "Price per 1M output tokens in USD.",
+      "lower_is_better": true
+    },
+    {
+      "benchmark": "Artificial Analysis LLM API",
+      "benchmarkKey": "artificial_analysis_llms_artificial_analysis_price_1m_blended_3_to_1",
+      "canonical_display_name": "artificial_analysis.price_1m_blended_3_to_1 / Price 1m Blended 3 To 1",
+      "evaluation_name": "artificial_analysis.price_1m_blended_3_to_1",
+      "score": 3.438,
+      "metric": "Blended price per 1M tokens using a 3:1 input-to-output ratio.",
+      "lower_is_better": true
+    },
+    {
+      "benchmark": "SciArena leaderboard API",
+      "benchmarkKey": "sciarena",
+      "canonical_display_name": "SciArena / Cost per 100 Calls",
+      "evaluation_name": "overall_cost_per_100_calls_usd",
+      "score": 2.9752,
+      "metric": "overall_cost_per_100_calls_usd",
+      "lower_is_better": true
+    },
+    {
+      "benchmark": "Artificial Analysis LLM API",
+      "benchmarkKey": "artificial_analysis_llms_artificial_analysis_price_1m_input_tokens",
+      "canonical_display_name": "artificial_analysis.price_1m_input_tokens / Price 1m Input Tokens",
+      "evaluation_name": "artificial_analysis.price_1m_input_tokens",
+      "score": 1.25,
+      "metric": "Price per 1M input tokens in USD.",
+      "lower_is_better": true
+    },
+    {
+      "benchmark": "Helm air bench",
+      "benchmarkKey": "helm_air_bench",
+      "canonical_display_name": "Air bench / AIRBench 2024 - Self-harm / Refusal Rate",
+      "evaluation_name": "AIRBench 2024 - Self-harm",
+      "score": 1,
+      "metric": "Refusal Rate on AIRBench 2024 - Self-harm",
+      "lower_is_better": false
+    },
+    {
+      "benchmark": "SWE-Lancer (IC-Diamond subset)",
+      "benchmarkKey": "llm_stats_swe_lancer_ic_diamond_subset",
+      "canonical_display_name": "Swe Lancer Ic Diamond Subset / Score",
+      "evaluation_name": "llm_stats.swe-lancer-ic-diamond-subset",
+      "score": 1,
+      "metric": "SWE-Lancer (IC-Diamond subset) is a benchmark of real-world freelance software engineering tasks from Upwork, ranging from $50 bug fixes to $32,000 feature implementations. It evaluates AI models on independent engineering tasks using end-to-end tests triple-verified by experienced software engineers, and includes managerial tasks where models choose between technical implementation proposals.",
+      "lower_is_better": false
+    },
+    {
+      "benchmark": "SimpleSafetyTests",
+      "benchmarkKey": "helm_safety_simplesafetytests",
+      "canonical_display_name": "SimpleSafetyTests / LM Evaluated Safety score",
+      "evaluation_name": "SimpleSafetyTests",
+      "score": 0.998,
+      "metric": "LM Evaluated Safety score on SimpleSafetyTests",
+      "lower_is_better": false
+    },
+    {
+      "benchmark": "Artificial Analysis LLM API",
+      "benchmarkKey": "artificial_analysis_llms_artificial_analysis_math_500",
+      "canonical_display_name": "artificial_analysis.math_500 / MATH-500",
+      "evaluation_name": "artificial_analysis.math_500",
+      "score": 0.994,
+      "metric": "Benchmark score on MATH-500.",
+      "lower_is_better": false
+    }
+  ]
+}

tests/fixtures/models/ai21__j1-grande-v1-17b.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tests/fixtures/models/bytedance__seed-2-0-lite.json ADDED Viewed

	@@ -0,0 +1,2086 @@

+{
+  "model_info": {
+    "name": "Seed 2.0 Lite",
+    "id": "bytedance/seed-2-0-lite",
+    "developer": "bytedance",
+    "additional_details": {
+      "raw_id": "seed-2.0-lite",
+      "raw_name": "Seed 2.0 Lite",
+      "raw_model_id": "seed-2.0-lite",
+      "raw_model_name": "Seed 2.0 Lite",
+      "raw_organization_id": "bytedance",
+      "raw_organization_name": "ByteDance",
+      "raw_release_date": "2026-02-14",
+      "raw_announcement_date": "2026-02-14",
+      "raw_multimodal": "true",
+      "raw_provider_slug": "bytedance",
+      "raw_provider_name": "ByteDance"
+    },
+    "normalized_id": "bytedance/seed-2.0-lite",
+    "family_id": "bytedance/seed-2-0-lite",
+    "family_slug": "seed-2-0-lite",
+    "family_name": "Seed 2.0 Lite",
+    "variant_key": "default",
+    "variant_label": "Default",
+    "model_route_id": "bytedance__seed-2-0-lite",
+    "model_version": null
+  },
+  "model_family_id": "bytedance/seed-2-0-lite",
+  "model_route_id": "bytedance__seed-2-0-lite",
+  "model_family_name": "Seed 2.0 Lite",
+  "raw_model_ids": [
+    "bytedance/seed-2.0-lite"
+  ],
+  "evaluations_by_category": {
+    "other": [
+      {
+        "schema_version": "0.2.2",
+        "evaluation_id": "llm-stats/first_party/bytedance_seed-2.0-lite/1777108064.422824",
+        "retrieved_timestamp": "1777108064.422824",
+        "benchmark": "llm-stats",
+        "source_data": {
+          "dataset_name": "AIME 2026",
+          "source_type": "url",
+          "url": [
+            "https://llm-stats.com/models/seed-2.0-lite",
+            "https://llm-stats.com/benchmarks/aime-2026",
+            "https://api.llm-stats.com/leaderboard/benchmarks/aime-2026"
+          ],
+          "additional_details": {
+            "raw_benchmark_id": "aime-2026",
+            "raw_model_id": "seed-2.0-lite",
+            "source_role": "aggregator"
+          }
+        },
+        "source_metadata": {
+          "source_name": "LLM Stats API: first_party scores",
+          "source_type": "documentation",
+          "source_organization_name": "LLM Stats",
+          "source_organization_url": "https://llm-stats.com/",
+          "evaluator_relationship": "first_party",
+          "additional_details": {
+            "models_endpoint": "https://api.llm-stats.com/v1/models",
+            "benchmarks_endpoint": "https://api.llm-stats.com/leaderboard/benchmarks",
+            "scores_endpoint": "https://api.llm-stats.com/v1/scores",
+            "scores_endpoint_fallback": "https://api.llm-stats.com/leaderboard/benchmarks/{benchmark_id}",
+            "developer_page_url": "https://llm-stats.com/developer",
+            "attribution_url": "https://llm-stats.com/",
+            "attribution_required": "true",
+            "source_role": "aggregator"
+          }
+        },
+        "eval_library": {
+          "name": "LLM Stats",
+          "version": "unknown"
+        },
+        "model_info": {
+          "name": "Seed 2.0 Lite",
+          "id": "bytedance/seed-2.0-lite",
+          "developer": "bytedance",
+          "additional_details": {
+            "raw_id": "seed-2.0-lite",
+            "raw_name": "Seed 2.0 Lite",
+            "raw_model_id": "seed-2.0-lite",
+            "raw_model_name": "Seed 2.0 Lite",
+            "raw_organization_id": "bytedance",
+            "raw_organization_name": "ByteDance",
+            "raw_release_date": "2026-02-14",
+            "raw_announcement_date": "2026-02-14",
+            "raw_multimodal": "true",
+            "raw_provider_slug": "bytedance",
+            "raw_provider_name": "ByteDance"
+          },
+          "normalized_id": "bytedance/seed-2.0-lite",
+          "family_id": "bytedance/seed-2-0-lite",
+          "family_slug": "seed-2-0-lite",
+          "family_name": "Seed 2.0 Lite",
+          "variant_key": "default",
+          "variant_label": "Default",
+          "model_route_id": "bytedance__seed-2-0-lite"
+        },
+        "generation_config": null,
+        "source_record_url": "https://huggingface.co/datasets/evaleval/card_backend/resolve/main/records/bytedance__seed-2-0-lite/llm_stats_first_party_bytedance_seed_2_0_lite_1777108064_422824.json",
+        "detailed_evaluation_results_meta": null,
+        "detailed_evaluation_results": null,
+        "passthrough_top_level_fields": null,
+        "evaluation_results": [
+          {
+            "evaluation_result_id": "aime-2026::aime-2026-seed-2.0-lite",
+            "evaluation_name": "llm_stats.aime-2026",
+            "source_data": {
+              "dataset_name": "AIME 2026",
+              "source_type": "url",
+              "url": [
+                "https://llm-stats.com/models/seed-2.0-lite",
+                "https://llm-stats.com/benchmarks/aime-2026",
+                "https://api.llm-stats.com/leaderboard/benchmarks/aime-2026"
+              ],
+              "additional_details": {
+                "raw_benchmark_id": "aime-2026",
+                "raw_model_id": "seed-2.0-lite",
+                "source_role": "aggregator"
+              }
+            },
+            "metric_config": {
+              "evaluation_description": "All 30 problems from the 2026 American Invitational Mathematics Examination (AIME I and AIME II), testing olympiad-level mathematical reasoning with integer answers from 000-999. Used as an AI benchmark to evaluate large language models' ability to solve complex mathematical problems requiring multi-step logical deductions and structured symbolic reasoning.",
+              "metric_id": "llm_stats.aime-2026.score",
+              "metric_name": "AIME 2026 score",
+              "metric_kind": "benchmark_score",
+              "metric_unit": "proportion",
+              "lower_is_better": false,
+              "score_type": "continuous",
+              "min_score": 0.0,
+              "max_score": 1.0,
+              "additional_details": {
+                "raw_benchmark_id": "aime-2026",
+                "raw_score_field": "score",
+                "bound_strategy": "inferred_proportion",
+                "raw_name": "AIME 2026",
+                "raw_categories": "[\"math\",\"reasoning\"]",
+                "raw_modality": "text",
+                "raw_verified": "false",
+                "raw_model_count": "12"
+              }
+            },
+            "score_details": {
+              "score": 0.883,
+              "details": {
+                "raw_score": "0.883",
+                "raw_score_field": "score",
+                "raw_model_id": "seed-2.0-lite",
+                "raw_benchmark_id": "aime-2026",
+                "source_urls_json": "[\"https://llm-stats.com/models/seed-2.0-lite\",\"https://llm-stats.com/benchmarks/aime-2026\",\"https://api.llm-stats.com/leaderboard/benchmarks/aime-2026\"]",
+                "raw_score_id": "aime-2026::seed-2.0-lite",
+                "raw_provenance_label": "unknown",
+                "raw_verified": "false"
+              }
+            },
+            "normalized_result": {
+              "benchmark_family_key": "llm_stats",
+              "benchmark_family_name": "AIME 2026",
+              "benchmark_parent_key": "llm_stats",
+              "benchmark_parent_name": "AIME 2026",
+              "benchmark_component_key": "aime_2026",
+              "benchmark_component_name": "Aime 2026",
+              "benchmark_leaf_key": "aime_2026",
+              "benchmark_leaf_name": "Aime 2026",
+              "slice_key": null,
+              "slice_name": null,
+              "metric_name": "Score",
+              "metric_id": "llm_stats.aime-2026.score",
+              "metric_key": "score",
+              "metric_source": "metric_config",
+              "display_name": "Aime 2026 / Score",
+              "canonical_display_name": "Aime 2026 / Score",
+              "raw_evaluation_name": "llm_stats.aime-2026",
+              "is_summary_score": false
+            }
+          },
+          {
+            "evaluation_result_id": "livecodebench-v6::livecodebench-v6-seed-2.0-lite",
+            "evaluation_name": "llm_stats.livecodebench-v6",
+            "source_data": {
+              "dataset_name": "LiveCodeBench v6",
+              "source_type": "url",
+              "url": [
+                "https://llm-stats.com/models/seed-2.0-lite",
+                "https://llm-stats.com/benchmarks/livecodebench-v6",
+                "https://api.llm-stats.com/leaderboard/benchmarks/livecodebench-v6"
+              ],
+              "additional_details": {
+                "raw_benchmark_id": "livecodebench-v6",
+                "raw_model_id": "seed-2.0-lite",
+                "source_role": "aggregator"
+              }
+            },
+            "metric_config": {
+              "evaluation_description": "LiveCodeBench is a holistic and contamination-free evaluation benchmark for large language models for code. It continuously collects new problems from programming contests (LeetCode, AtCoder, CodeForces) and evaluates four different scenarios: code generation, self-repair, code execution, and test output prediction. Problems are annotated with release dates to enable evaluation on unseen problems released after a model's training cutoff.",
+              "metric_id": "llm_stats.livecodebench-v6.score",
+              "metric_name": "LiveCodeBench v6 score",
+              "metric_kind": "benchmark_score",
+              "metric_unit": "proportion",
+              "lower_is_better": false,
+              "score_type": "continuous",
+              "min_score": 0.0,
+              "max_score": 1.0,
+              "additional_details": {
+                "raw_benchmark_id": "livecodebench-v6",
+                "raw_score_field": "score",
+                "bound_strategy": "inferred_proportion",
+                "raw_name": "LiveCodeBench v6",
+                "raw_categories": "[\"general\",\"reasoning\"]",
+                "raw_modality": "text",
+                "raw_verified": "false",
+                "raw_model_count": "45"
+              }
+            },
+            "score_details": {
+              "score": 0.817,
+              "details": {
+                "raw_score": "0.817",
+                "raw_score_field": "score",
+                "raw_model_id": "seed-2.0-lite",
+                "raw_benchmark_id": "livecodebench-v6",
+                "source_urls_json": "[\"https://llm-stats.com/models/seed-2.0-lite\",\"https://llm-stats.com/benchmarks/livecodebench-v6\",\"https://api.llm-stats.com/leaderboard/benchmarks/livecodebench-v6\"]",
+                "raw_score_id": "livecodebench-v6::seed-2.0-lite",
+                "raw_provenance_label": "unknown",
+                "raw_verified": "false"
+              }
+            },
+            "normalized_result": {
+              "benchmark_family_key": "llm_stats",
+              "benchmark_family_name": "LiveCodeBench v6",
+              "benchmark_parent_key": "llm_stats",
+              "benchmark_parent_name": "LiveCodeBench v6",
+              "benchmark_component_key": "livecodebench_v6",
+              "benchmark_component_name": "Livecodebench V6",
+              "benchmark_leaf_key": "livecodebench_v6",
+              "benchmark_leaf_name": "Livecodebench V6",
+              "slice_key": null,
+              "slice_name": null,
+              "metric_name": "Score",
+              "metric_id": "llm_stats.livecodebench-v6.score",
+              "metric_key": "score",
+              "metric_source": "metric_config",
+              "display_name": "Livecodebench V6 / Score",
+              "canonical_display_name": "Livecodebench V6 / Score",
+              "raw_evaluation_name": "llm_stats.livecodebench-v6",
+              "is_summary_score": false
+            }
+          }
+        ],
+        "benchmark_card": null,
+        "instance_level_data": null,
+        "eval_summary_ids": [
+          "llm_stats_aime_2026",
+          "llm_stats_livecodebench_v6"
+        ]
+      }
+    ]
+  },
+  "evaluation_summaries_by_category": {
+    "coding": [
+      {
+        "eval_summary_id": "llm_stats_livecodebench_v6",
+        "benchmark": "LiveCodeBench v6",
+        "benchmark_family_key": "llm_stats",
+        "benchmark_family_name": "LiveCodeBench v6",
+        "benchmark_parent_key": "llm_stats",
+        "benchmark_parent_name": "LiveCodeBench v6",
+        "benchmark_leaf_key": "livecodebench_v6",
+        "benchmark_leaf_name": "Livecodebench V6",
+        "benchmark_component_key": "livecodebench_v6",
+        "benchmark_component_name": "Livecodebench V6",
+        "evaluation_name": "Livecodebench V6",
+        "display_name": "Livecodebench V6",
+        "canonical_display_name": "Livecodebench V6",
+        "is_summary_score": false,
+        "category": "coding",
+        "source_data": {
+          "dataset_name": "LiveCodeBench v6",
+          "source_type": "url",
+          "url": [
+            "https://llm-stats.com/models/seed-2.0-lite",
+            "https://llm-stats.com/benchmarks/livecodebench-v6",
+            "https://api.llm-stats.com/leaderboard/benchmarks/livecodebench-v6"
+          ],
+          "additional_details": {
+            "raw_benchmark_id": "livecodebench-v6",
+            "raw_model_id": "seed-2.0-lite",
+            "source_role": "aggregator"
+          }
+        },
+        "benchmark_card": {
+          "benchmark_details": {
+            "name": "LiveCodeBench",
+            "overview": "LiveCodeBench is a holistic and contamination-free benchmark for evaluating large language models on code-related capabilities. It assesses a broader range of skills including code generation, self-repair, code execution, and test output prediction. The benchmark collects new problems over time from programming contest platforms to prevent data contamination, currently containing over 500 coding problems published between May 2023 and May 2024.",
+            "data_type": "text",
+            "domains": [
+              "code generation",
+              "programming competitions"
+            ],
+            "languages": [
+              "Not specified"
+            ],
+            "similar_benchmarks": [
+              "HumanEval",
+              "MBPP",
+              "APPS",
+              "DS-1000",
+              "ARCADE",
+              "NumpyEval",
+              "PandasEval",
+              "JuICe",
+              "APIBench",
+              "RepoBench",
+              "ODEX",
+              "SWE-Bench",
+              "GoogleCodeRepo",
+              "RepoEval",
+              "Cocomic-Data"
+            ],
+            "resources": [
+              "https://livecodebench.github.io/",
+              "https://arxiv.org/abs/2403.07974"
+            ],
+            "benchmark_type": "single"
+          },
+          "purpose_and_intended_users": {
+            "goal": "To provide a comprehensive and contamination-free evaluation of large language models for code by assessing a broader range of code-related capabilities beyond just code generation.",
+            "audience": [
+              "Researchers and practitioners in academia and industry who are interested in evaluating the capabilities of large language models for code"
+            ],
+            "tasks": [
+              "Code generation",
+              "Self-repair",
+              "Code execution",
+              "Test output prediction"
+            ],
+            "limitations": "The focus on competition programming problems might not be representative of the most general notion of LLM programming capabilities or real-world, open-ended software development tasks.",
+            "out_of_scope_uses": [
+              "Evaluating performance on real-world, open-ended, and unconstrained user-raised problems"
+            ]
+          },
+          "data": {
+            "source": "The data is collected from coding contests on three platforms: LeetCode, AtCoder, and CodeForces, with problems published between May 2023 and May 2024.",
+            "size": "Over 500 coding problems. Specific subsets include 479 samples from 85 problems for code execution and 442 problem instances from 181 LeetCode problems for test output prediction.",
+            "format": "Includes problem statements, public tests, user solutions, and starter code (for LeetCode). Problems are tagged with difficulty labels (Easy, Medium, Hard) from the platforms.",
+            "annotation": "Difficulty labels are provided by the competition platforms. For the code execution dataset, human-submitted solutions were filtered using compile-time and runtime filters followed by manual inspection to ensure quality."
+          },
+          "methodology": {
+            "methods": [
+              "Models are evaluated in a zero-shot setting across four scenarios: code generation, self-repair, code execution, and test output prediction.",
+              "For code generation and self-repair, program correctness is verified using a set of unseen test cases. For code execution, an execution-based correctness metric compares generated output to ground truth. For test output prediction, generated responses are parsed and equivalence checks are used for grading."
+            ],
+            "metrics": [
+              "Pass@1"
+            ],
+            "calculation": "For each problem, 10 candidate answers are generated. The Pass@1 score is the fraction of problems for which a generated program or answer is correct.",
+            "interpretation": "A higher Pass@1 score indicates better performance.",
+            "baseline_results": "The paper reports results for specific models including GPT-4, GPT-4-Turbo, Claude-3-Opus, Claude-3-Sonnet, and Mistral-L, but specific numerical scores are not provided in the given excerpts.",
+            "validation": "Program correctness for code generation and self-repair is verified using a set of unseen test cases. For code execution, an execution-based correctness metric is used to compare generated output to ground truth. For test output prediction, generated responses are parsed and equivalence checks are used for grading."
+          },
+          "ethical_and_legal_considerations": {
+            "privacy_and_anonymity": "Not specified",
+            "data_licensing": "Not specified",
+            "consent_procedures": "Not specified",
+            "compliance_with_regulations": "The benchmark operates under the Fair Use doctrine (§ 107) for copyrighted works, determining that its use of collected problems for academic, non-profit educational purposes constitutes fair use. It does not train on the collected problems."
+          },
+          "possible_risks": [
+            {
+              "category": "Over- or under-reliance",
+              "description": [
+                "In AI-assisted decision-making tasks, reliance measures how much a person trusts (and potentially acts on) a model's output. Over-reliance occurs when a person puts too much trust in a model, accepting a model's output when the model's output is likely incorrect. Under-reliance is the opposite, where the person doesn't trust the model but should."
+              ],
+              "url": "https://www.ibm.com/docs/en/watsonx/saas?topic=SSYOK8/wsj/ai-risk-atlas/over-or-under-reliance.html"
+            },
+            {
+              "category": "Unrepresentative data",
+              "description": [
+                "Unrepresentative data occurs when the training or fine-tuning data is not sufficiently representative of the underlying population or does not measure the phenomenon of interest. Synthetic data might not fully capture the complexity and nuances of real-world data. Causes include possible limitations in the seed data quality, biases in generation methods, or inadequate domain knowledge. Thus, AI models might struggle to generalize effectively to real-world scenarios."
+              ],
+              "url": "https://www.ibm.com/docs/en/watsonx/saas?topic=SSYOK8/wsj/ai-risk-atlas/unrepresentative-data.html"
+            },
+            {
+              "category": "Data contamination",
+              "description": [
+                "Data contamination occurs when incorrect data is used for training. For example, data that is not aligned with model's purpose or data that is already set aside for other development tasks such as testing and evaluation."
+              ],
+              "url": "https://www.ibm.com/docs/en/watsonx/saas?topic=SSYOK8/wsj/ai-risk-atlas/data-contamination.html"
+            },
+            {
+              "category": "Harmful code generation",
+              "description": [
+                "Models might generate code that causes harm or unintentionally affects other systems."
+              ],
+              "url": "https://www.ibm.com/docs/en/watsonx/saas?topic=SSYOK8/wsj/ai-risk-atlas/harmful-code-generation.html"
+            },
+            {
+              "category": "Reproducibility",
+              "description": [
+                "Replicating agent behavior or output can be impacted by changes or updates made to external services and tools. This impact is increased if the agent is built with generative AI."
+              ],
+              "url": "https://www.ibm.com/docs/en/watsonx/saas?topic=SSYOK8/wsj/ai-risk-atlas/reproducibility-agentic.html"
+            }
+          ],
+          "flagged_fields": {},
+          "missing_fields": []
+        },
+        "tags": {
+          "domains": [
+            "code generation",
+            "programming competitions"
+          ],
+          "languages": [
+            "Not specified"
+          ],
+          "tasks": [
+            "Code generation",
+            "Self-repair",
+            "Code execution",
+            "Test output prediction"
+          ]
+        },
+        "subtasks_count": 0,
+        "metrics_count": 1,
+        "metric_names": [
+          "Score"
+        ],
+        "primary_metric_name": "Score",
+        "evalcards": {
+          "annotations": {
+            "reporting_completeness": {
+              "completeness_score": 0.9285714285714286,
+              "total_fields_evaluated": 28,
+              "missing_required_fields": [
+                "evalcards.lifecycle_status",
+                "evalcards.preregistration_url"
+              ],
+              "partial_fields": [],
+              "field_scores": [
+                {
+                  "field_path": "autobenchmarkcard.benchmark_details.name",
+                  "coverage_type": "full",
+                  "score": 1.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.benchmark_details.overview",
+                  "coverage_type": "full",
+                  "score": 1.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.benchmark_details.data_type",
+                  "coverage_type": "full",
+                  "score": 1.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.benchmark_details.domains",
+                  "coverage_type": "full",
+                  "score": 1.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.benchmark_details.languages",
+                  "coverage_type": "full",
+                  "score": 1.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.benchmark_details.similar_benchmarks",
+                  "coverage_type": "full",
+                  "score": 1.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.benchmark_details.resources",
+                  "coverage_type": "full",
+                  "score": 1.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.purpose_and_intended_users.goal",
+                  "coverage_type": "full",
+                  "score": 1.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.purpose_and_intended_users.audience",
+                  "coverage_type": "full",
+                  "score": 1.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.purpose_and_intended_users.tasks",
+                  "coverage_type": "full",
+                  "score": 1.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.purpose_and_intended_users.limitations",
+                  "coverage_type": "full",
+                  "score": 1.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.purpose_and_intended_users.out_of_scope_uses",
+                  "coverage_type": "full",
+                  "score": 1.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.methodology.methods",
+                  "coverage_type": "full",
+                  "score": 1.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.methodology.metrics",
+                  "coverage_type": "full",
+                  "score": 1.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.methodology.calculation",
+                  "coverage_type": "full",
+                  "score": 1.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.methodology.interpretation",
+                  "coverage_type": "full",
+                  "score": 1.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.methodology.baseline_results",
+                  "coverage_type": "full",
+                  "score": 1.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.methodology.validation",
+                  "coverage_type": "full",
+                  "score": 1.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.ethical_and_legal_considerations.privacy_and_anonymity",
+                  "coverage_type": "full",
+                  "score": 1.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.ethical_and_legal_considerations.data_licensing",
+                  "coverage_type": "full",
+                  "score": 1.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.ethical_and_legal_considerations.consent_procedures",
+                  "coverage_type": "full",
+                  "score": 1.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.ethical_and_legal_considerations.compliance_with_regulations",
+                  "coverage_type": "full",
+                  "score": 1.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.data",
+                  "coverage_type": "partial",
+                  "score": 1.0
+                },
+                {
+                  "field_path": "eee_eval.source_metadata.source_type",
+                  "coverage_type": "full",
+                  "score": 1.0
+                },
+                {
+                  "field_path": "eee_eval.source_metadata.source_organization_name",
+                  "coverage_type": "full",
+                  "score": 1.0
+                },
+                {
+                  "field_path": "eee_eval.source_metadata.evaluator_relationship",
+                  "coverage_type": "full",
+                  "score": 1.0
+                },
+                {
+                  "field_path": "evalcards.lifecycle_status",
+                  "coverage_type": "reserved",
+                  "score": 0.0
+                },
+                {
+                  "field_path": "evalcards.preregistration_url",
+                  "coverage_type": "reserved",
+                  "score": 0.0
+                }
+              ],
+              "signal_version": "1.0"
+            },
+            "benchmark_comparability": {
+              "variant_divergence_groups": [],
+              "cross_party_divergence_groups": []
+            }
+          }
+        },
+        "reproducibility_summary": {
+          "results_total": 1,
+          "has_reproducibility_gap_count": 1,
+          "populated_ratio_avg": 0.0
+        },
+        "provenance_summary": {
+          "total_results": 1,
+          "total_groups": 1,
+          "multi_source_groups": 0,
+          "first_party_only_groups": 1,
+          "source_type_distribution": {
+            "first_party": 1,
+            "third_party": 0,
+            "collaborative": 0,
+            "unspecified": 0
+          }
+        },
+        "comparability_summary": {
+          "total_groups": 1,
+          "groups_with_variant_check": 0,
+          "groups_with_cross_party_check": 0,
+          "variant_divergent_count": 0,
+          "cross_party_divergent_count": 0
+        },
+        "metrics": [
+          {
+            "metric_summary_id": "llm_stats_livecodebench_v6_score",
+            "legacy_eval_summary_id": "llm_stats_llm_stats_livecodebench_v6",
+            "evaluation_name": "llm_stats.livecodebench-v6",
+            "display_name": "Livecodebench V6 / Score",
+            "canonical_display_name": "Livecodebench V6 / Score",
+            "benchmark_leaf_key": "livecodebench_v6",
+            "benchmark_leaf_name": "Livecodebench V6",
+            "slice_key": null,
+            "slice_name": null,
+            "lower_is_better": false,
+            "metric_name": "Score",
+            "metric_id": "llm_stats.livecodebench-v6.score",
+            "metric_key": "score",
+            "metric_source": "metric_config",
+            "metric_config": {
+              "evaluation_description": "LiveCodeBench is a holistic and contamination-free evaluation benchmark for large language models for code. It continuously collects new problems from programming contests (LeetCode, AtCoder, CodeForces) and evaluates four different scenarios: code generation, self-repair, code execution, and test output prediction. Problems are annotated with release dates to enable evaluation on unseen problems released after a model's training cutoff.",
+              "metric_id": "llm_stats.livecodebench-v6.score",
+              "metric_name": "LiveCodeBench v6 score",
+              "metric_kind": "benchmark_score",
+              "metric_unit": "proportion",
+              "lower_is_better": false,
+              "score_type": "continuous",
+              "min_score": 0.0,
+              "max_score": 1.0,
+              "additional_details": {
+                "raw_benchmark_id": "livecodebench-v6",
+                "raw_score_field": "score",
+                "bound_strategy": "inferred_proportion",
+                "raw_name": "LiveCodeBench v6",
+                "raw_categories": "[\"general\",\"reasoning\"]",
+                "raw_modality": "text",
+                "raw_verified": "false",
+                "raw_model_count": "45"
+              }
+            },
+            "models_count": 1,
+            "top_score": 0.817,
+            "model_results": [
+              {
+                "model_id": "bytedance/seed-2-0-lite",
+                "model_route_id": "bytedance__seed-2-0-lite",
+                "model_name": "Seed 2.0 Lite",
+                "developer": "bytedance",
+                "variant_key": "default",
+                "raw_model_id": "bytedance/seed-2.0-lite",
+                "score": 0.817,
+                "evaluation_id": "llm-stats/first_party/bytedance_seed-2.0-lite/1777108064.422824",
+                "retrieved_timestamp": "1777108064.422824",
+                "source_metadata": {
+                  "source_name": "LLM Stats API: first_party scores",
+                  "source_type": "documentation",
+                  "source_organization_name": "LLM Stats",
+                  "source_organization_url": "https://llm-stats.com/",
+                  "evaluator_relationship": "first_party",
+                  "additional_details": {
+                    "models_endpoint": "https://api.llm-stats.com/v1/models",
+                    "benchmarks_endpoint": "https://api.llm-stats.com/leaderboard/benchmarks",
+                    "scores_endpoint": "https://api.llm-stats.com/v1/scores",
+                    "scores_endpoint_fallback": "https://api.llm-stats.com/leaderboard/benchmarks/{benchmark_id}",
+                    "developer_page_url": "https://llm-stats.com/developer",
+                    "attribution_url": "https://llm-stats.com/",
+                    "attribution_required": "true",
+                    "source_role": "aggregator"
+                  }
+                },
+                "source_data": {
+                  "dataset_name": "AIME 2026",
+                  "source_type": "url",
+                  "url": [
+                    "https://llm-stats.com/models/seed-2.0-lite",
+                    "https://llm-stats.com/benchmarks/aime-2026",
+                    "https://api.llm-stats.com/leaderboard/benchmarks/aime-2026"
+                  ],
+                  "additional_details": {
+                    "raw_benchmark_id": "aime-2026",
+                    "raw_model_id": "seed-2.0-lite",
+                    "source_role": "aggregator"
+                  }
+                },
+                "source_record_url": "https://huggingface.co/datasets/evaleval/card_backend/resolve/main/records/bytedance__seed-2-0-lite/llm_stats_first_party_bytedance_seed_2_0_lite_1777108064_422824.json",
+                "detailed_evaluation_results": null,
+                "detailed_evaluation_results_meta": null,
+                "passthrough_top_level_fields": null,
+                "instance_level_data": null,
+                "normalized_result": {
+                  "benchmark_family_key": "llm_stats",
+                  "benchmark_family_name": "LiveCodeBench v6",
+                  "benchmark_parent_key": "llm_stats",
+                  "benchmark_parent_name": "LiveCodeBench v6",
+                  "benchmark_component_key": "livecodebench_v6",
+                  "benchmark_component_name": "Livecodebench V6",
+                  "benchmark_leaf_key": "livecodebench_v6",
+                  "benchmark_leaf_name": "Livecodebench V6",
+                  "slice_key": null,
+                  "slice_name": null,
+                  "metric_name": "Score",
+                  "metric_id": "llm_stats.livecodebench-v6.score",
+                  "metric_key": "score",
+                  "metric_source": "metric_config",
+                  "display_name": "Livecodebench V6 / Score",
+                  "canonical_display_name": "Livecodebench V6 / Score",
+                  "raw_evaluation_name": "llm_stats.livecodebench-v6",
+                  "is_summary_score": false
+                },
+                "evalcards": {
+                  "annotations": {
+                    "reproducibility_gap": {
+                      "has_reproducibility_gap": true,
+                      "missing_fields": [
+                        "temperature",
+                        "max_tokens"
+                      ],
+                      "required_field_count": 2,
+                      "populated_field_count": 0,
+                      "signal_version": "1.0"
+                    },
+                    "provenance": {
+                      "source_type": "first_party",
+                      "is_multi_source": false,
+                      "first_party_only": true,
+                      "distinct_reporting_organizations": 1,
+                      "signal_version": "1.0"
+                    },
+                    "variant_divergence": null,
+                    "cross_party_divergence": null
+                  }
+                }
+              }
+            ]
+          }
+        ],
+        "subtasks": [],
+        "models_count": 1,
+        "top_score": 0.817,
+        "instance_data": {
+          "available": false,
+          "url_count": 0,
+          "sample_urls": [],
+          "models_with_loaded_instances": 0
+        }
+      }
+    ],
+    "other": [
+      {
+        "eval_summary_id": "llm_stats_aime_2026",
+        "benchmark": "AIME 2026",
+        "benchmark_family_key": "llm_stats",
+        "benchmark_family_name": "AIME 2026",
+        "benchmark_parent_key": "llm_stats",
+        "benchmark_parent_name": "AIME 2026",
+        "benchmark_leaf_key": "aime_2026",
+        "benchmark_leaf_name": "Aime 2026",
+        "benchmark_component_key": "aime_2026",
+        "benchmark_component_name": "Aime 2026",
+        "evaluation_name": "Aime 2026",
+        "display_name": "Aime 2026",
+        "canonical_display_name": "Aime 2026",
+        "is_summary_score": false,
+        "category": "other",
+        "source_data": {
+          "dataset_name": "AIME 2026",
+          "source_type": "url",
+          "url": [
+            "https://llm-stats.com/models/seed-2.0-lite",
+            "https://llm-stats.com/benchmarks/aime-2026",
+            "https://api.llm-stats.com/leaderboard/benchmarks/aime-2026"
+          ],
+          "additional_details": {
+            "raw_benchmark_id": "aime-2026",
+            "raw_model_id": "seed-2.0-lite",
+            "source_role": "aggregator"
+          }
+        },
+        "benchmark_card": null,
+        "tags": {
+          "domains": [],
+          "languages": [],
+          "tasks": []
+        },
+        "subtasks_count": 0,
+        "metrics_count": 1,
+        "metric_names": [
+          "Score"
+        ],
+        "primary_metric_name": "Score",
+        "evalcards": {
+          "annotations": {
+            "reporting_completeness": {
+              "completeness_score": 0.10714285714285714,
+              "total_fields_evaluated": 28,
+              "missing_required_fields": [
+                "autobenchmarkcard.benchmark_details.name",
+                "autobenchmarkcard.benchmark_details.overview",
+                "autobenchmarkcard.benchmark_details.data_type",
+                "autobenchmarkcard.benchmark_details.domains",
+                "autobenchmarkcard.benchmark_details.languages",
+                "autobenchmarkcard.benchmark_details.similar_benchmarks",
+                "autobenchmarkcard.benchmark_details.resources",
+                "autobenchmarkcard.purpose_and_intended_users.goal",
+                "autobenchmarkcard.purpose_and_intended_users.audience",
+                "autobenchmarkcard.purpose_and_intended_users.tasks",
+                "autobenchmarkcard.purpose_and_intended_users.limitations",
+                "autobenchmarkcard.purpose_and_intended_users.out_of_scope_uses",
+                "autobenchmarkcard.methodology.methods",
+                "autobenchmarkcard.methodology.metrics",
+                "autobenchmarkcard.methodology.calculation",
+                "autobenchmarkcard.methodology.interpretation",
+                "autobenchmarkcard.methodology.baseline_results",
+                "autobenchmarkcard.methodology.validation",
+                "autobenchmarkcard.ethical_and_legal_considerations.privacy_and_anonymity",
+                "autobenchmarkcard.ethical_and_legal_considerations.data_licensing",
+                "autobenchmarkcard.ethical_and_legal_considerations.consent_procedures",
+                "autobenchmarkcard.ethical_and_legal_considerations.compliance_with_regulations",
+                "autobenchmarkcard.data",
+                "evalcards.lifecycle_status",
+                "evalcards.preregistration_url"
+              ],
+              "partial_fields": [],
+              "field_scores": [
+                {
+                  "field_path": "autobenchmarkcard.benchmark_details.name",
+                  "coverage_type": "full",
+                  "score": 0.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.benchmark_details.overview",
+                  "coverage_type": "full",
+                  "score": 0.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.benchmark_details.data_type",
+                  "coverage_type": "full",
+                  "score": 0.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.benchmark_details.domains",
+                  "coverage_type": "full",
+                  "score": 0.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.benchmark_details.languages",
+                  "coverage_type": "full",
+                  "score": 0.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.benchmark_details.similar_benchmarks",
+                  "coverage_type": "full",
+                  "score": 0.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.benchmark_details.resources",
+                  "coverage_type": "full",
+                  "score": 0.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.purpose_and_intended_users.goal",
+                  "coverage_type": "full",
+                  "score": 0.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.purpose_and_intended_users.audience",
+                  "coverage_type": "full",
+                  "score": 0.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.purpose_and_intended_users.tasks",
+                  "coverage_type": "full",
+                  "score": 0.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.purpose_and_intended_users.limitations",
+                  "coverage_type": "full",
+                  "score": 0.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.purpose_and_intended_users.out_of_scope_uses",
+                  "coverage_type": "full",
+                  "score": 0.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.methodology.methods",
+                  "coverage_type": "full",
+                  "score": 0.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.methodology.metrics",
+                  "coverage_type": "full",
+                  "score": 0.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.methodology.calculation",
+                  "coverage_type": "full",
+                  "score": 0.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.methodology.interpretation",
+                  "coverage_type": "full",
+                  "score": 0.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.methodology.baseline_results",
+                  "coverage_type": "full",
+                  "score": 0.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.methodology.validation",
+                  "coverage_type": "full",
+                  "score": 0.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.ethical_and_legal_considerations.privacy_and_anonymity",
+                  "coverage_type": "full",
+                  "score": 0.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.ethical_and_legal_considerations.data_licensing",
+                  "coverage_type": "full",
+                  "score": 0.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.ethical_and_legal_considerations.consent_procedures",
+                  "coverage_type": "full",
+                  "score": 0.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.ethical_and_legal_considerations.compliance_with_regulations",
+                  "coverage_type": "full",
+                  "score": 0.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.data",
+                  "coverage_type": "partial",
+                  "score": 0.0
+                },
+                {
+                  "field_path": "eee_eval.source_metadata.source_type",
+                  "coverage_type": "full",
+                  "score": 1.0
+                },
+                {
+                  "field_path": "eee_eval.source_metadata.source_organization_name",
+                  "coverage_type": "full",
+                  "score": 1.0
+                },
+                {
+                  "field_path": "eee_eval.source_metadata.evaluator_relationship",
+                  "coverage_type": "full",
+                  "score": 1.0
+                },
+                {
+                  "field_path": "evalcards.lifecycle_status",
+                  "coverage_type": "reserved",
+                  "score": 0.0
+                },
+                {
+                  "field_path": "evalcards.preregistration_url",
+                  "coverage_type": "reserved",
+                  "score": 0.0
+                }
+              ],
+              "signal_version": "1.0"
+            },
+            "benchmark_comparability": {
+              "variant_divergence_groups": [],
+              "cross_party_divergence_groups": []
+            }
+          }
+        },
+        "reproducibility_summary": {
+          "results_total": 1,
+          "has_reproducibility_gap_count": 1,
+          "populated_ratio_avg": 0.0
+        },
+        "provenance_summary": {
+          "total_results": 1,
+          "total_groups": 1,
+          "multi_source_groups": 0,
+          "first_party_only_groups": 1,
+          "source_type_distribution": {
+            "first_party": 1,
+            "third_party": 0,
+            "collaborative": 0,
+            "unspecified": 0
+          }
+        },
+        "comparability_summary": {
+          "total_groups": 1,
+          "groups_with_variant_check": 0,
+          "groups_with_cross_party_check": 0,
+          "variant_divergent_count": 0,
+          "cross_party_divergent_count": 0
+        },
+        "metrics": [
+          {
+            "metric_summary_id": "llm_stats_aime_2026_score",
+            "legacy_eval_summary_id": "llm_stats_llm_stats_aime_2026",
+            "evaluation_name": "llm_stats.aime-2026",
+            "display_name": "Aime 2026 / Score",
+            "canonical_display_name": "Aime 2026 / Score",
+            "benchmark_leaf_key": "aime_2026",
+            "benchmark_leaf_name": "Aime 2026",
+            "slice_key": null,
+            "slice_name": null,
+            "lower_is_better": false,
+            "metric_name": "Score",
+            "metric_id": "llm_stats.aime-2026.score",
+            "metric_key": "score",
+            "metric_source": "metric_config",
+            "metric_config": {
+              "evaluation_description": "All 30 problems from the 2026 American Invitational Mathematics Examination (AIME I and AIME II), testing olympiad-level mathematical reasoning with integer answers from 000-999. Used as an AI benchmark to evaluate large language models' ability to solve complex mathematical problems requiring multi-step logical deductions and structured symbolic reasoning.",
+              "metric_id": "llm_stats.aime-2026.score",
+              "metric_name": "AIME 2026 score",
+              "metric_kind": "benchmark_score",
+              "metric_unit": "proportion",
+              "lower_is_better": false,
+              "score_type": "continuous",
+              "min_score": 0.0,
+              "max_score": 1.0,
+              "additional_details": {
+                "raw_benchmark_id": "aime-2026",
+                "raw_score_field": "score",
+                "bound_strategy": "inferred_proportion",
+                "raw_name": "AIME 2026",
+                "raw_categories": "[\"math\",\"reasoning\"]",
+                "raw_modality": "text",
+                "raw_verified": "false",
+                "raw_model_count": "12"
+              }
+            },
+            "models_count": 1,
+            "top_score": 0.883,
+            "model_results": [
+              {
+                "model_id": "bytedance/seed-2-0-lite",
+                "model_route_id": "bytedance__seed-2-0-lite",
+                "model_name": "Seed 2.0 Lite",
+                "developer": "bytedance",
+                "variant_key": "default",
+                "raw_model_id": "bytedance/seed-2.0-lite",
+                "score": 0.883,
+                "evaluation_id": "llm-stats/first_party/bytedance_seed-2.0-lite/1777108064.422824",
+                "retrieved_timestamp": "1777108064.422824",
+                "source_metadata": {
+                  "source_name": "LLM Stats API: first_party scores",
+                  "source_type": "documentation",
+                  "source_organization_name": "LLM Stats",
+                  "source_organization_url": "https://llm-stats.com/",
+                  "evaluator_relationship": "first_party",
+                  "additional_details": {
+                    "models_endpoint": "https://api.llm-stats.com/v1/models",
+                    "benchmarks_endpoint": "https://api.llm-stats.com/leaderboard/benchmarks",
+                    "scores_endpoint": "https://api.llm-stats.com/v1/scores",
+                    "scores_endpoint_fallback": "https://api.llm-stats.com/leaderboard/benchmarks/{benchmark_id}",
+                    "developer_page_url": "https://llm-stats.com/developer",
+                    "attribution_url": "https://llm-stats.com/",
+                    "attribution_required": "true",
+                    "source_role": "aggregator"
+                  }
+                },
+                "source_data": {
+                  "dataset_name": "AIME 2026",
+                  "source_type": "url",
+                  "url": [
+                    "https://llm-stats.com/models/seed-2.0-lite",
+                    "https://llm-stats.com/benchmarks/aime-2026",
+                    "https://api.llm-stats.com/leaderboard/benchmarks/aime-2026"
+                  ],
+                  "additional_details": {
+                    "raw_benchmark_id": "aime-2026",
+                    "raw_model_id": "seed-2.0-lite",
+                    "source_role": "aggregator"
+                  }
+                },
+                "source_record_url": "https://huggingface.co/datasets/evaleval/card_backend/resolve/main/records/bytedance__seed-2-0-lite/llm_stats_first_party_bytedance_seed_2_0_lite_1777108064_422824.json",
+                "detailed_evaluation_results": null,
+                "detailed_evaluation_results_meta": null,
+                "passthrough_top_level_fields": null,
+                "instance_level_data": null,
+                "normalized_result": {
+                  "benchmark_family_key": "llm_stats",
+                  "benchmark_family_name": "AIME 2026",
+                  "benchmark_parent_key": "llm_stats",
+                  "benchmark_parent_name": "AIME 2026",
+                  "benchmark_component_key": "aime_2026",
+                  "benchmark_component_name": "Aime 2026",
+                  "benchmark_leaf_key": "aime_2026",
+                  "benchmark_leaf_name": "Aime 2026",
+                  "slice_key": null,
+                  "slice_name": null,
+                  "metric_name": "Score",
+                  "metric_id": "llm_stats.aime-2026.score",
+                  "metric_key": "score",
+                  "metric_source": "metric_config",
+                  "display_name": "Aime 2026 / Score",
+                  "canonical_display_name": "Aime 2026 / Score",
+                  "raw_evaluation_name": "llm_stats.aime-2026",
+                  "is_summary_score": false
+                },
+                "evalcards": {
+                  "annotations": {
+                    "reproducibility_gap": {
+                      "has_reproducibility_gap": true,
+                      "missing_fields": [
+                        "temperature",
+                        "max_tokens"
+                      ],
+                      "required_field_count": 2,
+                      "populated_field_count": 0,
+                      "signal_version": "1.0"
+                    },
+                    "provenance": {
+                      "source_type": "first_party",
+                      "is_multi_source": false,
+                      "first_party_only": true,
+                      "distinct_reporting_organizations": 1,
+                      "signal_version": "1.0"
+                    },
+                    "variant_divergence": null,
+                    "cross_party_divergence": null
+                  }
+                }
+              }
+            ]
+          }
+        ],
+        "subtasks": [],
+        "models_count": 1,
+        "top_score": 0.883,
+        "instance_data": {
+          "available": false,
+          "url_count": 0,
+          "sample_urls": [],
+          "models_with_loaded_instances": 0
+        }
+      }
+    ]
+  },
+  "hierarchy_by_category": {
+    "coding": [
+      {
+        "eval_summary_id": "llm_stats_livecodebench_v6",
+        "benchmark": "LiveCodeBench v6",
+        "benchmark_family_key": "llm_stats",
+        "benchmark_family_name": "LiveCodeBench v6",
+        "benchmark_parent_key": "llm_stats",
+        "benchmark_parent_name": "LiveCodeBench v6",
+        "benchmark_leaf_key": "livecodebench_v6",
+        "benchmark_leaf_name": "Livecodebench V6",
+        "benchmark_component_key": "livecodebench_v6",
+        "benchmark_component_name": "Livecodebench V6",
+        "evaluation_name": "Livecodebench V6",
+        "display_name": "Livecodebench V6",
+        "canonical_display_name": "Livecodebench V6",
+        "is_summary_score": false,
+        "category": "coding",
+        "source_data": {
+          "dataset_name": "LiveCodeBench v6",
+          "source_type": "url",
+          "url": [
+            "https://llm-stats.com/models/seed-2.0-lite",
+            "https://llm-stats.com/benchmarks/livecodebench-v6",
+            "https://api.llm-stats.com/leaderboard/benchmarks/livecodebench-v6"
+          ],
+          "additional_details": {
+            "raw_benchmark_id": "livecodebench-v6",
+            "raw_model_id": "seed-2.0-lite",
+            "source_role": "aggregator"
+          }
+        },
+        "benchmark_card": {
+          "benchmark_details": {
+            "name": "LiveCodeBench",
+            "overview": "LiveCodeBench is a holistic and contamination-free benchmark for evaluating large language models on code-related capabilities. It assesses a broader range of skills including code generation, self-repair, code execution, and test output prediction. The benchmark collects new problems over time from programming contest platforms to prevent data contamination, currently containing over 500 coding problems published between May 2023 and May 2024.",
+            "data_type": "text",
+            "domains": [
+              "code generation",
+              "programming competitions"
+            ],
+            "languages": [
+              "Not specified"
+            ],
+            "similar_benchmarks": [
+              "HumanEval",
+              "MBPP",
+              "APPS",
+              "DS-1000",
+              "ARCADE",
+              "NumpyEval",
+              "PandasEval",
+              "JuICe",
+              "APIBench",
+              "RepoBench",
+              "ODEX",
+              "SWE-Bench",
+              "GoogleCodeRepo",
+              "RepoEval",
+              "Cocomic-Data"
+            ],
+            "resources": [
+              "https://livecodebench.github.io/",
+              "https://arxiv.org/abs/2403.07974"
+            ],
+            "benchmark_type": "single"
+          },
+          "purpose_and_intended_users": {
+            "goal": "To provide a comprehensive and contamination-free evaluation of large language models for code by assessing a broader range of code-related capabilities beyond just code generation.",
+            "audience": [
+              "Researchers and practitioners in academia and industry who are interested in evaluating the capabilities of large language models for code"
+            ],
+            "tasks": [
+              "Code generation",
+              "Self-repair",
+              "Code execution",
+              "Test output prediction"
+            ],
+            "limitations": "The focus on competition programming problems might not be representative of the most general notion of LLM programming capabilities or real-world, open-ended software development tasks.",
+            "out_of_scope_uses": [
+              "Evaluating performance on real-world, open-ended, and unconstrained user-raised problems"
+            ]
+          },
+          "data": {
+            "source": "The data is collected from coding contests on three platforms: LeetCode, AtCoder, and CodeForces, with problems published between May 2023 and May 2024.",
+            "size": "Over 500 coding problems. Specific subsets include 479 samples from 85 problems for code execution and 442 problem instances from 181 LeetCode problems for test output prediction.",
+            "format": "Includes problem statements, public tests, user solutions, and starter code (for LeetCode). Problems are tagged with difficulty labels (Easy, Medium, Hard) from the platforms.",
+            "annotation": "Difficulty labels are provided by the competition platforms. For the code execution dataset, human-submitted solutions were filtered using compile-time and runtime filters followed by manual inspection to ensure quality."
+          },
+          "methodology": {
+            "methods": [
+              "Models are evaluated in a zero-shot setting across four scenarios: code generation, self-repair, code execution, and test output prediction.",
+              "For code generation and self-repair, program correctness is verified using a set of unseen test cases. For code execution, an execution-based correctness metric compares generated output to ground truth. For test output prediction, generated responses are parsed and equivalence checks are used for grading."
+            ],
+            "metrics": [
+              "Pass@1"
+            ],
+            "calculation": "For each problem, 10 candidate answers are generated. The Pass@1 score is the fraction of problems for which a generated program or answer is correct.",
+            "interpretation": "A higher Pass@1 score indicates better performance.",
+            "baseline_results": "The paper reports results for specific models including GPT-4, GPT-4-Turbo, Claude-3-Opus, Claude-3-Sonnet, and Mistral-L, but specific numerical scores are not provided in the given excerpts.",
+            "validation": "Program correctness for code generation and self-repair is verified using a set of unseen test cases. For code execution, an execution-based correctness metric is used to compare generated output to ground truth. For test output prediction, generated responses are parsed and equivalence checks are used for grading."
+          },
+          "ethical_and_legal_considerations": {
+            "privacy_and_anonymity": "Not specified",
+            "data_licensing": "Not specified",
+            "consent_procedures": "Not specified",
+            "compliance_with_regulations": "The benchmark operates under the Fair Use doctrine (§ 107) for copyrighted works, determining that its use of collected problems for academic, non-profit educational purposes constitutes fair use. It does not train on the collected problems."
+          },
+          "possible_risks": [
+            {
+              "category": "Over- or under-reliance",
+              "description": [
+                "In AI-assisted decision-making tasks, reliance measures how much a person trusts (and potentially acts on) a model's output. Over-reliance occurs when a person puts too much trust in a model, accepting a model's output when the model's output is likely incorrect. Under-reliance is the opposite, where the person doesn't trust the model but should."
+              ],
+              "url": "https://www.ibm.com/docs/en/watsonx/saas?topic=SSYOK8/wsj/ai-risk-atlas/over-or-under-reliance.html"
+            },
+            {
+              "category": "Unrepresentative data",
+              "description": [
+                "Unrepresentative data occurs when the training or fine-tuning data is not sufficiently representative of the underlying population or does not measure the phenomenon of interest. Synthetic data might not fully capture the complexity and nuances of real-world data. Causes include possible limitations in the seed data quality, biases in generation methods, or inadequate domain knowledge. Thus, AI models might struggle to generalize effectively to real-world scenarios."
+              ],
+              "url": "https://www.ibm.com/docs/en/watsonx/saas?topic=SSYOK8/wsj/ai-risk-atlas/unrepresentative-data.html"
+            },
+            {
+              "category": "Data contamination",
+              "description": [
+                "Data contamination occurs when incorrect data is used for training. For example, data that is not aligned with model's purpose or data that is already set aside for other development tasks such as testing and evaluation."
+              ],
+              "url": "https://www.ibm.com/docs/en/watsonx/saas?topic=SSYOK8/wsj/ai-risk-atlas/data-contamination.html"
+            },
+            {
+              "category": "Harmful code generation",
+              "description": [
+                "Models might generate code that causes harm or unintentionally affects other systems."
+              ],
+              "url": "https://www.ibm.com/docs/en/watsonx/saas?topic=SSYOK8/wsj/ai-risk-atlas/harmful-code-generation.html"
+            },
+            {
+              "category": "Reproducibility",
+              "description": [
+                "Replicating agent behavior or output can be impacted by changes or updates made to external services and tools. This impact is increased if the agent is built with generative AI."
+              ],
+              "url": "https://www.ibm.com/docs/en/watsonx/saas?topic=SSYOK8/wsj/ai-risk-atlas/reproducibility-agentic.html"
+            }
+          ],
+          "flagged_fields": {},
+          "missing_fields": []
+        },
+        "tags": {
+          "domains": [
+            "code generation",
+            "programming competitions"
+          ],
+          "languages": [
+            "Not specified"
+          ],
+          "tasks": [
+            "Code generation",
+            "Self-repair",
+            "Code execution",
+            "Test output prediction"
+          ]
+        },
+        "subtasks_count": 0,
+        "metrics_count": 1,
+        "metric_names": [
+          "Score"
+        ],
+        "primary_metric_name": "Score",
+        "evalcards": {
+          "annotations": {
+            "reporting_completeness": {
+              "completeness_score": 0.9285714285714286,
+              "total_fields_evaluated": 28,
+              "missing_required_fields": [
+                "evalcards.lifecycle_status",
+                "evalcards.preregistration_url"
+              ],
+              "partial_fields": [],
+              "field_scores": [
+                {
+                  "field_path": "autobenchmarkcard.benchmark_details.name",
+                  "coverage_type": "full",
+                  "score": 1.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.benchmark_details.overview",
+                  "coverage_type": "full",
+                  "score": 1.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.benchmark_details.data_type",
+                  "coverage_type": "full",
+                  "score": 1.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.benchmark_details.domains",
+                  "coverage_type": "full",
+                  "score": 1.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.benchmark_details.languages",
+                  "coverage_type": "full",
+                  "score": 1.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.benchmark_details.similar_benchmarks",
+                  "coverage_type": "full",
+                  "score": 1.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.benchmark_details.resources",
+                  "coverage_type": "full",
+                  "score": 1.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.purpose_and_intended_users.goal",
+                  "coverage_type": "full",
+                  "score": 1.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.purpose_and_intended_users.audience",
+                  "coverage_type": "full",
+                  "score": 1.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.purpose_and_intended_users.tasks",
+                  "coverage_type": "full",
+                  "score": 1.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.purpose_and_intended_users.limitations",
+                  "coverage_type": "full",
+                  "score": 1.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.purpose_and_intended_users.out_of_scope_uses",
+                  "coverage_type": "full",
+                  "score": 1.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.methodology.methods",
+                  "coverage_type": "full",
+                  "score": 1.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.methodology.metrics",
+                  "coverage_type": "full",
+                  "score": 1.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.methodology.calculation",
+                  "coverage_type": "full",
+                  "score": 1.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.methodology.interpretation",
+                  "coverage_type": "full",
+                  "score": 1.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.methodology.baseline_results",
+                  "coverage_type": "full",
+                  "score": 1.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.methodology.validation",
+                  "coverage_type": "full",
+                  "score": 1.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.ethical_and_legal_considerations.privacy_and_anonymity",
+                  "coverage_type": "full",
+                  "score": 1.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.ethical_and_legal_considerations.data_licensing",
+                  "coverage_type": "full",
+                  "score": 1.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.ethical_and_legal_considerations.consent_procedures",
+                  "coverage_type": "full",
+                  "score": 1.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.ethical_and_legal_considerations.compliance_with_regulations",
+                  "coverage_type": "full",
+                  "score": 1.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.data",
+                  "coverage_type": "partial",
+                  "score": 1.0
+                },
+                {
+                  "field_path": "eee_eval.source_metadata.source_type",
+                  "coverage_type": "full",
+                  "score": 1.0
+                },
+                {
+                  "field_path": "eee_eval.source_metadata.source_organization_name",
+                  "coverage_type": "full",
+                  "score": 1.0
+                },
+                {
+                  "field_path": "eee_eval.source_metadata.evaluator_relationship",
+                  "coverage_type": "full",
+                  "score": 1.0
+                },
+                {
+                  "field_path": "evalcards.lifecycle_status",
+                  "coverage_type": "reserved",
+                  "score": 0.0
+                },
+                {
+                  "field_path": "evalcards.preregistration_url",
+                  "coverage_type": "reserved",
+                  "score": 0.0
+                }
+              ],
+              "signal_version": "1.0"
+            },
+            "benchmark_comparability": {
+              "variant_divergence_groups": [],
+              "cross_party_divergence_groups": []
+            }
+          }
+        },
+        "reproducibility_summary": {
+          "results_total": 1,
+          "has_reproducibility_gap_count": 1,
+          "populated_ratio_avg": 0.0
+        },
+        "provenance_summary": {
+          "total_results": 1,
+          "total_groups": 1,
+          "multi_source_groups": 0,
+          "first_party_only_groups": 1,
+          "source_type_distribution": {
+            "first_party": 1,
+            "third_party": 0,
+            "collaborative": 0,
+            "unspecified": 0
+          }
+        },
+        "comparability_summary": {
+          "total_groups": 1,
+          "groups_with_variant_check": 0,
+          "groups_with_cross_party_check": 0,
+          "variant_divergent_count": 0,
+          "cross_party_divergent_count": 0
+        },
+        "metrics": [
+          {
+            "metric_summary_id": "llm_stats_livecodebench_v6_score",
+            "legacy_eval_summary_id": "llm_stats_llm_stats_livecodebench_v6",
+            "evaluation_name": "llm_stats.livecodebench-v6",
+            "display_name": "Livecodebench V6 / Score",
+            "canonical_display_name": "Livecodebench V6 / Score",
+            "benchmark_leaf_key": "livecodebench_v6",
+            "benchmark_leaf_name": "Livecodebench V6",
+            "slice_key": null,
+            "slice_name": null,
+            "lower_is_better": false,
+            "metric_name": "Score",
+            "metric_id": "llm_stats.livecodebench-v6.score",
+            "metric_key": "score",
+            "metric_source": "metric_config",
+            "metric_config": {
+              "evaluation_description": "LiveCodeBench is a holistic and contamination-free evaluation benchmark for large language models for code. It continuously collects new problems from programming contests (LeetCode, AtCoder, CodeForces) and evaluates four different scenarios: code generation, self-repair, code execution, and test output prediction. Problems are annotated with release dates to enable evaluation on unseen problems released after a model's training cutoff.",
+              "metric_id": "llm_stats.livecodebench-v6.score",
+              "metric_name": "LiveCodeBench v6 score",
+              "metric_kind": "benchmark_score",
+              "metric_unit": "proportion",
+              "lower_is_better": false,
+              "score_type": "continuous",
+              "min_score": 0.0,
+              "max_score": 1.0,
+              "additional_details": {
+                "raw_benchmark_id": "livecodebench-v6",
+                "raw_score_field": "score",
+                "bound_strategy": "inferred_proportion",
+                "raw_name": "LiveCodeBench v6",
+                "raw_categories": "[\"general\",\"reasoning\"]",
+                "raw_modality": "text",
+                "raw_verified": "false",
+                "raw_model_count": "45"
+              }
+            },
+            "models_count": 1,
+            "top_score": 0.817,
+            "model_results": [
+              {
+                "model_id": "bytedance/seed-2-0-lite",
+                "model_route_id": "bytedance__seed-2-0-lite",
+                "model_name": "Seed 2.0 Lite",
+                "developer": "bytedance",
+                "variant_key": "default",
+                "raw_model_id": "bytedance/seed-2.0-lite",
+                "score": 0.817,
+                "evaluation_id": "llm-stats/first_party/bytedance_seed-2.0-lite/1777108064.422824",
+                "retrieved_timestamp": "1777108064.422824",
+                "source_metadata": {
+                  "source_name": "LLM Stats API: first_party scores",
+                  "source_type": "documentation",
+                  "source_organization_name": "LLM Stats",
+                  "source_organization_url": "https://llm-stats.com/",
+                  "evaluator_relationship": "first_party",
+                  "additional_details": {
+                    "models_endpoint": "https://api.llm-stats.com/v1/models",
+                    "benchmarks_endpoint": "https://api.llm-stats.com/leaderboard/benchmarks",
+                    "scores_endpoint": "https://api.llm-stats.com/v1/scores",
+                    "scores_endpoint_fallback": "https://api.llm-stats.com/leaderboard/benchmarks/{benchmark_id}",
+                    "developer_page_url": "https://llm-stats.com/developer",
+                    "attribution_url": "https://llm-stats.com/",
+                    "attribution_required": "true",
+                    "source_role": "aggregator"
+                  }
+                },
+                "source_data": {
+                  "dataset_name": "AIME 2026",
+                  "source_type": "url",
+                  "url": [
+                    "https://llm-stats.com/models/seed-2.0-lite",
+                    "https://llm-stats.com/benchmarks/aime-2026",
+                    "https://api.llm-stats.com/leaderboard/benchmarks/aime-2026"
+                  ],
+                  "additional_details": {
+                    "raw_benchmark_id": "aime-2026",
+                    "raw_model_id": "seed-2.0-lite",
+                    "source_role": "aggregator"
+                  }
+                },
+                "source_record_url": "https://huggingface.co/datasets/evaleval/card_backend/resolve/main/records/bytedance__seed-2-0-lite/llm_stats_first_party_bytedance_seed_2_0_lite_1777108064_422824.json",
+                "detailed_evaluation_results": null,
+                "detailed_evaluation_results_meta": null,
+                "passthrough_top_level_fields": null,
+                "instance_level_data": null,
+                "normalized_result": {
+                  "benchmark_family_key": "llm_stats",
+                  "benchmark_family_name": "LiveCodeBench v6",
+                  "benchmark_parent_key": "llm_stats",
+                  "benchmark_parent_name": "LiveCodeBench v6",
+                  "benchmark_component_key": "livecodebench_v6",
+                  "benchmark_component_name": "Livecodebench V6",
+                  "benchmark_leaf_key": "livecodebench_v6",
+                  "benchmark_leaf_name": "Livecodebench V6",
+                  "slice_key": null,
+                  "slice_name": null,
+                  "metric_name": "Score",
+                  "metric_id": "llm_stats.livecodebench-v6.score",
+                  "metric_key": "score",
+                  "metric_source": "metric_config",
+                  "display_name": "Livecodebench V6 / Score",
+                  "canonical_display_name": "Livecodebench V6 / Score",
+                  "raw_evaluation_name": "llm_stats.livecodebench-v6",
+                  "is_summary_score": false
+                },
+                "evalcards": {
+                  "annotations": {
+                    "reproducibility_gap": {
+                      "has_reproducibility_gap": true,
+                      "missing_fields": [
+                        "temperature",
+                        "max_tokens"
+                      ],
+                      "required_field_count": 2,
+                      "populated_field_count": 0,
+                      "signal_version": "1.0"
+                    },
+                    "provenance": {
+                      "source_type": "first_party",
+                      "is_multi_source": false,
+                      "first_party_only": true,
+                      "distinct_reporting_organizations": 1,
+                      "signal_version": "1.0"
+                    },
+                    "variant_divergence": null,
+                    "cross_party_divergence": null
+                  }
+                }
+              }
+            ]
+          }
+        ],
+        "subtasks": [],
+        "models_count": 1,
+        "top_score": 0.817,
+        "instance_data": {
+          "available": false,
+          "url_count": 0,
+          "sample_urls": [],
+          "models_with_loaded_instances": 0
+        }
+      }
+    ],
+    "other": [
+      {
+        "eval_summary_id": "llm_stats_aime_2026",
+        "benchmark": "AIME 2026",
+        "benchmark_family_key": "llm_stats",
+        "benchmark_family_name": "AIME 2026",
+        "benchmark_parent_key": "llm_stats",
+        "benchmark_parent_name": "AIME 2026",
+        "benchmark_leaf_key": "aime_2026",
+        "benchmark_leaf_name": "Aime 2026",
+        "benchmark_component_key": "aime_2026",
+        "benchmark_component_name": "Aime 2026",
+        "evaluation_name": "Aime 2026",
+        "display_name": "Aime 2026",
+        "canonical_display_name": "Aime 2026",
+        "is_summary_score": false,
+        "category": "other",
+        "source_data": {
+          "dataset_name": "AIME 2026",
+          "source_type": "url",
+          "url": [
+            "https://llm-stats.com/models/seed-2.0-lite",
+            "https://llm-stats.com/benchmarks/aime-2026",
+            "https://api.llm-stats.com/leaderboard/benchmarks/aime-2026"
+          ],
+          "additional_details": {
+            "raw_benchmark_id": "aime-2026",
+            "raw_model_id": "seed-2.0-lite",
+            "source_role": "aggregator"
+          }
+        },
+        "benchmark_card": null,
+        "tags": {
+          "domains": [],
+          "languages": [],
+          "tasks": []
+        },
+        "subtasks_count": 0,
+        "metrics_count": 1,
+        "metric_names": [
+          "Score"
+        ],
+        "primary_metric_name": "Score",
+        "evalcards": {
+          "annotations": {
+            "reporting_completeness": {
+              "completeness_score": 0.10714285714285714,
+              "total_fields_evaluated": 28,
+              "missing_required_fields": [
+                "autobenchmarkcard.benchmark_details.name",
+                "autobenchmarkcard.benchmark_details.overview",
+                "autobenchmarkcard.benchmark_details.data_type",
+                "autobenchmarkcard.benchmark_details.domains",
+                "autobenchmarkcard.benchmark_details.languages",
+                "autobenchmarkcard.benchmark_details.similar_benchmarks",
+                "autobenchmarkcard.benchmark_details.resources",
+                "autobenchmarkcard.purpose_and_intended_users.goal",
+                "autobenchmarkcard.purpose_and_intended_users.audience",
+                "autobenchmarkcard.purpose_and_intended_users.tasks",
+                "autobenchmarkcard.purpose_and_intended_users.limitations",
+                "autobenchmarkcard.purpose_and_intended_users.out_of_scope_uses",
+                "autobenchmarkcard.methodology.methods",
+                "autobenchmarkcard.methodology.metrics",
+                "autobenchmarkcard.methodology.calculation",
+                "autobenchmarkcard.methodology.interpretation",
+                "autobenchmarkcard.methodology.baseline_results",
+                "autobenchmarkcard.methodology.validation",
+                "autobenchmarkcard.ethical_and_legal_considerations.privacy_and_anonymity",
+                "autobenchmarkcard.ethical_and_legal_considerations.data_licensing",
+                "autobenchmarkcard.ethical_and_legal_considerations.consent_procedures",
+                "autobenchmarkcard.ethical_and_legal_considerations.compliance_with_regulations",
+                "autobenchmarkcard.data",
+                "evalcards.lifecycle_status",
+                "evalcards.preregistration_url"
+              ],
+              "partial_fields": [],
+              "field_scores": [
+                {
+                  "field_path": "autobenchmarkcard.benchmark_details.name",
+                  "coverage_type": "full",
+                  "score": 0.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.benchmark_details.overview",
+                  "coverage_type": "full",
+                  "score": 0.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.benchmark_details.data_type",
+                  "coverage_type": "full",
+                  "score": 0.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.benchmark_details.domains",
+                  "coverage_type": "full",
+                  "score": 0.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.benchmark_details.languages",
+                  "coverage_type": "full",
+                  "score": 0.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.benchmark_details.similar_benchmarks",
+                  "coverage_type": "full",
+                  "score": 0.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.benchmark_details.resources",
+                  "coverage_type": "full",
+                  "score": 0.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.purpose_and_intended_users.goal",
+                  "coverage_type": "full",
+                  "score": 0.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.purpose_and_intended_users.audience",
+                  "coverage_type": "full",
+                  "score": 0.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.purpose_and_intended_users.tasks",
+                  "coverage_type": "full",
+                  "score": 0.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.purpose_and_intended_users.limitations",
+                  "coverage_type": "full",
+                  "score": 0.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.purpose_and_intended_users.out_of_scope_uses",
+                  "coverage_type": "full",
+                  "score": 0.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.methodology.methods",
+                  "coverage_type": "full",
+                  "score": 0.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.methodology.metrics",
+                  "coverage_type": "full",
+                  "score": 0.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.methodology.calculation",
+                  "coverage_type": "full",
+                  "score": 0.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.methodology.interpretation",
+                  "coverage_type": "full",
+                  "score": 0.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.methodology.baseline_results",
+                  "coverage_type": "full",
+                  "score": 0.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.methodology.validation",
+                  "coverage_type": "full",
+                  "score": 0.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.ethical_and_legal_considerations.privacy_and_anonymity",
+                  "coverage_type": "full",
+                  "score": 0.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.ethical_and_legal_considerations.data_licensing",
+                  "coverage_type": "full",
+                  "score": 0.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.ethical_and_legal_considerations.consent_procedures",
+                  "coverage_type": "full",
+                  "score": 0.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.ethical_and_legal_considerations.compliance_with_regulations",
+                  "coverage_type": "full",
+                  "score": 0.0
+                },
+                {
+                  "field_path": "autobenchmarkcard.data",
+                  "coverage_type": "partial",
+                  "score": 0.0
+                },
+                {
+                  "field_path": "eee_eval.source_metadata.source_type",
+                  "coverage_type": "full",
+                  "score": 1.0
+                },
+                {
+                  "field_path": "eee_eval.source_metadata.source_organization_name",
+                  "coverage_type": "full",
+                  "score": 1.0
+                },
+                {
+                  "field_path": "eee_eval.source_metadata.evaluator_relationship",
+                  "coverage_type": "full",
+                  "score": 1.0
+                },
+                {
+                  "field_path": "evalcards.lifecycle_status",
+                  "coverage_type": "reserved",
+                  "score": 0.0
+                },
+                {
+                  "field_path": "evalcards.preregistration_url",
+                  "coverage_type": "reserved",
+                  "score": 0.0
+                }
+              ],
+              "signal_version": "1.0"
+            },
+            "benchmark_comparability": {
+              "variant_divergence_groups": [],
+              "cross_party_divergence_groups": []
+            }
+          }
+        },
+        "reproducibility_summary": {
+          "results_total": 1,
+          "has_reproducibility_gap_count": 1,
+          "populated_ratio_avg": 0.0
+        },
+        "provenance_summary": {
+          "total_results": 1,
+          "total_groups": 1,
+          "multi_source_groups": 0,
+          "first_party_only_groups": 1,
+          "source_type_distribution": {
+            "first_party": 1,
+            "third_party": 0,
+            "collaborative": 0,
+            "unspecified": 0
+          }
+        },
+        "comparability_summary": {
+          "total_groups": 1,
+          "groups_with_variant_check": 0,
+          "groups_with_cross_party_check": 0,
+          "variant_divergent_count": 0,
+          "cross_party_divergent_count": 0
+        },
+        "metrics": [
+          {
+            "metric_summary_id": "llm_stats_aime_2026_score",
+            "legacy_eval_summary_id": "llm_stats_llm_stats_aime_2026",
+            "evaluation_name": "llm_stats.aime-2026",
+            "display_name": "Aime 2026 / Score",
+            "canonical_display_name": "Aime 2026 / Score",
+            "benchmark_leaf_key": "aime_2026",
+            "benchmark_leaf_name": "Aime 2026",
+            "slice_key": null,
+            "slice_name": null,
+            "lower_is_better": false,
+            "metric_name": "Score",
+            "metric_id": "llm_stats.aime-2026.score",
+            "metric_key": "score",
+            "metric_source": "metric_config",
+            "metric_config": {
+              "evaluation_description": "All 30 problems from the 2026 American Invitational Mathematics Examination (AIME I and AIME II), testing olympiad-level mathematical reasoning with integer answers from 000-999. Used as an AI benchmark to evaluate large language models' ability to solve complex mathematical problems requiring multi-step logical deductions and structured symbolic reasoning.",
+              "metric_id": "llm_stats.aime-2026.score",
+              "metric_name": "AIME 2026 score",
+              "metric_kind": "benchmark_score",
+              "metric_unit": "proportion",
+              "lower_is_better": false,
+              "score_type": "continuous",
+              "min_score": 0.0,
+              "max_score": 1.0,
+              "additional_details": {
+                "raw_benchmark_id": "aime-2026",
+                "raw_score_field": "score",
+                "bound_strategy": "inferred_proportion",
+                "raw_name": "AIME 2026",
+                "raw_categories": "[\"math\",\"reasoning\"]",
+                "raw_modality": "text",
+                "raw_verified": "false",
+                "raw_model_count": "12"
+              }
+            },
+            "models_count": 1,
+            "top_score": 0.883,
+            "model_results": [
+              {
+                "model_id": "bytedance/seed-2-0-lite",
+                "model_route_id": "bytedance__seed-2-0-lite",
+                "model_name": "Seed 2.0 Lite",
+                "developer": "bytedance",
+                "variant_key": "default",
+                "raw_model_id": "bytedance/seed-2.0-lite",
+                "score": 0.883,
+                "evaluation_id": "llm-stats/first_party/bytedance_seed-2.0-lite/1777108064.422824",
+                "retrieved_timestamp": "1777108064.422824",
+                "source_metadata": {
+                  "source_name": "LLM Stats API: first_party scores",
+                  "source_type": "documentation",
+                  "source_organization_name": "LLM Stats",
+                  "source_organization_url": "https://llm-stats.com/",
+                  "evaluator_relationship": "first_party",
+                  "additional_details": {
+                    "models_endpoint": "https://api.llm-stats.com/v1/models",
+                    "benchmarks_endpoint": "https://api.llm-stats.com/leaderboard/benchmarks",
+                    "scores_endpoint": "https://api.llm-stats.com/v1/scores",
+                    "scores_endpoint_fallback": "https://api.llm-stats.com/leaderboard/benchmarks/{benchmark_id}",
+                    "developer_page_url": "https://llm-stats.com/developer",
+                    "attribution_url": "https://llm-stats.com/",
+                    "attribution_required": "true",
+                    "source_role": "aggregator"
+                  }
+                },
+                "source_data": {
+                  "dataset_name": "AIME 2026",
+                  "source_type": "url",
+                  "url": [
+                    "https://llm-stats.com/models/seed-2.0-lite",
+                    "https://llm-stats.com/benchmarks/aime-2026",
+                    "https://api.llm-stats.com/leaderboard/benchmarks/aime-2026"
+                  ],
+                  "additional_details": {
+                    "raw_benchmark_id": "aime-2026",
+                    "raw_model_id": "seed-2.0-lite",
+                    "source_role": "aggregator"
+                  }
+                },
+                "source_record_url": "https://huggingface.co/datasets/evaleval/card_backend/resolve/main/records/bytedance__seed-2-0-lite/llm_stats_first_party_bytedance_seed_2_0_lite_1777108064_422824.json",
+                "detailed_evaluation_results": null,
+                "detailed_evaluation_results_meta": null,
+                "passthrough_top_level_fields": null,
+                "instance_level_data": null,
+                "normalized_result": {
+                  "benchmark_family_key": "llm_stats",
+                  "benchmark_family_name": "AIME 2026",
+                  "benchmark_parent_key": "llm_stats",
+                  "benchmark_parent_name": "AIME 2026",
+                  "benchmark_component_key": "aime_2026",
+                  "benchmark_component_name": "Aime 2026",
+                  "benchmark_leaf_key": "aime_2026",
+                  "benchmark_leaf_name": "Aime 2026",
+                  "slice_key": null,
+                  "slice_name": null,
+                  "metric_name": "Score",
+                  "metric_id": "llm_stats.aime-2026.score",
+                  "metric_key": "score",
+                  "metric_source": "metric_config",
+                  "display_name": "Aime 2026 / Score",
+                  "canonical_display_name": "Aime 2026 / Score",
+                  "raw_evaluation_name": "llm_stats.aime-2026",
+                  "is_summary_score": false
+                },
+                "evalcards": {
+                  "annotations": {
+                    "reproducibility_gap": {
+                      "has_reproducibility_gap": true,
+                      "missing_fields": [
+                        "temperature",
+                        "max_tokens"
+                      ],
+                      "required_field_count": 2,
+                      "populated_field_count": 0,
+                      "signal_version": "1.0"
+                    },
+                    "provenance": {
+                      "source_type": "first_party",
+                      "is_multi_source": false,
+                      "first_party_only": true,
+                      "distinct_reporting_organizations": 1,
+                      "signal_version": "1.0"
+                    },
+                    "variant_divergence": null,
+                    "cross_party_divergence": null
+                  }
+                }
+              }
+            ]
+          }
+        ],
+        "subtasks": [],
+        "models_count": 1,
+        "top_score": 0.883,
+        "instance_data": {
+          "available": false,
+          "url_count": 0,
+          "sample_urls": [],
+          "models_with_loaded_instances": 0
+        }
+      }
+    ]
+  },
+  "total_evaluations": 1,
+  "last_updated": "2026-04-25T09:07:44.422824Z",
+  "categories_covered": [
+    "coding",
+    "other"
+  ],
+  "variants": [
+    {
+      "variant_key": "default",
+      "variant_label": "Default",
+      "evaluation_count": 1,
+      "raw_model_ids": [
+        "bytedance/seed-2.0-lite"
+      ],
+      "last_updated": "2026-04-25T09:07:44.422824Z"
+    }
+  ],
+  "reproducibility_summary": {
+    "results_total": 2,
+    "has_reproducibility_gap_count": 2,
+    "populated_ratio_avg": 0.0
+  },
+  "provenance_summary": {
+    "total_results": 2,
+    "total_groups": 2,
+    "multi_source_groups": 0,
+    "first_party_only_groups": 2,
+    "source_type_distribution": {
+      "first_party": 2,
+      "third_party": 0,
+      "collaborative": 0,
+      "unspecified": 0
+    }
+  },
+  "comparability_summary": {
+    "total_groups": 2,
+    "groups_with_variant_check": 0,
+    "groups_with_cross_party_check": 0,
+    "variant_divergent_count": 0,
+    "cross_party_divergent_count": 0
+  }
+}

tests/fixtures/models/google__gemini-3-flash.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tests/fixtures/models/openai__gpt-5-2-pro.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tests/pipeline-contract.test.ts ADDED Viewed

	@@ -0,0 +1,284 @@

+import { readdirSync } from "fs"
+import path from "path"
+import { fileURLToPath } from "url"
+import { describe, expect, it } from "vitest"
+import type { HFEvalDetail, HFEvalModelResult, HFModelDetail, HFModelCardEntry } from "../lib/hf-data"
+import { flattenModelEvaluations } from "../lib/hf-data"
+import { fixtureEntries, loadAllFixtures, walkHierarchyResults } from "./fixtures/loader"
+const TESTS_DIR = path.dirname(fileURLToPath(import.meta.url))
+const FIXTURES_DIR = path.join(TESTS_DIR, "fixtures")
+// Tier A — pipeline contract tests.
+//
+// Each test asserts that the pipeline-emitted artifacts pinned under
+// tests/fixtures/ carry every field the TS code depends on. Because they read
+// PINNED fixtures rather than the live cache, an upstream data refresh cannot
+// make them flap.
+//
+// To check the live cache for drift instead, see tests/upstream-drift.test.ts.
+//
+// Before landing a deletion that depends on a new pipeline guarantee, add a
+// contract for that guarantee here first. A contract should fail loudly,
+// naming the offending fixture and key path, so violations are easy to fix.
+//
+// Category keys accepted under hierarchy_by_category (compared lower-cased).
+const KNOWN_PIPELINE_CATEGORY_KEYS = new Set([
+  "agentic", "coding", "general", "instruction_following", "knowledge",
+  "language_understanding", "other", "reasoning", "safety",
+])
+// Values accepted for source_metadata.evaluator_relationship.
+const VALID_EVALUATOR_RELATIONSHIPS = new Set(["first_party", "other", "third_party"])
+// One contract breach, as collected by the tests and rendered by formatViolations.
+interface Violation {
+  // Id of the fixture the breach was found in.
+  fixture: string
+  // Key path inside that fixture.
+  path: string
+  // Short human-readable description of what is wrong.
+  detail: string
+}
+// Contracts over the pinned model-detail fixtures (fixture group "models").
+// Each test collects every violation before asserting once, so a failure
+// message lists the offending fixture id and key path for all of them.
+describe("Tier A — pipeline contracts (model files)", () => {
+  const models = loadAllFixtures<HFModelDetail>("models")
+  // Presence check only: any falsy source_metadata counts as missing.
+  it("every model_result carries source_metadata", () => {
+    const violations: Violation[] = []
+    for (const { id, data } of models) {
+      for (const { result, path } of walkHierarchyResults<HFEvalModelResult>(data, id)) {
+        if (!result.source_metadata) {
+          violations.push({ fixture: id, path, detail: "missing source_metadata" })
+        }
+      }
+    }
+    expect(violations, formatViolations(violations)).toEqual([])
+  })
+  // A null/undefined relationship is tolerated here; only unexpected values
+  // are reported.
+  it("every source_metadata.evaluator_relationship is in the known set", () => {
+    const violations: Violation[] = []
+    for (const { id, data } of models) {
+      for (const { result, path } of walkHierarchyResults<HFEvalModelResult>(data, id)) {
+        const rel = result.source_metadata?.evaluator_relationship
+        if (rel != null && !VALID_EVALUATOR_RELATIONSHIPS.has(rel)) {
+          violations.push({ fixture: id, path, detail: `unknown evaluator_relationship=${rel}` })
+        }
+      }
+    }
+    expect(violations, formatViolations(violations)).toEqual([])
+  })
+  // Keys are lower-cased before lookup, so casing differences are not violations.
+  it("every hierarchy_by_category key is in PIPELINE_CATEGORY_MAP", () => {
+    const violations: Violation[] = []
+    for (const { id, data } of models) {
+      for (const key of Object.keys(data.hierarchy_by_category ?? {})) {
+        if (!KNOWN_PIPELINE_CATEGORY_KEYS.has(key.toLowerCase())) {
+          violations.push({ fixture: id, path: `hierarchy_by_category.${key}`, detail: "unknown category key" })
+        }
+      }
+    }
+    expect(violations, formatViolations(violations)).toEqual([])
+  })
+  it("every model_result.retrieved_timestamp parses as a valid Date", () => {
+    // Pipeline emits either ISO strings or unix-seconds-as-string. Unix
+    // seconds must be numeric from start to end: Number.parseFloat stops at
+    // the first non-numeric character, so it would let a malformed value such
+    // as "1777108064abc" pass as a valid timestamp.
+    const unixSeconds = /^\d+(?:\.\d+)?$/
+    const violations: Violation[] = []
+    for (const { id, data } of models) {
+      for (const { result, path } of walkHierarchyResults<HFEvalModelResult>(data, id)) {
+        // Fixtures are untyped JSON at runtime, so widen before inspecting.
+        const ts: unknown = result.retrieved_timestamp
+        if (ts == null) continue
+        if (typeof ts !== "string") {
+          // Record a violation with its path instead of throwing on a string method.
+          violations.push({ fixture: id, path: `${path}.retrieved_timestamp`, detail: `not a string: ${typeof ts}` })
+          continue
+        }
+        const dateValue = unixSeconds.test(ts) ? new Date(Number(ts) * 1000) : new Date(ts)
+        if (Number.isNaN(dateValue.getTime())) {
+          violations.push({ fixture: id, path: `${path}.retrieved_timestamp`, detail: `unparseable: ${ts}` })
+        }
+      }
+    }
+    expect(violations, formatViolations(violations)).toEqual([])
+  })
+  // model_route_id must equal model_family_id with every "/" replaced by "__".
+  it("model card has model_family_id matching pipelineSlugify(model_family_id) → model_route_id", () => {
+    const violations: Violation[] = []
+    for (const { id, data } of models) {
+      if (!data.model_family_id) {
+        violations.push({ fixture: id, path: "model_family_id", detail: "missing" })
+        continue
+      }
+      const expected = data.model_family_id.replace(/\//g, "__")
+      if (data.model_route_id !== expected) {
+        violations.push({
+          fixture: id,
+          path: "model_route_id",
+          detail: `${data.model_route_id} !== ${expected} (derived from ${data.model_family_id})`,
+        })
+      }
+    }
+    expect(violations, formatViolations(violations)).toEqual([])
+  })
+  // Cross-check through the adapter: every entry returned by
+  // flattenModelEvaluations must still have source_metadata.
+  it("flattenModelEvaluations output has source_metadata on every evaluation (cross-check)", () => {
+    const violations: Violation[] = []
+    for (const { id, data } of models) {
+      const evaluations = flattenModelEvaluations(data)
+      for (const [idx, evalEntry] of evaluations.entries()) {
+        if (!evalEntry.source_metadata) {
+          violations.push({
+            fixture: id,
+            path: `flattenModelEvaluations(${id})[${idx}]`,
+            detail: "missing source_metadata after flatten",
+          })
+        }
+      }
+    }
+    expect(violations, formatViolations(violations)).toEqual([])
+  })
+})
+// Contracts over the pinned eval-detail fixtures (fixture group "evals").
+describe("Tier A — pipeline contracts (eval-detail files)", () => {
+  const evals = loadAllFixtures<HFEvalDetail>("evals")
+  it("every eval-detail has eval_summary_id, benchmark, benchmark_leaf_name", () => {
+    const requiredFields = ["eval_summary_id", "benchmark", "benchmark_leaf_name"] as const
+    const found: Violation[] = []
+    for (const fixture of evals) {
+      for (const key of requiredFields) {
+        // Truthiness treats undefined, null and "" alike as absent.
+        if (fixture.data[key]) continue
+        found.push({ fixture: fixture.id, path: key, detail: "missing or empty" })
+      }
+    }
+    expect(found, formatViolations(found)).toEqual([])
+  })
+  it("every eval-detail has category as a non-empty string", () => {
+    const found: Violation[] = []
+    for (const { id, data } of evals) {
+      const isNonEmptyString = typeof data.category === "string" && data.category.length > 0
+      if (isNonEmptyString) continue
+      found.push({ fixture: id, path: "category", detail: `not a non-empty string: ${data.category}` })
+    }
+    expect(found, formatViolations(found)).toEqual([])
+  })
+  it("every model_result in eval-detail metrics carries source_metadata", () => {
+    const found: Violation[] = []
+    for (const { id, data } of evals) {
+      const metrics = data.metrics ?? []
+      metrics.forEach((metric, metricIdx) => {
+        const rows = metric.model_results ?? []
+        rows.forEach((row, resultIdx) => {
+          if (row.source_metadata) return
+          found.push({
+            fixture: id,
+            path: `metrics[${metricIdx}].model_results[${resultIdx}]`,
+            detail: "missing source_metadata",
+          })
+        })
+      })
+    }
+    expect(found, formatViolations(found)).toEqual([])
+  })
+  it("every metric has metric_summary_id and metric_name", () => {
+    const found: Violation[] = []
+    for (const { id, data } of evals) {
+      const metrics = data.metrics ?? []
+      metrics.forEach((metric, metricIdx) => {
+        // Report the two fields independently so one metric can yield two violations.
+        const prefix = `metrics[${metricIdx}]`
+        if (!metric.metric_summary_id) {
+          found.push({ fixture: id, path: `${prefix}.metric_summary_id`, detail: "missing" })
+        }
+        if (!metric.metric_name) {
+          found.push({ fixture: id, path: `${prefix}.metric_name`, detail: "missing" })
+        }
+      })
+    }
+    expect(found, formatViolations(found)).toEqual([])
+  })
+})
+// Contracts over the pinned model-card list fixtures (fixture group "model_cards").
+describe("Tier A — pipeline contracts (model card list entries)", () => {
+  const cards = loadAllFixtures<HFModelCardEntry>("model_cards")
+  it("model card has model_route_id === pipelineSlugify(model_family_id)", () => {
+    const found: Violation[] = []
+    for (const { id, data } of cards) {
+      const familyId = data.model_family_id
+      if (familyId) {
+        // The route id is the family id with every "/" turned into "__".
+        const expected = familyId.split("/").join("__")
+        const actual = data.model_route_id
+        if (actual !== expected) {
+          found.push({ fixture: id, path: "model_route_id", detail: `${actual} !== ${expected}` })
+        }
+      } else {
+        found.push({ fixture: id, path: "model_family_id", detail: "missing" })
+      }
+    }
+    expect(found, formatViolations(found)).toEqual([])
+  })
+})
+// Contracts over the pinned developer fixtures (fixture group "developers").
+describe("Tier A — pipeline contracts (developer files)", () => {
+  const developers = loadAllFixtures<{ developer: string; models: HFModelCardEntry[] }>("developers")
+  it("every developer payload has developer + models[]", () => {
+    const found: Violation[] = []
+    for (const entry of developers) {
+      const { developer, models } = entry.data
+      if (!developer) {
+        found.push({ fixture: entry.id, path: "developer", detail: "missing" })
+      }
+      if (!Array.isArray(models)) {
+        found.push({ fixture: entry.id, path: "models", detail: "not an array" })
+      }
+    }
+    expect(found, formatViolations(found)).toEqual([])
+  })
+  it("every model in developer.models has model_family_id", () => {
+    const found: Violation[] = []
+    for (const entry of developers) {
+      // A null/undefined models list is treated as empty here; its shape is
+      // checked by the test above.
+      const modelList = entry.data.models ?? []
+      modelList.forEach((model, modelIdx) => {
+        if (model.model_family_id) return
+        found.push({ fixture: entry.id, path: `models[${modelIdx}].model_family_id`, detail: "missing" })
+      })
+    }
+    expect(found, formatViolations(found)).toEqual([])
+  })
+})
+describe("Tier A — fixture inventory", () => {
+  // Two-way consistency between the fixture directories and the manifest:
+  // (a) a .json file on disk that the manifest does not list is an orphan,
+  // (b) a manifest entry whose .json file is absent on disk is missing.
+  // Only the file names are compared; fixture contents are covered by the
+  // contract tests above.
+  it("fixture files match the manifest exactly (no orphans, no missing)", () => {
+    // Manifest group name paired with its directory under tests/fixtures/.
+    const layout = [
+      { group: "evals", dirName: "evals" },
+      { group: "models", dirName: "models" },
+      { group: "developers", dirName: "developers" },
+      { group: "model_cards", dirName: "model-cards" },
+    ] as const
+    const orphans: string[] = []
+    const missing: string[] = []
+    for (const { group, dirName } of layout) {
+      const jsonNames = readdirSync(path.join(FIXTURES_DIR, dirName)).filter((name) => name.endsWith(".json"))
+      const onDisk = new Set(jsonNames)
+      const manifestNames = fixtureEntries(group).map((entry) => `${entry.id}.json`)
+      const inManifest = new Set(manifestNames)
+      for (const name of onDisk) {
+        if (inManifest.has(name)) continue
+        orphans.push(`${group}/${name}`)
+      }
+      for (const name of inManifest) {
+        if (onDisk.has(name)) continue
+        missing.push(`${group}/${name}`)
+      }
+    }
+    expect({ orphans, missing }).toEqual({ orphans: [], missing: [] })
+  })
+})
+// Renders violations as a multi-line failure message for expect().
+// Returns "" when there are none. Otherwise the message is a count header,
+// one line per violation for the first ten, then a final line that is either
+// an "…and N more" note or empty.
+function formatViolations(violations: Violation[]): string {
+  const total = violations.length
+  if (total === 0) {
+    return ""
+  }
+  const shownLimit = 10
+  const lines = [`\n${total} contract violation(s):`]
+  for (const v of violations.slice(0, shownLimit)) {
+    lines.push(`  ${v.fixture} :: ${v.path} — ${v.detail}`)
+  }
+  // The last element is always pushed, so the message ends with a newline
+  // when nothing was elided.
+  lines.push(total > shownLimit ? `\n  …and ${total - shownLimit} more` : "")
+  return lines.join("\n")
+}

tests/upstream-drift.test.ts ADDED Viewed

	@@ -0,0 +1,129 @@

+import fs from "fs"
+import path from "path"
+import { describe, expect, it } from "vitest"
+import type { HFEvalDetail, HFEvalModelResult, HFModelDetail } from "../lib/hf-data"
+import { listLiveCacheFiles, loadLiveCacheFile, walkHierarchyResults } from "./fixtures/loader"
+// Tier A — drift detection variant. Runs the same shape of contracts as
+// pipeline-contract.test.ts but against the LIVE .cache/hf-data/ directory
+// rather than pinned fixtures. Opt-in via `pnpm test:drift`. NOT included in
+// the default `pnpm test` run because (a) it requires the cache to be primed
+// and (b) flapping on every upstream refresh defeats the purpose of the pin.
+//
+// Use this when:
+// - You suspect upstream has changed (the existing JSON path behaves oddly).
+// - Before a `pnpm refresh-fixtures` you want to know what the pin will see.
+// - As a periodic sanity check (CI nightly, manual).
+// Category keys accepted under hierarchy_by_category (compared lower-cased).
+const KNOWN_PIPELINE_CATEGORY_KEYS = new Set([
+  "agentic",
+  "reasoning",
+  "general",
+  "safety",
+  "knowledge",
+  "other",
+  "coding",
+  "instruction_following",
+  "language_understanding",
+])
+// Values accepted for source_metadata.evaluator_relationship.
+const VALID_EVALUATOR_RELATIONSHIPS = new Set(["first_party", "third_party", "other"])
+const modelFiles = listLiveCacheFiles("models")
+const evalFiles = listLiveCacheFiles("evals")
+// Two conditions gate the drift suite: the RUN_DRIFT=1 opt-in (set by
+// `pnpm test:drift`), which keeps it out of the default `pnpm test` run, and
+// a cache that actually contains both model and eval files.
+const driftRequested = process.env.RUN_DRIFT === "1"
+const cachePopulated = modelFiles.length > 0 && evalFiles.length > 0
+const shouldRun = driftRequested && cachePopulated
+// Live-cache variant of the Tier A contracts. Each test keeps running counts
+// and at most five example locations, then asserts the count is zero with the
+// examples in the failure message.
+describe.skipIf(!shouldRun)(`Tier A drift — live cache contracts (${modelFiles.length} models, ${evalFiles.length} evals)`, () => {
+  it("every model_result in every model file carries source_metadata", () => {
+    const samples: string[] = []
+    let total = 0
+    let lacking = 0
+    for (const file of modelFiles) {
+      const detail = loadLiveCacheFile<HFModelDetail>("models", file)
+      for (const { result, path: where } of walkHierarchyResults<HFEvalModelResult>(detail, file)) {
+        total += 1
+        if (result.source_metadata) continue
+        lacking += 1
+        if (samples.length < 5) samples.push(where)
+      }
+    }
+    expect(lacking, `${lacking}/${total} rows lack source_metadata. Examples:\n  ${samples.join("\n  ")}`).toBe(0)
+  })
+  it("every hierarchy_by_category key across all models is in PIPELINE_CATEGORY_MAP", () => {
+    // Unknown key -> number of model files it appeared in.
+    const unknownKeys = new Map<string, number>()
+    for (const file of modelFiles) {
+      const detail = loadLiveCacheFile<HFModelDetail>("models", file)
+      const categoryKeys = Object.keys(detail.hierarchy_by_category ?? {})
+      for (const key of categoryKeys) {
+        if (KNOWN_PIPELINE_CATEGORY_KEYS.has(key.toLowerCase())) continue
+        unknownKeys.set(key, (unknownKeys.get(key) ?? 0) + 1)
+      }
+    }
+    const summary = [...unknownKeys].map(([key, count]) => `${key}=${count}`).join(", ")
+    expect(unknownKeys.size, `Unknown keys (key=count): ${summary}`).toBe(0)
+  })
+  it("every model_result in every eval-detail carries source_metadata", () => {
+    const samples: string[] = []
+    let total = 0
+    let lacking = 0
+    for (const file of evalFiles) {
+      const detail = loadLiveCacheFile<HFEvalDetail>("evals", file)
+      const metrics = detail.metrics ?? []
+      metrics.forEach((metric, metricIdx) => {
+        const rows = metric.model_results ?? []
+        rows.forEach((row, resultIdx) => {
+          total += 1
+          if (row.source_metadata) return
+          lacking += 1
+          if (samples.length < 5) samples.push(`${file} metrics[${metricIdx}].model_results[${resultIdx}]`)
+        })
+      })
+    }
+    expect(lacking, `${lacking}/${total} eval-detail rows lack source_metadata. Examples:\n  ${samples.join("\n  ")}`).toBe(0)
+  })
+  it("every eval-detail has a non-empty category", () => {
+    const samples: string[] = []
+    let lacking = 0
+    for (const file of evalFiles) {
+      const detail = loadLiveCacheFile<HFEvalDetail>("evals", file)
+      const hasCategory = typeof detail.category === "string" && detail.category.length > 0
+      if (hasCategory) continue
+      lacking += 1
+      if (samples.length < 5) samples.push(file)
+    }
+    expect(lacking, `${lacking} eval-details without category. Examples: ${samples.join(", ")}`).toBe(0)
+  })
+  it("every model card has model_route_id === pipelineSlugify(model_family_id)", () => {
+    // The card list is read straight from the cache as a single JSON file.
+    const cardsPath = path.resolve(import.meta.dirname, "..", ".cache", "hf-data", "model-cards.json")
+    const cards = JSON.parse(fs.readFileSync(cardsPath, "utf8"))
+    const samples: string[] = []
+    let mismatched = 0
+    for (const card of cards) {
+      // A missing family id yields "" as the expected route id.
+      const expected = (card.model_family_id || "").replace(/\//g, "__")
+      if (card.model_route_id === expected) continue
+      mismatched += 1
+      if (samples.length < 5) samples.push(`${card.model_route_id} (expected ${expected})`)
+    }
+    expect(mismatched, `${mismatched}/${cards.length} mismatches. Examples: ${samples.join(", ")}`).toBe(0)
+  })
+  it("every source_metadata.evaluator_relationship is in {first_party, third_party, other}", () => {
+    // Unexpected value -> number of rows carrying it. Null/undefined is tolerated.
+    const unexpected = new Map<string, number>()
+    for (const file of evalFiles) {
+      const detail = loadLiveCacheFile<HFEvalDetail>("evals", file)
+      for (const metric of detail.metrics ?? []) {
+        for (const row of metric.model_results ?? []) {
+          const relationship = row.source_metadata?.evaluator_relationship
+          if (relationship == null || VALID_EVALUATOR_RELATIONSHIPS.has(relationship)) continue
+          unexpected.set(relationship, (unexpected.get(relationship) ?? 0) + 1)
+        }
+      }
+    }
+    const summary = [...unexpected].map(([value, count]) => `${value}=${count}`).join(", ")
+    expect(unexpected.size, `Unknown evaluator_relationship values: ${summary}`).toBe(0)
+  })
+})