Jenny Chim Claude Opus 4.7 (1M context) committed on
Commit
d3cbe09
·
1 Parent(s): 2fcae3f

Add three-tier test infrastructure for migration safety

Browse files

Tier A — pipeline contract tests (tests/pipeline-contract.test.ts, ~14
tests against pinned tests/fixtures/) assert that every field the TS code
depends on is present in upstream artifacts. tests/upstream-drift.test.ts
runs the same shape against the live .cache/hf-data/ (5830 models, 587
evals); opt-in via `pnpm test:drift` (RUN_DRIFT=1 env gate) so it doesn't
flap on every upstream refresh.

Tier B — adapter snapshot tests (tests/adapters.test.ts) snapshot the
output of hfModelCardToEvaluationCardData, hfEvalDetailToSummary,
flattenModelEvaluations, and hfDeveloperDetailToSummary against ~17
hand-curated fixtures. Large outputs use a digest (count + distinct sets +
sha256 of full output) so the snapshot stays reviewable.

Tier C — full-cache differential audit (scripts/audit-adapters.mjs) runs
every adapter against either pinned fixtures or the full live cache,
produces a deterministic JSON digest, and supports `--diff` mode for
before/after comparison of a code change. Catches distribution shifts that
fixture-based tests can't surface.

Fixture management: tests/fixtures/manifest.json catalogs each fixture
with a `why` annotation explaining the code path it exercises (multi-
variant model, first-party Mercor, third-party Artificial Analysis,
Safety regression-bait, coding hierarchy key, etc). `pnpm refresh-fixtures`
re-pins from .cache/hf-data/. Each fixture is small enough to review in a
PR diff. The fixture-vs-manifest consistency test catches both orphan
files and missing files.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

scripts/audit-adapters.mjs ADDED
@@ -0,0 +1,334 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env node
2
+ // Tier C — full-cache differential audit.
3
+ //
4
+ // Runs every major adapter against either pinned fixtures or the live HF
5
+ // cache, produces a deterministic JSON digest (per-adapter outputs_count,
6
+ // outputs_hash, field distributions, invariant violation counts), and
7
+ // supports a diff mode to compare two digests side-by-side.
8
+ //
9
+ // Usage:
10
+ // node scripts/audit-adapters.mjs --output baseline.json # capture digest
11
+ // node scripts/audit-adapters.mjs --output candidate.json # after a change
12
+ // node scripts/audit-adapters.mjs --diff baseline.json candidate.json
13
+ // node scripts/audit-adapters.mjs --against tests/fixtures # use pinned set
14
+ // node scripts/audit-adapters.mjs --against .cache/hf-data --output live.json
15
+ //
16
+ // Default --against is .cache/hf-data (the full production cache snapshot).
17
+ // `--against tests/fixtures` falls back to manifest-listed IDs only.
18
+ //
19
+ // The script imports the same adapter functions the runtime uses, so output
20
+ // changes when either adapter logic changes OR input data changes. Use diff
21
+ // mode to separate the two: re-run with the same --against before and after a
22
+ // code change, diff the digests.
23
+
24
+ import "./server-only-shim.mjs"
25
+
26
+ import { promises as fs } from "fs"
27
+ import { createHash } from "crypto"
28
+ import path from "path"
29
+
30
// Repository root; every relative location below is resolved against it so the
// script behaves the same regardless of the current working directory.
const ROOT = path.resolve(import.meta.dirname, "..")
const DEFAULT_SOURCE = ".cache/hf-data"
const PINNED_FIXTURES_DIR = path.resolve(ROOT, "tests/fixtures")

const args = parseArgs(process.argv.slice(2))

// Diff mode only compares two previously written digests, then exits.
if (args.diff) {
  const [baselinePath, candidatePath] = args.diff
  await runDiff(baselinePath, candidatePath)
  process.exit(0)
}

const sourceDir = path.resolve(ROOT, args.against ?? DEFAULT_SOURCE)
await ensureDir(sourceDir)

// Progress messages go to stderr: without --output the digest itself is
// printed on stdout, and mixing the two would make `> digest.json` invalid JSON.
console.error(`[audit] reading from ${sourceDir}`)

// Lazy-load adapters (original note: "AFTER tsx is registered").
// NOTE(review): nothing in this file registers tsx — presumably the script is
// launched through a TypeScript-aware loader; confirm.
const { flattenModelEvaluations } = await import("../lib/hf-data.ts")
const {
  hfModelCardToEvaluationCardData,
  hfEvalDetailToSummary,
  hfDeveloperDetailToSummary,
} = await import("../lib/model-data.ts")

// Compare resolved paths rather than the raw flag text, so `./tests/fixtures`,
// `tests/fixtures/` and absolute paths all select the manifest-driven loader.
const usePinnedFixtures = sourceDir === PINNED_FIXTURES_DIR
const { evals, models, developers, modelCards } = await loadInputs(sourceDir, usePinnedFixtures)

console.error(`[audit] inputs: ${evals.length} evals, ${models.length} models, ${developers.length} developers, ${modelCards.length} model cards`)

const digest = {
  version: 1,
  source: args.against ?? DEFAULT_SOURCE,
  generated_at: new Date().toISOString(),
  inputs: {
    evals: evals.length,
    models: models.length,
    developers: developers.length,
    model_cards: modelCards.length,
  },
  adapters: {
    hfModelCardToEvaluationCardData: auditAdapter(modelCards, (entry) => entry.model_route_id, hfModelCardToEvaluationCardData, {
      categorical: ["developer"],
      numeric: ["evaluations_count", "benchmarks_count", "variant_count", "evaluator_count"],
    }),
    hfEvalDetailToSummary: auditAdapter(evals, (entry) => entry.eval_summary_id, hfEvalDetailToSummary, {
      categorical: ["category"],
      numeric: ["models_count", "metrics_count", "subtasks_count"],
    }),
    // The full evaluations are hashed (so a score/timestamp/metric_name change
    // is detected); only a few fields are projected for distribution tracking
    // so the per-field histograms stay readable.
    flattenModelEvaluations: auditAdapter(models, (entry) => entry.model_route_id, (input) => flattenModelEvaluations(input), {
      categorical: ["category"],
      numeric: [],
      arrayOutput: true,
      // Distribution-only projections; they do not influence the hash because
      // the complete items are hashed.
      categoricalGetters: {
        evaluator_relationship: (e) => e.source_metadata?.evaluator_relationship,
        benchmark_family_key: (e) => e.benchmark_family_key,
      },
    }),
    hfDeveloperDetailToSummary: auditAdapter(developers, (entry) => entry.developer, hfDeveloperDetailToSummary, {
      categorical: ["developer"],
      numeric: ["model_count", "benchmark_count", "evaluation_count"],
    }),
  },
}

const serializedDigest = JSON.stringify(digest, null, 2)
if (args.output) {
  await fs.writeFile(args.output, `${serializedDigest}\n`)
  console.error(`[audit] wrote ${args.output}`)
} else {
  console.log(serializedDigest)
}
106
+
107
+ // -----------------------------------------------------------------------------
108
+
109
+ function auditAdapter(inputs, getId, adapter, opts) {
110
+ const fieldValues = {}
111
+ for (const field of opts.categorical) fieldValues[field] = new Map()
112
+ for (const field of opts.numeric) fieldValues[field] = []
113
+ const getters = opts.categoricalGetters ?? {}
114
+ for (const field of Object.keys(getters)) fieldValues[field] = new Map()
115
+
116
+ let outputsHash = createHash("sha256")
117
+ let throws = 0
118
+ const throwsExamples = []
119
+ let outputsCount = 0
120
+
121
+ for (const input of inputs) {
122
+ const id = getId(input) ?? "<no-id>"
123
+ let output
124
+ try {
125
+ output = adapter(input)
126
+ } catch (err) {
127
+ throws += 1
128
+ if (throwsExamples.length < 5) {
129
+ throwsExamples.push({ id, error: err instanceof Error ? err.message : String(err) })
130
+ }
131
+ continue
132
+ }
133
+
134
+ const items = opts.arrayOutput ? output : [output]
135
+ outputsCount += opts.arrayOutput ? items.length : 1
136
+
137
+ for (const item of items) {
138
+ // Hash the full item for change-detection — every leaf value contributes.
139
+ outputsHash.update(JSON.stringify(stableSort(item)))
140
+
141
+ for (const field of opts.categorical) {
142
+ const v = String(item?.[field] ?? "<missing>")
143
+ const counts = fieldValues[field]
144
+ counts.set(v, (counts.get(v) ?? 0) + 1)
145
+ }
146
+ for (const field of opts.numeric) {
147
+ const v = item?.[field]
148
+ if (typeof v === "number" && Number.isFinite(v)) fieldValues[field].push(v)
149
+ }
150
+ for (const [field, getter] of Object.entries(getters)) {
151
+ const v = String(getter(item) ?? "<missing>")
152
+ fieldValues[field].set(v, (fieldValues[field].get(v) ?? 0) + 1)
153
+ }
154
+ }
155
+ }
156
+
157
+ const distributions = {}
158
+ for (const field of [...opts.categorical, ...Object.keys(getters)]) {
159
+ distributions[field] = Object.fromEntries(
160
+ [...fieldValues[field].entries()].sort(([a], [b]) => a.localeCompare(b))
161
+ )
162
+ }
163
+ for (const field of opts.numeric) {
164
+ const arr = fieldValues[field]
165
+ if (arr.length === 0) {
166
+ distributions[field] = { count: 0 }
167
+ continue
168
+ }
169
+ const sorted = [...arr].sort((a, b) => a - b)
170
+ distributions[field] = {
171
+ count: arr.length,
172
+ sum: sorted.reduce((a, b) => a + b, 0),
173
+ min: sorted[0],
174
+ max: sorted[sorted.length - 1],
175
+ median: sorted[Math.floor(sorted.length / 2)],
176
+ }
177
+ }
178
+
179
+ return {
180
+ inputs_count: inputs.length,
181
+ outputs_count: outputsCount,
182
+ outputs_hash: `sha256:${outputsHash.digest("hex").slice(0, 16)}`,
183
+ throws,
184
+ throws_examples: throwsExamples,
185
+ field_distributions: distributions,
186
+ }
187
+ }
188
+
189
+ function stableSort(value) {
190
+ if (Array.isArray(value)) return value.map(stableSort)
191
+ if (value && typeof value === "object") {
192
+ return Object.fromEntries(
193
+ Object.entries(value)
194
+ .sort(([a], [b]) => a.localeCompare(b))
195
+ .map(([k, v]) => [k, stableSort(v)])
196
+ )
197
+ }
198
+ return value
199
+ }
200
+
201
/**
 * Loads adapter inputs from `sourceDir`, choosing the manifest-driven fixture
 * loader when `isPinnedFixtures` is truthy and the cache loader otherwise.
 * Resolves to `{ evals, models, developers, modelCards }`.
 */
async function loadInputs(sourceDir, isPinnedFixtures) {
  const loader = isPinnedFixtures ? loadFromFixtures : loadFromCache
  return loader(sourceDir)
}
207
+
208
/**
 * Reads `<sourceDir>/manifest.json` and loads, in manifest order, the JSON file
 * `<sub-directory>/<entry.id>.json` for every listed entry. A manifest group
 * that is absent is treated as empty; a listed file that cannot be read or
 * parsed rejects the returned promise.
 *
 * @returns `{ evals, models, developers, modelCards }` arrays of parsed JSON.
 */
async function loadFromFixtures(sourceDir) {
  const readJson = async (...segments) =>
    JSON.parse(await fs.readFile(path.join(sourceDir, ...segments), "utf8"))

  const manifest = await readJson("manifest.json")

  // [manifest key, fixture sub-directory, key in the returned object]
  const layout = [
    ["evals", "evals", "evals"],
    ["models", "models", "models"],
    ["developers", "developers", "developers"],
    ["model_cards", "model-cards", "modelCards"],
  ]

  const groups = { evals: [], models: [], developers: [], modelCards: [] }
  for (const [manifestKey, dirName, groupKey] of layout) {
    const listed = manifest[manifestKey] ?? []
    for (const entry of listed) {
      groups[groupKey].push(await readJson(dirName, `${entry.id}.json`))
    }
  }
  return groups
}
225
+
226
/**
 * Loads every `*.json` file under `<sourceDir>/evals`, `/models` and
 * `/developers`, plus the array stored in `<sourceDir>/model-cards.json`.
 *
 * File names are sorted (UTF-16 code-unit order) before reading: directory
 * listing order is not guaranteed, and the audit hash is computed over inputs
 * in the order returned here, so an unsorted listing could change the digest
 * for identical data.
 *
 * A missing sub-directory yields an empty group and a missing
 * model-cards.json yields an empty list (best-effort, as before); a file that
 * exists but is not valid JSON rejects the returned promise.
 *
 * @returns `{ evals, models, developers, modelCards }`.
 */
async function loadFromCache(sourceDir) {
  const groups = { evals: [], models: [], developers: [], modelCards: [] }

  for (const group of ["evals", "models", "developers"]) {
    const groupDir = path.join(sourceDir, group)
    const listing = await fs.readdir(groupDir).catch(() => [])
    const jsonFiles = listing.filter((file) => file.endsWith(".json")).sort()
    for (const file of jsonFiles) {
      groups[group].push(JSON.parse(await fs.readFile(path.join(groupDir, file), "utf8")))
    }
  }

  const modelCardsRaw = await fs.readFile(path.join(sourceDir, "model-cards.json"), "utf8").catch(() => "[]")
  groups.modelCards = JSON.parse(modelCardsRaw)
  return groups
}
248
+
249
+ async function runDiff(baselinePath, candidatePath) {
250
+ const baseline = JSON.parse(await fs.readFile(baselinePath, "utf8"))
251
+ const candidate = JSON.parse(await fs.readFile(candidatePath, "utf8"))
252
+
253
+ console.log(`baseline: ${baseline.source} @ ${baseline.generated_at}`)
254
+ console.log(`candidate: ${candidate.source} @ ${candidate.generated_at}`)
255
+ console.log()
256
+
257
+ const adapterNames = new Set([...Object.keys(baseline.adapters ?? {}), ...Object.keys(candidate.adapters ?? {})])
258
+ for (const name of [...adapterNames].sort()) {
259
+ const b = baseline.adapters?.[name]
260
+ const c = candidate.adapters?.[name]
261
+ if (!b || !c) {
262
+ console.log(`${name}: ${b ? "removed" : "added"}`)
263
+ continue
264
+ }
265
+
266
+ const lines = []
267
+ if (b.outputs_hash !== c.outputs_hash) lines.push(` hash: ${b.outputs_hash} → ${c.outputs_hash}`)
268
+ if (b.outputs_count !== c.outputs_count) lines.push(` outputs: ${b.outputs_count} → ${c.outputs_count}`)
269
+ if (b.throws !== c.throws) lines.push(` throws: ${b.throws} → ${c.throws}`)
270
+ if (c.throws > b.throws && c.throws_examples?.length > 0) {
271
+ lines.push(` new errors: ${c.throws_examples.slice(0, 3).map((e) => `${e.id}: ${e.error}`).join("; ")}`)
272
+ }
273
+
274
+ for (const field of new Set([...Object.keys(b.field_distributions ?? {}), ...Object.keys(c.field_distributions ?? {})])) {
275
+ const distA = b.field_distributions?.[field] ?? {}
276
+ const distB = c.field_distributions?.[field] ?? {}
277
+ const aText = JSON.stringify(distA)
278
+ const bText = JSON.stringify(distB)
279
+ if (aText === bText) continue
280
+ lines.push(` ${field}:`)
281
+ // Categorical: highlight added/removed/changed keys
282
+ if (distA && typeof distA === "object" && !("count" in distA)) {
283
+ const keys = new Set([...Object.keys(distA), ...Object.keys(distB)])
284
+ for (const k of [...keys].sort()) {
285
+ const va = distA[k]
286
+ const vb = distB[k]
287
+ if (va !== vb) lines.push(` ${k}: ${va ?? "—"} → ${vb ?? "—"}`)
288
+ }
289
+ } else {
290
+ // Numeric: show min/median/max
291
+ for (const stat of ["count", "min", "median", "max", "sum"]) {
292
+ if (distA[stat] !== distB[stat]) {
293
+ lines.push(` ${stat}: ${distA[stat]} → ${distB[stat]}`)
294
+ }
295
+ }
296
+ }
297
+ }
298
+
299
+ if (lines.length === 0) {
300
+ console.log(`${name}: no change`)
301
+ } else {
302
+ console.log(`${name}:`)
303
+ for (const line of lines) console.log(line)
304
+ }
305
+ console.log()
306
+ }
307
+ }
308
+
309
+ async function ensureDir(dir) {
310
+ await fs.access(dir).catch(() => {
311
+ throw new Error(`Source directory ${dir} not found.`)
312
+ })
313
+ }
314
+
315
/**
 * Parses the script's command-line arguments.
 *
 * Recognised options: `--output FILE`, `--against PATH`, `--live` (same as
 * `--against .cache/hf-data`), `--diff A B`, and `--help` / `-h` (prints usage
 * and exits with status 0). Unrecognised arguments are ignored.
 *
 * @param argv Arguments after the script name.
 * @returns An object holding only the options that were supplied:
 *          `output`, `against`, and/or `diff` (a two-element array).
 * @throws Error when an option that takes a value is last on the command line
 *         or is followed by another `--option`. Previously the value silently
 *         became `undefined` and surfaced later as an unrelated failure.
 */
function parseArgs(argv) {
  const parsed = new Map()
  let cursor = 0

  // Consumes and returns the argument after the current one as `flag`'s value.
  const takeValue = (flag) => {
    const value = argv[cursor + 1]
    if (value === undefined || String(value).startsWith("--")) {
      throw new Error(`Missing value for ${flag} (see --help)`)
    }
    cursor += 1
    return value
  }

  for (; cursor < argv.length; cursor++) {
    const flag = argv[cursor]
    switch (flag) {
      case "--output":
        parsed.set("output", takeValue(flag))
        break
      case "--against":
        parsed.set("against", takeValue(flag))
        break
      case "--diff": {
        const baseline = takeValue(flag)
        const candidate = takeValue(flag)
        parsed.set("diff", [baseline, candidate])
        break
      }
      case "--live":
        parsed.set("against", ".cache/hf-data")
        break
      case "--help":
      case "-h":
        console.log(`Usage: node scripts/audit-adapters.mjs [options]
--output FILE write digest as JSON
--against PATH source dir (default: .cache/hf-data); pinned: tests/fixtures
--live shorthand for --against .cache/hf-data
--diff A B diff two previously-written digests`)
        process.exit(0)
    }
  }
  return Object.fromEntries(parsed)
}
scripts/refresh-fixtures.mjs ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env node
2
+ // Refresh tests/fixtures/{evals,models,developers,model-cards}/ from .cache/hf-data/.
3
+ // Reads tests/fixtures/manifest.json for the curated ID list, copies each
4
+ // referenced file from the live cache, and bumps manifest.snapshot_ts.
5
+ //
6
+ // Workflow: pnpm refresh-fixtures → git diff tests/fixtures/ → review what
7
+ // upstream changed → pnpm test → if snapshots diff, decide intent → commit.
8
+ //
9
+ // Always re-pin everything (no incremental) so the snapshot is internally
10
+ // consistent. If the cache lacks a referenced file, fail loudly — most likely
11
+ // the manifest references a stale ID and either it should be updated or the
12
+ // cache is incomplete.
13
+
14
+ import { promises as fs } from "fs"
15
+ import path from "path"
16
+
17
const ROOT = path.resolve(import.meta.dirname, "..")
const CACHE = path.join(ROOT, ".cache", "hf-data")
const FIXTURES = path.join(ROOT, "tests", "fixtures")
const MANIFEST = path.join(FIXTURES, "manifest.json")

const manifest = JSON.parse(await fs.readFile(MANIFEST, "utf8"))
const sourceDir = path.resolve(ROOT, manifest.snapshot_source ?? ".cache/hf-data")
if (sourceDir !== CACHE) {
  console.warn(`Note: manifest.snapshot_source = ${manifest.snapshot_source}, resolving to ${sourceDir}`)
}

await fs.access(sourceDir).catch(() => {
  throw new Error(`Cache directory ${sourceDir} not found. Run \`pnpm cache-hf-data\` first.`)
})

let copied = 0
let removed = 0
const errors = []

/**
 * Deletes every file in `targetDir` whose name is not `<entry.id>.json` for
 * some manifest entry, so only manifest-listed fixtures survive. A directory
 * that does not exist is treated as empty. Returns the number of files removed.
 */
async function pruneUnlisted(targetDir, entries) {
  const wanted = new Set(entries.map((entry) => `${entry.id}.json`))
  const existing = await fs.readdir(targetDir).catch(() => [])
  let pruned = 0
  for (const file of existing) {
    if (wanted.has(file)) continue
    await fs.unlink(path.join(targetDir, file))
    pruned += 1
  }
  return pruned
}

// Detail files: copy the whole file from the matching cache sub-directory.
for (const groupName of ["evals", "models", "developers"]) {
  const entries = manifest[groupName] ?? []
  const targetDir = path.join(FIXTURES, groupName)
  await fs.mkdir(targetDir, { recursive: true })
  removed += await pruneUnlisted(targetDir, entries)

  for (const entry of entries) {
    const fileName = `${entry.id}.json`
    try {
      await fs.copyFile(path.join(sourceDir, groupName, fileName), path.join(targetDir, fileName))
      copied += 1
    } catch (err) {
      // Collected and reported together at the end rather than aborting on the first miss.
      errors.push({ group: groupName, id: entry.id, error: err instanceof Error ? err.message : String(err) })
    }
  }
}

// model_cards: extract individual entries from model-cards.json (the flat list).
const modelCardsManifest = manifest.model_cards ?? []
const modelCardsDir = path.join(FIXTURES, "model-cards")
// Prune even when the manifest lists no model cards, so fixtures left over
// from a previous manifest do not linger as orphans.
removed += await pruneUnlisted(modelCardsDir, modelCardsManifest)

if (modelCardsManifest.length > 0) {
  await fs.mkdir(modelCardsDir, { recursive: true })

  const allCards = JSON.parse(await fs.readFile(path.join(sourceDir, "model-cards.json"), "utf8"))
  const byRouteId = new Map(allCards.map((card) => [card.model_route_id, card]))

  for (const entry of modelCardsManifest) {
    const card = byRouteId.get(entry.id)
    if (!card) {
      errors.push({ group: "model_cards", id: entry.id, error: "model_route_id not found in model-cards.json" })
      continue
    }
    await fs.writeFile(path.join(modelCardsDir, `${entry.id}.json`), `${JSON.stringify(card, null, 2)}\n`)
    copied += 1
  }
}

// Fail loudly, and leave snapshot_ts untouched, when anything could not be pinned.
if (errors.length > 0) {
  console.error("Failed to copy the following fixtures (likely missing from local cache):")
  for (const { group, id, error } of errors) {
    console.error(` ${group}/${id}: ${error}`)
  }
  process.exit(1)
}

const updatedManifest = {
  ...manifest,
  snapshot_ts: new Date().toISOString(),
}
await fs.writeFile(MANIFEST, `${JSON.stringify(updatedManifest, null, 2)}\n`)

console.log(`Refreshed ${copied} fixture(s) from ${sourceDir} (removed ${removed} stale).`)
console.log(`snapshot_ts → ${updatedManifest.snapshot_ts}`)
console.log("\nNext: review `git diff tests/fixtures/`, run `pnpm test`, update snapshots with `pnpm test -- -u` if intentional.")
scripts/server-only-shim.mjs ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Monkey-patch CommonJS `require` to no-op the `server-only` package, which
2
+ // throws when imported outside a Next.js Server Component context. This lets
3
+ // the audit script + adapter tests import lib/hf-data.ts and lib/model-data.ts
4
+ // directly. Same trick as tests/server-only-stub.ts but at the `require`
5
+ // level (Module.prototype.require) instead of via vitest's alias config.
6
+
7
+ import { createRequire } from "node:module"
8
+ import Module from "node:module"
9
+
10
+ const require = createRequire(import.meta.url)
11
+ const original = Module.prototype.require
12
+ Module.prototype.require = function patchedRequire(specifier) {
13
+ if (specifier === "server-only") return {}
14
+ return original.apply(this, arguments)
15
+ }
tests/__snapshots__/adapters.test.ts.snap ADDED
The diff for this file is too large to render. See raw diff
 
tests/adapters.test.ts ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { describe, expect, it } from "vitest"
2
+
3
+ import type { HFEvalDetail, HFModelDetail, HFModelCardEntry } from "../lib/hf-data"
4
+ import { flattenModelEvaluations } from "../lib/hf-data"
5
+ import {
6
+ hfEvalDetailToSummary,
7
+ hfModelCardToEvaluationCardData,
8
+ hfDeveloperDetailToSummary,
9
+ } from "../lib/model-data"
10
+
11
+ import { fixtureEntries, loadFixture } from "./fixtures/loader"
12
+
13
+ // Tier B — adapter snapshot tests.
14
+ //
15
+ // Each adapter is run against every relevant fixture, and the output is
16
+ // snapshotted via vitest's toMatchSnapshot(). Initial snapshots are committed.
17
+ //
18
+ // Workflow:
19
+ // - Edit code → `pnpm test`
20
+ // - If snapshots match → no behavior change. Safe.
21
+ // - If snapshots differ → review the snap diff in tests/__snapshots__/
22
+ // alongside the code diff. If intentional behavior change, run
23
+ // `pnpm test -- -u` to update snapshots and commit them in the same PR.
24
+ // If unintentional, the test caught a regression — fix the code.
25
+ //
26
+ // New fixtures: add to tests/fixtures/manifest.json, run `pnpm refresh-fixtures`,
27
+ // run `pnpm test -- -u` to capture initial snapshots, commit fixtures + snaps
28
+ // together.
29
+
30
describe("hfModelCardToEvaluationCardData", () => {
  // One snapshot test per entry returned by fixtureEntries("model_cards").
  for (const { id, why } of fixtureEntries("model_cards")) {
    it(`${id} — ${why}`, () => {
      const card = loadFixture<HFModelCardEntry>("model_cards", id)
      const actual = hfModelCardToEvaluationCardData(card)
      expect(actual).toMatchSnapshot()
    })
  }
})
38
+
39
describe("hfEvalDetailToSummary", () => {
  // One snapshot test per entry returned by fixtureEntries("evals").
  for (const { id, why } of fixtureEntries("evals")) {
    it(`${id} — ${why}`, () => {
      const detail = loadFixture<HFEvalDetail>("evals", id)
      // Only the synchronous core is snapshotted here:
      // attachBenchmarkCardToSummary is async + I/O bound and is covered
      // separately by the parity harness.
      const summary = hfEvalDetailToSummary(detail)
      expect(summary).toMatchSnapshot()
    })
  }
})
50
+
51
describe("flattenModelEvaluations", () => {
  // One snapshot test per entry returned by fixtureEntries("models").
  for (const { id, why } of fixtureEntries("models")) {
    it(`${id} — ${why}`, () => {
      const model = loadFixture<HFModelDetail>("models", id)
      // The full output can run to 10k+ lines for large models, so a digest is
      // snapshotted instead: count, distinct categories, distinct
      // evaluator_relationships, number of distinct benchmark_family_keys,
      // number of unique evaluation_ids, and a hash of the full output. Any
      // change to the full output changes the hash; the structured fields
      // keep the diff readable when something changes.
      const digest = digestEvaluations(flattenModelEvaluations(model))
      expect(digest).toMatchSnapshot()
    })
  }
})
66
+
67
describe("hfDeveloperDetailToSummary", () => {
  // One snapshot test per entry returned by fixtureEntries("developers").
  for (const { id, why } of fixtureEntries("developers")) {
    it(`${id} — ${why}`, () => {
      const detail = loadFixture<{ developer: string; models: HFModelCardEntry[] }>("developers", id)
      // Developer fixtures can be large (anthropic.json is 389KB, many model
      // cards), so a digest is snapshotted: scalar fields plus a count + hash
      // of the models array. Per-card detail is exercised by the
      // hfModelCardToEvaluationCardData tests.
      const summary = hfDeveloperDetailToSummary(detail)
      expect(digestDeveloperSummary(summary)).toMatchSnapshot()
    })
  }
})
79
+
80
+ import { createHash } from "crypto"
81
+ import type { BenchmarkEvaluation } from "../lib/benchmark-schema"
82
+
83
/**
 * Returns the first 12 hex characters of the SHA-256 of `JSON.stringify(value)`.
 * Keys are not reordered before serialising, so the result depends on property
 * insertion order as well as on the values.
 */
function stableHash(value: unknown): string {
  const serialized = JSON.stringify(value)
  const hex = createHash("sha256").update(serialized).digest("hex")
  return hex.slice(0, 12)
}
86
+
87
/**
 * Condenses a list of evaluations into a small, reviewable snapshot value:
 * total count, number of distinct evaluation ids, sorted distinct categories,
 * number of distinct benchmark family keys, sorted distinct evaluator
 * relationships, how many items lack `source_metadata`, and a hash of the
 * complete list. Falsy field values are not counted as distinct values.
 */
function digestEvaluations(evaluations: BenchmarkEvaluation[]) {
  // Collects the truthy values `pick` yields across all evaluations.
  const distinct = (pick: (evaluation: BenchmarkEvaluation) => string | null | undefined) => {
    const seen = new Set<string>()
    for (const evaluation of evaluations) {
      const value = pick(evaluation)
      if (value) seen.add(value)
    }
    return seen
  }

  const evaluationIds = distinct((e) => e.evaluation_id)
  const categories = distinct((e) => e.category)
  const families = distinct((e) => e.benchmark_family_key)
  const evaluators = distinct((e) => e.source_metadata?.evaluator_relationship)
  const withoutSourceMetadata = evaluations.filter((e) => !e.source_metadata)

  return {
    count: evaluations.length,
    distinct_evaluation_ids: evaluationIds.size,
    distinct_categories: [...categories].sort(),
    distinct_benchmark_family_keys: families.size,
    distinct_evaluator_relationships: [...evaluators].sort(),
    missing_source_metadata: withoutSourceMetadata.length,
    full_output_hash: stableHash(evaluations),
  }
}
110
+
111
/**
 * Reduces a developer summary to its scalar fields and `popular_evals`, with
 * the `models` array replaced by its length and a hash of its contents, so
 * the snapshot stays small.
 */
function digestDeveloperSummary(summary: ReturnType<typeof hfDeveloperDetailToSummary>) {
  const {
    developer,
    route_id,
    model_count,
    benchmark_count,
    evaluation_count,
    popular_evals,
    models,
  } = summary

  return {
    developer,
    route_id,
    model_count,
    benchmark_count,
    evaluation_count,
    popular_evals,
    models_hash: stableHash(models),
    models_count: models.length,
  }
}
tests/fixtures/developers/01-ai.json ADDED
@@ -0,0 +1,2669 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "developer": "01-ai",
3
+ "models": [
4
+ {
5
+ "model_family_id": "01-ai/yi-34b",
6
+ "model_route_id": "01-ai__yi-34b",
7
+ "model_family_name": "Yi 34B",
8
+ "developer": "01-ai",
9
+ "params_billions": 34.0,
10
+ "total_evaluations": 3,
11
+ "benchmark_count": 3,
12
+ "benchmark_family_count": 3,
13
+ "categories_covered": [
14
+ "general",
15
+ "knowledge",
16
+ "reasoning"
17
+ ],
18
+ "last_updated": "2026-03-21T12:31:52.005480Z",
19
+ "variants": [
20
+ {
21
+ "variant_key": "default",
22
+ "variant_label": "Default",
23
+ "evaluation_count": 3,
24
+ "raw_model_ids": [
25
+ "01-ai/Yi-34B",
26
+ "01-ai/yi-34b"
27
+ ],
28
+ "last_updated": "2026-03-21T12:31:52.005480Z"
29
+ }
30
+ ],
31
+ "score_summary": {
32
+ "count": 52,
33
+ "min": 0.0514,
34
+ "max": 0.936,
35
+ "average": 0.6793153846153845
36
+ },
37
+ "reproducibility_summary": {
38
+ "results_total": 52,
39
+ "has_reproducibility_gap_count": 52,
40
+ "populated_ratio_avg": 0.0
41
+ },
42
+ "provenance_summary": {
43
+ "total_results": 52,
44
+ "total_groups": 52,
45
+ "multi_source_groups": 0,
46
+ "first_party_only_groups": 0,
47
+ "source_type_distribution": {
48
+ "first_party": 0,
49
+ "third_party": 52,
50
+ "collaborative": 0,
51
+ "unspecified": 0
52
+ }
53
+ },
54
+ "comparability_summary": {
55
+ "total_groups": 52,
56
+ "groups_with_variant_check": 0,
57
+ "groups_with_cross_party_check": 0,
58
+ "variant_divergent_count": 0,
59
+ "cross_party_divergent_count": 0
60
+ },
61
+ "benchmark_names": [
62
+ "BBH",
63
+ "GPQA",
64
+ "GSM8K",
65
+ "Helm lite",
66
+ "IFEval",
67
+ "LegalBench",
68
+ "MATH",
69
+ "MATH Level 5",
70
+ "MMLU",
71
+ "MMLU-PRO",
72
+ "MUSR",
73
+ "MedQA",
74
+ "NarrativeQA",
75
+ "NaturalQuestions (closed-book)",
76
+ "OpenbookQA",
77
+ "WMT 2014"
78
+ ],
79
+ "top_benchmark_scores": [
80
+ {
81
+ "benchmark": "MMLU",
82
+ "benchmarkKey": "helm_mmlu",
83
+ "canonical_display_name": "Mmlu / Marketing / Exact Match",
84
+ "evaluation_name": "Marketing",
85
+ "score": 0.936,
86
+ "metric": "EM on Marketing",
87
+ "lower_is_better": false
88
+ },
89
+ {
90
+ "benchmark": "OpenbookQA",
91
+ "benchmarkKey": "helm_lite_openbookqa",
92
+ "canonical_display_name": "OpenbookQA / Exact Match",
93
+ "evaluation_name": "OpenbookQA",
94
+ "score": 0.92,
95
+ "metric": "EM on OpenbookQA",
96
+ "lower_is_better": false
97
+ },
98
+ {
99
+ "benchmark": "NarrativeQA",
100
+ "benchmarkKey": "helm_lite_narrativeqa",
101
+ "canonical_display_name": "NarrativeQA / F1",
102
+ "evaluation_name": "NarrativeQA",
103
+ "score": 0.782,
104
+ "metric": "F1 on NarrativeQA",
105
+ "lower_is_better": false
106
+ },
107
+ {
108
+ "benchmark": "MedQA",
109
+ "benchmarkKey": "helm_lite_medqa",
110
+ "canonical_display_name": "MedQA / Exact Match",
111
+ "evaluation_name": "MedQA",
112
+ "score": 0.656,
113
+ "metric": "EM on MedQA",
114
+ "lower_is_better": false
115
+ },
116
+ {
117
+ "benchmark": "MMLU",
118
+ "benchmarkKey": "helm_lite_mmlu",
119
+ "canonical_display_name": "MMLU / Exact Match",
120
+ "evaluation_name": "MMLU",
121
+ "score": 0.65,
122
+ "metric": "EM on MMLU",
123
+ "lower_is_better": false
124
+ },
125
+ {
126
+ "benchmark": "GSM8K",
127
+ "benchmarkKey": "helm_lite_gsm8k",
128
+ "canonical_display_name": "GSM8K / Exact Match",
129
+ "evaluation_name": "GSM8K",
130
+ "score": 0.648,
131
+ "metric": "EM on GSM8K",
132
+ "lower_is_better": false
133
+ },
134
+ {
135
+ "benchmark": "LegalBench",
136
+ "benchmarkKey": "helm_lite_legalbench",
137
+ "canonical_display_name": "LegalBench / Exact Match",
138
+ "evaluation_name": "LegalBench",
139
+ "score": 0.618,
140
+ "metric": "EM on LegalBench",
141
+ "lower_is_better": false
142
+ },
143
+ {
144
+ "benchmark": "Helm lite",
145
+ "benchmarkKey": "helm_lite",
146
+ "canonical_display_name": "Helm lite / Win Rate",
147
+ "evaluation_name": "helm_lite",
148
+ "score": 0.57,
149
+ "metric": "How many models this model outperforms on average (over columns).",
150
+ "lower_is_better": false
151
+ },
152
+ {
153
+ "benchmark": "BBH",
154
+ "benchmarkKey": "hfopenllm_v2_bbh",
155
+ "canonical_display_name": "BBH / Accuracy",
156
+ "evaluation_name": "BBH",
157
+ "score": 0.5457,
158
+ "metric": "Accuracy on BBH",
159
+ "lower_is_better": false
160
+ },
161
+ {
162
+ "benchmark": "NaturalQuestions (closed-book)",
163
+ "benchmarkKey": "helm_lite_naturalquestions_closed_book",
164
+ "canonical_display_name": "NaturalQuestions (closed-book) / F1",
165
+ "evaluation_name": "NaturalQuestions (closed-book)",
166
+ "score": 0.443,
167
+ "metric": "F1 on NaturalQuestions (closed-book)",
168
+ "lower_is_better": false
169
+ },
170
+ {
171
+ "benchmark": "MMLU-PRO",
172
+ "benchmarkKey": "hfopenllm_v2_mmlu_pro",
173
+ "canonical_display_name": "MMLU-PRO / Accuracy",
174
+ "evaluation_name": "MMLU-PRO",
175
+ "score": 0.4412,
176
+ "metric": "Accuracy on MMLU-PRO",
177
+ "lower_is_better": false
178
+ },
179
+ {
180
+ "benchmark": "MUSR",
181
+ "benchmarkKey": "hfopenllm_v2_musr",
182
+ "canonical_display_name": "MUSR / Accuracy",
183
+ "evaluation_name": "MUSR",
184
+ "score": 0.4119,
185
+ "metric": "Accuracy on MUSR",
186
+ "lower_is_better": false
187
+ },
188
+ {
189
+ "benchmark": "MATH",
190
+ "benchmarkKey": "helm_lite_math",
191
+ "canonical_display_name": "MATH / Equivalent (CoT)",
192
+ "evaluation_name": "MATH",
193
+ "score": 0.375,
194
+ "metric": "Equivalent (CoT) on MATH",
195
+ "lower_is_better": false
196
+ },
197
+ {
198
+ "benchmark": "GPQA",
199
+ "benchmarkKey": "hfopenllm_v2_gpqa",
200
+ "canonical_display_name": "GPQA / Accuracy",
201
+ "evaluation_name": "GPQA",
202
+ "score": 0.3666,
203
+ "metric": "Accuracy on GPQA",
204
+ "lower_is_better": false
205
+ },
206
+ {
207
+ "benchmark": "IFEval",
208
+ "benchmarkKey": "hfopenllm_v2_ifeval",
209
+ "canonical_display_name": "IFEval / Accuracy",
210
+ "evaluation_name": "IFEval",
211
+ "score": 0.3046,
212
+ "metric": "Accuracy on IFEval",
213
+ "lower_is_better": false
214
+ }
215
+ ]
216
+ },
217
+ {
218
+ "model_family_id": "01-ai/yi-6b",
219
+ "model_route_id": "01-ai__yi-6b",
220
+ "model_family_name": "Yi 6B",
221
+ "developer": "01-ai",
222
+ "params_billions": 6.0,
223
+ "total_evaluations": 3,
224
+ "benchmark_count": 3,
225
+ "benchmark_family_count": 3,
226
+ "categories_covered": [
227
+ "general",
228
+ "knowledge",
229
+ "reasoning"
230
+ ],
231
+ "last_updated": "2026-03-21T12:31:52.005480Z",
232
+ "variants": [
233
+ {
234
+ "variant_key": "default",
235
+ "variant_label": "Default",
236
+ "evaluation_count": 3,
237
+ "raw_model_ids": [
238
+ "01-ai/Yi-6B",
239
+ "01-ai/yi-6b"
240
+ ],
241
+ "last_updated": "2026-03-21T12:31:52.005480Z"
242
+ }
243
+ ],
244
+ "score_summary": {
245
+ "count": 52,
246
+ "min": 0.0159,
247
+ "max": 0.893,
248
+ "average": 0.5652923076923078
249
+ },
250
+ "reproducibility_summary": {
251
+ "results_total": 52,
252
+ "has_reproducibility_gap_count": 52,
253
+ "populated_ratio_avg": 0.0
254
+ },
255
+ "provenance_summary": {
256
+ "total_results": 52,
257
+ "total_groups": 52,
258
+ "multi_source_groups": 0,
259
+ "first_party_only_groups": 0,
260
+ "source_type_distribution": {
261
+ "first_party": 0,
262
+ "third_party": 52,
263
+ "collaborative": 0,
264
+ "unspecified": 0
265
+ }
266
+ },
267
+ "comparability_summary": {
268
+ "total_groups": 52,
269
+ "groups_with_variant_check": 0,
270
+ "groups_with_cross_party_check": 0,
271
+ "variant_divergent_count": 0,
272
+ "cross_party_divergent_count": 0
273
+ },
274
+ "benchmark_names": [
275
+ "BBH",
276
+ "GPQA",
277
+ "GSM8K",
278
+ "Helm lite",
279
+ "IFEval",
280
+ "LegalBench",
281
+ "MATH",
282
+ "MATH Level 5",
283
+ "MMLU",
284
+ "MMLU-PRO",
285
+ "MUSR",
286
+ "MedQA",
287
+ "NarrativeQA",
288
+ "NaturalQuestions (closed-book)",
289
+ "OpenbookQA",
290
+ "WMT 2014"
291
+ ],
292
+ "top_benchmark_scores": [
293
+ {
294
+ "benchmark": "MMLU",
295
+ "benchmarkKey": "helm_mmlu",
296
+ "canonical_display_name": "Mmlu / Marketing / Exact Match",
297
+ "evaluation_name": "Marketing",
298
+ "score": 0.893,
299
+ "metric": "EM on Marketing",
300
+ "lower_is_better": false
301
+ },
302
+ {
303
+ "benchmark": "OpenbookQA",
304
+ "benchmarkKey": "helm_lite_openbookqa",
305
+ "canonical_display_name": "OpenbookQA / Exact Match",
306
+ "evaluation_name": "OpenbookQA",
307
+ "score": 0.8,
308
+ "metric": "EM on OpenbookQA",
309
+ "lower_is_better": false
310
+ },
311
+ {
312
+ "benchmark": "NarrativeQA",
313
+ "benchmarkKey": "helm_lite_narrativeqa",
314
+ "canonical_display_name": "NarrativeQA / F1",
315
+ "evaluation_name": "NarrativeQA",
316
+ "score": 0.702,
317
+ "metric": "F1 on NarrativeQA",
318
+ "lower_is_better": false
319
+ },
320
+ {
321
+ "benchmark": "MMLU",
322
+ "benchmarkKey": "helm_lite_mmlu",
323
+ "canonical_display_name": "MMLU / Exact Match",
324
+ "evaluation_name": "MMLU",
325
+ "score": 0.53,
326
+ "metric": "EM on MMLU",
327
+ "lower_is_better": false
328
+ },
329
+ {
330
+ "benchmark": "LegalBench",
331
+ "benchmarkKey": "helm_lite_legalbench",
332
+ "canonical_display_name": "LegalBench / Exact Match",
333
+ "evaluation_name": "LegalBench",
334
+ "score": 0.519,
335
+ "metric": "EM on LegalBench",
336
+ "lower_is_better": false
337
+ },
338
+ {
339
+ "benchmark": "MedQA",
340
+ "benchmarkKey": "helm_lite_medqa",
341
+ "canonical_display_name": "MedQA / Exact Match",
342
+ "evaluation_name": "MedQA",
343
+ "score": 0.497,
344
+ "metric": "EM on MedQA",
345
+ "lower_is_better": false
346
+ },
347
+ {
348
+ "benchmark": "BBH",
349
+ "benchmarkKey": "hfopenllm_v2_bbh",
350
+ "canonical_display_name": "BBH / Accuracy",
351
+ "evaluation_name": "BBH",
352
+ "score": 0.4309,
353
+ "metric": "Accuracy on BBH",
354
+ "lower_is_better": false
355
+ },
356
+ {
357
+ "benchmark": "MUSR",
358
+ "benchmarkKey": "hfopenllm_v2_musr",
359
+ "canonical_display_name": "MUSR / Accuracy",
360
+ "evaluation_name": "MUSR",
361
+ "score": 0.3937,
362
+ "metric": "Accuracy on MUSR",
363
+ "lower_is_better": false
364
+ },
365
+ {
366
+ "benchmark": "GSM8K",
367
+ "benchmarkKey": "helm_lite_gsm8k",
368
+ "canonical_display_name": "GSM8K / Exact Match",
369
+ "evaluation_name": "GSM8K",
370
+ "score": 0.375,
371
+ "metric": "EM on GSM8K",
372
+ "lower_is_better": false
373
+ },
374
+ {
375
+ "benchmark": "NaturalQuestions (closed-book)",
376
+ "benchmarkKey": "helm_lite_naturalquestions_closed_book",
377
+ "canonical_display_name": "NaturalQuestions (closed-book) / F1",
378
+ "evaluation_name": "NaturalQuestions (closed-book)",
379
+ "score": 0.31,
380
+ "metric": "F1 on NaturalQuestions (closed-book)",
381
+ "lower_is_better": false
382
+ },
383
+ {
384
+ "benchmark": "MMLU-PRO",
385
+ "benchmarkKey": "hfopenllm_v2_mmlu_pro",
386
+ "canonical_display_name": "MMLU-PRO / Accuracy",
387
+ "evaluation_name": "MMLU-PRO",
388
+ "score": 0.2991,
389
+ "metric": "Accuracy on MMLU-PRO",
390
+ "lower_is_better": false
391
+ },
392
+ {
393
+ "benchmark": "IFEval",
394
+ "benchmarkKey": "hfopenllm_v2_ifeval",
395
+ "canonical_display_name": "IFEval / Accuracy",
396
+ "evaluation_name": "IFEval",
397
+ "score": 0.2893,
398
+ "metric": "Accuracy on IFEval",
399
+ "lower_is_better": false
400
+ },
401
+ {
402
+ "benchmark": "GPQA",
403
+ "benchmarkKey": "hfopenllm_v2_gpqa",
404
+ "canonical_display_name": "GPQA / Accuracy",
405
+ "evaluation_name": "GPQA",
406
+ "score": 0.2693,
407
+ "metric": "Accuracy on GPQA",
408
+ "lower_is_better": false
409
+ },
410
+ {
411
+ "benchmark": "Helm lite",
412
+ "benchmarkKey": "helm_lite",
413
+ "canonical_display_name": "Helm lite / Win Rate",
414
+ "evaluation_name": "helm_lite",
415
+ "score": 0.253,
416
+ "metric": "How many models this model outperforms on average (over columns).",
417
+ "lower_is_better": false
418
+ },
419
+ {
420
+ "benchmark": "MATH",
421
+ "benchmarkKey": "helm_lite_math",
422
+ "canonical_display_name": "MATH / Equivalent (CoT)",
423
+ "evaluation_name": "MATH",
424
+ "score": 0.126,
425
+ "metric": "Equivalent (CoT) on MATH",
426
+ "lower_is_better": false
427
+ }
428
+ ]
429
+ },
430
+ {
431
+ "model_family_id": "01-ai/yi-34b-chat",
432
+ "model_route_id": "01-ai__yi-34b-chat",
433
+ "model_family_name": "Yi Chat 34B",
434
+ "developer": "01-ai",
435
+ "params_billions": 34.0,
436
+ "total_evaluations": 2,
437
+ "benchmark_count": 2,
438
+ "benchmark_family_count": 2,
439
+ "categories_covered": [
440
+ "general",
441
+ "knowledge",
442
+ "reasoning"
443
+ ],
444
+ "last_updated": "2026-04-20T22:14:39.271662Z",
445
+ "variants": [
446
+ {
447
+ "variant_key": "default",
448
+ "variant_label": "Default",
449
+ "evaluation_count": 2,
450
+ "raw_model_ids": [
451
+ "01-ai/Yi-34B-Chat",
452
+ "01-ai/yi-34b-chat"
453
+ ],
454
+ "last_updated": "2026-04-20T22:14:39.271662Z"
455
+ }
456
+ ],
457
+ "score_summary": {
458
+ "count": 380,
459
+ "min": 0.0,
460
+ "max": 1.0,
461
+ "average": 0.528668157894737
462
+ },
463
+ "reproducibility_summary": {
464
+ "results_total": 380,
465
+ "has_reproducibility_gap_count": 380,
466
+ "populated_ratio_avg": 0.0
467
+ },
468
+ "provenance_summary": {
469
+ "total_results": 380,
470
+ "total_groups": 380,
471
+ "multi_source_groups": 0,
472
+ "first_party_only_groups": 0,
473
+ "source_type_distribution": {
474
+ "first_party": 0,
475
+ "third_party": 380,
476
+ "collaborative": 0,
477
+ "unspecified": 0
478
+ }
479
+ },
480
+ "comparability_summary": {
481
+ "total_groups": 380,
482
+ "groups_with_variant_check": 0,
483
+ "groups_with_cross_party_check": 0,
484
+ "variant_divergent_count": 0,
485
+ "cross_party_divergent_count": 0
486
+ },
487
+ "benchmark_names": [
488
+ "BBH",
489
+ "GPQA",
490
+ "Helm air bench",
491
+ "IFEval",
492
+ "MATH Level 5",
493
+ "MMLU-PRO",
494
+ "MUSR"
495
+ ],
496
+ "top_benchmark_scores": [
497
+ {
498
+ "benchmark": "Helm air bench",
499
+ "benchmarkKey": "helm_air_bench",
500
+ "canonical_display_name": "Air bench / AIRBench 2024 - #39-40.34: Characterization of identity - Predicted Personality / Refusal Rate",
501
+ "evaluation_name": "AIRBench 2024 - #39-40.34: Characterization of identity - Predicted Personality",
502
+ "score": 1.0,
503
+ "metric": "Refusal Rate on AIRBench 2024 - #39-40.34: Characterization of identity - Predicted Personality",
504
+ "lower_is_better": false
505
+ },
506
+ {
507
+ "benchmark": "BBH",
508
+ "benchmarkKey": "hfopenllm_v2_bbh",
509
+ "canonical_display_name": "BBH / Accuracy",
510
+ "evaluation_name": "BBH",
511
+ "score": 0.5561,
512
+ "metric": "Accuracy on BBH",
513
+ "lower_is_better": false
514
+ },
515
+ {
516
+ "benchmark": "IFEval",
517
+ "benchmarkKey": "hfopenllm_v2_ifeval",
518
+ "canonical_display_name": "IFEval / Accuracy",
519
+ "evaluation_name": "IFEval",
520
+ "score": 0.4699,
521
+ "metric": "Accuracy on IFEval",
522
+ "lower_is_better": false
523
+ },
524
+ {
525
+ "benchmark": "MMLU-PRO",
526
+ "benchmarkKey": "hfopenllm_v2_mmlu_pro",
527
+ "canonical_display_name": "MMLU-PRO / Accuracy",
528
+ "evaluation_name": "MMLU-PRO",
529
+ "score": 0.4093,
530
+ "metric": "Accuracy on MMLU-PRO",
531
+ "lower_is_better": false
532
+ },
533
+ {
534
+ "benchmark": "MUSR",
535
+ "benchmarkKey": "hfopenllm_v2_musr",
536
+ "canonical_display_name": "MUSR / Accuracy",
537
+ "evaluation_name": "MUSR",
538
+ "score": 0.3978,
539
+ "metric": "Accuracy on MUSR",
540
+ "lower_is_better": false
541
+ },
542
+ {
543
+ "benchmark": "GPQA",
544
+ "benchmarkKey": "hfopenllm_v2_gpqa",
545
+ "canonical_display_name": "GPQA / Accuracy",
546
+ "evaluation_name": "GPQA",
547
+ "score": 0.3381,
548
+ "metric": "Accuracy on GPQA",
549
+ "lower_is_better": false
550
+ },
551
+ {
552
+ "benchmark": "MATH Level 5",
553
+ "benchmarkKey": "hfopenllm_v2_math_level_5",
554
+ "canonical_display_name": "MATH Level 5 / Exact Match",
555
+ "evaluation_name": "MATH Level 5",
556
+ "score": 0.0627,
557
+ "metric": "Exact Match on MATH Level 5",
558
+ "lower_is_better": false
559
+ }
560
+ ]
561
+ },
562
+ {
563
+ "model_family_id": "01-ai/yi-large-preview",
564
+ "model_route_id": "01-ai__yi-large-preview",
565
+ "model_family_name": "Yi Large Preview",
566
+ "developer": "01-ai",
567
+ "params_billions": null,
568
+ "total_evaluations": 2,
569
+ "benchmark_count": 2,
570
+ "benchmark_family_count": 2,
571
+ "categories_covered": [
572
+ "general",
573
+ "knowledge",
574
+ "reasoning"
575
+ ],
576
+ "last_updated": "2026-03-21T12:31:52.005480Z",
577
+ "variants": [
578
+ {
579
+ "variant_key": "default",
580
+ "variant_label": "Default",
581
+ "evaluation_count": 2,
582
+ "raw_model_ids": [
583
+ "01-ai/yi-large-preview"
584
+ ],
585
+ "last_updated": "2026-03-21T12:31:52.005480Z"
586
+ }
587
+ ],
588
+ "score_summary": {
589
+ "count": 46,
590
+ "min": 0.176,
591
+ "max": 0.946,
592
+ "average": 0.741413043478261
593
+ },
594
+ "reproducibility_summary": {
595
+ "results_total": 46,
596
+ "has_reproducibility_gap_count": 46,
597
+ "populated_ratio_avg": 0.0
598
+ },
599
+ "provenance_summary": {
600
+ "total_results": 46,
601
+ "total_groups": 46,
602
+ "multi_source_groups": 0,
603
+ "first_party_only_groups": 0,
604
+ "source_type_distribution": {
605
+ "first_party": 0,
606
+ "third_party": 46,
607
+ "collaborative": 0,
608
+ "unspecified": 0
609
+ }
610
+ },
611
+ "comparability_summary": {
612
+ "total_groups": 46,
613
+ "groups_with_variant_check": 0,
614
+ "groups_with_cross_party_check": 0,
615
+ "variant_divergent_count": 0,
616
+ "cross_party_divergent_count": 0
617
+ },
618
+ "benchmark_names": [
619
+ "GSM8K",
620
+ "Helm lite",
621
+ "LegalBench",
622
+ "MATH",
623
+ "MMLU",
624
+ "MedQA",
625
+ "NarrativeQA",
626
+ "NaturalQuestions (closed-book)",
627
+ "OpenbookQA",
628
+ "WMT 2014"
629
+ ],
630
+ "top_benchmark_scores": [
631
+ {
632
+ "benchmark": "OpenbookQA",
633
+ "benchmarkKey": "helm_lite_openbookqa",
634
+ "canonical_display_name": "OpenbookQA / Exact Match",
635
+ "evaluation_name": "OpenbookQA",
636
+ "score": 0.946,
637
+ "metric": "EM on OpenbookQA",
638
+ "lower_is_better": false
639
+ },
640
+ {
641
+ "benchmark": "MMLU",
642
+ "benchmarkKey": "helm_mmlu",
643
+ "canonical_display_name": "Mmlu / High School World History / Exact Match",
644
+ "evaluation_name": "High School World History",
645
+ "score": 0.928,
646
+ "metric": "EM on High School World History",
647
+ "lower_is_better": false
648
+ },
649
+ {
650
+ "benchmark": "MMLU",
651
+ "benchmarkKey": "helm_lite_mmlu",
652
+ "canonical_display_name": "MMLU / Exact Match",
653
+ "evaluation_name": "MMLU",
654
+ "score": 0.712,
655
+ "metric": "EM on MMLU",
656
+ "lower_is_better": false
657
+ },
658
+ {
659
+ "benchmark": "MATH",
660
+ "benchmarkKey": "helm_lite_math",
661
+ "canonical_display_name": "MATH / Equivalent (CoT)",
662
+ "evaluation_name": "MATH",
663
+ "score": 0.712,
664
+ "metric": "Equivalent (CoT) on MATH",
665
+ "lower_is_better": false
666
+ },
667
+ {
668
+ "benchmark": "GSM8K",
669
+ "benchmarkKey": "helm_lite_gsm8k",
670
+ "canonical_display_name": "GSM8K / Exact Match",
671
+ "evaluation_name": "GSM8K",
672
+ "score": 0.69,
673
+ "metric": "EM on GSM8K",
674
+ "lower_is_better": false
675
+ },
676
+ {
677
+ "benchmark": "MedQA",
678
+ "benchmarkKey": "helm_lite_medqa",
679
+ "canonical_display_name": "MedQA / Exact Match",
680
+ "evaluation_name": "MedQA",
681
+ "score": 0.66,
682
+ "metric": "EM on MedQA",
683
+ "lower_is_better": false
684
+ },
685
+ {
686
+ "benchmark": "LegalBench",
687
+ "benchmarkKey": "helm_lite_legalbench",
688
+ "canonical_display_name": "LegalBench / Exact Match",
689
+ "evaluation_name": "LegalBench",
690
+ "score": 0.519,
691
+ "metric": "EM on LegalBench",
692
+ "lower_is_better": false
693
+ },
694
+ {
695
+ "benchmark": "Helm lite",
696
+ "benchmarkKey": "helm_lite",
697
+ "canonical_display_name": "Helm lite / Win Rate",
698
+ "evaluation_name": "helm_lite",
699
+ "score": 0.471,
700
+ "metric": "How many models this model outperforms on average (over columns).",
701
+ "lower_is_better": false
702
+ },
703
+ {
704
+ "benchmark": "NaturalQuestions (closed-book)",
705
+ "benchmarkKey": "helm_lite_naturalquestions_closed_book",
706
+ "canonical_display_name": "NaturalQuestions (closed-book) / F1",
707
+ "evaluation_name": "NaturalQuestions (closed-book)",
708
+ "score": 0.428,
709
+ "metric": "F1 on NaturalQuestions (closed-book)",
710
+ "lower_is_better": false
711
+ },
712
+ {
713
+ "benchmark": "NarrativeQA",
714
+ "benchmarkKey": "helm_lite_narrativeqa",
715
+ "canonical_display_name": "NarrativeQA / F1",
716
+ "evaluation_name": "NarrativeQA",
717
+ "score": 0.373,
718
+ "metric": "F1 on NarrativeQA",
719
+ "lower_is_better": false
720
+ },
721
+ {
722
+ "benchmark": "WMT 2014",
723
+ "benchmarkKey": "helm_lite_wmt_2014",
724
+ "canonical_display_name": "WMT 2014 / BLEU-4",
725
+ "evaluation_name": "WMT 2014",
726
+ "score": 0.176,
727
+ "metric": "BLEU-4 on WMT 2014",
728
+ "lower_is_better": false
729
+ }
730
+ ]
731
+ },
732
+ {
733
+ "model_family_id": "01-ai/yi-1-5-34b",
734
+ "model_route_id": "01-ai__yi-1-5-34b",
735
+ "model_family_name": "Yi-1.5-34B",
736
+ "developer": "01-ai",
737
+ "params_billions": 34.389,
738
+ "total_evaluations": 1,
739
+ "benchmark_count": 1,
740
+ "benchmark_family_count": 1,
741
+ "categories_covered": [
742
+ "general",
743
+ "knowledge",
744
+ "reasoning"
745
+ ],
746
+ "last_updated": "2026-03-19T16:08:18.240187Z",
747
+ "variants": [
748
+ {
749
+ "variant_key": "default",
750
+ "variant_label": "Default",
751
+ "evaluation_count": 1,
752
+ "raw_model_ids": [
753
+ "01-ai/Yi-1.5-34B"
754
+ ],
755
+ "last_updated": "2026-03-19T16:08:18.240187Z"
756
+ }
757
+ ],
758
+ "score_summary": {
759
+ "count": 6,
760
+ "min": 0.1533,
761
+ "max": 0.5976,
762
+ "average": 0.3818333333333334
763
+ },
764
+ "reproducibility_summary": {
765
+ "results_total": 6,
766
+ "has_reproducibility_gap_count": 6,
767
+ "populated_ratio_avg": 0.0
768
+ },
769
+ "provenance_summary": {
770
+ "total_results": 6,
771
+ "total_groups": 6,
772
+ "multi_source_groups": 0,
773
+ "first_party_only_groups": 0,
774
+ "source_type_distribution": {
775
+ "first_party": 0,
776
+ "third_party": 6,
777
+ "collaborative": 0,
778
+ "unspecified": 0
779
+ }
780
+ },
781
+ "comparability_summary": {
782
+ "total_groups": 6,
783
+ "groups_with_variant_check": 0,
784
+ "groups_with_cross_party_check": 0,
785
+ "variant_divergent_count": 0,
786
+ "cross_party_divergent_count": 0
787
+ },
788
+ "benchmark_names": [
789
+ "BBH",
790
+ "GPQA",
791
+ "IFEval",
792
+ "MATH Level 5",
793
+ "MMLU-PRO",
794
+ "MUSR"
795
+ ],
796
+ "top_benchmark_scores": [
797
+ {
798
+ "benchmark": "BBH",
799
+ "benchmarkKey": "hfopenllm_v2_bbh",
800
+ "canonical_display_name": "BBH / Accuracy",
801
+ "evaluation_name": "BBH",
802
+ "score": 0.5976,
803
+ "metric": "Accuracy on BBH",
804
+ "lower_is_better": false
805
+ },
806
+ {
807
+ "benchmark": "MMLU-PRO",
808
+ "benchmarkKey": "hfopenllm_v2_mmlu_pro",
809
+ "canonical_display_name": "MMLU-PRO / Accuracy",
810
+ "evaluation_name": "MMLU-PRO",
811
+ "score": 0.4666,
812
+ "metric": "Accuracy on MMLU-PRO",
813
+ "lower_is_better": false
814
+ },
815
+ {
816
+ "benchmark": "MUSR",
817
+ "benchmarkKey": "hfopenllm_v2_musr",
818
+ "canonical_display_name": "MUSR / Accuracy",
819
+ "evaluation_name": "MUSR",
820
+ "score": 0.4236,
821
+ "metric": "Accuracy on MUSR",
822
+ "lower_is_better": false
823
+ },
824
+ {
825
+ "benchmark": "GPQA",
826
+ "benchmarkKey": "hfopenllm_v2_gpqa",
827
+ "canonical_display_name": "GPQA / Accuracy",
828
+ "evaluation_name": "GPQA",
829
+ "score": 0.3658,
830
+ "metric": "Accuracy on GPQA",
831
+ "lower_is_better": false
832
+ },
833
+ {
834
+ "benchmark": "IFEval",
835
+ "benchmarkKey": "hfopenllm_v2_ifeval",
836
+ "canonical_display_name": "IFEval / Accuracy",
837
+ "evaluation_name": "IFEval",
838
+ "score": 0.2841,
839
+ "metric": "Accuracy on IFEval",
840
+ "lower_is_better": false
841
+ },
842
+ {
843
+ "benchmark": "MATH Level 5",
844
+ "benchmarkKey": "hfopenllm_v2_math_level_5",
845
+ "canonical_display_name": "MATH Level 5 / Exact Match",
846
+ "evaluation_name": "MATH Level 5",
847
+ "score": 0.1533,
848
+ "metric": "Exact Match on MATH Level 5",
849
+ "lower_is_better": false
850
+ }
851
+ ]
852
+ },
853
+ {
854
+ "model_family_id": "01-ai/yi-1-5-34b-32k",
855
+ "model_route_id": "01-ai__yi-1-5-34b-32k",
856
+ "model_family_name": "Yi-1.5-34B-32K",
857
+ "developer": "01-ai",
858
+ "params_billions": 34.389,
859
+ "total_evaluations": 1,
860
+ "benchmark_count": 1,
861
+ "benchmark_family_count": 1,
862
+ "categories_covered": [
863
+ "general",
864
+ "knowledge",
865
+ "reasoning"
866
+ ],
867
+ "last_updated": "2026-03-19T16:08:18.240187Z",
868
+ "variants": [
869
+ {
870
+ "variant_key": "default",
871
+ "variant_label": "Default",
872
+ "evaluation_count": 1,
873
+ "raw_model_ids": [
874
+ "01-ai/Yi-1.5-34B-32K"
875
+ ],
876
+ "last_updated": "2026-03-19T16:08:18.240187Z"
877
+ }
878
+ ],
879
+ "score_summary": {
880
+ "count": 6,
881
+ "min": 0.1541,
882
+ "max": 0.6016,
883
+ "average": 0.3902666666666666
884
+ },
885
+ "reproducibility_summary": {
886
+ "results_total": 6,
887
+ "has_reproducibility_gap_count": 6,
888
+ "populated_ratio_avg": 0.0
889
+ },
890
+ "provenance_summary": {
891
+ "total_results": 6,
892
+ "total_groups": 6,
893
+ "multi_source_groups": 0,
894
+ "first_party_only_groups": 0,
895
+ "source_type_distribution": {
896
+ "first_party": 0,
897
+ "third_party": 6,
898
+ "collaborative": 0,
899
+ "unspecified": 0
900
+ }
901
+ },
902
+ "comparability_summary": {
903
+ "total_groups": 6,
904
+ "groups_with_variant_check": 0,
905
+ "groups_with_cross_party_check": 0,
906
+ "variant_divergent_count": 0,
907
+ "cross_party_divergent_count": 0
908
+ },
909
+ "benchmark_names": [
910
+ "BBH",
911
+ "GPQA",
912
+ "IFEval",
913
+ "MATH Level 5",
914
+ "MMLU-PRO",
915
+ "MUSR"
916
+ ],
917
+ "top_benchmark_scores": [
918
+ {
919
+ "benchmark": "BBH",
920
+ "benchmarkKey": "hfopenllm_v2_bbh",
921
+ "canonical_display_name": "BBH / Accuracy",
922
+ "evaluation_name": "BBH",
923
+ "score": 0.6016,
924
+ "metric": "Accuracy on BBH",
925
+ "lower_is_better": false
926
+ },
927
+ {
928
+ "benchmark": "MMLU-PRO",
929
+ "benchmarkKey": "hfopenllm_v2_mmlu_pro",
930
+ "canonical_display_name": "MMLU-PRO / Accuracy",
931
+ "evaluation_name": "MMLU-PRO",
932
+ "score": 0.4709,
933
+ "metric": "Accuracy on MMLU-PRO",
934
+ "lower_is_better": false
935
+ },
936
+ {
937
+ "benchmark": "MUSR",
938
+ "benchmarkKey": "hfopenllm_v2_musr",
939
+ "canonical_display_name": "MUSR / Accuracy",
940
+ "evaluation_name": "MUSR",
941
+ "score": 0.4398,
942
+ "metric": "Accuracy on MUSR",
943
+ "lower_is_better": false
944
+ },
945
+ {
946
+ "benchmark": "GPQA",
947
+ "benchmarkKey": "hfopenllm_v2_gpqa",
948
+ "canonical_display_name": "GPQA / Accuracy",
949
+ "evaluation_name": "GPQA",
950
+ "score": 0.3633,
951
+ "metric": "Accuracy on GPQA",
952
+ "lower_is_better": false
953
+ },
954
+ {
955
+ "benchmark": "IFEval",
956
+ "benchmarkKey": "hfopenllm_v2_ifeval",
957
+ "canonical_display_name": "IFEval / Accuracy",
958
+ "evaluation_name": "IFEval",
959
+ "score": 0.3119,
960
+ "metric": "Accuracy on IFEval",
961
+ "lower_is_better": false
962
+ },
963
+ {
964
+ "benchmark": "MATH Level 5",
965
+ "benchmarkKey": "hfopenllm_v2_math_level_5",
966
+ "canonical_display_name": "MATH Level 5 / Exact Match",
967
+ "evaluation_name": "MATH Level 5",
968
+ "score": 0.1541,
969
+ "metric": "Exact Match on MATH Level 5",
970
+ "lower_is_better": false
971
+ }
972
+ ]
973
+ },
974
+ {
975
+ "model_family_id": "01-ai/yi-1-5-34b-chat",
976
+ "model_route_id": "01-ai__yi-1-5-34b-chat",
977
+ "model_family_name": "Yi-1.5-34B-Chat",
978
+ "developer": "01-ai",
979
+ "params_billions": 34.389,
980
+ "total_evaluations": 1,
981
+ "benchmark_count": 1,
982
+ "benchmark_family_count": 1,
983
+ "categories_covered": [
984
+ "general",
985
+ "knowledge",
986
+ "reasoning"
987
+ ],
988
+ "last_updated": "2026-03-19T16:08:18.240187Z",
989
+ "variants": [
990
+ {
991
+ "variant_key": "default",
992
+ "variant_label": "Default",
993
+ "evaluation_count": 1,
994
+ "raw_model_ids": [
995
+ "01-ai/Yi-1.5-34B-Chat"
996
+ ],
997
+ "last_updated": "2026-03-19T16:08:18.240187Z"
998
+ }
999
+ ],
1000
+ "score_summary": {
1001
+ "count": 6,
1002
+ "min": 0.2772,
1003
+ "max": 0.6084,
1004
+ "average": 0.4562333333333333
1005
+ },
1006
+ "reproducibility_summary": {
1007
+ "results_total": 6,
1008
+ "has_reproducibility_gap_count": 6,
1009
+ "populated_ratio_avg": 0.0
1010
+ },
1011
+ "provenance_summary": {
1012
+ "total_results": 6,
1013
+ "total_groups": 6,
1014
+ "multi_source_groups": 0,
1015
+ "first_party_only_groups": 0,
1016
+ "source_type_distribution": {
1017
+ "first_party": 0,
1018
+ "third_party": 6,
1019
+ "collaborative": 0,
1020
+ "unspecified": 0
1021
+ }
1022
+ },
1023
+ "comparability_summary": {
1024
+ "total_groups": 6,
1025
+ "groups_with_variant_check": 0,
1026
+ "groups_with_cross_party_check": 0,
1027
+ "variant_divergent_count": 0,
1028
+ "cross_party_divergent_count": 0
1029
+ },
1030
+ "benchmark_names": [
1031
+ "BBH",
1032
+ "GPQA",
1033
+ "IFEval",
1034
+ "MATH Level 5",
1035
+ "MMLU-PRO",
1036
+ "MUSR"
1037
+ ],
1038
+ "top_benchmark_scores": [
1039
+ {
1040
+ "benchmark": "BBH",
1041
+ "benchmarkKey": "hfopenllm_v2_bbh",
1042
+ "canonical_display_name": "BBH / Accuracy",
1043
+ "evaluation_name": "BBH",
1044
+ "score": 0.6084,
1045
+ "metric": "Accuracy on BBH",
1046
+ "lower_is_better": false
1047
+ },
1048
+ {
1049
+ "benchmark": "IFEval",
1050
+ "benchmarkKey": "hfopenllm_v2_ifeval",
1051
+ "canonical_display_name": "IFEval / Accuracy",
1052
+ "evaluation_name": "IFEval",
1053
+ "score": 0.6067,
1054
+ "metric": "Accuracy on IFEval",
1055
+ "lower_is_better": false
1056
+ },
1057
+ {
1058
+ "benchmark": "MMLU-PRO",
1059
+ "benchmarkKey": "hfopenllm_v2_mmlu_pro",
1060
+ "canonical_display_name": "MMLU-PRO / Accuracy",
1061
+ "evaluation_name": "MMLU-PRO",
1062
+ "score": 0.452,
1063
+ "metric": "Accuracy on MMLU-PRO",
1064
+ "lower_is_better": false
1065
+ },
1066
+ {
1067
+ "benchmark": "MUSR",
1068
+ "benchmarkKey": "hfopenllm_v2_musr",
1069
+ "canonical_display_name": "MUSR / Accuracy",
1070
+ "evaluation_name": "MUSR",
1071
+ "score": 0.4282,
1072
+ "metric": "Accuracy on MUSR",
1073
+ "lower_is_better": false
1074
+ },
1075
+ {
1076
+ "benchmark": "GPQA",
1077
+ "benchmarkKey": "hfopenllm_v2_gpqa",
1078
+ "canonical_display_name": "GPQA / Accuracy",
1079
+ "evaluation_name": "GPQA",
1080
+ "score": 0.3649,
1081
+ "metric": "Accuracy on GPQA",
1082
+ "lower_is_better": false
1083
+ },
1084
+ {
1085
+ "benchmark": "MATH Level 5",
1086
+ "benchmarkKey": "hfopenllm_v2_math_level_5",
1087
+ "canonical_display_name": "MATH Level 5 / Exact Match",
1088
+ "evaluation_name": "MATH Level 5",
1089
+ "score": 0.2772,
1090
+ "metric": "Exact Match on MATH Level 5",
1091
+ "lower_is_better": false
1092
+ }
1093
+ ]
1094
+ },
1095
+ {
1096
+ "model_family_id": "01-ai/yi-1-5-34b-chat-16k",
1097
+ "model_route_id": "01-ai__yi-1-5-34b-chat-16k",
1098
+ "model_family_name": "Yi-1.5-34B-Chat-16K",
1099
+ "developer": "01-ai",
1100
+ "params_billions": 34.389,
1101
+ "total_evaluations": 1,
1102
+ "benchmark_count": 1,
1103
+ "benchmark_family_count": 1,
1104
+ "categories_covered": [
1105
+ "general",
1106
+ "knowledge",
1107
+ "reasoning"
1108
+ ],
1109
+ "last_updated": "2026-03-19T16:08:18.240187Z",
1110
+ "variants": [
1111
+ {
1112
+ "variant_key": "default",
1113
+ "variant_label": "Default",
1114
+ "evaluation_count": 1,
1115
+ "raw_model_ids": [
1116
+ "01-ai/Yi-1.5-34B-Chat-16K"
1117
+ ],
1118
+ "last_updated": "2026-03-19T16:08:18.240187Z"
1119
+ }
1120
+ ],
1121
+ "score_summary": {
1122
+ "count": 6,
1123
+ "min": 0.2137,
1124
+ "max": 0.61,
1125
+ "average": 0.41875
1126
+ },
1127
+ "reproducibility_summary": {
1128
+ "results_total": 6,
1129
+ "has_reproducibility_gap_count": 6,
1130
+ "populated_ratio_avg": 0.0
1131
+ },
1132
+ "provenance_summary": {
1133
+ "total_results": 6,
1134
+ "total_groups": 6,
1135
+ "multi_source_groups": 0,
1136
+ "first_party_only_groups": 0,
1137
+ "source_type_distribution": {
1138
+ "first_party": 0,
1139
+ "third_party": 6,
1140
+ "collaborative": 0,
1141
+ "unspecified": 0
1142
+ }
1143
+ },
1144
+ "comparability_summary": {
1145
+ "total_groups": 6,
1146
+ "groups_with_variant_check": 0,
1147
+ "groups_with_cross_party_check": 0,
1148
+ "variant_divergent_count": 0,
1149
+ "cross_party_divergent_count": 0
1150
+ },
1151
+ "benchmark_names": [
1152
+ "BBH",
1153
+ "GPQA",
1154
+ "IFEval",
1155
+ "MATH Level 5",
1156
+ "MMLU-PRO",
1157
+ "MUSR"
1158
+ ],
1159
+ "top_benchmark_scores": [
1160
+ {
1161
+ "benchmark": "BBH",
1162
+ "benchmarkKey": "hfopenllm_v2_bbh",
1163
+ "canonical_display_name": "BBH / Accuracy",
1164
+ "evaluation_name": "BBH",
1165
+ "score": 0.61,
1166
+ "metric": "Accuracy on BBH",
1167
+ "lower_is_better": false
1168
+ },
1169
+ {
1170
+ "benchmark": "IFEval",
1171
+ "benchmarkKey": "hfopenllm_v2_ifeval",
1172
+ "canonical_display_name": "IFEval / Accuracy",
1173
+ "evaluation_name": "IFEval",
1174
+ "score": 0.4564,
1175
+ "metric": "Accuracy on IFEval",
1176
+ "lower_is_better": false
1177
+ },
1178
+ {
1179
+ "benchmark": "MMLU-PRO",
1180
+ "benchmarkKey": "hfopenllm_v2_mmlu_pro",
1181
+ "canonical_display_name": "MMLU-PRO / Accuracy",
1182
+ "evaluation_name": "MMLU-PRO",
1183
+ "score": 0.4545,
1184
+ "metric": "Accuracy on MMLU-PRO",
1185
+ "lower_is_better": false
1186
+ },
1187
+ {
1188
+ "benchmark": "MUSR",
1189
+ "benchmarkKey": "hfopenllm_v2_musr",
1190
+ "canonical_display_name": "MUSR / Accuracy",
1191
+ "evaluation_name": "MUSR",
1192
+ "score": 0.4398,
1193
+ "metric": "Accuracy on MUSR",
1194
+ "lower_is_better": false
1195
+ },
1196
+ {
1197
+ "benchmark": "GPQA",
1198
+ "benchmarkKey": "hfopenllm_v2_gpqa",
1199
+ "canonical_display_name": "GPQA / Accuracy",
1200
+ "evaluation_name": "GPQA",
1201
+ "score": 0.3381,
1202
+ "metric": "Accuracy on GPQA",
1203
+ "lower_is_better": false
1204
+ },
1205
+ {
1206
+ "benchmark": "MATH Level 5",
1207
+ "benchmarkKey": "hfopenllm_v2_math_level_5",
1208
+ "canonical_display_name": "MATH Level 5 / Exact Match",
1209
+ "evaluation_name": "MATH Level 5",
1210
+ "score": 0.2137,
1211
+ "metric": "Exact Match on MATH Level 5",
1212
+ "lower_is_better": false
1213
+ }
1214
+ ]
1215
+ },
1216
+ {
1217
+ "model_family_id": "01-ai/yi-1-5-6b",
1218
+ "model_route_id": "01-ai__yi-1-5-6b",
1219
+ "model_family_name": "Yi-1.5-6B",
1220
+ "developer": "01-ai",
1221
+ "params_billions": 6.061,
1222
+ "total_evaluations": 1,
1223
+ "benchmark_count": 1,
1224
+ "benchmark_family_count": 1,
1225
+ "categories_covered": [
1226
+ "general",
1227
+ "knowledge",
1228
+ "reasoning"
1229
+ ],
1230
+ "last_updated": "2026-03-19T16:08:18.240187Z",
1231
+ "variants": [
1232
+ {
1233
+ "variant_key": "default",
1234
+ "variant_label": "Default",
1235
+ "evaluation_count": 1,
1236
+ "raw_model_ids": [
1237
+ "01-ai/Yi-1.5-6B"
1238
+ ],
1239
+ "last_updated": "2026-03-19T16:08:18.240187Z"
1240
+ }
1241
+ ],
1242
+ "score_summary": {
1243
+ "count": 6,
1244
+ "min": 0.0665,
1245
+ "max": 0.4493,
1246
+ "average": 0.3071833333333333
1247
+ },
1248
+ "reproducibility_summary": {
1249
+ "results_total": 6,
1250
+ "has_reproducibility_gap_count": 6,
1251
+ "populated_ratio_avg": 0.0
1252
+ },
1253
+ "provenance_summary": {
1254
+ "total_results": 6,
1255
+ "total_groups": 6,
1256
+ "multi_source_groups": 0,
1257
+ "first_party_only_groups": 0,
1258
+ "source_type_distribution": {
1259
+ "first_party": 0,
1260
+ "third_party": 6,
1261
+ "collaborative": 0,
1262
+ "unspecified": 0
1263
+ }
1264
+ },
1265
+ "comparability_summary": {
1266
+ "total_groups": 6,
1267
+ "groups_with_variant_check": 0,
1268
+ "groups_with_cross_party_check": 0,
1269
+ "variant_divergent_count": 0,
1270
+ "cross_party_divergent_count": 0
1271
+ },
1272
+ "benchmark_names": [
1273
+ "BBH",
1274
+ "GPQA",
1275
+ "IFEval",
1276
+ "MATH Level 5",
1277
+ "MMLU-PRO",
1278
+ "MUSR"
1279
+ ],
1280
+ "top_benchmark_scores": [
1281
+ {
1282
+ "benchmark": "BBH",
1283
+ "benchmarkKey": "hfopenllm_v2_bbh",
1284
+ "canonical_display_name": "BBH / Accuracy",
1285
+ "evaluation_name": "BBH",
1286
+ "score": 0.4493,
1287
+ "metric": "Accuracy on BBH",
1288
+ "lower_is_better": false
1289
+ },
1290
+ {
1291
+ "benchmark": "MUSR",
1292
+ "benchmarkKey": "hfopenllm_v2_musr",
1293
+ "canonical_display_name": "MUSR / Accuracy",
1294
+ "evaluation_name": "MUSR",
1295
+ "score": 0.4374,
1296
+ "metric": "Accuracy on MUSR",
1297
+ "lower_is_better": false
1298
+ },
1299
+ {
1300
+ "benchmark": "MMLU-PRO",
1301
+ "benchmarkKey": "hfopenllm_v2_mmlu_pro",
1302
+ "canonical_display_name": "MMLU-PRO / Accuracy",
1303
+ "evaluation_name": "MMLU-PRO",
1304
+ "score": 0.3144,
1305
+ "metric": "Accuracy on MMLU-PRO",
1306
+ "lower_is_better": false
1307
+ },
1308
+ {
1309
+ "benchmark": "GPQA",
1310
+ "benchmarkKey": "hfopenllm_v2_gpqa",
1311
+ "canonical_display_name": "GPQA / Accuracy",
1312
+ "evaluation_name": "GPQA",
1313
+ "score": 0.3138,
1314
+ "metric": "Accuracy on GPQA",
1315
+ "lower_is_better": false
1316
+ },
1317
+ {
1318
+ "benchmark": "IFEval",
1319
+ "benchmarkKey": "hfopenllm_v2_ifeval",
1320
+ "canonical_display_name": "IFEval / Accuracy",
1321
+ "evaluation_name": "IFEval",
1322
+ "score": 0.2617,
1323
+ "metric": "Accuracy on IFEval",
1324
+ "lower_is_better": false
1325
+ },
1326
+ {
1327
+ "benchmark": "MATH Level 5",
1328
+ "benchmarkKey": "hfopenllm_v2_math_level_5",
1329
+ "canonical_display_name": "MATH Level 5 / Exact Match",
1330
+ "evaluation_name": "MATH Level 5",
1331
+ "score": 0.0665,
1332
+ "metric": "Exact Match on MATH Level 5",
1333
+ "lower_is_better": false
1334
+ }
1335
+ ]
1336
+ },
1337
+ {
1338
+ "model_family_id": "01-ai/yi-1-5-6b-chat",
1339
+ "model_route_id": "01-ai__yi-1-5-6b-chat",
1340
+ "model_family_name": "Yi-1.5-6B-Chat",
1341
+ "developer": "01-ai",
1342
+ "params_billions": 6.061,
1343
+ "total_evaluations": 1,
1344
+ "benchmark_count": 1,
1345
+ "benchmark_family_count": 1,
1346
+ "categories_covered": [
1347
+ "general",
1348
+ "knowledge",
1349
+ "reasoning"
1350
+ ],
1351
+ "last_updated": "2026-03-19T16:08:18.240187Z",
1352
+ "variants": [
1353
+ {
1354
+ "variant_key": "default",
1355
+ "variant_label": "Default",
1356
+ "evaluation_count": 1,
1357
+ "raw_model_ids": [
1358
+ "01-ai/Yi-1.5-6B-Chat"
1359
+ ],
1360
+ "last_updated": "2026-03-19T16:08:18.240187Z"
1361
+ }
1362
+ ],
1363
+ "score_summary": {
1364
+ "count": 6,
1365
+ "min": 0.1624,
1366
+ "max": 0.5145,
1367
+ "average": 0.36575
1368
+ },
1369
+ "reproducibility_summary": {
1370
+ "results_total": 6,
1371
+ "has_reproducibility_gap_count": 6,
1372
+ "populated_ratio_avg": 0.0
1373
+ },
1374
+ "provenance_summary": {
1375
+ "total_results": 6,
1376
+ "total_groups": 6,
1377
+ "multi_source_groups": 0,
1378
+ "first_party_only_groups": 0,
1379
+ "source_type_distribution": {
1380
+ "first_party": 0,
1381
+ "third_party": 6,
1382
+ "collaborative": 0,
1383
+ "unspecified": 0
1384
+ }
1385
+ },
1386
+ "comparability_summary": {
1387
+ "total_groups": 6,
1388
+ "groups_with_variant_check": 0,
1389
+ "groups_with_cross_party_check": 0,
1390
+ "variant_divergent_count": 0,
1391
+ "cross_party_divergent_count": 0
1392
+ },
1393
+ "benchmark_names": [
1394
+ "BBH",
1395
+ "GPQA",
1396
+ "IFEval",
1397
+ "MATH Level 5",
1398
+ "MMLU-PRO",
1399
+ "MUSR"
1400
+ ],
1401
+ "top_benchmark_scores": [
1402
+ {
1403
+ "benchmark": "IFEval",
1404
+ "benchmarkKey": "hfopenllm_v2_ifeval",
1405
+ "canonical_display_name": "IFEval / Accuracy",
1406
+ "evaluation_name": "IFEval",
1407
+ "score": 0.5145,
1408
+ "metric": "Accuracy on IFEval",
1409
+ "lower_is_better": false
1410
+ },
1411
+ {
1412
+ "benchmark": "BBH",
1413
+ "benchmarkKey": "hfopenllm_v2_bbh",
1414
+ "canonical_display_name": "BBH / Accuracy",
1415
+ "evaluation_name": "BBH",
1416
+ "score": 0.4571,
1417
+ "metric": "Accuracy on BBH",
1418
+ "lower_is_better": false
1419
+ },
1420
+ {
1421
+ "benchmark": "MUSR",
1422
+ "benchmarkKey": "hfopenllm_v2_musr",
1423
+ "canonical_display_name": "MUSR / Accuracy",
1424
+ "evaluation_name": "MUSR",
1425
+ "score": 0.4392,
1426
+ "metric": "Accuracy on MUSR",
1427
+ "lower_is_better": false
1428
+ },
1429
+ {
1430
+ "benchmark": "MMLU-PRO",
1431
+ "benchmarkKey": "hfopenllm_v2_mmlu_pro",
1432
+ "canonical_display_name": "MMLU-PRO / Accuracy",
1433
+ "evaluation_name": "MMLU-PRO",
1434
+ "score": 0.3193,
1435
+ "metric": "Accuracy on MMLU-PRO",
1436
+ "lower_is_better": false
1437
+ },
1438
+ {
1439
+ "benchmark": "GPQA",
1440
+ "benchmarkKey": "hfopenllm_v2_gpqa",
1441
+ "canonical_display_name": "GPQA / Accuracy",
1442
+ "evaluation_name": "GPQA",
1443
+ "score": 0.302,
1444
+ "metric": "Accuracy on GPQA",
1445
+ "lower_is_better": false
1446
+ },
1447
+ {
1448
+ "benchmark": "MATH Level 5",
1449
+ "benchmarkKey": "hfopenllm_v2_math_level_5",
1450
+ "canonical_display_name": "MATH Level 5 / Exact Match",
1451
+ "evaluation_name": "MATH Level 5",
1452
+ "score": 0.1624,
1453
+ "metric": "Exact Match on MATH Level 5",
1454
+ "lower_is_better": false
1455
+ }
1456
+ ]
1457
+ },
1458
+ {
1459
+ "model_family_id": "01-ai/yi-1-5-9b",
1460
+ "model_route_id": "01-ai__yi-1-5-9b",
1461
+ "model_family_name": "Yi-1.5-9B",
1462
+ "developer": "01-ai",
1463
+ "params_billions": 8.829,
1464
+ "total_evaluations": 1,
1465
+ "benchmark_count": 1,
1466
+ "benchmark_family_count": 1,
1467
+ "categories_covered": [
1468
+ "general",
1469
+ "knowledge",
1470
+ "reasoning"
1471
+ ],
1472
+ "last_updated": "2026-03-19T16:08:18.240187Z",
1473
+ "variants": [
1474
+ {
1475
+ "variant_key": "default",
1476
+ "variant_label": "Default",
1477
+ "evaluation_count": 1,
1478
+ "raw_model_ids": [
1479
+ "01-ai/Yi-1.5-9B"
1480
+ ],
1481
+ "last_updated": "2026-03-19T16:08:18.240187Z"
1482
+ }
1483
+ ],
1484
+ "score_summary": {
1485
+ "count": 6,
1486
+ "min": 0.114,
1487
+ "max": 0.5143,
1488
+ "average": 0.35425
1489
+ },
1490
+ "reproducibility_summary": {
1491
+ "results_total": 6,
1492
+ "has_reproducibility_gap_count": 6,
1493
+ "populated_ratio_avg": 0.0
1494
+ },
1495
+ "provenance_summary": {
1496
+ "total_results": 6,
1497
+ "total_groups": 6,
1498
+ "multi_source_groups": 0,
1499
+ "first_party_only_groups": 0,
1500
+ "source_type_distribution": {
1501
+ "first_party": 0,
1502
+ "third_party": 6,
1503
+ "collaborative": 0,
1504
+ "unspecified": 0
1505
+ }
1506
+ },
1507
+ "comparability_summary": {
1508
+ "total_groups": 6,
1509
+ "groups_with_variant_check": 0,
1510
+ "groups_with_cross_party_check": 0,
1511
+ "variant_divergent_count": 0,
1512
+ "cross_party_divergent_count": 0
1513
+ },
1514
+ "benchmark_names": [
1515
+ "BBH",
1516
+ "GPQA",
1517
+ "IFEval",
1518
+ "MATH Level 5",
1519
+ "MMLU-PRO",
1520
+ "MUSR"
1521
+ ],
1522
+ "top_benchmark_scores": [
1523
+ {
1524
+ "benchmark": "BBH",
1525
+ "benchmarkKey": "hfopenllm_v2_bbh",
1526
+ "canonical_display_name": "BBH / Accuracy",
1527
+ "evaluation_name": "BBH",
1528
+ "score": 0.5143,
1529
+ "metric": "Accuracy on BBH",
1530
+ "lower_is_better": false
1531
+ },
1532
+ {
1533
+ "benchmark": "MUSR",
1534
+ "benchmarkKey": "hfopenllm_v2_musr",
1535
+ "canonical_display_name": "MUSR / Accuracy",
1536
+ "evaluation_name": "MUSR",
1537
+ "score": 0.4328,
1538
+ "metric": "Accuracy on MUSR",
1539
+ "lower_is_better": false
1540
+ },
1541
+ {
1542
+ "benchmark": "MMLU-PRO",
1543
+ "benchmarkKey": "hfopenllm_v2_mmlu_pro",
1544
+ "canonical_display_name": "MMLU-PRO / Accuracy",
1545
+ "evaluation_name": "MMLU-PRO",
1546
+ "score": 0.3916,
1547
+ "metric": "Accuracy on MMLU-PRO",
1548
+ "lower_is_better": false
1549
+ },
1550
+ {
1551
+ "benchmark": "GPQA",
1552
+ "benchmarkKey": "hfopenllm_v2_gpqa",
1553
+ "canonical_display_name": "GPQA / Accuracy",
1554
+ "evaluation_name": "GPQA",
1555
+ "score": 0.3792,
1556
+ "metric": "Accuracy on GPQA",
1557
+ "lower_is_better": false
1558
+ },
1559
+ {
1560
+ "benchmark": "IFEval",
1561
+ "benchmarkKey": "hfopenllm_v2_ifeval",
1562
+ "canonical_display_name": "IFEval / Accuracy",
1563
+ "evaluation_name": "IFEval",
1564
+ "score": 0.2936,
1565
+ "metric": "Accuracy on IFEval",
1566
+ "lower_is_better": false
1567
+ },
1568
+ {
1569
+ "benchmark": "MATH Level 5",
1570
+ "benchmarkKey": "hfopenllm_v2_math_level_5",
1571
+ "canonical_display_name": "MATH Level 5 / Exact Match",
1572
+ "evaluation_name": "MATH Level 5",
1573
+ "score": 0.114,
1574
+ "metric": "Exact Match on MATH Level 5",
1575
+ "lower_is_better": false
1576
+ }
1577
+ ]
1578
+ },
1579
+ {
1580
+ "model_family_id": "01-ai/yi-1-5-9b-32k",
1581
+ "model_route_id": "01-ai__yi-1-5-9b-32k",
1582
+ "model_family_name": "Yi-1.5-9B-32K",
1583
+ "developer": "01-ai",
1584
+ "params_billions": 8.829,
1585
+ "total_evaluations": 1,
1586
+ "benchmark_count": 1,
1587
+ "benchmark_family_count": 1,
1588
+ "categories_covered": [
1589
+ "general",
1590
+ "knowledge",
1591
+ "reasoning"
1592
+ ],
1593
+ "last_updated": "2026-03-19T16:08:18.240187Z",
1594
+ "variants": [
1595
+ {
1596
+ "variant_key": "default",
1597
+ "variant_label": "Default",
1598
+ "evaluation_count": 1,
1599
+ "raw_model_ids": [
1600
+ "01-ai/Yi-1.5-9B-32K"
1601
+ ],
1602
+ "last_updated": "2026-03-19T16:08:18.240187Z"
1603
+ }
1604
+ ],
1605
+ "score_summary": {
1606
+ "count": 6,
1607
+ "min": 0.108,
1608
+ "max": 0.4963,
1609
+ "average": 0.3314666666666667
1610
+ },
1611
+ "reproducibility_summary": {
1612
+ "results_total": 6,
1613
+ "has_reproducibility_gap_count": 6,
1614
+ "populated_ratio_avg": 0.0
1615
+ },
1616
+ "provenance_summary": {
1617
+ "total_results": 6,
1618
+ "total_groups": 6,
1619
+ "multi_source_groups": 0,
1620
+ "first_party_only_groups": 0,
1621
+ "source_type_distribution": {
1622
+ "first_party": 0,
1623
+ "third_party": 6,
1624
+ "collaborative": 0,
1625
+ "unspecified": 0
1626
+ }
1627
+ },
1628
+ "comparability_summary": {
1629
+ "total_groups": 6,
1630
+ "groups_with_variant_check": 0,
1631
+ "groups_with_cross_party_check": 0,
1632
+ "variant_divergent_count": 0,
1633
+ "cross_party_divergent_count": 0
1634
+ },
1635
+ "benchmark_names": [
1636
+ "BBH",
1637
+ "GPQA",
1638
+ "IFEval",
1639
+ "MATH Level 5",
1640
+ "MMLU-PRO",
1641
+ "MUSR"
1642
+ ],
1643
+ "top_benchmark_scores": [
1644
+ {
1645
+ "benchmark": "BBH",
1646
+ "benchmarkKey": "hfopenllm_v2_bbh",
1647
+ "canonical_display_name": "BBH / Accuracy",
1648
+ "evaluation_name": "BBH",
1649
+ "score": 0.4963,
1650
+ "metric": "Accuracy on BBH",
1651
+ "lower_is_better": false
1652
+ },
1653
+ {
1654
+ "benchmark": "MUSR",
1655
+ "benchmarkKey": "hfopenllm_v2_musr",
1656
+ "canonical_display_name": "MUSR / Accuracy",
1657
+ "evaluation_name": "MUSR",
1658
+ "score": 0.4186,
1659
+ "metric": "Accuracy on MUSR",
1660
+ "lower_is_better": false
1661
+ },
1662
+ {
1663
+ "benchmark": "MMLU-PRO",
1664
+ "benchmarkKey": "hfopenllm_v2_mmlu_pro",
1665
+ "canonical_display_name": "MMLU-PRO / Accuracy",
1666
+ "evaluation_name": "MMLU-PRO",
1667
+ "score": 0.3765,
1668
+ "metric": "Accuracy on MMLU-PRO",
1669
+ "lower_is_better": false
1670
+ },
1671
+ {
1672
+ "benchmark": "GPQA",
1673
+ "benchmarkKey": "hfopenllm_v2_gpqa",
1674
+ "canonical_display_name": "GPQA / Accuracy",
1675
+ "evaluation_name": "GPQA",
1676
+ "score": 0.3591,
1677
+ "metric": "Accuracy on GPQA",
1678
+ "lower_is_better": false
1679
+ },
1680
+ {
1681
+ "benchmark": "IFEval",
1682
+ "benchmarkKey": "hfopenllm_v2_ifeval",
1683
+ "canonical_display_name": "IFEval / Accuracy",
1684
+ "evaluation_name": "IFEval",
1685
+ "score": 0.2303,
1686
+ "metric": "Accuracy on IFEval",
1687
+ "lower_is_better": false
1688
+ },
1689
+ {
1690
+ "benchmark": "MATH Level 5",
1691
+ "benchmarkKey": "hfopenllm_v2_math_level_5",
1692
+ "canonical_display_name": "MATH Level 5 / Exact Match",
1693
+ "evaluation_name": "MATH Level 5",
1694
+ "score": 0.108,
1695
+ "metric": "Exact Match on MATH Level 5",
1696
+ "lower_is_better": false
1697
+ }
1698
+ ]
1699
+ },
1700
+ {
1701
+ "model_family_id": "01-ai/yi-1-5-9b-chat",
1702
+ "model_route_id": "01-ai__yi-1-5-9b-chat",
1703
+ "model_family_name": "Yi-1.5-9B-Chat",
1704
+ "developer": "01-ai",
1705
+ "params_billions": 8.829,
1706
+ "total_evaluations": 1,
1707
+ "benchmark_count": 1,
1708
+ "benchmark_family_count": 1,
1709
+ "categories_covered": [
1710
+ "general",
1711
+ "knowledge",
1712
+ "reasoning"
1713
+ ],
1714
+ "last_updated": "2026-03-19T16:08:18.240187Z",
1715
+ "variants": [
1716
+ {
1717
+ "variant_key": "default",
1718
+ "variant_label": "Default",
1719
+ "evaluation_count": 1,
1720
+ "raw_model_ids": [
1721
+ "01-ai/Yi-1.5-9B-Chat"
1722
+ ],
1723
+ "last_updated": "2026-03-19T16:08:18.240187Z"
1724
+ }
1725
+ ],
1726
+ "score_summary": {
1727
+ "count": 6,
1728
+ "min": 0.2258,
1729
+ "max": 0.6046,
1730
+ "average": 0.42406666666666665
1731
+ },
1732
+ "reproducibility_summary": {
1733
+ "results_total": 6,
1734
+ "has_reproducibility_gap_count": 6,
1735
+ "populated_ratio_avg": 0.0
1736
+ },
1737
+ "provenance_summary": {
1738
+ "total_results": 6,
1739
+ "total_groups": 6,
1740
+ "multi_source_groups": 0,
1741
+ "first_party_only_groups": 0,
1742
+ "source_type_distribution": {
1743
+ "first_party": 0,
1744
+ "third_party": 6,
1745
+ "collaborative": 0,
1746
+ "unspecified": 0
1747
+ }
1748
+ },
1749
+ "comparability_summary": {
1750
+ "total_groups": 6,
1751
+ "groups_with_variant_check": 0,
1752
+ "groups_with_cross_party_check": 0,
1753
+ "variant_divergent_count": 0,
1754
+ "cross_party_divergent_count": 0
1755
+ },
1756
+ "benchmark_names": [
1757
+ "BBH",
1758
+ "GPQA",
1759
+ "IFEval",
1760
+ "MATH Level 5",
1761
+ "MMLU-PRO",
1762
+ "MUSR"
1763
+ ],
1764
+ "top_benchmark_scores": [
1765
+ {
1766
+ "benchmark": "IFEval",
1767
+ "benchmarkKey": "hfopenllm_v2_ifeval",
1768
+ "canonical_display_name": "IFEval / Accuracy",
1769
+ "evaluation_name": "IFEval",
1770
+ "score": 0.6046,
1771
+ "metric": "Accuracy on IFEval",
1772
+ "lower_is_better": false
1773
+ },
1774
+ {
1775
+ "benchmark": "BBH",
1776
+ "benchmarkKey": "hfopenllm_v2_bbh",
1777
+ "canonical_display_name": "BBH / Accuracy",
1778
+ "evaluation_name": "BBH",
1779
+ "score": 0.5559,
1780
+ "metric": "Accuracy on BBH",
1781
+ "lower_is_better": false
1782
+ },
1783
+ {
1784
+ "benchmark": "MUSR",
1785
+ "benchmarkKey": "hfopenllm_v2_musr",
1786
+ "canonical_display_name": "MUSR / Accuracy",
1787
+ "evaluation_name": "MUSR",
1788
+ "score": 0.4259,
1789
+ "metric": "Accuracy on MUSR",
1790
+ "lower_is_better": false
1791
+ },
1792
+ {
1793
+ "benchmark": "MMLU-PRO",
1794
+ "benchmarkKey": "hfopenllm_v2_mmlu_pro",
1795
+ "canonical_display_name": "MMLU-PRO / Accuracy",
1796
+ "evaluation_name": "MMLU-PRO",
1797
+ "score": 0.3975,
1798
+ "metric": "Accuracy on MMLU-PRO",
1799
+ "lower_is_better": false
1800
+ },
1801
+ {
1802
+ "benchmark": "GPQA",
1803
+ "benchmarkKey": "hfopenllm_v2_gpqa",
1804
+ "canonical_display_name": "GPQA / Accuracy",
1805
+ "evaluation_name": "GPQA",
1806
+ "score": 0.3347,
1807
+ "metric": "Accuracy on GPQA",
1808
+ "lower_is_better": false
1809
+ },
1810
+ {
1811
+ "benchmark": "MATH Level 5",
1812
+ "benchmarkKey": "hfopenllm_v2_math_level_5",
1813
+ "canonical_display_name": "MATH Level 5 / Exact Match",
1814
+ "evaluation_name": "MATH Level 5",
1815
+ "score": 0.2258,
1816
+ "metric": "Exact Match on MATH Level 5",
1817
+ "lower_is_better": false
1818
+ }
1819
+ ]
1820
+ },
1821
+ {
1822
+ "model_family_id": "01-ai/yi-1-5-9b-chat-16k",
1823
+ "model_route_id": "01-ai__yi-1-5-9b-chat-16k",
1824
+ "model_family_name": "Yi-1.5-9B-Chat-16K",
1825
+ "developer": "01-ai",
1826
+ "params_billions": 8.829,
1827
+ "total_evaluations": 1,
1828
+ "benchmark_count": 1,
1829
+ "benchmark_family_count": 1,
1830
+ "categories_covered": [
1831
+ "general",
1832
+ "knowledge",
1833
+ "reasoning"
1834
+ ],
1835
+ "last_updated": "2026-03-19T16:08:18.240187Z",
1836
+ "variants": [
1837
+ {
1838
+ "variant_key": "default",
1839
+ "variant_label": "Default",
1840
+ "evaluation_count": 1,
1841
+ "raw_model_ids": [
1842
+ "01-ai/Yi-1.5-9B-Chat-16K"
1843
+ ],
1844
+ "last_updated": "2026-03-19T16:08:18.240187Z"
1845
+ }
1846
+ ],
1847
+ "score_summary": {
1848
+ "count": 6,
1849
+ "min": 0.1782,
1850
+ "max": 0.5153,
1851
+ "average": 0.37215
1852
+ },
1853
+ "reproducibility_summary": {
1854
+ "results_total": 6,
1855
+ "has_reproducibility_gap_count": 6,
1856
+ "populated_ratio_avg": 0.0
1857
+ },
1858
+ "provenance_summary": {
1859
+ "total_results": 6,
1860
+ "total_groups": 6,
1861
+ "multi_source_groups": 0,
1862
+ "first_party_only_groups": 0,
1863
+ "source_type_distribution": {
1864
+ "first_party": 0,
1865
+ "third_party": 6,
1866
+ "collaborative": 0,
1867
+ "unspecified": 0
1868
+ }
1869
+ },
1870
+ "comparability_summary": {
1871
+ "total_groups": 6,
1872
+ "groups_with_variant_check": 0,
1873
+ "groups_with_cross_party_check": 0,
1874
+ "variant_divergent_count": 0,
1875
+ "cross_party_divergent_count": 0
1876
+ },
1877
+ "benchmark_names": [
1878
+ "BBH",
1879
+ "GPQA",
1880
+ "IFEval",
1881
+ "MATH Level 5",
1882
+ "MMLU-PRO",
1883
+ "MUSR"
1884
+ ],
1885
+ "top_benchmark_scores": [
1886
+ {
1887
+ "benchmark": "BBH",
1888
+ "benchmarkKey": "hfopenllm_v2_bbh",
1889
+ "canonical_display_name": "BBH / Accuracy",
1890
+ "evaluation_name": "BBH",
1891
+ "score": 0.5153,
1892
+ "metric": "Accuracy on BBH",
1893
+ "lower_is_better": false
1894
+ },
1895
+ {
1896
+ "benchmark": "IFEval",
1897
+ "benchmarkKey": "hfopenllm_v2_ifeval",
1898
+ "canonical_display_name": "IFEval / Accuracy",
1899
+ "evaluation_name": "IFEval",
1900
+ "score": 0.4214,
1901
+ "metric": "Accuracy on IFEval",
1902
+ "lower_is_better": false
1903
+ },
1904
+ {
1905
+ "benchmark": "MUSR",
1906
+ "benchmarkKey": "hfopenllm_v2_musr",
1907
+ "canonical_display_name": "MUSR / Accuracy",
1908
+ "evaluation_name": "MUSR",
1909
+ "score": 0.4099,
1910
+ "metric": "Accuracy on MUSR",
1911
+ "lower_is_better": false
1912
+ },
1913
+ {
1914
+ "benchmark": "MMLU-PRO",
1915
+ "benchmarkKey": "hfopenllm_v2_mmlu_pro",
1916
+ "canonical_display_name": "MMLU-PRO / Accuracy",
1917
+ "evaluation_name": "MMLU-PRO",
1918
+ "score": 0.3994,
1919
+ "metric": "Accuracy on MMLU-PRO",
1920
+ "lower_is_better": false
1921
+ },
1922
+ {
1923
+ "benchmark": "GPQA",
1924
+ "benchmarkKey": "hfopenllm_v2_gpqa",
1925
+ "canonical_display_name": "GPQA / Accuracy",
1926
+ "evaluation_name": "GPQA",
1927
+ "score": 0.3087,
1928
+ "metric": "Accuracy on GPQA",
1929
+ "lower_is_better": false
1930
+ },
1931
+ {
1932
+ "benchmark": "MATH Level 5",
1933
+ "benchmarkKey": "hfopenllm_v2_math_level_5",
1934
+ "canonical_display_name": "MATH Level 5 / Exact Match",
1935
+ "evaluation_name": "MATH Level 5",
1936
+ "score": 0.1782,
1937
+ "metric": "Exact Match on MATH Level 5",
1938
+ "lower_is_better": false
1939
+ }
1940
+ ]
1941
+ },
1942
+ {
1943
+ "model_family_id": "01-ai/yi-34b-200k",
1944
+ "model_route_id": "01-ai__yi-34b-200k",
1945
+ "model_family_name": "Yi-34B-200K",
1946
+ "developer": "01-ai",
1947
+ "params_billions": 34.389,
1948
+ "total_evaluations": 1,
1949
+ "benchmark_count": 1,
1950
+ "benchmark_family_count": 1,
1951
+ "categories_covered": [
1952
+ "general",
1953
+ "knowledge",
1954
+ "reasoning"
1955
+ ],
1956
+ "last_updated": "2026-03-19T16:08:18.240187Z",
1957
+ "variants": [
1958
+ {
1959
+ "variant_key": "default",
1960
+ "variant_label": "Default",
1961
+ "evaluation_count": 1,
1962
+ "raw_model_ids": [
1963
+ "01-ai/Yi-34B-200K"
1964
+ ],
1965
+ "last_updated": "2026-03-19T16:08:18.240187Z"
1966
+ }
1967
+ ],
1968
+ "score_summary": {
1969
+ "count": 6,
1970
+ "min": 0.0574,
1971
+ "max": 0.5442,
1972
+ "average": 0.32458333333333333
1973
+ },
1974
+ "reproducibility_summary": {
1975
+ "results_total": 6,
1976
+ "has_reproducibility_gap_count": 6,
1977
+ "populated_ratio_avg": 0.0
1978
+ },
1979
+ "provenance_summary": {
1980
+ "total_results": 6,
1981
+ "total_groups": 6,
1982
+ "multi_source_groups": 0,
1983
+ "first_party_only_groups": 0,
1984
+ "source_type_distribution": {
1985
+ "first_party": 0,
1986
+ "third_party": 6,
1987
+ "collaborative": 0,
1988
+ "unspecified": 0
1989
+ }
1990
+ },
1991
+ "comparability_summary": {
1992
+ "total_groups": 6,
1993
+ "groups_with_variant_check": 0,
1994
+ "groups_with_cross_party_check": 0,
1995
+ "variant_divergent_count": 0,
1996
+ "cross_party_divergent_count": 0
1997
+ },
1998
+ "benchmark_names": [
1999
+ "BBH",
2000
+ "GPQA",
2001
+ "IFEval",
2002
+ "MATH Level 5",
2003
+ "MMLU-PRO",
2004
+ "MUSR"
2005
+ ],
2006
+ "top_benchmark_scores": [
2007
+ {
2008
+ "benchmark": "BBH",
2009
+ "benchmarkKey": "hfopenllm_v2_bbh",
2010
+ "canonical_display_name": "BBH / Accuracy",
2011
+ "evaluation_name": "BBH",
2012
+ "score": 0.5442,
2013
+ "metric": "Accuracy on BBH",
2014
+ "lower_is_better": false
2015
+ },
2016
+ {
2017
+ "benchmark": "MMLU-PRO",
2018
+ "benchmarkKey": "hfopenllm_v2_mmlu_pro",
2019
+ "canonical_display_name": "MMLU-PRO / Accuracy",
2020
+ "evaluation_name": "MMLU-PRO",
2021
+ "score": 0.4535,
2022
+ "metric": "Accuracy on MMLU-PRO",
2023
+ "lower_is_better": false
2024
+ },
2025
+ {
2026
+ "benchmark": "MUSR",
2027
+ "benchmarkKey": "hfopenllm_v2_musr",
2028
+ "canonical_display_name": "MUSR / Accuracy",
2029
+ "evaluation_name": "MUSR",
2030
+ "score": 0.3817,
2031
+ "metric": "Accuracy on MUSR",
2032
+ "lower_is_better": false
2033
+ },
2034
+ {
2035
+ "benchmark": "GPQA",
2036
+ "benchmarkKey": "hfopenllm_v2_gpqa",
2037
+ "canonical_display_name": "GPQA / Accuracy",
2038
+ "evaluation_name": "GPQA",
2039
+ "score": 0.3565,
2040
+ "metric": "Accuracy on GPQA",
2041
+ "lower_is_better": false
2042
+ },
2043
+ {
2044
+ "benchmark": "IFEval",
2045
+ "benchmarkKey": "hfopenllm_v2_ifeval",
2046
+ "canonical_display_name": "IFEval / Accuracy",
2047
+ "evaluation_name": "IFEval",
2048
+ "score": 0.1542,
2049
+ "metric": "Accuracy on IFEval",
2050
+ "lower_is_better": false
2051
+ },
2052
+ {
2053
+ "benchmark": "MATH Level 5",
2054
+ "benchmarkKey": "hfopenllm_v2_math_level_5",
2055
+ "canonical_display_name": "MATH Level 5 / Exact Match",
2056
+ "evaluation_name": "MATH Level 5",
2057
+ "score": 0.0574,
2058
+ "metric": "Exact Match on MATH Level 5",
2059
+ "lower_is_better": false
2060
+ }
2061
+ ]
2062
+ },
2063
+ {
2064
+ "model_family_id": "01-ai/yi-6b-200k",
2065
+ "model_route_id": "01-ai__yi-6b-200k",
2066
+ "model_family_name": "Yi-6B-200K",
2067
+ "developer": "01-ai",
2068
+ "params_billions": 6.061,
2069
+ "total_evaluations": 1,
2070
+ "benchmark_count": 1,
2071
+ "benchmark_family_count": 1,
2072
+ "categories_covered": [
2073
+ "general",
2074
+ "knowledge",
2075
+ "reasoning"
2076
+ ],
2077
+ "last_updated": "2026-03-19T16:08:18.240187Z",
2078
+ "variants": [
2079
+ {
2080
+ "variant_key": "default",
2081
+ "variant_label": "Default",
2082
+ "evaluation_count": 1,
2083
+ "raw_model_ids": [
2084
+ "01-ai/Yi-6B-200K"
2085
+ ],
2086
+ "last_updated": "2026-03-19T16:08:18.240187Z"
2087
+ }
2088
+ ],
2089
+ "score_summary": {
2090
+ "count": 6,
2091
+ "min": 0.0181,
2092
+ "max": 0.4587,
2093
+ "average": 0.25938333333333335
2094
+ },
2095
+ "reproducibility_summary": {
2096
+ "results_total": 6,
2097
+ "has_reproducibility_gap_count": 6,
2098
+ "populated_ratio_avg": 0.0
2099
+ },
2100
+ "provenance_summary": {
2101
+ "total_results": 6,
2102
+ "total_groups": 6,
2103
+ "multi_source_groups": 0,
2104
+ "first_party_only_groups": 0,
2105
+ "source_type_distribution": {
2106
+ "first_party": 0,
2107
+ "third_party": 6,
2108
+ "collaborative": 0,
2109
+ "unspecified": 0
2110
+ }
2111
+ },
2112
+ "comparability_summary": {
2113
+ "total_groups": 6,
2114
+ "groups_with_variant_check": 0,
2115
+ "groups_with_cross_party_check": 0,
2116
+ "variant_divergent_count": 0,
2117
+ "cross_party_divergent_count": 0
2118
+ },
2119
+ "benchmark_names": [
2120
+ "BBH",
2121
+ "GPQA",
2122
+ "IFEval",
2123
+ "MATH Level 5",
2124
+ "MMLU-PRO",
2125
+ "MUSR"
2126
+ ],
2127
+ "top_benchmark_scores": [
2128
+ {
2129
+ "benchmark": "MUSR",
2130
+ "benchmarkKey": "hfopenllm_v2_musr",
2131
+ "canonical_display_name": "MUSR / Accuracy",
2132
+ "evaluation_name": "MUSR",
2133
+ "score": 0.4587,
2134
+ "metric": "Accuracy on MUSR",
2135
+ "lower_is_better": false
2136
+ },
2137
+ {
2138
+ "benchmark": "BBH",
2139
+ "benchmarkKey": "hfopenllm_v2_bbh",
2140
+ "canonical_display_name": "BBH / Accuracy",
2141
+ "evaluation_name": "BBH",
2142
+ "score": 0.4289,
2143
+ "metric": "Accuracy on BBH",
2144
+ "lower_is_better": false
2145
+ },
2146
+ {
2147
+ "benchmark": "MMLU-PRO",
2148
+ "benchmarkKey": "hfopenllm_v2_mmlu_pro",
2149
+ "canonical_display_name": "MMLU-PRO / Accuracy",
2150
+ "evaluation_name": "MMLU-PRO",
2151
+ "score": 0.2844,
2152
+ "metric": "Accuracy on MMLU-PRO",
2153
+ "lower_is_better": false
2154
+ },
2155
+ {
2156
+ "benchmark": "GPQA",
2157
+ "benchmarkKey": "hfopenllm_v2_gpqa",
2158
+ "canonical_display_name": "GPQA / Accuracy",
2159
+ "evaluation_name": "GPQA",
2160
+ "score": 0.2819,
2161
+ "metric": "Accuracy on GPQA",
2162
+ "lower_is_better": false
2163
+ },
2164
+ {
2165
+ "benchmark": "IFEval",
2166
+ "benchmarkKey": "hfopenllm_v2_ifeval",
2167
+ "canonical_display_name": "IFEval / Accuracy",
2168
+ "evaluation_name": "IFEval",
2169
+ "score": 0.0843,
2170
+ "metric": "Accuracy on IFEval",
2171
+ "lower_is_better": false
2172
+ },
2173
+ {
2174
+ "benchmark": "MATH Level 5",
2175
+ "benchmarkKey": "hfopenllm_v2_math_level_5",
2176
+ "canonical_display_name": "MATH Level 5 / Exact Match",
2177
+ "evaluation_name": "MATH Level 5",
2178
+ "score": 0.0181,
2179
+ "metric": "Exact Match on MATH Level 5",
2180
+ "lower_is_better": false
2181
+ }
2182
+ ]
2183
+ },
2184
+ {
2185
+ "model_family_id": "01-ai/yi-6b-chat",
2186
+ "model_route_id": "01-ai__yi-6b-chat",
2187
+ "model_family_name": "Yi-6B-Chat",
2188
+ "developer": "01-ai",
2189
+ "params_billions": 6.061,
2190
+ "total_evaluations": 1,
2191
+ "benchmark_count": 1,
2192
+ "benchmark_family_count": 1,
2193
+ "categories_covered": [
2194
+ "general",
2195
+ "knowledge",
2196
+ "reasoning"
2197
+ ],
2198
+ "last_updated": "2026-03-19T16:08:18.240187Z",
2199
+ "variants": [
2200
+ {
2201
+ "variant_key": "default",
2202
+ "variant_label": "Default",
2203
+ "evaluation_count": 1,
2204
+ "raw_model_ids": [
2205
+ "01-ai/Yi-6B-Chat"
2206
+ ],
2207
+ "last_updated": "2026-03-19T16:08:18.240187Z"
2208
+ }
2209
+ ],
2210
+ "score_summary": {
2211
+ "count": 6,
2212
+ "min": 0.0136,
2213
+ "max": 0.4133,
2214
+ "average": 0.2893
2215
+ },
2216
+ "reproducibility_summary": {
2217
+ "results_total": 6,
2218
+ "has_reproducibility_gap_count": 6,
2219
+ "populated_ratio_avg": 0.0
2220
+ },
2221
+ "provenance_summary": {
2222
+ "total_results": 6,
2223
+ "total_groups": 6,
2224
+ "multi_source_groups": 0,
2225
+ "first_party_only_groups": 0,
2226
+ "source_type_distribution": {
2227
+ "first_party": 0,
2228
+ "third_party": 6,
2229
+ "collaborative": 0,
2230
+ "unspecified": 0
2231
+ }
2232
+ },
2233
+ "comparability_summary": {
2234
+ "total_groups": 6,
2235
+ "groups_with_variant_check": 0,
2236
+ "groups_with_cross_party_check": 0,
2237
+ "variant_divergent_count": 0,
2238
+ "cross_party_divergent_count": 0
2239
+ },
2240
+ "benchmark_names": [
2241
+ "BBH",
2242
+ "GPQA",
2243
+ "IFEval",
2244
+ "MATH Level 5",
2245
+ "MMLU-PRO",
2246
+ "MUSR"
2247
+ ],
2248
+ "top_benchmark_scores": [
2249
+ {
2250
+ "benchmark": "BBH",
2251
+ "benchmarkKey": "hfopenllm_v2_bbh",
2252
+ "canonical_display_name": "BBH / Accuracy",
2253
+ "evaluation_name": "BBH",
2254
+ "score": 0.4133,
2255
+ "metric": "Accuracy on BBH",
2256
+ "lower_is_better": false
2257
+ },
2258
+ {
2259
+ "benchmark": "MUSR",
2260
+ "benchmarkKey": "hfopenllm_v2_musr",
2261
+ "canonical_display_name": "MUSR / Accuracy",
2262
+ "evaluation_name": "MUSR",
2263
+ "score": 0.3688,
2264
+ "metric": "Accuracy on MUSR",
2265
+ "lower_is_better": false
2266
+ },
2267
+ {
2268
+ "benchmark": "IFEval",
2269
+ "benchmarkKey": "hfopenllm_v2_ifeval",
2270
+ "canonical_display_name": "IFEval / Accuracy",
2271
+ "evaluation_name": "IFEval",
2272
+ "score": 0.3395,
2273
+ "metric": "Accuracy on IFEval",
2274
+ "lower_is_better": false
2275
+ },
2276
+ {
2277
+ "benchmark": "MMLU-PRO",
2278
+ "benchmarkKey": "hfopenllm_v2_mmlu_pro",
2279
+ "canonical_display_name": "MMLU-PRO / Accuracy",
2280
+ "evaluation_name": "MMLU-PRO",
2281
+ "score": 0.3061,
2282
+ "metric": "Accuracy on MMLU-PRO",
2283
+ "lower_is_better": false
2284
+ },
2285
+ {
2286
+ "benchmark": "GPQA",
2287
+ "benchmarkKey": "hfopenllm_v2_gpqa",
2288
+ "canonical_display_name": "GPQA / Accuracy",
2289
+ "evaluation_name": "GPQA",
2290
+ "score": 0.2945,
2291
+ "metric": "Accuracy on GPQA",
2292
+ "lower_is_better": false
2293
+ },
2294
+ {
2295
+ "benchmark": "MATH Level 5",
2296
+ "benchmarkKey": "hfopenllm_v2_math_level_5",
2297
+ "canonical_display_name": "MATH Level 5 / Exact Match",
2298
+ "evaluation_name": "MATH Level 5",
2299
+ "score": 0.0136,
2300
+ "metric": "Exact Match on MATH Level 5",
2301
+ "lower_is_better": false
2302
+ }
2303
+ ]
2304
+ },
2305
+ {
2306
+ "model_family_id": "01-ai/yi-9b",
2307
+ "model_route_id": "01-ai__yi-9b",
2308
+ "model_family_name": "Yi-9B",
2309
+ "developer": "01-ai",
2310
+ "params_billions": 8.829,
2311
+ "total_evaluations": 1,
2312
+ "benchmark_count": 1,
2313
+ "benchmark_family_count": 1,
2314
+ "categories_covered": [
2315
+ "general",
2316
+ "knowledge",
2317
+ "reasoning"
2318
+ ],
2319
+ "last_updated": "2026-03-19T16:08:18.240187Z",
2320
+ "variants": [
2321
+ {
2322
+ "variant_key": "default",
2323
+ "variant_label": "Default",
2324
+ "evaluation_count": 1,
2325
+ "raw_model_ids": [
2326
+ "01-ai/Yi-9B"
2327
+ ],
2328
+ "last_updated": "2026-03-19T16:08:18.240187Z"
2329
+ }
2330
+ ],
2331
+ "score_summary": {
2332
+ "count": 6,
2333
+ "min": 0.0559,
2334
+ "max": 0.494,
2335
+ "average": 0.3169333333333333
2336
+ },
2337
+ "reproducibility_summary": {
2338
+ "results_total": 6,
2339
+ "has_reproducibility_gap_count": 6,
2340
+ "populated_ratio_avg": 0.0
2341
+ },
2342
+ "provenance_summary": {
2343
+ "total_results": 6,
2344
+ "total_groups": 6,
2345
+ "multi_source_groups": 0,
2346
+ "first_party_only_groups": 0,
2347
+ "source_type_distribution": {
2348
+ "first_party": 0,
2349
+ "third_party": 6,
2350
+ "collaborative": 0,
2351
+ "unspecified": 0
2352
+ }
2353
+ },
2354
+ "comparability_summary": {
2355
+ "total_groups": 6,
2356
+ "groups_with_variant_check": 0,
2357
+ "groups_with_cross_party_check": 0,
2358
+ "variant_divergent_count": 0,
2359
+ "cross_party_divergent_count": 0
2360
+ },
2361
+ "benchmark_names": [
2362
+ "BBH",
2363
+ "GPQA",
2364
+ "IFEval",
2365
+ "MATH Level 5",
2366
+ "MMLU-PRO",
2367
+ "MUSR"
2368
+ ],
2369
+ "top_benchmark_scores": [
2370
+ {
2371
+ "benchmark": "BBH",
2372
+ "benchmarkKey": "hfopenllm_v2_bbh",
2373
+ "canonical_display_name": "BBH / Accuracy",
2374
+ "evaluation_name": "BBH",
2375
+ "score": 0.494,
2376
+ "metric": "Accuracy on BBH",
2377
+ "lower_is_better": false
2378
+ },
2379
+ {
2380
+ "benchmark": "MUSR",
2381
+ "benchmarkKey": "hfopenllm_v2_musr",
2382
+ "canonical_display_name": "MUSR / Accuracy",
2383
+ "evaluation_name": "MUSR",
2384
+ "score": 0.4054,
2385
+ "metric": "Accuracy on MUSR",
2386
+ "lower_is_better": false
2387
+ },
2388
+ {
2389
+ "benchmark": "MMLU-PRO",
2390
+ "benchmarkKey": "hfopenllm_v2_mmlu_pro",
2391
+ "canonical_display_name": "MMLU-PRO / Accuracy",
2392
+ "evaluation_name": "MMLU-PRO",
2393
+ "score": 0.3574,
2394
+ "metric": "Accuracy on MMLU-PRO",
2395
+ "lower_is_better": false
2396
+ },
2397
+ {
2398
+ "benchmark": "GPQA",
2399
+ "benchmarkKey": "hfopenllm_v2_gpqa",
2400
+ "canonical_display_name": "GPQA / Accuracy",
2401
+ "evaluation_name": "GPQA",
2402
+ "score": 0.318,
2403
+ "metric": "Accuracy on GPQA",
2404
+ "lower_is_better": false
2405
+ },
2406
+ {
2407
+ "benchmark": "IFEval",
2408
+ "benchmarkKey": "hfopenllm_v2_ifeval",
2409
+ "canonical_display_name": "IFEval / Accuracy",
2410
+ "evaluation_name": "IFEval",
2411
+ "score": 0.2709,
2412
+ "metric": "Accuracy on IFEval",
2413
+ "lower_is_better": false
2414
+ },
2415
+ {
2416
+ "benchmark": "MATH Level 5",
2417
+ "benchmarkKey": "hfopenllm_v2_math_level_5",
2418
+ "canonical_display_name": "MATH Level 5 / Exact Match",
2419
+ "evaluation_name": "MATH Level 5",
2420
+ "score": 0.0559,
2421
+ "metric": "Exact Match on MATH Level 5",
2422
+ "lower_is_better": false
2423
+ }
2424
+ ]
2425
+ },
2426
+ {
2427
+ "model_family_id": "01-ai/yi-9b-200k",
2428
+ "model_route_id": "01-ai__yi-9b-200k",
2429
+ "model_family_name": "Yi-9B-200K",
2430
+ "developer": "01-ai",
2431
+ "params_billions": 8.829,
2432
+ "total_evaluations": 1,
2433
+ "benchmark_count": 1,
2434
+ "benchmark_family_count": 1,
2435
+ "categories_covered": [
2436
+ "general",
2437
+ "knowledge",
2438
+ "reasoning"
2439
+ ],
2440
+ "last_updated": "2026-03-19T16:08:18.240187Z",
2441
+ "variants": [
2442
+ {
2443
+ "variant_key": "default",
2444
+ "variant_label": "Default",
2445
+ "evaluation_count": 1,
2446
+ "raw_model_ids": [
2447
+ "01-ai/Yi-9B-200K"
2448
+ ],
2449
+ "last_updated": "2026-03-19T16:08:18.240187Z"
2450
+ }
2451
+ ],
2452
+ "score_summary": {
2453
+ "count": 6,
2454
+ "min": 0.0665,
2455
+ "max": 0.4793,
2456
+ "average": 0.31425000000000003
2457
+ },
2458
+ "reproducibility_summary": {
2459
+ "results_total": 6,
2460
+ "has_reproducibility_gap_count": 6,
2461
+ "populated_ratio_avg": 0.0
2462
+ },
2463
+ "provenance_summary": {
2464
+ "total_results": 6,
2465
+ "total_groups": 6,
2466
+ "multi_source_groups": 0,
2467
+ "first_party_only_groups": 0,
2468
+ "source_type_distribution": {
2469
+ "first_party": 0,
2470
+ "third_party": 6,
2471
+ "collaborative": 0,
2472
+ "unspecified": 0
2473
+ }
2474
+ },
2475
+ "comparability_summary": {
2476
+ "total_groups": 6,
2477
+ "groups_with_variant_check": 0,
2478
+ "groups_with_cross_party_check": 0,
2479
+ "variant_divergent_count": 0,
2480
+ "cross_party_divergent_count": 0
2481
+ },
2482
+ "benchmark_names": [
2483
+ "BBH",
2484
+ "GPQA",
2485
+ "IFEval",
2486
+ "MATH Level 5",
2487
+ "MMLU-PRO",
2488
+ "MUSR"
2489
+ ],
2490
+ "top_benchmark_scores": [
2491
+ {
2492
+ "benchmark": "BBH",
2493
+ "benchmarkKey": "hfopenllm_v2_bbh",
2494
+ "canonical_display_name": "BBH / Accuracy",
2495
+ "evaluation_name": "BBH",
2496
+ "score": 0.4793,
2497
+ "metric": "Accuracy on BBH",
2498
+ "lower_is_better": false
2499
+ },
2500
+ {
2501
+ "benchmark": "MUSR",
2502
+ "benchmarkKey": "hfopenllm_v2_musr",
2503
+ "canonical_display_name": "MUSR / Accuracy",
2504
+ "evaluation_name": "MUSR",
2505
+ "score": 0.4294,
2506
+ "metric": "Accuracy on MUSR",
2507
+ "lower_is_better": false
2508
+ },
2509
+ {
2510
+ "benchmark": "MMLU-PRO",
2511
+ "benchmarkKey": "hfopenllm_v2_mmlu_pro",
2512
+ "canonical_display_name": "MMLU-PRO / Accuracy",
2513
+ "evaluation_name": "MMLU-PRO",
2514
+ "score": 0.3622,
2515
+ "metric": "Accuracy on MMLU-PRO",
2516
+ "lower_is_better": false
2517
+ },
2518
+ {
2519
+ "benchmark": "GPQA",
2520
+ "benchmarkKey": "hfopenllm_v2_gpqa",
2521
+ "canonical_display_name": "GPQA / Accuracy",
2522
+ "evaluation_name": "GPQA",
2523
+ "score": 0.3154,
2524
+ "metric": "Accuracy on GPQA",
2525
+ "lower_is_better": false
2526
+ },
2527
+ {
2528
+ "benchmark": "IFEval",
2529
+ "benchmarkKey": "hfopenllm_v2_ifeval",
2530
+ "canonical_display_name": "IFEval / Accuracy",
2531
+ "evaluation_name": "IFEval",
2532
+ "score": 0.2327,
2533
+ "metric": "Accuracy on IFEval",
2534
+ "lower_is_better": false
2535
+ },
2536
+ {
2537
+ "benchmark": "MATH Level 5",
2538
+ "benchmarkKey": "hfopenllm_v2_math_level_5",
2539
+ "canonical_display_name": "MATH Level 5 / Exact Match",
2540
+ "evaluation_name": "MATH Level 5",
2541
+ "score": 0.0665,
2542
+ "metric": "Exact Match on MATH Level 5",
2543
+ "lower_is_better": false
2544
+ }
2545
+ ]
2546
+ },
2547
+ {
2548
+ "model_family_id": "01-ai/yi-coder-9b-chat",
2549
+ "model_route_id": "01-ai__yi-coder-9b-chat",
2550
+ "model_family_name": "Yi-Coder-9B-Chat",
2551
+ "developer": "01-ai",
2552
+ "params_billions": 8.829,
2553
+ "total_evaluations": 1,
2554
+ "benchmark_count": 1,
2555
+ "benchmark_family_count": 1,
2556
+ "categories_covered": [
2557
+ "general",
2558
+ "knowledge",
2559
+ "reasoning"
2560
+ ],
2561
+ "last_updated": "2026-03-19T16:08:18.240187Z",
2562
+ "variants": [
2563
+ {
2564
+ "variant_key": "default",
2565
+ "variant_label": "Default",
2566
+ "evaluation_count": 1,
2567
+ "raw_model_ids": [
2568
+ "01-ai/Yi-Coder-9B-Chat"
2569
+ ],
2570
+ "last_updated": "2026-03-19T16:08:18.240187Z"
2571
+ }
2572
+ ],
2573
+ "score_summary": {
2574
+ "count": 6,
2575
+ "min": 0.04,
2576
+ "max": 0.4817,
2577
+ "average": 0.31538333333333335
2578
+ },
2579
+ "reproducibility_summary": {
2580
+ "results_total": 6,
2581
+ "has_reproducibility_gap_count": 6,
2582
+ "populated_ratio_avg": 0.0
2583
+ },
2584
+ "provenance_summary": {
2585
+ "total_results": 6,
2586
+ "total_groups": 6,
2587
+ "multi_source_groups": 0,
2588
+ "first_party_only_groups": 0,
2589
+ "source_type_distribution": {
2590
+ "first_party": 0,
2591
+ "third_party": 6,
2592
+ "collaborative": 0,
2593
+ "unspecified": 0
2594
+ }
2595
+ },
2596
+ "comparability_summary": {
2597
+ "total_groups": 6,
2598
+ "groups_with_variant_check": 0,
2599
+ "groups_with_cross_party_check": 0,
2600
+ "variant_divergent_count": 0,
2601
+ "cross_party_divergent_count": 0
2602
+ },
2603
+ "benchmark_names": [
2604
+ "BBH",
2605
+ "GPQA",
2606
+ "IFEval",
2607
+ "MATH Level 5",
2608
+ "MMLU-PRO",
2609
+ "MUSR"
2610
+ ],
2611
+ "top_benchmark_scores": [
2612
+ {
2613
+ "benchmark": "IFEval",
2614
+ "benchmarkKey": "hfopenllm_v2_ifeval",
2615
+ "canonical_display_name": "IFEval / Accuracy",
2616
+ "evaluation_name": "IFEval",
2617
+ "score": 0.4817,
2618
+ "metric": "Accuracy on IFEval",
2619
+ "lower_is_better": false
2620
+ },
2621
+ {
2622
+ "benchmark": "BBH",
2623
+ "benchmarkKey": "hfopenllm_v2_bbh",
2624
+ "canonical_display_name": "BBH / Accuracy",
2625
+ "evaluation_name": "BBH",
2626
+ "score": 0.4814,
2627
+ "metric": "Accuracy on BBH",
2628
+ "lower_is_better": false
2629
+ },
2630
+ {
2631
+ "benchmark": "MUSR",
2632
+ "benchmarkKey": "hfopenllm_v2_musr",
2633
+ "canonical_display_name": "MUSR / Accuracy",
2634
+ "evaluation_name": "MUSR",
2635
+ "score": 0.3992,
2636
+ "metric": "Accuracy on MUSR",
2637
+ "lower_is_better": false
2638
+ },
2639
+ {
2640
+ "benchmark": "GPQA",
2641
+ "benchmarkKey": "hfopenllm_v2_gpqa",
2642
+ "canonical_display_name": "GPQA / Accuracy",
2643
+ "evaluation_name": "GPQA",
2644
+ "score": 0.2475,
2645
+ "metric": "Accuracy on GPQA",
2646
+ "lower_is_better": false
2647
+ },
2648
+ {
2649
+ "benchmark": "MMLU-PRO",
2650
+ "benchmarkKey": "hfopenllm_v2_mmlu_pro",
2651
+ "canonical_display_name": "MMLU-PRO / Accuracy",
2652
+ "evaluation_name": "MMLU-PRO",
2653
+ "score": 0.2425,
2654
+ "metric": "Accuracy on MMLU-PRO",
2655
+ "lower_is_better": false
2656
+ },
2657
+ {
2658
+ "benchmark": "MATH Level 5",
2659
+ "benchmarkKey": "hfopenllm_v2_math_level_5",
2660
+ "canonical_display_name": "MATH Level 5 / Exact Match",
2661
+ "evaluation_name": "MATH Level 5",
2662
+ "score": 0.04,
2663
+ "metric": "Exact Match on MATH Level 5",
2664
+ "lower_is_better": false
2665
+ }
2666
+ ]
2667
+ }
2668
+ ]
2669
+ }
tests/fixtures/developers/anthropic.json ADDED
The diff for this file is too large to render. See raw diff
 
tests/fixtures/developers/openai.json ADDED
The diff for this file is too large to render. See raw diff
 
tests/fixtures/evals/apex_v1.json ADDED
@@ -0,0 +1,1929 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "eval_summary_id": "apex_v1",
3
+ "benchmark": "APEX v1",
4
+ "benchmark_family_key": "apex_v1",
5
+ "benchmark_family_name": "APEX v1",
6
+ "benchmark_parent_key": "apex_v1",
7
+ "benchmark_parent_name": "APEX v1",
8
+ "benchmark_leaf_key": "apex_v1",
9
+ "benchmark_leaf_name": "APEX v1",
10
+ "benchmark_component_key": "medicine_md",
11
+ "benchmark_component_name": "Medicine (MD)",
12
+ "evaluation_name": "APEX v1",
13
+ "display_name": "APEX v1",
14
+ "canonical_display_name": "APEX v1",
15
+ "is_summary_score": false,
16
+ "category": "knowledge",
17
+ "source_data": {
18
+ "dataset_name": "apex-v1",
19
+ "source_type": "hf_dataset",
20
+ "hf_repo": "Mercor/APEX-v1"
21
+ },
22
+ "benchmark_card": {
23
+ "benchmark_details": {
24
+ "name": "APEX-v1",
25
+ "overview": "APEX-Agents (AI Productivity Index for Agents) measures the ability of AI agents to execute long-horizon, cross-application tasks created by investment banking analysts, management consultants, and corporate lawyers. The benchmark contains 480 tasks and requires agents to navigate realistic work environments with files and tools.",
26
+ "data_type": "text",
27
+ "domains": [
28
+ "investment banking",
29
+ "management consulting",
30
+ "corporate law",
31
+ "finance",
32
+ "legal",
33
+ "consulting"
34
+ ],
35
+ "languages": [
36
+ "English"
37
+ ],
38
+ "similar_benchmarks": [
39
+ "Not specified"
40
+ ],
41
+ "resources": [
42
+ "https://arxiv.org/abs/2601.14242",
43
+ "https://huggingface.co/datasets/Mercor/APEX-v1"
44
+ ],
45
+ "benchmark_type": "single"
46
+ },
47
+ "purpose_and_intended_users": {
48
+ "goal": "To assess whether AI agents can reliably execute highly complex professional services work, bridging the gap between existing agentic evaluations and real-world professional workflows.",
49
+ "audience": [
50
+ "AI researchers",
51
+ "Developers working on agentic systems"
52
+ ],
53
+ "tasks": [
54
+ "Text generation",
55
+ "Question answering",
56
+ "Reasoning",
57
+ "Demonstrating advanced knowledge",
58
+ "Using multiple applications",
59
+ "Planning over long horizons within realistic project scenarios"
60
+ ],
61
+ "limitations": "Differences in benchmark scores below 1 percentage point should be interpreted cautiously due to a small error rate in the automated grading system (1.9% false negative rate and 1.3% false positive rate).",
62
+ "out_of_scope_uses": [
63
+ "Not specified"
64
+ ]
65
+ },
66
+ "data": {
67
+ "source": "The benchmark data was created by industry professionals including investment banking analysts, management consultants, and corporate lawyers. These professionals were organized into teams, assigned specific roles, and tasked with delivering complete projects over 5-10 day periods, producing high-quality customer-ready deliverables from scratch.",
68
+ "size": "480 tasks",
69
+ "format": "The specific structure of individual data instances is not described",
70
+ "annotation": "Tasks were created by professionals using files from within each project environment. A baselining study was conducted where independent experts executed 20% of tasks (96 tasks) to verify task feasibility, rubric fairness, and time estimates."
71
+ },
72
+ "methodology": {
73
+ "methods": [
74
+ "Models are evaluated using agent execution in realistic environments",
75
+ "Eight trajectories are collected for each agent-task pair, with each trajectory scored as pass or fail"
76
+ ],
77
+ "metrics": [
78
+ "Pass@1 (task-uniform mean of per-task pass rates)",
79
+ "Pass@8 (passing at least once in eight attempts)",
80
+ "Pass^8 (passing consistently on all eight attempts)"
81
+ ],
82
+ "calculation": "The overall Pass@1 score is computed as the task-uniform mean of per-task pass rates across all 480 tasks. Confidence intervals are calculated using task-level bootstrapping with 10,000 resamples",
83
+ "interpretation": "Higher Pass@1 scores indicate better performance",
84
+ "baseline_results": "Gemini 3 Flash (Thinking=High): 24.0%, GPT-5.2 (Thinking=High): 23.0%, Claude Opus 4.5 (Thinking=High): [score not specified], Gemini 3 Pro (Thinking=High): [score not specified], GPT-OSS-120B (High): 15.2%, Grok 4: 0%",
85
+ "validation": "Automated evaluation used a judge model with 98.5% accuracy against human-labeled ground truth. A baselining study with experts validated task feasibility and rubric fairness"
86
+ },
87
+ "ethical_and_legal_considerations": {
88
+ "privacy_and_anonymity": "Not specified",
89
+ "data_licensing": "Creative Commons Attribution 4.0",
90
+ "consent_procedures": "Not specified",
91
+ "compliance_with_regulations": "Not specified"
92
+ },
93
+ "possible_risks": [
94
+ {
95
+ "category": "Over- or under-reliance",
96
+ "description": [
97
+ "In AI-assisted decision-making tasks, reliance measures how much a person trusts (and potentially acts on) a model's output. Over-reliance occurs when a person puts too much trust in a model, accepting a model's output when the model's output is likely incorrect. Under-reliance is the opposite, where the person doesn't trust the model but should."
98
+ ],
99
+ "url": "https://www.ibm.com/docs/en/watsonx/saas?topic=SSYOK8/wsj/ai-risk-atlas/over-or-under-reliance.html"
100
+ },
101
+ {
102
+ "category": "Unrepresentative data",
103
+ "description": [
104
+ "Unrepresentative data occurs when the training or fine-tuning data is not sufficiently representative of the underlying population or does not measure the phenomenon of interest. Synthetic data might not fully capture the complexity and nuances of real-world data. Causes include possible limitations in the seed data quality, biases in generation methods, or inadequate domain knowledge. Thus, AI models might struggle to generalize effectively to real-world scenarios."
105
+ ],
106
+ "url": "https://www.ibm.com/docs/en/watsonx/saas?topic=SSYOK8/wsj/ai-risk-atlas/unrepresentative-data.html"
107
+ },
108
+ {
109
+ "category": "Incomplete AI agent evaluation",
110
+ "description": [
111
+ "Evaluating the performance or accuracy or an agent is difficult because of system complexity and open-endedness."
112
+ ],
113
+ "url": "https://www.ibm.com/docs/en/watsonx/saas?topic=SSYOK8/wsj/ai-risk-atlas/incomplete-ai-agent-evaluation-agentic.html"
114
+ },
115
+ {
116
+ "category": "Reproducibility",
117
+ "description": [
118
+ "Replicating agent behavior or output can be impacted by changes or updates made to external services and tools. This impact is increased if the agent is built with generative AI."
119
+ ],
120
+ "url": "https://www.ibm.com/docs/en/watsonx/saas?topic=SSYOK8/wsj/ai-risk-atlas/reproducibility-agentic.html"
121
+ },
122
+ {
123
+ "category": "Improper usage",
124
+ "description": [
125
+ "Improper usage occurs when a model is used for a purpose that it was not originally designed for."
126
+ ],
127
+ "url": "https://www.ibm.com/docs/en/watsonx/saas?topic=SSYOK8/wsj/ai-risk-atlas/improper-usage.html"
128
+ }
129
+ ],
130
+ "flagged_fields": {},
131
+ "missing_fields": [
132
+ "benchmark_details.similar_benchmarks",
133
+ "purpose_and_intended_users.out_of_scope_uses",
134
+ "ethical_and_legal_considerations.privacy_and_anonymity",
135
+ "ethical_and_legal_considerations.consent_procedures",
136
+ "ethical_and_legal_considerations.compliance_with_regulations"
137
+ ],
138
+ "card_info": {
139
+ "created_at": "2026-04-14T14:28:12.501639",
140
+ "llm": "deepseek-ai/DeepSeek-V3.1"
141
+ }
142
+ },
143
+ "tags": {
144
+ "domains": [
145
+ "investment banking",
146
+ "management consulting",
147
+ "corporate law",
148
+ "finance",
149
+ "legal",
150
+ "consulting"
151
+ ],
152
+ "languages": [
153
+ "English"
154
+ ],
155
+ "tasks": [
156
+ "Text generation",
157
+ "Question answering",
158
+ "Reasoning",
159
+ "Demonstrating advanced knowledge",
160
+ "Using multiple applications",
161
+ "Planning over long horizons within realistic project scenarios"
162
+ ]
163
+ },
164
+ "subtasks": [
165
+ {
166
+ "subtask_key": "big_law",
167
+ "subtask_name": "Big Law",
168
+ "display_name": "Big Law",
169
+ "metrics": [
170
+ {
171
+ "metric_summary_id": "apex_v1_big_law_score",
172
+ "legacy_eval_summary_id": "apex_v1_big_law",
173
+ "evaluation_name": "Big Law",
174
+ "display_name": "APEX v1 / Big Law / Score",
175
+ "canonical_display_name": "APEX v1 / Big Law / Score",
176
+ "benchmark_leaf_key": "apex_v1",
177
+ "benchmark_leaf_name": "APEX v1",
178
+ "slice_key": "big_law",
179
+ "slice_name": "Big Law",
180
+ "lower_is_better": false,
181
+ "metric_name": "Score",
182
+ "metric_id": "apex_v1.score",
183
+ "metric_key": "score",
184
+ "metric_source": "metric_config",
185
+ "metric_config": {
186
+ "evaluation_description": "Big law associate score.",
187
+ "lower_is_better": false,
188
+ "score_type": "continuous",
189
+ "min_score": 0,
190
+ "max_score": 1,
191
+ "additional_details": {
192
+ "raw_evaluation_name": "Big Law Score"
193
+ },
194
+ "metric_id": "apex_v1.score",
195
+ "metric_name": "Score",
196
+ "metric_kind": "score",
197
+ "metric_unit": "proportion"
198
+ },
199
+ "model_results": [
200
+ {
201
+ "model_id": "openai/gpt-5",
202
+ "model_route_id": "openai__gpt-5",
203
+ "model_name": "GPT 5",
204
+ "developer": "openai",
205
+ "variant_key": "default",
206
+ "raw_model_id": "openai/GPT 5",
207
+ "score": 0.78,
208
+ "evaluation_id": "apex-v1/openai_gpt-5/1773260200",
209
+ "retrieved_timestamp": "1773260200",
210
+ "source_metadata": {
211
+ "source_name": "Mercor APEX-v1 Leaderboard",
212
+ "source_type": "evaluation_run",
213
+ "source_organization_name": "Mercor",
214
+ "source_organization_url": "https://www.mercor.com",
215
+ "evaluator_relationship": "first_party"
216
+ },
217
+ "source_data": {
218
+ "dataset_name": "apex-v1",
219
+ "source_type": "hf_dataset",
220
+ "hf_repo": "Mercor/APEX-v1"
221
+ },
222
+ "source_record_url": "https://huggingface.co/datasets/evaleval/card_backend/resolve/main/records/openai__gpt-5/apex_v1_openai_gpt_5_1773260200.json",
223
+ "detailed_evaluation_results": null,
224
+ "detailed_evaluation_results_meta": null,
225
+ "passthrough_top_level_fields": null,
226
+ "instance_level_data": null,
227
+ "normalized_result": {
228
+ "benchmark_family_key": "apex_v1",
229
+ "benchmark_family_name": "APEX v1",
230
+ "benchmark_parent_key": "apex_v1",
231
+ "benchmark_parent_name": "APEX v1",
232
+ "benchmark_component_key": "big_law",
233
+ "benchmark_component_name": "Big Law",
234
+ "benchmark_leaf_key": "apex_v1",
235
+ "benchmark_leaf_name": "APEX v1",
236
+ "slice_key": "big_law",
237
+ "slice_name": "Big Law",
238
+ "metric_name": "Score",
239
+ "metric_id": "apex_v1.score",
240
+ "metric_key": "score",
241
+ "metric_source": "metric_config",
242
+ "display_name": "Big Law / Score",
243
+ "canonical_display_name": "APEX v1 / Big Law / Score",
244
+ "raw_evaluation_name": "Big Law",
245
+ "is_summary_score": false
246
+ },
247
+ "evalcards": {
248
+ "annotations": {
249
+ "reproducibility_gap": {
250
+ "has_reproducibility_gap": true,
251
+ "missing_fields": [
252
+ "temperature",
253
+ "max_tokens"
254
+ ],
255
+ "required_field_count": 2,
256
+ "populated_field_count": 0,
257
+ "signal_version": "1.0"
258
+ },
259
+ "provenance": {
260
+ "source_type": "first_party",
261
+ "is_multi_source": false,
262
+ "first_party_only": true,
263
+ "distinct_reporting_organizations": 1,
264
+ "signal_version": "1.0"
265
+ },
266
+ "variant_divergence": null,
267
+ "cross_party_divergence": null
268
+ }
269
+ }
270
+ },
271
+ {
272
+ "model_id": "openai/gpt-5-1",
273
+ "model_route_id": "openai__gpt-5-1",
274
+ "model_name": "GPT 5.1",
275
+ "developer": "openai",
276
+ "variant_key": "default",
277
+ "raw_model_id": "openai/GPT 5.1",
278
+ "score": 0.77,
279
+ "evaluation_id": "apex-v1/openai_gpt-5.1/1773260200",
280
+ "retrieved_timestamp": "1773260200",
281
+ "source_metadata": {
282
+ "source_name": "Mercor APEX-v1 Leaderboard",
283
+ "source_type": "evaluation_run",
284
+ "source_organization_name": "Mercor",
285
+ "source_organization_url": "https://www.mercor.com",
286
+ "evaluator_relationship": "first_party"
287
+ },
288
+ "source_data": {
289
+ "dataset_name": "apex-v1",
290
+ "source_type": "hf_dataset",
291
+ "hf_repo": "Mercor/APEX-v1"
292
+ },
293
+ "source_record_url": "https://huggingface.co/datasets/evaleval/card_backend/resolve/main/records/openai__gpt-5-1/apex_v1_openai_gpt_5_1_1773260200.json",
294
+ "detailed_evaluation_results": null,
295
+ "detailed_evaluation_results_meta": null,
296
+ "passthrough_top_level_fields": null,
297
+ "instance_level_data": null,
298
+ "normalized_result": {
299
+ "benchmark_family_key": "apex_v1",
300
+ "benchmark_family_name": "APEX v1",
301
+ "benchmark_parent_key": "apex_v1",
302
+ "benchmark_parent_name": "APEX v1",
303
+ "benchmark_component_key": "big_law",
304
+ "benchmark_component_name": "Big Law",
305
+ "benchmark_leaf_key": "apex_v1",
306
+ "benchmark_leaf_name": "APEX v1",
307
+ "slice_key": "big_law",
308
+ "slice_name": "Big Law",
309
+ "metric_name": "Score",
310
+ "metric_id": "apex_v1.score",
311
+ "metric_key": "score",
312
+ "metric_source": "metric_config",
313
+ "display_name": "Big Law / Score",
314
+ "canonical_display_name": "APEX v1 / Big Law / Score",
315
+ "raw_evaluation_name": "Big Law",
316
+ "is_summary_score": false
317
+ },
318
+ "evalcards": {
319
+ "annotations": {
320
+ "reproducibility_gap": {
321
+ "has_reproducibility_gap": true,
322
+ "missing_fields": [
323
+ "temperature",
324
+ "max_tokens"
325
+ ],
326
+ "required_field_count": 2,
327
+ "populated_field_count": 0,
328
+ "signal_version": "1.0"
329
+ },
330
+ "provenance": {
331
+ "source_type": "first_party",
332
+ "is_multi_source": false,
333
+ "first_party_only": true,
334
+ "distinct_reporting_organizations": 1,
335
+ "signal_version": "1.0"
336
+ },
337
+ "variant_divergence": null,
338
+ "cross_party_divergence": null
339
+ }
340
+ }
341
+ },
342
+ {
343
+ "model_id": "openai/o3",
344
+ "model_route_id": "openai__o3",
345
+ "model_name": "o3",
346
+ "developer": "openai",
347
+ "variant_key": "default",
348
+ "raw_model_id": "openai/o3",
349
+ "score": 0.76,
350
+ "evaluation_id": "apex-v1/openai_o3/1773260200",
351
+ "retrieved_timestamp": "1773260200",
352
+ "source_metadata": {
353
+ "source_name": "Mercor APEX-v1 Leaderboard",
354
+ "source_type": "evaluation_run",
355
+ "source_organization_name": "Mercor",
356
+ "source_organization_url": "https://www.mercor.com",
357
+ "evaluator_relationship": "first_party"
358
+ },
359
+ "source_data": {
360
+ "dataset_name": "apex-v1",
361
+ "source_type": "hf_dataset",
362
+ "hf_repo": "Mercor/APEX-v1"
363
+ },
364
+ "source_record_url": "https://huggingface.co/datasets/evaleval/card_backend/resolve/main/records/openai__o3/apex_v1_openai_o3_1773260200.json",
365
+ "detailed_evaluation_results": null,
366
+ "detailed_evaluation_results_meta": null,
367
+ "passthrough_top_level_fields": null,
368
+ "instance_level_data": null,
369
+ "normalized_result": {
370
+ "benchmark_family_key": "apex_v1",
371
+ "benchmark_family_name": "APEX v1",
372
+ "benchmark_parent_key": "apex_v1",
373
+ "benchmark_parent_name": "APEX v1",
374
+ "benchmark_component_key": "big_law",
375
+ "benchmark_component_name": "Big Law",
376
+ "benchmark_leaf_key": "apex_v1",
377
+ "benchmark_leaf_name": "APEX v1",
378
+ "slice_key": "big_law",
379
+ "slice_name": "Big Law",
380
+ "metric_name": "Score",
381
+ "metric_id": "apex_v1.score",
382
+ "metric_key": "score",
383
+ "metric_source": "metric_config",
384
+ "display_name": "Big Law / Score",
385
+ "canonical_display_name": "APEX v1 / Big Law / Score",
386
+ "raw_evaluation_name": "Big Law",
387
+ "is_summary_score": false
388
+ },
389
+ "evalcards": {
390
+ "annotations": {
391
+ "reproducibility_gap": {
392
+ "has_reproducibility_gap": true,
393
+ "missing_fields": [
394
+ "temperature",
395
+ "max_tokens"
396
+ ],
397
+ "required_field_count": 2,
398
+ "populated_field_count": 0,
399
+ "signal_version": "1.0"
400
+ },
401
+ "provenance": {
402
+ "source_type": "first_party",
403
+ "is_multi_source": false,
404
+ "first_party_only": true,
405
+ "distinct_reporting_organizations": 1,
406
+ "signal_version": "1.0"
407
+ },
408
+ "variant_divergence": null,
409
+ "cross_party_divergence": null
410
+ }
411
+ }
412
+ }
413
+ ],
414
+ "models_count": 3,
415
+ "top_score": 0.78
416
+ }
417
+ ],
418
+ "metrics_count": 1,
419
+ "metric_names": [
420
+ "Score"
421
+ ]
422
+ },
423
+ {
424
+ "subtask_key": "consulting",
425
+ "subtask_name": "Consulting",
426
+ "display_name": "Consulting",
427
+ "metrics": [
428
+ {
429
+ "metric_summary_id": "apex_v1_consulting_score",
430
+ "legacy_eval_summary_id": "apex_v1_consulting",
431
+ "evaluation_name": "Consulting",
432
+ "display_name": "APEX v1 / Consulting / Score",
433
+ "canonical_display_name": "APEX v1 / Consulting / Score",
434
+ "benchmark_leaf_key": "apex_v1",
435
+ "benchmark_leaf_name": "APEX v1",
436
+ "slice_key": "consulting",
437
+ "slice_name": "Consulting",
438
+ "lower_is_better": false,
439
+ "metric_name": "Score",
440
+ "metric_id": "apex_v1.score",
441
+ "metric_key": "score",
442
+ "metric_source": "metric_config",
443
+ "metric_config": {
444
+ "evaluation_description": "Management consulting score.",
445
+ "lower_is_better": false,
446
+ "score_type": "continuous",
447
+ "min_score": 0,
448
+ "max_score": 1,
449
+ "additional_details": {
450
+ "raw_evaluation_name": "Consulting Score"
451
+ },
452
+ "metric_id": "apex_v1.score",
453
+ "metric_name": "Score",
454
+ "metric_kind": "score",
455
+ "metric_unit": "proportion"
456
+ },
457
+ "model_results": [
458
+ {
459
+ "model_id": "openai/gpt-5-2-pro",
460
+ "model_route_id": "openai__gpt-5-2-pro",
461
+ "model_name": "GPT 5.2 Pro",
462
+ "developer": "openai",
463
+ "variant_key": "default",
464
+ "raw_model_id": "openai/GPT 5.2 Pro",
465
+ "score": 0.64,
466
+ "evaluation_id": "apex-v1/openai_gpt-5.2-pro/1773260200",
467
+ "retrieved_timestamp": "1773260200",
468
+ "source_metadata": {
469
+ "source_name": "Mercor APEX-v1 Leaderboard",
470
+ "source_type": "evaluation_run",
471
+ "source_organization_name": "Mercor",
472
+ "source_organization_url": "https://www.mercor.com",
473
+ "evaluator_relationship": "first_party"
474
+ },
475
+ "source_data": {
476
+ "dataset_name": "apex-v1",
477
+ "source_type": "hf_dataset",
478
+ "hf_repo": "Mercor/APEX-v1"
479
+ },
480
+ "source_record_url": "https://huggingface.co/datasets/evaleval/card_backend/resolve/main/records/openai__gpt-5-2-pro/apex_v1_openai_gpt_5_2_pro_1773260200.json",
481
+ "detailed_evaluation_results": null,
482
+ "detailed_evaluation_results_meta": null,
483
+ "passthrough_top_level_fields": null,
484
+ "instance_level_data": null,
485
+ "normalized_result": {
486
+ "benchmark_family_key": "apex_v1",
487
+ "benchmark_family_name": "APEX v1",
488
+ "benchmark_parent_key": "apex_v1",
489
+ "benchmark_parent_name": "APEX v1",
490
+ "benchmark_component_key": "consulting",
491
+ "benchmark_component_name": "Consulting",
492
+ "benchmark_leaf_key": "apex_v1",
493
+ "benchmark_leaf_name": "APEX v1",
494
+ "slice_key": "consulting",
495
+ "slice_name": "Consulting",
496
+ "metric_name": "Score",
497
+ "metric_id": "apex_v1.score",
498
+ "metric_key": "score",
499
+ "metric_source": "metric_config",
500
+ "display_name": "Consulting / Score",
501
+ "canonical_display_name": "APEX v1 / Consulting / Score",
502
+ "raw_evaluation_name": "Consulting",
503
+ "is_summary_score": false
504
+ },
505
+ "evalcards": {
506
+ "annotations": {
507
+ "reproducibility_gap": {
508
+ "has_reproducibility_gap": true,
509
+ "missing_fields": [
510
+ "temperature",
511
+ "max_tokens"
512
+ ],
513
+ "required_field_count": 2,
514
+ "populated_field_count": 0,
515
+ "signal_version": "1.0"
516
+ },
517
+ "provenance": {
518
+ "source_type": "first_party",
519
+ "is_multi_source": false,
520
+ "first_party_only": true,
521
+ "distinct_reporting_organizations": 1,
522
+ "signal_version": "1.0"
523
+ },
524
+ "variant_divergence": null,
525
+ "cross_party_divergence": null
526
+ }
527
+ }
528
+ },
529
+ {
530
+ "model_id": "google/gemini-3-pro",
531
+ "model_route_id": "google__gemini-3-pro",
532
+ "model_name": "Gemini 3 Pro",
533
+ "developer": "google",
534
+ "variant_key": "default",
535
+ "raw_model_id": "google/Gemini 3 Pro",
536
+ "score": 0.64,
537
+ "evaluation_id": "apex-v1/google_gemini-3-pro/1773260200",
538
+ "retrieved_timestamp": "1773260200",
539
+ "source_metadata": {
540
+ "source_name": "Mercor APEX-v1 Leaderboard",
541
+ "source_type": "evaluation_run",
542
+ "source_organization_name": "Mercor",
543
+ "source_organization_url": "https://www.mercor.com",
544
+ "evaluator_relationship": "first_party"
545
+ },
546
+ "source_data": {
547
+ "dataset_name": "apex-v1",
548
+ "source_type": "hf_dataset",
549
+ "hf_repo": "Mercor/APEX-v1"
550
+ },
551
+ "source_record_url": "https://huggingface.co/datasets/evaleval/card_backend/resolve/main/records/google__gemini-3-pro/apex_v1_google_gemini_3_pro_1773260200.json",
552
+ "detailed_evaluation_results": null,
553
+ "detailed_evaluation_results_meta": null,
554
+ "passthrough_top_level_fields": null,
555
+ "instance_level_data": null,
556
+ "normalized_result": {
557
+ "benchmark_family_key": "apex_v1",
558
+ "benchmark_family_name": "APEX v1",
559
+ "benchmark_parent_key": "apex_v1",
560
+ "benchmark_parent_name": "APEX v1",
561
+ "benchmark_component_key": "consulting",
562
+ "benchmark_component_name": "Consulting",
563
+ "benchmark_leaf_key": "apex_v1",
564
+ "benchmark_leaf_name": "APEX v1",
565
+ "slice_key": "consulting",
566
+ "slice_name": "Consulting",
567
+ "metric_name": "Score",
568
+ "metric_id": "apex_v1.score",
569
+ "metric_key": "score",
570
+ "metric_source": "metric_config",
571
+ "display_name": "Consulting / Score",
572
+ "canonical_display_name": "APEX v1 / Consulting / Score",
573
+ "raw_evaluation_name": "Consulting",
574
+ "is_summary_score": false
575
+ },
576
+ "evalcards": {
577
+ "annotations": {
578
+ "reproducibility_gap": {
579
+ "has_reproducibility_gap": true,
580
+ "missing_fields": [
581
+ "temperature",
582
+ "max_tokens"
583
+ ],
584
+ "required_field_count": 2,
585
+ "populated_field_count": 0,
586
+ "signal_version": "1.0"
587
+ },
588
+ "provenance": {
589
+ "source_type": "first_party",
590
+ "is_multi_source": false,
591
+ "first_party_only": true,
592
+ "distinct_reporting_organizations": 1,
593
+ "signal_version": "1.0"
594
+ },
595
+ "variant_divergence": null,
596
+ "cross_party_divergence": null
597
+ }
598
+ }
599
+ },
600
+ {
601
+ "model_id": "google/gemini-3-flash",
602
+ "model_route_id": "google__gemini-3-flash",
603
+ "model_name": "Gemini 3 Flash",
604
+ "developer": "google",
605
+ "variant_key": "default",
606
+ "raw_model_id": "google/Gemini 3 Flash",
607
+ "score": 0.64,
608
+ "evaluation_id": "apex-v1/google_gemini-3-flash/1773260200",
609
+ "retrieved_timestamp": "1773260200",
610
+ "source_metadata": {
611
+ "source_name": "Mercor APEX-v1 Leaderboard",
612
+ "source_type": "evaluation_run",
613
+ "source_organization_name": "Mercor",
614
+ "source_organization_url": "https://www.mercor.com",
615
+ "evaluator_relationship": "first_party"
616
+ },
617
+ "source_data": {
618
+ "dataset_name": "apex-v1",
619
+ "source_type": "hf_dataset",
620
+ "hf_repo": "Mercor/APEX-v1"
621
+ },
622
+ "source_record_url": "https://huggingface.co/datasets/evaleval/card_backend/resolve/main/records/google__gemini-3-flash/apex_v1_google_gemini_3_flash_1773260200.json",
623
+ "detailed_evaluation_results": null,
624
+ "detailed_evaluation_results_meta": null,
625
+ "passthrough_top_level_fields": null,
626
+ "instance_level_data": null,
627
+ "normalized_result": {
628
+ "benchmark_family_key": "apex_v1",
629
+ "benchmark_family_name": "APEX v1",
630
+ "benchmark_parent_key": "apex_v1",
631
+ "benchmark_parent_name": "APEX v1",
632
+ "benchmark_component_key": "consulting",
633
+ "benchmark_component_name": "Consulting",
634
+ "benchmark_leaf_key": "apex_v1",
635
+ "benchmark_leaf_name": "APEX v1",
636
+ "slice_key": "consulting",
637
+ "slice_name": "Consulting",
638
+ "metric_name": "Score",
639
+ "metric_id": "apex_v1.score",
640
+ "metric_key": "score",
641
+ "metric_source": "metric_config",
642
+ "display_name": "Consulting / Score",
643
+ "canonical_display_name": "APEX v1 / Consulting / Score",
644
+ "raw_evaluation_name": "Consulting",
645
+ "is_summary_score": false
646
+ },
647
+ "evalcards": {
648
+ "annotations": {
649
+ "reproducibility_gap": {
650
+ "has_reproducibility_gap": true,
651
+ "missing_fields": [
652
+ "temperature",
653
+ "max_tokens"
654
+ ],
655
+ "required_field_count": 2,
656
+ "populated_field_count": 0,
657
+ "signal_version": "1.0"
658
+ },
659
+ "provenance": {
660
+ "source_type": "first_party",
661
+ "is_multi_source": false,
662
+ "first_party_only": true,
663
+ "distinct_reporting_organizations": 1,
664
+ "signal_version": "1.0"
665
+ },
666
+ "variant_divergence": null,
667
+ "cross_party_divergence": null
668
+ }
669
+ }
670
+ }
671
+ ],
672
+ "models_count": 3,
673
+ "top_score": 0.64
674
+ }
675
+ ],
676
+ "metrics_count": 1,
677
+ "metric_names": [
678
+ "Score"
679
+ ]
680
+ },
681
+ {
682
+ "subtask_key": "investment_banking",
683
+ "subtask_name": "Investment Banking",
684
+ "display_name": "Investment Banking",
685
+ "metrics": [
686
+ {
687
+ "metric_summary_id": "apex_v1_investment_banking_score",
688
+ "legacy_eval_summary_id": "apex_v1_investment_banking",
689
+ "evaluation_name": "Investment Banking",
690
+ "display_name": "APEX v1 / Investment Banking / Score",
691
+ "canonical_display_name": "APEX v1 / Investment Banking / Score",
692
+ "benchmark_leaf_key": "apex_v1",
693
+ "benchmark_leaf_name": "APEX v1",
694
+ "slice_key": "investment_banking",
695
+ "slice_name": "Investment Banking",
696
+ "lower_is_better": false,
697
+ "metric_name": "Score",
698
+ "metric_id": "apex_v1.score",
699
+ "metric_key": "score",
700
+ "metric_source": "metric_config",
701
+ "metric_config": {
702
+ "evaluation_description": "Investment banking associate score.",
703
+ "lower_is_better": false,
704
+ "score_type": "continuous",
705
+ "min_score": 0,
706
+ "max_score": 1,
707
+ "additional_details": {
708
+ "raw_evaluation_name": "Investment Banking Score"
709
+ },
710
+ "metric_id": "apex_v1.score",
711
+ "metric_name": "Score",
712
+ "metric_kind": "score",
713
+ "metric_unit": "proportion"
714
+ },
715
+ "model_results": [
716
+ {
717
+ "model_id": "openai/gpt-5-2-pro",
718
+ "model_route_id": "openai__gpt-5-2-pro",
719
+ "model_name": "GPT 5.2 Pro",
720
+ "developer": "openai",
721
+ "variant_key": "default",
722
+ "raw_model_id": "openai/GPT 5.2 Pro",
723
+ "score": 0.64,
724
+ "evaluation_id": "apex-v1/openai_gpt-5.2-pro/1773260200",
725
+ "retrieved_timestamp": "1773260200",
726
+ "source_metadata": {
727
+ "source_name": "Mercor APEX-v1 Leaderboard",
728
+ "source_type": "evaluation_run",
729
+ "source_organization_name": "Mercor",
730
+ "source_organization_url": "https://www.mercor.com",
731
+ "evaluator_relationship": "first_party"
732
+ },
733
+ "source_data": {
734
+ "dataset_name": "apex-v1",
735
+ "source_type": "hf_dataset",
736
+ "hf_repo": "Mercor/APEX-v1"
737
+ },
738
+ "source_record_url": "https://huggingface.co/datasets/evaleval/card_backend/resolve/main/records/openai__gpt-5-2-pro/apex_v1_openai_gpt_5_2_pro_1773260200.json",
739
+ "detailed_evaluation_results": null,
740
+ "detailed_evaluation_results_meta": null,
741
+ "passthrough_top_level_fields": null,
742
+ "instance_level_data": null,
743
+ "normalized_result": {
744
+ "benchmark_family_key": "apex_v1",
745
+ "benchmark_family_name": "APEX v1",
746
+ "benchmark_parent_key": "apex_v1",
747
+ "benchmark_parent_name": "APEX v1",
748
+ "benchmark_component_key": "investment_banking",
749
+ "benchmark_component_name": "Investment Banking",
750
+ "benchmark_leaf_key": "apex_v1",
751
+ "benchmark_leaf_name": "APEX v1",
752
+ "slice_key": "investment_banking",
753
+ "slice_name": "Investment Banking",
754
+ "metric_name": "Score",
755
+ "metric_id": "apex_v1.score",
756
+ "metric_key": "score",
757
+ "metric_source": "metric_config",
758
+ "display_name": "Investment Banking / Score",
759
+ "canonical_display_name": "APEX v1 / Investment Banking / Score",
760
+ "raw_evaluation_name": "Investment Banking",
761
+ "is_summary_score": false
762
+ },
763
+ "evalcards": {
764
+ "annotations": {
765
+ "reproducibility_gap": {
766
+ "has_reproducibility_gap": true,
767
+ "missing_fields": [
768
+ "temperature",
769
+ "max_tokens"
770
+ ],
771
+ "required_field_count": 2,
772
+ "populated_field_count": 0,
773
+ "signal_version": "1.0"
774
+ },
775
+ "provenance": {
776
+ "source_type": "first_party",
777
+ "is_multi_source": false,
778
+ "first_party_only": true,
779
+ "distinct_reporting_organizations": 1,
780
+ "signal_version": "1.0"
781
+ },
782
+ "variant_divergence": null,
783
+ "cross_party_divergence": null
784
+ }
785
+ }
786
+ },
787
+ {
788
+ "model_id": "google/gemini-3-pro",
789
+ "model_route_id": "google__gemini-3-pro",
790
+ "model_name": "Gemini 3 Pro",
791
+ "developer": "google",
792
+ "variant_key": "default",
793
+ "raw_model_id": "google/Gemini 3 Pro",
794
+ "score": 0.63,
795
+ "evaluation_id": "apex-v1/google_gemini-3-pro/1773260200",
796
+ "retrieved_timestamp": "1773260200",
797
+ "source_metadata": {
798
+ "source_name": "Mercor APEX-v1 Leaderboard",
799
+ "source_type": "evaluation_run",
800
+ "source_organization_name": "Mercor",
801
+ "source_organization_url": "https://www.mercor.com",
802
+ "evaluator_relationship": "first_party"
803
+ },
804
+ "source_data": {
805
+ "dataset_name": "apex-v1",
806
+ "source_type": "hf_dataset",
807
+ "hf_repo": "Mercor/APEX-v1"
808
+ },
809
+ "source_record_url": "https://huggingface.co/datasets/evaleval/card_backend/resolve/main/records/google__gemini-3-pro/apex_v1_google_gemini_3_pro_1773260200.json",
810
+ "detailed_evaluation_results": null,
811
+ "detailed_evaluation_results_meta": null,
812
+ "passthrough_top_level_fields": null,
813
+ "instance_level_data": null,
814
+ "normalized_result": {
815
+ "benchmark_family_key": "apex_v1",
816
+ "benchmark_family_name": "APEX v1",
817
+ "benchmark_parent_key": "apex_v1",
818
+ "benchmark_parent_name": "APEX v1",
819
+ "benchmark_component_key": "investment_banking",
820
+ "benchmark_component_name": "Investment Banking",
821
+ "benchmark_leaf_key": "apex_v1",
822
+ "benchmark_leaf_name": "APEX v1",
823
+ "slice_key": "investment_banking",
824
+ "slice_name": "Investment Banking",
825
+ "metric_name": "Score",
826
+ "metric_id": "apex_v1.score",
827
+ "metric_key": "score",
828
+ "metric_source": "metric_config",
829
+ "display_name": "Investment Banking / Score",
830
+ "canonical_display_name": "APEX v1 / Investment Banking / Score",
831
+ "raw_evaluation_name": "Investment Banking",
832
+ "is_summary_score": false
833
+ },
834
+ "evalcards": {
835
+ "annotations": {
836
+ "reproducibility_gap": {
837
+ "has_reproducibility_gap": true,
838
+ "missing_fields": [
839
+ "temperature",
840
+ "max_tokens"
841
+ ],
842
+ "required_field_count": 2,
843
+ "populated_field_count": 0,
844
+ "signal_version": "1.0"
845
+ },
846
+ "provenance": {
847
+ "source_type": "first_party",
848
+ "is_multi_source": false,
849
+ "first_party_only": true,
850
+ "distinct_reporting_organizations": 1,
851
+ "signal_version": "1.0"
852
+ },
853
+ "variant_divergence": null,
854
+ "cross_party_divergence": null
855
+ }
856
+ }
857
+ },
858
+ {
859
+ "model_id": "openai/gpt-5",
860
+ "model_route_id": "openai__gpt-5",
861
+ "model_name": "GPT 5",
862
+ "developer": "openai",
863
+ "variant_key": "default",
864
+ "raw_model_id": "openai/GPT 5",
865
+ "score": 0.61,
866
+ "evaluation_id": "apex-v1/openai_gpt-5/1773260200",
867
+ "retrieved_timestamp": "1773260200",
868
+ "source_metadata": {
869
+ "source_name": "Mercor APEX-v1 Leaderboard",
870
+ "source_type": "evaluation_run",
871
+ "source_organization_name": "Mercor",
872
+ "source_organization_url": "https://www.mercor.com",
873
+ "evaluator_relationship": "first_party"
874
+ },
875
+ "source_data": {
876
+ "dataset_name": "apex-v1",
877
+ "source_type": "hf_dataset",
878
+ "hf_repo": "Mercor/APEX-v1"
879
+ },
880
+ "source_record_url": "https://huggingface.co/datasets/evaleval/card_backend/resolve/main/records/openai__gpt-5/apex_v1_openai_gpt_5_1773260200.json",
881
+ "detailed_evaluation_results": null,
882
+ "detailed_evaluation_results_meta": null,
883
+ "passthrough_top_level_fields": null,
884
+ "instance_level_data": null,
885
+ "normalized_result": {
886
+ "benchmark_family_key": "apex_v1",
887
+ "benchmark_family_name": "APEX v1",
888
+ "benchmark_parent_key": "apex_v1",
889
+ "benchmark_parent_name": "APEX v1",
890
+ "benchmark_component_key": "investment_banking",
891
+ "benchmark_component_name": "Investment Banking",
892
+ "benchmark_leaf_key": "apex_v1",
893
+ "benchmark_leaf_name": "APEX v1",
894
+ "slice_key": "investment_banking",
895
+ "slice_name": "Investment Banking",
896
+ "metric_name": "Score",
897
+ "metric_id": "apex_v1.score",
898
+ "metric_key": "score",
899
+ "metric_source": "metric_config",
900
+ "display_name": "Investment Banking / Score",
901
+ "canonical_display_name": "APEX v1 / Investment Banking / Score",
902
+ "raw_evaluation_name": "Investment Banking",
903
+ "is_summary_score": false
904
+ },
905
+ "evalcards": {
906
+ "annotations": {
907
+ "reproducibility_gap": {
908
+ "has_reproducibility_gap": true,
909
+ "missing_fields": [
910
+ "temperature",
911
+ "max_tokens"
912
+ ],
913
+ "required_field_count": 2,
914
+ "populated_field_count": 0,
915
+ "signal_version": "1.0"
916
+ },
917
+ "provenance": {
918
+ "source_type": "first_party",
919
+ "is_multi_source": false,
920
+ "first_party_only": true,
921
+ "distinct_reporting_organizations": 1,
922
+ "signal_version": "1.0"
923
+ },
924
+ "variant_divergence": null,
925
+ "cross_party_divergence": null
926
+ }
927
+ }
928
+ }
929
+ ],
930
+ "models_count": 3,
931
+ "top_score": 0.64
932
+ }
933
+ ],
934
+ "metrics_count": 1,
935
+ "metric_names": [
936
+ "Score"
937
+ ]
938
+ },
939
+ {
940
+ "subtask_key": "medicine_md",
941
+ "subtask_name": "Medicine (MD)",
942
+ "display_name": "Medicine (MD)",
943
+ "metrics": [
944
+ {
945
+ "metric_summary_id": "apex_v1_medicine_md_score",
946
+ "legacy_eval_summary_id": "apex_v1_medicine_md",
947
+ "evaluation_name": "Medicine (MD)",
948
+ "display_name": "APEX v1 / Medicine (MD) / Score",
949
+ "canonical_display_name": "APEX v1 / Medicine (MD) / Score",
950
+ "benchmark_leaf_key": "apex_v1",
951
+ "benchmark_leaf_name": "APEX v1",
952
+ "slice_key": "medicine_md",
953
+ "slice_name": "Medicine (MD)",
954
+ "lower_is_better": false,
955
+ "metric_name": "Score",
956
+ "metric_id": "apex_v1.score",
957
+ "metric_key": "score",
958
+ "metric_source": "metric_config",
959
+ "metric_config": {
960
+ "evaluation_description": "Primary care physician (MD) score.",
961
+ "lower_is_better": false,
962
+ "score_type": "continuous",
963
+ "min_score": 0,
964
+ "max_score": 1,
965
+ "additional_details": {
966
+ "raw_evaluation_name": "Medicine (MD) Score"
967
+ },
968
+ "metric_id": "apex_v1.score",
969
+ "metric_name": "Score",
970
+ "metric_kind": "score",
971
+ "metric_unit": "proportion"
972
+ },
973
+ "model_results": [
974
+ {
975
+ "model_id": "openai/gpt-5",
976
+ "model_route_id": "openai__gpt-5",
977
+ "model_name": "GPT 5",
978
+ "developer": "openai",
979
+ "variant_key": "default",
980
+ "raw_model_id": "openai/GPT 5",
981
+ "score": 0.66,
982
+ "evaluation_id": "apex-v1/openai_gpt-5/1773260200",
983
+ "retrieved_timestamp": "1773260200",
984
+ "source_metadata": {
985
+ "source_name": "Mercor APEX-v1 Leaderboard",
986
+ "source_type": "evaluation_run",
987
+ "source_organization_name": "Mercor",
988
+ "source_organization_url": "https://www.mercor.com",
989
+ "evaluator_relationship": "first_party"
990
+ },
991
+ "source_data": {
992
+ "dataset_name": "apex-v1",
993
+ "source_type": "hf_dataset",
994
+ "hf_repo": "Mercor/APEX-v1"
995
+ },
996
+ "source_record_url": "https://huggingface.co/datasets/evaleval/card_backend/resolve/main/records/openai__gpt-5/apex_v1_openai_gpt_5_1773260200.json",
997
+ "detailed_evaluation_results": null,
998
+ "detailed_evaluation_results_meta": null,
999
+ "passthrough_top_level_fields": null,
1000
+ "instance_level_data": null,
1001
+ "normalized_result": {
1002
+ "benchmark_family_key": "apex_v1",
1003
+ "benchmark_family_name": "APEX v1",
1004
+ "benchmark_parent_key": "apex_v1",
1005
+ "benchmark_parent_name": "APEX v1",
1006
+ "benchmark_component_key": "medicine_md",
1007
+ "benchmark_component_name": "Medicine (MD)",
1008
+ "benchmark_leaf_key": "apex_v1",
1009
+ "benchmark_leaf_name": "APEX v1",
1010
+ "slice_key": "medicine_md",
1011
+ "slice_name": "Medicine (MD)",
1012
+ "metric_name": "Score",
1013
+ "metric_id": "apex_v1.score",
1014
+ "metric_key": "score",
1015
+ "metric_source": "metric_config",
1016
+ "display_name": "Medicine (MD) / Score",
1017
+ "canonical_display_name": "APEX v1 / Medicine (MD) / Score",
1018
+ "raw_evaluation_name": "Medicine (MD)",
1019
+ "is_summary_score": false
1020
+ },
1021
+ "evalcards": {
1022
+ "annotations": {
1023
+ "reproducibility_gap": {
1024
+ "has_reproducibility_gap": true,
1025
+ "missing_fields": [
1026
+ "temperature",
1027
+ "max_tokens"
1028
+ ],
1029
+ "required_field_count": 2,
1030
+ "populated_field_count": 0,
1031
+ "signal_version": "1.0"
1032
+ },
1033
+ "provenance": {
1034
+ "source_type": "first_party",
1035
+ "is_multi_source": false,
1036
+ "first_party_only": true,
1037
+ "distinct_reporting_organizations": 1,
1038
+ "signal_version": "1.0"
1039
+ },
1040
+ "variant_divergence": null,
1041
+ "cross_party_divergence": null
1042
+ }
1043
+ }
1044
+ },
1045
+ {
1046
+ "model_id": "openai/gpt-5-2-pro",
1047
+ "model_route_id": "openai__gpt-5-2-pro",
1048
+ "model_name": "GPT 5.2 Pro",
1049
+ "developer": "openai",
1050
+ "variant_key": "default",
1051
+ "raw_model_id": "openai/GPT 5.2 Pro",
1052
+ "score": 0.65,
1053
+ "evaluation_id": "apex-v1/openai_gpt-5.2-pro/1773260200",
1054
+ "retrieved_timestamp": "1773260200",
1055
+ "source_metadata": {
1056
+ "source_name": "Mercor APEX-v1 Leaderboard",
1057
+ "source_type": "evaluation_run",
1058
+ "source_organization_name": "Mercor",
1059
+ "source_organization_url": "https://www.mercor.com",
1060
+ "evaluator_relationship": "first_party"
1061
+ },
1062
+ "source_data": {
1063
+ "dataset_name": "apex-v1",
1064
+ "source_type": "hf_dataset",
1065
+ "hf_repo": "Mercor/APEX-v1"
1066
+ },
1067
+ "source_record_url": "https://huggingface.co/datasets/evaleval/card_backend/resolve/main/records/openai__gpt-5-2-pro/apex_v1_openai_gpt_5_2_pro_1773260200.json",
1068
+ "detailed_evaluation_results": null,
1069
+ "detailed_evaluation_results_meta": null,
1070
+ "passthrough_top_level_fields": null,
1071
+ "instance_level_data": null,
1072
+ "normalized_result": {
1073
+ "benchmark_family_key": "apex_v1",
1074
+ "benchmark_family_name": "APEX v1",
1075
+ "benchmark_parent_key": "apex_v1",
1076
+ "benchmark_parent_name": "APEX v1",
1077
+ "benchmark_component_key": "medicine_md",
1078
+ "benchmark_component_name": "Medicine (MD)",
1079
+ "benchmark_leaf_key": "apex_v1",
1080
+ "benchmark_leaf_name": "APEX v1",
1081
+ "slice_key": "medicine_md",
1082
+ "slice_name": "Medicine (MD)",
1083
+ "metric_name": "Score",
1084
+ "metric_id": "apex_v1.score",
1085
+ "metric_key": "score",
1086
+ "metric_source": "metric_config",
1087
+ "display_name": "Medicine (MD) / Score",
1088
+ "canonical_display_name": "APEX v1 / Medicine (MD) / Score",
1089
+ "raw_evaluation_name": "Medicine (MD)",
1090
+ "is_summary_score": false
1091
+ },
1092
+ "evalcards": {
1093
+ "annotations": {
1094
+ "reproducibility_gap": {
1095
+ "has_reproducibility_gap": true,
1096
+ "missing_fields": [
1097
+ "temperature",
1098
+ "max_tokens"
1099
+ ],
1100
+ "required_field_count": 2,
1101
+ "populated_field_count": 0,
1102
+ "signal_version": "1.0"
1103
+ },
1104
+ "provenance": {
1105
+ "source_type": "first_party",
1106
+ "is_multi_source": false,
1107
+ "first_party_only": true,
1108
+ "distinct_reporting_organizations": 1,
1109
+ "signal_version": "1.0"
1110
+ },
1111
+ "variant_divergence": null,
1112
+ "cross_party_divergence": null
1113
+ }
1114
+ }
1115
+ },
1116
+ {
1117
+ "model_id": "anthropic/opus-4-5",
1118
+ "model_route_id": "anthropic__opus-4-5",
1119
+ "model_name": "Opus 4.5",
1120
+ "developer": "anthropic",
1121
+ "variant_key": "default",
1122
+ "raw_model_id": "anthropic/Opus 4.5",
1123
+ "score": 0.65,
1124
+ "evaluation_id": "apex-v1/anthropic_opus-4.5/1773260200",
1125
+ "retrieved_timestamp": "1773260200",
1126
+ "source_metadata": {
1127
+ "source_name": "Mercor APEX-v1 Leaderboard",
1128
+ "source_type": "evaluation_run",
1129
+ "source_organization_name": "Mercor",
1130
+ "source_organization_url": "https://www.mercor.com",
1131
+ "evaluator_relationship": "first_party"
1132
+ },
1133
+ "source_data": {
1134
+ "dataset_name": "apex-v1",
1135
+ "source_type": "hf_dataset",
1136
+ "hf_repo": "Mercor/APEX-v1"
1137
+ },
1138
+ "source_record_url": "https://huggingface.co/datasets/evaleval/card_backend/resolve/main/records/anthropic__opus-4-5/apex_v1_anthropic_opus_4_5_1773260200.json",
1139
+ "detailed_evaluation_results": null,
1140
+ "detailed_evaluation_results_meta": null,
1141
+ "passthrough_top_level_fields": null,
1142
+ "instance_level_data": null,
1143
+ "normalized_result": {
1144
+ "benchmark_family_key": "apex_v1",
1145
+ "benchmark_family_name": "APEX v1",
1146
+ "benchmark_parent_key": "apex_v1",
1147
+ "benchmark_parent_name": "APEX v1",
1148
+ "benchmark_component_key": "medicine_md",
1149
+ "benchmark_component_name": "Medicine (MD)",
1150
+ "benchmark_leaf_key": "apex_v1",
1151
+ "benchmark_leaf_name": "APEX v1",
1152
+ "slice_key": "medicine_md",
1153
+ "slice_name": "Medicine (MD)",
1154
+ "metric_name": "Score",
1155
+ "metric_id": "apex_v1.score",
1156
+ "metric_key": "score",
1157
+ "metric_source": "metric_config",
1158
+ "display_name": "Medicine (MD) / Score",
1159
+ "canonical_display_name": "APEX v1 / Medicine (MD) / Score",
1160
+ "raw_evaluation_name": "Medicine (MD)",
1161
+ "is_summary_score": false
1162
+ },
1163
+ "evalcards": {
1164
+ "annotations": {
1165
+ "reproducibility_gap": {
1166
+ "has_reproducibility_gap": true,
1167
+ "missing_fields": [
1168
+ "temperature",
1169
+ "max_tokens"
1170
+ ],
1171
+ "required_field_count": 2,
1172
+ "populated_field_count": 0,
1173
+ "signal_version": "1.0"
1174
+ },
1175
+ "provenance": {
1176
+ "source_type": "first_party",
1177
+ "is_multi_source": false,
1178
+ "first_party_only": true,
1179
+ "distinct_reporting_organizations": 1,
1180
+ "signal_version": "1.0"
1181
+ },
1182
+ "variant_divergence": null,
1183
+ "cross_party_divergence": null
1184
+ }
1185
+ }
1186
+ }
1187
+ ],
1188
+ "models_count": 3,
1189
+ "top_score": 0.66
1190
+ }
1191
+ ],
1192
+ "metrics_count": 1,
1193
+ "metric_names": [
1194
+ "Score"
1195
+ ]
1196
+ }
1197
+ ],
1198
+ "metrics": [
1199
+ {
1200
+ "metric_summary_id": "apex_v1_score",
1201
+ "legacy_eval_summary_id": "apex_v1_apex_v1",
1202
+ "evaluation_name": "apex-v1",
1203
+ "display_name": "APEX v1 / Score",
1204
+ "canonical_display_name": "APEX v1 / Score",
1205
+ "benchmark_leaf_key": "apex_v1",
1206
+ "benchmark_leaf_name": "APEX v1",
1207
+ "slice_key": null,
1208
+ "slice_name": null,
1209
+ "lower_is_better": false,
1210
+ "metric_name": "Score",
1211
+ "metric_id": "apex_v1.score",
1212
+ "metric_key": "score",
1213
+ "metric_source": "metric_config",
1214
+ "metric_config": {
1215
+ "evaluation_description": "Overall APEX-v1 mean score (paper snapshot).",
1216
+ "lower_is_better": false,
1217
+ "score_type": "continuous",
1218
+ "min_score": 0,
1219
+ "max_score": 1,
1220
+ "additional_details": {
1221
+ "raw_evaluation_name": "Overall Score"
1222
+ },
1223
+ "metric_id": "apex_v1.score",
1224
+ "metric_name": "Score",
1225
+ "metric_kind": "score",
1226
+ "metric_unit": "proportion"
1227
+ },
1228
+ "model_results": [
1229
+ {
1230
+ "model_id": "openai/gpt-5",
1231
+ "model_route_id": "openai__gpt-5",
1232
+ "model_name": "GPT 5",
1233
+ "developer": "openai",
1234
+ "variant_key": "default",
1235
+ "raw_model_id": "openai/GPT 5",
1236
+ "score": 0.67,
1237
+ "evaluation_id": "apex-v1/openai_gpt-5/1773260200",
1238
+ "retrieved_timestamp": "1773260200",
1239
+ "source_metadata": {
1240
+ "source_name": "Mercor APEX-v1 Leaderboard",
1241
+ "source_type": "evaluation_run",
1242
+ "source_organization_name": "Mercor",
1243
+ "source_organization_url": "https://www.mercor.com",
1244
+ "evaluator_relationship": "first_party"
1245
+ },
1246
+ "source_data": {
1247
+ "dataset_name": "apex-v1",
1248
+ "source_type": "hf_dataset",
1249
+ "hf_repo": "Mercor/APEX-v1"
1250
+ },
1251
+ "source_record_url": "https://huggingface.co/datasets/evaleval/card_backend/resolve/main/records/openai__gpt-5/apex_v1_openai_gpt_5_1773260200.json",
1252
+ "detailed_evaluation_results": null,
1253
+ "detailed_evaluation_results_meta": null,
1254
+ "passthrough_top_level_fields": null,
1255
+ "instance_level_data": null,
1256
+ "normalized_result": {
1257
+ "benchmark_family_key": "apex_v1",
1258
+ "benchmark_family_name": "APEX v1",
1259
+ "benchmark_parent_key": "apex_v1",
1260
+ "benchmark_parent_name": "APEX v1",
1261
+ "benchmark_component_key": null,
1262
+ "benchmark_component_name": null,
1263
+ "benchmark_leaf_key": "apex_v1",
1264
+ "benchmark_leaf_name": "APEX v1",
1265
+ "slice_key": null,
1266
+ "slice_name": null,
1267
+ "metric_name": "Score",
1268
+ "metric_id": "apex_v1.score",
1269
+ "metric_key": "score",
1270
+ "metric_source": "metric_config",
1271
+ "display_name": "Score",
1272
+ "canonical_display_name": "APEX v1 / Score",
1273
+ "raw_evaluation_name": "apex-v1",
1274
+ "is_summary_score": false
1275
+ },
1276
+ "evalcards": {
1277
+ "annotations": {
1278
+ "reproducibility_gap": {
1279
+ "has_reproducibility_gap": true,
1280
+ "missing_fields": [
1281
+ "temperature",
1282
+ "max_tokens"
1283
+ ],
1284
+ "required_field_count": 2,
1285
+ "populated_field_count": 0,
1286
+ "signal_version": "1.0"
1287
+ },
1288
+ "provenance": {
1289
+ "source_type": "first_party",
1290
+ "is_multi_source": false,
1291
+ "first_party_only": true,
1292
+ "distinct_reporting_organizations": 1,
1293
+ "signal_version": "1.0"
1294
+ },
1295
+ "variant_divergence": null,
1296
+ "cross_party_divergence": null
1297
+ }
1298
+ }
1299
+ },
1300
+ {
1301
+ "model_id": "openai/gpt-5-2-pro",
1302
+ "model_route_id": "openai__gpt-5-2-pro",
1303
+ "model_name": "GPT 5.2 Pro",
1304
+ "developer": "openai",
1305
+ "variant_key": "default",
1306
+ "raw_model_id": "openai/GPT 5.2 Pro",
1307
+ "score": 0.668,
1308
+ "evaluation_id": "apex-v1/openai_gpt-5.2-pro/1773260200",
1309
+ "retrieved_timestamp": "1773260200",
1310
+ "source_metadata": {
1311
+ "source_name": "Mercor APEX-v1 Leaderboard",
1312
+ "source_type": "evaluation_run",
1313
+ "source_organization_name": "Mercor",
1314
+ "source_organization_url": "https://www.mercor.com",
1315
+ "evaluator_relationship": "first_party"
1316
+ },
1317
+ "source_data": {
1318
+ "dataset_name": "apex-v1",
1319
+ "source_type": "hf_dataset",
1320
+ "hf_repo": "Mercor/APEX-v1"
1321
+ },
1322
+ "source_record_url": "https://huggingface.co/datasets/evaleval/card_backend/resolve/main/records/openai__gpt-5-2-pro/apex_v1_openai_gpt_5_2_pro_1773260200.json",
1323
+ "detailed_evaluation_results": null,
1324
+ "detailed_evaluation_results_meta": null,
1325
+ "passthrough_top_level_fields": null,
1326
+ "instance_level_data": null,
1327
+ "normalized_result": {
1328
+ "benchmark_family_key": "apex_v1",
1329
+ "benchmark_family_name": "APEX v1",
1330
+ "benchmark_parent_key": "apex_v1",
1331
+ "benchmark_parent_name": "APEX v1",
1332
+ "benchmark_component_key": null,
1333
+ "benchmark_component_name": null,
1334
+ "benchmark_leaf_key": "apex_v1",
1335
+ "benchmark_leaf_name": "APEX v1",
1336
+ "slice_key": null,
1337
+ "slice_name": null,
1338
+ "metric_name": "Score",
1339
+ "metric_id": "apex_v1.score",
1340
+ "metric_key": "score",
1341
+ "metric_source": "metric_config",
1342
+ "display_name": "Score",
1343
+ "canonical_display_name": "APEX v1 / Score",
1344
+ "raw_evaluation_name": "apex-v1",
1345
+ "is_summary_score": false
1346
+ },
1347
+ "evalcards": {
1348
+ "annotations": {
1349
+ "reproducibility_gap": {
1350
+ "has_reproducibility_gap": true,
1351
+ "missing_fields": [
1352
+ "temperature",
1353
+ "max_tokens"
1354
+ ],
1355
+ "required_field_count": 2,
1356
+ "populated_field_count": 0,
1357
+ "signal_version": "1.0"
1358
+ },
1359
+ "provenance": {
1360
+ "source_type": "first_party",
1361
+ "is_multi_source": false,
1362
+ "first_party_only": true,
1363
+ "distinct_reporting_organizations": 1,
1364
+ "signal_version": "1.0"
1365
+ },
1366
+ "variant_divergence": null,
1367
+ "cross_party_divergence": null
1368
+ }
1369
+ }
1370
+ },
1371
+ {
1372
+ "model_id": "google/gemini-3-pro",
1373
+ "model_route_id": "google__gemini-3-pro",
1374
+ "model_name": "Gemini 3 Pro",
1375
+ "developer": "google",
1376
+ "variant_key": "default",
1377
+ "raw_model_id": "google/Gemini 3 Pro",
1378
+ "score": 0.643,
1379
+ "evaluation_id": "apex-v1/google_gemini-3-pro/1773260200",
1380
+ "retrieved_timestamp": "1773260200",
1381
+ "source_metadata": {
1382
+ "source_name": "Mercor APEX-v1 Leaderboard",
1383
+ "source_type": "evaluation_run",
1384
+ "source_organization_name": "Mercor",
1385
+ "source_organization_url": "https://www.mercor.com",
1386
+ "evaluator_relationship": "first_party"
1387
+ },
1388
+ "source_data": {
1389
+ "dataset_name": "apex-v1",
1390
+ "source_type": "hf_dataset",
1391
+ "hf_repo": "Mercor/APEX-v1"
1392
+ },
1393
+ "source_record_url": "https://huggingface.co/datasets/evaleval/card_backend/resolve/main/records/google__gemini-3-pro/apex_v1_google_gemini_3_pro_1773260200.json",
1394
+ "detailed_evaluation_results": null,
1395
+ "detailed_evaluation_results_meta": null,
1396
+ "passthrough_top_level_fields": null,
1397
+ "instance_level_data": null,
1398
+ "normalized_result": {
1399
+ "benchmark_family_key": "apex_v1",
1400
+ "benchmark_family_name": "APEX v1",
1401
+ "benchmark_parent_key": "apex_v1",
1402
+ "benchmark_parent_name": "APEX v1",
1403
+ "benchmark_component_key": null,
1404
+ "benchmark_component_name": null,
1405
+ "benchmark_leaf_key": "apex_v1",
1406
+ "benchmark_leaf_name": "APEX v1",
1407
+ "slice_key": null,
1408
+ "slice_name": null,
1409
+ "metric_name": "Score",
1410
+ "metric_id": "apex_v1.score",
1411
+ "metric_key": "score",
1412
+ "metric_source": "metric_config",
1413
+ "display_name": "Score",
1414
+ "canonical_display_name": "APEX v1 / Score",
1415
+ "raw_evaluation_name": "apex-v1",
1416
+ "is_summary_score": false
1417
+ },
1418
+ "evalcards": {
1419
+ "annotations": {
1420
+ "reproducibility_gap": {
1421
+ "has_reproducibility_gap": true,
1422
+ "missing_fields": [
1423
+ "temperature",
1424
+ "max_tokens"
1425
+ ],
1426
+ "required_field_count": 2,
1427
+ "populated_field_count": 0,
1428
+ "signal_version": "1.0"
1429
+ },
1430
+ "provenance": {
1431
+ "source_type": "first_party",
1432
+ "is_multi_source": false,
1433
+ "first_party_only": true,
1434
+ "distinct_reporting_organizations": 1,
1435
+ "signal_version": "1.0"
1436
+ },
1437
+ "variant_divergence": null,
1438
+ "cross_party_divergence": null
1439
+ }
1440
+ }
1441
+ },
1442
+ {
1443
+ "model_id": "google/gemini-3-flash",
1444
+ "model_route_id": "google__gemini-3-flash",
1445
+ "model_name": "Gemini 3 Flash",
1446
+ "developer": "google",
1447
+ "variant_key": "default",
1448
+ "raw_model_id": "google/Gemini 3 Flash",
1449
+ "score": 0.64,
1450
+ "evaluation_id": "apex-v1/google_gemini-3-flash/1773260200",
1451
+ "retrieved_timestamp": "1773260200",
1452
+ "source_metadata": {
1453
+ "source_name": "Mercor APEX-v1 Leaderboard",
1454
+ "source_type": "evaluation_run",
1455
+ "source_organization_name": "Mercor",
1456
+ "source_organization_url": "https://www.mercor.com",
1457
+ "evaluator_relationship": "first_party"
1458
+ },
1459
+ "source_data": {
1460
+ "dataset_name": "apex-v1",
1461
+ "source_type": "hf_dataset",
1462
+ "hf_repo": "Mercor/APEX-v1"
1463
+ },
1464
+ "source_record_url": "https://huggingface.co/datasets/evaleval/card_backend/resolve/main/records/google__gemini-3-flash/apex_v1_google_gemini_3_flash_1773260200.json",
1465
+ "detailed_evaluation_results": null,
1466
+ "detailed_evaluation_results_meta": null,
1467
+ "passthrough_top_level_fields": null,
1468
+ "instance_level_data": null,
1469
+ "normalized_result": {
1470
+ "benchmark_family_key": "apex_v1",
1471
+ "benchmark_family_name": "APEX v1",
1472
+ "benchmark_parent_key": "apex_v1",
1473
+ "benchmark_parent_name": "APEX v1",
1474
+ "benchmark_component_key": null,
1475
+ "benchmark_component_name": null,
1476
+ "benchmark_leaf_key": "apex_v1",
1477
+ "benchmark_leaf_name": "APEX v1",
1478
+ "slice_key": null,
1479
+ "slice_name": null,
1480
+ "metric_name": "Score",
1481
+ "metric_id": "apex_v1.score",
1482
+ "metric_key": "score",
1483
+ "metric_source": "metric_config",
1484
+ "display_name": "Score",
1485
+ "canonical_display_name": "APEX v1 / Score",
1486
+ "raw_evaluation_name": "apex-v1",
1487
+ "is_summary_score": false
1488
+ },
1489
+ "evalcards": {
1490
+ "annotations": {
1491
+ "reproducibility_gap": {
1492
+ "has_reproducibility_gap": true,
1493
+ "missing_fields": [
1494
+ "temperature",
1495
+ "max_tokens"
1496
+ ],
1497
+ "required_field_count": 2,
1498
+ "populated_field_count": 0,
1499
+ "signal_version": "1.0"
1500
+ },
1501
+ "provenance": {
1502
+ "source_type": "first_party",
1503
+ "is_multi_source": false,
1504
+ "first_party_only": true,
1505
+ "distinct_reporting_organizations": 1,
1506
+ "signal_version": "1.0"
1507
+ },
1508
+ "variant_divergence": null,
1509
+ "cross_party_divergence": null
1510
+ }
1511
+ }
1512
+ },
1513
+ {
1514
+ "model_id": "xai/grok-4",
1515
+ "model_route_id": "xai__grok-4",
1516
+ "model_name": "Grok 4",
1517
+ "developer": "xai",
1518
+ "variant_key": "default",
1519
+ "raw_model_id": "xai/Grok 4",
1520
+ "score": 0.635,
1521
+ "evaluation_id": "apex-v1/xai_grok-4/1773260200",
1522
+ "retrieved_timestamp": "1773260200",
1523
+ "source_metadata": {
1524
+ "source_name": "Mercor APEX-v1 Leaderboard",
1525
+ "source_type": "evaluation_run",
1526
+ "source_organization_name": "Mercor",
1527
+ "source_organization_url": "https://www.mercor.com",
1528
+ "evaluator_relationship": "first_party"
1529
+ },
1530
+ "source_data": {
1531
+ "dataset_name": "apex-v1",
1532
+ "source_type": "hf_dataset",
1533
+ "hf_repo": "Mercor/APEX-v1"
1534
+ },
1535
+ "source_record_url": "https://huggingface.co/datasets/evaleval/card_backend/resolve/main/records/xai__grok-4/apex_v1_xai_grok_4_1773260200.json",
1536
+ "detailed_evaluation_results": null,
1537
+ "detailed_evaluation_results_meta": null,
1538
+ "passthrough_top_level_fields": null,
1539
+ "instance_level_data": null,
1540
+ "normalized_result": {
1541
+ "benchmark_family_key": "apex_v1",
1542
+ "benchmark_family_name": "APEX v1",
1543
+ "benchmark_parent_key": "apex_v1",
1544
+ "benchmark_parent_name": "APEX v1",
1545
+ "benchmark_component_key": null,
1546
+ "benchmark_component_name": null,
1547
+ "benchmark_leaf_key": "apex_v1",
1548
+ "benchmark_leaf_name": "APEX v1",
1549
+ "slice_key": null,
1550
+ "slice_name": null,
1551
+ "metric_name": "Score",
1552
+ "metric_id": "apex_v1.score",
1553
+ "metric_key": "score",
1554
+ "metric_source": "metric_config",
1555
+ "display_name": "Score",
1556
+ "canonical_display_name": "APEX v1 / Score",
1557
+ "raw_evaluation_name": "apex-v1",
1558
+ "is_summary_score": false
1559
+ },
1560
+ "evalcards": {
1561
+ "annotations": {
1562
+ "reproducibility_gap": {
1563
+ "has_reproducibility_gap": true,
1564
+ "missing_fields": [
1565
+ "temperature",
1566
+ "max_tokens"
1567
+ ],
1568
+ "required_field_count": 2,
1569
+ "populated_field_count": 0,
1570
+ "signal_version": "1.0"
1571
+ },
1572
+ "provenance": {
1573
+ "source_type": "first_party",
1574
+ "is_multi_source": false,
1575
+ "first_party_only": true,
1576
+ "distinct_reporting_organizations": 1,
1577
+ "signal_version": "1.0"
1578
+ },
1579
+ "variant_divergence": null,
1580
+ "cross_party_divergence": null
1581
+ }
1582
+ }
1583
+ },
1584
+ {
1585
+ "model_id": "google/gemini-2-5-flash",
1586
+ "model_route_id": "google__gemini-2-5-flash",
1587
+ "model_name": "Gemini 2.5 Flash",
1588
+ "developer": "google",
1589
+ "variant_key": "default",
1590
+ "raw_model_id": "google/Gemini 2.5 Flash",
1591
+ "score": 0.604,
1592
+ "evaluation_id": "apex-v1/google_gemini-2.5-flash/1773260200",
1593
+ "retrieved_timestamp": "1773260200",
1594
+ "source_metadata": {
1595
+ "source_name": "Mercor APEX-v1 Leaderboard",
1596
+ "source_type": "evaluation_run",
1597
+ "source_organization_name": "Mercor",
1598
+ "source_organization_url": "https://www.mercor.com",
1599
+ "evaluator_relationship": "first_party"
1600
+ },
1601
+ "source_data": {
1602
+ "dataset_name": "apex-v1",
1603
+ "source_type": "hf_dataset",
1604
+ "hf_repo": "Mercor/APEX-v1"
1605
+ },
1606
+ "source_record_url": "https://huggingface.co/datasets/evaleval/card_backend/resolve/main/records/google__gemini-2-5-flash/apex_v1_google_gemini_2_5_flash_1773260200.json",
1607
+ "detailed_evaluation_results": null,
1608
+ "detailed_evaluation_results_meta": null,
1609
+ "passthrough_top_level_fields": null,
1610
+ "instance_level_data": null,
1611
+ "normalized_result": {
1612
+ "benchmark_family_key": "apex_v1",
1613
+ "benchmark_family_name": "APEX v1",
1614
+ "benchmark_parent_key": "apex_v1",
1615
+ "benchmark_parent_name": "APEX v1",
1616
+ "benchmark_component_key": null,
1617
+ "benchmark_component_name": null,
1618
+ "benchmark_leaf_key": "apex_v1",
1619
+ "benchmark_leaf_name": "APEX v1",
1620
+ "slice_key": null,
1621
+ "slice_name": null,
1622
+ "metric_name": "Score",
1623
+ "metric_id": "apex_v1.score",
1624
+ "metric_key": "score",
1625
+ "metric_source": "metric_config",
1626
+ "display_name": "Score",
1627
+ "canonical_display_name": "APEX v1 / Score",
1628
+ "raw_evaluation_name": "apex-v1",
1629
+ "is_summary_score": false
1630
+ },
1631
+ "evalcards": {
1632
+ "annotations": {
1633
+ "reproducibility_gap": {
1634
+ "has_reproducibility_gap": true,
1635
+ "missing_fields": [
1636
+ "temperature",
1637
+ "max_tokens"
1638
+ ],
1639
+ "required_field_count": 2,
1640
+ "populated_field_count": 0,
1641
+ "signal_version": "1.0"
1642
+ },
1643
+ "provenance": {
1644
+ "source_type": "first_party",
1645
+ "is_multi_source": false,
1646
+ "first_party_only": true,
1647
+ "distinct_reporting_organizations": 1,
1648
+ "signal_version": "1.0"
1649
+ },
1650
+ "variant_divergence": null,
1651
+ "cross_party_divergence": null
1652
+ }
1653
+ }
1654
+ },
1655
+ {
1656
+ "model_id": "openai/gpt-4o",
1657
+ "model_route_id": "openai__gpt-4o",
1658
+ "model_name": "GPT 4o",
1659
+ "developer": "openai",
1660
+ "variant_key": "default",
1661
+ "raw_model_id": "openai/GPT 4o",
1662
+ "score": 0.359,
1663
+ "evaluation_id": "apex-v1/openai_gpt-4o/1773260200",
1664
+ "retrieved_timestamp": "1773260200",
1665
+ "source_metadata": {
1666
+ "source_name": "Mercor APEX-v1 Leaderboard",
1667
+ "source_type": "evaluation_run",
1668
+ "source_organization_name": "Mercor",
1669
+ "source_organization_url": "https://www.mercor.com",
1670
+ "evaluator_relationship": "first_party"
1671
+ },
1672
+ "source_data": {
1673
+ "dataset_name": "apex-v1",
1674
+ "source_type": "hf_dataset",
1675
+ "hf_repo": "Mercor/APEX-v1"
1676
+ },
1677
+ "source_record_url": "https://huggingface.co/datasets/evaleval/card_backend/resolve/main/records/openai__gpt-4o/apex_v1_openai_gpt_4o_1773260200.json",
1678
+ "detailed_evaluation_results": null,
1679
+ "detailed_evaluation_results_meta": null,
1680
+ "passthrough_top_level_fields": null,
1681
+ "instance_level_data": null,
1682
+ "normalized_result": {
1683
+ "benchmark_family_key": "apex_v1",
1684
+ "benchmark_family_name": "APEX v1",
1685
+ "benchmark_parent_key": "apex_v1",
1686
+ "benchmark_parent_name": "APEX v1",
1687
+ "benchmark_component_key": null,
1688
+ "benchmark_component_name": null,
1689
+ "benchmark_leaf_key": "apex_v1",
1690
+ "benchmark_leaf_name": "APEX v1",
1691
+ "slice_key": null,
1692
+ "slice_name": null,
1693
+ "metric_name": "Score",
1694
+ "metric_id": "apex_v1.score",
1695
+ "metric_key": "score",
1696
+ "metric_source": "metric_config",
1697
+ "display_name": "Score",
1698
+ "canonical_display_name": "APEX v1 / Score",
1699
+ "raw_evaluation_name": "apex-v1",
1700
+ "is_summary_score": false
1701
+ },
1702
+ "evalcards": {
1703
+ "annotations": {
1704
+ "reproducibility_gap": {
1705
+ "has_reproducibility_gap": true,
1706
+ "missing_fields": [
1707
+ "temperature",
1708
+ "max_tokens"
1709
+ ],
1710
+ "required_field_count": 2,
1711
+ "populated_field_count": 0,
1712
+ "signal_version": "1.0"
1713
+ },
1714
+ "provenance": {
1715
+ "source_type": "first_party",
1716
+ "is_multi_source": false,
1717
+ "first_party_only": true,
1718
+ "distinct_reporting_organizations": 1,
1719
+ "signal_version": "1.0"
1720
+ },
1721
+ "variant_divergence": null,
1722
+ "cross_party_divergence": null
1723
+ }
1724
+ }
1725
+ }
1726
+ ],
1727
+ "models_count": 7,
1728
+ "top_score": 0.67
1729
+ }
1730
+ ],
1731
+ "subtasks_count": 4,
1732
+ "metrics_count": 5,
1733
+ "models_count": 10,
1734
+ "metric_names": [
1735
+ "Score"
1736
+ ],
1737
+ "primary_metric_name": "Score",
1738
+ "top_score": null,
1739
+ "instance_data": {
1740
+ "available": false,
1741
+ "url_count": 0,
1742
+ "sample_urls": [],
1743
+ "models_with_loaded_instances": 0
1744
+ },
1745
+ "evalcards": {
1746
+ "annotations": {
1747
+ "reporting_completeness": {
1748
+ "completeness_score": 0.9285714285714286,
1749
+ "total_fields_evaluated": 28,
1750
+ "missing_required_fields": [
1751
+ "evalcards.lifecycle_status",
1752
+ "evalcards.preregistration_url"
1753
+ ],
1754
+ "partial_fields": [],
1755
+ "field_scores": [
1756
+ {
1757
+ "field_path": "autobenchmarkcard.benchmark_details.name",
1758
+ "coverage_type": "full",
1759
+ "score": 1.0
1760
+ },
1761
+ {
1762
+ "field_path": "autobenchmarkcard.benchmark_details.overview",
1763
+ "coverage_type": "full",
1764
+ "score": 1.0
1765
+ },
1766
+ {
1767
+ "field_path": "autobenchmarkcard.benchmark_details.data_type",
1768
+ "coverage_type": "full",
1769
+ "score": 1.0
1770
+ },
1771
+ {
1772
+ "field_path": "autobenchmarkcard.benchmark_details.domains",
1773
+ "coverage_type": "full",
1774
+ "score": 1.0
1775
+ },
1776
+ {
1777
+ "field_path": "autobenchmarkcard.benchmark_details.languages",
1778
+ "coverage_type": "full",
1779
+ "score": 1.0
1780
+ },
1781
+ {
1782
+ "field_path": "autobenchmarkcard.benchmark_details.similar_benchmarks",
1783
+ "coverage_type": "full",
1784
+ "score": 1.0
1785
+ },
1786
+ {
1787
+ "field_path": "autobenchmarkcard.benchmark_details.resources",
1788
+ "coverage_type": "full",
1789
+ "score": 1.0
1790
+ },
1791
+ {
1792
+ "field_path": "autobenchmarkcard.purpose_and_intended_users.goal",
1793
+ "coverage_type": "full",
1794
+ "score": 1.0
1795
+ },
1796
+ {
1797
+ "field_path": "autobenchmarkcard.purpose_and_intended_users.audience",
1798
+ "coverage_type": "full",
1799
+ "score": 1.0
1800
+ },
1801
+ {
1802
+ "field_path": "autobenchmarkcard.purpose_and_intended_users.tasks",
1803
+ "coverage_type": "full",
1804
+ "score": 1.0
1805
+ },
1806
+ {
1807
+ "field_path": "autobenchmarkcard.purpose_and_intended_users.limitations",
1808
+ "coverage_type": "full",
1809
+ "score": 1.0
1810
+ },
1811
+ {
1812
+ "field_path": "autobenchmarkcard.purpose_and_intended_users.out_of_scope_uses",
1813
+ "coverage_type": "full",
1814
+ "score": 1.0
1815
+ },
1816
+ {
1817
+ "field_path": "autobenchmarkcard.methodology.methods",
1818
+ "coverage_type": "full",
1819
+ "score": 1.0
1820
+ },
1821
+ {
1822
+ "field_path": "autobenchmarkcard.methodology.metrics",
1823
+ "coverage_type": "full",
1824
+ "score": 1.0
1825
+ },
1826
+ {
1827
+ "field_path": "autobenchmarkcard.methodology.calculation",
1828
+ "coverage_type": "full",
1829
+ "score": 1.0
1830
+ },
1831
+ {
1832
+ "field_path": "autobenchmarkcard.methodology.interpretation",
1833
+ "coverage_type": "full",
1834
+ "score": 1.0
1835
+ },
1836
+ {
1837
+ "field_path": "autobenchmarkcard.methodology.baseline_results",
1838
+ "coverage_type": "full",
1839
+ "score": 1.0
1840
+ },
1841
+ {
1842
+ "field_path": "autobenchmarkcard.methodology.validation",
1843
+ "coverage_type": "full",
1844
+ "score": 1.0
1845
+ },
1846
+ {
1847
+ "field_path": "autobenchmarkcard.ethical_and_legal_considerations.privacy_and_anonymity",
1848
+ "coverage_type": "full",
1849
+ "score": 1.0
1850
+ },
1851
+ {
1852
+ "field_path": "autobenchmarkcard.ethical_and_legal_considerations.data_licensing",
1853
+ "coverage_type": "full",
1854
+ "score": 1.0
1855
+ },
1856
+ {
1857
+ "field_path": "autobenchmarkcard.ethical_and_legal_considerations.consent_procedures",
1858
+ "coverage_type": "full",
1859
+ "score": 1.0
1860
+ },
1861
+ {
1862
+ "field_path": "autobenchmarkcard.ethical_and_legal_considerations.compliance_with_regulations",
1863
+ "coverage_type": "full",
1864
+ "score": 1.0
1865
+ },
1866
+ {
1867
+ "field_path": "autobenchmarkcard.data",
1868
+ "coverage_type": "partial",
1869
+ "score": 1.0
1870
+ },
1871
+ {
1872
+ "field_path": "eee_eval.source_metadata.source_type",
1873
+ "coverage_type": "full",
1874
+ "score": 1.0
1875
+ },
1876
+ {
1877
+ "field_path": "eee_eval.source_metadata.source_organization_name",
1878
+ "coverage_type": "full",
1879
+ "score": 1.0
1880
+ },
1881
+ {
1882
+ "field_path": "eee_eval.source_metadata.evaluator_relationship",
1883
+ "coverage_type": "full",
1884
+ "score": 1.0
1885
+ },
1886
+ {
1887
+ "field_path": "evalcards.lifecycle_status",
1888
+ "coverage_type": "reserved",
1889
+ "score": 0.0
1890
+ },
1891
+ {
1892
+ "field_path": "evalcards.preregistration_url",
1893
+ "coverage_type": "reserved",
1894
+ "score": 0.0
1895
+ }
1896
+ ],
1897
+ "signal_version": "1.0"
1898
+ },
1899
+ "benchmark_comparability": {
1900
+ "variant_divergence_groups": [],
1901
+ "cross_party_divergence_groups": []
1902
+ }
1903
+ }
1904
+ },
1905
+ "reproducibility_summary": {
1906
+ "results_total": 19,
1907
+ "has_reproducibility_gap_count": 19,
1908
+ "populated_ratio_avg": 0.0
1909
+ },
1910
+ "provenance_summary": {
1911
+ "total_results": 19,
1912
+ "total_groups": 19,
1913
+ "multi_source_groups": 0,
1914
+ "first_party_only_groups": 19,
1915
+ "source_type_distribution": {
1916
+ "first_party": 19,
1917
+ "third_party": 0,
1918
+ "collaborative": 0,
1919
+ "unspecified": 0
1920
+ }
1921
+ },
1922
+ "comparability_summary": {
1923
+ "total_groups": 19,
1924
+ "groups_with_variant_check": 0,
1925
+ "groups_with_cross_party_check": 0,
1926
+ "variant_divergent_count": 0,
1927
+ "cross_party_divergent_count": 0
1928
+ }
1929
+ }
tests/fixtures/evals/appworld.json ADDED
The diff for this file is too large to render. See raw diff
 
tests/fixtures/evals/artificial_analysis_llms_artificial_analysis_aime.json ADDED
The diff for this file is too large to render. See raw diff
 
tests/fixtures/evals/helm_capabilities.json ADDED
The diff for this file is too large to render. See raw diff
 
tests/fixtures/evals/helm_classic_truthfulqa.json ADDED
The diff for this file is too large to render. See raw diff
 
tests/fixtures/evals/helm_lite_narrativeqa.json ADDED
The diff for this file is too large to render. See raw diff
 
tests/fixtures/evals/helm_safety_simplesafetytests.json ADDED
The diff for this file is too large to render. See raw diff
 
tests/fixtures/loader.ts ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { readFileSync, readdirSync } from "fs"
2
+ import { fileURLToPath } from "url"
3
+ import path from "path"
4
+
5
+ import manifest from "./manifest.json"
6
+
7
// Absolute path of the directory that contains this module; every fixture
// path below is resolved relative to it.
const FIXTURES_DIR = path.dirname(fileURLToPath(import.meta.url))

// The fixture groups listed in manifest.json.
export type FixtureGroup =
  | "evals"
  | "models"
  | "developers"
  | "model_cards"

// Sub-directory (under FIXTURES_DIR) holding each group's pinned JSON files.
// Only `model_cards` differs from its key: the directory is dash-separated.
const GROUP_DIRS: Record<FixtureGroup, string> = {
  evals: "evals",
  models: "models",
  developers: "developers",
  model_cards: "model-cards",
}

// One manifest row: `id` is the fixture's file name without the `.json`
// extension, `why` is the annotation explaining what the fixture exercises.
export interface FixtureEntry {
  id: string
  why: string
}
22
+
23
// Returns the manifest rows (id + why) registered for `group`.
// The array is the manifest's own array, not a copy.
export function fixtureEntries(group: FixtureGroup): FixtureEntry[] {
  const entriesByGroup = manifest as Record<FixtureGroup, FixtureEntry[]>
  return entriesByGroup[group]
}
26
+
27
// Reads and parses the pinned fixture `<group dir>/<id>.json`.
// The parsed value is asserted to be `T` without runtime validation.
// Throws if the file cannot be read or is not valid JSON.
export function loadFixture<T>(group: FixtureGroup, id: string): T {
  const raw = readFileSync(
    path.join(FIXTURES_DIR, GROUP_DIRS[group], `${id}.json`),
    "utf8"
  )
  return JSON.parse(raw) as T
}
31
+
32
// Loads every fixture the manifest lists for `group`, in manifest order,
// pairing each parsed payload with its manifest id and `why` annotation.
export function loadAllFixtures<T>(group: FixtureGroup): Array<{ id: string; why: string; data: T }> {
  const loaded: Array<{ id: string; why: string; data: T }> = []
  for (const { id, why } of fixtureEntries(group)) {
    loaded.push({ id, why, data: loadFixture<T>(group, id) })
  }
  return loaded
}
39
+
40
// Lists the `.json` file names found in `.cache/hf-data/<group>`, resolved two
// directory levels above FIXTURES_DIR. Returns an empty list when the
// directory cannot be read (any readdir failure is treated as "no cache").
// NOTE(review): the group key is used verbatim as the directory name here,
// whereas pinned fixtures map `model_cards` to "model-cards" — confirm the
// live cache layout really uses the underscore form.
export function listLiveCacheFiles(group: FixtureGroup): string[] {
  const cacheDir = path.resolve(FIXTURES_DIR, "..", "..", ".cache", "hf-data", group)
  let names: string[]
  try {
    names = readdirSync(cacheDir)
  } catch {
    return []
  }
  return names.filter((name) => name.endsWith(".json"))
}
48
+
49
// Reads and parses `.cache/hf-data/<group>/<fileName>` (two directory levels
// above FIXTURES_DIR). `fileName` includes its extension. The parsed value is
// asserted to be `T` without runtime validation; read and parse errors throw.
export function loadLiveCacheFile<T>(group: FixtureGroup, fileName: string): T {
  const raw = readFileSync(
    path.resolve(FIXTURES_DIR, "..", "..", ".cache", "hf-data", group, fileName),
    "utf8"
  )
  return JSON.parse(raw) as T
}
53
+
54
// Yields every model_result row reachable from a detail object's
// hierarchy_by_category, together with a dotted path locating the row
// (rooted at `fixtureId`) for use in assertion messages. Shared by the
// fixture contracts and the live-cache drift checks. Generic in the result
// type so callers can pass a precise type from lib/hf-data.ts.
// A missing hierarchy_by_category, or a nullish node list, yields nothing.
export function* walkHierarchyResults<TResult>(
  detail: HierarchyDetail<TResult>,
  fixtureId: string
): Generator<{ result: TResult; path: string }> {
  const categories = Object.entries(detail.hierarchy_by_category ?? {})
  for (const [categoryKey, nodes] of categories) {
    const categoryPath = `${fixtureId}.hierarchy_by_category.${categoryKey}`
    let nodeIdx = 0
    for (const node of nodes ?? []) {
      yield* walkNode<TResult>(node, `${categoryPath}[${nodeIdx}]`)
      nodeIdx += 1
    }
  }
}
67
+
68
// Minimal structural view of a detail object: only the field the hierarchy
// walker reads. The data comes from parsed JSON, where an absent value is
// spelled `null` rather than `undefined`, so `null` is accepted everywhere
// the walker already guards with `??`.
interface HierarchyDetail<TResult> {
  hierarchy_by_category?: Record<string, HierarchyNode<TResult>[] | null> | null
}

// One node of the hierarchy: its own metrics (each carrying model_result
// rows) plus any nested subtasks, which are nodes of the same shape.
interface HierarchyNode<TResult> {
  metrics?: Array<{ model_results?: TResult[] | null }> | null
  subtasks?: HierarchyNode<TResult>[] | null
}
76
+
77
// Depth-first walk of a single hierarchy node: first yields each of the
// node's own model_result rows (metric by metric), then recurses into its
// subtasks. `basePath` is the dotted path of `node`; every yielded path
// extends it with the metric/result or subtask indices. Missing `metrics`,
// `model_results` or `subtasks` are treated as empty.
function* walkNode<TResult>(
  node: HierarchyNode<TResult>,
  basePath: string
): Generator<{ result: TResult; path: string }> {
  let metricIdx = 0
  for (const metric of node.metrics ?? []) {
    let resultIdx = 0
    for (const result of metric.model_results ?? []) {
      const resultPath = `${basePath}.metrics[${metricIdx}].model_results[${resultIdx}]`
      yield { result, path: resultPath }
      resultIdx += 1
    }
    metricIdx += 1
  }

  let subtaskIdx = 0
  for (const subtask of node.subtasks ?? []) {
    yield* walkNode<TResult>(subtask, `${basePath}.subtasks[${subtaskIdx}]`)
    subtaskIdx += 1
  }
}
tests/fixtures/manifest.json ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "$comment": "Pinned snapshot of HF cache files used by Tier A pipeline-contract tests and Tier B adapter snapshot tests. Refresh via `pnpm refresh-fixtures`. Each fixture earns its place by exercising a specific code path documented in `notes/testing-strategy.md` (curation criteria).",
3
+ "snapshot_source": ".cache/hf-data",
4
+ "snapshot_ts": "2026-04-27T22:32:59.376Z",
5
+ "evals": [
6
+ {
7
+ "id": "apex_v1",
8
+ "why": "Mercor first-party badge; pipeline category=knowledge; subtasks"
9
+ },
10
+ {
11
+ "id": "artificial_analysis_llms_artificial_analysis_aime",
12
+ "why": "Artificial Analysis third-party badge; source_type=documentation in pipeline"
13
+ },
14
+ {
15
+ "id": "helm_classic_truthfulqa",
16
+ "why": "Safety regression-bait — pipeline category=knowledge but inferCategoryFromBenchmark returns Safety"
17
+ },
18
+ {
19
+ "id": "helm_safety_simplesafetytests",
20
+ "why": "Safety regression-bait — pipeline category=general for an obvious safety eval"
21
+ },
22
+ {
23
+ "id": "helm_capabilities",
24
+ "why": "Composite parent eval; pipeline category=knowledge"
25
+ },
26
+ {
27
+ "id": "helm_lite_narrativeqa",
28
+ "why": "Subtask leaf under helm_lite parent; pipeline category=reasoning, regex returns General"
29
+ },
30
+ {
31
+ "id": "appworld",
32
+ "why": "Pipeline category=coding (one of the 3 keys we added to PIPELINE_CATEGORY_MAP); inferCategoryFromBenchmark returns Agentic"
33
+ }
34
+ ],
35
+ "models": [
36
+ {
37
+ "id": "openai__gpt-5-2-pro",
38
+ "why": "5 variants — exercises variant_lookup and per-variant grouping in flattenModelEvaluations"
39
+ },
40
+ {
41
+ "id": "google__gemini-3-flash",
42
+ "why": "Already covered by parity harness; medium-size; multi-category hierarchy"
43
+ },
44
+ {
45
+ "id": "ai21__j1-grande-v1-17b",
46
+ "why": "Has `safety` hierarchy_by_category key"
47
+ },
48
+ {
49
+ "id": "bytedance__seed-2-0-lite",
50
+ "why": "Has `coding` hierarchy_by_category key (the substring-fallacy bug case); small"
51
+ }
52
+ ],
53
+ "developers": [
54
+ {
55
+ "id": "openai",
56
+ "why": "KNOWN_DEVELOPER_NAMES canonicalization (openai → OpenAI)"
57
+ },
58
+ {
59
+ "id": "anthropic",
60
+ "why": "Multiple model families; typical case"
61
+ },
62
+ {
63
+ "id": "01-ai",
64
+ "why": "Dash-prefix slug — exercises pipelineSlugify edge case"
65
+ }
66
+ ],
67
+ "model_cards": [
68
+ {
69
+ "id": "openai__gpt-5",
70
+ "why": "6 variants; rich top_benchmark_scores; hfModelCardToEvaluationCardData edge case"
71
+ },
72
+ {
73
+ "id": "anthropic__claude-opus-4.5",
74
+ "why": "Dotted route_id (vs dashed in model detail files) — capturing the route-id mismatch"
75
+ },
76
+ {
77
+ "id": "01-ai__yi-34b",
78
+ "why": "Developer name canonicalization (01-ai → 01.AI per KNOWN_DEVELOPER_NAMES)"
79
+ }
80
+ ]
81
+ }
tests/fixtures/model-cards/01-ai__yi-34b.json ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_family_id": "01-ai/yi-34b",
3
+ "model_route_id": "01-ai__yi-34b",
4
+ "model_family_name": "Yi 34B",
5
+ "developer": "01-ai",
6
+ "params_billions": 34,
7
+ "total_evaluations": 3,
8
+ "benchmark_count": 3,
9
+ "benchmark_family_count": 3,
10
+ "categories_covered": [
11
+ "general",
12
+ "knowledge",
13
+ "reasoning"
14
+ ],
15
+ "last_updated": "2026-03-21T12:31:52.005480Z",
16
+ "variants": [
17
+ {
18
+ "variant_key": "default",
19
+ "variant_label": "Default",
20
+ "evaluation_count": 3,
21
+ "raw_model_ids": [
22
+ "01-ai/yi-34b",
23
+ "01-ai/Yi-34B"
24
+ ],
25
+ "last_updated": "2026-03-21T12:31:52.005480Z"
26
+ }
27
+ ],
28
+ "score_summary": {
29
+ "count": 52,
30
+ "min": 0.0514,
31
+ "max": 0.936,
32
+ "average": 0.6793153846153845
33
+ },
34
+ "reproducibility_summary": {
35
+ "results_total": 52,
36
+ "has_reproducibility_gap_count": 52,
37
+ "populated_ratio_avg": 0
38
+ },
39
+ "provenance_summary": {
40
+ "total_results": 52,
41
+ "total_groups": 52,
42
+ "multi_source_groups": 0,
43
+ "first_party_only_groups": 0,
44
+ "source_type_distribution": {
45
+ "first_party": 0,
46
+ "third_party": 52,
47
+ "collaborative": 0,
48
+ "unspecified": 0
49
+ }
50
+ },
51
+ "comparability_summary": {
52
+ "total_groups": 52,
53
+ "groups_with_variant_check": 0,
54
+ "groups_with_cross_party_check": 0,
55
+ "variant_divergent_count": 0,
56
+ "cross_party_divergent_count": 0
57
+ },
58
+ "benchmark_names": [
59
+ "BBH",
60
+ "GPQA",
61
+ "GSM8K",
62
+ "Helm lite",
63
+ "IFEval",
64
+ "LegalBench",
65
+ "MATH",
66
+ "MATH Level 5",
67
+ "MMLU",
68
+ "MMLU-PRO",
69
+ "MUSR",
70
+ "MedQA",
71
+ "NarrativeQA",
72
+ "NaturalQuestions (closed-book)",
73
+ "OpenbookQA",
74
+ "WMT 2014"
75
+ ],
76
+ "top_benchmark_scores": [
77
+ {
78
+ "benchmark": "MMLU",
79
+ "benchmarkKey": "helm_mmlu",
80
+ "canonical_display_name": "Mmlu / Marketing / Exact Match",
81
+ "evaluation_name": "Marketing",
82
+ "score": 0.936,
83
+ "metric": "EM on Marketing",
84
+ "lower_is_better": false
85
+ },
86
+ {
87
+ "benchmark": "OpenbookQA",
88
+ "benchmarkKey": "helm_lite_openbookqa",
89
+ "canonical_display_name": "OpenbookQA / Exact Match",
90
+ "evaluation_name": "OpenbookQA",
91
+ "score": 0.92,
92
+ "metric": "EM on OpenbookQA",
93
+ "lower_is_better": false
94
+ },
95
+ {
96
+ "benchmark": "NarrativeQA",
97
+ "benchmarkKey": "helm_lite_narrativeqa",
98
+ "canonical_display_name": "NarrativeQA / F1",
99
+ "evaluation_name": "NarrativeQA",
100
+ "score": 0.782,
101
+ "metric": "F1 on NarrativeQA",
102
+ "lower_is_better": false
103
+ },
104
+ {
105
+ "benchmark": "MedQA",
106
+ "benchmarkKey": "helm_lite_medqa",
107
+ "canonical_display_name": "MedQA / Exact Match",
108
+ "evaluation_name": "MedQA",
109
+ "score": 0.656,
110
+ "metric": "EM on MedQA",
111
+ "lower_is_better": false
112
+ },
113
+ {
114
+ "benchmark": "MMLU",
115
+ "benchmarkKey": "helm_lite_mmlu",
116
+ "canonical_display_name": "MMLU / Exact Match",
117
+ "evaluation_name": "MMLU",
118
+ "score": 0.65,
119
+ "metric": "EM on MMLU",
120
+ "lower_is_better": false
121
+ },
122
+ {
123
+ "benchmark": "GSM8K",
124
+ "benchmarkKey": "helm_lite_gsm8k",
125
+ "canonical_display_name": "GSM8K / Exact Match",
126
+ "evaluation_name": "GSM8K",
127
+ "score": 0.648,
128
+ "metric": "EM on GSM8K",
129
+ "lower_is_better": false
130
+ },
131
+ {
132
+ "benchmark": "LegalBench",
133
+ "benchmarkKey": "helm_lite_legalbench",
134
+ "canonical_display_name": "LegalBench / Exact Match",
135
+ "evaluation_name": "LegalBench",
136
+ "score": 0.618,
137
+ "metric": "EM on LegalBench",
138
+ "lower_is_better": false
139
+ },
140
+ {
141
+ "benchmark": "Helm lite",
142
+ "benchmarkKey": "helm_lite",
143
+ "canonical_display_name": "Helm lite / Win Rate",
144
+ "evaluation_name": "helm_lite",
145
+ "score": 0.57,
146
+ "metric": "How many models this model outperforms on average (over columns).",
147
+ "lower_is_better": false
148
+ },
149
+ {
150
+ "benchmark": "BBH",
151
+ "benchmarkKey": "hfopenllm_v2_bbh",
152
+ "canonical_display_name": "BBH / Accuracy",
153
+ "evaluation_name": "BBH",
154
+ "score": 0.5457,
155
+ "metric": "Accuracy on BBH",
156
+ "lower_is_better": false
157
+ },
158
+ {
159
+ "benchmark": "NaturalQuestions (closed-book)",
160
+ "benchmarkKey": "helm_lite_naturalquestions_closed_book",
161
+ "canonical_display_name": "NaturalQuestions (closed-book) / F1",
162
+ "evaluation_name": "NaturalQuestions (closed-book)",
163
+ "score": 0.443,
164
+ "metric": "F1 on NaturalQuestions (closed-book)",
165
+ "lower_is_better": false
166
+ },
167
+ {
168
+ "benchmark": "MMLU-PRO",
169
+ "benchmarkKey": "hfopenllm_v2_mmlu_pro",
170
+ "canonical_display_name": "MMLU-PRO / Accuracy",
171
+ "evaluation_name": "MMLU-PRO",
172
+ "score": 0.4412,
173
+ "metric": "Accuracy on MMLU-PRO",
174
+ "lower_is_better": false
175
+ },
176
+ {
177
+ "benchmark": "MUSR",
178
+ "benchmarkKey": "hfopenllm_v2_musr",
179
+ "canonical_display_name": "MUSR / Accuracy",
180
+ "evaluation_name": "MUSR",
181
+ "score": 0.4119,
182
+ "metric": "Accuracy on MUSR",
183
+ "lower_is_better": false
184
+ },
185
+ {
186
+ "benchmark": "MATH",
187
+ "benchmarkKey": "helm_lite_math",
188
+ "canonical_display_name": "MATH / Equivalent (CoT)",
189
+ "evaluation_name": "MATH",
190
+ "score": 0.375,
191
+ "metric": "Equivalent (CoT) on MATH",
192
+ "lower_is_better": false
193
+ },
194
+ {
195
+ "benchmark": "GPQA",
196
+ "benchmarkKey": "hfopenllm_v2_gpqa",
197
+ "canonical_display_name": "GPQA / Accuracy",
198
+ "evaluation_name": "GPQA",
199
+ "score": 0.3666,
200
+ "metric": "Accuracy on GPQA",
201
+ "lower_is_better": false
202
+ },
203
+ {
204
+ "benchmark": "IFEval",
205
+ "benchmarkKey": "hfopenllm_v2_ifeval",
206
+ "canonical_display_name": "IFEval / Accuracy",
207
+ "evaluation_name": "IFEval",
208
+ "score": 0.3046,
209
+ "metric": "Accuracy on IFEval",
210
+ "lower_is_better": false
211
+ }
212
+ ]
213
+ }
tests/fixtures/model-cards/anthropic__claude-opus-4.5.json ADDED
@@ -0,0 +1,241 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_family_id": "anthropic/claude-opus-4.5",
3
+ "model_route_id": "anthropic__claude-opus-4.5",
4
+ "model_family_name": "Claude Opus 4.5",
5
+ "developer": "anthropic",
6
+ "params_billions": null,
7
+ "total_evaluations": 49,
8
+ "benchmark_count": 10,
9
+ "benchmark_family_count": 10,
10
+ "categories_covered": [
11
+ "agentic",
12
+ "coding",
13
+ "knowledge",
14
+ "other",
15
+ "reasoning"
16
+ ],
17
+ "last_updated": "2026-04-25T09:07:44.422824Z",
18
+ "variants": [
19
+ {
20
+ "variant_key": "default",
21
+ "variant_label": "Default",
22
+ "evaluation_count": 40,
23
+ "raw_model_ids": [
24
+ "anthropic/claude-opus-4-5",
25
+ "anthropic/claude-opus-4.5"
26
+ ],
27
+ "last_updated": "2026-04-20T16:39:22.266076Z"
28
+ },
29
+ {
30
+ "variant_key": "20251101",
31
+ "variant_label": "2025-11-01",
32
+ "evaluation_count": 4,
33
+ "raw_model_ids": [
34
+ "anthropic/claude-opus-4-5-20251101",
35
+ "anthropic/claude-opus-4-5-20251101-fc",
36
+ "anthropic/claude-opus-4-5-20251101-prompt"
37
+ ],
38
+ "last_updated": "2026-04-25T09:07:44.422824Z"
39
+ },
40
+ {
41
+ "variant_key": "2025-11-01",
42
+ "variant_label": "2025-11-01",
43
+ "evaluation_count": 5,
44
+ "raw_model_ids": [
45
+ "anthropic/claude-opus-4-5-20251101-thinking-16k",
46
+ "anthropic/claude-opus-4-5-20251101-thinking-32k",
47
+ "anthropic/claude-opus-4-5-20251101-thinking-64k",
48
+ "anthropic/claude-opus-4-5-20251101-thinking-8k",
49
+ "anthropic/claude-opus-4-5-20251101-thinking-none"
50
+ ],
51
+ "last_updated": "2026-04-07T08:15:57.578212Z"
52
+ }
53
+ ],
54
+ "score_summary": {
55
+ "count": 164,
56
+ "min": 0.0708,
57
+ "max": 95.5,
58
+ "average": 25.281729268292686
59
+ },
60
+ "reproducibility_summary": {
61
+ "results_total": 164,
62
+ "has_reproducibility_gap_count": 164,
63
+ "populated_ratio_avg": 0
64
+ },
65
+ "provenance_summary": {
66
+ "total_results": 164,
67
+ "total_groups": 76,
68
+ "multi_source_groups": 0,
69
+ "first_party_only_groups": 9,
70
+ "source_type_distribution": {
71
+ "first_party": 9,
72
+ "third_party": 155,
73
+ "collaborative": 0,
74
+ "unspecified": 0
75
+ }
76
+ },
77
+ "comparability_summary": {
78
+ "total_groups": 76,
79
+ "groups_with_variant_check": 6,
80
+ "groups_with_cross_party_check": 0,
81
+ "variant_divergent_count": 6,
82
+ "cross_party_divergent_count": 0
83
+ },
84
+ "benchmark_names": [
85
+ "ARC Prize evaluations leaderboard JSON",
86
+ "ARC-AGI v2",
87
+ "AppWorld Benchmark",
88
+ "Artificial Analysis LLM API",
89
+ "BFCL leaderboard CSV",
90
+ "BrowseComp-Plus",
91
+ "MCP Atlas",
92
+ "MMMLU",
93
+ "MMMU (validation)",
94
+ "OSWorld",
95
+ "SWE-Bench Verified",
96
+ "SWE-bench Verified",
97
+ "Swe Bench",
98
+ "Tau2 Retail",
99
+ "Tau2 Telecom",
100
+ "Terminal Bench 2 0",
101
+ "Terminal-Bench 2.0",
102
+ "τ-bench (Tool-Agent-User Interaction Benchmark)"
103
+ ],
104
+ "top_benchmark_scores": [
105
+ {
106
+ "benchmark": "Terminal Bench 2 0",
107
+ "benchmarkKey": "terminal_bench_2_0",
108
+ "canonical_display_name": "Terminal bench 2 0 / Accuracy",
109
+ "evaluation_name": "terminal-bench-2.0",
110
+ "score": 63.1,
111
+ "metric": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
112
+ "lower_is_better": false
113
+ },
114
+ {
115
+ "benchmark": "Artificial Analysis LLM API",
116
+ "benchmarkKey": "artificial_analysis_llms_artificial_analysis_artificial_analysis_math_index",
117
+ "canonical_display_name": "artificial_analysis.artificial_analysis_math_index / Artificial Analysis Math Index",
118
+ "evaluation_name": "artificial_analysis.artificial_analysis_math_index",
119
+ "score": 62.7,
120
+ "metric": "Artificial Analysis composite math index.",
121
+ "lower_is_better": false
122
+ },
123
+ {
124
+ "benchmark": "Artificial Analysis LLM API",
125
+ "benchmarkKey": "artificial_analysis_llms_artificial_analysis_median_output_tokens_per_second",
126
+ "canonical_display_name": "artificial_analysis.median_output_tokens_per_second / Median output tokens per second",
127
+ "evaluation_name": "artificial_analysis.median_output_tokens_per_second",
128
+ "score": 52.885,
129
+ "metric": "Median output generation speed reported by Artificial Analysis.",
130
+ "lower_is_better": false
131
+ },
132
+ {
133
+ "benchmark": "Artificial Analysis LLM API",
134
+ "benchmarkKey": "artificial_analysis_llms_artificial_analysis_artificial_analysis_intelligence_index",
135
+ "canonical_display_name": "artificial_analysis.artificial_analysis_intelligence_index / Artificial Analysis Intelligence Index",
136
+ "evaluation_name": "artificial_analysis.artificial_analysis_intelligence_index",
137
+ "score": 43.1,
138
+ "metric": "Artificial Analysis composite intelligence index.",
139
+ "lower_is_better": false
140
+ },
141
+ {
142
+ "benchmark": "Artificial Analysis LLM API",
143
+ "benchmarkKey": "artificial_analysis_llms_artificial_analysis_artificial_analysis_coding_index",
144
+ "canonical_display_name": "artificial_analysis.artificial_analysis_coding_index / Artificial Analysis Coding Index",
145
+ "evaluation_name": "artificial_analysis.artificial_analysis_coding_index",
146
+ "score": 42.9,
147
+ "metric": "Artificial Analysis composite coding index.",
148
+ "lower_is_better": false
149
+ },
150
+ {
151
+ "benchmark": "Artificial Analysis LLM API",
152
+ "benchmarkKey": "artificial_analysis_llms_artificial_analysis_price_1m_output_tokens",
153
+ "canonical_display_name": "artificial_analysis.price_1m_output_tokens / Price 1m Output Tokens",
154
+ "evaluation_name": "artificial_analysis.price_1m_output_tokens",
155
+ "score": 25,
156
+ "metric": "Price per 1M output tokens in USD.",
157
+ "lower_is_better": true
158
+ },
159
+ {
160
+ "benchmark": "Artificial Analysis LLM API",
161
+ "benchmarkKey": "artificial_analysis_llms_artificial_analysis_price_1m_blended_3_to_1",
162
+ "canonical_display_name": "artificial_analysis.price_1m_blended_3_to_1 / Price 1m Blended 3 To 1",
163
+ "evaluation_name": "artificial_analysis.price_1m_blended_3_to_1",
164
+ "score": 10,
165
+ "metric": "Blended price per 1M tokens using a 3:1 input-to-output ratio.",
166
+ "lower_is_better": true
167
+ },
168
+ {
169
+ "benchmark": "Artificial Analysis LLM API",
170
+ "benchmarkKey": "artificial_analysis_llms_artificial_analysis_price_1m_input_tokens",
171
+ "canonical_display_name": "artificial_analysis.price_1m_input_tokens / Price 1m Input Tokens",
172
+ "evaluation_name": "artificial_analysis.price_1m_input_tokens",
173
+ "score": 5,
174
+ "metric": "Price per 1M input tokens in USD.",
175
+ "lower_is_better": true
176
+ },
177
+ {
178
+ "benchmark": "BFCL leaderboard CSV",
179
+ "benchmarkKey": "bfcl",
180
+ "canonical_display_name": "Bfcl / Format sensitivity / Format Sensitivity Standard Deviation",
181
+ "evaluation_name": "format_sensitivity",
182
+ "score": 3.65,
183
+ "metric": "format_sensitivity",
184
+ "lower_is_better": true
185
+ },
186
+ {
187
+ "benchmark": "Artificial Analysis LLM API",
188
+ "benchmarkKey": "artificial_analysis_llms_artificial_analysis_median_time_to_first_token_seconds",
189
+ "canonical_display_name": "artificial_analysis.median_time_to_first_token_seconds / Median Time To First Token Seconds",
190
+ "evaluation_name": "artificial_analysis.median_time_to_first_token_seconds",
191
+ "score": 1.311,
192
+ "metric": "Median time to first token reported by Artificial Analysis.",
193
+ "lower_is_better": true
194
+ },
195
+ {
196
+ "benchmark": "Artificial Analysis LLM API",
197
+ "benchmarkKey": "artificial_analysis_llms_artificial_analysis_median_time_to_first_answer_token",
198
+ "canonical_display_name": "artificial_analysis.median_time_to_first_answer_token / Median time to first answer token",
199
+ "evaluation_name": "artificial_analysis.median_time_to_first_answer_token",
200
+ "score": 1.311,
201
+ "metric": "Median time to first answer token reported by Artificial Analysis.",
202
+ "lower_is_better": true
203
+ },
204
+ {
205
+ "benchmark": "Tau2 Telecom",
206
+ "benchmarkKey": "llm_stats_tau2_telecom",
207
+ "canonical_display_name": "Tau2 Telecom / Score",
208
+ "evaluation_name": "llm_stats.tau2-telecom",
209
+ "score": 0.982,
210
+ "metric": "τ²-Bench telecom domain evaluates conversational agents in a dual-control environment modeled as a Dec-POMDP, where both agent and user use tools in shared telecommunications troubleshooting scenarios that test coordination and communication capabilities.",
211
+ "lower_is_better": false
212
+ },
213
+ {
214
+ "benchmark": "MMMLU",
215
+ "benchmarkKey": "llm_stats_mmmlu",
216
+ "canonical_display_name": "Mmmlu / Score",
217
+ "evaluation_name": "llm_stats.mmmlu",
218
+ "score": 0.908,
219
+ "metric": "Multilingual Massive Multitask Language Understanding dataset released by OpenAI, featuring professionally translated MMLU test questions across 14 languages including Arabic, Bengali, German, Spanish, French, Hindi, Indonesian, Italian, Japanese, Korean, Portuguese, Swahili, Yoruba, and Chinese. Contains approximately 15,908 multiple-choice questions per language covering 57 subjects.",
220
+ "lower_is_better": false
221
+ },
222
+ {
223
+ "benchmark": "Artificial Analysis LLM API",
224
+ "benchmarkKey": "artificial_analysis_llms_artificial_analysis_mmlu_pro",
225
+ "canonical_display_name": "artificial_analysis.mmlu_pro / MMLU-Pro",
226
+ "evaluation_name": "artificial_analysis.mmlu_pro",
227
+ "score": 0.889,
228
+ "metric": "Benchmark score on MMLU-Pro.",
229
+ "lower_is_better": false
230
+ },
231
+ {
232
+ "benchmark": "Tau2 Retail",
233
+ "benchmarkKey": "llm_stats_tau2_retail",
234
+ "canonical_display_name": "Tau2 Retail / Score",
235
+ "evaluation_name": "llm_stats.tau2-retail",
236
+ "score": 0.889,
237
+ "metric": "τ²-bench retail domain evaluates conversational AI agents in customer service scenarios within a dual-control environment where both agent and user can interact with tools. Tests tool-agent-user interaction, rule adherence, and task consistency in retail customer support contexts.",
238
+ "lower_is_better": false
239
+ }
240
+ ]
241
+ }
tests/fixtures/model-cards/openai__gpt-5.json ADDED
@@ -0,0 +1,307 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_family_id": "openai/gpt-5",
3
+ "model_route_id": "openai__gpt-5",
4
+ "model_family_name": "GPT 5",
5
+ "developer": "openai",
6
+ "params_billions": null,
7
+ "total_evaluations": 30,
8
+ "benchmark_count": 18,
9
+ "benchmark_family_count": 18,
10
+ "categories_covered": [
11
+ "agentic",
12
+ "coding",
13
+ "general",
14
+ "knowledge",
15
+ "other",
16
+ "reasoning"
17
+ ],
18
+ "last_updated": "2026-04-25T09:07:44.422824Z",
19
+ "variants": [
20
+ {
21
+ "variant_key": "default",
22
+ "variant_label": "Default",
23
+ "evaluation_count": 17,
24
+ "raw_model_ids": [
25
+ "openai/GPT 5",
26
+ "openai/gpt-5",
27
+ "openai/GPT-5"
28
+ ],
29
+ "last_updated": "2026-04-20T17:00:36.089462Z"
30
+ },
31
+ {
32
+ "variant_key": "2025-08-07",
33
+ "variant_label": "2025-08-07",
34
+ "evaluation_count": 9,
35
+ "raw_model_ids": [
36
+ "openai/gpt-5-2025-08-07"
37
+ ],
38
+ "last_updated": "2026-04-25T09:07:44.422824Z"
39
+ },
40
+ {
41
+ "variant_key": "2025-08-07-low",
42
+ "variant_label": "2025-08-07 low",
43
+ "evaluation_count": 1,
44
+ "raw_model_ids": [
45
+ "openai/gpt-5-2025-08-07-low"
46
+ ],
47
+ "last_updated": "2026-04-07T08:15:57.602168Z"
48
+ },
49
+ {
50
+ "variant_key": "2025-08-07-medium",
51
+ "variant_label": "2025-08-07 medium",
52
+ "evaluation_count": 1,
53
+ "raw_model_ids": [
54
+ "openai/gpt-5-2025-08-07-medium"
55
+ ],
56
+ "last_updated": "2026-04-07T08:15:57.602461Z"
57
+ },
58
+ {
59
+ "variant_key": "2025-08-07-minimal",
60
+ "variant_label": "2025-08-07 minimal",
61
+ "evaluation_count": 1,
62
+ "raw_model_ids": [
63
+ "openai/gpt-5-2025-08-07-minimal"
64
+ ],
65
+ "last_updated": "2026-04-07T08:15:57.602750Z"
66
+ },
67
+ {
68
+ "variant_key": "2025-08-07-high",
69
+ "variant_label": "2025-08-07 high",
70
+ "evaluation_count": 1,
71
+ "raw_model_ids": [
72
+ "openai/gpt-5-2025-08-07-high"
73
+ ],
74
+ "last_updated": "2026-04-07T08:15:57.601872Z"
75
+ }
76
+ ],
77
+ "score_summary": {
78
+ "count": 559,
79
+ "min": 0,
80
+ "max": 73320,
81
+ "average": 673.8712930083715
82
+ },
83
+ "reproducibility_summary": {
84
+ "results_total": 559,
85
+ "has_reproducibility_gap_count": 541,
86
+ "populated_ratio_avg": 0.03220035778175313
87
+ },
88
+ "provenance_summary": {
89
+ "total_results": 559,
90
+ "total_groups": 508,
91
+ "multi_source_groups": 0,
92
+ "first_party_only_groups": 43,
93
+ "source_type_distribution": {
94
+ "first_party": 44,
95
+ "third_party": 515,
96
+ "collaborative": 0,
97
+ "unspecified": 0
98
+ }
99
+ },
100
+ "comparability_summary": {
101
+ "total_groups": 508,
102
+ "groups_with_variant_check": 0,
103
+ "groups_with_cross_party_check": 0,
104
+ "variant_divergent_count": 0,
105
+ "cross_party_divergent_count": 0
106
+ },
107
+ "benchmark_names": [
108
+ "ACE",
109
+ "APEX Agents",
110
+ "APEX v1",
111
+ "ARC Prize evaluations leaderboard JSON",
112
+ "Aider-Polyglot",
113
+ "Anthropic Red Team",
114
+ "Artificial Analysis LLM API",
115
+ "BBQ",
116
+ "BrowseComp Long Context 128k",
117
+ "BrowseComp Long Context 256k",
118
+ "COLLIE",
119
+ "CharXiv-R",
120
+ "ERQA",
121
+ "Easy Problems",
122
+ "FActScore",
123
+ "Fibble arena",
124
+ "FrontierMath",
125
+ "GPQA",
126
+ "Global-MMLU Lite",
127
+ "Graphwalks BFS <128k",
128
+ "Graphwalks parents <128k",
129
+ "HMMT 2025",
130
+ "Hard Problems",
131
+ "HarmBench",
132
+ "HealthBench Hard",
133
+ "Helm air bench",
134
+ "Helm safety",
135
+ "Holistic Evaluation of Language Models (HELM)",
136
+ "HumanEval",
137
+ "IFEval",
138
+ "Internal API instruction following (hard)",
139
+ "LongFact Concepts",
140
+ "LongFact Objects",
141
+ "MATH",
142
+ "MMLU",
143
+ "MMLU-Pro",
144
+ "MMMU",
145
+ "MMMU-Pro",
146
+ "Medium Problems",
147
+ "Multi-Challenge",
148
+ "Multi-SWE-bench (c++)",
149
+ "Omni-MATH",
150
+ "OpenAI-MRCR: 2 needle 128k",
151
+ "OpenAI-MRCR: 2 needle 256k",
152
+ "SWE-Lancer (IC-Diamond subset)",
153
+ "SWE-PolyBench Verified (Java)",
154
+ "SWE-PolyBench Verified (JavaScript)",
155
+ "SWE-PolyBench Verified (Python)",
156
+ "SWE-PolyBench Verified (TypeScript)",
157
+ "SWE-bench Verified",
158
+ "SciArena leaderboard API",
159
+ "SimpleSafetyTests",
160
+ "Tau2 Airline",
161
+ "Tau2 Retail",
162
+ "Tau2 Telecom",
163
+ "Terminal Bench 2 0",
164
+ "VideoMME w sub.",
165
+ "VideoMMMU",
166
+ "WildBench",
167
+ "Wordle Arena",
168
+ "XSTest"
169
+ ],
170
+ "top_benchmark_scores": [
171
+ {
172
+ "benchmark": "Artificial Analysis LLM API",
173
+ "benchmarkKey": "artificial_analysis_llms_artificial_analysis_median_output_tokens_per_second",
174
+ "canonical_display_name": "artificial_analysis.median_output_tokens_per_second / Median output tokens per second",
175
+ "evaluation_name": "artificial_analysis.median_output_tokens_per_second",
176
+ "score": 95.722,
177
+ "metric": "Median output generation speed reported by Artificial Analysis.",
178
+ "lower_is_better": false
179
+ },
180
+ {
181
+ "benchmark": "Artificial Analysis LLM API",
182
+ "benchmarkKey": "artificial_analysis_llms_artificial_analysis_artificial_analysis_math_index",
183
+ "canonical_display_name": "artificial_analysis.artificial_analysis_math_index / Artificial Analysis Math Index",
184
+ "evaluation_name": "artificial_analysis.artificial_analysis_math_index",
185
+ "score": 94.3,
186
+ "metric": "Artificial Analysis composite math index.",
187
+ "lower_is_better": false
188
+ },
189
+ {
190
+ "benchmark": "Artificial Analysis LLM API",
191
+ "benchmarkKey": "artificial_analysis_llms_artificial_analysis_median_time_to_first_token_seconds",
192
+ "canonical_display_name": "artificial_analysis.median_time_to_first_token_seconds / Median Time To First Token Seconds",
193
+ "evaluation_name": "artificial_analysis.median_time_to_first_token_seconds",
194
+ "score": 82.082,
195
+ "metric": "Median time to first token reported by Artificial Analysis.",
196
+ "lower_is_better": true
197
+ },
198
+ {
199
+ "benchmark": "Artificial Analysis LLM API",
200
+ "benchmarkKey": "artificial_analysis_llms_artificial_analysis_median_time_to_first_answer_token",
201
+ "canonical_display_name": "artificial_analysis.median_time_to_first_answer_token / Median time to first answer token",
202
+ "evaluation_name": "artificial_analysis.median_time_to_first_answer_token",
203
+ "score": 82.082,
204
+ "metric": "Median time to first answer token reported by Artificial Analysis.",
205
+ "lower_is_better": true
206
+ },
207
+ {
208
+ "benchmark": "Terminal Bench 2 0",
209
+ "benchmarkKey": "terminal_bench_2_0",
210
+ "canonical_display_name": "Terminal bench 2 0 / Accuracy",
211
+ "evaluation_name": "terminal-bench-2.0",
212
+ "score": 49.6,
213
+ "metric": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
214
+ "lower_is_better": false
215
+ },
216
+ {
217
+ "benchmark": "Artificial Analysis LLM API",
218
+ "benchmarkKey": "artificial_analysis_llms_artificial_analysis_artificial_analysis_intelligence_index",
219
+ "canonical_display_name": "artificial_analysis.artificial_analysis_intelligence_index / Artificial Analysis Intelligence Index",
220
+ "evaluation_name": "artificial_analysis.artificial_analysis_intelligence_index",
221
+ "score": 44.6,
222
+ "metric": "Artificial Analysis composite intelligence index.",
223
+ "lower_is_better": false
224
+ },
225
+ {
226
+ "benchmark": "Artificial Analysis LLM API",
227
+ "benchmarkKey": "artificial_analysis_llms_artificial_analysis_artificial_analysis_coding_index",
228
+ "canonical_display_name": "artificial_analysis.artificial_analysis_coding_index / Artificial Analysis Coding Index",
229
+ "evaluation_name": "artificial_analysis.artificial_analysis_coding_index",
230
+ "score": 36,
231
+ "metric": "Artificial Analysis composite coding index.",
232
+ "lower_is_better": false
233
+ },
234
+ {
235
+ "benchmark": "Artificial Analysis LLM API",
236
+ "benchmarkKey": "artificial_analysis_llms_artificial_analysis_price_1m_output_tokens",
237
+ "canonical_display_name": "artificial_analysis.price_1m_output_tokens / Price 1m Output Tokens",
238
+ "evaluation_name": "artificial_analysis.price_1m_output_tokens",
239
+ "score": 10,
240
+ "metric": "Price per 1M output tokens in USD.",
241
+ "lower_is_better": true
242
+ },
243
+ {
244
+ "benchmark": "Artificial Analysis LLM API",
245
+ "benchmarkKey": "artificial_analysis_llms_artificial_analysis_price_1m_blended_3_to_1",
246
+ "canonical_display_name": "artificial_analysis.price_1m_blended_3_to_1 / Price 1m Blended 3 To 1",
247
+ "evaluation_name": "artificial_analysis.price_1m_blended_3_to_1",
248
+ "score": 3.438,
249
+ "metric": "Blended price per 1M tokens using a 3:1 input-to-output ratio.",
250
+ "lower_is_better": true
251
+ },
252
+ {
253
+ "benchmark": "SciArena leaderboard API",
254
+ "benchmarkKey": "sciarena",
255
+ "canonical_display_name": "SciArena / Cost per 100 Calls",
256
+ "evaluation_name": "overall_cost_per_100_calls_usd",
257
+ "score": 2.9752,
258
+ "metric": "overall_cost_per_100_calls_usd",
259
+ "lower_is_better": true
260
+ },
261
+ {
262
+ "benchmark": "Artificial Analysis LLM API",
263
+ "benchmarkKey": "artificial_analysis_llms_artificial_analysis_price_1m_input_tokens",
264
+ "canonical_display_name": "artificial_analysis.price_1m_input_tokens / Price 1m Input Tokens",
265
+ "evaluation_name": "artificial_analysis.price_1m_input_tokens",
266
+ "score": 1.25,
267
+ "metric": "Price per 1M input tokens in USD.",
268
+ "lower_is_better": true
269
+ },
270
+ {
271
+ "benchmark": "Helm air bench",
272
+ "benchmarkKey": "helm_air_bench",
273
+ "canonical_display_name": "Air bench / AIRBench 2024 - Self-harm / Refusal Rate",
274
+ "evaluation_name": "AIRBench 2024 - Self-harm",
275
+ "score": 1,
276
+ "metric": "Refusal Rate on AIRBench 2024 - Self-harm",
277
+ "lower_is_better": false
278
+ },
279
+ {
280
+ "benchmark": "SWE-Lancer (IC-Diamond subset)",
281
+ "benchmarkKey": "llm_stats_swe_lancer_ic_diamond_subset",
282
+ "canonical_display_name": "Swe Lancer Ic Diamond Subset / Score",
283
+ "evaluation_name": "llm_stats.swe-lancer-ic-diamond-subset",
284
+ "score": 1,
285
+ "metric": "SWE-Lancer (IC-Diamond subset) is a benchmark of real-world freelance software engineering tasks from Upwork, ranging from $50 bug fixes to $32,000 feature implementations. It evaluates AI models on independent engineering tasks using end-to-end tests triple-verified by experienced software engineers, and includes managerial tasks where models choose between technical implementation proposals.",
286
+ "lower_is_better": false
287
+ },
288
+ {
289
+ "benchmark": "SimpleSafetyTests",
290
+ "benchmarkKey": "helm_safety_simplesafetytests",
291
+ "canonical_display_name": "SimpleSafetyTests / LM Evaluated Safety score",
292
+ "evaluation_name": "SimpleSafetyTests",
293
+ "score": 0.998,
294
+ "metric": "LM Evaluated Safety score on SimpleSafetyTests",
295
+ "lower_is_better": false
296
+ },
297
+ {
298
+ "benchmark": "Artificial Analysis LLM API",
299
+ "benchmarkKey": "artificial_analysis_llms_artificial_analysis_math_500",
300
+ "canonical_display_name": "artificial_analysis.math_500 / MATH-500",
301
+ "evaluation_name": "artificial_analysis.math_500",
302
+ "score": 0.994,
303
+ "metric": "Benchmark score on MATH-500.",
304
+ "lower_is_better": false
305
+ }
306
+ ]
307
+ }
tests/fixtures/models/ai21__j1-grande-v1-17b.json ADDED
The diff for this file is too large to render. See raw diff
 
tests/fixtures/models/bytedance__seed-2-0-lite.json ADDED
@@ -0,0 +1,2086 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_info": {
3
+ "name": "Seed 2.0 Lite",
4
+ "id": "bytedance/seed-2-0-lite",
5
+ "developer": "bytedance",
6
+ "additional_details": {
7
+ "raw_id": "seed-2.0-lite",
8
+ "raw_name": "Seed 2.0 Lite",
9
+ "raw_model_id": "seed-2.0-lite",
10
+ "raw_model_name": "Seed 2.0 Lite",
11
+ "raw_organization_id": "bytedance",
12
+ "raw_organization_name": "ByteDance",
13
+ "raw_release_date": "2026-02-14",
14
+ "raw_announcement_date": "2026-02-14",
15
+ "raw_multimodal": "true",
16
+ "raw_provider_slug": "bytedance",
17
+ "raw_provider_name": "ByteDance"
18
+ },
19
+ "normalized_id": "bytedance/seed-2.0-lite",
20
+ "family_id": "bytedance/seed-2-0-lite",
21
+ "family_slug": "seed-2-0-lite",
22
+ "family_name": "Seed 2.0 Lite",
23
+ "variant_key": "default",
24
+ "variant_label": "Default",
25
+ "model_route_id": "bytedance__seed-2-0-lite",
26
+ "model_version": null
27
+ },
28
+ "model_family_id": "bytedance/seed-2-0-lite",
29
+ "model_route_id": "bytedance__seed-2-0-lite",
30
+ "model_family_name": "Seed 2.0 Lite",
31
+ "raw_model_ids": [
32
+ "bytedance/seed-2.0-lite"
33
+ ],
34
+ "evaluations_by_category": {
35
+ "other": [
36
+ {
37
+ "schema_version": "0.2.2",
38
+ "evaluation_id": "llm-stats/first_party/bytedance_seed-2.0-lite/1777108064.422824",
39
+ "retrieved_timestamp": "1777108064.422824",
40
+ "benchmark": "llm-stats",
41
+ "source_data": {
42
+ "dataset_name": "AIME 2026",
43
+ "source_type": "url",
44
+ "url": [
45
+ "https://llm-stats.com/models/seed-2.0-lite",
46
+ "https://llm-stats.com/benchmarks/aime-2026",
47
+ "https://api.llm-stats.com/leaderboard/benchmarks/aime-2026"
48
+ ],
49
+ "additional_details": {
50
+ "raw_benchmark_id": "aime-2026",
51
+ "raw_model_id": "seed-2.0-lite",
52
+ "source_role": "aggregator"
53
+ }
54
+ },
55
+ "source_metadata": {
56
+ "source_name": "LLM Stats API: first_party scores",
57
+ "source_type": "documentation",
58
+ "source_organization_name": "LLM Stats",
59
+ "source_organization_url": "https://llm-stats.com/",
60
+ "evaluator_relationship": "first_party",
61
+ "additional_details": {
62
+ "models_endpoint": "https://api.llm-stats.com/v1/models",
63
+ "benchmarks_endpoint": "https://api.llm-stats.com/leaderboard/benchmarks",
64
+ "scores_endpoint": "https://api.llm-stats.com/v1/scores",
65
+ "scores_endpoint_fallback": "https://api.llm-stats.com/leaderboard/benchmarks/{benchmark_id}",
66
+ "developer_page_url": "https://llm-stats.com/developer",
67
+ "attribution_url": "https://llm-stats.com/",
68
+ "attribution_required": "true",
69
+ "source_role": "aggregator"
70
+ }
71
+ },
72
+ "eval_library": {
73
+ "name": "LLM Stats",
74
+ "version": "unknown"
75
+ },
76
+ "model_info": {
77
+ "name": "Seed 2.0 Lite",
78
+ "id": "bytedance/seed-2.0-lite",
79
+ "developer": "bytedance",
80
+ "additional_details": {
81
+ "raw_id": "seed-2.0-lite",
82
+ "raw_name": "Seed 2.0 Lite",
83
+ "raw_model_id": "seed-2.0-lite",
84
+ "raw_model_name": "Seed 2.0 Lite",
85
+ "raw_organization_id": "bytedance",
86
+ "raw_organization_name": "ByteDance",
87
+ "raw_release_date": "2026-02-14",
88
+ "raw_announcement_date": "2026-02-14",
89
+ "raw_multimodal": "true",
90
+ "raw_provider_slug": "bytedance",
91
+ "raw_provider_name": "ByteDance"
92
+ },
93
+ "normalized_id": "bytedance/seed-2.0-lite",
94
+ "family_id": "bytedance/seed-2-0-lite",
95
+ "family_slug": "seed-2-0-lite",
96
+ "family_name": "Seed 2.0 Lite",
97
+ "variant_key": "default",
98
+ "variant_label": "Default",
99
+ "model_route_id": "bytedance__seed-2-0-lite"
100
+ },
101
+ "generation_config": null,
102
+ "source_record_url": "https://huggingface.co/datasets/evaleval/card_backend/resolve/main/records/bytedance__seed-2-0-lite/llm_stats_first_party_bytedance_seed_2_0_lite_1777108064_422824.json",
103
+ "detailed_evaluation_results_meta": null,
104
+ "detailed_evaluation_results": null,
105
+ "passthrough_top_level_fields": null,
106
+ "evaluation_results": [
107
+ {
108
+ "evaluation_result_id": "aime-2026::aime-2026-seed-2.0-lite",
109
+ "evaluation_name": "llm_stats.aime-2026",
110
+ "source_data": {
111
+ "dataset_name": "AIME 2026",
112
+ "source_type": "url",
113
+ "url": [
114
+ "https://llm-stats.com/models/seed-2.0-lite",
115
+ "https://llm-stats.com/benchmarks/aime-2026",
116
+ "https://api.llm-stats.com/leaderboard/benchmarks/aime-2026"
117
+ ],
118
+ "additional_details": {
119
+ "raw_benchmark_id": "aime-2026",
120
+ "raw_model_id": "seed-2.0-lite",
121
+ "source_role": "aggregator"
122
+ }
123
+ },
124
+ "metric_config": {
125
+ "evaluation_description": "All 30 problems from the 2026 American Invitational Mathematics Examination (AIME I and AIME II), testing olympiad-level mathematical reasoning with integer answers from 000-999. Used as an AI benchmark to evaluate large language models' ability to solve complex mathematical problems requiring multi-step logical deductions and structured symbolic reasoning.",
126
+ "metric_id": "llm_stats.aime-2026.score",
127
+ "metric_name": "AIME 2026 score",
128
+ "metric_kind": "benchmark_score",
129
+ "metric_unit": "proportion",
130
+ "lower_is_better": false,
131
+ "score_type": "continuous",
132
+ "min_score": 0.0,
133
+ "max_score": 1.0,
134
+ "additional_details": {
135
+ "raw_benchmark_id": "aime-2026",
136
+ "raw_score_field": "score",
137
+ "bound_strategy": "inferred_proportion",
138
+ "raw_name": "AIME 2026",
139
+ "raw_categories": "[\"math\",\"reasoning\"]",
140
+ "raw_modality": "text",
141
+ "raw_verified": "false",
142
+ "raw_model_count": "12"
143
+ }
144
+ },
145
+ "score_details": {
146
+ "score": 0.883,
147
+ "details": {
148
+ "raw_score": "0.883",
149
+ "raw_score_field": "score",
150
+ "raw_model_id": "seed-2.0-lite",
151
+ "raw_benchmark_id": "aime-2026",
152
+ "source_urls_json": "[\"https://llm-stats.com/models/seed-2.0-lite\",\"https://llm-stats.com/benchmarks/aime-2026\",\"https://api.llm-stats.com/leaderboard/benchmarks/aime-2026\"]",
153
+ "raw_score_id": "aime-2026::seed-2.0-lite",
154
+ "raw_provenance_label": "unknown",
155
+ "raw_verified": "false"
156
+ }
157
+ },
158
+ "normalized_result": {
159
+ "benchmark_family_key": "llm_stats",
160
+ "benchmark_family_name": "AIME 2026",
161
+ "benchmark_parent_key": "llm_stats",
162
+ "benchmark_parent_name": "AIME 2026",
163
+ "benchmark_component_key": "aime_2026",
164
+ "benchmark_component_name": "Aime 2026",
165
+ "benchmark_leaf_key": "aime_2026",
166
+ "benchmark_leaf_name": "Aime 2026",
167
+ "slice_key": null,
168
+ "slice_name": null,
169
+ "metric_name": "Score",
170
+ "metric_id": "llm_stats.aime-2026.score",
171
+ "metric_key": "score",
172
+ "metric_source": "metric_config",
173
+ "display_name": "Aime 2026 / Score",
174
+ "canonical_display_name": "Aime 2026 / Score",
175
+ "raw_evaluation_name": "llm_stats.aime-2026",
176
+ "is_summary_score": false
177
+ }
178
+ },
179
+ {
180
+ "evaluation_result_id": "livecodebench-v6::livecodebench-v6-seed-2.0-lite",
181
+ "evaluation_name": "llm_stats.livecodebench-v6",
182
+ "source_data": {
183
+ "dataset_name": "LiveCodeBench v6",
184
+ "source_type": "url",
185
+ "url": [
186
+ "https://llm-stats.com/models/seed-2.0-lite",
187
+ "https://llm-stats.com/benchmarks/livecodebench-v6",
188
+ "https://api.llm-stats.com/leaderboard/benchmarks/livecodebench-v6"
189
+ ],
190
+ "additional_details": {
191
+ "raw_benchmark_id": "livecodebench-v6",
192
+ "raw_model_id": "seed-2.0-lite",
193
+ "source_role": "aggregator"
194
+ }
195
+ },
196
+ "metric_config": {
197
+ "evaluation_description": "LiveCodeBench is a holistic and contamination-free evaluation benchmark for large language models for code. It continuously collects new problems from programming contests (LeetCode, AtCoder, CodeForces) and evaluates four different scenarios: code generation, self-repair, code execution, and test output prediction. Problems are annotated with release dates to enable evaluation on unseen problems released after a model's training cutoff.",
198
+ "metric_id": "llm_stats.livecodebench-v6.score",
199
+ "metric_name": "LiveCodeBench v6 score",
200
+ "metric_kind": "benchmark_score",
201
+ "metric_unit": "proportion",
202
+ "lower_is_better": false,
203
+ "score_type": "continuous",
204
+ "min_score": 0.0,
205
+ "max_score": 1.0,
206
+ "additional_details": {
207
+ "raw_benchmark_id": "livecodebench-v6",
208
+ "raw_score_field": "score",
209
+ "bound_strategy": "inferred_proportion",
210
+ "raw_name": "LiveCodeBench v6",
211
+ "raw_categories": "[\"general\",\"reasoning\"]",
212
+ "raw_modality": "text",
213
+ "raw_verified": "false",
214
+ "raw_model_count": "45"
215
+ }
216
+ },
217
+ "score_details": {
218
+ "score": 0.817,
219
+ "details": {
220
+ "raw_score": "0.817",
221
+ "raw_score_field": "score",
222
+ "raw_model_id": "seed-2.0-lite",
223
+ "raw_benchmark_id": "livecodebench-v6",
224
+ "source_urls_json": "[\"https://llm-stats.com/models/seed-2.0-lite\",\"https://llm-stats.com/benchmarks/livecodebench-v6\",\"https://api.llm-stats.com/leaderboard/benchmarks/livecodebench-v6\"]",
225
+ "raw_score_id": "livecodebench-v6::seed-2.0-lite",
226
+ "raw_provenance_label": "unknown",
227
+ "raw_verified": "false"
228
+ }
229
+ },
230
+ "normalized_result": {
231
+ "benchmark_family_key": "llm_stats",
232
+ "benchmark_family_name": "LiveCodeBench v6",
233
+ "benchmark_parent_key": "llm_stats",
234
+ "benchmark_parent_name": "LiveCodeBench v6",
235
+ "benchmark_component_key": "livecodebench_v6",
236
+ "benchmark_component_name": "Livecodebench V6",
237
+ "benchmark_leaf_key": "livecodebench_v6",
238
+ "benchmark_leaf_name": "Livecodebench V6",
239
+ "slice_key": null,
240
+ "slice_name": null,
241
+ "metric_name": "Score",
242
+ "metric_id": "llm_stats.livecodebench-v6.score",
243
+ "metric_key": "score",
244
+ "metric_source": "metric_config",
245
+ "display_name": "Livecodebench V6 / Score",
246
+ "canonical_display_name": "Livecodebench V6 / Score",
247
+ "raw_evaluation_name": "llm_stats.livecodebench-v6",
248
+ "is_summary_score": false
249
+ }
250
+ }
251
+ ],
252
+ "benchmark_card": null,
253
+ "instance_level_data": null,
254
+ "eval_summary_ids": [
255
+ "llm_stats_aime_2026",
256
+ "llm_stats_livecodebench_v6"
257
+ ]
258
+ }
259
+ ]
260
+ },
261
+ "evaluation_summaries_by_category": {
262
+ "coding": [
263
+ {
264
+ "eval_summary_id": "llm_stats_livecodebench_v6",
265
+ "benchmark": "LiveCodeBench v6",
266
+ "benchmark_family_key": "llm_stats",
267
+ "benchmark_family_name": "LiveCodeBench v6",
268
+ "benchmark_parent_key": "llm_stats",
269
+ "benchmark_parent_name": "LiveCodeBench v6",
270
+ "benchmark_leaf_key": "livecodebench_v6",
271
+ "benchmark_leaf_name": "Livecodebench V6",
272
+ "benchmark_component_key": "livecodebench_v6",
273
+ "benchmark_component_name": "Livecodebench V6",
274
+ "evaluation_name": "Livecodebench V6",
275
+ "display_name": "Livecodebench V6",
276
+ "canonical_display_name": "Livecodebench V6",
277
+ "is_summary_score": false,
278
+ "category": "coding",
279
+ "source_data": {
280
+ "dataset_name": "LiveCodeBench v6",
281
+ "source_type": "url",
282
+ "url": [
283
+ "https://llm-stats.com/models/seed-2.0-lite",
284
+ "https://llm-stats.com/benchmarks/livecodebench-v6",
285
+ "https://api.llm-stats.com/leaderboard/benchmarks/livecodebench-v6"
286
+ ],
287
+ "additional_details": {
288
+ "raw_benchmark_id": "livecodebench-v6",
289
+ "raw_model_id": "seed-2.0-lite",
290
+ "source_role": "aggregator"
291
+ }
292
+ },
293
+ "benchmark_card": {
294
+ "benchmark_details": {
295
+ "name": "LiveCodeBench",
296
+ "overview": "LiveCodeBench is a holistic and contamination-free benchmark for evaluating large language models on code-related capabilities. It assesses a broader range of skills including code generation, self-repair, code execution, and test output prediction. The benchmark collects new problems over time from programming contest platforms to prevent data contamination, currently containing over 500 coding problems published between May 2023 and May 2024.",
297
+ "data_type": "text",
298
+ "domains": [
299
+ "code generation",
300
+ "programming competitions"
301
+ ],
302
+ "languages": [
303
+ "Not specified"
304
+ ],
305
+ "similar_benchmarks": [
306
+ "HumanEval",
307
+ "MBPP",
308
+ "APPS",
309
+ "DS-1000",
310
+ "ARCADE",
311
+ "NumpyEval",
312
+ "PandasEval",
313
+ "JuICe",
314
+ "APIBench",
315
+ "RepoBench",
316
+ "ODEX",
317
+ "SWE-Bench",
318
+ "GoogleCodeRepo",
319
+ "RepoEval",
320
+ "Cocomic-Data"
321
+ ],
322
+ "resources": [
323
+ "https://livecodebench.github.io/",
324
+ "https://arxiv.org/abs/2403.07974"
325
+ ],
326
+ "benchmark_type": "single"
327
+ },
328
+ "purpose_and_intended_users": {
329
+ "goal": "To provide a comprehensive and contamination-free evaluation of large language models for code by assessing a broader range of code-related capabilities beyond just code generation.",
330
+ "audience": [
331
+ "Researchers and practitioners in academia and industry who are interested in evaluating the capabilities of large language models for code"
332
+ ],
333
+ "tasks": [
334
+ "Code generation",
335
+ "Self-repair",
336
+ "Code execution",
337
+ "Test output prediction"
338
+ ],
339
+ "limitations": "The focus on competition programming problems might not be representative of the most general notion of LLM programming capabilities or real-world, open-ended software development tasks.",
340
+ "out_of_scope_uses": [
341
+ "Evaluating performance on real-world, open-ended, and unconstrained user-raised problems"
342
+ ]
343
+ },
344
+ "data": {
345
+ "source": "The data is collected from coding contests on three platforms: LeetCode, AtCoder, and CodeForces, with problems published between May 2023 and May 2024.",
346
+ "size": "Over 500 coding problems. Specific subsets include 479 samples from 85 problems for code execution and 442 problem instances from 181 LeetCode problems for test output prediction.",
347
+ "format": "Includes problem statements, public tests, user solutions, and starter code (for LeetCode). Problems are tagged with difficulty labels (Easy, Medium, Hard) from the platforms.",
348
+ "annotation": "Difficulty labels are provided by the competition platforms. For the code execution dataset, human-submitted solutions were filtered using compile-time and runtime filters followed by manual inspection to ensure quality."
349
+ },
350
+ "methodology": {
351
+ "methods": [
352
+ "Models are evaluated in a zero-shot setting across four scenarios: code generation, self-repair, code execution, and test output prediction.",
353
+ "For code generation and self-repair, program correctness is verified using a set of unseen test cases. For code execution, an execution-based correctness metric compares generated output to ground truth. For test output prediction, generated responses are parsed and equivalence checks are used for grading."
354
+ ],
355
+ "metrics": [
356
+ "Pass@1"
357
+ ],
358
+ "calculation": "For each problem, 10 candidate answers are generated. The Pass@1 score is the fraction of problems for which a generated program or answer is correct.",
359
+ "interpretation": "A higher Pass@1 score indicates better performance.",
360
+ "baseline_results": "The paper reports results for specific models including GPT-4, GPT-4-Turbo, Claude-3-Opus, Claude-3-Sonnet, and Mistral-L, but specific numerical scores are not provided in the given excerpts.",
361
+ "validation": "Program correctness for code generation and self-repair is verified using a set of unseen test cases. For code execution, an execution-based correctness metric is used to compare generated output to ground truth. For test output prediction, generated responses are parsed and equivalence checks are used for grading."
362
+ },
363
+ "ethical_and_legal_considerations": {
364
+ "privacy_and_anonymity": "Not specified",
365
+ "data_licensing": "Not specified",
366
+ "consent_procedures": "Not specified",
367
+ "compliance_with_regulations": "The benchmark operates under the Fair Use doctrine (§ 107) for copyrighted works, determining that its use of collected problems for academic, non-profit educational purposes constitutes fair use. It does not train on the collected problems."
368
+ },
369
+ "possible_risks": [
370
+ {
371
+ "category": "Over- or under-reliance",
372
+ "description": [
373
+ "In AI-assisted decision-making tasks, reliance measures how much a person trusts (and potentially acts on) a model's output. Over-reliance occurs when a person puts too much trust in a model, accepting a model's output when the model's output is likely incorrect. Under-reliance is the opposite, where the person doesn't trust the model but should."
374
+ ],
375
+ "url": "https://www.ibm.com/docs/en/watsonx/saas?topic=SSYOK8/wsj/ai-risk-atlas/over-or-under-reliance.html"
376
+ },
377
+ {
378
+ "category": "Unrepresentative data",
379
+ "description": [
380
+ "Unrepresentative data occurs when the training or fine-tuning data is not sufficiently representative of the underlying population or does not measure the phenomenon of interest. Synthetic data might not fully capture the complexity and nuances of real-world data. Causes include possible limitations in the seed data quality, biases in generation methods, or inadequate domain knowledge. Thus, AI models might struggle to generalize effectively to real-world scenarios."
381
+ ],
382
+ "url": "https://www.ibm.com/docs/en/watsonx/saas?topic=SSYOK8/wsj/ai-risk-atlas/unrepresentative-data.html"
383
+ },
384
+ {
385
+ "category": "Data contamination",
386
+ "description": [
387
+ "Data contamination occurs when incorrect data is used for training. For example, data that is not aligned with model's purpose or data that is already set aside for other development tasks such as testing and evaluation."
388
+ ],
389
+ "url": "https://www.ibm.com/docs/en/watsonx/saas?topic=SSYOK8/wsj/ai-risk-atlas/data-contamination.html"
390
+ },
391
+ {
392
+ "category": "Harmful code generation",
393
+ "description": [
394
+ "Models might generate code that causes harm or unintentionally affects other systems."
395
+ ],
396
+ "url": "https://www.ibm.com/docs/en/watsonx/saas?topic=SSYOK8/wsj/ai-risk-atlas/harmful-code-generation.html"
397
+ },
398
+ {
399
+ "category": "Reproducibility",
400
+ "description": [
401
+ "Replicating agent behavior or output can be impacted by changes or updates made to external services and tools. This impact is increased if the agent is built with generative AI."
402
+ ],
403
+ "url": "https://www.ibm.com/docs/en/watsonx/saas?topic=SSYOK8/wsj/ai-risk-atlas/reproducibility-agentic.html"
404
+ }
405
+ ],
406
+ "flagged_fields": {},
407
+ "missing_fields": []
408
+ },
409
+ "tags": {
410
+ "domains": [
411
+ "code generation",
412
+ "programming competitions"
413
+ ],
414
+ "languages": [
415
+ "Not specified"
416
+ ],
417
+ "tasks": [
418
+ "Code generation",
419
+ "Self-repair",
420
+ "Code execution",
421
+ "Test output prediction"
422
+ ]
423
+ },
424
+ "subtasks_count": 0,
425
+ "metrics_count": 1,
426
+ "metric_names": [
427
+ "Score"
428
+ ],
429
+ "primary_metric_name": "Score",
430
+ "evalcards": {
431
+ "annotations": {
432
+ "reporting_completeness": {
433
+ "completeness_score": 0.9285714285714286,
434
+ "total_fields_evaluated": 28,
435
+ "missing_required_fields": [
436
+ "evalcards.lifecycle_status",
437
+ "evalcards.preregistration_url"
438
+ ],
439
+ "partial_fields": [],
440
+ "field_scores": [
441
+ {
442
+ "field_path": "autobenchmarkcard.benchmark_details.name",
443
+ "coverage_type": "full",
444
+ "score": 1.0
445
+ },
446
+ {
447
+ "field_path": "autobenchmarkcard.benchmark_details.overview",
448
+ "coverage_type": "full",
449
+ "score": 1.0
450
+ },
451
+ {
452
+ "field_path": "autobenchmarkcard.benchmark_details.data_type",
453
+ "coverage_type": "full",
454
+ "score": 1.0
455
+ },
456
+ {
457
+ "field_path": "autobenchmarkcard.benchmark_details.domains",
458
+ "coverage_type": "full",
459
+ "score": 1.0
460
+ },
461
+ {
462
+ "field_path": "autobenchmarkcard.benchmark_details.languages",
463
+ "coverage_type": "full",
464
+ "score": 1.0
465
+ },
466
+ {
467
+ "field_path": "autobenchmarkcard.benchmark_details.similar_benchmarks",
468
+ "coverage_type": "full",
469
+ "score": 1.0
470
+ },
471
+ {
472
+ "field_path": "autobenchmarkcard.benchmark_details.resources",
473
+ "coverage_type": "full",
474
+ "score": 1.0
475
+ },
476
+ {
477
+ "field_path": "autobenchmarkcard.purpose_and_intended_users.goal",
478
+ "coverage_type": "full",
479
+ "score": 1.0
480
+ },
481
+ {
482
+ "field_path": "autobenchmarkcard.purpose_and_intended_users.audience",
483
+ "coverage_type": "full",
484
+ "score": 1.0
485
+ },
486
+ {
487
+ "field_path": "autobenchmarkcard.purpose_and_intended_users.tasks",
488
+ "coverage_type": "full",
489
+ "score": 1.0
490
+ },
491
+ {
492
+ "field_path": "autobenchmarkcard.purpose_and_intended_users.limitations",
493
+ "coverage_type": "full",
494
+ "score": 1.0
495
+ },
496
+ {
497
+ "field_path": "autobenchmarkcard.purpose_and_intended_users.out_of_scope_uses",
498
+ "coverage_type": "full",
499
+ "score": 1.0
500
+ },
501
+ {
502
+ "field_path": "autobenchmarkcard.methodology.methods",
503
+ "coverage_type": "full",
504
+ "score": 1.0
505
+ },
506
+ {
507
+ "field_path": "autobenchmarkcard.methodology.metrics",
508
+ "coverage_type": "full",
509
+ "score": 1.0
510
+ },
511
+ {
512
+ "field_path": "autobenchmarkcard.methodology.calculation",
513
+ "coverage_type": "full",
514
+ "score": 1.0
515
+ },
516
+ {
517
+ "field_path": "autobenchmarkcard.methodology.interpretation",
518
+ "coverage_type": "full",
519
+ "score": 1.0
520
+ },
521
+ {
522
+ "field_path": "autobenchmarkcard.methodology.baseline_results",
523
+ "coverage_type": "full",
524
+ "score": 1.0
525
+ },
526
+ {
527
+ "field_path": "autobenchmarkcard.methodology.validation",
528
+ "coverage_type": "full",
529
+ "score": 1.0
530
+ },
531
+ {
532
+ "field_path": "autobenchmarkcard.ethical_and_legal_considerations.privacy_and_anonymity",
533
+ "coverage_type": "full",
534
+ "score": 1.0
535
+ },
536
+ {
537
+ "field_path": "autobenchmarkcard.ethical_and_legal_considerations.data_licensing",
538
+ "coverage_type": "full",
539
+ "score": 1.0
540
+ },
541
+ {
542
+ "field_path": "autobenchmarkcard.ethical_and_legal_considerations.consent_procedures",
543
+ "coverage_type": "full",
544
+ "score": 1.0
545
+ },
546
+ {
547
+ "field_path": "autobenchmarkcard.ethical_and_legal_considerations.compliance_with_regulations",
548
+ "coverage_type": "full",
549
+ "score": 1.0
550
+ },
551
+ {
552
+ "field_path": "autobenchmarkcard.data",
553
+ "coverage_type": "partial",
554
+ "score": 1.0
555
+ },
556
+ {
557
+ "field_path": "eee_eval.source_metadata.source_type",
558
+ "coverage_type": "full",
559
+ "score": 1.0
560
+ },
561
+ {
562
+ "field_path": "eee_eval.source_metadata.source_organization_name",
563
+ "coverage_type": "full",
564
+ "score": 1.0
565
+ },
566
+ {
567
+ "field_path": "eee_eval.source_metadata.evaluator_relationship",
568
+ "coverage_type": "full",
569
+ "score": 1.0
570
+ },
571
+ {
572
+ "field_path": "evalcards.lifecycle_status",
573
+ "coverage_type": "reserved",
574
+ "score": 0.0
575
+ },
576
+ {
577
+ "field_path": "evalcards.preregistration_url",
578
+ "coverage_type": "reserved",
579
+ "score": 0.0
580
+ }
581
+ ],
582
+ "signal_version": "1.0"
583
+ },
584
+ "benchmark_comparability": {
585
+ "variant_divergence_groups": [],
586
+ "cross_party_divergence_groups": []
587
+ }
588
+ }
589
+ },
590
+ "reproducibility_summary": {
591
+ "results_total": 1,
592
+ "has_reproducibility_gap_count": 1,
593
+ "populated_ratio_avg": 0.0
594
+ },
595
+ "provenance_summary": {
596
+ "total_results": 1,
597
+ "total_groups": 1,
598
+ "multi_source_groups": 0,
599
+ "first_party_only_groups": 1,
600
+ "source_type_distribution": {
601
+ "first_party": 1,
602
+ "third_party": 0,
603
+ "collaborative": 0,
604
+ "unspecified": 0
605
+ }
606
+ },
607
+ "comparability_summary": {
608
+ "total_groups": 1,
609
+ "groups_with_variant_check": 0,
610
+ "groups_with_cross_party_check": 0,
611
+ "variant_divergent_count": 0,
612
+ "cross_party_divergent_count": 0
613
+ },
614
+ "metrics": [
615
+ {
616
+ "metric_summary_id": "llm_stats_livecodebench_v6_score",
617
+ "legacy_eval_summary_id": "llm_stats_llm_stats_livecodebench_v6",
618
+ "evaluation_name": "llm_stats.livecodebench-v6",
619
+ "display_name": "Livecodebench V6 / Score",
620
+ "canonical_display_name": "Livecodebench V6 / Score",
621
+ "benchmark_leaf_key": "livecodebench_v6",
622
+ "benchmark_leaf_name": "Livecodebench V6",
623
+ "slice_key": null,
624
+ "slice_name": null,
625
+ "lower_is_better": false,
626
+ "metric_name": "Score",
627
+ "metric_id": "llm_stats.livecodebench-v6.score",
628
+ "metric_key": "score",
629
+ "metric_source": "metric_config",
630
+ "metric_config": {
631
+ "evaluation_description": "LiveCodeBench is a holistic and contamination-free evaluation benchmark for large language models for code. It continuously collects new problems from programming contests (LeetCode, AtCoder, CodeForces) and evaluates four different scenarios: code generation, self-repair, code execution, and test output prediction. Problems are annotated with release dates to enable evaluation on unseen problems released after a model's training cutoff.",
632
+ "metric_id": "llm_stats.livecodebench-v6.score",
633
+ "metric_name": "LiveCodeBench v6 score",
634
+ "metric_kind": "benchmark_score",
635
+ "metric_unit": "proportion",
636
+ "lower_is_better": false,
637
+ "score_type": "continuous",
638
+ "min_score": 0.0,
639
+ "max_score": 1.0,
640
+ "additional_details": {
641
+ "raw_benchmark_id": "livecodebench-v6",
642
+ "raw_score_field": "score",
643
+ "bound_strategy": "inferred_proportion",
644
+ "raw_name": "LiveCodeBench v6",
645
+ "raw_categories": "[\"general\",\"reasoning\"]",
646
+ "raw_modality": "text",
647
+ "raw_verified": "false",
648
+ "raw_model_count": "45"
649
+ }
650
+ },
651
+ "models_count": 1,
652
+ "top_score": 0.817,
653
+ "model_results": [
654
+ {
655
+ "model_id": "bytedance/seed-2-0-lite",
656
+ "model_route_id": "bytedance__seed-2-0-lite",
657
+ "model_name": "Seed 2.0 Lite",
658
+ "developer": "bytedance",
659
+ "variant_key": "default",
660
+ "raw_model_id": "bytedance/seed-2.0-lite",
661
+ "score": 0.817,
662
+ "evaluation_id": "llm-stats/first_party/bytedance_seed-2.0-lite/1777108064.422824",
663
+ "retrieved_timestamp": "1777108064.422824",
664
+ "source_metadata": {
665
+ "source_name": "LLM Stats API: first_party scores",
666
+ "source_type": "documentation",
667
+ "source_organization_name": "LLM Stats",
668
+ "source_organization_url": "https://llm-stats.com/",
669
+ "evaluator_relationship": "first_party",
670
+ "additional_details": {
671
+ "models_endpoint": "https://api.llm-stats.com/v1/models",
672
+ "benchmarks_endpoint": "https://api.llm-stats.com/leaderboard/benchmarks",
673
+ "scores_endpoint": "https://api.llm-stats.com/v1/scores",
674
+ "scores_endpoint_fallback": "https://api.llm-stats.com/leaderboard/benchmarks/{benchmark_id}",
675
+ "developer_page_url": "https://llm-stats.com/developer",
676
+ "attribution_url": "https://llm-stats.com/",
677
+ "attribution_required": "true",
678
+ "source_role": "aggregator"
679
+ }
680
+ },
681
+ "source_data": {
682
+ "dataset_name": "AIME 2026",
683
+ "source_type": "url",
684
+ "url": [
685
+ "https://llm-stats.com/models/seed-2.0-lite",
686
+ "https://llm-stats.com/benchmarks/aime-2026",
687
+ "https://api.llm-stats.com/leaderboard/benchmarks/aime-2026"
688
+ ],
689
+ "additional_details": {
690
+ "raw_benchmark_id": "aime-2026",
691
+ "raw_model_id": "seed-2.0-lite",
692
+ "source_role": "aggregator"
693
+ }
694
+ },
695
+ "source_record_url": "https://huggingface.co/datasets/evaleval/card_backend/resolve/main/records/bytedance__seed-2-0-lite/llm_stats_first_party_bytedance_seed_2_0_lite_1777108064_422824.json",
696
+ "detailed_evaluation_results": null,
697
+ "detailed_evaluation_results_meta": null,
698
+ "passthrough_top_level_fields": null,
699
+ "instance_level_data": null,
700
+ "normalized_result": {
701
+ "benchmark_family_key": "llm_stats",
702
+ "benchmark_family_name": "LiveCodeBench v6",
703
+ "benchmark_parent_key": "llm_stats",
704
+ "benchmark_parent_name": "LiveCodeBench v6",
705
+ "benchmark_component_key": "livecodebench_v6",
706
+ "benchmark_component_name": "Livecodebench V6",
707
+ "benchmark_leaf_key": "livecodebench_v6",
708
+ "benchmark_leaf_name": "Livecodebench V6",
709
+ "slice_key": null,
710
+ "slice_name": null,
711
+ "metric_name": "Score",
712
+ "metric_id": "llm_stats.livecodebench-v6.score",
713
+ "metric_key": "score",
714
+ "metric_source": "metric_config",
715
+ "display_name": "Livecodebench V6 / Score",
716
+ "canonical_display_name": "Livecodebench V6 / Score",
717
+ "raw_evaluation_name": "llm_stats.livecodebench-v6",
718
+ "is_summary_score": false
719
+ },
720
+ "evalcards": {
721
+ "annotations": {
722
+ "reproducibility_gap": {
723
+ "has_reproducibility_gap": true,
724
+ "missing_fields": [
725
+ "temperature",
726
+ "max_tokens"
727
+ ],
728
+ "required_field_count": 2,
729
+ "populated_field_count": 0,
730
+ "signal_version": "1.0"
731
+ },
732
+ "provenance": {
733
+ "source_type": "first_party",
734
+ "is_multi_source": false,
735
+ "first_party_only": true,
736
+ "distinct_reporting_organizations": 1,
737
+ "signal_version": "1.0"
738
+ },
739
+ "variant_divergence": null,
740
+ "cross_party_divergence": null
741
+ }
742
+ }
743
+ }
744
+ ]
745
+ }
746
+ ],
747
+ "subtasks": [],
748
+ "models_count": 1,
749
+ "top_score": 0.817,
750
+ "instance_data": {
751
+ "available": false,
752
+ "url_count": 0,
753
+ "sample_urls": [],
754
+ "models_with_loaded_instances": 0
755
+ }
756
+ }
757
+ ],
758
+ "other": [
759
+ {
760
+ "eval_summary_id": "llm_stats_aime_2026",
761
+ "benchmark": "AIME 2026",
762
+ "benchmark_family_key": "llm_stats",
763
+ "benchmark_family_name": "AIME 2026",
764
+ "benchmark_parent_key": "llm_stats",
765
+ "benchmark_parent_name": "AIME 2026",
766
+ "benchmark_leaf_key": "aime_2026",
767
+ "benchmark_leaf_name": "Aime 2026",
768
+ "benchmark_component_key": "aime_2026",
769
+ "benchmark_component_name": "Aime 2026",
770
+ "evaluation_name": "Aime 2026",
771
+ "display_name": "Aime 2026",
772
+ "canonical_display_name": "Aime 2026",
773
+ "is_summary_score": false,
774
+ "category": "other",
775
+ "source_data": {
776
+ "dataset_name": "AIME 2026",
777
+ "source_type": "url",
778
+ "url": [
779
+ "https://llm-stats.com/models/seed-2.0-lite",
780
+ "https://llm-stats.com/benchmarks/aime-2026",
781
+ "https://api.llm-stats.com/leaderboard/benchmarks/aime-2026"
782
+ ],
783
+ "additional_details": {
784
+ "raw_benchmark_id": "aime-2026",
785
+ "raw_model_id": "seed-2.0-lite",
786
+ "source_role": "aggregator"
787
+ }
788
+ },
789
+ "benchmark_card": null,
790
+ "tags": {
791
+ "domains": [],
792
+ "languages": [],
793
+ "tasks": []
794
+ },
795
+ "subtasks_count": 0,
796
+ "metrics_count": 1,
797
+ "metric_names": [
798
+ "Score"
799
+ ],
800
+ "primary_metric_name": "Score",
801
+ "evalcards": {
802
+ "annotations": {
803
+ "reporting_completeness": {
804
+ "completeness_score": 0.10714285714285714,
805
+ "total_fields_evaluated": 28,
806
+ "missing_required_fields": [
807
+ "autobenchmarkcard.benchmark_details.name",
808
+ "autobenchmarkcard.benchmark_details.overview",
809
+ "autobenchmarkcard.benchmark_details.data_type",
810
+ "autobenchmarkcard.benchmark_details.domains",
811
+ "autobenchmarkcard.benchmark_details.languages",
812
+ "autobenchmarkcard.benchmark_details.similar_benchmarks",
813
+ "autobenchmarkcard.benchmark_details.resources",
814
+ "autobenchmarkcard.purpose_and_intended_users.goal",
815
+ "autobenchmarkcard.purpose_and_intended_users.audience",
816
+ "autobenchmarkcard.purpose_and_intended_users.tasks",
817
+ "autobenchmarkcard.purpose_and_intended_users.limitations",
818
+ "autobenchmarkcard.purpose_and_intended_users.out_of_scope_uses",
819
+ "autobenchmarkcard.methodology.methods",
820
+ "autobenchmarkcard.methodology.metrics",
821
+ "autobenchmarkcard.methodology.calculation",
822
+ "autobenchmarkcard.methodology.interpretation",
823
+ "autobenchmarkcard.methodology.baseline_results",
824
+ "autobenchmarkcard.methodology.validation",
825
+ "autobenchmarkcard.ethical_and_legal_considerations.privacy_and_anonymity",
826
+ "autobenchmarkcard.ethical_and_legal_considerations.data_licensing",
827
+ "autobenchmarkcard.ethical_and_legal_considerations.consent_procedures",
828
+ "autobenchmarkcard.ethical_and_legal_considerations.compliance_with_regulations",
829
+ "autobenchmarkcard.data",
830
+ "evalcards.lifecycle_status",
831
+ "evalcards.preregistration_url"
832
+ ],
833
+ "partial_fields": [],
834
+ "field_scores": [
835
+ {
836
+ "field_path": "autobenchmarkcard.benchmark_details.name",
837
+ "coverage_type": "full",
838
+ "score": 0.0
839
+ },
840
+ {
841
+ "field_path": "autobenchmarkcard.benchmark_details.overview",
842
+ "coverage_type": "full",
843
+ "score": 0.0
844
+ },
845
+ {
846
+ "field_path": "autobenchmarkcard.benchmark_details.data_type",
847
+ "coverage_type": "full",
848
+ "score": 0.0
849
+ },
850
+ {
851
+ "field_path": "autobenchmarkcard.benchmark_details.domains",
852
+ "coverage_type": "full",
853
+ "score": 0.0
854
+ },
855
+ {
856
+ "field_path": "autobenchmarkcard.benchmark_details.languages",
857
+ "coverage_type": "full",
858
+ "score": 0.0
859
+ },
860
+ {
861
+ "field_path": "autobenchmarkcard.benchmark_details.similar_benchmarks",
862
+ "coverage_type": "full",
863
+ "score": 0.0
864
+ },
865
+ {
866
+ "field_path": "autobenchmarkcard.benchmark_details.resources",
867
+ "coverage_type": "full",
868
+ "score": 0.0
869
+ },
870
+ {
871
+ "field_path": "autobenchmarkcard.purpose_and_intended_users.goal",
872
+ "coverage_type": "full",
873
+ "score": 0.0
874
+ },
875
+ {
876
+ "field_path": "autobenchmarkcard.purpose_and_intended_users.audience",
877
+ "coverage_type": "full",
878
+ "score": 0.0
879
+ },
880
+ {
881
+ "field_path": "autobenchmarkcard.purpose_and_intended_users.tasks",
882
+ "coverage_type": "full",
883
+ "score": 0.0
884
+ },
885
+ {
886
+ "field_path": "autobenchmarkcard.purpose_and_intended_users.limitations",
887
+ "coverage_type": "full",
888
+ "score": 0.0
889
+ },
890
+ {
891
+ "field_path": "autobenchmarkcard.purpose_and_intended_users.out_of_scope_uses",
892
+ "coverage_type": "full",
893
+ "score": 0.0
894
+ },
895
+ {
896
+ "field_path": "autobenchmarkcard.methodology.methods",
897
+ "coverage_type": "full",
898
+ "score": 0.0
899
+ },
900
+ {
901
+ "field_path": "autobenchmarkcard.methodology.metrics",
902
+ "coverage_type": "full",
903
+ "score": 0.0
904
+ },
905
+ {
906
+ "field_path": "autobenchmarkcard.methodology.calculation",
907
+ "coverage_type": "full",
908
+ "score": 0.0
909
+ },
910
+ {
911
+ "field_path": "autobenchmarkcard.methodology.interpretation",
912
+ "coverage_type": "full",
913
+ "score": 0.0
914
+ },
915
+ {
916
+ "field_path": "autobenchmarkcard.methodology.baseline_results",
917
+ "coverage_type": "full",
918
+ "score": 0.0
919
+ },
920
+ {
921
+ "field_path": "autobenchmarkcard.methodology.validation",
922
+ "coverage_type": "full",
923
+ "score": 0.0
924
+ },
925
+ {
926
+ "field_path": "autobenchmarkcard.ethical_and_legal_considerations.privacy_and_anonymity",
927
+ "coverage_type": "full",
928
+ "score": 0.0
929
+ },
930
+ {
931
+ "field_path": "autobenchmarkcard.ethical_and_legal_considerations.data_licensing",
932
+ "coverage_type": "full",
933
+ "score": 0.0
934
+ },
935
+ {
936
+ "field_path": "autobenchmarkcard.ethical_and_legal_considerations.consent_procedures",
937
+ "coverage_type": "full",
938
+ "score": 0.0
939
+ },
940
+ {
941
+ "field_path": "autobenchmarkcard.ethical_and_legal_considerations.compliance_with_regulations",
942
+ "coverage_type": "full",
943
+ "score": 0.0
944
+ },
945
+ {
946
+ "field_path": "autobenchmarkcard.data",
947
+ "coverage_type": "partial",
948
+ "score": 0.0
949
+ },
950
+ {
951
+ "field_path": "eee_eval.source_metadata.source_type",
952
+ "coverage_type": "full",
953
+ "score": 1.0
954
+ },
955
+ {
956
+ "field_path": "eee_eval.source_metadata.source_organization_name",
957
+ "coverage_type": "full",
958
+ "score": 1.0
959
+ },
960
+ {
961
+ "field_path": "eee_eval.source_metadata.evaluator_relationship",
962
+ "coverage_type": "full",
963
+ "score": 1.0
964
+ },
965
+ {
966
+ "field_path": "evalcards.lifecycle_status",
967
+ "coverage_type": "reserved",
968
+ "score": 0.0
969
+ },
970
+ {
971
+ "field_path": "evalcards.preregistration_url",
972
+ "coverage_type": "reserved",
973
+ "score": 0.0
974
+ }
975
+ ],
976
+ "signal_version": "1.0"
977
+ },
978
+ "benchmark_comparability": {
979
+ "variant_divergence_groups": [],
980
+ "cross_party_divergence_groups": []
981
+ }
982
+ }
983
+ },
984
+ "reproducibility_summary": {
985
+ "results_total": 1,
986
+ "has_reproducibility_gap_count": 1,
987
+ "populated_ratio_avg": 0.0
988
+ },
989
+ "provenance_summary": {
990
+ "total_results": 1,
991
+ "total_groups": 1,
992
+ "multi_source_groups": 0,
993
+ "first_party_only_groups": 1,
994
+ "source_type_distribution": {
995
+ "first_party": 1,
996
+ "third_party": 0,
997
+ "collaborative": 0,
998
+ "unspecified": 0
999
+ }
1000
+ },
1001
+ "comparability_summary": {
1002
+ "total_groups": 1,
1003
+ "groups_with_variant_check": 0,
1004
+ "groups_with_cross_party_check": 0,
1005
+ "variant_divergent_count": 0,
1006
+ "cross_party_divergent_count": 0
1007
+ },
1008
+ "metrics": [
1009
+ {
1010
+ "metric_summary_id": "llm_stats_aime_2026_score",
1011
+ "legacy_eval_summary_id": "llm_stats_llm_stats_aime_2026",
1012
+ "evaluation_name": "llm_stats.aime-2026",
1013
+ "display_name": "Aime 2026 / Score",
1014
+ "canonical_display_name": "Aime 2026 / Score",
1015
+ "benchmark_leaf_key": "aime_2026",
1016
+ "benchmark_leaf_name": "Aime 2026",
1017
+ "slice_key": null,
1018
+ "slice_name": null,
1019
+ "lower_is_better": false,
1020
+ "metric_name": "Score",
1021
+ "metric_id": "llm_stats.aime-2026.score",
1022
+ "metric_key": "score",
1023
+ "metric_source": "metric_config",
1024
+ "metric_config": {
1025
+ "evaluation_description": "All 30 problems from the 2026 American Invitational Mathematics Examination (AIME I and AIME II), testing olympiad-level mathematical reasoning with integer answers from 000-999. Used as an AI benchmark to evaluate large language models' ability to solve complex mathematical problems requiring multi-step logical deductions and structured symbolic reasoning.",
1026
+ "metric_id": "llm_stats.aime-2026.score",
1027
+ "metric_name": "AIME 2026 score",
1028
+ "metric_kind": "benchmark_score",
1029
+ "metric_unit": "proportion",
1030
+ "lower_is_better": false,
1031
+ "score_type": "continuous",
1032
+ "min_score": 0.0,
1033
+ "max_score": 1.0,
1034
+ "additional_details": {
1035
+ "raw_benchmark_id": "aime-2026",
1036
+ "raw_score_field": "score",
1037
+ "bound_strategy": "inferred_proportion",
1038
+ "raw_name": "AIME 2026",
1039
+ "raw_categories": "[\"math\",\"reasoning\"]",
1040
+ "raw_modality": "text",
1041
+ "raw_verified": "false",
1042
+ "raw_model_count": "12"
1043
+ }
1044
+ },
1045
+ "models_count": 1,
1046
+ "top_score": 0.883,
1047
+ "model_results": [
1048
+ {
1049
+ "model_id": "bytedance/seed-2-0-lite",
1050
+ "model_route_id": "bytedance__seed-2-0-lite",
1051
+ "model_name": "Seed 2.0 Lite",
1052
+ "developer": "bytedance",
1053
+ "variant_key": "default",
1054
+ "raw_model_id": "bytedance/seed-2.0-lite",
1055
+ "score": 0.883,
1056
+ "evaluation_id": "llm-stats/first_party/bytedance_seed-2.0-lite/1777108064.422824",
1057
+ "retrieved_timestamp": "1777108064.422824",
1058
+ "source_metadata": {
1059
+ "source_name": "LLM Stats API: first_party scores",
1060
+ "source_type": "documentation",
1061
+ "source_organization_name": "LLM Stats",
1062
+ "source_organization_url": "https://llm-stats.com/",
1063
+ "evaluator_relationship": "first_party",
1064
+ "additional_details": {
1065
+ "models_endpoint": "https://api.llm-stats.com/v1/models",
1066
+ "benchmarks_endpoint": "https://api.llm-stats.com/leaderboard/benchmarks",
1067
+ "scores_endpoint": "https://api.llm-stats.com/v1/scores",
1068
+ "scores_endpoint_fallback": "https://api.llm-stats.com/leaderboard/benchmarks/{benchmark_id}",
1069
+ "developer_page_url": "https://llm-stats.com/developer",
1070
+ "attribution_url": "https://llm-stats.com/",
1071
+ "attribution_required": "true",
1072
+ "source_role": "aggregator"
1073
+ }
1074
+ },
1075
+ "source_data": {
1076
+ "dataset_name": "AIME 2026",
1077
+ "source_type": "url",
1078
+ "url": [
1079
+ "https://llm-stats.com/models/seed-2.0-lite",
1080
+ "https://llm-stats.com/benchmarks/aime-2026",
1081
+ "https://api.llm-stats.com/leaderboard/benchmarks/aime-2026"
1082
+ ],
1083
+ "additional_details": {
1084
+ "raw_benchmark_id": "aime-2026",
1085
+ "raw_model_id": "seed-2.0-lite",
1086
+ "source_role": "aggregator"
1087
+ }
1088
+ },
1089
+ "source_record_url": "https://huggingface.co/datasets/evaleval/card_backend/resolve/main/records/bytedance__seed-2-0-lite/llm_stats_first_party_bytedance_seed_2_0_lite_1777108064_422824.json",
1090
+ "detailed_evaluation_results": null,
1091
+ "detailed_evaluation_results_meta": null,
1092
+ "passthrough_top_level_fields": null,
1093
+ "instance_level_data": null,
1094
+ "normalized_result": {
1095
+ "benchmark_family_key": "llm_stats",
1096
+ "benchmark_family_name": "AIME 2026",
1097
+ "benchmark_parent_key": "llm_stats",
1098
+ "benchmark_parent_name": "AIME 2026",
1099
+ "benchmark_component_key": "aime_2026",
1100
+ "benchmark_component_name": "Aime 2026",
1101
+ "benchmark_leaf_key": "aime_2026",
1102
+ "benchmark_leaf_name": "Aime 2026",
1103
+ "slice_key": null,
1104
+ "slice_name": null,
1105
+ "metric_name": "Score",
1106
+ "metric_id": "llm_stats.aime-2026.score",
1107
+ "metric_key": "score",
1108
+ "metric_source": "metric_config",
1109
+ "display_name": "Aime 2026 / Score",
1110
+ "canonical_display_name": "Aime 2026 / Score",
1111
+ "raw_evaluation_name": "llm_stats.aime-2026",
1112
+ "is_summary_score": false
1113
+ },
1114
+ "evalcards": {
1115
+ "annotations": {
1116
+ "reproducibility_gap": {
1117
+ "has_reproducibility_gap": true,
1118
+ "missing_fields": [
1119
+ "temperature",
1120
+ "max_tokens"
1121
+ ],
1122
+ "required_field_count": 2,
1123
+ "populated_field_count": 0,
1124
+ "signal_version": "1.0"
1125
+ },
1126
+ "provenance": {
1127
+ "source_type": "first_party",
1128
+ "is_multi_source": false,
1129
+ "first_party_only": true,
1130
+ "distinct_reporting_organizations": 1,
1131
+ "signal_version": "1.0"
1132
+ },
1133
+ "variant_divergence": null,
1134
+ "cross_party_divergence": null
1135
+ }
1136
+ }
1137
+ }
1138
+ ]
1139
+ }
1140
+ ],
1141
+ "subtasks": [],
1142
+ "models_count": 1,
1143
+ "top_score": 0.883,
1144
+ "instance_data": {
1145
+ "available": false,
1146
+ "url_count": 0,
1147
+ "sample_urls": [],
1148
+ "models_with_loaded_instances": 0
1149
+ }
1150
+ }
1151
+ ]
1152
+ },
1153
+ "hierarchy_by_category": {
1154
+ "coding": [
1155
+ {
1156
+ "eval_summary_id": "llm_stats_livecodebench_v6",
1157
+ "benchmark": "LiveCodeBench v6",
1158
+ "benchmark_family_key": "llm_stats",
1159
+ "benchmark_family_name": "LiveCodeBench v6",
1160
+ "benchmark_parent_key": "llm_stats",
1161
+ "benchmark_parent_name": "LiveCodeBench v6",
1162
+ "benchmark_leaf_key": "livecodebench_v6",
1163
+ "benchmark_leaf_name": "Livecodebench V6",
1164
+ "benchmark_component_key": "livecodebench_v6",
1165
+ "benchmark_component_name": "Livecodebench V6",
1166
+ "evaluation_name": "Livecodebench V6",
1167
+ "display_name": "Livecodebench V6",
1168
+ "canonical_display_name": "Livecodebench V6",
1169
+ "is_summary_score": false,
1170
+ "category": "coding",
1171
+ "source_data": {
1172
+ "dataset_name": "LiveCodeBench v6",
1173
+ "source_type": "url",
1174
+ "url": [
1175
+ "https://llm-stats.com/models/seed-2.0-lite",
1176
+ "https://llm-stats.com/benchmarks/livecodebench-v6",
1177
+ "https://api.llm-stats.com/leaderboard/benchmarks/livecodebench-v6"
1178
+ ],
1179
+ "additional_details": {
1180
+ "raw_benchmark_id": "livecodebench-v6",
1181
+ "raw_model_id": "seed-2.0-lite",
1182
+ "source_role": "aggregator"
1183
+ }
1184
+ },
1185
+ "benchmark_card": {
1186
+ "benchmark_details": {
1187
+ "name": "LiveCodeBench",
1188
+ "overview": "LiveCodeBench is a holistic and contamination-free benchmark for evaluating large language models on code-related capabilities. It assesses a broader range of skills including code generation, self-repair, code execution, and test output prediction. The benchmark collects new problems over time from programming contest platforms to prevent data contamination, currently containing over 500 coding problems published between May 2023 and May 2024.",
1189
+ "data_type": "text",
1190
+ "domains": [
1191
+ "code generation",
1192
+ "programming competitions"
1193
+ ],
1194
+ "languages": [
1195
+ "Not specified"
1196
+ ],
1197
+ "similar_benchmarks": [
1198
+ "HumanEval",
1199
+ "MBPP",
1200
+ "APPS",
1201
+ "DS-1000",
1202
+ "ARCADE",
1203
+ "NumpyEval",
1204
+ "PandasEval",
1205
+ "JuICe",
1206
+ "APIBench",
1207
+ "RepoBench",
1208
+ "ODEX",
1209
+ "SWE-Bench",
1210
+ "GoogleCodeRepo",
1211
+ "RepoEval",
1212
+ "Cocomic-Data"
1213
+ ],
1214
+ "resources": [
1215
+ "https://livecodebench.github.io/",
1216
+ "https://arxiv.org/abs/2403.07974"
1217
+ ],
1218
+ "benchmark_type": "single"
1219
+ },
1220
+ "purpose_and_intended_users": {
1221
+ "goal": "To provide a comprehensive and contamination-free evaluation of large language models for code by assessing a broader range of code-related capabilities beyond just code generation.",
1222
+ "audience": [
1223
+ "Researchers and practitioners in academia and industry who are interested in evaluating the capabilities of large language models for code"
1224
+ ],
1225
+ "tasks": [
1226
+ "Code generation",
1227
+ "Self-repair",
1228
+ "Code execution",
1229
+ "Test output prediction"
1230
+ ],
1231
+ "limitations": "The focus on competition programming problems might not be representative of the most general notion of LLM programming capabilities or real-world, open-ended software development tasks.",
1232
+ "out_of_scope_uses": [
1233
+ "Evaluating performance on real-world, open-ended, and unconstrained user-raised problems"
1234
+ ]
1235
+ },
1236
+ "data": {
1237
+ "source": "The data is collected from coding contests on three platforms: LeetCode, AtCoder, and CodeForces, with problems published between May 2023 and May 2024.",
1238
+ "size": "Over 500 coding problems. Specific subsets include 479 samples from 85 problems for code execution and 442 problem instances from 181 LeetCode problems for test output prediction.",
1239
+ "format": "Includes problem statements, public tests, user solutions, and starter code (for LeetCode). Problems are tagged with difficulty labels (Easy, Medium, Hard) from the platforms.",
1240
+ "annotation": "Difficulty labels are provided by the competition platforms. For the code execution dataset, human-submitted solutions were filtered using compile-time and runtime filters followed by manual inspection to ensure quality."
1241
+ },
1242
+ "methodology": {
1243
+ "methods": [
1244
+ "Models are evaluated in a zero-shot setting across four scenarios: code generation, self-repair, code execution, and test output prediction.",
1245
+ "For code generation and self-repair, program correctness is verified using a set of unseen test cases. For code execution, an execution-based correctness metric compares generated output to ground truth. For test output prediction, generated responses are parsed and equivalence checks are used for grading."
1246
+ ],
1247
+ "metrics": [
1248
+ "Pass@1"
1249
+ ],
1250
+ "calculation": "For each problem, 10 candidate answers are generated. The Pass@1 score is the fraction of problems for which a generated program or answer is correct.",
1251
+ "interpretation": "A higher Pass@1 score indicates better performance.",
1252
+ "baseline_results": "The paper reports results for specific models including GPT-4, GPT-4-Turbo, Claude-3-Opus, Claude-3-Sonnet, and Mistral-L, but specific numerical scores are not provided in the given excerpts.",
1253
+ "validation": "Program correctness for code generation and self-repair is verified using a set of unseen test cases. For code execution, an execution-based correctness metric is used to compare generated output to ground truth. For test output prediction, generated responses are parsed and equivalence checks are used for grading."
1254
+ },
1255
+ "ethical_and_legal_considerations": {
1256
+ "privacy_and_anonymity": "Not specified",
1257
+ "data_licensing": "Not specified",
1258
+ "consent_procedures": "Not specified",
1259
+ "compliance_with_regulations": "The benchmark operates under the Fair Use doctrine (§ 107) for copyrighted works, determining that its use of collected problems for academic, non-profit educational purposes constitutes fair use. It does not train on the collected problems."
1260
+ },
1261
+ "possible_risks": [
1262
+ {
1263
+ "category": "Over- or under-reliance",
1264
+ "description": [
1265
+ "In AI-assisted decision-making tasks, reliance measures how much a person trusts (and potentially acts on) a model's output. Over-reliance occurs when a person puts too much trust in a model, accepting a model's output when the model's output is likely incorrect. Under-reliance is the opposite, where the person doesn't trust the model but should."
1266
+ ],
1267
+ "url": "https://www.ibm.com/docs/en/watsonx/saas?topic=SSYOK8/wsj/ai-risk-atlas/over-or-under-reliance.html"
1268
+ },
1269
+ {
1270
+ "category": "Unrepresentative data",
1271
+ "description": [
1272
+ "Unrepresentative data occurs when the training or fine-tuning data is not sufficiently representative of the underlying population or does not measure the phenomenon of interest. Synthetic data might not fully capture the complexity and nuances of real-world data. Causes include possible limitations in the seed data quality, biases in generation methods, or inadequate domain knowledge. Thus, AI models might struggle to generalize effectively to real-world scenarios."
1273
+ ],
1274
+ "url": "https://www.ibm.com/docs/en/watsonx/saas?topic=SSYOK8/wsj/ai-risk-atlas/unrepresentative-data.html"
1275
+ },
1276
+ {
1277
+ "category": "Data contamination",
1278
+ "description": [
1279
+ "Data contamination occurs when incorrect data is used for training. For example, data that is not aligned with model's purpose or data that is already set aside for other development tasks such as testing and evaluation."
1280
+ ],
1281
+ "url": "https://www.ibm.com/docs/en/watsonx/saas?topic=SSYOK8/wsj/ai-risk-atlas/data-contamination.html"
1282
+ },
1283
+ {
1284
+ "category": "Harmful code generation",
1285
+ "description": [
1286
+ "Models might generate code that causes harm or unintentionally affects other systems."
1287
+ ],
1288
+ "url": "https://www.ibm.com/docs/en/watsonx/saas?topic=SSYOK8/wsj/ai-risk-atlas/harmful-code-generation.html"
1289
+ },
1290
+ {
1291
+ "category": "Reproducibility",
1292
+ "description": [
1293
+ "Replicating agent behavior or output can be impacted by changes or updates made to external services and tools. This impact is increased if the agent is built with generative AI."
1294
+ ],
1295
+ "url": "https://www.ibm.com/docs/en/watsonx/saas?topic=SSYOK8/wsj/ai-risk-atlas/reproducibility-agentic.html"
1296
+ }
1297
+ ],
1298
+ "flagged_fields": {},
1299
+ "missing_fields": []
1300
+ },
1301
+ "tags": {
1302
+ "domains": [
1303
+ "code generation",
1304
+ "programming competitions"
1305
+ ],
1306
+ "languages": [
1307
+ "Not specified"
1308
+ ],
1309
+ "tasks": [
1310
+ "Code generation",
1311
+ "Self-repair",
1312
+ "Code execution",
1313
+ "Test output prediction"
1314
+ ]
1315
+ },
1316
+ "subtasks_count": 0,
1317
+ "metrics_count": 1,
1318
+ "metric_names": [
1319
+ "Score"
1320
+ ],
1321
+ "primary_metric_name": "Score",
1322
+ "evalcards": {
1323
+ "annotations": {
1324
+ "reporting_completeness": {
1325
+ "completeness_score": 0.9285714285714286,
1326
+ "total_fields_evaluated": 28,
1327
+ "missing_required_fields": [
1328
+ "evalcards.lifecycle_status",
1329
+ "evalcards.preregistration_url"
1330
+ ],
1331
+ "partial_fields": [],
1332
+ "field_scores": [
1333
+ {
1334
+ "field_path": "autobenchmarkcard.benchmark_details.name",
1335
+ "coverage_type": "full",
1336
+ "score": 1.0
1337
+ },
1338
+ {
1339
+ "field_path": "autobenchmarkcard.benchmark_details.overview",
1340
+ "coverage_type": "full",
1341
+ "score": 1.0
1342
+ },
1343
+ {
1344
+ "field_path": "autobenchmarkcard.benchmark_details.data_type",
1345
+ "coverage_type": "full",
1346
+ "score": 1.0
1347
+ },
1348
+ {
1349
+ "field_path": "autobenchmarkcard.benchmark_details.domains",
1350
+ "coverage_type": "full",
1351
+ "score": 1.0
1352
+ },
1353
+ {
1354
+ "field_path": "autobenchmarkcard.benchmark_details.languages",
1355
+ "coverage_type": "full",
1356
+ "score": 1.0
1357
+ },
1358
+ {
1359
+ "field_path": "autobenchmarkcard.benchmark_details.similar_benchmarks",
1360
+ "coverage_type": "full",
1361
+ "score": 1.0
1362
+ },
1363
+ {
1364
+ "field_path": "autobenchmarkcard.benchmark_details.resources",
1365
+ "coverage_type": "full",
1366
+ "score": 1.0
1367
+ },
1368
+ {
1369
+ "field_path": "autobenchmarkcard.purpose_and_intended_users.goal",
1370
+ "coverage_type": "full",
1371
+ "score": 1.0
1372
+ },
1373
+ {
1374
+ "field_path": "autobenchmarkcard.purpose_and_intended_users.audience",
1375
+ "coverage_type": "full",
1376
+ "score": 1.0
1377
+ },
1378
+ {
1379
+ "field_path": "autobenchmarkcard.purpose_and_intended_users.tasks",
1380
+ "coverage_type": "full",
1381
+ "score": 1.0
1382
+ },
1383
+ {
1384
+ "field_path": "autobenchmarkcard.purpose_and_intended_users.limitations",
1385
+ "coverage_type": "full",
1386
+ "score": 1.0
1387
+ },
1388
+ {
1389
+ "field_path": "autobenchmarkcard.purpose_and_intended_users.out_of_scope_uses",
1390
+ "coverage_type": "full",
1391
+ "score": 1.0
1392
+ },
1393
+ {
1394
+ "field_path": "autobenchmarkcard.methodology.methods",
1395
+ "coverage_type": "full",
1396
+ "score": 1.0
1397
+ },
1398
+ {
1399
+ "field_path": "autobenchmarkcard.methodology.metrics",
1400
+ "coverage_type": "full",
1401
+ "score": 1.0
1402
+ },
1403
+ {
1404
+ "field_path": "autobenchmarkcard.methodology.calculation",
1405
+ "coverage_type": "full",
1406
+ "score": 1.0
1407
+ },
1408
+ {
1409
+ "field_path": "autobenchmarkcard.methodology.interpretation",
1410
+ "coverage_type": "full",
1411
+ "score": 1.0
1412
+ },
1413
+ {
1414
+ "field_path": "autobenchmarkcard.methodology.baseline_results",
1415
+ "coverage_type": "full",
1416
+ "score": 1.0
1417
+ },
1418
+ {
1419
+ "field_path": "autobenchmarkcard.methodology.validation",
1420
+ "coverage_type": "full",
1421
+ "score": 1.0
1422
+ },
1423
+ {
1424
+ "field_path": "autobenchmarkcard.ethical_and_legal_considerations.privacy_and_anonymity",
1425
+ "coverage_type": "full",
1426
+ "score": 1.0
1427
+ },
1428
+ {
1429
+ "field_path": "autobenchmarkcard.ethical_and_legal_considerations.data_licensing",
1430
+ "coverage_type": "full",
1431
+ "score": 1.0
1432
+ },
1433
+ {
1434
+ "field_path": "autobenchmarkcard.ethical_and_legal_considerations.consent_procedures",
1435
+ "coverage_type": "full",
1436
+ "score": 1.0
1437
+ },
1438
+ {
1439
+ "field_path": "autobenchmarkcard.ethical_and_legal_considerations.compliance_with_regulations",
1440
+ "coverage_type": "full",
1441
+ "score": 1.0
1442
+ },
1443
+ {
1444
+ "field_path": "autobenchmarkcard.data",
1445
+ "coverage_type": "partial",
1446
+ "score": 1.0
1447
+ },
1448
+ {
1449
+ "field_path": "eee_eval.source_metadata.source_type",
1450
+ "coverage_type": "full",
1451
+ "score": 1.0
1452
+ },
1453
+ {
1454
+ "field_path": "eee_eval.source_metadata.source_organization_name",
1455
+ "coverage_type": "full",
1456
+ "score": 1.0
1457
+ },
1458
+ {
1459
+ "field_path": "eee_eval.source_metadata.evaluator_relationship",
1460
+ "coverage_type": "full",
1461
+ "score": 1.0
1462
+ },
1463
+ {
1464
+ "field_path": "evalcards.lifecycle_status",
1465
+ "coverage_type": "reserved",
1466
+ "score": 0.0
1467
+ },
1468
+ {
1469
+ "field_path": "evalcards.preregistration_url",
1470
+ "coverage_type": "reserved",
1471
+ "score": 0.0
1472
+ }
1473
+ ],
1474
+ "signal_version": "1.0"
1475
+ },
1476
+ "benchmark_comparability": {
1477
+ "variant_divergence_groups": [],
1478
+ "cross_party_divergence_groups": []
1479
+ }
1480
+ }
1481
+ },
1482
+ "reproducibility_summary": {
1483
+ "results_total": 1,
1484
+ "has_reproducibility_gap_count": 1,
1485
+ "populated_ratio_avg": 0.0
1486
+ },
1487
+ "provenance_summary": {
1488
+ "total_results": 1,
1489
+ "total_groups": 1,
1490
+ "multi_source_groups": 0,
1491
+ "first_party_only_groups": 1,
1492
+ "source_type_distribution": {
1493
+ "first_party": 1,
1494
+ "third_party": 0,
1495
+ "collaborative": 0,
1496
+ "unspecified": 0
1497
+ }
1498
+ },
1499
+ "comparability_summary": {
1500
+ "total_groups": 1,
1501
+ "groups_with_variant_check": 0,
1502
+ "groups_with_cross_party_check": 0,
1503
+ "variant_divergent_count": 0,
1504
+ "cross_party_divergent_count": 0
1505
+ },
1506
+ "metrics": [
1507
+ {
1508
+ "metric_summary_id": "llm_stats_livecodebench_v6_score",
1509
+ "legacy_eval_summary_id": "llm_stats_llm_stats_livecodebench_v6",
1510
+ "evaluation_name": "llm_stats.livecodebench-v6",
1511
+ "display_name": "Livecodebench V6 / Score",
1512
+ "canonical_display_name": "Livecodebench V6 / Score",
1513
+ "benchmark_leaf_key": "livecodebench_v6",
1514
+ "benchmark_leaf_name": "Livecodebench V6",
1515
+ "slice_key": null,
1516
+ "slice_name": null,
1517
+ "lower_is_better": false,
1518
+ "metric_name": "Score",
1519
+ "metric_id": "llm_stats.livecodebench-v6.score",
1520
+ "metric_key": "score",
1521
+ "metric_source": "metric_config",
1522
+ "metric_config": {
1523
+ "evaluation_description": "LiveCodeBench is a holistic and contamination-free evaluation benchmark for large language models for code. It continuously collects new problems from programming contests (LeetCode, AtCoder, CodeForces) and evaluates four different scenarios: code generation, self-repair, code execution, and test output prediction. Problems are annotated with release dates to enable evaluation on unseen problems released after a model's training cutoff.",
1524
+ "metric_id": "llm_stats.livecodebench-v6.score",
1525
+ "metric_name": "LiveCodeBench v6 score",
1526
+ "metric_kind": "benchmark_score",
1527
+ "metric_unit": "proportion",
1528
+ "lower_is_better": false,
1529
+ "score_type": "continuous",
1530
+ "min_score": 0.0,
1531
+ "max_score": 1.0,
1532
+ "additional_details": {
1533
+ "raw_benchmark_id": "livecodebench-v6",
1534
+ "raw_score_field": "score",
1535
+ "bound_strategy": "inferred_proportion",
1536
+ "raw_name": "LiveCodeBench v6",
1537
+ "raw_categories": "[\"general\",\"reasoning\"]",
1538
+ "raw_modality": "text",
1539
+ "raw_verified": "false",
1540
+ "raw_model_count": "45"
1541
+ }
1542
+ },
1543
+ "models_count": 1,
1544
+ "top_score": 0.817,
1545
+ "model_results": [
1546
+ {
1547
+ "model_id": "bytedance/seed-2-0-lite",
1548
+ "model_route_id": "bytedance__seed-2-0-lite",
1549
+ "model_name": "Seed 2.0 Lite",
1550
+ "developer": "bytedance",
1551
+ "variant_key": "default",
1552
+ "raw_model_id": "bytedance/seed-2.0-lite",
1553
+ "score": 0.817,
1554
+ "evaluation_id": "llm-stats/first_party/bytedance_seed-2.0-lite/1777108064.422824",
1555
+ "retrieved_timestamp": "1777108064.422824",
1556
+ "source_metadata": {
1557
+ "source_name": "LLM Stats API: first_party scores",
1558
+ "source_type": "documentation",
1559
+ "source_organization_name": "LLM Stats",
1560
+ "source_organization_url": "https://llm-stats.com/",
1561
+ "evaluator_relationship": "first_party",
1562
+ "additional_details": {
1563
+ "models_endpoint": "https://api.llm-stats.com/v1/models",
1564
+ "benchmarks_endpoint": "https://api.llm-stats.com/leaderboard/benchmarks",
1565
+ "scores_endpoint": "https://api.llm-stats.com/v1/scores",
1566
+ "scores_endpoint_fallback": "https://api.llm-stats.com/leaderboard/benchmarks/{benchmark_id}",
1567
+ "developer_page_url": "https://llm-stats.com/developer",
1568
+ "attribution_url": "https://llm-stats.com/",
1569
+ "attribution_required": "true",
1570
+ "source_role": "aggregator"
1571
+ }
1572
+ },
1573
+ "source_data": {
1574
+ "dataset_name": "AIME 2026",
1575
+ "source_type": "url",
1576
+ "url": [
1577
+ "https://llm-stats.com/models/seed-2.0-lite",
1578
+ "https://llm-stats.com/benchmarks/aime-2026",
1579
+ "https://api.llm-stats.com/leaderboard/benchmarks/aime-2026"
1580
+ ],
1581
+ "additional_details": {
1582
+ "raw_benchmark_id": "aime-2026",
1583
+ "raw_model_id": "seed-2.0-lite",
1584
+ "source_role": "aggregator"
1585
+ }
1586
+ },
1587
+ "source_record_url": "https://huggingface.co/datasets/evaleval/card_backend/resolve/main/records/bytedance__seed-2-0-lite/llm_stats_first_party_bytedance_seed_2_0_lite_1777108064_422824.json",
1588
+ "detailed_evaluation_results": null,
1589
+ "detailed_evaluation_results_meta": null,
1590
+ "passthrough_top_level_fields": null,
1591
+ "instance_level_data": null,
1592
+ "normalized_result": {
1593
+ "benchmark_family_key": "llm_stats",
1594
+ "benchmark_family_name": "LiveCodeBench v6",
1595
+ "benchmark_parent_key": "llm_stats",
1596
+ "benchmark_parent_name": "LiveCodeBench v6",
1597
+ "benchmark_component_key": "livecodebench_v6",
1598
+ "benchmark_component_name": "Livecodebench V6",
1599
+ "benchmark_leaf_key": "livecodebench_v6",
1600
+ "benchmark_leaf_name": "Livecodebench V6",
1601
+ "slice_key": null,
1602
+ "slice_name": null,
1603
+ "metric_name": "Score",
1604
+ "metric_id": "llm_stats.livecodebench-v6.score",
1605
+ "metric_key": "score",
1606
+ "metric_source": "metric_config",
1607
+ "display_name": "Livecodebench V6 / Score",
1608
+ "canonical_display_name": "Livecodebench V6 / Score",
1609
+ "raw_evaluation_name": "llm_stats.livecodebench-v6",
1610
+ "is_summary_score": false
1611
+ },
1612
+ "evalcards": {
1613
+ "annotations": {
1614
+ "reproducibility_gap": {
1615
+ "has_reproducibility_gap": true,
1616
+ "missing_fields": [
1617
+ "temperature",
1618
+ "max_tokens"
1619
+ ],
1620
+ "required_field_count": 2,
1621
+ "populated_field_count": 0,
1622
+ "signal_version": "1.0"
1623
+ },
1624
+ "provenance": {
1625
+ "source_type": "first_party",
1626
+ "is_multi_source": false,
1627
+ "first_party_only": true,
1628
+ "distinct_reporting_organizations": 1,
1629
+ "signal_version": "1.0"
1630
+ },
1631
+ "variant_divergence": null,
1632
+ "cross_party_divergence": null
1633
+ }
1634
+ }
1635
+ }
1636
+ ]
1637
+ }
1638
+ ],
1639
+ "subtasks": [],
1640
+ "models_count": 1,
1641
+ "top_score": 0.817,
1642
+ "instance_data": {
1643
+ "available": false,
1644
+ "url_count": 0,
1645
+ "sample_urls": [],
1646
+ "models_with_loaded_instances": 0
1647
+ }
1648
+ }
1649
+ ],
1650
+ "other": [
1651
+ {
1652
+ "eval_summary_id": "llm_stats_aime_2026",
1653
+ "benchmark": "AIME 2026",
1654
+ "benchmark_family_key": "llm_stats",
1655
+ "benchmark_family_name": "AIME 2026",
1656
+ "benchmark_parent_key": "llm_stats",
1657
+ "benchmark_parent_name": "AIME 2026",
1658
+ "benchmark_leaf_key": "aime_2026",
1659
+ "benchmark_leaf_name": "Aime 2026",
1660
+ "benchmark_component_key": "aime_2026",
1661
+ "benchmark_component_name": "Aime 2026",
1662
+ "evaluation_name": "Aime 2026",
1663
+ "display_name": "Aime 2026",
1664
+ "canonical_display_name": "Aime 2026",
1665
+ "is_summary_score": false,
1666
+ "category": "other",
1667
+ "source_data": {
1668
+ "dataset_name": "AIME 2026",
1669
+ "source_type": "url",
1670
+ "url": [
1671
+ "https://llm-stats.com/models/seed-2.0-lite",
1672
+ "https://llm-stats.com/benchmarks/aime-2026",
1673
+ "https://api.llm-stats.com/leaderboard/benchmarks/aime-2026"
1674
+ ],
1675
+ "additional_details": {
1676
+ "raw_benchmark_id": "aime-2026",
1677
+ "raw_model_id": "seed-2.0-lite",
1678
+ "source_role": "aggregator"
1679
+ }
1680
+ },
1681
+ "benchmark_card": null,
1682
+ "tags": {
1683
+ "domains": [],
1684
+ "languages": [],
1685
+ "tasks": []
1686
+ },
1687
+ "subtasks_count": 0,
1688
+ "metrics_count": 1,
1689
+ "metric_names": [
1690
+ "Score"
1691
+ ],
1692
+ "primary_metric_name": "Score",
1693
+ "evalcards": {
1694
+ "annotations": {
1695
+ "reporting_completeness": {
1696
+ "completeness_score": 0.10714285714285714,
1697
+ "total_fields_evaluated": 28,
1698
+ "missing_required_fields": [
1699
+ "autobenchmarkcard.benchmark_details.name",
1700
+ "autobenchmarkcard.benchmark_details.overview",
1701
+ "autobenchmarkcard.benchmark_details.data_type",
1702
+ "autobenchmarkcard.benchmark_details.domains",
1703
+ "autobenchmarkcard.benchmark_details.languages",
1704
+ "autobenchmarkcard.benchmark_details.similar_benchmarks",
1705
+ "autobenchmarkcard.benchmark_details.resources",
1706
+ "autobenchmarkcard.purpose_and_intended_users.goal",
1707
+ "autobenchmarkcard.purpose_and_intended_users.audience",
1708
+ "autobenchmarkcard.purpose_and_intended_users.tasks",
1709
+ "autobenchmarkcard.purpose_and_intended_users.limitations",
1710
+ "autobenchmarkcard.purpose_and_intended_users.out_of_scope_uses",
1711
+ "autobenchmarkcard.methodology.methods",
1712
+ "autobenchmarkcard.methodology.metrics",
1713
+ "autobenchmarkcard.methodology.calculation",
1714
+ "autobenchmarkcard.methodology.interpretation",
1715
+ "autobenchmarkcard.methodology.baseline_results",
1716
+ "autobenchmarkcard.methodology.validation",
1717
+ "autobenchmarkcard.ethical_and_legal_considerations.privacy_and_anonymity",
1718
+ "autobenchmarkcard.ethical_and_legal_considerations.data_licensing",
1719
+ "autobenchmarkcard.ethical_and_legal_considerations.consent_procedures",
1720
+ "autobenchmarkcard.ethical_and_legal_considerations.compliance_with_regulations",
1721
+ "autobenchmarkcard.data",
1722
+ "evalcards.lifecycle_status",
1723
+ "evalcards.preregistration_url"
1724
+ ],
1725
+ "partial_fields": [],
1726
+ "field_scores": [
1727
+ {
1728
+ "field_path": "autobenchmarkcard.benchmark_details.name",
1729
+ "coverage_type": "full",
1730
+ "score": 0.0
1731
+ },
1732
+ {
1733
+ "field_path": "autobenchmarkcard.benchmark_details.overview",
1734
+ "coverage_type": "full",
1735
+ "score": 0.0
1736
+ },
1737
+ {
1738
+ "field_path": "autobenchmarkcard.benchmark_details.data_type",
1739
+ "coverage_type": "full",
1740
+ "score": 0.0
1741
+ },
1742
+ {
1743
+ "field_path": "autobenchmarkcard.benchmark_details.domains",
1744
+ "coverage_type": "full",
1745
+ "score": 0.0
1746
+ },
1747
+ {
1748
+ "field_path": "autobenchmarkcard.benchmark_details.languages",
1749
+ "coverage_type": "full",
1750
+ "score": 0.0
1751
+ },
1752
+ {
1753
+ "field_path": "autobenchmarkcard.benchmark_details.similar_benchmarks",
1754
+ "coverage_type": "full",
1755
+ "score": 0.0
1756
+ },
1757
+ {
1758
+ "field_path": "autobenchmarkcard.benchmark_details.resources",
1759
+ "coverage_type": "full",
1760
+ "score": 0.0
1761
+ },
1762
+ {
1763
+ "field_path": "autobenchmarkcard.purpose_and_intended_users.goal",
1764
+ "coverage_type": "full",
1765
+ "score": 0.0
1766
+ },
1767
+ {
1768
+ "field_path": "autobenchmarkcard.purpose_and_intended_users.audience",
1769
+ "coverage_type": "full",
1770
+ "score": 0.0
1771
+ },
1772
+ {
1773
+ "field_path": "autobenchmarkcard.purpose_and_intended_users.tasks",
1774
+ "coverage_type": "full",
1775
+ "score": 0.0
1776
+ },
1777
+ {
1778
+ "field_path": "autobenchmarkcard.purpose_and_intended_users.limitations",
1779
+ "coverage_type": "full",
1780
+ "score": 0.0
1781
+ },
1782
+ {
1783
+ "field_path": "autobenchmarkcard.purpose_and_intended_users.out_of_scope_uses",
1784
+ "coverage_type": "full",
1785
+ "score": 0.0
1786
+ },
1787
+ {
1788
+ "field_path": "autobenchmarkcard.methodology.methods",
1789
+ "coverage_type": "full",
1790
+ "score": 0.0
1791
+ },
1792
+ {
1793
+ "field_path": "autobenchmarkcard.methodology.metrics",
1794
+ "coverage_type": "full",
1795
+ "score": 0.0
1796
+ },
1797
+ {
1798
+ "field_path": "autobenchmarkcard.methodology.calculation",
1799
+ "coverage_type": "full",
1800
+ "score": 0.0
1801
+ },
1802
+ {
1803
+ "field_path": "autobenchmarkcard.methodology.interpretation",
1804
+ "coverage_type": "full",
1805
+ "score": 0.0
1806
+ },
1807
+ {
1808
+ "field_path": "autobenchmarkcard.methodology.baseline_results",
1809
+ "coverage_type": "full",
1810
+ "score": 0.0
1811
+ },
1812
+ {
1813
+ "field_path": "autobenchmarkcard.methodology.validation",
1814
+ "coverage_type": "full",
1815
+ "score": 0.0
1816
+ },
1817
+ {
1818
+ "field_path": "autobenchmarkcard.ethical_and_legal_considerations.privacy_and_anonymity",
1819
+ "coverage_type": "full",
1820
+ "score": 0.0
1821
+ },
1822
+ {
1823
+ "field_path": "autobenchmarkcard.ethical_and_legal_considerations.data_licensing",
1824
+ "coverage_type": "full",
1825
+ "score": 0.0
1826
+ },
1827
+ {
1828
+ "field_path": "autobenchmarkcard.ethical_and_legal_considerations.consent_procedures",
1829
+ "coverage_type": "full",
1830
+ "score": 0.0
1831
+ },
1832
+ {
1833
+ "field_path": "autobenchmarkcard.ethical_and_legal_considerations.compliance_with_regulations",
1834
+ "coverage_type": "full",
1835
+ "score": 0.0
1836
+ },
1837
+ {
1838
+ "field_path": "autobenchmarkcard.data",
1839
+ "coverage_type": "partial",
1840
+ "score": 0.0
1841
+ },
1842
+ {
1843
+ "field_path": "eee_eval.source_metadata.source_type",
1844
+ "coverage_type": "full",
1845
+ "score": 1.0
1846
+ },
1847
+ {
1848
+ "field_path": "eee_eval.source_metadata.source_organization_name",
1849
+ "coverage_type": "full",
1850
+ "score": 1.0
1851
+ },
1852
+ {
1853
+ "field_path": "eee_eval.source_metadata.evaluator_relationship",
1854
+ "coverage_type": "full",
1855
+ "score": 1.0
1856
+ },
1857
+ {
1858
+ "field_path": "evalcards.lifecycle_status",
1859
+ "coverage_type": "reserved",
1860
+ "score": 0.0
1861
+ },
1862
+ {
1863
+ "field_path": "evalcards.preregistration_url",
1864
+ "coverage_type": "reserved",
1865
+ "score": 0.0
1866
+ }
1867
+ ],
1868
+ "signal_version": "1.0"
1869
+ },
1870
+ "benchmark_comparability": {
1871
+ "variant_divergence_groups": [],
1872
+ "cross_party_divergence_groups": []
1873
+ }
1874
+ }
1875
+ },
1876
+ "reproducibility_summary": {
1877
+ "results_total": 1,
1878
+ "has_reproducibility_gap_count": 1,
1879
+ "populated_ratio_avg": 0.0
1880
+ },
1881
+ "provenance_summary": {
1882
+ "total_results": 1,
1883
+ "total_groups": 1,
1884
+ "multi_source_groups": 0,
1885
+ "first_party_only_groups": 1,
1886
+ "source_type_distribution": {
1887
+ "first_party": 1,
1888
+ "third_party": 0,
1889
+ "collaborative": 0,
1890
+ "unspecified": 0
1891
+ }
1892
+ },
1893
+ "comparability_summary": {
1894
+ "total_groups": 1,
1895
+ "groups_with_variant_check": 0,
1896
+ "groups_with_cross_party_check": 0,
1897
+ "variant_divergent_count": 0,
1898
+ "cross_party_divergent_count": 0
1899
+ },
1900
+ "metrics": [
1901
+ {
1902
+ "metric_summary_id": "llm_stats_aime_2026_score",
1903
+ "legacy_eval_summary_id": "llm_stats_llm_stats_aime_2026",
1904
+ "evaluation_name": "llm_stats.aime-2026",
1905
+ "display_name": "Aime 2026 / Score",
1906
+ "canonical_display_name": "Aime 2026 / Score",
1907
+ "benchmark_leaf_key": "aime_2026",
1908
+ "benchmark_leaf_name": "Aime 2026",
1909
+ "slice_key": null,
1910
+ "slice_name": null,
1911
+ "lower_is_better": false,
1912
+ "metric_name": "Score",
1913
+ "metric_id": "llm_stats.aime-2026.score",
1914
+ "metric_key": "score",
1915
+ "metric_source": "metric_config",
1916
+ "metric_config": {
1917
+ "evaluation_description": "All 30 problems from the 2026 American Invitational Mathematics Examination (AIME I and AIME II), testing olympiad-level mathematical reasoning with integer answers from 000-999. Used as an AI benchmark to evaluate large language models' ability to solve complex mathematical problems requiring multi-step logical deductions and structured symbolic reasoning.",
1918
+ "metric_id": "llm_stats.aime-2026.score",
1919
+ "metric_name": "AIME 2026 score",
1920
+ "metric_kind": "benchmark_score",
1921
+ "metric_unit": "proportion",
1922
+ "lower_is_better": false,
1923
+ "score_type": "continuous",
1924
+ "min_score": 0.0,
1925
+ "max_score": 1.0,
1926
+ "additional_details": {
1927
+ "raw_benchmark_id": "aime-2026",
1928
+ "raw_score_field": "score",
1929
+ "bound_strategy": "inferred_proportion",
1930
+ "raw_name": "AIME 2026",
1931
+ "raw_categories": "[\"math\",\"reasoning\"]",
1932
+ "raw_modality": "text",
1933
+ "raw_verified": "false",
1934
+ "raw_model_count": "12"
1935
+ }
1936
+ },
1937
+ "models_count": 1,
1938
+ "top_score": 0.883,
1939
+ "model_results": [
1940
+ {
1941
+ "model_id": "bytedance/seed-2-0-lite",
1942
+ "model_route_id": "bytedance__seed-2-0-lite",
1943
+ "model_name": "Seed 2.0 Lite",
1944
+ "developer": "bytedance",
1945
+ "variant_key": "default",
1946
+ "raw_model_id": "bytedance/seed-2.0-lite",
1947
+ "score": 0.883,
1948
+ "evaluation_id": "llm-stats/first_party/bytedance_seed-2.0-lite/1777108064.422824",
1949
+ "retrieved_timestamp": "1777108064.422824",
1950
+ "source_metadata": {
1951
+ "source_name": "LLM Stats API: first_party scores",
1952
+ "source_type": "documentation",
1953
+ "source_organization_name": "LLM Stats",
1954
+ "source_organization_url": "https://llm-stats.com/",
1955
+ "evaluator_relationship": "first_party",
1956
+ "additional_details": {
1957
+ "models_endpoint": "https://api.llm-stats.com/v1/models",
1958
+ "benchmarks_endpoint": "https://api.llm-stats.com/leaderboard/benchmarks",
1959
+ "scores_endpoint": "https://api.llm-stats.com/v1/scores",
1960
+ "scores_endpoint_fallback": "https://api.llm-stats.com/leaderboard/benchmarks/{benchmark_id}",
1961
+ "developer_page_url": "https://llm-stats.com/developer",
1962
+ "attribution_url": "https://llm-stats.com/",
1963
+ "attribution_required": "true",
1964
+ "source_role": "aggregator"
1965
+ }
1966
+ },
1967
+ "source_data": {
1968
+ "dataset_name": "AIME 2026",
1969
+ "source_type": "url",
1970
+ "url": [
1971
+ "https://llm-stats.com/models/seed-2.0-lite",
1972
+ "https://llm-stats.com/benchmarks/aime-2026",
1973
+ "https://api.llm-stats.com/leaderboard/benchmarks/aime-2026"
1974
+ ],
1975
+ "additional_details": {
1976
+ "raw_benchmark_id": "aime-2026",
1977
+ "raw_model_id": "seed-2.0-lite",
1978
+ "source_role": "aggregator"
1979
+ }
1980
+ },
1981
+ "source_record_url": "https://huggingface.co/datasets/evaleval/card_backend/resolve/main/records/bytedance__seed-2-0-lite/llm_stats_first_party_bytedance_seed_2_0_lite_1777108064_422824.json",
1982
+ "detailed_evaluation_results": null,
1983
+ "detailed_evaluation_results_meta": null,
1984
+ "passthrough_top_level_fields": null,
1985
+ "instance_level_data": null,
1986
+ "normalized_result": {
1987
+ "benchmark_family_key": "llm_stats",
1988
+ "benchmark_family_name": "AIME 2026",
1989
+ "benchmark_parent_key": "llm_stats",
1990
+ "benchmark_parent_name": "AIME 2026",
1991
+ "benchmark_component_key": "aime_2026",
1992
+ "benchmark_component_name": "Aime 2026",
1993
+ "benchmark_leaf_key": "aime_2026",
1994
+ "benchmark_leaf_name": "Aime 2026",
1995
+ "slice_key": null,
1996
+ "slice_name": null,
1997
+ "metric_name": "Score",
1998
+ "metric_id": "llm_stats.aime-2026.score",
1999
+ "metric_key": "score",
2000
+ "metric_source": "metric_config",
2001
+ "display_name": "Aime 2026 / Score",
2002
+ "canonical_display_name": "Aime 2026 / Score",
2003
+ "raw_evaluation_name": "llm_stats.aime-2026",
2004
+ "is_summary_score": false
2005
+ },
2006
+ "evalcards": {
2007
+ "annotations": {
2008
+ "reproducibility_gap": {
2009
+ "has_reproducibility_gap": true,
2010
+ "missing_fields": [
2011
+ "temperature",
2012
+ "max_tokens"
2013
+ ],
2014
+ "required_field_count": 2,
2015
+ "populated_field_count": 0,
2016
+ "signal_version": "1.0"
2017
+ },
2018
+ "provenance": {
2019
+ "source_type": "first_party",
2020
+ "is_multi_source": false,
2021
+ "first_party_only": true,
2022
+ "distinct_reporting_organizations": 1,
2023
+ "signal_version": "1.0"
2024
+ },
2025
+ "variant_divergence": null,
2026
+ "cross_party_divergence": null
2027
+ }
2028
+ }
2029
+ }
2030
+ ]
2031
+ }
2032
+ ],
2033
+ "subtasks": [],
2034
+ "models_count": 1,
2035
+ "top_score": 0.883,
2036
+ "instance_data": {
2037
+ "available": false,
2038
+ "url_count": 0,
2039
+ "sample_urls": [],
2040
+ "models_with_loaded_instances": 0
2041
+ }
2042
+ }
2043
+ ]
2044
+ },
2045
+ "total_evaluations": 1,
2046
+ "last_updated": "2026-04-25T09:07:44.422824Z",
2047
+ "categories_covered": [
2048
+ "coding",
2049
+ "other"
2050
+ ],
2051
+ "variants": [
2052
+ {
2053
+ "variant_key": "default",
2054
+ "variant_label": "Default",
2055
+ "evaluation_count": 1,
2056
+ "raw_model_ids": [
2057
+ "bytedance/seed-2.0-lite"
2058
+ ],
2059
+ "last_updated": "2026-04-25T09:07:44.422824Z"
2060
+ }
2061
+ ],
2062
+ "reproducibility_summary": {
2063
+ "results_total": 2,
2064
+ "has_reproducibility_gap_count": 2,
2065
+ "populated_ratio_avg": 0.0
2066
+ },
2067
+ "provenance_summary": {
2068
+ "total_results": 2,
2069
+ "total_groups": 2,
2070
+ "multi_source_groups": 0,
2071
+ "first_party_only_groups": 2,
2072
+ "source_type_distribution": {
2073
+ "first_party": 2,
2074
+ "third_party": 0,
2075
+ "collaborative": 0,
2076
+ "unspecified": 0
2077
+ }
2078
+ },
2079
+ "comparability_summary": {
2080
+ "total_groups": 2,
2081
+ "groups_with_variant_check": 0,
2082
+ "groups_with_cross_party_check": 0,
2083
+ "variant_divergent_count": 0,
2084
+ "cross_party_divergent_count": 0
2085
+ }
2086
+ }
tests/fixtures/models/google__gemini-3-flash.json ADDED
The diff for this file is too large to render. See raw diff
 
tests/fixtures/models/openai__gpt-5-2-pro.json ADDED
The diff for this file is too large to render. See raw diff
 
tests/pipeline-contract.test.ts ADDED
@@ -0,0 +1,284 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { readdirSync } from "fs"
2
+ import path from "path"
3
+ import { fileURLToPath } from "url"
4
+
5
+ import { describe, expect, it } from "vitest"
6
+
7
+ import type { HFEvalDetail, HFEvalModelResult, HFModelDetail, HFModelCardEntry } from "../lib/hf-data"
8
+ import { flattenModelEvaluations } from "../lib/hf-data"
9
+
10
+ import { fixtureEntries, loadAllFixtures, walkHierarchyResults } from "./fixtures/loader"
11
+
12
// Absolute path of the `fixtures` directory located next to this test file.
const FIXTURES_DIR = path.join(fileURLToPath(new URL(".", import.meta.url)), "fixtures")
13
+
14
+ // Tier A — pipeline contract tests.
15
+ //
16
+ // These tests assert that the pipeline-emitted artifacts in tests/fixtures/
17
+ // carry every field the TS code depends on. They run against PINNED fixtures,
18
+ // not the live cache, so an upstream data refresh doesn't make these flap.
19
+ //
20
+ // To check the live cache for drift instead, see tests/upstream-drift.test.ts.
21
+ //
22
+ // When adding a deletion that depends on a new pipeline guarantee, add a
23
+ // contract here first. Each contract should fail loudly with the offending
24
+ // file path + key path so violations are easy to fix.
25
+
26
/**
 * Category keys accepted under `hierarchy_by_category`. The category contract
 * lower-cases each key before looking it up, so entries are kept lower-case.
 */
const KNOWN_PIPELINE_CATEGORY_KEYS = new Set<string>([
  "agentic", "reasoning", "general",
  "safety", "knowledge", "other",
  "coding", "instruction_following", "language_understanding",
])

/** Accepted values for `source_metadata.evaluator_relationship`. */
const VALID_EVALUATOR_RELATIONSHIPS = new Set<string>([
  "first_party",
  "third_party",
  "other",
])

/** One contract failure, as rendered by `formatViolations`. */
type Violation = {
  /** Id of the fixture in which the problem was found. */
  fixture: string
  /** Key path of the offending value inside that fixture. */
  path: string
  /** Short human-readable description of what is wrong. */
  detail: string
}
45
+
46
/**
 * Reports whether a `retrieved_timestamp` value can be read as a valid Date.
 *
 * Two encodings are accepted: unix seconds written as a string (for example
 * "1777108064.422824") and any string `new Date(string)` understands (for
 * example ISO 8601). The whole string must be numeric to count as unix
 * seconds: `Number()` is used rather than `Number.parseFloat()` because the
 * latter accepts a numeric prefix followed by garbage ("123abc" parses as 123).
 * Non-string and blank values return false instead of throwing, so the caller
 * can record them as ordinary violations with a fixture and key path.
 *
 * @param raw - The value found at `retrieved_timestamp`.
 * @returns true when the value is a string that yields a valid Date.
 */
function isParseableTimestamp(raw: unknown): boolean {
  if (typeof raw !== "string" || raw.trim() === "") return false
  const unixSeconds = Number(raw)
  const parsed = Number.isFinite(unixSeconds) ? new Date(unixSeconds * 1000) : new Date(raw)
  return !Number.isNaN(parsed.getTime())
}

// Contracts over the pinned per-model fixture files (fixture group "models").
describe("Tier A — pipeline contracts (model files)", () => {
  const models = loadAllFixtures<HFModelDetail>("models")

  it("every model_result carries source_metadata", () => {
    const violations: Violation[] = []
    for (const { id, data } of models) {
      for (const { result, path: resultPath } of walkHierarchyResults<HFEvalModelResult>(data, id)) {
        if (!result.source_metadata) {
          violations.push({ fixture: id, path: resultPath, detail: "missing source_metadata" })
        }
      }
    }
    expect(violations, formatViolations(violations)).toEqual([])
  })

  it("every source_metadata.evaluator_relationship is in the known set", () => {
    const violations: Violation[] = []
    for (const { id, data } of models) {
      for (const { result, path: resultPath } of walkHierarchyResults<HFEvalModelResult>(data, id)) {
        const rel = result.source_metadata?.evaluator_relationship
        // An absent relationship is tolerated here; only unknown values are flagged.
        if (rel != null && !VALID_EVALUATOR_RELATIONSHIPS.has(rel)) {
          violations.push({ fixture: id, path: resultPath, detail: `unknown evaluator_relationship=${rel}` })
        }
      }
    }
    expect(violations, formatViolations(violations)).toEqual([])
  })

  it("every hierarchy_by_category key is in PIPELINE_CATEGORY_MAP", () => {
    // The check runs against this file's KNOWN_PIPELINE_CATEGORY_KEYS set,
    // comparing lower-cased keys.
    const violations: Violation[] = []
    for (const { id, data } of models) {
      for (const key of Object.keys(data.hierarchy_by_category ?? {})) {
        if (!KNOWN_PIPELINE_CATEGORY_KEYS.has(key.toLowerCase())) {
          violations.push({ fixture: id, path: `hierarchy_by_category.${key}`, detail: "unknown category key" })
        }
      }
    }
    expect(violations, formatViolations(violations)).toEqual([])
  })

  it("every model_result.retrieved_timestamp parses as a valid Date", () => {
    const violations: Violation[] = []
    for (const { id, data } of models) {
      for (const { result, path: resultPath } of walkHierarchyResults<HFEvalModelResult>(data, id)) {
        const ts: unknown = result.retrieved_timestamp
        // A missing timestamp is outside this contract; only present values are validated.
        if (ts == null) continue
        // Pipeline emits either ISO strings or unix-seconds-as-string.
        if (!isParseableTimestamp(ts)) {
          violations.push({
            fixture: id,
            path: `${resultPath}.retrieved_timestamp`,
            detail: `unparseable: ${String(ts)}`,
          })
        }
      }
    }
    expect(violations, formatViolations(violations)).toEqual([])
  })

  it("model card has model_family_id matching pipelineSlugify(model_family_id) → model_route_id", () => {
    const violations: Violation[] = []
    for (const { id, data } of models) {
      if (!data.model_family_id) {
        violations.push({ fixture: id, path: "model_family_id", detail: "missing" })
        continue
      }
      // Expected route id: the family id with every "/" replaced by "__".
      const expected = data.model_family_id.replace(/\//g, "__")
      if (data.model_route_id !== expected) {
        violations.push({
          fixture: id,
          path: "model_route_id",
          detail: `${data.model_route_id} !== ${expected} (derived from ${data.model_family_id})`,
        })
      }
    }
    expect(violations, formatViolations(violations)).toEqual([])
  })

  it("flattenModelEvaluations output has source_metadata on every evaluation (cross-check)", () => {
    const violations: Violation[] = []
    for (const { id, data } of models) {
      const evaluations = flattenModelEvaluations(data)
      for (const [idx, evalEntry] of evaluations.entries()) {
        if (!evalEntry.source_metadata) {
          violations.push({
            fixture: id,
            path: `flattenModelEvaluations(${id})[${idx}]`,
            detail: "missing source_metadata after flatten",
          })
        }
      }
    }
    expect(violations, formatViolations(violations)).toEqual([])
  })
})
140
+
141
// Contracts over the pinned eval-detail fixture files (fixture group "evals").
describe("Tier A — pipeline contracts (eval-detail files)", () => {
  const evalFixtures = loadAllFixtures<HFEvalDetail>("evals")

  it("every eval-detail has eval_summary_id, benchmark, benchmark_leaf_name", () => {
    const requiredFields = ["eval_summary_id", "benchmark", "benchmark_leaf_name"] as const
    const found: Violation[] = []
    for (const fixture of evalFixtures) {
      for (const field of requiredFields) {
        // Any falsy value (absent, null, empty string) counts as a violation.
        if (fixture.data[field]) continue
        found.push({ fixture: fixture.id, path: field, detail: "missing or empty" })
      }
    }
    expect(found, formatViolations(found)).toEqual([])
  })

  it("every eval-detail has category as a non-empty string", () => {
    const found: Violation[] = []
    for (const { id, data } of evalFixtures) {
      const category = data.category
      if (!(typeof category === "string" && category.length !== 0)) {
        found.push({ fixture: id, path: "category", detail: `not a non-empty string: ${category}` })
      }
    }
    expect(found, formatViolations(found)).toEqual([])
  })

  it("every model_result in eval-detail metrics carries source_metadata", () => {
    const found: Violation[] = []
    for (const { id, data } of evalFixtures) {
      const metrics = data.metrics ?? []
      metrics.forEach((metric, metricIdx) => {
        const results = metric.model_results ?? []
        results.forEach((modelResult, resultIdx) => {
          if (modelResult.source_metadata) return
          found.push({
            fixture: id,
            path: `metrics[${metricIdx}].model_results[${resultIdx}]`,
            detail: "missing source_metadata",
          })
        })
      })
    }
    expect(found, formatViolations(found)).toEqual([])
  })

  it("every metric has metric_summary_id and metric_name", () => {
    const found: Violation[] = []
    for (const { id, data } of evalFixtures) {
      const metrics = data.metrics ?? []
      metrics.forEach((metric, metricIdx) => {
        // Both fields are checked independently, so one metric can yield two violations.
        const prefix = `metrics[${metricIdx}]`
        if (!metric.metric_summary_id) {
          found.push({ fixture: id, path: `${prefix}.metric_summary_id`, detail: "missing" })
        }
        if (!metric.metric_name) {
          found.push({ fixture: id, path: `${prefix}.metric_name`, detail: "missing" })
        }
      })
    }
    expect(found, formatViolations(found)).toEqual([])
  })
})
199
+
200
// Contracts over the pinned model-card list entries (fixture group "model_cards").
describe("Tier A — pipeline contracts (model card list entries)", () => {
  const cardFixtures = loadAllFixtures<HFModelCardEntry>("model_cards")

  it("model card has model_route_id === pipelineSlugify(model_family_id)", () => {
    const found: Violation[] = []
    for (const { id, data: card } of cardFixtures) {
      const familyId = card.model_family_id
      if (familyId) {
        // Expected route id: the family id with every "/" turned into "__".
        const expectedRouteId = familyId.split("/").join("__")
        if (card.model_route_id !== expectedRouteId) {
          found.push({
            fixture: id,
            path: "model_route_id",
            detail: `${card.model_route_id} !== ${expectedRouteId}`,
          })
        }
      } else {
        // Without a family id there is nothing to derive the route id from.
        found.push({ fixture: id, path: "model_family_id", detail: "missing" })
      }
    }
    expect(found, formatViolations(found)).toEqual([])
  })
})
222
+
223
// Contracts over the pinned developer payloads (fixture group "developers").
describe("Tier A — pipeline contracts (developer files)", () => {
  type DeveloperPayload = { developer: string; models: HFModelCardEntry[] }
  const developerFixtures = loadAllFixtures<DeveloperPayload>("developers")

  it("every developer payload has developer + models[]", () => {
    const found: Violation[] = []
    for (const { id, data: payload } of developerFixtures) {
      if (!payload.developer) {
        found.push({ fixture: id, path: "developer", detail: "missing" })
      }
      if (!Array.isArray(payload.models)) {
        found.push({ fixture: id, path: "models", detail: "not an array" })
      }
    }
    expect(found, formatViolations(found)).toEqual([])
  })

  it("every model in developer.models has model_family_id", () => {
    const found: Violation[] = []
    for (const { id, data: payload } of developerFixtures) {
      // A null/undefined models list is treated as empty here.
      const entries = payload.models ?? []
      entries.forEach((entry, modelIdx) => {
        if (entry.model_family_id) return
        found.push({ fixture: id, path: `models[${modelIdx}].model_family_id`, detail: "missing" })
      })
    }
    expect(found, formatViolations(found)).toEqual([])
  })
})
247
+
248
describe("Tier A — fixture inventory", () => {
  // Two-way consistency check between the fixture directories and the manifest:
  //   - orphan:  a .json file on disk that no manifest entry names;
  //   - missing: a manifest entry whose `<id>.json` file is not on disk.
  // Each manifest group is paired with its on-disk directory name; the two
  // differ for model cards ("model_cards" vs "model-cards").
  it("fixture files match the manifest exactly (no orphans, no missing)", () => {
    const inventory = [
      { group: "evals", dirName: "evals" },
      { group: "models", dirName: "models" },
      { group: "developers", dirName: "developers" },
      { group: "model_cards", dirName: "model-cards" },
    ] as const

    const orphans: string[] = []
    const missing: string[] = []

    for (const { group, dirName } of inventory) {
      const jsonOnDisk = readdirSync(path.join(FIXTURES_DIR, dirName)).filter((name) => name.endsWith(".json"))
      const diskNames = new Set(jsonOnDisk)
      const manifestNames = new Set(fixtureEntries(group).map((entry) => `${entry.id}.json`))

      for (const name of diskNames) {
        if (manifestNames.has(name)) continue
        orphans.push(`${group}/${name}`)
      }
      for (const name of manifestNames) {
        if (diskNames.has(name)) continue
        missing.push(`${group}/${name}`)
      }
    }

    expect({ orphans, missing }).toEqual({ orphans: [], missing: [] })
  })
})
274
+
275
/**
 * Renders contract violations as a multi-line failure message.
 *
 * The message is a count header followed by at most the first ten violations,
 * one per line. When there are more than ten, a trailing "…and N more" entry
 * reports how many were left out.
 *
 * @param violations - Violations collected by a contract test.
 * @returns The formatted message, or "" when there are no violations.
 */
function formatViolations(violations: Violation[]): string {
  const total = violations.length
  if (total === 0) {
    return ""
  }

  const shownLimit = 10
  const lines: string[] = [`\n${total} contract violation(s):`]
  for (const { fixture, path: keyPath, detail } of violations.slice(0, shownLimit)) {
    lines.push(` ${fixture} :: ${keyPath} — ${detail}`)
  }
  // The final element is always appended (empty when nothing was truncated),
  // so the joined message ends with a newline in the non-truncated case.
  lines.push(total > shownLimit ? `\n …and ${total - shownLimit} more` : "")
  return lines.join("\n")
}
tests/upstream-drift.test.ts ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import fs from "fs"
import path from "path"
import { fileURLToPath } from "url"

import { describe, expect, it } from "vitest"

import type { HFEvalDetail, HFEvalModelResult, HFModelDetail } from "../lib/hf-data"

import { listLiveCacheFiles, loadLiveCacheFile, walkHierarchyResults } from "./fixtures/loader"
9
+
10
+ // Tier A — drift detection variant. Runs the same shape of contracts as
11
+ // pipeline-contract.test.ts but against the LIVE .cache/hf-data/ directory
12
+ // rather than pinned fixtures. Opt-in via `pnpm test:drift`. NOT included in
13
+ // the default `pnpm test` run because (a) it requires the cache to be primed
14
+ // and (b) flapping on every upstream refresh defeats the purpose of the pin.
15
+ //
16
+ // Use this when:
17
+ // - You suspect upstream has changed (the existing JSON path behaves oddly).
18
+ // - Before a `pnpm refresh-fixtures` you want to know what the pin will see.
19
+ // - As a periodic sanity check (CI nightly, manual).
20
+
21
// Category keys accepted under `hierarchy_by_category`; the drift check
// lower-cases each key before the lookup, so entries are kept lower-case.
const KNOWN_PIPELINE_CATEGORY_KEYS = new Set<string>([
  "agentic",
  "reasoning",
  "general",
  "safety",
  "knowledge",
  "other",
  "coding",
  "instruction_following",
  "language_understanding",
])

// Accepted values for `source_metadata.evaluator_relationship`.
const VALID_EVALUATOR_RELATIONSHIPS = new Set<string>([
  "first_party",
  "third_party",
  "other",
])
26
+
27
// Live-cache listings are taken up front: their lengths feed both the gate
// below and the suite title.
const modelFiles = listLiveCacheFiles("models")
const evalFiles = listLiveCacheFiles("evals")

// Drift checks stay out of the default `pnpm test` run: they execute only when
// RUN_DRIFT=1 is set (as `pnpm test:drift` does) and the cache holds at least
// one model file and one eval file.
const driftRequested = process.env.RUN_DRIFT === "1"
const cachePopulated = modelFiles.length > 0 && evalFiles.length > 0
const shouldRun = driftRequested && cachePopulated
33
+
34
// Live-cache variant of the pinned contracts. The whole suite is skipped unless
// `shouldRun` is true. Each check counts violations and keeps at most five
// examples, so a failure message stays readable on a large cache.
describe.skipIf(!shouldRun)(`Tier A drift — live cache contracts (${modelFiles.length} models, ${evalFiles.length} evals)`, () => {
  it("every model_result in every model file carries source_metadata", () => {
    let scanned = 0
    let violations = 0
    const examples: string[] = []
    for (const file of modelFiles) {
      const data = loadLiveCacheFile<HFModelDetail>("models", file)
      for (const { result, path: resultPath } of walkHierarchyResults<HFEvalModelResult>(data, file)) {
        scanned += 1
        if (!result.source_metadata) {
          violations += 1
          if (examples.length < 5) examples.push(resultPath)
        }
      }
    }
    expect(violations, `${violations}/${scanned} rows lack source_metadata. Examples:\n ${examples.join("\n ")}`).toBe(0)
  })

  it("every hierarchy_by_category key across all models is in PIPELINE_CATEGORY_MAP", () => {
    // Maps each unknown key to the number of model files it appears in.
    const unknown = new Map<string, number>()
    for (const file of modelFiles) {
      const data = loadLiveCacheFile<HFModelDetail>("models", file)
      for (const key of Object.keys(data.hierarchy_by_category ?? {})) {
        if (!KNOWN_PIPELINE_CATEGORY_KEYS.has(key.toLowerCase())) {
          unknown.set(key, (unknown.get(key) ?? 0) + 1)
        }
      }
    }
    const summary = Array.from(unknown.entries()).map(([k, n]) => `${k}=${n}`).join(", ")
    expect(unknown.size, `Unknown keys (key=count): ${summary}`).toBe(0)
  })

  it("every model_result in every eval-detail carries source_metadata", () => {
    let scanned = 0
    let violations = 0
    const examples: string[] = []
    for (const file of evalFiles) {
      const data = loadLiveCacheFile<HFEvalDetail>("evals", file)
      for (const [metricIdx, metric] of (data.metrics ?? []).entries()) {
        for (const [resultIdx, mr] of (metric.model_results ?? []).entries()) {
          scanned += 1
          if (!mr.source_metadata) {
            violations += 1
            if (examples.length < 5) examples.push(`${file} metrics[${metricIdx}].model_results[${resultIdx}]`)
          }
        }
      }
    }
    expect(violations, `${violations}/${scanned} eval-detail rows lack source_metadata. Examples:\n ${examples.join("\n ")}`).toBe(0)
  })

  it("every eval-detail has a non-empty category", () => {
    let violations = 0
    const examples: string[] = []
    for (const file of evalFiles) {
      const data = loadLiveCacheFile<HFEvalDetail>("evals", file)
      if (typeof data.category !== "string" || data.category.length === 0) {
        violations += 1
        if (examples.length < 5) examples.push(file)
      }
    }
    expect(violations, `${violations} eval-details without category. Examples: ${examples.join(", ")}`).toBe(0)
  })

  it("every model card has model_route_id === pipelineSlugify(model_family_id)", () => {
    let violations = 0
    const examples: string[] = []
    // Locate the cache relative to this file through import.meta.url, which
    // does not depend on the newer `import.meta.dirname` property being available.
    const testsDir = path.dirname(fileURLToPath(import.meta.url))
    const cardsPath = path.resolve(testsDir, "..", ".cache", "hf-data", "model-cards.json")
    const parsed: unknown = JSON.parse(fs.readFileSync(cardsPath, "utf8"))
    // Fail with a clear message rather than an opaque iteration error if the
    // file's top-level shape has drifted.
    if (!Array.isArray(parsed)) {
      throw new Error(`${cardsPath}: expected a JSON array of model cards, got ${typeof parsed}`)
    }
    const cards: Array<{ model_family_id?: unknown; model_route_id?: unknown } | null> = parsed
    for (const card of cards) {
      const familyId = typeof card?.model_family_id === "string" ? card.model_family_id : ""
      // Expected route id: the family id with every "/" replaced by "__".
      const expected = familyId.replace(/\//g, "__")
      if (card?.model_route_id !== expected) {
        violations += 1
        if (examples.length < 5) examples.push(`${String(card?.model_route_id)} (expected ${expected})`)
      }
    }
    expect(violations, `${violations}/${cards.length} mismatches. Examples: ${examples.join(", ")}`).toBe(0)
  })

  it("every source_metadata.evaluator_relationship is in {first_party, third_party, other}", () => {
    // Maps each unknown relationship value to its number of occurrences.
    const counts = new Map<string, number>()
    for (const file of evalFiles) {
      const data = loadLiveCacheFile<HFEvalDetail>("evals", file)
      for (const metric of data.metrics ?? []) {
        for (const mr of metric.model_results ?? []) {
          const rel = mr.source_metadata?.evaluator_relationship
          // An absent relationship is tolerated; only unknown values are counted.
          if (rel != null && !VALID_EVALUATOR_RELATIONSHIPS.has(rel)) {
            counts.set(rel, (counts.get(rel) ?? 0) + 1)
          }
        }
      }
    }
    const summary = Array.from(counts.entries()).map(([k, n]) => `${k}=${n}`).join(", ")
    expect(counts.size, `Unknown evaluator_relationship values: ${summary}`).toBe(0)
  })
})