evijit HF Staff j-chim committed on
Commit
fe99ffa
·
1 Parent(s): 32864b0

Swap backend data (#3)

Browse files

- Integrate with test backend data (7635aee64606c5b9138e680b833ca1383b570887)
- Drop input_modalities/output_modalities from MODEL_CARD_COLUMNS (bfce8f214eed3054b820176601eaa0a23e31bee7)
- Merge remote-tracking branch 'origin/main' into feat/use-new-backend-data (25ba6d010ff24b92f252f8fe11a6624f68aa6690)
- Use model_key as the addressable identifier and wire comparison-index sidecar (0e529dce5eb243708739730f1fcec00d27202d71)


Co-authored-by: Jenny Chim <j-chim@users.noreply.huggingface.co>

Dockerfile CHANGED
@@ -9,20 +9,18 @@ ARG PNPM_VERSION=10.25.0
9
 
10
  # Build-time data-source configuration. HF Spaces "Variables" are NOT injected
11
  # into Docker RUN steps automatically — only into the final runtime — so we
12
- # bake the DuckDB-mode defaults here. `cache-hf-data.mjs` reads these to know
13
- # which dataset to clone and to apply lean cache mode (skip JSON-fallback
14
- # artifacts). Override at build time via `--build-arg HF_DATASET_REPO=...`.
15
- ARG DATA_BACKEND=duckdb
16
  ARG HF_DATASET_REPO=https://huggingface.co/datasets/evaleval/card_backend
17
- # Static prerender (`next build`) executes route handlers, which call
18
- # `getModelCards` etc. `lib/duckdb-data.ts`, which requires
19
- # `LOCAL_PIPELINE_OUTPUT`. The cache populated by `cache-hf-data.mjs`
20
- # lives at `/app/.cache/hf-data`. `HF_DATA_OFFLINE=1` keeps the metadata
21
- # fetchers (`lib/hf-data.ts`) from attempting `evaleval/card_backend`
22
- # network reads with `revalidate: 0` (which Next 15 treats as dynamic
23
- # and fails the static export of `/`).
24
  ENV DATA_BACKEND=${DATA_BACKEND} \
25
  HF_DATASET_REPO=${HF_DATASET_REPO} \
 
26
  LOCAL_PIPELINE_OUTPUT=/app/.cache/hf-data \
27
  HF_DATA_LOCAL_DIR=/app/.cache/hf-data \
28
  HF_DATA_OFFLINE=1
@@ -49,13 +47,15 @@ RUN pnpm run build
49
  FROM node:18-bullseye-slim AS runner
50
  WORKDIR /app
51
 
52
- # Runtime needs the same DuckDB-mode envs that the builder used. HF Space
53
- # Variables aren't set on this Space, and Docker multi-stage doesn't carry
54
- # ENVs across stages — without these, lib/duckdb-data.ts throws
55
- # "DATA_BACKEND=duckdb requires LOCAL_PIPELINE_OUTPUT" at request time and
56
- # every model/eval/developer endpoint returns empty.
 
57
  ENV NODE_ENV=production \
58
- DATA_BACKEND=duckdb \
 
59
  LOCAL_PIPELINE_OUTPUT=/app/.cache/hf-data \
60
  HF_DATA_LOCAL_DIR=/app/.cache/hf-data \
61
  HF_DATA_OFFLINE=1
 
9
 
10
  # Build-time data-source configuration. HF Spaces "Variables" are NOT injected
11
  # into Docker RUN steps automatically — only into the final runtime — so we
12
+ # bake the selected backend here. `DATA_BACKEND=v2` reads `SNAPSHOT_URL`
13
+ # directly; legacy DuckDB mode still clones `HF_DATASET_REPO` into the cache.
14
+ # Override at build time via `--build-arg ...`.
15
+ ARG DATA_BACKEND=v2
16
  ARG HF_DATASET_REPO=https://huggingface.co/datasets/evaleval/card_backend
17
+ ARG SNAPSHOT_URL=https://huggingface.co/datasets/j-chim/temp_evalcard_backend/resolve/main/warehouse/2026-05-03T21-46-50Z
18
+ # Static prerender (`next build`) executes route handlers. In legacy mode the
19
+ # cache populated by `cache-hf-data.mjs` lives at `/app/.cache/hf-data`; in v2
20
+ # the cache step is skipped and the app reads the pinned Stage J snapshot.
 
 
 
21
  ENV DATA_BACKEND=${DATA_BACKEND} \
22
  HF_DATASET_REPO=${HF_DATASET_REPO} \
23
+ SNAPSHOT_URL=${SNAPSHOT_URL} \
24
  LOCAL_PIPELINE_OUTPUT=/app/.cache/hf-data \
25
  HF_DATA_LOCAL_DIR=/app/.cache/hf-data \
26
  HF_DATA_OFFLINE=1
 
47
  FROM node:18-bullseye-slim AS runner
48
  WORKDIR /app
49
 
50
+ ARG DATA_BACKEND=v2
51
+ ARG SNAPSHOT_URL=https://huggingface.co/datasets/j-chim/temp_evalcard_backend/resolve/main/warehouse/2026-05-03T21-46-50Z
52
+
53
+ # Runtime needs the same data-source envs that the builder used. Docker
54
+ # multi-stage doesn't carry ENVs across stages, so keep backend selection and
55
+ # snapshot/cache pointers explicit here too.
56
  ENV NODE_ENV=production \
57
+ DATA_BACKEND=${DATA_BACKEND} \
58
+ SNAPSHOT_URL=${SNAPSHOT_URL} \
59
  LOCAL_PIPELINE_OUTPUT=/app/.cache/hf-data \
60
  HF_DATA_LOCAL_DIR=/app/.cache/hf-data \
61
  HF_DATA_OFFLINE=1
app/page.tsx CHANGED
@@ -244,7 +244,7 @@ export default async function HomePage() {
244
  <p className="mx-auto mt-2 max-w-2xl text-sm leading-6 text-[color:var(--fg-muted)]">
245
  The current backend snapshot does not include{" "}
246
  <code className="rounded-sm bg-[color:var(--bg-surface)] px-1.5 py-0.5 font-mono text-xs">
247
- corpus-aggregates.json
248
  </code>
249
  . When it does, this section will render the four corpus-level rollups.
250
  </p>
 
244
  <p className="mx-auto mt-2 max-w-2xl text-sm leading-6 text-[color:var(--fg-muted)]">
245
  The current backend snapshot does not include{" "}
246
  <code className="rounded-sm bg-[color:var(--bg-surface)] px-1.5 py-0.5 font-mono text-xs">
247
+ headline.json
248
  </code>
249
  . When it does, this section will render the four corpus-level rollups.
250
  </p>
components/signals/corpus-dashboard.tsx CHANGED
@@ -20,7 +20,7 @@ import {
20
  formatPercent,
21
  } from "./signal-utils"
22
 
23
- const CATEGORY_ORDER = ["agentic", "general", "knowledge", "reasoning", "safety", "other"]
24
 
25
  const SOURCE_COLORS: Record<string, string> = {
26
  first_party: "bg-amber-500",
@@ -51,13 +51,21 @@ export function CorpusDashboard({
51
  }, [mode])
52
 
53
  const categoryKeys = useMemo(
54
- () =>
55
- CATEGORY_ORDER.filter((category) =>
56
- aggregates.reproducibility.by_category[category] ||
57
- aggregates.completeness.by_category[category] ||
58
- aggregates.provenance.by_category[category] ||
59
- aggregates.comparability.by_category[category]
60
- ),
 
 
 
 
 
 
 
 
61
  [aggregates]
62
  )
63
 
@@ -190,25 +198,14 @@ function CompletenessSection({
190
  icon={<ClipboardCheck className="h-5 w-5" />}
191
  title="Reporting Completeness"
192
  subtitle="How much benchmark documentation is populated."
193
- headline={formatPercent(block.completeness_score_mean)}
194
- headlineLabel={`Median ${formatPercent(block.completeness_score_median)} across ${block.total_benchmarks.toLocaleString()} benchmarks`}
195
  >
196
  {scores.length > 0 && <Histogram scores={scores} />}
197
- <div className="mt-4 grid gap-2">
198
- {Object.entries(block.per_field_population).slice(0, 10).map(([field, value]) => (
199
- <div key={field} className="rounded-xl border border-border/60 bg-background px-3 py-2">
200
- <div className="flex items-start justify-between gap-3 text-sm">
201
- <span className="font-medium">{formatFieldLabel(field)}</span>
202
- <span className="shrink-0 tabular-nums text-muted-foreground">
203
- {formatPercent(value.mean_score)}
204
- </span>
205
- </div>
206
- <div className="mt-2 grid gap-1.5">
207
- <MetricBar label="Any data" value={value.populated_rate} compact />
208
- <MetricBar label="Fully populated" value={value.fully_populated_rate} compact />
209
- </div>
210
- </div>
211
- ))}
212
  </div>
213
  </DashboardSection>
214
  )
@@ -217,14 +214,16 @@ function CompletenessSection({
217
  function ProvenanceSection({ block }: { block: ProvenanceCorpusBlock }) {
218
  const distribution = block.source_type_distribution
219
  const total = Object.values(distribution).reduce((sum, value) => sum + value, 0)
 
 
220
 
221
  return (
222
  <DashboardSection
223
  icon={<BarChart3 className="h-5 w-5" />}
224
  title="Provenance"
225
  subtitle="Who reported the scores, and whether groups have multiple sources."
226
- headline={formatPercent(block.multi_source_rate)}
227
- headlineLabel="of (model, benchmark, metric) groups have multiple reporting sources"
228
  >
229
  <div className="overflow-hidden rounded-full border border-border/70 bg-muted/30">
230
  <div className="flex h-4 w-full">
@@ -240,34 +239,40 @@ function ProvenanceSection({ block }: { block: ProvenanceCorpusBlock }) {
240
  </div>
241
 
242
  <div className="mt-3 grid gap-2 sm:grid-cols-2">
243
- <RatioTile label="Multi-source groups" value={block.multi_source_rate} count={block.multi_source_groups} />
244
- <RatioTile label="First-party only groups" value={block.first_party_only_rate} count={block.first_party_only_groups} />
245
  </div>
246
  </DashboardSection>
247
  )
248
  }
249
 
250
  function ComparabilitySection({ block }: { block: ComparabilityCorpusBlock }) {
 
 
 
 
 
 
251
  return (
252
  <DashboardSection
253
  icon={<GitCompareArrows className="h-5 w-5" />}
254
  title="Comparability"
255
  subtitle="Eligible groups where scores diverge across setups or reporting organizations."
256
- headline={formatNullableRate(block.variant_divergence_rate)}
257
- headlineLabel={`${block.variant_divergent_groups.toLocaleString()} of ${block.variant_eligible_groups.toLocaleString()} setup-eligible groups diverge`}
258
  >
259
  <div className="grid gap-3 md:grid-cols-2">
260
  <ComparabilityRateCard
261
  title="Variant divergence"
262
- rate={block.variant_divergence_rate}
263
- eligible={block.variant_eligible_groups}
264
- divergent={block.variant_divergent_groups}
265
  />
266
  <ComparabilityRateCard
267
  title="Cross-party divergence"
268
- rate={block.cross_party_divergence_rate}
269
- eligible={block.cross_party_eligible_groups}
270
- divergent={block.cross_party_divergent_groups}
271
  />
272
  </div>
273
  </DashboardSection>
@@ -288,6 +293,15 @@ function CategoryPanel({
288
  comparability?: ComparabilityCorpusBlock
289
  }) {
290
  const categoryLabel = `${category.charAt(0).toUpperCase()}${category.slice(1)}`
 
 
 
 
 
 
 
 
 
291
 
292
  return (
293
  <section className="rounded-2xl border border-border/70 bg-card p-4 shadow-sm">
@@ -297,11 +311,11 @@ function CategoryPanel({
297
  </div>
298
  <div className="grid gap-3 sm:grid-cols-2">
299
  <MiniMetric label="Reproducibility gaps" value={formatPercent(reproducibility?.reproducibility_gap_rate)} />
300
- <MiniMetric label="Documentation mean" value={formatPercent(completeness?.completeness_score_mean)} />
301
- <MiniMetric label="Multi-source groups" value={formatPercent(provenance?.multi_source_rate)} />
302
- <MiniMetric label="Variant divergence" value={formatNullableRate(comparability?.variant_divergence_rate)} />
303
  </div>
304
- {comparability?.cross_party_divergence_rate == null && (
305
  <div className="mt-3 rounded-xl border border-dashed border-border/70 bg-muted/10 px-3 py-2 text-sm text-muted-foreground">
306
  Cross-party divergence: N/A - not enough multi-org coverage.
307
  </div>
@@ -411,7 +425,7 @@ function RatioTile({ label, value, count }: { label: string; value: number | nul
411
  <div className="text-sm font-medium">{label}</div>
412
  <div className="mt-1 flex items-baseline justify-between gap-2">
413
  <span className="text-xl font-semibold tabular-nums">{formatPercent(value)}</span>
414
- <span className="text-xs text-muted-foreground">{count.toLocaleString()} groups</span>
415
  </div>
416
  </div>
417
  )
@@ -463,6 +477,11 @@ function formatNullableRate(value: number | null | undefined) {
463
  return value == null ? "N/A" : formatPercent(value)
464
  }
465
 
 
 
 
 
 
466
  function formatGeneratedDate(value: string) {
467
  const date = new Date(value)
468
  if (Number.isNaN(date.getTime())) {
 
20
  formatPercent,
21
  } from "./signal-utils"
22
 
23
+ const CATEGORY_ORDER = ["Agentic", "General", "Knowledge", "Reasoning", "Safety", "Other"]
24
 
25
  const SOURCE_COLORS: Record<string, string> = {
26
  first_party: "bg-amber-500",
 
51
  }, [mode])
52
 
53
  const categoryKeys = useMemo(
54
+ () => {
55
+ const available = new Set([
56
+ ...Object.keys(aggregates.reproducibility.by_category),
57
+ ...Object.keys(aggregates.completeness.by_category),
58
+ ...Object.keys(aggregates.provenance.by_category),
59
+ ...Object.keys(aggregates.comparability.by_category),
60
+ ])
61
+
62
+ return [
63
+ ...CATEGORY_ORDER.filter((category) => available.has(category)),
64
+ ...Array.from(available)
65
+ .filter((category) => !CATEGORY_ORDER.includes(category))
66
+ .sort((a, b) => a.localeCompare(b)),
67
+ ]
68
+ },
69
  [aggregates]
70
  )
71
 
 
198
  icon={<ClipboardCheck className="h-5 w-5" />}
199
  title="Reporting Completeness"
200
  subtitle="How much benchmark documentation is populated."
201
+ headline={formatPercent(block.completeness_avg)}
202
+ headlineLabel={`Range ${formatPercent(block.completeness_min)} to ${formatPercent(block.completeness_max)} across ${block.total_triples.toLocaleString()} reported score triples`}
203
  >
204
  {scores.length > 0 && <Histogram scores={scores} />}
205
+ <div className="mt-4 grid gap-2 sm:grid-cols-3">
206
+ <MiniMetric label="Minimum" value={formatPercent(block.completeness_min)} />
207
+ <MiniMetric label="Average" value={formatPercent(block.completeness_avg)} />
208
+ <MiniMetric label="Maximum" value={formatPercent(block.completeness_max)} />
 
 
 
 
 
 
 
 
 
 
 
209
  </div>
210
  </DashboardSection>
211
  )
 
214
  function ProvenanceSection({ block }: { block: ProvenanceCorpusBlock }) {
215
  const distribution = block.source_type_distribution
216
  const total = Object.values(distribution).reduce((sum, value) => sum + value, 0)
217
+ const multiSourceRate = rate(block.multi_source_triples, block.total_triples)
218
+ const firstPartyOnlyRate = rate(block.first_party_only_triples, block.total_triples)
219
 
220
  return (
221
  <DashboardSection
222
  icon={<BarChart3 className="h-5 w-5" />}
223
  title="Provenance"
224
  subtitle="Who reported the scores, and whether groups have multiple sources."
225
+ headline={formatPercent(multiSourceRate)}
226
+ headlineLabel="of reported score triples have multiple reporting sources"
227
  >
228
  <div className="overflow-hidden rounded-full border border-border/70 bg-muted/30">
229
  <div className="flex h-4 w-full">
 
239
  </div>
240
 
241
  <div className="mt-3 grid gap-2 sm:grid-cols-2">
242
+ <RatioTile label="Multi-source triples" value={multiSourceRate} count={block.multi_source_triples} />
243
+ <RatioTile label="First-party only triples" value={firstPartyOnlyRate} count={block.first_party_only_triples} />
244
  </div>
245
  </DashboardSection>
246
  )
247
  }
248
 
249
  function ComparabilitySection({ block }: { block: ComparabilityCorpusBlock }) {
250
+ const variantRate = rate(block.variant_divergent_count, block.groups_with_variant_check)
251
+ const crossPartyRate = rate(
252
+ block.cross_party_divergent_count,
253
+ block.groups_with_cross_party_check
254
+ )
255
+
256
  return (
257
  <DashboardSection
258
  icon={<GitCompareArrows className="h-5 w-5" />}
259
  title="Comparability"
260
  subtitle="Eligible groups where scores diverge across setups or reporting organizations."
261
+ headline={formatNullableRate(variantRate)}
262
+ headlineLabel={`${block.variant_divergent_count.toLocaleString()} of ${block.groups_with_variant_check.toLocaleString()} setup-eligible groups diverge`}
263
  >
264
  <div className="grid gap-3 md:grid-cols-2">
265
  <ComparabilityRateCard
266
  title="Variant divergence"
267
+ rate={variantRate}
268
+ eligible={block.groups_with_variant_check}
269
+ divergent={block.variant_divergent_count}
270
  />
271
  <ComparabilityRateCard
272
  title="Cross-party divergence"
273
+ rate={crossPartyRate}
274
+ eligible={block.groups_with_cross_party_check}
275
+ divergent={block.cross_party_divergent_count}
276
  />
277
  </div>
278
  </DashboardSection>
 
293
  comparability?: ComparabilityCorpusBlock
294
  }) {
295
  const categoryLabel = `${category.charAt(0).toUpperCase()}${category.slice(1)}`
296
+ const multiSourceRate = rate(provenance?.multi_source_triples, provenance?.total_triples)
297
+ const variantRate = rate(
298
+ comparability?.variant_divergent_count,
299
+ comparability?.groups_with_variant_check
300
+ )
301
+ const crossPartyRate = rate(
302
+ comparability?.cross_party_divergent_count,
303
+ comparability?.groups_with_cross_party_check
304
+ )
305
 
306
  return (
307
  <section className="rounded-2xl border border-border/70 bg-card p-4 shadow-sm">
 
311
  </div>
312
  <div className="grid gap-3 sm:grid-cols-2">
313
  <MiniMetric label="Reproducibility gaps" value={formatPercent(reproducibility?.reproducibility_gap_rate)} />
314
+ <MiniMetric label="Documentation mean" value={formatPercent(completeness?.completeness_avg)} />
315
+ <MiniMetric label="Multi-source triples" value={formatPercent(multiSourceRate)} />
316
+ <MiniMetric label="Variant divergence" value={formatNullableRate(variantRate)} />
317
  </div>
318
+ {crossPartyRate == null && (
319
  <div className="mt-3 rounded-xl border border-dashed border-border/70 bg-muted/10 px-3 py-2 text-sm text-muted-foreground">
320
  Cross-party divergence: N/A - not enough multi-org coverage.
321
  </div>
 
425
  <div className="text-sm font-medium">{label}</div>
426
  <div className="mt-1 flex items-baseline justify-between gap-2">
427
  <span className="text-xl font-semibold tabular-nums">{formatPercent(value)}</span>
428
+ <span className="text-xs text-muted-foreground">{count.toLocaleString()} triples</span>
429
  </div>
430
  </div>
431
  )
 
477
  return value == null ? "N/A" : formatPercent(value)
478
  }
479
 
480
/**
 * Safe ratio helper for the corpus rollups rendered in this file.
 *
 * @param numerator - Count of matching items; may be absent.
 * @param denominator - Size of the population the count is drawn from; may be absent.
 * @returns `numerator / denominator`, or `null` when either operand is missing
 *   or not a finite number, or when the denominator is zero or negative.
 */
function rate(
  numerator: number | null | undefined,
  denominator: number | null | undefined
): number | null {
  if (numerator == null || denominator == null) return null
  // NaN fails every comparison, so `denominator <= 0` alone would let it
  // through and the result would be rendered as a NaN percentage.
  if (!Number.isFinite(numerator) || !Number.isFinite(denominator)) return null
  if (denominator <= 0) return null
  return numerator / denominator
}
484
+
485
  function formatGeneratedDate(value: string) {
486
  const date = new Date(value)
487
  if (Number.isNaN(date.getTime())) {
components/signals/corpus-signals-strip.tsx CHANGED
@@ -39,8 +39,13 @@ export function CorpusSignalsStrip({
39
  const tpShare = totalReports > 0 ? prov.source_type_distribution.third_party / totalReports : 0
40
  const fpShare = totalReports > 0 ? prov.source_type_distribution.first_party / totalReports : 0
41
 
42
- const cmpRate = cmp.variant_divergence_rate
43
- const crossPartyAvailable = cmp.cross_party_eligible_groups > 0
 
 
 
 
 
44
 
45
  return (
46
  <div className="signals-grid">
@@ -58,29 +63,29 @@ export function CorpusSignalsStrip({
58
  />
59
  <SignalTile
60
  id="completeness"
61
- statValue={pctNum(comp.completeness_score_mean)}
62
  statUnit="%"
63
- headline={`mean across ${comp.total_benchmarks.toLocaleString()} benchmarks (median ${formatPct(comp.completeness_score_median)}).`}
64
- detail="Source-provenance fields populate fully; preregistration fields are unmet."
65
  asks="Is the benchmark itself documented well enough to interpret a score on it?"
66
  />
67
  <SignalTile
68
  id="provenance"
69
- statValue={pctNum(prov.multi_source_rate)}
70
  statUnit="%"
71
- headline="of (model, benchmark) groups have reports from more than one party."
72
- detail={`${formatPct(tpShare)} third-party, ${formatPct(fpShare)} first-party of ${totalReports.toLocaleString()} results.`}
73
  asks="Who reported this score, and have others reproduced it?"
74
  />
75
  <SignalTile
76
  id="comparability"
77
  statValue={pctNum(cmpRate)}
78
  statUnit="%"
79
- headline={`of setup-eligible groups diverge across variants (${cmp.variant_divergent_groups.toLocaleString()} of ${cmp.variant_eligible_groups.toLocaleString()}).`}
80
  detail={
81
  crossPartyAvailable
82
- ? `Cross-party divergence: ${formatPct(cmp.cross_party_divergence_rate)}.`
83
- : "Cross-party divergence not yet computable too few multi-org reports."
84
  }
85
  asks="Are scores on the same benchmark actually measuring the same thing?"
86
  />
@@ -154,6 +159,11 @@ function formatPct(value: number | null | undefined): string {
154
  return `${Math.round(value * 100)}%`
155
  }
156
 
 
 
 
 
 
157
  const FIELD_LABELS: Record<string, string> = {
158
  temperature: "temperature",
159
  max_tokens: "max tokens",
 
39
  const tpShare = totalReports > 0 ? prov.source_type_distribution.third_party / totalReports : 0
40
  const fpShare = totalReports > 0 ? prov.source_type_distribution.first_party / totalReports : 0
41
 
42
+ const multiSourceRate = rate(prov.multi_source_triples, prov.total_triples)
43
+ const cmpRate = rate(cmp.variant_divergent_count, cmp.groups_with_variant_check)
44
+ const crossPartyRate = rate(
45
+ cmp.cross_party_divergent_count,
46
+ cmp.groups_with_cross_party_check
47
+ )
48
+ const crossPartyAvailable = cmp.groups_with_cross_party_check > 0
49
 
50
  return (
51
  <div className="signals-grid">
 
63
  />
64
  <SignalTile
65
  id="completeness"
66
+ statValue={pctNum(comp.completeness_avg)}
67
  statUnit="%"
68
+ headline={`mean across ${comp.total_triples.toLocaleString()} reported score triples.`}
69
+ detail={`Observed range: ${formatPct(comp.completeness_min)} to ${formatPct(comp.completeness_max)}.`}
70
  asks="Is the benchmark itself documented well enough to interpret a score on it?"
71
  />
72
  <SignalTile
73
  id="provenance"
74
+ statValue={pctNum(multiSourceRate)}
75
  statUnit="%"
76
+ headline="of reported score triples have reports from more than one party."
77
+ detail={`${formatPct(tpShare)} third-party, ${formatPct(fpShare)} first-party of ${totalReports.toLocaleString()} triples.`}
78
  asks="Who reported this score, and have others reproduced it?"
79
  />
80
  <SignalTile
81
  id="comparability"
82
  statValue={pctNum(cmpRate)}
83
  statUnit="%"
84
+ headline={`of setup-eligible groups diverge across variants (${cmp.variant_divergent_count.toLocaleString()} of ${cmp.groups_with_variant_check.toLocaleString()}).`}
85
  detail={
86
  crossPartyAvailable
87
+ ? `Cross-party divergence: ${formatPct(crossPartyRate)}.`
88
+ : "Cross-party divergence not yet computable: too few multi-org reports."
89
  }
90
  asks="Are scores on the same benchmark actually measuring the same thing?"
91
  />
 
159
  return `${Math.round(value * 100)}%`
160
  }
161
 
162
/**
 * Safe ratio helper for the headline tiles rendered in this file.
 *
 * @param numerator - Count of matching items; may be absent.
 * @param denominator - Size of the population the count is drawn from; may be absent.
 * @returns `numerator / denominator`, or `null` when either operand is missing
 *   or not a finite number, or when the denominator is zero or negative.
 */
function rate(
  numerator: number | null | undefined,
  denominator: number | null | undefined
): number | null {
  if (numerator == null || denominator == null) return null
  // NaN fails every comparison, so `denominator <= 0` alone would let it
  // through and the result would be rendered as a NaN percentage.
  if (!Number.isFinite(numerator) || !Number.isFinite(denominator)) return null
  if (denominator <= 0) return null
  return numerator / denominator
}
166
+
167
  const FIELD_LABELS: Record<string, string> = {
168
  temperature: "temperature",
169
  max_tokens: "max tokens",
lib/backend-artifacts.ts CHANGED
@@ -12,6 +12,7 @@ export interface BackendManifest {
12
  skipped_config_count?: number
13
  summary_artifacts?: {
14
  corpus_aggregates?: string
 
15
  [key: string]: string | undefined
16
  }
17
  }
@@ -177,6 +178,27 @@ export interface CorpusAggregates {
177
  completeness: Stratified<CompletenessCorpusBlock>
178
  provenance: Stratified<ProvenanceCorpusBlock>
179
  comparability: Stratified<ComparabilityCorpusBlock>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
180
  }
181
 
182
  export interface Stratified<T> {
@@ -198,35 +220,25 @@ export interface ReproducibilityCorpusBlock {
198
  }
199
 
200
  export interface CompletenessCorpusBlock {
201
- total_benchmarks: number
202
- completeness_score_mean: number | null
203
- completeness_score_median: number | null
204
- per_field_population: Record<string, {
205
- mean_score: number
206
- populated_rate: number
207
- fully_populated_rate: number
208
- benchmark_count: number
209
- }>
210
  }
211
 
212
  export interface ProvenanceCorpusBlock {
213
  total_triples: number
214
- total_groups: number
215
- multi_source_groups: number
216
- multi_source_rate: number | null
217
- first_party_only_groups: number
218
- first_party_only_rate: number | null
219
  source_type_distribution: Record<ProvenanceSourceType, number>
220
  }
221
 
222
  export interface ComparabilityCorpusBlock {
223
- total_groups: number
224
- variant_eligible_groups: number
225
- variant_divergent_groups: number
226
- variant_divergence_rate: number | null
227
- cross_party_eligible_groups: number
228
- cross_party_divergent_groups: number
229
- cross_party_divergence_rate: number | null
230
  }
231
 
232
  export interface HierarchyTags {
 
12
  skipped_config_count?: number
13
  summary_artifacts?: {
14
  corpus_aggregates?: string
15
+ eval_hierarchy?: string
16
  [key: string]: string | undefined
17
  }
18
  }
 
178
  completeness: Stratified<CompletenessCorpusBlock>
179
  provenance: Stratified<ProvenanceCorpusBlock>
180
  comparability: Stratified<ComparabilityCorpusBlock>
181
+ developers?: DeveloperListEntry[]
182
+ families?: Array<{
183
+ family_key: string
184
+ display_name: string
185
+ model_count: number
186
+ eval_count: number
187
+ }>
188
+ categories?: Array<{
189
+ category: string
190
+ model_count: number
191
+ eval_count: number
192
+ }>
193
+ }
194
+
195
+ export interface DeveloperListEntry {
196
+ developer: string
197
+ route_id: string
198
+ model_count: number
199
+ benchmark_count: number
200
+ evaluation_count: number
201
+ popular_evals: Array<{ benchmark: string; model_count: number }>
202
  }
203
 
204
  export interface Stratified<T> {
 
220
  }
221
 
222
  export interface CompletenessCorpusBlock {
223
+ total_triples: number
224
+ completeness_avg: number | null
225
+ completeness_min: number | null
226
+ completeness_max: number | null
 
 
 
 
 
227
  }
228
 
229
  export interface ProvenanceCorpusBlock {
230
  total_triples: number
231
+ multi_source_triples: number
232
+ first_party_only_triples: number
 
 
 
233
  source_type_distribution: Record<ProvenanceSourceType, number>
234
  }
235
 
236
  export interface ComparabilityCorpusBlock {
237
+ total_triples: number
238
+ variant_divergent_count: number
239
+ cross_party_divergent_count: number
240
+ groups_with_variant_check: number
241
+ groups_with_cross_party_check: number
 
 
242
  }
243
 
244
  export interface HierarchyTags {
lib/benchmark-schema.ts CHANGED
@@ -124,6 +124,7 @@ export interface ScoreDetails {
124
  }
125
 
126
  export interface GenerationConfig {
 
127
  generation_args?: {
128
  temperature?: number
129
  top_p?: number
 
124
  }
125
 
126
  export interface GenerationConfig {
127
+ num_few_shot?: number
128
  generation_args?: {
129
  temperature?: number
130
  top_p?: number
lib/data-backend.ts CHANGED
@@ -1,49 +1,140 @@
1
  import "server-only"
2
 
3
- import {
4
- getDashboardDataFromDuckDB,
5
- getModelCardsFromDuckDB,
6
- getModelCardsLiteFromDuckDB,
7
- getEvalListDataFromDuckDB,
8
- getEvalListLiteDataFromDuckDB,
9
- getEvalListFromDuckDB,
10
- getDeveloperListFromDuckDB,
11
- getDeveloperSummaryByIdFromDuckDB,
12
- getModelSummaryByIdFromDuckDB,
13
- getEvalSummaryByIdFromDuckDB,
14
- } from "@/lib/duckdb-data"
15
  import { normalizeEvalSummary } from "@/lib/eval-processing"
16
- import {
17
- fetchBackendManifest,
18
- fetchBackendManifestStatus,
19
- fetchEvalHierarchy,
20
- } from "@/lib/hf-data"
21
-
22
- export const getDashboardData = getDashboardDataFromDuckDB
23
- export const getModelCards = getModelCardsFromDuckDB
24
- export const getModelCardsLite = getModelCardsLiteFromDuckDB
25
- export const getEvalListData = getEvalListDataFromDuckDB
26
- export const getEvalListLiteData = getEvalListLiteDataFromDuckDB
27
- export const getEvalList = getEvalListFromDuckDB
28
- export const getDeveloperList = getDeveloperListFromDuckDB
29
- export const getDeveloperSummaryById = getDeveloperSummaryByIdFromDuckDB
30
- export const getModelSummaryById = getModelSummaryByIdFromDuckDB
31
-
32
- /**
33
- * Eval summary lookups go through `normalizeEvalSummary` so derivable but
34
- * sometimes-blank fields (currently `instance_data`) are reconciled from
35
- * `model_results` before they reach any consumer. The strict pass-through
36
- * contract of `duckdb-data.ts` stays intact — reconciliation of known
37
- * upstream gaps belongs in this thin adapter layer.
38
- */
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  export async function getEvalSummaryById(evalId: string) {
40
- const summary = await getEvalSummaryByIdFromDuckDB(evalId)
 
 
 
 
41
  return summary ? normalizeEvalSummary(summary) : summary
42
  }
43
 
44
- // Metadata-style artifacts are still read through the existing JSON/HF path.
45
- // They are not request-time processing hotspots and the DuckDB shadow doesn't
46
- // re-shape them, so calling lib/hf-data directly avoids needless indirection.
47
- export const getBackendManifestData = fetchBackendManifest
48
- export const getBackendManifestStatusData = fetchBackendManifestStatus
49
- export const getEvalHierarchyData = fetchEvalHierarchy
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import "server-only"
2
 
3
+ import type { BackendManifestStatus } from "@/lib/backend-artifacts"
 
 
 
 
 
 
 
 
 
 
 
4
  import { normalizeEvalSummary } from "@/lib/eval-processing"
5
+
6
+ const BACKEND_VERSION = process.env.DATA_BACKEND?.trim().toLowerCase() ?? "duckdb"
7
+
8
+ function useViewLayerBackend() {
9
+ return BACKEND_VERSION === "v2" || BACKEND_VERSION === "stage-j"
10
+ }
11
+
12
+ async function legacyBackend() {
13
+ return import("@/lib/duckdb-data")
14
+ }
15
+
16
+ async function viewBackend() {
17
+ return import("@/lib/view-data")
18
+ }
19
+
20
+ async function sidecars() {
21
+ return import("@/lib/sidecars")
22
+ }
23
+
24
+ async function hfData() {
25
+ return import("@/lib/hf-data")
26
+ }
27
+
28
+ export async function getModelCards() {
29
+ if (useViewLayerBackend()) {
30
+ return (await viewBackend()).getModelCards()
31
+ }
32
+
33
+ return (await legacyBackend()).getModelCardsFromDuckDB()
34
+ }
35
+
36
+ export async function getModelCardsLite() {
37
+ if (useViewLayerBackend()) {
38
+ return (await viewBackend()).getModelCardsLite()
39
+ }
40
+
41
+ return (await legacyBackend()).getModelCardsLiteFromDuckDB()
42
+ }
43
+
44
+ export async function getEvalListData() {
45
+ if (useViewLayerBackend()) {
46
+ return (await viewBackend()).getEvalListData()
47
+ }
48
+
49
+ return (await legacyBackend()).getEvalListDataFromDuckDB()
50
+ }
51
+
52
+ export async function getEvalListLiteData() {
53
+ if (useViewLayerBackend()) {
54
+ return (await viewBackend()).getEvalListLiteData()
55
+ }
56
+
57
+ return (await legacyBackend()).getEvalListLiteDataFromDuckDB()
58
+ }
59
+
60
+ export async function getEvalList() {
61
+ if (useViewLayerBackend()) {
62
+ return (await viewBackend()).getEvalList()
63
+ }
64
+
65
+ return (await legacyBackend()).getEvalListFromDuckDB()
66
+ }
67
+
68
+ export async function getDashboardData() {
69
+ if (useViewLayerBackend()) {
70
+ return (await viewBackend()).getDashboardData()
71
+ }
72
+
73
+ return (await legacyBackend()).getDashboardDataFromDuckDB()
74
+ }
75
+
76
+ export async function getDeveloperList() {
77
+ if (useViewLayerBackend()) {
78
+ return (await viewBackend()).getDeveloperList()
79
+ }
80
+
81
+ return (await legacyBackend()).getDeveloperListFromDuckDB()
82
+ }
83
+
84
+ export async function getDeveloperSummaryById(routeId: string) {
85
+ if (useViewLayerBackend()) {
86
+ return (await viewBackend()).getDeveloperSummaryById(routeId)
87
+ }
88
+
89
+ return (await legacyBackend()).getDeveloperSummaryByIdFromDuckDB(routeId)
90
+ }
91
+
92
+ export async function getModelSummaryById(modelId: string) {
93
+ if (useViewLayerBackend()) {
94
+ return (await viewBackend()).getModelSummaryById(modelId)
95
+ }
96
+
97
+ return (await legacyBackend()).getModelSummaryByIdFromDuckDB(modelId)
98
+ }
99
+
100
  export async function getEvalSummaryById(evalId: string) {
101
+ if (useViewLayerBackend()) {
102
+ return (await viewBackend()).getEvalSummaryById(evalId)
103
+ }
104
+
105
+ const summary = await (await legacyBackend()).getEvalSummaryByIdFromDuckDB(evalId)
106
  return summary ? normalizeEvalSummary(summary) : summary
107
  }
108
 
109
+ export async function getBackendManifestData() {
110
+ if (useViewLayerBackend()) {
111
+ return (await sidecars()).fetchManifest()
112
+ }
113
+
114
+ return (await hfData()).fetchBackendManifest()
115
+ }
116
+
117
+ export async function getBackendManifestStatusData(): Promise<BackendManifestStatus> {
118
+ if (useViewLayerBackend()) {
119
+ const manifest = await (await sidecars()).fetchManifest()
120
+ return {
121
+ currentManifest: manifest,
122
+ latestManifest: manifest,
123
+ currentManifestSignature: manifest.generated_at,
124
+ latestManifestSignature: manifest.generated_at,
125
+ updateAvailable: false,
126
+ refreshing: false,
127
+ pendingRefreshCount: 0,
128
+ }
129
+ }
130
+
131
+ return (await hfData()).fetchBackendManifestStatus()
132
+ }
133
+
134
+ export async function getEvalHierarchyData() {
135
+ if (useViewLayerBackend()) {
136
+ return (await sidecars()).fetchHierarchy()
137
+ }
138
+
139
+ return (await hfData()).fetchEvalHierarchy()
140
+ }
lib/duckdb.ts ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import "server-only"
2
+
3
+ import { DuckDBConnection } from "@duckdb/node-api"
4
+
5
// Module-level memo for the shared DuckDB connection; null until getConnection() first runs.
+ let connectionPromise: Promise<DuckDBConnection> | null = null
6
+
7
/**
 * Reads the snapshot base URL from the `SNAPSHOT_URL` environment variable.
 *
 * @returns The trimmed URL with any trailing slashes removed.
 * @throws Error when the variable is unset or blank.
 */
function getSnapshotUrl() {
  const configured = (process.env.SNAPSHOT_URL ?? "").trim()
  if (configured === "") {
    throw new Error("DATA_BACKEND=v2 requires SNAPSHOT_URL to point at a Stage J snapshot directory")
  }

  // Strip trailing slashes so callers can append "/<artifact>" safely.
  return configured.replace(/\/+$/, "")
}
15
+
16
/** Builds the full location of a snapshot artifact by appending `name` to the snapshot base URL. */
function snapshotArtifact(name: string) {
  const base = getSnapshotUrl()
  return `${base}/${name}`
}
19
+
20
/**
 * Renders `value` as a single-quoted SQL string literal, doubling any
 * embedded single quotes.
 */
function sqlString(value: string) {
  const escaped = value.split("'").join("''")
  return "'" + escaped + "'"
}
23
+
24
// Maps each DuckDB view name to the parquet artifact that backs it; getConnection() creates one view per entry.
+ const VIEW_FILES = {
25
+ models_view: "models_view.parquet",
26
+ evals_view: "evals_view.parquet",
27
+ eval_results_view: "eval_results_view.parquet",
28
+ } as const
29
+
30
/**
 * Returns the shared DuckDB connection, creating it on first use.
 *
 * Initialisation creates (or replaces) one view per entry in `VIEW_FILES`,
 * each selecting from the matching parquet artifact under the snapshot URL.
 * Concurrent callers share the same in-flight initialisation.
 *
 * If initialisation fails, the memoized promise is cleared so the next call
 * retries instead of replaying the same rejection for the life of the process.
 *
 * @throws Error when `SNAPSHOT_URL` is missing or DuckDB cannot create a view.
 */
export async function getConnection(): Promise<DuckDBConnection> {
  const existing = connectionPromise
  if (existing) {
    return existing
  }

  const pending = (async () => {
    const connection = await DuckDBConnection.create()

    for (const [viewName, fileName] of Object.entries(VIEW_FILES)) {
      await connection.run(
        `CREATE OR REPLACE VIEW ${viewName} AS SELECT * FROM read_parquet(${sqlString(snapshotArtifact(fileName))})`
      )
    }

    return connection
  })()

  connectionPromise = pending

  // Evict a failed initialisation. The identity check avoids clobbering a
  // newer attempt that may already have replaced this one.
  pending.catch(() => {
    if (connectionPromise === pending) {
      connectionPromise = null
    }
  })

  return pending
}
lib/hf-data.ts CHANGED
@@ -138,6 +138,15 @@ function getManifestSignature(manifest: BackendManifest | null | undefined) {
138
  // reading the same on-disk artifacts cannot diverge mid-test via background
139
  // refresh, and useful generally for offline development.
140
  const OFFLINE = process.env.HF_DATA_OFFLINE === "1"
 
 
 
 
 
 
 
 
 
141
 
142
  async function fetchRemoteJson<T>(relativePath: string): Promise<T> {
143
  if (OFFLINE) {
@@ -423,6 +432,19 @@ async function fetchHFJson<T>(relativePath: string): Promise<T> {
423
  }
424
 
425
  export async function fetchBackendManifestStatus(): Promise<BackendManifestStatus> {
 
 
 
 
 
 
 
 
 
 
 
 
 
426
  const snapshot = await getManifestSnapshot()
427
  const currentManifest = getCurrentManifestFromSnapshot(snapshot)
428
  const currentManifestSignature = getManifestSignature(currentManifest)
@@ -864,14 +886,26 @@ export async function fetchDevelopersList(): Promise<HFDeveloperEntry[]> {
864
  }
865
 
866
  export async function fetchBenchmarkMetadataMap(): Promise<Record<string, BenchmarkCard>> {
 
 
 
 
867
  return fetchHFJson<Record<string, BenchmarkCard>>("benchmark-metadata.json")
868
  }
869
 
870
  export async function fetchBackendManifest(): Promise<BackendManifest> {
 
 
 
 
871
  return fetchHFJson<BackendManifest>("manifest.json")
872
  }
873
 
874
  export async function fetchEvalHierarchy(): Promise<EvalHierarchy> {
 
 
 
 
875
  const raw = await fetchHFJson<EvalHierarchy>("eval-hierarchy.json")
876
  return adaptEvalHierarchy(raw)
877
  }
@@ -971,10 +1005,18 @@ function adaptEvalHierarchy(raw: EvalHierarchy): EvalHierarchy {
971
  }
972
 
973
  export async function fetchComparisonIndex(): Promise<ComparisonIndex> {
 
 
 
 
974
  return fetchHFJson<ComparisonIndex>("comparison-index.json")
975
  }
976
 
977
  export async function fetchCorpusAggregates(): Promise<CorpusAggregates | null> {
 
 
 
 
978
  return fetchHFJsonSafe<CorpusAggregates>("corpus-aggregates.json")
979
  }
980
 
 
138
  // reading the same on-disk artifacts cannot diverge mid-test via background
139
  // refresh, and useful generally for offline development.
140
  const OFFLINE = process.env.HF_DATA_OFFLINE === "1"
141
// DATA_BACKEND env value, trimmed and lower-cased, read once at module load; undefined when the variable is unset.
+ const DATA_BACKEND_VERSION = process.env.DATA_BACKEND?.trim().toLowerCase()
142
+
143
/** True when `DATA_BACKEND` selects the view-layer backend (`v2` or `stage-j`). */
function useViewLayerBackend() {
  switch (DATA_BACKEND_VERSION) {
    case "v2":
    case "stage-j":
      return true
    default:
      return false
  }
}
146
+
147
// Loads the snapshot sidecar module through a dynamic import and resolves to its exports.
+ async function fetchSnapshotSidecars() {
148
+ return import("@/lib/sidecars")
149
+ }
150
 
151
  async function fetchRemoteJson<T>(relativePath: string): Promise<T> {
152
  if (OFFLINE) {
 
432
  }
433
 
434
  export async function fetchBackendManifestStatus(): Promise<BackendManifestStatus> {
435
+ if (useViewLayerBackend()) {
436
+ const manifest = await (await fetchSnapshotSidecars()).fetchManifest()
437
+ return {
438
+ currentManifest: manifest,
439
+ latestManifest: manifest,
440
+ currentManifestSignature: manifest.generated_at,
441
+ latestManifestSignature: manifest.generated_at,
442
+ updateAvailable: false,
443
+ refreshing: false,
444
+ pendingRefreshCount: 0,
445
+ }
446
+ }
447
+
448
  const snapshot = await getManifestSnapshot()
449
  const currentManifest = getCurrentManifestFromSnapshot(snapshot)
450
  const currentManifestSignature = getManifestSignature(currentManifest)
 
886
  }
887
 
888
  export async function fetchBenchmarkMetadataMap(): Promise<Record<string, BenchmarkCard>> {
889
+ if (useViewLayerBackend()) {
890
+ return (await import("@/lib/view-data")).getBenchmarkMetadataMap()
891
+ }
892
+
893
  return fetchHFJson<Record<string, BenchmarkCard>>("benchmark-metadata.json")
894
  }
895
 
896
  export async function fetchBackendManifest(): Promise<BackendManifest> {
897
+ if (useViewLayerBackend()) {
898
+ return (await fetchSnapshotSidecars()).fetchManifest()
899
+ }
900
+
901
  return fetchHFJson<BackendManifest>("manifest.json")
902
  }
903
 
904
  export async function fetchEvalHierarchy(): Promise<EvalHierarchy> {
905
+ if (useViewLayerBackend()) {
906
+ return adaptEvalHierarchy(await (await fetchSnapshotSidecars()).fetchHierarchy())
907
+ }
908
+
909
  const raw = await fetchHFJson<EvalHierarchy>("eval-hierarchy.json")
910
  return adaptEvalHierarchy(raw)
911
  }
 
1005
  }
1006
 
1007
  export async function fetchComparisonIndex(): Promise<ComparisonIndex> {
1008
+ if (useViewLayerBackend()) {
1009
+ return (await fetchSnapshotSidecars()).fetchComparisonIndex()
1010
+ }
1011
+
1012
  return fetchHFJson<ComparisonIndex>("comparison-index.json")
1013
  }
1014
 
1015
  export async function fetchCorpusAggregates(): Promise<CorpusAggregates | null> {
1016
+ if (useViewLayerBackend()) {
1017
+ return (await fetchSnapshotSidecars()).fetchHeadline()
1018
+ }
1019
+
1020
  return fetchHFJsonSafe<CorpusAggregates>("corpus-aggregates.json")
1021
  }
1022
 
lib/sidecars.ts ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import "server-only"
2
+
3
+ import type {
4
+ BackendManifest,
5
+ ComparisonIndex,
6
+ CorpusAggregates,
7
+ EvalHierarchy,
8
+ } from "@/lib/backend-artifacts"
9
+
10
// Module-level memo of sidecar fetch promises, one slot per artifact; replaced wholesale by resetSidecarCacheForTests().
+ let cache: {
11
+ manifest?: Promise<BackendManifest>
12
+ headline?: Promise<CorpusAggregates>
13
+ hierarchy?: Promise<EvalHierarchy>
14
+ comparisonIndex?: Promise<ComparisonIndex>
15
+ } = {}
16
+
17
/**
 * Resolves the snapshot base URL from `SNAPSHOT_URL`.
 *
 * @returns The trimmed value without trailing slashes.
 * @throws Error when the variable is unset or contains only whitespace.
 */
function getSnapshotUrl() {
  const raw = process.env.SNAPSHOT_URL
  const trimmed = raw === undefined ? "" : raw.trim()

  if (trimmed.length === 0) {
    throw new Error("DATA_BACKEND=v2 requires SNAPSHOT_URL to point at a Stage J snapshot directory")
  }

  return trimmed.replace(/\/+$/, "")
}
25
+
26
/** Builds the full URL of a sidecar file by appending `name` to the snapshot base URL. */
function sidecarUrl(name: string) {
  const base = getSnapshotUrl()
  return `${base}/${name}`
}
29
+
30
/**
 * Loads and parses one JSON sidecar from the snapshot location.
 *
 * `file://` URLs are read from disk; anything else goes through `fetch` with
 * a `next.revalidate` hint of 3600. The parsed payload is cast to `T` without
 * validation.
 *
 * @throws Error when the HTTP response status is not OK.
 */
async function fetchJson<T>(name: string): Promise<T> {
  const url = sidecarUrl(name)
  const isLocalFile = url.startsWith("file://")

  if (!isLocalFile) {
    const response = await fetch(url, { next: { revalidate: 3600 } })
    if (response.ok) {
      return (await response.json()) as T
    }
    throw new Error(`Snapshot sidecar fetch failed: ${response.status} ${response.statusText} for ${url}`)
  }

  const { readFile } = await import("fs/promises")
  const text = await readFile(new URL(url), "utf8")
  return JSON.parse(text) as T
}
46
+
47
/**
 * Returns the snapshot `manifest.json`, fetched at most once while the fetch
 * is pending or has succeeded.
 *
 * A failed fetch is evicted from the cache so the next call retries; the
 * rejection still reaches the caller that triggered it.
 */
export function fetchManifest(): Promise<BackendManifest> {
  const cached = cache.manifest
  if (cached) {
    return cached
  }

  const pending = fetchJson<BackendManifest>("manifest.json")
  cache.manifest = pending
  // Identity check: do not clear a newer entry (e.g. one stored after a cache reset).
  pending.catch(() => {
    if (cache.manifest === pending) {
      cache.manifest = undefined
    }
  })
  return pending
}
50
+
51
/**
 * Returns the snapshot `headline.json`, fetched at most once while the fetch
 * is pending or has succeeded.
 *
 * A failed fetch is evicted from the cache so the next call retries; the
 * rejection still reaches the caller that triggered it.
 */
export function fetchHeadline(): Promise<CorpusAggregates> {
  const cached = cache.headline
  if (cached) {
    return cached
  }

  const pending = fetchJson<CorpusAggregates>("headline.json")
  cache.headline = pending
  // Identity check: do not clear a newer entry (e.g. one stored after a cache reset).
  pending.catch(() => {
    if (cache.headline === pending) {
      cache.headline = undefined
    }
  })
  return pending
}
54
+
55
/**
 * Returns the snapshot `hierarchy.json`, fetched at most once while the fetch
 * is pending or has succeeded.
 *
 * A failed fetch is evicted from the cache so the next call retries; the
 * rejection still reaches the caller that triggered it.
 */
export function fetchHierarchy(): Promise<EvalHierarchy> {
  const cached = cache.hierarchy
  if (cached) {
    return cached
  }

  const pending = fetchJson<EvalHierarchy>("hierarchy.json")
  cache.hierarchy = pending
  // Identity check: do not clear a newer entry (e.g. one stored after a cache reset).
  pending.catch(() => {
    if (cache.hierarchy === pending) {
      cache.hierarchy = undefined
    }
  })
  return pending
}
58
+
59
/**
 * Returns the snapshot `comparison-index.json`, fetched at most once while
 * the fetch is pending or has succeeded.
 *
 * A failed fetch is evicted from the cache so the next call retries; the
 * rejection still reaches the caller that triggered it.
 */
export function fetchComparisonIndex(): Promise<ComparisonIndex> {
  const cached = cache.comparisonIndex
  if (cached) {
    return cached
  }

  const pending = fetchJson<ComparisonIndex>("comparison-index.json")
  cache.comparisonIndex = pending
  // Identity check: do not clear a newer entry (e.g. one stored after a cache reset).
  pending.catch(() => {
    if (cache.comparisonIndex === pending) {
      cache.comparisonIndex = undefined
    }
  })
  return pending
}
62
+
63
// Drops every memoized sidecar promise by replacing the cache object, so the next fetch* call hits the source again.
+ export function resetSidecarCacheForTests() {
64
+ cache = {}
65
+ }
lib/view-data.ts ADDED
@@ -0,0 +1,576 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import "server-only"
2
+
3
+ import { getConnection } from "@/lib/duckdb"
4
+ import { fetchHeadline } from "@/lib/sidecars"
5
+ import {
6
+ EVALUATION_CATEGORIES,
7
+ type BenchmarkCard,
8
+ type BenchmarkEvaluation,
9
+ type CategoryType,
10
+ type EvaluationCardData,
11
+ type EvaluationResult,
12
+ type GenerationConfig,
13
+ type MetricConfig,
14
+ type ModelInfo,
15
+ type ModelEvaluationSummary,
16
+ type ModelVariantSummary,
17
+ type ScoreDetails,
18
+ type SourceData,
19
+ type SourceMetadata,
20
+ } from "@/lib/benchmark-schema"
21
+ import type { DeveloperListEntry } from "@/lib/backend-artifacts"
22
+ import type {
23
+ BenchmarkEvalListItem,
24
+ BenchmarkEvalSummary,
25
+ ModelResultForBenchmark,
26
+ } from "@/lib/eval-processing"
27
+
28
// Loosely typed row: column name -> value, as produced by readRows() after normalizeDuckDBValue().
+ type Row = Record<string, any>
29
+
30
// Column list interpolated into the SELECTs against models_view that return model cards.
+ const MODEL_CARD_COLUMNS = `
31
+ id, model_key, route_id, model_name, model_id, canonical_model_name, developer,
32
+ evaluations_count, benchmarks_count, variant_count,
33
+ categories, category_stats, latest_timestamp,
34
+ evaluator_count, evaluator_names, source_type_count, source_types,
35
+ evidence_count, missing_generation_config_count,
36
+ third_party_eval_count, independent_verification_ratio,
37
+ reproducibility_status, eval_libraries, latest_source_name,
38
+ params_billions, benchmark_names, score_summary,
39
+ reproducibility_summary, provenance_summary, comparability_summary,
40
+ top_scores, source_urls, detail_urls,
41
+ model_url, release_date,
42
+ architecture, params, inference_engine, inference_platform
43
+ `
44
+
45
// Column list interpolated into the evals_view SELECT that backs the evaluation list.
+ const EVAL_LIST_COLUMNS = `
46
+ evaluation_id, evaluation_name, canonical_display_name,
47
+ composite_benchmark_key, composite_benchmark_name,
48
+ benchmark_family_key, benchmark_leaf_key, category,
49
+ metric_config, models_count, evaluator_names, source_types,
50
+ latest_source_name, third_party_ratio,
51
+ missing_generation_config_count, best_model, worst_model,
52
+ avg_score, avg_score_norm, has_card, benchmark_card,
53
+ is_aggregated, aggregate_sources, tags,
54
+ metrics_count, metric_names, instance_data, top_score,
55
+ subtasks_count, is_summary_score, summary_eval_ids,
56
+ root_metrics, subtasks, leaderboard_metrics,
57
+ reproducibility_summary, provenance_summary, comparability_summary,
58
+ source_data
59
+ `
60
+
61
// Projection for eval_results_view (alias r) joined to evals_view (alias e): every result column plus eval_-prefixed evaluation columns.
+ const CELL_JOIN_COLUMNS = `
62
+ r.*,
63
+ e.evaluation_name AS eval_evaluation_name,
64
+ e.canonical_display_name AS eval_canonical_display_name,
65
+ e.composite_benchmark_key AS eval_composite_benchmark_key,
66
+ e.composite_benchmark_name AS eval_composite_benchmark_name,
67
+ e.benchmark_family_key AS eval_benchmark_family_key,
68
+ e.benchmark_leaf_key AS eval_benchmark_leaf_key,
69
+ e.category AS eval_category,
70
+ e.metric_config AS eval_metric_config,
71
+ e.source_data AS eval_source_data,
72
+ e.benchmark_card AS eval_benchmark_card,
73
+ e.tags AS eval_tags,
74
+ e.is_summary_score AS eval_is_summary_score,
75
+ e.summary_eval_ids AS eval_summary_eval_ids
76
+ `
77
+
78
/**
 * Recursively converts a value read from DuckDB into plain JSON-friendly data.
 *
 * - `bigint` becomes `number` (precision may be lost for very large values).
 * - `Date` becomes an ISO-8601 string.
 * - `Map` and objects named `DuckDBMapValue` become plain objects with string keys.
 * - Arrays and objects named `DuckDBListValue` / `DuckDBArrayValue` become arrays.
 * - Objects named `DuckDBStructValue` are unwrapped to their `entries`.
 * - Objects named `DuckDBDecimalValue` become numbers via `toString()`.
 * - Any other object whose constructor name starts with `DuckDB` becomes its `toString()`.
 * - Remaining objects are copied with every own enumerable property normalized.
 *
 * Wrapper types are recognised by constructor name rather than `instanceof`.
 */
function normalizeDuckDBValue(value: unknown): unknown {
  switch (typeof value) {
    case "bigint":
      return Number(value)
    case "object":
      break
    default:
      return value
  }

  if (value === null) {
    return value
  }

  if (value instanceof Date) {
    return value.toISOString()
  }

  if (value instanceof Map) {
    const pairs = Array.from(value.entries())
    return Object.fromEntries(pairs.map(([key, mapValue]) => [String(key), normalizeDuckDBValue(mapValue)]))
  }

  if (Array.isArray(value)) {
    return value.map((item) => normalizeDuckDBValue(item))
  }

  const wrapper = value as {
    constructor?: { name?: string }
    entries?: unknown
    items?: unknown
    scale?: unknown
    value?: unknown
    toString?: () => string
  }
  const kind = wrapper.constructor?.name ?? ""

  // A wrapper that matches by name but lacks the expected payload shape falls
  // through to the generic handling below.
  switch (kind) {
    case "DuckDBStructValue":
      if (wrapper.entries && typeof wrapper.entries === "object") {
        return normalizeDuckDBValue(wrapper.entries)
      }
      break
    case "DuckDBListValue":
    case "DuckDBArrayValue":
      if (Array.isArray(wrapper.items)) {
        return wrapper.items.map((item) => normalizeDuckDBValue(item))
      }
      break
    case "DuckDBMapValue":
      if (Array.isArray(wrapper.entries)) {
        return Object.fromEntries(
          wrapper.entries.map((entry) => {
            const pair = entry as { key: unknown; value: unknown }
            return [String(pair.key), normalizeDuckDBValue(pair.value)]
          })
        )
      }
      break
    case "DuckDBDecimalValue":
      if (typeof wrapper.toString === "function") {
        return Number(wrapper.toString())
      }
      break
  }

  if (kind.startsWith("DuckDB") && typeof wrapper.toString === "function") {
    return wrapper.toString()
  }

  return Object.fromEntries(
    Object.entries(value).map(([key, objectValue]) => [key, normalizeDuckDBValue(objectValue)])
  )
}
143
+
144
/**
 * Runs `sql` on the shared DuckDB connection and returns every row as a
 * normalized plain object.
 *
 * @param sql SQL text; `?` placeholders are bound from `params`.
 * @param params Positional bind values; omitted from the call when empty.
 * @returns Rows passed through `normalizeDuckDBValue` and cast to `T` (no runtime validation).
 */
async function readRows<T = Row>(sql: string, params: unknown[] = []): Promise<T[]> {
  const connection = await getConnection()
  const hasParams = params.length > 0
  const reader = await (hasParams
    ? connection.runAndReadAll(sql, params as any[])
    : connection.runAndReadAll(sql))

  const normalized: T[] = []
  for (const row of reader.getRowObjects()) {
    normalized.push(normalizeDuckDBValue(row) as T)
  }
  return normalized
}
151
+
152
/**
 * Coerces `value` to a finite number.
 *
 * Finite numbers pass through, bigints are converted with `Number`, and
 * non-blank strings are parsed. Everything else — including NaN, infinities
 * and unparsable strings — yields `fallback`.
 */
function asNumber(value: unknown, fallback = 0) {
  switch (typeof value) {
    case "number":
      return Number.isFinite(value) ? value : fallback
    case "bigint":
      return Number(value)
    case "string": {
      if (value.trim() === "") return fallback
      const parsed = Number(value)
      return Number.isFinite(parsed) ? parsed : fallback
    }
    default:
      return fallback
  }
}
161
+
162
/**
 * Like `asNumber`, but returns `undefined` instead of a fallback when the
 * value is null/undefined or cannot be read as a finite number.
 */
function optionalNumber(value: unknown) {
  if (value === null || value === undefined) {
    return undefined
  }

  // NaN as the fallback marks "could not coerce".
  const parsed = asNumber(value, Number.NaN)
  if (!Number.isFinite(parsed)) {
    return undefined
  }
  return parsed
}
167
+
168
/** Returns `value` when it is a string (including the empty string); otherwise returns `fallback`. */
function asString(value: unknown, fallback = "") {
  if (typeof value !== "string") {
    return fallback
  }
  return value
}
171
+
172
/** Returns `value` when it is a non-empty string; otherwise `undefined`. */
function optionalString(value: unknown) {
  if (typeof value !== "string" || value.length === 0) {
    return undefined
  }
  return value
}
175
+
176
/**
 * Returns `value` itself when it is an array (cast to `T[]` without checking
 * the elements); otherwise a new empty array.
 */
function asArray<T>(value: unknown): T[] {
  if (!Array.isArray(value)) {
    return []
  }
  return value as T[]
}
179
+
180
/** Returns `value` when it is one of `EVALUATION_CATEGORIES`; otherwise `"General"`. */
function normalizeCategory(value: unknown): CategoryType {
  const isKnownCategory = EVALUATION_CATEGORIES.includes(value as CategoryType)
  if (isKnownCategory) {
    return value as CategoryType
  }
  return "General"
}
185
+
186
/** Builds a record with one fresh empty array per entry of `EVALUATION_CATEGORIES`. */
function emptyEvaluationsByCategory(): Record<CategoryType, BenchmarkEvaluation[]> {
  const buckets = {} as Record<CategoryType, BenchmarkEvaluation[]>
  for (const category of EVALUATION_CATEGORIES) {
    buckets[category] = []
  }
  return buckets
}
192
+
193
/**
 * Returns the row's `source_metadata` when it is an object. Otherwise
 * synthesizes a placeholder of type `documentation` / relationship `other`,
 * named after `latest_source_name` (or "Unknown").
 */
function sourceMetadataFromRow(row: Row): SourceMetadata {
  const provided = row.source_metadata
  const hasProvidedMetadata = Boolean(provided) && typeof provided === "object"

  if (!hasProvidedMetadata) {
    return {
      source_type: "documentation",
      source_organization_name: asString(row.latest_source_name, "Unknown"),
      evaluator_relationship: "other",
    }
  }

  return provided as SourceMetadata
}
204
+
205
/**
 * Picks the row's source data: its own `source_data`, else the joined
 * `eval_source_data`. When neither is truthy, returns a minimal record whose
 * `dataset_name` is the first available of the evaluation name fields or
 * `benchmark_id` ("Unknown dataset" as the last resort).
 */
function sourceDataFromRow(row: Row): BenchmarkEvaluation["source_data"] {
  const provided = row.source_data ?? row.eval_source_data
  if (!provided) {
    const datasetName = asString(
      row.eval_evaluation_name ?? row.evaluation_name ?? row.benchmark_id,
      "Unknown dataset"
    )
    return {
      dataset_name: datasetName,
    } satisfies SourceData
  }

  return provided as BenchmarkEvaluation["source_data"]
}
215
+
216
/**
 * Builds the score details for a result row: the row's `score_details`
 * object (if any) with `score` overwritten by a numeric value taken from
 * `score_details.score`, else `row.score`, else 0.
 */
function scoreDetailsFromRow(row: Row): ScoreDetails {
  let details: Partial<ScoreDetails> = {}
  if (row.score_details && typeof row.score_details === "object") {
    details = row.score_details as Partial<ScoreDetails>
  }

  const merged = {
    ...details,
    score: asNumber(details.score ?? row.score),
  }
  return merged as ScoreDetails
}
227
+
228
/**
 * Assembles a `MetricConfig` from a row, combining the `metric_config` /
 * `eval_metric_config` object with flat per-row columns.
 *
 * Precedence differs per field: `lower_is_better` and `unit` prefer the row
 * columns, while `min_score` / `max_score` and the description prefer the
 * config object. Any `score_type` other than `binary` or `discrete` is
 * reported as `continuous`.
 */
function metricConfigFromRow(row: Row): MetricConfig {
  const config = (row.metric_config ?? row.eval_metric_config ?? {}) as Partial<MetricConfig>

  let scoreType: "binary" | "discrete" | "continuous"
  switch (config.score_type) {
    case "binary":
    case "discrete":
      scoreType = config.score_type
      break
    default:
      scoreType = "continuous"
  }

  const description =
    config.evaluation_description ??
    row.metric_description ??
    row.metric_display_name ??
    row.eval_evaluation_name ??
    row.evaluation_name
  const lowerIsBetter = row.lower_is_better ?? config.lower_is_better ?? false

  return {
    evaluation_description: asString(description, ""),
    lower_is_better: Boolean(lowerIsBetter),
    score_type: scoreType,
    min_score: optionalNumber(config.min_score ?? row.min_score),
    max_score: optionalNumber(config.max_score ?? row.max_score),
    unit: optionalString(row.metric_unit ?? config.unit),
  }
}
250
+
251
/**
 * Derives a `ModelInfo` from flat row columns.
 *
 * `name` and `id` fall back through several identifier columns before
 * resorting to "Unknown model" / "unknown-model". Optional descriptive fields
 * are included only when they are non-empty strings. Modalities default to
 * empty arrays when the row has no `input_modalities` / `output_modalities`.
 */
function modelInfoFromModelRow(row: Row): ModelInfo {
  const displayName = row.model_name ?? row.model_family_name ?? row.model_id ?? row.model_key
  const identifier = row.model_key ?? row.model_id ?? row.id ?? row.route_id

  return {
    name: asString(displayName, "Unknown model"),
    id: asString(identifier, "unknown-model"),
    developer: optionalString(row.developer),
    inference_platform: optionalString(row.inference_platform),
    inference_engine: optionalString(row.inference_engine),
    architecture: optionalString(row.architecture),
    parameter_count: optionalString(row.params),
    release_date: optionalString(row.release_date),
    model_url: optionalString(row.model_url),
    additional_details: {
      params_billions: row.params_billions,
    },
    modalities: {
      input: asArray<string>(row.input_modalities),
      output: asArray<string>(row.output_modalities),
    },
  }
}
271
+
272
/**
 * Converts one joined result row ("cell") into an `EvaluationResult`.
 *
 * The metric display name is used for both `display_name` and
 * `canonical_display_name`. `evalcards` is set only when the row carries
 * annotations.
 */
function resultFromCell(row: Row): EvaluationResult {
  const annotations = row.evalcards_annotations
  const metricDisplayName = optionalString(row.metric_display_name)
  const evaluationName = asString(
    row.metric_display_name ?? row.eval_evaluation_name ?? row.metric_id,
    "Score"
  )

  return {
    evaluation_name: evaluationName,
    display_name: metricDisplayName,
    canonical_display_name: metricDisplayName,
    metric_summary_id: optionalString(row.metric_summary_id),
    metric_key: optionalString(row.metric_id),
    evaluation_timestamp: asString(row.evaluation_timestamp, ""),
    source_data: sourceDataFromRow(row),
    metric_config: metricConfigFromRow(row),
    score_details: scoreDetailsFromRow(row),
    generation_config: row.generation_config as GenerationConfig | undefined,
    detailed_evaluation_results_url: optionalString(row.instance_file_path),
    evalcards: annotations ? { annotations } : undefined,
  }
}
292
+
293
/**
 * Converts one joined result row into the per-model entry shown on a
 * benchmark page. Uses the row's `model_info` when present, otherwise derives
 * one from the flat columns.
 */
function reshapeCellToModelResult(row: Row): ModelResultForBenchmark {
  type AggregateComponent = NonNullable<ModelResultForBenchmark["aggregate_components"]>[number]

  const details = scoreDetailsFromRow(row)
  const modelInfo = (row.model_info ?? modelInfoFromModelRow(row)) as ModelInfo

  return {
    model_info: modelInfo,
    model_route_id: optionalString(row.model_route_id),
    score: details.score,
    score_details: details,
    evaluation_timestamp: asString(row.evaluation_timestamp, ""),
    source_metadata: sourceMetadataFromRow(row),
    source_data: sourceDataFromRow(row),
    source_record_url: optionalString(row.source_record_url),
    aggregate_components: asArray<AggregateComponent>(row.aggregate_components),
    result: resultFromCell(row),
  }
}
311
+
312
/**
 * Converts one joined result row into a `BenchmarkEvaluation` holding exactly
 * one result.
 *
 * Benchmark naming fields come from the joined `eval_*` columns; the
 * composite benchmark name is used for both the family and the parent name.
 */
function reshapeCellToBenchmarkEvaluation(row: Row): BenchmarkEvaluation {
  const evalName = optionalString(row.eval_evaluation_name)
  const compositeName = optionalString(row.eval_composite_benchmark_name)
  const timestamp = asString(row.evaluation_timestamp, "")
  const modelInfo = (row.model_info ?? modelInfoFromModelRow(row)) as ModelInfo
  const onlyResult = resultFromCell(row)

  return {
    schema_version: "1.0",
    eval_summary_id: optionalString(row.evaluation_id),
    evaluation_id: asString(row.evaluation_id ?? row.benchmark_id, "unknown-evaluation"),
    retrieved_timestamp: timestamp,
    benchmark: optionalString(row.eval_evaluation_name ?? row.benchmark_id),
    display_name: evalName,
    canonical_display_name: optionalString(row.eval_canonical_display_name),
    category: normalizeCategory(row.eval_category ?? row.category),
    benchmark_family_key: optionalString(row.eval_benchmark_family_key),
    benchmark_family_name: compositeName,
    benchmark_parent_key: optionalString(row.eval_composite_benchmark_key),
    benchmark_parent_name: compositeName,
    benchmark_leaf_key: optionalString(row.eval_benchmark_leaf_key),
    benchmark_leaf_name: evalName,
    is_summary_score: Boolean(row.eval_is_summary_score ?? row.is_summary_score),
    source_data: sourceDataFromRow(row),
    source_metadata: sourceMetadataFromRow(row),
    eval_library: row.eval_library,
    model_info: modelInfo,
    generation_config: row.generation_config,
    evaluation_results: [onlyResult],
  }
}
340
+
341
/**
 * Builds the full model summary from one `models_view` row and that model's
 * result cells.
 *
 * Cells are bucketed by normalized category. `categories_covered` uses the
 * row's `categories` (restricted to known ones) and falls back to the
 * categories that actually received evaluations. Each entry of the row's
 * `variants` inherits the shared core fields, is overlaid with its own
 * columns, and then has its identifying fields filled from fallbacks.
 */
function modelSummaryFromRows(modelRow: Row, cellRows: Row[]): ModelEvaluationSummary {
  const evaluationsByCategory = emptyEvaluationsByCategory()
  cellRows.forEach((cellRow) => {
    const evaluation = reshapeCellToBenchmarkEvaluation(cellRow)
    evaluationsByCategory[normalizeCategory(evaluation.category)].push(evaluation)
  })

  const declaredCategories = asArray<CategoryType>(modelRow.categories).filter((category) =>
    EVALUATION_CATEGORIES.includes(category)
  )
  const modelInfo = (modelRow.model_info ?? modelInfoFromModelRow(modelRow)) as ModelInfo
  const totalEvaluations = asNumber(modelRow.total_evaluations ?? modelRow.evaluations_count)
  const lastUpdated = asString(modelRow.last_updated ?? modelRow.latest_timestamp, "")
  const rawModelIds = asArray<string>(modelRow.raw_model_ids)

  // Fields shared by the family-level summary and every variant.
  const core = {
    model_info: modelInfo,
    evaluations_by_category: evaluationsByCategory,
    total_evaluations: totalEvaluations,
    last_updated: lastUpdated,
    categories_covered: declaredCategories.length > 0
      ? declaredCategories
      : EVALUATION_CATEGORIES.filter((category) => evaluationsByCategory[category].length > 0),
    reproducibility_summary: modelRow.reproducibility_summary,
    provenance_summary: modelRow.provenance_summary,
    comparability_summary: modelRow.comparability_summary,
  }

  const toVariantSummary = (variant: Row, index: number) => {
    const placeholderKey = `variant-${index}`
    const variantCategories = asArray<CategoryType>(variant.categories_covered)

    // Spread order matters: core first, then the variant's own columns, then
    // the explicitly normalized fields below override both.
    return {
      ...core,
      ...variant,
      variant_id: asString(variant.variant_id ?? variant.variant_key, placeholderKey),
      variant_key: asString(variant.variant_key, placeholderKey),
      variant_label: asString(variant.variant_label ?? variant.variant_display_name, "Default"),
      variant_display_name: asString(variant.variant_display_name ?? variant.variant_label ?? modelRow.model_name, modelRow.model_name),
      raw_model_ids: asArray<string>(variant.raw_model_ids),
      family_id: asString(variant.family_id ?? modelRow.model_family_id, modelRow.model_family_id),
      family_name: asString(variant.family_name ?? modelRow.model_family_name, modelRow.model_family_name),
      total_evaluations: asNumber(variant.total_evaluations ?? totalEvaluations),
      last_updated: asString(variant.last_updated ?? lastUpdated, lastUpdated),
      categories_covered: variantCategories.length > 0 ? variantCategories : core.categories_covered,
      model_info: {
        ...modelInfo,
        name: asString(variant.variant_display_name ?? variant.variant_label ?? modelInfo.name, modelInfo.name),
      },
    }
  }
  const variants = asArray<Row>(modelRow.variants).map(toVariantSummary) as ModelVariantSummary[]

  const primaryId = asString(modelRow.model_key ?? modelRow.model_id, "")

  return {
    ...core,
    model_family_id: asString(modelRow.model_family_id ?? modelRow.model_key ?? modelRow.model_id, modelRow.model_key ?? modelRow.model_id),
    model_route_id: asString(modelRow.model_route_id ?? modelRow.route_id, modelRow.route_id),
    model_family_name: asString(modelRow.model_family_name ?? modelRow.model_name, modelRow.model_name),
    raw_model_ids: rawModelIds.length > 0 ? rawModelIds : [primaryId].filter(Boolean),
    variants,
  }
}
400
+
401
/**
 * Fetches every scored result cell for one model, joined with its evaluation
 * row, ordered by category and then by descending percentile (nulls last).
 */
async function getModelEvaluationRows(modelKey: string): Promise<Row[]> {
  // model_key is the producer's addressable identifier — non-null for both
  // resolved and unresolved models (the latter fall back to the raw source
  // name). Querying by model_id alone would silently miss unresolved models.
  const sql = `SELECT ${CELL_JOIN_COLUMNS}
    FROM eval_results_view r
    LEFT JOIN evals_view e ON r.evaluation_id = e.evaluation_id
    WHERE r.model_key = ?
    AND r.score IS NOT NULL
    ORDER BY r.category, r.percentile DESC NULLS LAST`

  return readRows<Row>(sql, [modelKey])
}
415
+
416
/** Returns every model card from `models_view`, most recently updated first (null timestamps last). */
export async function getModelCards(): Promise<EvaluationCardData[]> {
  const sql = `SELECT ${MODEL_CARD_COLUMNS}
    FROM models_view
    ORDER BY latest_timestamp DESC NULLS LAST`

  return readRows<EvaluationCardData>(sql)
}
423
+
424
/**
 * Returns every model card from `models_view`, ordered by benchmark count,
 * then evaluation count (both descending, nulls last), then model name.
 * Selects the same columns as `getModelCards`; only the ordering differs.
 */
export async function getModelCardsLite(): Promise<EvaluationCardData[]> {
  const sql = `SELECT ${MODEL_CARD_COLUMNS}
    FROM models_view
    ORDER BY benchmarks_count DESC NULLS LAST, evaluations_count DESC NULLS LAST, model_name ASC`

  return readRows<EvaluationCardData>(sql)
}
431
+
432
/**
 * Returns the evaluation list (sorted by name) together with the total number
 * of rows in `models_view`. Both queries are issued concurrently.
 */
export async function getEvalListData(): Promise<{
  evals: BenchmarkEvalListItem[]
  totalModels: number
}> {
  const evalsQuery = readRows<BenchmarkEvalListItem>(
    `SELECT ${EVAL_LIST_COLUMNS}
    FROM evals_view
    ORDER BY evaluation_name ASC`
  )
  const modelCountQuery = readRows<{ n: number }>("SELECT COUNT(*) AS n FROM models_view")

  const [evals, countRows] = await Promise.all([evalsQuery, modelCountQuery])
  const totalModels = asNumber(countRows[0]?.n)

  return { evals, totalModels }
}
450
+
451
// "Lite" variant of the evaluation list; it delegates to getEvalListData() and returns exactly the same payload.
+ export async function getEvalListLiteData(): Promise<{
452
+ evals: BenchmarkEvalListItem[]
453
+ totalModels: number
454
+ }> {
455
+ return getEvalListData()
456
+ }
457
+
458
/** Returns only the evaluation list portion of `getEvalListData()`. */
export async function getEvalList() {
  const listData = await getEvalListData()
  return listData.evals
}
462
+
463
/** Loads model cards and the evaluation list concurrently for the dashboard. */
export async function getDashboardData() {
  const modelsQuery = getModelCards()
  const evalsQuery = getEvalList()
  const [models, evals] = await Promise.all([modelsQuery, evalsQuery])

  return { models, evals }
}
470
+
471
/**
 * Resolves a model by any of its identifiers and returns its full summary.
 *
 * @param routeId Value compared against `model_key`, `route_id`,
 *   `model_route_id`, `model_family_id` and `model_id`.
 * @returns The summary, or `null` when no row matches.
 */
export async function getModelSummaryById(routeId: string): Promise<ModelEvaluationSummary | null> {
  // Lookups use the addressable identifier (`model_key`/`route_id`/
  // `model_route_id`/`model_family_id`) so unresolved models — whose
  // `model_id` is NULL — are still findable. `model_id` is kept in the
  // OR chain as a back-compat fallback for old links.
  // NOTE(review): LIMIT 1 has no ORDER BY, so if the id matches different rows
  // through different columns the chosen row is unspecified — confirm the
  // identifier columns cannot collide across models.
  const lookupSql = `SELECT *
    FROM models_view
    WHERE model_key = ? OR route_id = ? OR model_route_id = ? OR model_family_id = ? OR model_id = ?
    LIMIT 1`
  const lookupParams = [routeId, routeId, routeId, routeId, routeId]

  const [modelRow] = await readRows<Row>(lookupSql, lookupParams)
  if (!modelRow) return null

  const cellKey = asString(modelRow.model_key ?? modelRow.model_id, routeId)
  const cellRows = await getModelEvaluationRows(cellKey)
  return modelSummaryFromRows(modelRow, cellRows)
}
489
+
490
/**
 * Returns one evaluation with its per-model results.
 *
 * Results are first restricted to the evaluation's `primary_metric_id`. If
 * that yields nothing, all scored results for the evaluation are used
 * instead. Results are ordered by `position` (nulls last).
 *
 * @returns The evaluation row extended with `model_results`, or `null` when
 *   the evaluation id is unknown.
 */
export async function getEvalSummaryById(evalId: string): Promise<BenchmarkEvalSummary | null> {
  const [evalRow] = await readRows<Row>(
    "SELECT * FROM evals_view WHERE evaluation_id = ? LIMIT 1",
    [evalId]
  )
  if (!evalRow) return null

  const primaryMetricCells = await readRows<Row>(
    `SELECT ${CELL_JOIN_COLUMNS}
    FROM eval_results_view r
    LEFT JOIN evals_view e ON r.evaluation_id = e.evaluation_id
    WHERE r.evaluation_id = ?
    AND r.metric_id = (SELECT primary_metric_id FROM evals_view WHERE evaluation_id = ?)
    AND r.score IS NOT NULL
    ORDER BY r.position ASC NULLS LAST`,
    [evalId, evalId]
  )

  // Fall back to every scored cell when the primary-metric filter matched nothing.
  const cellRows = primaryMetricCells.length > 0
    ? primaryMetricCells
    : await readRows<Row>(
        `SELECT ${CELL_JOIN_COLUMNS}
    FROM eval_results_view r
    LEFT JOIN evals_view e ON r.evaluation_id = e.evaluation_id
    WHERE r.evaluation_id = ?
    AND r.score IS NOT NULL
    ORDER BY r.position ASC NULLS LAST`,
        [evalId]
      )

  const summary = {
    ...evalRow,
    model_results: cellRows.map((cell) => reshapeCellToModelResult(cell)),
  }
  return summary as BenchmarkEvalSummary
}
526
+
527
/**
 * Returns the developers listed in the headline sidecar, sorted by developer
 * name with `localeCompare`. The sidecar's own array is not mutated.
 */
export async function getDeveloperList(): Promise<DeveloperListEntry[]> {
  const headline = await fetchHeadline()
  const developers = Array.from(headline.developers ?? [])
  developers.sort((left, right) => left.developer.localeCompare(right.developer))
  return developers
}
531
+
532
/**
 * Returns one developer entry together with that developer's model cards.
 *
 * The developer is located by `route_id` in the developer list; models are
 * then selected from `models_view` by exact `developer` name.
 *
 * @returns The developer entry extended with `models`, or `null` when no
 *   developer has the given route id.
 */
export async function getDeveloperSummaryById(routeId: string) {
  const allDevelopers = await getDeveloperList()
  const developer = allDevelopers.find((entry) => entry.route_id === routeId)
  if (!developer) {
    return null
  }

  const modelsSql = `SELECT ${MODEL_CARD_COLUMNS}
    FROM models_view
    WHERE developer = ?
    ORDER BY benchmarks_count DESC NULLS LAST, evaluations_count DESC NULLS LAST, model_name ASC`
  const models = await readRows<EvaluationCardData>(modelsSql, [developer.developer])

  return { ...developer, models }
}
550
+
551
/**
 * Builds a lookup of benchmark cards from `evals_view`.
 *
 * Each card is registered under every non-empty string among the row's
 * `evaluation_id`, `evaluation_name`, `composite_benchmark_key` and the
 * card's own `benchmark_details.name`. When several rows produce the same
 * key, the row read last wins.
 */
export async function getBenchmarkMetadataMap(): Promise<Record<string, BenchmarkCard>> {
  const rows = await readRows<Row>(
    `SELECT evaluation_id, evaluation_name, composite_benchmark_key, benchmark_card
    FROM evals_view
    WHERE benchmark_card IS NOT NULL`
  )

  const cardsByKey: Record<string, BenchmarkCard> = {}

  rows.forEach((row) => {
    const card = row.benchmark_card as BenchmarkCard | null | undefined
    if (!card) return

    const candidates: unknown[] = [
      row.evaluation_id,
      row.evaluation_name,
      row.composite_benchmark_key,
      card.benchmark_details?.name,
    ]

    for (const candidate of candidates) {
      if (typeof candidate === "string" && candidate.length > 0) {
        cardsByKey[candidate] = card
      }
    }
  })

  return cardsByKey
}
notes/backend-v2-migration.md ADDED
@@ -0,0 +1,616 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Frontend migration to backend v2 (Stage J view layer)
2
+
3
+ > **Status:** spec, drafted 2026-05-03 against `eval_card_backend`'s
4
+ > Stage J view-layer contract.
5
+ >
6
+ > **Sources:**
7
+ > - Backend spec (the contract this consumes):
8
+ > `../eval_card_backend/notes/08-frontend-view-layer.md`
9
+ > - Canonical schema (audit/debug only; not in hot path):
10
+ > `../eval_card_backend/notes/01-schema-from-frontend.md`
11
+
12
+ ---
13
+
14
+ ## Context
15
+
16
+ The legacy producer (`eval_cards_backend_pipeline`) emitted ten
17
+ parquets where each row carried a `payload_json` VARCHAR with the
18
+ post-TS-adapter shape baked in. The frontend's "DuckDB backend"
19
+ (`lib/duckdb-data.ts`) read these blobs and `JSON.parse`d them — column
20
+ projection, filter pushdown, and type contracts were all forfeited.
21
+
22
+ The new producer (`eval_card_backend`) emits a typed view layer over
23
+ its canonical normalised tables. Three Parquet files cover every page
24
+ shape, three small JSON sidecars cover corpus-level scalars and the
25
+ hierarchy tree. Column names match the frontend's TS interfaces
26
+ field-for-field, so the row→object cast is a typed spread for most
27
+ accessors. Two interfaces (`ModelResultForBenchmark` and the
28
+ `evaluations_by_category` body of `ModelEvaluationSummary`) require a
29
+ small mechanical reshape over the row, since one nests fields that the
30
+ view stores flat — see the per-accessor sections below. No
31
+ HF-record-to-display adapter logic survives.
32
+
33
+ This document specifies what changes in `general-eval-card` once
34
+ backend v2 is faithfully implemented. **The visual frontend, page
35
+ renderers, and TS interface shapes do not change.** Only the I/O
36
+ boundary moves.
37
+
38
+ ---
39
+
40
+ ## What changes (overview)
41
+
42
+ | layer | before (v1) | after (v2) |
43
+ |---|---|---|
44
+ | Distribution | `LOCAL_PIPELINE_OUTPUT` env var pointing at a producer output dir; `duckdb/v1/` subpath; implicit "warehouse/latest/" coupling | `SNAPSHOT_URL` env var (file:// or HF dataset URL); one snapshot pinned per deploy |
45
+ | Storage shape | 10 parquets each with one `payload_json` column | 3 typed-column view parquets + 3 JSON sidecars |
46
+ | Read pattern | `SELECT payload_json FROM read_parquet(?) WHERE id = ?`, then `JSON.parse` | `SELECT col1, col2, ... FROM <view> WHERE id = ?`, typed row spread |
47
+ | List vs detail | Separate `*_lite.parquet` files | Column projection on the same parquet |
48
+ | Suite/aggregate dispatch | Eval id prefix (`aggregate__`, `matrix__`) → different parquet | `is_summary_score` flag + `parent_benchmark_id` on `evals_view` |
49
+ | Slug rule | Custom `replace('/', '__')` escapes; per-page slug helpers | Producer-owned RFC 3986 percent-encoded `route_id` / `evaluation_id` / `metric_summary_id`; frontend treats them as opaque and passes them through unchanged (no decoding) in `<Link>` hrefs and lookups |
50
+ | Corpus aggregates | `corpus-aggregates.json` over HF JSON loader | `headline.json` sidecar in the snapshot dir |
51
+ | Hierarchy | Synthesised in the producer's `eval_hierarchy` JSON | `hierarchy.json` sidecar |
52
+ | Backend manifest | `manifest.json` fetched from upstream HF dataset root via `lib/hf-data.ts` | `manifest.json` sidecar inside the snapshot dir, read via `SNAPSHOT_URL` |
53
+
54
+ The TS interfaces (`EvaluationCardData`, `BenchmarkEvalSummary`,
55
+ `ModelEvaluationSummary`, `ModelResultForBenchmark`, `CorpusAggregates`,
56
+ `EvalHierarchy`, `BackendManifest`) stay as-is — the producer agreed to
57
+ emit columns under those exact names.
58
+
59
+ ---
60
+
61
+ ## What does not change
62
+
63
+ - All page components under `app/`. The renderer trees are unchanged.
64
+ - TS interface declarations in `lib/benchmark-schema.ts`,
65
+ `lib/eval-processing.ts`, `lib/backend-artifacts.ts`. These are now
66
+ the contract surface — column names match field names by agreement
67
+ with the producer.
68
+ - Component files under `components/`.
69
+ - `lib/glossary.ts`, `lib/known-issues.ts`, `lib/utils.ts`,
70
+ `lib/na-utils.ts` — these are pure presentation helpers.
71
+ - `app/api/*/route.ts` handlers stay as thin pass-throughs to
72
+ `lib/data-backend.ts`.
73
+
74
+ ---
75
+
76
+ ## Distribution: `SNAPSHOT_URL`
77
+
78
+ Frontend reads `SNAPSHOT_URL` from env at process start. One deploy =
79
+ one snapshot. The URL points at a directory containing the six
80
+ artifacts the frontend reads:
81
+
82
+ ```
83
+ $SNAPSHOT_URL/
84
+ ├── models_view.parquet
85
+ ├── evals_view.parquet
86
+ ├── eval_results_view.parquet
87
+ ├── headline.json
88
+ ├── hierarchy.json
89
+ └── manifest.json
90
+ ```
91
+
92
+ Examples:
93
+
94
+ - Local dev: `SNAPSHOT_URL=file:///path/to/eval_card_backend/warehouse/2026-05-03T15-48-59Z`
95
+ - Production (pinned snapshot): `SNAPSHOT_URL=https://huggingface.co/datasets/evaleval/eval-cards-data/resolve/<rev>/warehouse/<snapshot_id>`
96
+ - Production (rolling): `SNAPSHOT_URL=https://huggingface.co/datasets/evaleval/eval-cards-data/resolve/main/warehouse/latest`
97
+
98
+ `LOCAL_PIPELINE_OUTPUT` is removed. The `duckdb/v1/` subpath is
99
+ removed. The producer maintains a `warehouse/latest/` alias that
100
+ points at the most recent snapshot, so deploys can pin either to a
101
+ timestamped snapshot (immutable, redeploy required to roll forward)
102
+ or to `latest` (auto-rolls forward on the next Space rebuild). Within
103
+ a running process the snapshot is still effectively constant — sidecar
104
+ caches in `lib/sidecars.ts` are first-write-wins per process.
105
+
106
+ ---
107
+
108
+ ## DuckDB connection lifecycle
109
+
110
+ `lib/duckdb.ts` (new file; replaces the connection-management portion
111
+ of `lib/duckdb-data.ts`):
112
+
113
+ ```ts
114
+ import "server-only"
115
+ import { DuckDBConnection } from "@duckdb/node-api"
116
+
117
+ let connectionPromise: Promise<DuckDBConnection> | null = null
118
+
119
+ const SNAPSHOT_URL = process.env.SNAPSHOT_URL
120
+ if (!SNAPSHOT_URL) {
121
+ throw new Error("SNAPSHOT_URL must be set; see notes/backend-v2-migration.md")
122
+ }
123
+
124
+ const VIEWS = {
125
+ models_view: `${SNAPSHOT_URL}/models_view.parquet`,
126
+ evals_view: `${SNAPSHOT_URL}/evals_view.parquet`,
127
+ eval_results_view: `${SNAPSHOT_URL}/eval_results_view.parquet`,
128
+ } as const
129
+
130
+ export async function getConnection(): Promise<DuckDBConnection> {
131
+ if (!connectionPromise) {
132
+ connectionPromise = (async () => {
133
+ const conn = await DuckDBConnection.create()
134
+ // httpfs is built into duckdb-node-api; no INSTALL needed.
135
+ // Register each parquet as a view so callers write `FROM models_view`,
136
+ // not the full URL.
137
+ for (const [name, path] of Object.entries(VIEWS)) {
138
+ await conn.run(
139
+ `CREATE OR REPLACE VIEW ${name} AS SELECT * FROM read_parquet(?)`,
140
+ [path]
141
+ )
142
+ }
143
+ return conn
144
+ })()
145
+ }
146
+ return connectionPromise
147
+ }
148
+ ```
149
+
150
+ One connection per Node process. Views are registered once at
151
+ startup; subsequent queries write `FROM models_view` rather than
152
+ re-passing the parquet URL. DuckDB's column projection means the cost
153
+ of `SELECT route_id, model_name FROM models_view` is independent of
154
+ how wide `models_view` is.
155
+
156
+ ---
157
+
158
+ ## Per-accessor mapping
159
+
160
+ `lib/data-backend.ts` keeps its current export names. `lib/duckdb-data.ts`
161
+ gets gutted; each function becomes a thin typed `SELECT`. The mapping
162
+ below uses the column names spec'd in
163
+ `../eval_card_backend/notes/08-frontend-view-layer.md` — the row
164
+ returned by DuckDB casts directly to the TS interface.
165
+
166
+ ### Models
167
+
168
+ ```ts
169
+ // getModelCards / getModelCardsLite — list pages
170
+ export async function getModelCards(): Promise<EvaluationCardData[]> {
171
+ const conn = await getConnection()
172
+ const reader = await conn.runAndReadAll(`
173
+ SELECT id, route_id, model_name, model_id, canonical_model_name, developer,
174
+ evaluations_count, benchmarks_count, variant_count,
175
+ categories, category_stats, latest_timestamp,
176
+ evaluator_count, evaluator_names, source_type_count, source_types,
177
+ evidence_count, missing_generation_config_count,
178
+ third_party_eval_count, independent_verification_ratio,
179
+ reproducibility_status, eval_libraries, latest_source_name,
180
+ params_billions, benchmark_names, score_summary,
181
+ reproducibility_summary, provenance_summary, comparability_summary,
182
+ top_scores, source_urls, detail_urls,
183
+ model_url, release_date, input_modalities, output_modalities,
184
+ architecture, params, inference_engine, inference_platform
185
+ FROM models_view
186
+ ORDER BY latest_timestamp DESC
187
+ `)
188
+ return reader.getRowObjects() as EvaluationCardData[]
189
+ }
190
+
191
+ // "Lite" is just narrower projection — same parquet, fewer columns.
192
+ export async function getModelCardsLite(): Promise<EvaluationCardData[]> {
193
+ const conn = await getConnection()
194
+ const reader = await conn.runAndReadAll(`
195
+ SELECT id, route_id, model_name, model_id, developer,
196
+ evaluations_count, benchmarks_count, categories,
197
+ latest_timestamp, third_party_eval_count,
198
+ independent_verification_ratio, reproducibility_status,
199
+ latest_source_name, params_billions
200
+ FROM models_view
201
+ ORDER BY benchmarks_count DESC, evaluations_count DESC, model_name ASC
202
+ `)
203
+ return reader.getRowObjects() as EvaluationCardData[]
204
+ }
205
+
206
+ // getModelSummaryById — detail page.
207
+ //
208
+ // The row carries the metadata shell (variants[], categories,
209
+ // category_stats, signal summaries, model_family_id, raw_model_ids,
210
+ // total_evaluations, last_updated). The full `ModelEvaluationSummary`
211
+ // also requires `evaluations_by_category: Record<CategoryType,
212
+ // BenchmarkEvaluation[]>`, which is a heavyweight per-cell breakdown —
213
+ // produced by a separate join over `eval_results_view`, see
214
+ // `getModelEvaluationCells` below.
215
+ //
216
+ // Returning a `ModelSummaryShell` (Omit-ed type, defined alongside the
217
+ // existing TS interface) makes the contract explicit and stops the cast
218
+ // from lying. The model-detail page composes the full
219
+ // `ModelEvaluationSummary` from `shell` + `cells`.
220
+ export type ModelSummaryShell = Omit<
221
+ ModelEvaluationSummary,
222
+ "evaluations_by_category"
223
+ >
224
+
225
+ export async function getModelSummaryById(routeId: string): Promise<ModelSummaryShell | null> {
226
+ const conn = await getConnection()
227
+ const reader = await conn.runAndReadAll(
228
+ `SELECT * FROM models_view WHERE route_id = ? OR model_family_id = ? LIMIT 1`,
229
+ [routeId, routeId]
230
+ )
231
+ const rows = reader.getRowObjects()
232
+ if (rows.length === 0) return null
233
+ return rows[0] as unknown as ModelSummaryShell
234
+ }
235
+
236
+ // Per-cell reshape helper. `eval_results_view` rows carry the per-cell
237
+ // fields scattered (model_info, score_details, evaluation_timestamp,
238
+ // source_metadata, source_data, metric_*, etc.) rather than under a
239
+ // nested `result: EvaluationResult` STRUCT. Reshape into the
240
+ // `ModelResultForBenchmark` shape the leaderboard / model-detail
241
+ // renderers expect. Single helper; reused by getEvalSummaryById and
242
+ // getModelEvaluationCells. No HF-record-to-display logic survives.
243
+ function reshapeCellToModelResult(row: Record<string, any>): ModelResultForBenchmark {
244
+ return {
245
+ model_info: row.model_info,
246
+ model_route_id: row.model_route_id,
247
+ score: row.score,
248
+ score_details: row.score_details,
249
+ evaluation_timestamp: row.evaluation_timestamp,
250
+ source_metadata: row.source_metadata,
251
+ source_data: row.source_data,
252
+ source_record_url: row.source_record_url,
253
+ aggregate_components: row.aggregate_components,
254
+ result: {
255
+ evaluation_name: row.metric_display_name,
256
+ metric_summary_id: row.metric_summary_id,
257
+ metric_key: row.metric_id,
258
+ evaluation_timestamp: row.evaluation_timestamp,
259
+ metric_config: { lower_is_better: row.lower_is_better, unit: row.metric_unit, /* …denormalised meta… */ },
260
+ score_details: row.score_details,
261
+ evalcards: row.evalcards_annotations ? { annotations: row.evalcards_annotations } : undefined,
262
+ },
263
+ }
264
+ }
265
+
266
+ // Helper for the model-detail page's evaluations_by_category body.
267
+ // The page groups by `category` in TS after this returns.
268
+ export async function getModelEvaluationCells(modelId: string): Promise<ModelResultForBenchmark[]> {
269
+ const conn = await getConnection()
270
+ const reader = await conn.runAndReadAll(
271
+ `SELECT * FROM eval_results_view WHERE model_id = ? ORDER BY category, percentile DESC`,
272
+ [modelId]
273
+ )
274
+ return reader.getRowObjects().map(reshapeCellToModelResult)
275
+ }
276
+ ```
277
+
278
+ ### Evals
279
+
280
+ ```ts
281
+ // getEvalListData / getEvalListLiteData — list pages
282
+ export async function getEvalListData(): Promise<{
283
+ evals: BenchmarkEvalListItem[]
284
+ totalModels: number
285
+ }> {
286
+ const conn = await getConnection()
287
+ const [evalsReader, modelsReader] = await Promise.all([
288
+ conn.runAndReadAll(`
289
+ SELECT evaluation_id, evaluation_name, canonical_display_name,
290
+ composite_benchmark_key, composite_benchmark_name,
291
+ benchmark_family_key, benchmark_leaf_key, category,
292
+ metric_config, models_count, evaluator_names, source_types,
293
+ latest_source_name, third_party_ratio,
294
+ missing_generation_config_count, best_model, worst_model,
295
+ avg_score, avg_score_norm, has_card,
296
+ is_aggregated, aggregate_sources, tags,
297
+ metrics_count, metric_names, instance_data, top_score,
298
+ subtasks_count, is_summary_score, summary_eval_ids,
299
+ root_metrics, subtasks, leaderboard_metrics,
300
+ reproducibility_summary, provenance_summary, comparability_summary,
301
+ source_data
302
+ FROM evals_view
303
+ ORDER BY evaluation_name ASC
304
+ `),
305
+ conn.runAndReadAll(`SELECT COUNT(*) AS n FROM models_view`),
306
+ ])
307
+ return {
308
+ evals: evalsReader.getRowObjects() as BenchmarkEvalListItem[],
309
+ totalModels: Number(modelsReader.getRowObjects()[0].n),
310
+ }
311
+ }
312
+
313
+ // getEvalSummaryById — detail page.
314
+ //
315
+ // No more aggregate__/matrix__ id-prefix dispatch — `evals_view` is the
316
+ // single source for all eval shapes. Suite-vs-leaf is a column
317
+ // (`is_summary_score`, `is_aggregated`) on the same parquet.
318
+ //
319
+ // `model_results[]` rows go through the same reshape helper as
320
+ // `getModelEvaluationCells` (defined above) — they share the
321
+ // ModelResultForBenchmark target shape, so the eval/metric/cell
322
+ // → BenchmarkEvaluation reshape is one helper, two callers.
323
+ export async function getEvalSummaryById(evalId: string): Promise<BenchmarkEvalSummary | null> {
324
+ const conn = await getConnection()
325
+ const [evalReader, cellsReader] = await Promise.all([
326
+ conn.runAndReadAll(
327
+ `SELECT * FROM evals_view WHERE evaluation_id = ? LIMIT 1`,
328
+ [evalId]
329
+ ),
330
+ conn.runAndReadAll(
331
+ `SELECT * FROM eval_results_view
332
+ WHERE evaluation_id = ?
333
+ AND metric_id = (SELECT primary_metric_id FROM evals_view WHERE evaluation_id = ?)
334
+ ORDER BY position ASC`,
335
+ [evalId, evalId]
336
+ ),
337
+ ])
338
+ const evalRows = evalReader.getRowObjects()
339
+ if (evalRows.length === 0) return null
340
+ return {
341
+ ...(evalRows[0] as Omit<BenchmarkEvalSummary, "model_results">),
342
+ model_results: cellsReader.getRowObjects().map(reshapeCellToModelResult),
343
+ } as BenchmarkEvalSummary
344
+ }
345
+ ```
346
+
347
+ ### Developers
348
+
349
+ ```ts
350
+ // getDeveloperList — list page; reads from headline.json (precomputed,
351
+ // including producer-owned route_id, model/benchmark/evaluation counts,
352
+ // and popular_evals). DeveloperListEntry is satisfied directly by the
353
+ // headline entry shape.
354
+ export async function getDeveloperList(): Promise<DeveloperListEntry[]> {
355
+ const headline = await fetchHeadline()
356
+ return headline.developers as DeveloperListEntry[]
357
+ }
358
+
359
+ // getDeveloperSummaryById — detail page; reads models_view filtered by developer.
360
+ // The route_id on headline.developers[] is the canonical lookup key — we don't
361
+ // re-derive `developer` from the URL slug, since percent-decoding may not
362
+ // round-trip exactly to the producer's source string.
363
+ export async function getDeveloperSummaryById(routeId: string) {
364
+ const headline = await fetchHeadline()
365
+ const headlineEntry = headline.developers.find((d) => d.route_id === routeId)
366
+ if (!headlineEntry) return null
367
+ const conn = await getConnection()
368
+ const reader = await conn.runAndReadAll(
369
+ `SELECT * FROM models_view WHERE developer = ?`,
370
+ [headlineEntry.developer]
371
+ )
372
+ const models = reader.getRowObjects() as EvaluationCardData[]
373
+ return { ...headlineEntry, models }
374
+ }
375
+ ```
376
+
377
+ ### Dashboard convenience accessor
378
+
379
+ ```ts
380
+ // Was: { models, evals } over both legacy parquets; same shape, new sources.
381
+ export async function getDashboardData() {
382
+ const [models, evalListData] = await Promise.all([
383
+ getModelCards(),
384
+ getEvalListData(),
385
+ ])
386
+ return { models, evals: evalListData.evals }
387
+ }
388
+ ```
389
+
390
+ ---
391
+
392
+ ## Sidecar fetchers (replace `lib/hf-data.ts` corpus calls)
393
+
394
+ Three small JSON files live in the snapshot dir alongside the
395
+ parquets. New module `lib/sidecars.ts` exposes typed fetchers.
396
+ `lib/hf-data.ts`'s `fetchCorpusAggregates`, `fetchEvalHierarchy`,
397
+ `fetchBackendManifest`, and `fetchBackendManifestStatus` get their
398
+ implementations replaced — same export names, new sources.
399
+
400
+ ```ts
401
+ // lib/sidecars.ts
402
+ import "server-only"
403
+ import type {
404
+ CorpusAggregates,
405
+ EvalHierarchy,
406
+ BackendManifest,
407
+ } from "@/lib/backend-artifacts"
408
+
409
+ const SNAPSHOT_URL = process.env.SNAPSHOT_URL!
410
+
411
+ let cache: {
412
+ manifest?: Promise<BackendManifest>
413
+ headline?: Promise<CorpusAggregates>
414
+ hierarchy?: Promise<EvalHierarchy>
415
+ } = {}
416
+
417
+ async function fetchJson<T>(name: string): Promise<T> {
418
+ const url = `${SNAPSHOT_URL}/${name}`
419
+ const res = url.startsWith("file://")
420
+ ? await import("fs/promises").then((fs) => fs.readFile(new URL(url), "utf8"))
421
+ : await fetch(url, { next: { revalidate: 3600 } }).then((r) => r.text())
422
+ return JSON.parse(typeof res === "string" ? res : res.toString()) as T
423
+ }
424
+
425
+ export function fetchManifest(): Promise<BackendManifest> {
426
+ return (cache.manifest ??= fetchJson<BackendManifest>("manifest.json"))
427
+ }
428
+
429
+ export function fetchHeadline(): Promise<CorpusAggregates> {
430
+ return (cache.headline ??= fetchJson<CorpusAggregates>("headline.json"))
431
+ }
432
+
433
+ export function fetchHierarchy(): Promise<EvalHierarchy> {
434
+ return (cache.hierarchy ??= fetchJson<EvalHierarchy>("hierarchy.json"))
435
+ }
436
+ ```
437
+
438
+ Then in `lib/hf-data.ts`:
439
+
440
+ ```ts
441
+ // fetchBackendManifest: was a fetchHFJsonSafe call; now reads the snapshot sidecar.
442
+ export const fetchBackendManifest = fetchManifest
443
+ export const fetchCorpusAggregates = fetchHeadline
444
+ export const fetchEvalHierarchy = fetchHierarchy
445
+
446
+ // fetchBackendManifestStatus: simplified — single snapshot pin, no "latest" comparison.
447
+ export async function fetchBackendManifestStatus(): Promise<BackendManifestStatus> {
448
+ const m = await fetchManifest()
449
+ return {
450
+ currentManifest: m,
451
+ latestManifest: m, // no separate "latest" — snapshot is pinned
452
+ currentManifestSignature: m.generated_at,
453
+ latestManifestSignature: m.generated_at,
454
+ updateAvailable: false,
455
+ refreshing: false,
456
+ pendingRefreshCount: 0,
457
+ }
458
+ }
459
+ ```
460
+
461
+ ---
462
+
463
+ ## What deletes
464
+
465
+ After v2 is live, the following code is dead and can be removed in a
466
+ follow-up cleanup:
467
+
468
+ - `lib/duckdb-data.ts` — replaced by typed SELECTs split between
469
+ `lib/duckdb.ts` (connection) and `lib/data-backend.ts` (queries).
470
+ - The `payload_json` parser helpers (`parsePayload`, `readPayloads`,
471
+ `readPayloadById`, `assertDeveloperListShape`) — no JSON blobs to
472
+ parse.
473
+ - The `aggregate__` / `matrix__` eval-id prefix dispatch in
474
+ `getEvalSummaryByIdFromDuckDB` — the typed view is the only path.
475
+ - `lib/model-data.ts` — most of its functions exist to convert HF
476
+ JSON records into `BenchmarkEvaluation` / `EvaluationCardData`. Once
477
+ the producer emits those shapes directly, the adapter logic deletes.
478
+ Keep only the helpers that don't touch HF records (slug parsing,
479
+ display formatters).
480
+ - `lib/eval-processing.ts` — the `groupEvaluationsByModel`,
481
+ `createModelSummary`, `createBenchmarkEvalSummary`, and
482
+ `inferCategoryFromBenchmark` adapter functions are no longer called
483
+ in the data path. The exported types stay.
484
+ - `scripts/audit-adapters.mjs`, `scripts/dump-adapter-outputs.mts`,
485
+ `scripts/compare-data-backends.mjs`, `scripts/refresh-fixtures.mjs`,
486
+ `scripts/cache-hf-data.mjs` — adapter / parity-check tooling for the
487
+ legacy pipeline. Delete once v1 is retired.
488
+ - `data/models/`, `data/developers/`, `data/benchmarks.json`,
489
+ `data/models.json`, `data/developers.json` — bundled snapshots of
490
+ v1 output for fixture tests. Replace with v2 fixtures if needed.
491
+ - `LOCAL_PIPELINE_OUTPUT` env var, `duckdb/v1/` subpath conventions,
492
+ and the parity-emitter expectations documented in
493
+ `lib/duckdb-data.ts`'s preamble.
494
+ - `inferCategoryFromBenchmark` regex chain in
495
+ `lib/benchmark-schema.ts` — producer is the source of truth for
496
+ category. Keep the `EVALUATION_CATEGORIES` const + `CategoryType`
497
+ type; delete the inference function and `BENCHMARK_PRIORITY_RULES`.
498
+
499
+ ---
500
+
501
+ ## Slug rule
502
+
503
+ Producer emits all URL-bearing identifiers in
504
+ RFC 3986 percent-encoded form (`route_id`, `evaluation_id`,
505
+ `metric_summary_id`). Frontend treats them as opaque except for
506
+ `<Link>` href construction:
507
+
508
+ ```tsx
509
+ // Old: href={`/models/${model.route_id}`} // route_id was already escaped via __ rule
510
+ // New: href={`/models/${model.route_id}`} // same code; route_id is now percent-encoded
511
+ ```
512
+
513
+ No decoding happens inside the route handler; the encoded slug is passed straight through to the lookup:
514
+
515
+ ```ts
516
+ // app/models/[id]/page.tsx
517
+ export default async function ModelDetailPage({ params }: { params: { id: string } }) {
518
+ const summary = await getModelSummaryById(params.id) // pass encoded form straight through
519
+ ...
520
+ }
521
+ ```
522
+
523
+ `getModelSummaryById` looks up by `route_id = ?` directly without
524
+ decoding — the producer's `route_id` column matches the URL path
525
+ segment byte-for-byte. The legacy `replace('/', '__')` and
526
+ `replace(/\//g, ...)` helpers in `lib/utils.ts` and `lib/model-family.ts`
527
+ become dead code; remove them in the cleanup pass.
528
+
529
+ ---
530
+
531
+ ## Migration strategy
532
+
533
+ A feature flag gates v1 vs v2 during the transition:
534
+
535
+ ```ts
536
+ // lib/data-backend.ts
537
+ const BACKEND_VERSION = process.env.DATA_BACKEND ?? "v1"
538
+
539
+ export const getModelCards =
540
+ BACKEND_VERSION === "v2"
541
+ ? (await import("@/lib/duckdb")).getModelCards
542
+ : (await import("@/lib/duckdb-data")).getModelCardsFromDuckDB
543
+ // ... same pattern for other accessors
544
+ ```
545
+
546
+ Phase plan:
547
+
548
+ 1. **Producer ships Stage J.** `eval_card_backend` emits the six
549
+ v2 artifacts in `warehouse/<snapshot_id>/`. Existing canonical
550
+ parquets stay alongside.
551
+ 2. **Frontend lands `lib/duckdb.ts` + `lib/sidecars.ts`** behind the
552
+ `DATA_BACKEND=v2` flag. CI builds both backends; default stays v1.
553
+ 3. **Smoke test in dev with `DATA_BACKEND=v2`,
554
+ `SNAPSHOT_URL=file://...`.** Verify each page renders identical
555
+ bytes (modulo source-of-data labels). Where they diverge, file
556
+ producer issues — do not patch the frontend to paper over.
557
+ 4. **Flip the production default to v2.** Keep v1 path compilable but
558
+ unreachable. Monitor for a release.
559
+ 5. **Delete v1 path** (the "What deletes" list above).
560
+
561
+ The flag is intentionally process-wide, not per-accessor. Mixing
562
+ backends within one render produces inconsistent snapshots.
563
+
564
+ ---
565
+
566
+ ## What doesn't move
567
+
568
+ - **Instance-level data fetching** (`fetchInstanceLevelData` in
569
+ `lib/hf-data.ts`). Instance JSONL is referenced by URL in
570
+ `eval_results_view.instance_file_path`; the lazy-load stays. Pointer
571
+ shape on the row is unchanged from v1.
572
+ - **Benchmark card metadata** lives inside `evals_view.benchmark_card`
573
+ STRUCT now, not a separate `benchmark_card_*.json` per file. The
574
+ page reads it from the eval row directly. Adapter-style readers
575
+ (`fetchBenchmarkMetadataMap`) become a `SELECT benchmark_id, benchmark_card
576
+ FROM evals_view` aggregation if anything still calls them — most
577
+ callers should fold into `getEvalSummaryById`.
578
+ - **EvalCards annotations** (`evalcards.annotations`) live on
579
+ `eval_results_view.evalcards_annotations` per-row. The eval-detail
580
+ page reads them inline; no separate fetcher.
581
+
582
+ ---
583
+
584
+ ## Open questions / risks
585
+
586
+ - **httpfs cold-start latency.** First query against an HF-hosted
587
+ parquet pays a round trip per file. Mitigate by pre-registering all
588
+ three views at process start (above), so the first user query hits
589
+ warm metadata. Measure on the production HF Space; if too slow,
590
+ consider downloading the snapshot to local disk at container start
591
+ (~MB per snapshot).
592
+ - **Connection lifetime in serverless.** Vercel's serverless
593
+ runtime tears down the Node process per request; the
594
+ `connectionPromise` cache doesn't help. The HF Space deployment
595
+ (Docker, long-lived) is unaffected. If we ever target serverless,
596
+ switch to `duckdb-wasm` in the browser or a separate serving
597
+ process.
598
+ - **`aggregate_components[]` on `eval_results_view`.** This array is
599
+ the per-suite-component breakdown for rollup rows. For non-rollup
600
+ rows it's always empty. If suite rollups grow common, the storage
601
+ cost of trailing-empty arrays is non-trivial; consider splitting
602
+ into a dedicated parquet at that point.
603
+ - **Category drift.** Producer's `category_mapping.json` will lag real
604
+ benchmark tag changes. The mapping is producer-owned, so the
605
+ frontend can't patch around drift — this is a feature, not a bug,
606
+ but it requires operator discipline. Surface "uncategorised
607
+ benchmark count" in the producer's run summary and the home-page
608
+ manifest banner.
609
+ - **Type widening for `score_summary` etc.** The producer emits these
610
+ as DuckDB STRUCTs; the TS interface declares them as nested
611
+ `{ count, min, max, average }`. `runAndReadAll` returns nested
612
+ STRUCTs as plain JS objects, so the cast works — but if duckdb-node
613
+ changes its STRUCT serialisation, audit the `as` casts here. Add a
614
+ dev-only validator that runs `EvaluationCardData`'s shape check at
615
+ the row level on the first `getModelCards()` call after process
616
+ start.
notes/merge-cheatsheet-backend-v2.md ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Merge cheatsheet: pulling `main` into `feat/use-new-backend-data`
2
+
3
+ > Drafted 2026-05-04, before pulling. Companion to `backend-v2-migration.md`
4
+ > (which is the design doc). This file is just a per-file conflict guide.
5
+ >
6
+ > Branch: `feat/use-new-backend-data` (2 commits ahead of `main`:
7
+ > `7635aee` Integrate with test backend data, `bfce8f2` Drop
8
+ > input/output_modalities from MODEL_CARD_COLUMNS).
9
+
10
+ ## Triage at a glance
11
+
12
+ | File | Risk | Strategy |
13
+ |---|---|---|
14
+ | `lib/data-backend.ts` | **High** | Keep ours wholesale; re-port any new accessors main added |
15
+ | `lib/backend-artifacts.ts` | **High** | Keep our schema renames; reconcile any *new* main-side fields against producer output |
16
+ | `components/signals/corpus-dashboard.tsx` | **Med** | Keep main's UI structure; rewire data fields to v2 names |
17
+ | `components/signals/corpus-signals-strip.tsx` | **Med** | Same as above |
18
+ | `lib/hf-data.ts` | **Med** | Keep `useViewLayerBackend()` short-circuits at top of 5 fetchers |
19
+ | `Dockerfile` | **Med** | Keep our `DATA_BACKEND=v2` + `SNAPSHOT_URL` wiring; layer main's other changes on top |
20
+ | `lib/benchmark-schema.ts` | **Low** | Trivial 1-line addition (`num_few_shot?`) |
21
+ | `app/page.tsx` | **Low** | One-line copy change (`corpus-aggregates.json` → `headline.json`) |
22
+
23
+ New files (no conflict possible): `lib/view-data.ts`, `lib/duckdb.ts`,
24
+ `lib/sidecars.ts`, `tests/view-data.test.ts`,
25
+ `notes/backend-v2-migration.md`.
26
+
27
+ ---
28
+
29
+ ## `lib/data-backend.ts` — High
30
+
31
+ **What we did:** Replaced static re-exports from `lib/duckdb-data` with
32
+ a `BACKEND_VERSION` env-flag dispatcher. Each accessor now branches on
33
+ `useViewLayerBackend()` (true when `DATA_BACKEND=v2` or `stage-j`) and
34
+ lazy-imports either `@/lib/view-data` or `@/lib/duckdb-data`.
35
+ Manifest/hierarchy accessors branch between `@/lib/sidecars` and
36
+ `@/lib/hf-data`.
37
+
38
+ **Reconcile:**
39
+ - Conflict almost certain if main touched any export wiring here.
40
+ - **Keep our file as-is.** The dispatcher pattern is load-bearing.
41
+ - If main added a new accessor (e.g. `getFooBar`), add a new dispatcher
42
+ function following the same pattern — only the legacy branch needs
43
+ to be wired immediately; v2 branch can throw `Not implemented` until
44
+ `lib/view-data.ts` adds it.
45
+
46
+ ---
47
+
48
+ ## `lib/backend-artifacts.ts` — High
49
+
50
+ **What we did:** Renamed corpus-block fields to match what the v2
51
+ producer emits:
52
+
53
+ | Block | v1 (main) | v2 (ours) |
54
+ |---|---|---|
55
+ | Completeness | `total_benchmarks`, `completeness_score_mean`, `completeness_score_median`, `per_field_population{}` | `total_triples`, `completeness_avg`, `completeness_min`, `completeness_max` |
56
+ | Provenance | `multi_source_groups`, `multi_source_rate`, `first_party_only_groups`, `first_party_only_rate`, `total_groups` | `multi_source_triples`, `first_party_only_triples`, `total_triples` (rates dropped — derived in components via local `rate()` helper) |
57
+ | Comparability | `variant_eligible_groups`, `variant_divergent_groups`, `variant_divergence_rate`, `cross_party_eligible_groups`, `cross_party_divergent_groups`, `cross_party_divergence_rate`, `total_groups` | `total_triples`, `variant_divergent_count`, `cross_party_divergent_count`, `groups_with_variant_check`, `groups_with_cross_party_check` |
58
+
59
+ Also added: `DeveloperListEntry` interface, optional
60
+ `developers/families/categories` arrays on `CorpusAggregates`,
61
+ optional `eval_hierarchy` key in `BackendManifest.summary_artifacts`.
62
+
63
+ **Reconcile:**
64
+ - Producer is the source of truth for v2 field names — do **not** add
65
+ back v1 names to satisfy a main-side change. If main added a field
66
+ the v2 producer doesn't emit, either drop it or check
67
+ `eval_card_backend/notes/08-frontend-view-layer.md` first.
68
+ - Keep all three new optional sections on `CorpusAggregates`
69
+ (developers, families, categories) — they back the new
70
+ developer-list path.
71
+ - The `summary_artifacts.eval_hierarchy` key is additive; safe to keep
72
+ alongside whatever main added there.
73
+
74
+ ---
75
+
76
+ ## `components/signals/corpus-dashboard.tsx` — Medium
77
+
78
+ **What we did:** Mechanical rewrite of every field reference in this
79
+ file to use the v2 names from `lib/backend-artifacts.ts` (above).
80
+ Removed the `per_field_population` per-field grid and replaced it with
81
+ a `min / avg / max` MiniMetric trio. Added a local `rate(num, denom)`
82
+ helper (returns null if either side is null/zero) since v2 stores
83
+ counts, not pre-computed rates. Title-cased `CATEGORY_ORDER`
84
+ (`"Agentic"`, `"General"`, …) and made the keys-to-render set extend
85
+ gracefully to unknown categories.
86
+
87
+ **Reconcile:**
88
+ - If main touched this file for design/UX reasons, **prefer main's
89
+ visual structure** — but keep our field accessors. The recipe is:
90
+ - Anywhere main reads `multi_source_rate`, replace with `rate(prov.multi_source_triples, prov.total_triples)`.
91
+ - Anywhere main reads `completeness_score_mean`, replace with `comp.completeness_avg`.
92
+ - Anywhere main reads `*_eligible_groups` / `*_divergent_groups`, swap to `groups_with_*_check` / `*_divergent_count`.
93
+ - Drop any new code that reads `per_field_population` — gone in v2.
94
+ - Keep the local `rate()` helper at the bottom of the file.
95
+ - Category lookup must use the new title-cased keys (or stay tolerant
96
+ via the `available` set logic we added).
97
+
98
+ ---
99
+
100
+ ## `components/signals/corpus-signals-strip.tsx` — Medium
101
+
102
+ **What we did:** Same field renames as above, same local `rate()`
103
+ helper added. Headline copy updated from "groups" to "triples" where
104
+ the underlying unit changed.
105
+
106
+ **Reconcile:** Apply the same recipe as `corpus-dashboard.tsx`. The
107
+ two files share field names and the `rate()` helper.
108
+
109
+ ---
110
+
111
+ ## `lib/hf-data.ts` — Medium
112
+
113
+ **What we did:** Added an early-return guard at the top of five
114
+ functions:
115
+ - `fetchBackendManifestStatus` — synthesizes a status from the v2 manifest sidecar
116
+ - `fetchBenchmarkMetadataMap` — delegates to `view-data.getBenchmarkMetadataMap`
117
+ - `fetchBackendManifest` — delegates to `sidecars.fetchManifest`
118
+ - `fetchEvalHierarchy` — delegates to `sidecars.fetchHierarchy` (still wraps in `adaptEvalHierarchy`)
119
+ - `fetchCorpusAggregates` — delegates to `sidecars.fetchHeadline`
120
+
121
+ Plus a module-level `useViewLayerBackend()` helper and a lazy
122
+ `fetchSnapshotSidecars()` importer near the top of the file.
123
+
124
+ **Reconcile:**
125
+ - These are all additive guards at the start of existing functions —
126
+ conflicts are likely only if main re-shaped the same function
127
+ bodies.
128
+ - Pattern: `if (useViewLayerBackend()) { return <v2 path> }` then fall
129
+ through to the existing v1 implementation untouched.
130
+ - If main renamed one of these functions, port the guard into the
131
+ renamed version. Don't drop the guard.
132
+
133
+ ---
134
+
135
+ ## `Dockerfile` — Medium
136
+
137
+ **What we did:**
138
+ - Default `ARG DATA_BACKEND` flipped from `duckdb` → `v2` in **both**
139
+ stages (builder and runner).
140
+ - Added `ARG SNAPSHOT_URL` + `ENV SNAPSHOT_URL` in both stages,
141
+ defaulting to a pinned `evaleval/eval-cards-data` warehouse path.
142
+ - Comment block rewritten to reflect v2 + legacy coexistence.
143
+ - Kept legacy `LOCAL_PIPELINE_OUTPUT`, `HF_DATA_LOCAL_DIR`,
144
+ `HF_DATA_OFFLINE=1` envs intact (legacy backend still compilable).
145
+
146
+ **Uncommitted tweak (working tree):** `SNAPSHOT_URL` default points at
147
+ `j-chim/temp_evalcard_backend` instead of `evaleval/eval-cards-data` —
148
+ this is the dev/test dataset for the temp HF Space deploy. Do **not**
149
+ commit this override; revert before merging to main, or keep it only
150
+ on your local working copy.
151
+
152
+ **Reconcile:**
153
+ - Keep our `DATA_BACKEND=v2` default and `SNAPSHOT_URL` plumbing.
154
+ - Layer main's non-data changes (base image bumps, `pnpm` version,
155
+ build commands) on top.
156
+
157
+ ---
158
+
159
+ ## `lib/benchmark-schema.ts` — Low
160
+
161
+ **What we did:** Added one optional field, `num_few_shot?: number`, on
162
+ `GenerationConfig`. That's it.
163
+
164
+ **Reconcile:** Trivially additive. Keep our line; the merge tool should
165
+ handle it cleanly unless main touched the same struct.
166
+
167
+ ---
168
+
169
+ ## `app/page.tsx` — Low
170
+
171
+ **What we did:** One-line copy change in the empty-state banner —
172
+ `corpus-aggregates.json` → `headline.json` (the v2 sidecar name).
173
+
174
+ **Reconcile:** Trivial. Keep ours.
175
+
176
+ ---
177
+
178
+ ## Order of operations after `git pull`
179
+
180
+ 1. Resolve `lib/backend-artifacts.ts` first — it's the schema source
181
+ of truth that the components depend on.
182
+ 2. Resolve `lib/data-backend.ts` and `lib/hf-data.ts` — backend wiring.
183
+ 3. Resolve the two `components/signals/*` files using the rename recipe.
184
+ 4. Resolve `Dockerfile` — keep our v2 envs.
185
+ 5. `app/page.tsx` and `lib/benchmark-schema.ts` — should auto-merge or
186
+ be trivial.
187
+ 6. Run `pnpm tsc --noEmit` (or whatever the project's typecheck is) to
188
+ catch any v1 field references main introduced that didn't conflict
189
+ textually but break against our renamed types.
190
+ 7. Run `pnpm test` — `tests/view-data.test.ts` and
191
+ `tests/duckdb-data.test.ts` should both still pass.
192
+ 8. Smoke test with `DATA_BACKEND=v2 SNAPSHOT_URL=file://…` and again
193
+ with `DATA_BACKEND` unset (legacy path) — both must render.
scripts/cache-hf-data.mjs CHANGED
@@ -18,6 +18,13 @@ import { promisify } from "util"
18
  const root = path.resolve(new URL(import.meta.url).pathname, "..", "..")
19
  const cacheDir = path.join(root, ".cache", "hf-data")
20
  const publicDir = path.join(root, "public")
 
 
 
 
 
 
 
21
  const HF_DATASET_REPO = process.env.HF_DATASET_REPO?.trim()
22
  || "https://huggingface.co/datasets/evaleval/card_backend"
23
  const HF_RESOLVE_BASE = `${HF_DATASET_REPO}/resolve/main`
 
18
  const root = path.resolve(new URL(import.meta.url).pathname, "..", "..")
19
  const cacheDir = path.join(root, ".cache", "hf-data")
20
  const publicDir = path.join(root, "public")
21
+ const dataBackend = process.env.DATA_BACKEND?.trim().toLowerCase()
22
+ if (dataBackend === "v2" || dataBackend === "stage-j") {
23
+ await fs.mkdir(cacheDir, { recursive: true })
24
+ console.log("[cache-hf-data] DATA_BACKEND=v2: skipping legacy HF cache; runtime reads SNAPSHOT_URL")
25
+ process.exit(0)
26
+ }
27
+
28
  const HF_DATASET_REPO = process.env.HF_DATASET_REPO?.trim()
29
  || "https://huggingface.co/datasets/evaleval/card_backend"
30
  const HF_RESOLVE_BASE = `${HF_DATASET_REPO}/resolve/main`
tests/duckdb-data.test.ts CHANGED
@@ -12,27 +12,13 @@ function sqlString(value: string) {
12
  }
13
 
14
  async function writeParquetPayload(outputDir: string, fileName: string, payloads: unknown[]) {
15
- const parquetDir = path.join(outputDir, "experimental", "parquet")
16
  await mkdir(parquetDir, { recursive: true })
17
 
18
  const selects = payloads
19
- .map((payload, index) => {
20
- const record = payload as Record<string, unknown>
21
  const payloadJson = JSON.stringify(payload)
22
- return [
23
- `SELECT 'model_card_lite' AS record_type`,
24
- `${sqlString(String(record.model_route_id ?? index))} AS model_route_id`,
25
- `${sqlString(String(record.model_family_id ?? ""))} AS model_family_id`,
26
- `${sqlString(String(record.developer ?? ""))} AS developer`,
27
- `NULL AS eval_summary_id`,
28
- `NULL AS developer_route_id`,
29
- `NULL AS category`,
30
- `NULL AS benchmark_family_key`,
31
- `${Number(record.benchmark_family_count ?? 0)} AS models_count`,
32
- `${Number(record.total_evaluations ?? 0)} AS total_evaluations`,
33
- `${sqlString(String(record.last_updated ?? ""))} AS last_updated`,
34
- `${sqlString(payloadJson)} AS payload_json`,
35
- ].join(", ")
36
  })
37
  .join(" UNION ALL ")
38
 
@@ -49,22 +35,36 @@ describe("DuckDB local data backend", () => {
49
  process.env.LOCAL_PIPELINE_OUTPUT = outputDir
50
  await writeParquetPayload(outputDir, "model_cards_lite.parquet", [
51
  {
52
- model_family_id: "openai/gpt-5",
53
- model_route_id: "openai__gpt-5",
54
- model_family_name: "GPT 5",
55
- developer: "openai",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  params_billions: 100,
57
- total_evaluations: 3,
58
- benchmark_count: 2,
59
- benchmark_family_count: 2,
60
- categories_covered: ["reasoning"],
61
- last_updated: "2026-01-01T00:00:00Z",
62
- variants: [],
63
  score_summary: { count: 1, min: 0.7, max: 0.9, average: 0.8 },
64
  benchmark_names: ["mmlu"],
65
- top_benchmark_scores: [
66
  { benchmark: "mmlu", score: 0.9, metric: "accuracy" },
67
  ],
 
 
68
  },
69
  ])
70
 
@@ -93,7 +93,7 @@ describe("DuckDB local data backend", () => {
93
  try {
94
  process.env.LOCAL_PIPELINE_OUTPUT = outputDir
95
  await expect(getModelCardsLiteFromDuckDB()).rejects.toThrow(
96
- /EXPORT_EXPERIMENTAL_PARQUET=1/
97
  )
98
  } finally {
99
  if (previousOutput == null) {
 
12
  }
13
 
14
  async function writeParquetPayload(outputDir: string, fileName: string, payloads: unknown[]) {
15
+ const parquetDir = path.join(outputDir, "duckdb", "v1")
16
  await mkdir(parquetDir, { recursive: true })
17
 
18
  const selects = payloads
19
+ .map((payload) => {
 
20
  const payloadJson = JSON.stringify(payload)
21
+ return `SELECT ${sqlString(payloadJson)} AS payload_json`
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  })
23
  .join(" UNION ALL ")
24
 
 
35
  process.env.LOCAL_PIPELINE_OUTPUT = outputDir
36
  await writeParquetPayload(outputDir, "model_cards_lite.parquet", [
37
  {
38
+ id: "openai/gpt-5",
39
+ route_id: "openai__gpt-5",
40
+ model_name: "GPT 5",
41
+ model_id: "openai/gpt-5",
42
+ canonical_model_name: "GPT 5",
43
+ developer: "OpenAI",
44
+ evaluations_count: 3,
45
+ benchmarks_count: 2,
46
+ variant_count: 1,
47
+ categories: ["Reasoning"],
48
+ category_stats: { General: 0, Reasoning: 2, Agentic: 0, Safety: 0, Knowledge: 0 },
49
+ latest_timestamp: "2026-01-01T00:00:00Z",
50
+ evaluator_count: 1,
51
+ evaluator_names: ["OpenAI"],
52
+ source_type_count: 1,
53
+ source_types: ["documentation"],
54
+ evidence_count: 3,
55
+ missing_generation_config_count: 0,
56
+ third_party_eval_count: 0,
57
+ independent_verification_ratio: 0,
58
+ reproducibility_status: "complete",
59
+ eval_libraries: [],
60
  params_billions: 100,
 
 
 
 
 
 
61
  score_summary: { count: 1, min: 0.7, max: 0.9, average: 0.8 },
62
  benchmark_names: ["mmlu"],
63
+ top_scores: [
64
  { benchmark: "mmlu", score: 0.9, metric: "accuracy" },
65
  ],
66
+ source_urls: [],
67
+ detail_urls: [],
68
  },
69
  ])
70
 
 
93
  try {
94
  process.env.LOCAL_PIPELINE_OUTPUT = outputDir
95
  await expect(getModelCardsLiteFromDuckDB()).rejects.toThrow(
96
+ /duckdb\/v1\/model_cards_lite\.parquet/
97
  )
98
  } finally {
99
  if (previousOutput == null) {
tests/view-data.test.ts ADDED
@@ -0,0 +1,466 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { mkdir, mkdtemp, rm, writeFile } from "fs/promises"
2
+ import os from "os"
3
+ import path from "path"
4
+
5
+ import { DuckDBConnection } from "@duckdb/node-api"
6
+ import { describe, expect, it } from "vitest"
7
+
8
+ function sqlString(value: string) {
9
+ return `'${value.replace(/'/g, "''")}'`
10
+ }
11
+
12
+ async function copyParquet(connection: DuckDBConnection, sql: string, outputPath: string) {
13
+ await connection.run(`COPY (${sql}) TO ${sqlString(outputPath)} (FORMAT parquet)`)
14
+ }
15
+
16
+ async function writeSyntheticStageJSnapshot(snapshotDir: string) {
17
+ await mkdir(snapshotDir, { recursive: true })
18
+ const connection = await DuckDBConnection.create()
19
+
20
+ await copyParquet(
21
+ connection,
22
+ `
23
+ SELECT
24
+ TIMESTAMP '2026-05-03 00:00:00' AS snapshot_id,
25
+ 'openai/gpt-5' AS model_key,
26
+ 'openai/gpt-5' AS model_id,
27
+ 'openai/gpt-5' AS id,
28
+ 'openai%2Fgpt-5' AS route_id,
29
+ 'openai%2Fgpt-5' AS model_route_id,
30
+ 'openai/gpt-5' AS model_family_id,
31
+ 'GPT 5' AS model_name,
32
+ 'GPT 5' AS canonical_model_name,
33
+ 'GPT 5' AS model_family_name,
34
+ 'OpenAI' AS developer,
35
+ DATE '2026-01-01' AS release_date,
36
+ 'https://example.test/model' AS model_url,
37
+ 'transformer' AS architecture,
38
+ '100B' AS params,
39
+ 100.0 AS params_billions,
40
+ ['text']::VARCHAR[] AS input_modalities,
41
+ ['text']::VARCHAR[] AS output_modalities,
42
+ 'engine' AS inference_engine,
43
+ 'platform' AS inference_platform,
44
+ 1::BIGINT AS evaluations_count,
45
+ 1::BIGINT AS benchmarks_count,
46
+ 1::INTEGER AS variant_count,
47
+ 1::BIGINT AS evaluator_count,
48
+ ['OpenAI']::VARCHAR[] AS evaluator_names,
49
+ 1::INTEGER AS source_type_count,
50
+ ['documentation']::VARCHAR[] AS source_types,
51
+ 0::BIGINT AS third_party_eval_count,
52
+ 0.0 AS independent_verification_ratio,
53
+ 1::BIGINT AS evidence_count,
54
+ 0::INTEGER AS missing_generation_config_count,
55
+ TIMESTAMP '2026-05-03 00:00:00' AS latest_timestamp,
56
+ 'OpenAI' AS latest_source_name,
57
+ ['MMLU']::VARCHAR[] AS benchmark_names,
58
+ ['Reasoning']::VARCHAR[] AS categories,
59
+ struct_pack("General" := 0, "Reasoning" := 1, "Agentic" := 0, "Safety" := 0, "Knowledge" := 0) AS category_stats,
60
+ 'complete' AS reproducibility_status,
61
+ struct_pack(results_total := 1, has_reproducibility_gap_count := 0, populated_ratio_avg := 1.0) AS reproducibility_summary,
62
+ struct_pack(
63
+ total_results := 1,
64
+ total_groups := 1,
65
+ multi_source_groups := 0,
66
+ first_party_only_groups := 1,
67
+ source_type_distribution := struct_pack(first_party := 1, third_party := 0, collaborative := 0, unspecified := 0)
68
+ ) AS provenance_summary,
69
+ struct_pack(
70
+ total_groups := 1,
71
+ groups_with_variant_check := 0,
72
+ groups_with_cross_party_check := 0,
73
+ variant_divergent_count := 0,
74
+ cross_party_divergent_count := 0
75
+ ) AS comparability_summary,
76
+ [struct_pack(name := 'openai-evals', version := '1.0', fork := NULL::VARCHAR)] AS eval_libraries,
77
+ struct_pack(count := 1, min := 0.8, max := 0.8, average := 0.8) AS score_summary,
78
+ [struct_pack(benchmark := 'MMLU', benchmarkKey := 'mmlu', score := 0.8, metric := 'accuracy')] AS top_scores,
79
+ ['https://example.test/source']::VARCHAR[] AS source_urls,
80
+ []::VARCHAR[] AS detail_urls,
81
+ [struct_pack(
82
+ variant_id := 'default',
83
+ variant_key := 'default',
84
+ variant_label := 'Default',
85
+ variant_display_name := 'GPT 5',
86
+ raw_model_ids := ['openai/gpt-5']::VARCHAR[],
87
+ family_id := 'openai/gpt-5',
88
+ family_name := 'GPT 5',
89
+ version_date := NULL::VARCHAR,
90
+ version_qualifier := NULL::VARCHAR,
91
+ total_evaluations := 1,
92
+ last_updated := TIMESTAMP '2026-05-03 00:00:00',
93
+ categories_covered := ['Reasoning']::VARCHAR[]
94
+ )] AS variants,
95
+ ['openai/gpt-5']::VARCHAR[] AS raw_model_ids
96
+ `,
97
+ path.join(snapshotDir, "models_view.parquet")
98
+ )
99
+
100
+ await copyParquet(
101
+ connection,
102
+ `
103
+ SELECT
104
+ TIMESTAMP '2026-05-03 00:00:00' AS snapshot_id,
105
+ 'mmlu' AS evaluation_id,
106
+ 'mmlu' AS benchmark_id,
107
+ 'accuracy' AS primary_metric_id,
108
+ 'MMLU' AS evaluation_name,
109
+ 'MMLU' AS canonical_display_name,
110
+ 'mmlu' AS composite_benchmark_key,
111
+ 'MMLU' AS composite_benchmark_name,
112
+ 'mmlu' AS benchmark_family_key,
113
+ 'mmlu' AS benchmark_leaf_key,
114
+ 'Reasoning' AS category,
115
+ struct_pack(
116
+ evaluation_description := 'Accuracy on MMLU',
117
+ lower_is_better := false,
118
+ score_type := 'continuous',
119
+ min_score := 0.0,
120
+ max_score := 1.0,
121
+ unit := 'proportion'
122
+ ) AS metric_config,
123
+ 1::BIGINT AS models_count,
124
+ ['OpenAI']::VARCHAR[] AS evaluator_names,
125
+ ['documentation']::VARCHAR[] AS source_types,
126
+ 'OpenAI' AS latest_source_name,
127
+ 0.0 AS third_party_ratio,
128
+ 0::INTEGER AS missing_generation_config_count,
129
+ struct_pack(name := 'GPT 5', score := 0.8) AS best_model,
130
+ struct_pack(name := 'GPT 5', score := 0.8) AS worst_model,
131
+ 0.8 AS avg_score,
132
+ 0.8 AS avg_score_norm,
133
+ 0.8 AS top_score,
134
+ false AS has_card,
135
+ NULL AS benchmark_card,
136
+ false AS is_aggregated,
137
+ [] AS aggregate_sources,
138
+ false AS is_summary_score,
139
+ []::VARCHAR[] AS summary_eval_ids,
140
+ struct_pack(domains := ['knowledge']::VARCHAR[], languages := ['en']::VARCHAR[], tasks := ['qa']::VARCHAR[]) AS tags,
141
+ struct_pack(
142
+ dataset_name := 'MMLU',
143
+ source_type := 'documentation',
144
+ hf_repo := NULL::VARCHAR,
145
+ hf_split := NULL::VARCHAR,
146
+ samples_number := 10,
147
+ url := ['https://example.test/mmlu']::VARCHAR[],
148
+ dataset_url := 'https://example.test/mmlu',
149
+ dataset_version := 'v1'
150
+ ) AS source_data,
151
+ struct_pack(results_total := 1, has_reproducibility_gap_count := 0, populated_ratio_avg := 1.0) AS reproducibility_summary,
152
+ struct_pack(
153
+ total_results := 1,
154
+ total_groups := 1,
155
+ multi_source_groups := 0,
156
+ first_party_only_groups := 1,
157
+ source_type_distribution := struct_pack(first_party := 1, third_party := 0, collaborative := 0, unspecified := 0)
158
+ ) AS provenance_summary,
159
+ struct_pack(
160
+ total_groups := 1,
161
+ groups_with_variant_check := 0,
162
+ groups_with_cross_party_check := 0,
163
+ variant_divergent_count := 0,
164
+ cross_party_divergent_count := 0
165
+ ) AS comparability_summary,
166
+ struct_pack(available := false, url_count := 0::BIGINT, sample_urls := []::VARCHAR[], models_with_loaded_instances := 0) AS instance_data,
167
+ 1::INTEGER AS metrics_count,
168
+ ['Accuracy']::VARCHAR[] AS metric_names,
169
+ [struct_pack(
170
+ column_key := 'root:accuracy',
171
+ metric_summary_id := 'mmlu%3Aaccuracy',
172
+ metric_id := 'accuracy',
173
+ metric_name := 'accuracy',
174
+ display_name := 'Accuracy',
175
+ canonical_display_name := 'Accuracy',
176
+ lower_is_better := false,
177
+ unit := 'proportion',
178
+ scope := 'root',
179
+ subtask_key := NULL::VARCHAR,
180
+ subtask_name := NULL::VARCHAR
181
+ )] AS leaderboard_metrics,
182
+ [] AS leaderboard_rows,
183
+ [struct_pack(
184
+ metric_summary_id := 'mmlu%3Aaccuracy',
185
+ metric_name := 'accuracy',
186
+ display_name := 'Accuracy',
187
+ canonical_display_name := 'Accuracy',
188
+ metric_key := 'accuracy',
189
+ lower_is_better := false,
190
+ models_count := 1,
191
+ top_score := 0.8,
192
+ unit := 'proportion'
193
+ )] AS root_metrics,
194
+ [] AS subtasks,
195
+ 0::INTEGER AS subtasks_count
196
+ `,
197
+ path.join(snapshotDir, "evals_view.parquet")
198
+ )
199
+
200
+ await copyParquet(
201
+ connection,
202
+ `
203
+ SELECT
204
+ TIMESTAMP '2026-05-03 00:00:00' AS snapshot_id,
205
+ 'mmlu' AS evaluation_id,
206
+ 'mmlu%3Aaccuracy' AS metric_summary_id,
207
+ 'mmlu' AS benchmark_id,
208
+ 'accuracy' AS metric_id,
209
+ 'openai/gpt-5' AS model_key,
210
+ 'openai/gpt-5' AS model_id,
211
+ 'openai%2Fgpt-5' AS model_route_id,
212
+ struct_pack(
213
+ name := 'GPT 5',
214
+ id := 'openai/gpt-5',
215
+ developer := 'OpenAI',
216
+ inference_platform := 'platform',
217
+ inference_engine := 'engine',
218
+ model_version := NULL::VARCHAR,
219
+ architecture := 'transformer',
220
+ parameter_count := '100B',
221
+ release_date := '2026-01-01',
222
+ model_url := 'https://example.test/model',
223
+ modalities := struct_pack(input := ['text']::VARCHAR[], output := ['text']::VARCHAR[])
224
+ ) AS model_info,
225
+ 'Accuracy' AS metric_display_name,
226
+ 'proportion' AS metric_unit,
227
+ false AS lower_is_better,
228
+ 'Reasoning' AS category,
229
+ 0.8 AS score,
230
+ struct_pack(
231
+ score := 0.8,
232
+ standard_error := 0.01,
233
+ sample_size := 10,
234
+ confidence_interval := struct_pack(lower := 0.7, upper := 0.9, confidence_level := 0.95)
235
+ ) AS score_details,
236
+ 1::INTEGER AS fact_row_count,
237
+ 1::INTEGER AS position,
238
+ 1::INTEGER AS total,
239
+ 1.0 AS percentile,
240
+ TIMESTAMP '2026-05-03 00:00:00' AS evaluation_timestamp,
241
+ struct_pack(
242
+ source_name := 'OpenAI report',
243
+ source_type := 'documentation',
244
+ source_organization_name := 'OpenAI',
245
+ source_organization_url := 'https://example.test',
246
+ evaluator_relationship := 'first_party',
247
+ source_url := 'https://example.test/report',
248
+ publication_date := DATE '2026-05-03'
249
+ ) AS source_metadata,
250
+ struct_pack(
251
+ dataset_name := 'MMLU',
252
+ source_type := 'documentation',
253
+ hf_repo := NULL::VARCHAR,
254
+ hf_split := NULL::VARCHAR,
255
+ samples_number := 10,
256
+ url := ['https://example.test/mmlu']::VARCHAR[],
257
+ dataset_url := 'https://example.test/mmlu',
258
+ dataset_version := 'v1'
259
+ ) AS source_data,
260
+ 'https://example.test/record.json' AS source_record_url,
261
+ struct_pack(name := 'openai-evals', version := '1.0', fork := NULL::VARCHAR) AS eval_library,
262
+ ['first_party']::VARCHAR[] AS evaluator_relationships,
263
+ true AS has_first_party,
264
+ false AS has_third_party,
265
+ 'self' AS coverage_cell,
266
+ ['OpenAI']::VARCHAR[] AS reporting_orgs,
267
+ map(['OpenAI'], [0.8]) AS scores_by_organization,
268
+ false AS is_summary_score,
269
+ NULL::VARCHAR AS summary_score_for,
270
+ [] AS aggregate_components,
271
+ false AS has_reproducibility_gap,
272
+ 1.0 AS completeness_score,
273
+ false AS is_multi_source,
274
+ true AS first_party_only,
275
+ false AS has_variant_divergence,
276
+ false AS has_cross_party_divergence,
277
+ NULL AS evalcards_annotations,
278
+ NULL::VARCHAR AS instance_file_path,
279
+ NULL::VARCHAR AS instance_file_format,
280
+ 0::INTEGER AS instance_rows
281
+ `,
282
+ path.join(snapshotDir, "eval_results_view.parquet")
283
+ )
284
+
285
+ await writeFile(
286
+ path.join(snapshotDir, "manifest.json"),
287
+ JSON.stringify({
288
+ generated_at: "2026-05-03T00:00:00Z",
289
+ config_version: 2,
290
+ skipped_configs: [],
291
+ model_count: 1,
292
+ eval_count: 1,
293
+ metric_eval_count: 1,
294
+ source_config_count: 1,
295
+ skipped_config_count: 0,
296
+ summary_artifacts: {
297
+ corpus_aggregates: "headline.json",
298
+ eval_hierarchy: "hierarchy.json",
299
+ },
300
+ })
301
+ )
302
+
303
+ const reproducibilityBlock = {
304
+ total_triples: 1,
305
+ triples_with_reproducibility_gap: 0,
306
+ reproducibility_gap_rate: 0,
307
+ agentic_triples: 0,
308
+ per_field_missingness: {
309
+ temperature: {
310
+ missing_count: 0,
311
+ missing_rate: 0,
312
+ denominator: "all_triples",
313
+ denominator_count: 1,
314
+ },
315
+ },
316
+ }
317
+ const completenessBlock = {
318
+ total_triples: 1,
319
+ completeness_avg: 0.75,
320
+ completeness_min: 0.75,
321
+ completeness_max: 0.75,
322
+ }
323
+ const provenanceBlock = {
324
+ total_triples: 1,
325
+ multi_source_triples: 0,
326
+ first_party_only_triples: 1,
327
+ source_type_distribution: {
328
+ first_party: 1,
329
+ third_party: 0,
330
+ collaborative: 0,
331
+ unspecified: 0,
332
+ },
333
+ }
334
+ const comparabilityBlock = {
335
+ total_triples: 1,
336
+ variant_divergent_count: 0,
337
+ cross_party_divergent_count: 0,
338
+ groups_with_variant_check: 1,
339
+ groups_with_cross_party_check: 0,
340
+ }
341
+ await writeFile(
342
+ path.join(snapshotDir, "headline.json"),
343
+ JSON.stringify({
344
+ generated_at: "2026-05-03T00:00:00Z",
345
+ signal_version: "1.0",
346
+ stratification_dimensions: ["category"],
347
+ reproducibility: {
348
+ overall: reproducibilityBlock,
349
+ by_category: { Reasoning: reproducibilityBlock },
350
+ },
351
+ completeness: {
352
+ overall: completenessBlock,
353
+ by_category: { Reasoning: completenessBlock },
354
+ },
355
+ provenance: {
356
+ overall: provenanceBlock,
357
+ by_category: { Reasoning: provenanceBlock },
358
+ },
359
+ comparability: {
360
+ overall: comparabilityBlock,
361
+ by_category: { Reasoning: comparabilityBlock },
362
+ },
363
+ developers: [
364
+ {
365
+ developer: "OpenAI",
366
+ route_id: "OpenAI",
367
+ model_count: 1,
368
+ benchmark_count: 1,
369
+ evaluation_count: 1,
370
+ popular_evals: [{ benchmark: "MMLU", model_count: 1 }],
371
+ },
372
+ ],
373
+ })
374
+ )
375
+
376
+ await writeFile(
377
+ path.join(snapshotDir, "hierarchy.json"),
378
+ JSON.stringify({
379
+ stats: {
380
+ family_count: 1,
381
+ composite_count: 0,
382
+ standalone_benchmark_count: 1,
383
+ single_benchmark_count: 1,
384
+ slice_count: 0,
385
+ metric_count: 1,
386
+ metric_rows_scanned: 1,
387
+ },
388
+ families: [],
389
+ })
390
+ )
391
+ }
392
+
393
+ describe("Stage J view-layer backend", () => {
394
+ it("reads a pinned snapshot through the v2 accessors", async () => {
395
+ const snapshotDir = await mkdtemp(path.join(os.tmpdir(), "eval-card-stage-j-"))
396
+ const previousBackend = process.env.DATA_BACKEND
397
+ const previousSnapshotUrl = process.env.SNAPSHOT_URL
398
+
399
+ try {
400
+ await writeSyntheticStageJSnapshot(snapshotDir)
401
+ process.env.DATA_BACKEND = "v2"
402
+ process.env.SNAPSHOT_URL = `file://${snapshotDir}`
403
+
404
+ const dataBackend = await import("../lib/data-backend")
405
+ const hfData = await import("../lib/hf-data")
406
+
407
+ const [models, evalListData, modelSummary, evalSummary, developers, developerSummary, manifest, hierarchy, aggregates] =
408
+ await Promise.all([
409
+ dataBackend.getModelCardsLite(),
410
+ dataBackend.getEvalListLiteData(),
411
+ dataBackend.getModelSummaryById("openai%2Fgpt-5"),
412
+ dataBackend.getEvalSummaryById("mmlu"),
413
+ dataBackend.getDeveloperList(),
414
+ dataBackend.getDeveloperSummaryById("OpenAI"),
415
+ dataBackend.getBackendManifestData(),
416
+ dataBackend.getEvalHierarchyData(),
417
+ hfData.fetchCorpusAggregates(),
418
+ ])
419
+
420
+ expect(models[0]).toMatchObject({
421
+ route_id: "openai%2Fgpt-5",
422
+ model_name: "GPT 5",
423
+ evaluations_count: 1,
424
+ })
425
+ expect(evalListData).toMatchObject({
426
+ totalModels: 1,
427
+ evals: [{ evaluation_id: "mmlu", evaluation_name: "MMLU", models_count: 1 }],
428
+ })
429
+ expect(modelSummary?.evaluations_by_category.Reasoning).toHaveLength(1)
430
+ expect(evalSummary?.model_results[0]).toMatchObject({
431
+ model_route_id: "openai%2Fgpt-5",
432
+ score: 0.8,
433
+ result: { metric_summary_id: "mmlu%3Aaccuracy" },
434
+ })
435
+ expect(developers[0]).toMatchObject({ developer: "OpenAI", route_id: "OpenAI" })
436
+ expect(developerSummary?.models).toHaveLength(1)
437
+ expect(manifest.model_count).toBe(1)
438
+ expect(hierarchy.stats?.metric_rows_scanned).toBe(1)
439
+ expect(aggregates?.completeness.overall).toMatchObject({
440
+ total_triples: 1,
441
+ completeness_avg: 0.75,
442
+ })
443
+ expect(aggregates?.provenance.overall).toMatchObject({
444
+ total_triples: 1,
445
+ first_party_only_triples: 1,
446
+ })
447
+ expect(aggregates?.comparability.overall).toMatchObject({
448
+ groups_with_variant_check: 1,
449
+ variant_divergent_count: 0,
450
+ })
451
+ expect(aggregates?.comparability.by_category.Reasoning).toBeDefined()
452
+ } finally {
453
+ if (previousBackend == null) {
454
+ delete process.env.DATA_BACKEND
455
+ } else {
456
+ process.env.DATA_BACKEND = previousBackend
457
+ }
458
+ if (previousSnapshotUrl == null) {
459
+ delete process.env.SNAPSHOT_URL
460
+ } else {
461
+ process.env.SNAPSHOT_URL = previousSnapshotUrl
462
+ }
463
+ await rm(snapshotDir, { recursive: true, force: true })
464
+ }
465
+ })
466
+ })