Spaces:

evaleval
/

general-eval-card

Running

App Files Files Community

evijit HF Staff

j-chim committed 26 days ago

Commit

fe99ffa

1 Parent(s): 32864b0

Swap backend data (#3)

Browse files

- Integrate with test backend data (7635aee64606c5b9138e680b833ca1383b570887)
- Drop input_modalities/output_modalities from MODEL_CARD_COLUMNS (bfce8f214eed3054b820176601eaa0a23e31bee7)
- Merge remote-tracking branch 'origin/main' into feat/use-new-backend-data (25ba6d010ff24b92f252f8fe11a6624f68aa6690)
- Use model_key as the addressable identifier and wire comparison-index sidecar (0e529dce5eb243708739730f1fcec00d27202d71)

Co-authored-by: Jenny Chim <j-chim@users.noreply.huggingface.co>

Files changed (16) hide show

Dockerfile +17 -17
app/page.tsx +1 -1
components/signals/corpus-dashboard.tsx +61 -42
components/signals/corpus-signals-strip.tsx +21 -11
lib/backend-artifacts.ts +33 -21
lib/benchmark-schema.ts +1 -0
lib/data-backend.ts +133 -42
lib/duckdb.ts +46 -0
lib/hf-data.ts +42 -0
lib/sidecars.ts +65 -0
lib/view-data.ts +576 -0
notes/backend-v2-migration.md +616 -0
notes/merge-cheatsheet-backend-v2.md +193 -0
scripts/cache-hf-data.mjs +7 -0
tests/duckdb-data.test.ts +29 -29
tests/view-data.test.ts +466 -0

Dockerfile CHANGED Viewed

@@ -9,20 +9,18 @@ ARG PNPM_VERSION=10.25.0
 # Build-time data-source configuration. HF Spaces "Variables" are NOT injected
 # into Docker RUN steps automatically — only into the final runtime — so we
-# bake the DuckDB-mode defaults here. `cache-hf-data.mjs` reads these to know
-# which dataset to clone and to apply lean cache mode (skip JSON-fallback
-# artifacts). Override at build time via `--build-arg HF_DATASET_REPO=...`.
-ARG DATA_BACKEND=duckdb
 ARG HF_DATASET_REPO=https://huggingface.co/datasets/evaleval/card_backend
-# Static prerender (`next build`) executes route handlers, which call
-# `getModelCards` etc. → `lib/duckdb-data.ts`, which requires
-# `LOCAL_PIPELINE_OUTPUT`. The cache populated by `cache-hf-data.mjs`
-# lives at `/app/.cache/hf-data`. `HF_DATA_OFFLINE=1` keeps the metadata
-# fetchers (`lib/hf-data.ts`) from attempting `evaleval/card_backend`
-# network reads with `revalidate: 0` (which Next 15 treats as dynamic
-# and fails the static export of `/`).
 ENV DATA_BACKEND=${DATA_BACKEND} \
     HF_DATASET_REPO=${HF_DATASET_REPO} \
     LOCAL_PIPELINE_OUTPUT=/app/.cache/hf-data \
     HF_DATA_LOCAL_DIR=/app/.cache/hf-data \
     HF_DATA_OFFLINE=1
@@ -49,13 +47,15 @@ RUN pnpm run build
 FROM node:18-bullseye-slim AS runner
 WORKDIR /app
-# Runtime needs the same DuckDB-mode envs that the builder used. HF Space
-# Variables aren't set on this Space, and Docker multi-stage doesn't carry
-# ENVs across stages — without these, lib/duckdb-data.ts throws
-# "DATA_BACKEND=duckdb requires LOCAL_PIPELINE_OUTPUT" at request time and
-# every model/eval/developer endpoint returns empty.
 ENV NODE_ENV=production \
-    DATA_BACKEND=duckdb \
     LOCAL_PIPELINE_OUTPUT=/app/.cache/hf-data \
     HF_DATA_LOCAL_DIR=/app/.cache/hf-data \
     HF_DATA_OFFLINE=1

 # Build-time data-source configuration. HF Spaces "Variables" are NOT injected
 # into Docker RUN steps automatically — only into the final runtime — so we
+# bake the selected backend here. `DATA_BACKEND=v2` reads `SNAPSHOT_URL`
+# directly; legacy DuckDB mode still clones `HF_DATASET_REPO` into the cache.
+# Override at build time via `--build-arg ...`.
+ARG DATA_BACKEND=v2
 ARG HF_DATASET_REPO=https://huggingface.co/datasets/evaleval/card_backend
+ARG SNAPSHOT_URL=https://huggingface.co/datasets/j-chim/temp_evalcard_backend/resolve/main/warehouse/2026-05-03T21-46-50Z
+# Static prerender (`next build`) executes route handlers. In legacy mode the
+# cache populated by `cache-hf-data.mjs` lives at `/app/.cache/hf-data`; in v2
+# the cache step is skipped and the app reads the pinned Stage J snapshot.
 ENV DATA_BACKEND=${DATA_BACKEND} \
     HF_DATASET_REPO=${HF_DATASET_REPO} \
+    SNAPSHOT_URL=${SNAPSHOT_URL} \
     LOCAL_PIPELINE_OUTPUT=/app/.cache/hf-data \
     HF_DATA_LOCAL_DIR=/app/.cache/hf-data \
     HF_DATA_OFFLINE=1
 FROM node:18-bullseye-slim AS runner
 WORKDIR /app
+ARG DATA_BACKEND=v2
+ARG SNAPSHOT_URL=https://huggingface.co/datasets/j-chim/temp_evalcard_backend/resolve/main/warehouse/2026-05-03T21-46-50Z
+# Runtime needs the same data-source envs that the builder used. Docker
+# multi-stage doesn't carry ENVs across stages, so keep backend selection and
+# snapshot/cache pointers explicit here too.
 ENV NODE_ENV=production \
+    DATA_BACKEND=${DATA_BACKEND} \
+    SNAPSHOT_URL=${SNAPSHOT_URL} \
     LOCAL_PIPELINE_OUTPUT=/app/.cache/hf-data \
     HF_DATA_LOCAL_DIR=/app/.cache/hf-data \
     HF_DATA_OFFLINE=1

app/page.tsx CHANGED Viewed

@@ -244,7 +244,7 @@ export default async function HomePage() {
               <p className="mx-auto mt-2 max-w-2xl text-sm leading-6 text-[color:var(--fg-muted)]">
                 The current backend snapshot does not include{" "}
                 <code className="rounded-sm bg-[color:var(--bg-surface)] px-1.5 py-0.5 font-mono text-xs">
-                  corpus-aggregates.json
                 </code>
                 . When it does, this section will render the four corpus-level rollups.
               </p>

               <p className="mx-auto mt-2 max-w-2xl text-sm leading-6 text-[color:var(--fg-muted)]">
                 The current backend snapshot does not include{" "}
                 <code className="rounded-sm bg-[color:var(--bg-surface)] px-1.5 py-0.5 font-mono text-xs">
+                  headline.json
                 </code>
                 . When it does, this section will render the four corpus-level rollups.
               </p>

components/signals/corpus-dashboard.tsx CHANGED Viewed

@@ -20,7 +20,7 @@ import {
   formatPercent,
 } from "./signal-utils"
-const CATEGORY_ORDER = ["agentic", "general", "knowledge", "reasoning", "safety", "other"]
 const SOURCE_COLORS: Record<string, string> = {
   first_party: "bg-amber-500",
@@ -51,13 +51,21 @@ export function CorpusDashboard({
   }, [mode])
   const categoryKeys = useMemo(
-    () =>
-      CATEGORY_ORDER.filter((category) =>
-        aggregates.reproducibility.by_category[category] ||
-        aggregates.completeness.by_category[category] ||
-        aggregates.provenance.by_category[category] ||
-        aggregates.comparability.by_category[category]
-      ),
     [aggregates]
   )
@@ -190,25 +198,14 @@ function CompletenessSection({
       icon={<ClipboardCheck className="h-5 w-5" />}
       title="Reporting Completeness"
       subtitle="How much benchmark documentation is populated."
-      headline={formatPercent(block.completeness_score_mean)}
-      headlineLabel={`Median ${formatPercent(block.completeness_score_median)} across ${block.total_benchmarks.toLocaleString()} benchmarks`}
     >
       {scores.length > 0 && <Histogram scores={scores} />}
-      <div className="mt-4 grid gap-2">
-        {Object.entries(block.per_field_population).slice(0, 10).map(([field, value]) => (
-          <div key={field} className="rounded-xl border border-border/60 bg-background px-3 py-2">
-            <div className="flex items-start justify-between gap-3 text-sm">
-              <span className="font-medium">{formatFieldLabel(field)}</span>
-              <span className="shrink-0 tabular-nums text-muted-foreground">
-                {formatPercent(value.mean_score)}
-              </span>
-            </div>
-            <div className="mt-2 grid gap-1.5">
-              <MetricBar label="Any data" value={value.populated_rate} compact />
-              <MetricBar label="Fully populated" value={value.fully_populated_rate} compact />
-            </div>
-          </div>
-        ))}
       </div>
     </DashboardSection>
   )
@@ -217,14 +214,16 @@ function CompletenessSection({
 function ProvenanceSection({ block }: { block: ProvenanceCorpusBlock }) {
   const distribution = block.source_type_distribution
   const total = Object.values(distribution).reduce((sum, value) => sum + value, 0)
   return (
     <DashboardSection
       icon={<BarChart3 className="h-5 w-5" />}
       title="Provenance"
       subtitle="Who reported the scores, and whether groups have multiple sources."
-      headline={formatPercent(block.multi_source_rate)}
-      headlineLabel="of (model, benchmark, metric) groups have multiple reporting sources"
     >
       <div className="overflow-hidden rounded-full border border-border/70 bg-muted/30">
         <div className="flex h-4 w-full">
@@ -240,34 +239,40 @@ function ProvenanceSection({ block }: { block: ProvenanceCorpusBlock }) {
       </div>
       <div className="mt-3 grid gap-2 sm:grid-cols-2">
-        <RatioTile label="Multi-source groups" value={block.multi_source_rate} count={block.multi_source_groups} />
-        <RatioTile label="First-party only groups" value={block.first_party_only_rate} count={block.first_party_only_groups} />
       </div>
     </DashboardSection>
   )
 }
 function ComparabilitySection({ block }: { block: ComparabilityCorpusBlock }) {
   return (
     <DashboardSection
       icon={<GitCompareArrows className="h-5 w-5" />}
       title="Comparability"
       subtitle="Eligible groups where scores diverge across setups or reporting organizations."
-      headline={formatNullableRate(block.variant_divergence_rate)}
-      headlineLabel={`${block.variant_divergent_groups.toLocaleString()} of ${block.variant_eligible_groups.toLocaleString()} setup-eligible groups diverge`}
     >
       <div className="grid gap-3 md:grid-cols-2">
         <ComparabilityRateCard
           title="Variant divergence"
-          rate={block.variant_divergence_rate}
-          eligible={block.variant_eligible_groups}
-          divergent={block.variant_divergent_groups}
         />
         <ComparabilityRateCard
           title="Cross-party divergence"
-          rate={block.cross_party_divergence_rate}
-          eligible={block.cross_party_eligible_groups}
-          divergent={block.cross_party_divergent_groups}
         />
       </div>
     </DashboardSection>
@@ -288,6 +293,15 @@ function CategoryPanel({
   comparability?: ComparabilityCorpusBlock
 }) {
   const categoryLabel = `${category.charAt(0).toUpperCase()}${category.slice(1)}`
   return (
     <section className="rounded-2xl border border-border/70 bg-card p-4 shadow-sm">
@@ -297,11 +311,11 @@ function CategoryPanel({
       </div>
       <div className="grid gap-3 sm:grid-cols-2">
         <MiniMetric label="Reproducibility gaps" value={formatPercent(reproducibility?.reproducibility_gap_rate)} />
-        <MiniMetric label="Documentation mean" value={formatPercent(completeness?.completeness_score_mean)} />
-        <MiniMetric label="Multi-source groups" value={formatPercent(provenance?.multi_source_rate)} />
-        <MiniMetric label="Variant divergence" value={formatNullableRate(comparability?.variant_divergence_rate)} />
       </div>
-      {comparability?.cross_party_divergence_rate == null && (
         <div className="mt-3 rounded-xl border border-dashed border-border/70 bg-muted/10 px-3 py-2 text-sm text-muted-foreground">
           Cross-party divergence: N/A - not enough multi-org coverage.
         </div>
@@ -411,7 +425,7 @@ function RatioTile({ label, value, count }: { label: string; value: number | nul
       <div className="text-sm font-medium">{label}</div>
       <div className="mt-1 flex items-baseline justify-between gap-2">
         <span className="text-xl font-semibold tabular-nums">{formatPercent(value)}</span>
-        <span className="text-xs text-muted-foreground">{count.toLocaleString()} groups</span>
       </div>
     </div>
   )
@@ -463,6 +477,11 @@ function formatNullableRate(value: number | null | undefined) {
   return value == null ? "N/A" : formatPercent(value)
 }
 function formatGeneratedDate(value: string) {
   const date = new Date(value)
   if (Number.isNaN(date.getTime())) {

   formatPercent,
 } from "./signal-utils"
+const CATEGORY_ORDER = ["Agentic", "General", "Knowledge", "Reasoning", "Safety", "Other"]
 const SOURCE_COLORS: Record<string, string> = {
   first_party: "bg-amber-500",
   }, [mode])
   const categoryKeys = useMemo(
+    () => {
+      const available = new Set([
+        ...Object.keys(aggregates.reproducibility.by_category),
+        ...Object.keys(aggregates.completeness.by_category),
+        ...Object.keys(aggregates.provenance.by_category),
+        ...Object.keys(aggregates.comparability.by_category),
+      ])
+      return [
+        ...CATEGORY_ORDER.filter((category) => available.has(category)),
+        ...Array.from(available)
+          .filter((category) => !CATEGORY_ORDER.includes(category))
+          .sort((a, b) => a.localeCompare(b)),
+      ]
+    },
     [aggregates]
   )
       icon={<ClipboardCheck className="h-5 w-5" />}
       title="Reporting Completeness"
       subtitle="How much benchmark documentation is populated."
+      headline={formatPercent(block.completeness_avg)}
+      headlineLabel={`Range ${formatPercent(block.completeness_min)} to ${formatPercent(block.completeness_max)} across ${block.total_triples.toLocaleString()} reported score triples`}
     >
       {scores.length > 0 && <Histogram scores={scores} />}
+      <div className="mt-4 grid gap-2 sm:grid-cols-3">
+        <MiniMetric label="Minimum" value={formatPercent(block.completeness_min)} />
+        <MiniMetric label="Average" value={formatPercent(block.completeness_avg)} />
+        <MiniMetric label="Maximum" value={formatPercent(block.completeness_max)} />
       </div>
     </DashboardSection>
   )
 function ProvenanceSection({ block }: { block: ProvenanceCorpusBlock }) {
   const distribution = block.source_type_distribution
   const total = Object.values(distribution).reduce((sum, value) => sum + value, 0)
+  const multiSourceRate = rate(block.multi_source_triples, block.total_triples)
+  const firstPartyOnlyRate = rate(block.first_party_only_triples, block.total_triples)
   return (
     <DashboardSection
       icon={<BarChart3 className="h-5 w-5" />}
       title="Provenance"
       subtitle="Who reported the scores, and whether groups have multiple sources."
+      headline={formatPercent(multiSourceRate)}
+      headlineLabel="of reported score triples have multiple reporting sources"
     >
       <div className="overflow-hidden rounded-full border border-border/70 bg-muted/30">
         <div className="flex h-4 w-full">
       </div>
       <div className="mt-3 grid gap-2 sm:grid-cols-2">
+        <RatioTile label="Multi-source triples" value={multiSourceRate} count={block.multi_source_triples} />
+        <RatioTile label="First-party only triples" value={firstPartyOnlyRate} count={block.first_party_only_triples} />
       </div>
     </DashboardSection>
   )
 }
 function ComparabilitySection({ block }: { block: ComparabilityCorpusBlock }) {
+  const variantRate = rate(block.variant_divergent_count, block.groups_with_variant_check)
+  const crossPartyRate = rate(
+    block.cross_party_divergent_count,
+    block.groups_with_cross_party_check
+  )
   return (
     <DashboardSection
       icon={<GitCompareArrows className="h-5 w-5" />}
       title="Comparability"
       subtitle="Eligible groups where scores diverge across setups or reporting organizations."
+      headline={formatNullableRate(variantRate)}
+      headlineLabel={`${block.variant_divergent_count.toLocaleString()} of ${block.groups_with_variant_check.toLocaleString()} setup-eligible groups diverge`}
     >
       <div className="grid gap-3 md:grid-cols-2">
         <ComparabilityRateCard
           title="Variant divergence"
+          rate={variantRate}
+          eligible={block.groups_with_variant_check}
+          divergent={block.variant_divergent_count}
         />
         <ComparabilityRateCard
           title="Cross-party divergence"
+          rate={crossPartyRate}
+          eligible={block.groups_with_cross_party_check}
+          divergent={block.cross_party_divergent_count}
         />
       </div>
     </DashboardSection>
   comparability?: ComparabilityCorpusBlock
 }) {
   const categoryLabel = `${category.charAt(0).toUpperCase()}${category.slice(1)}`
+  const multiSourceRate = rate(provenance?.multi_source_triples, provenance?.total_triples)
+  const variantRate = rate(
+    comparability?.variant_divergent_count,
+    comparability?.groups_with_variant_check
+  )
+  const crossPartyRate = rate(
+    comparability?.cross_party_divergent_count,
+    comparability?.groups_with_cross_party_check
+  )
   return (
     <section className="rounded-2xl border border-border/70 bg-card p-4 shadow-sm">
       </div>
       <div className="grid gap-3 sm:grid-cols-2">
         <MiniMetric label="Reproducibility gaps" value={formatPercent(reproducibility?.reproducibility_gap_rate)} />
+        <MiniMetric label="Documentation mean" value={formatPercent(completeness?.completeness_avg)} />
+        <MiniMetric label="Multi-source triples" value={formatPercent(multiSourceRate)} />
+        <MiniMetric label="Variant divergence" value={formatNullableRate(variantRate)} />
       </div>
+      {crossPartyRate == null && (
         <div className="mt-3 rounded-xl border border-dashed border-border/70 bg-muted/10 px-3 py-2 text-sm text-muted-foreground">
           Cross-party divergence: N/A - not enough multi-org coverage.
         </div>
       <div className="text-sm font-medium">{label}</div>
       <div className="mt-1 flex items-baseline justify-between gap-2">
         <span className="text-xl font-semibold tabular-nums">{formatPercent(value)}</span>
+        <span className="text-xs text-muted-foreground">{count.toLocaleString()} triples</span>
       </div>
     </div>
   )
   return value == null ? "N/A" : formatPercent(value)
 }
+function rate(numerator: number | null | undefined, denominator: number | null | undefined) {
+  if (numerator == null || denominator == null || denominator <= 0) return null
+  return numerator / denominator
+}
 function formatGeneratedDate(value: string) {
   const date = new Date(value)
   if (Number.isNaN(date.getTime())) {

components/signals/corpus-signals-strip.tsx CHANGED Viewed

@@ -39,8 +39,13 @@ export function CorpusSignalsStrip({
   const tpShare = totalReports > 0 ? prov.source_type_distribution.third_party / totalReports : 0
   const fpShare = totalReports > 0 ? prov.source_type_distribution.first_party / totalReports : 0
-  const cmpRate = cmp.variant_divergence_rate
-  const crossPartyAvailable = cmp.cross_party_eligible_groups > 0
   return (
     <div className="signals-grid">
@@ -58,29 +63,29 @@ export function CorpusSignalsStrip({
       />
       <SignalTile
         id="completeness"
-        statValue={pctNum(comp.completeness_score_mean)}
         statUnit="%"
-        headline={`mean across ${comp.total_benchmarks.toLocaleString()} benchmarks (median ${formatPct(comp.completeness_score_median)}).`}
-        detail="Source-provenance fields populate fully; preregistration fields are unmet."
         asks="Is the benchmark itself documented well enough to interpret a score on it?"
       />
       <SignalTile
         id="provenance"
-        statValue={pctNum(prov.multi_source_rate)}
         statUnit="%"
-        headline="of (model, benchmark) groups have reports from more than one party."
-        detail={`${formatPct(tpShare)} third-party, ${formatPct(fpShare)} first-party of ${totalReports.toLocaleString()} results.`}
         asks="Who reported this score, and have others reproduced it?"
       />
       <SignalTile
         id="comparability"
         statValue={pctNum(cmpRate)}
         statUnit="%"
-        headline={`of setup-eligible groups diverge across variants (${cmp.variant_divergent_groups.toLocaleString()} of ${cmp.variant_eligible_groups.toLocaleString()}).`}
         detail={
           crossPartyAvailable
-            ? `Cross-party divergence: ${formatPct(cmp.cross_party_divergence_rate)}.`
-            : "Cross-party divergence not yet computable — too few multi-org reports."
         }
         asks="Are scores on the same benchmark actually measuring the same thing?"
       />
@@ -154,6 +159,11 @@ function formatPct(value: number | null | undefined): string {
   return `${Math.round(value * 100)}%`
 }
 const FIELD_LABELS: Record<string, string> = {
   temperature: "temperature",
   max_tokens: "max tokens",

   const tpShare = totalReports > 0 ? prov.source_type_distribution.third_party / totalReports : 0
   const fpShare = totalReports > 0 ? prov.source_type_distribution.first_party / totalReports : 0
+  const multiSourceRate = rate(prov.multi_source_triples, prov.total_triples)
+  const cmpRate = rate(cmp.variant_divergent_count, cmp.groups_with_variant_check)
+  const crossPartyRate = rate(
+    cmp.cross_party_divergent_count,
+    cmp.groups_with_cross_party_check
+  )
+  const crossPartyAvailable = cmp.groups_with_cross_party_check > 0
   return (
     <div className="signals-grid">
       />
       <SignalTile
         id="completeness"
+        statValue={pctNum(comp.completeness_avg)}
         statUnit="%"
+        headline={`mean across ${comp.total_triples.toLocaleString()} reported score triples.`}
+        detail={`Observed range: ${formatPct(comp.completeness_min)} to ${formatPct(comp.completeness_max)}.`}
         asks="Is the benchmark itself documented well enough to interpret a score on it?"
       />
       <SignalTile
         id="provenance"
+        statValue={pctNum(multiSourceRate)}
         statUnit="%"
+        headline="of reported score triples have reports from more than one party."
+        detail={`${formatPct(tpShare)} third-party, ${formatPct(fpShare)} first-party of ${totalReports.toLocaleString()} triples.`}
         asks="Who reported this score, and have others reproduced it?"
       />
       <SignalTile
         id="comparability"
         statValue={pctNum(cmpRate)}
         statUnit="%"
+        headline={`of setup-eligible groups diverge across variants (${cmp.variant_divergent_count.toLocaleString()} of ${cmp.groups_with_variant_check.toLocaleString()}).`}
         detail={
           crossPartyAvailable
+            ? `Cross-party divergence: ${formatPct(crossPartyRate)}.`
+            : "Cross-party divergence not yet computable: too few multi-org reports."
         }
         asks="Are scores on the same benchmark actually measuring the same thing?"
       />
   return `${Math.round(value * 100)}%`
 }
+function rate(numerator: number | null | undefined, denominator: number | null | undefined) {
+  if (numerator == null || denominator == null || denominator <= 0) return null
+  return numerator / denominator
+}
 const FIELD_LABELS: Record<string, string> = {
   temperature: "temperature",
   max_tokens: "max tokens",

lib/backend-artifacts.ts CHANGED Viewed

@@ -12,6 +12,7 @@ export interface BackendManifest {
   skipped_config_count?: number
   summary_artifacts?: {
     corpus_aggregates?: string
     [key: string]: string | undefined
   }
 }
@@ -177,6 +178,27 @@ export interface CorpusAggregates {
   completeness: Stratified<CompletenessCorpusBlock>
   provenance: Stratified<ProvenanceCorpusBlock>
   comparability: Stratified<ComparabilityCorpusBlock>
 }
 export interface Stratified<T> {
@@ -198,35 +220,25 @@ export interface ReproducibilityCorpusBlock {
 }
 export interface CompletenessCorpusBlock {
-  total_benchmarks: number
-  completeness_score_mean: number | null
-  completeness_score_median: number | null
-  per_field_population: Record<string, {
-    mean_score: number
-    populated_rate: number
-    fully_populated_rate: number
-    benchmark_count: number
-  }>
 }
 export interface ProvenanceCorpusBlock {
   total_triples: number
-  total_groups: number
-  multi_source_groups: number
-  multi_source_rate: number | null
-  first_party_only_groups: number
-  first_party_only_rate: number | null
   source_type_distribution: Record<ProvenanceSourceType, number>
 }
 export interface ComparabilityCorpusBlock {
-  total_groups: number
-  variant_eligible_groups: number
-  variant_divergent_groups: number
-  variant_divergence_rate: number | null
-  cross_party_eligible_groups: number
-  cross_party_divergent_groups: number
-  cross_party_divergence_rate: number | null
 }
 export interface HierarchyTags {

   skipped_config_count?: number
   summary_artifacts?: {
     corpus_aggregates?: string
+    eval_hierarchy?: string
     [key: string]: string | undefined
   }
 }
   completeness: Stratified<CompletenessCorpusBlock>
   provenance: Stratified<ProvenanceCorpusBlock>
   comparability: Stratified<ComparabilityCorpusBlock>
+  developers?: DeveloperListEntry[]
+  families?: Array<{
+    family_key: string
+    display_name: string
+    model_count: number
+    eval_count: number
+  }>
+  categories?: Array<{
+    category: string
+    model_count: number
+    eval_count: number
+  }>
+}
+export interface DeveloperListEntry {
+  developer: string
+  route_id: string
+  model_count: number
+  benchmark_count: number
+  evaluation_count: number
+  popular_evals: Array<{ benchmark: string; model_count: number }>
 }
 export interface Stratified<T> {
 }
 export interface CompletenessCorpusBlock {
+  total_triples: number
+  completeness_avg: number | null
+  completeness_min: number | null
+  completeness_max: number | null
 }
 export interface ProvenanceCorpusBlock {
   total_triples: number
+  multi_source_triples: number
+  first_party_only_triples: number
   source_type_distribution: Record<ProvenanceSourceType, number>
 }
 export interface ComparabilityCorpusBlock {
+  total_triples: number
+  variant_divergent_count: number
+  cross_party_divergent_count: number
+  groups_with_variant_check: number
+  groups_with_cross_party_check: number
 }
 export interface HierarchyTags {

lib/benchmark-schema.ts CHANGED Viewed

@@ -124,6 +124,7 @@ export interface ScoreDetails {
 }
 export interface GenerationConfig {
   generation_args?: {
     temperature?: number
     top_p?: number

 }
 export interface GenerationConfig {
+  num_few_shot?: number
   generation_args?: {
     temperature?: number
     top_p?: number

lib/data-backend.ts CHANGED Viewed

@@ -1,49 +1,140 @@
 import "server-only"
-import {
-  getDashboardDataFromDuckDB,
-  getModelCardsFromDuckDB,
-  getModelCardsLiteFromDuckDB,
-  getEvalListDataFromDuckDB,
-  getEvalListLiteDataFromDuckDB,
-  getEvalListFromDuckDB,
-  getDeveloperListFromDuckDB,
-  getDeveloperSummaryByIdFromDuckDB,
-  getModelSummaryByIdFromDuckDB,
-  getEvalSummaryByIdFromDuckDB,
-} from "@/lib/duckdb-data"
 import { normalizeEvalSummary } from "@/lib/eval-processing"
-import {
-  fetchBackendManifest,
-  fetchBackendManifestStatus,
-  fetchEvalHierarchy,
-} from "@/lib/hf-data"
-export const getDashboardData = getDashboardDataFromDuckDB
-export const getModelCards = getModelCardsFromDuckDB
-export const getModelCardsLite = getModelCardsLiteFromDuckDB
-export const getEvalListData = getEvalListDataFromDuckDB
-export const getEvalListLiteData = getEvalListLiteDataFromDuckDB
-export const getEvalList = getEvalListFromDuckDB
-export const getDeveloperList = getDeveloperListFromDuckDB
-export const getDeveloperSummaryById = getDeveloperSummaryByIdFromDuckDB
-export const getModelSummaryById = getModelSummaryByIdFromDuckDB
-/**
- * Eval summary lookups go through `normalizeEvalSummary` so derivable but
- * sometimes-blank fields (currently `instance_data`) are reconciled from
- * `model_results` before they reach any consumer. The strict pass-through
- * contract of `duckdb-data.ts` stays intact — reconciliation of known
- * upstream gaps belongs in this thin adapter layer.
- */
 export async function getEvalSummaryById(evalId: string) {
-  const summary = await getEvalSummaryByIdFromDuckDB(evalId)
   return summary ? normalizeEvalSummary(summary) : summary
 }
-// Metadata-style artifacts are still read through the existing JSON/HF path.
-// They are not request-time processing hotspots and the DuckDB shadow doesn't
-// re-shape them, so calling lib/hf-data directly avoids needless indirection.
-export const getBackendManifestData = fetchBackendManifest
-export const getBackendManifestStatusData = fetchBackendManifestStatus
-export const getEvalHierarchyData = fetchEvalHierarchy

 import "server-only"
+import type { BackendManifestStatus } from "@/lib/backend-artifacts"
 import { normalizeEvalSummary } from "@/lib/eval-processing"
+const BACKEND_VERSION = process.env.DATA_BACKEND?.trim().toLowerCase() ?? "duckdb"
+function useViewLayerBackend() {
+  return BACKEND_VERSION === "v2" || BACKEND_VERSION === "stage-j"
+}
+async function legacyBackend() {
+  return import("@/lib/duckdb-data")
+}
+async function viewBackend() {
+  return import("@/lib/view-data")
+}
+async function sidecars() {
+  return import("@/lib/sidecars")
+}
+async function hfData() {
+  return import("@/lib/hf-data")
+}
+export async function getModelCards() {
+  if (useViewLayerBackend()) {
+    return (await viewBackend()).getModelCards()
+  }
+  return (await legacyBackend()).getModelCardsFromDuckDB()
+}
+export async function getModelCardsLite() {
+  if (useViewLayerBackend()) {
+    return (await viewBackend()).getModelCardsLite()
+  }
+  return (await legacyBackend()).getModelCardsLiteFromDuckDB()
+}
+export async function getEvalListData() {
+  if (useViewLayerBackend()) {
+    return (await viewBackend()).getEvalListData()
+  }
+  return (await legacyBackend()).getEvalListDataFromDuckDB()
+}
+export async function getEvalListLiteData() {
+  if (useViewLayerBackend()) {
+    return (await viewBackend()).getEvalListLiteData()
+  }
+  return (await legacyBackend()).getEvalListLiteDataFromDuckDB()
+}
+export async function getEvalList() {
+  if (useViewLayerBackend()) {
+    return (await viewBackend()).getEvalList()
+  }
+  return (await legacyBackend()).getEvalListFromDuckDB()
+}
+export async function getDashboardData() {
+  if (useViewLayerBackend()) {
+    return (await viewBackend()).getDashboardData()
+  }
+  return (await legacyBackend()).getDashboardDataFromDuckDB()
+}
+export async function getDeveloperList() {
+  if (useViewLayerBackend()) {
+    return (await viewBackend()).getDeveloperList()
+  }
+  return (await legacyBackend()).getDeveloperListFromDuckDB()
+}
+export async function getDeveloperSummaryById(routeId: string) {
+  if (useViewLayerBackend()) {
+    return (await viewBackend()).getDeveloperSummaryById(routeId)
+  }
+  return (await legacyBackend()).getDeveloperSummaryByIdFromDuckDB(routeId)
+}
+export async function getModelSummaryById(modelId: string) {
+  if (useViewLayerBackend()) {
+    return (await viewBackend()).getModelSummaryById(modelId)
+  }
+  return (await legacyBackend()).getModelSummaryByIdFromDuckDB(modelId)
+}
 export async function getEvalSummaryById(evalId: string) {
+  if (useViewLayerBackend()) {
+    return (await viewBackend()).getEvalSummaryById(evalId)
+  }
+  const summary = await (await legacyBackend()).getEvalSummaryByIdFromDuckDB(evalId)
   return summary ? normalizeEvalSummary(summary) : summary
 }
+export async function getBackendManifestData() {
+  if (useViewLayerBackend()) {
+    return (await sidecars()).fetchManifest()
+  }
+  return (await hfData()).fetchBackendManifest()
+}
+export async function getBackendManifestStatusData(): Promise<BackendManifestStatus> {
+  if (useViewLayerBackend()) {
+    const manifest = await (await sidecars()).fetchManifest()
+    return {
+      currentManifest: manifest,
+      latestManifest: manifest,
+      currentManifestSignature: manifest.generated_at,
+      latestManifestSignature: manifest.generated_at,
+      updateAvailable: false,
+      refreshing: false,
+      pendingRefreshCount: 0,
+    }
+  }
+  return (await hfData()).fetchBackendManifestStatus()
+}
+export async function getEvalHierarchyData() {
+  if (useViewLayerBackend()) {
+    return (await sidecars()).fetchHierarchy()
+  }
+  return (await hfData()).fetchEvalHierarchy()
+}

lib/duckdb.ts ADDED Viewed

	@@ -0,0 +1,46 @@

+import "server-only"
+import { DuckDBConnection } from "@duckdb/node-api"
+let connectionPromise: Promise<DuckDBConnection> | null = null
+function getSnapshotUrl() {
+  const snapshotUrl = process.env.SNAPSHOT_URL?.trim()
+  if (!snapshotUrl) {
+    throw new Error("DATA_BACKEND=v2 requires SNAPSHOT_URL to point at a Stage J snapshot directory")
+  }
+  return snapshotUrl.replace(/\/+$/, "")
+}
+function snapshotArtifact(name: string) {
+  return `${getSnapshotUrl()}/${name}`
+}
+function sqlString(value: string) {
+  return `'${value.replace(/'/g, "''")}'`
+}
+const VIEW_FILES = {
+  models_view: "models_view.parquet",
+  evals_view: "evals_view.parquet",
+  eval_results_view: "eval_results_view.parquet",
+} as const
+/**
+ * Returns the shared DuckDB connection, creating it on first use.
+ *
+ * The first call opens a connection and registers one view per entry in
+ * `VIEW_FILES`, each selecting from the matching parquet artifact under the
+ * snapshot URL. The in-flight promise is cached so concurrent callers share a
+ * single setup pass.
+ *
+ * If setup fails, the cached promise is cleared so the next call retries
+ * rather than replaying the same rejection for the rest of the process.
+ *
+ * @returns the initialised connection with all snapshot views registered.
+ * @throws if `SNAPSHOT_URL` is unset or a view cannot be created.
+ */
+export async function getConnection(): Promise<DuckDBConnection> {
+  if (connectionPromise) {
+    return connectionPromise
+  }
+  const pending = (async () => {
+    const connection = await DuckDBConnection.create()
+    for (const [viewName, fileName] of Object.entries(VIEW_FILES)) {
+      await connection.run(
+        `CREATE OR REPLACE VIEW ${viewName} AS SELECT * FROM read_parquet(${sqlString(snapshotArtifact(fileName))})`
+      )
+    }
+    return connection
+  })()
+  connectionPromise = pending
+  // Forget a failed attempt so a transient error is not pinned in the cache.
+  // The identity check avoids clobbering a newer attempt started meanwhile.
+  pending.catch(() => {
+    if (connectionPromise === pending) {
+      connectionPromise = null
+    }
+  })
+  return pending
+}

lib/hf-data.ts CHANGED Viewed

@@ -138,6 +138,15 @@ function getManifestSignature(manifest: BackendManifest | null | undefined) {
 // reading the same on-disk artifacts cannot diverge mid-test via background
 // refresh, and useful generally for offline development.
 const OFFLINE = process.env.HF_DATA_OFFLINE === "1"
 async function fetchRemoteJson<T>(relativePath: string): Promise<T> {
   if (OFFLINE) {
@@ -423,6 +432,19 @@ async function fetchHFJson<T>(relativePath: string): Promise<T> {
 }
 export async function fetchBackendManifestStatus(): Promise<BackendManifestStatus> {
   const snapshot = await getManifestSnapshot()
   const currentManifest = getCurrentManifestFromSnapshot(snapshot)
   const currentManifestSignature = getManifestSignature(currentManifest)
@@ -864,14 +886,26 @@ export async function fetchDevelopersList(): Promise<HFDeveloperEntry[]> {
 }
 export async function fetchBenchmarkMetadataMap(): Promise<Record<string, BenchmarkCard>> {
   return fetchHFJson<Record<string, BenchmarkCard>>("benchmark-metadata.json")
 }
 export async function fetchBackendManifest(): Promise<BackendManifest> {
   return fetchHFJson<BackendManifest>("manifest.json")
 }
 export async function fetchEvalHierarchy(): Promise<EvalHierarchy> {
   const raw = await fetchHFJson<EvalHierarchy>("eval-hierarchy.json")
   return adaptEvalHierarchy(raw)
 }
@@ -971,10 +1005,18 @@ function adaptEvalHierarchy(raw: EvalHierarchy): EvalHierarchy {
 }
 export async function fetchComparisonIndex(): Promise<ComparisonIndex> {
   return fetchHFJson<ComparisonIndex>("comparison-index.json")
 }
 export async function fetchCorpusAggregates(): Promise<CorpusAggregates | null> {
   return fetchHFJsonSafe<CorpusAggregates>("corpus-aggregates.json")
 }

 // reading the same on-disk artifacts cannot diverge mid-test via background
 // refresh, and useful generally for offline development.
 const OFFLINE = process.env.HF_DATA_OFFLINE === "1"
+// DATA_BACKEND trimmed and lower-cased; undefined when the variable is unset.
+const DATA_BACKEND_VERSION = process.env.DATA_BACKEND?.trim().toLowerCase()
+/** Whether DATA_BACKEND selects the snapshot view-layer backend ("v2" or "stage-j"). */
+function useViewLayerBackend() {
+  switch (DATA_BACKEND_VERSION) {
+    case "v2":
+    case "stage-j":
+      return true
+    default:
+      return false
+  }
+}
+/** Dynamically import the snapshot sidecar module and resolve to it. */
+async function fetchSnapshotSidecars() {
+  const sidecarModule = await import("@/lib/sidecars")
+  return sidecarModule
+}
 async function fetchRemoteJson<T>(relativePath: string): Promise<T> {
   if (OFFLINE) {
 }
 export async function fetchBackendManifestStatus(): Promise<BackendManifestStatus> {
+  if (useViewLayerBackend()) {
+    const manifest = await (await fetchSnapshotSidecars()).fetchManifest()
+    return {
+      currentManifest: manifest,
+      latestManifest: manifest,
+      currentManifestSignature: manifest.generated_at,
+      latestManifestSignature: manifest.generated_at,
+      updateAvailable: false,
+      refreshing: false,
+      pendingRefreshCount: 0,
+    }
+  }
   const snapshot = await getManifestSnapshot()
   const currentManifest = getCurrentManifestFromSnapshot(snapshot)
   const currentManifestSignature = getManifestSignature(currentManifest)
 }
+/**
+ * Fetch the benchmark metadata map. In view-layer mode it is produced by
+ * getBenchmarkMetadataMap() from lib/view-data (imported on demand);
+ * otherwise it is read from "benchmark-metadata.json" via fetchHFJson.
+ */
 export async function fetchBenchmarkMetadataMap(): Promise<Record<string, BenchmarkCard>> {
+  if (useViewLayerBackend()) {
+    return (await import("@/lib/view-data")).getBenchmarkMetadataMap()
+  }
   return fetchHFJson<Record<string, BenchmarkCard>>("benchmark-metadata.json")
 }
+/**
+ * Fetch the backend manifest: from the snapshot sidecar module in view-layer
+ * mode, otherwise from "manifest.json" via fetchHFJson.
+ */
 export async function fetchBackendManifest(): Promise<BackendManifest> {
+  if (useViewLayerBackend()) {
+    return (await fetchSnapshotSidecars()).fetchManifest()
+  }
   return fetchHFJson<BackendManifest>("manifest.json")
 }
+/**
+ * Fetch the evaluation hierarchy and pass it through adaptEvalHierarchy.
+ * The raw tree comes from the snapshot sidecar in view-layer mode and from
+ * "eval-hierarchy.json" otherwise; both paths apply the same adapter.
+ */
 export async function fetchEvalHierarchy(): Promise<EvalHierarchy> {
+  if (useViewLayerBackend()) {
+    return adaptEvalHierarchy(await (await fetchSnapshotSidecars()).fetchHierarchy())
+  }
   const raw = await fetchHFJson<EvalHierarchy>("eval-hierarchy.json")
   return adaptEvalHierarchy(raw)
 }
 }
+/**
+ * Fetch the comparison index: from the snapshot sidecar module in view-layer
+ * mode, otherwise from "comparison-index.json" via fetchHFJson.
+ */
 export async function fetchComparisonIndex(): Promise<ComparisonIndex> {
+  if (useViewLayerBackend()) {
+    return (await fetchSnapshotSidecars()).fetchComparisonIndex()
+  }
   return fetchHFJson<ComparisonIndex>("comparison-index.json")
 }
+/**
+ * Fetch corpus-level aggregates. In view-layer mode this returns whatever
+ * the sidecar module's fetchHeadline() resolves to; otherwise it reads
+ * "corpus-aggregates.json" via fetchHFJsonSafe.
+ *
+ * NOTE(review): the view-layer branch has no error handling of its own, so a
+ * failed sidecar load rejects. fetchHFJsonSafe presumably maps failures to
+ * null — confirm callers tolerate a rejection here.
+ */
 export async function fetchCorpusAggregates(): Promise<CorpusAggregates | null> {
+  if (useViewLayerBackend()) {
+    return (await fetchSnapshotSidecars()).fetchHeadline()
+  }
   return fetchHFJsonSafe<CorpusAggregates>("corpus-aggregates.json")
 }

lib/sidecars.ts ADDED Viewed

	@@ -0,0 +1,65 @@

+import "server-only"
+import type {
+  BackendManifest,
+  ComparisonIndex,
+  CorpusAggregates,
+  EvalHierarchy,
+} from "@/lib/backend-artifacts"
+// Per-process memo of sidecar loads, one optional slot per sidecar. Each slot
+// stores the promise itself, so concurrent callers share a single load.
+let cache: {
+  manifest?: Promise<BackendManifest>
+  headline?: Promise<CorpusAggregates>
+  hierarchy?: Promise<EvalHierarchy>
+  comparisonIndex?: Promise<ComparisonIndex>
+} = {}
+/**
+ * Resolve the snapshot base location from SNAPSHOT_URL, with trailing
+ * slashes removed.
+ *
+ * @throws Error when SNAPSHOT_URL is unset or contains only whitespace.
+ */
+function getSnapshotUrl() {
+  const raw = process.env.SNAPSHOT_URL
+  const base = raw === undefined ? "" : raw.trim()
+  if (base === "") {
+    throw new Error("DATA_BACKEND=v2 requires SNAPSHOT_URL to point at a Stage J snapshot directory")
+  }
+  return base.replace(/\/+$/, "")
+}
+/** Build the location of a named sidecar file inside the snapshot directory. */
+function sidecarUrl(name: string) {
+  const base = getSnapshotUrl()
+  return base + "/" + name
+}
+/**
+ * Load and parse one JSON sidecar from the snapshot directory.
+ *
+ * file:// locations are read from disk; anything else goes through fetch
+ * with a `revalidate: 3600` option. The parsed payload is cast to T without
+ * validation.
+ *
+ * @throws Error when the HTTP response is not ok.
+ */
+async function fetchJson<T>(name: string): Promise<T> {
+  const location = sidecarUrl(name)
+  const isLocalFile = location.startsWith("file://")
+  if (!isLocalFile) {
+    const response = await fetch(location, { next: { revalidate: 3600 } })
+    if (response.ok) {
+      return (await response.json()) as T
+    }
+    throw new Error(`Snapshot sidecar fetch failed: ${response.status} ${response.statusText} for ${location}`)
+  }
+  const { readFile } = await import("fs/promises")
+  const contents = await readFile(new URL(location), "utf8")
+  return JSON.parse(contents) as T
+}
+/**
+ * Arrange for a failed sidecar load to be dropped from the cache.
+ *
+ * Without this, a single transient failure would be memoized and replayed to
+ * every caller for the rest of the process lifetime. The slot is cleared only
+ * if it still holds this exact promise, so a reset or a newer load is kept.
+ *
+ * @returns the same promise, so the call can wrap a `??=` right-hand side.
+ */
+function evictOnFailure<T>(slot: keyof typeof cache, pending: Promise<T>): Promise<T> {
+  pending.catch(() => {
+    const current: Promise<unknown> | undefined = cache[slot]
+    if (current === pending) delete cache[slot]
+  })
+  return pending
+}
+/** Load (once per process) the snapshot's manifest.json sidecar. */
+export function fetchManifest(): Promise<BackendManifest> {
+  return (cache.manifest ??= evictOnFailure("manifest", fetchJson<BackendManifest>("manifest.json")))
+}
+/** Load (once per process) the snapshot's headline.json sidecar. */
+export function fetchHeadline(): Promise<CorpusAggregates> {
+  return (cache.headline ??= evictOnFailure("headline", fetchJson<CorpusAggregates>("headline.json")))
+}
+/** Load (once per process) the snapshot's hierarchy.json sidecar. */
+export function fetchHierarchy(): Promise<EvalHierarchy> {
+  return (cache.hierarchy ??= evictOnFailure("hierarchy", fetchJson<EvalHierarchy>("hierarchy.json")))
+}
+/** Load (once per process) the snapshot's comparison-index.json sidecar. */
+export function fetchComparisonIndex(): Promise<ComparisonIndex> {
+  return (cache.comparisonIndex ??= evictOnFailure(
+    "comparisonIndex",
+    fetchJson<ComparisonIndex>("comparison-index.json")
+  ))
+}
+/** Drop every memoized sidecar promise so the next fetch* call loads again. */
+export function resetSidecarCacheForTests() {
+  cache = {}
+}

lib/view-data.ts ADDED Viewed

	@@ -0,0 +1,576 @@

+import "server-only"
+import { getConnection } from "@/lib/duckdb"
+import { fetchHeadline } from "@/lib/sidecars"
+import {
+  EVALUATION_CATEGORIES,
+  type BenchmarkCard,
+  type BenchmarkEvaluation,
+  type CategoryType,
+  type EvaluationCardData,
+  type EvaluationResult,
+  type GenerationConfig,
+  type MetricConfig,
+  type ModelInfo,
+  type ModelEvaluationSummary,
+  type ModelVariantSummary,
+  type ScoreDetails,
+  type SourceData,
+  type SourceMetadata,
+} from "@/lib/benchmark-schema"
+import type { DeveloperListEntry } from "@/lib/backend-artifacts"
+import type {
+  BenchmarkEvalListItem,
+  BenchmarkEvalSummary,
+  ModelResultForBenchmark,
+} from "@/lib/eval-processing"
+// Loosely typed query row: column name to value, as produced by readRows().
+type Row = Record<string, any>
+// Column projection interpolated into the models_view SELECTs that return
+// EvaluationCardData rows (getModelCards, getModelCardsLite,
+// getDeveloperSummaryById).
+const MODEL_CARD_COLUMNS = `
+  id, model_key, route_id, model_name, model_id, canonical_model_name, developer,
+  evaluations_count, benchmarks_count, variant_count,
+  categories, category_stats, latest_timestamp,
+  evaluator_count, evaluator_names, source_type_count, source_types,
+  evidence_count, missing_generation_config_count,
+  third_party_eval_count, independent_verification_ratio,
+  reproducibility_status, eval_libraries, latest_source_name,
+  params_billions, benchmark_names, score_summary,
+  reproducibility_summary, provenance_summary, comparability_summary,
+  top_scores, source_urls, detail_urls,
+  model_url, release_date,
+  architecture, params, inference_engine, inference_platform
+`
+// Column projection interpolated into the evals_view SELECT in
+// getEvalListData, which returns BenchmarkEvalListItem rows.
+const EVAL_LIST_COLUMNS = `
+  evaluation_id, evaluation_name, canonical_display_name,
+  composite_benchmark_key, composite_benchmark_name,
+  benchmark_family_key, benchmark_leaf_key, category,
+  metric_config, models_count, evaluator_names, source_types,
+  latest_source_name, third_party_ratio,
+  missing_generation_config_count, best_model, worst_model,
+  avg_score, avg_score_norm, has_card, benchmark_card,
+  is_aggregated, aggregate_sources, tags,
+  metrics_count, metric_names, instance_data, top_score,
+  subtasks_count, is_summary_score, summary_eval_ids,
+  root_metrics, subtasks, leaderboard_metrics,
+  reproducibility_summary, provenance_summary, comparability_summary,
+  source_data
+`
+// Projection for queries joining eval_results_view (alias r) with evals_view
+// (alias e): every result column, plus selected eval columns re-exposed under
+// an eval_ prefix.
+const CELL_JOIN_COLUMNS = `
+  r.*,
+  e.evaluation_name AS eval_evaluation_name,
+  e.canonical_display_name AS eval_canonical_display_name,
+  e.composite_benchmark_key AS eval_composite_benchmark_key,
+  e.composite_benchmark_name AS eval_composite_benchmark_name,
+  e.benchmark_family_key AS eval_benchmark_family_key,
+  e.benchmark_leaf_key AS eval_benchmark_leaf_key,
+  e.category AS eval_category,
+  e.metric_config AS eval_metric_config,
+  e.source_data AS eval_source_data,
+  e.benchmark_card AS eval_benchmark_card,
+  e.tags AS eval_tags,
+  e.is_summary_score AS eval_is_summary_score,
+  e.summary_eval_ids AS eval_summary_eval_ids
+`
+/**
+ * Recursively convert a value read from DuckDB into plain JavaScript data.
+ *
+ * - bigint becomes a number (values beyond the safe-integer range lose precision)
+ * - Date becomes an ISO-8601 string
+ * - Map becomes a plain object with stringified keys
+ * - arrays and plain objects are converted element by element
+ * - objects whose constructor name starts with "DuckDB" are unwrapped by kind
+ *   (struct, list/array, map, decimal) or, failing that, stringified
+ *
+ * Anything else (null, undefined, string, number, boolean) is returned as is.
+ * The branch order matters: specific DuckDB wrappers are tried before the
+ * generic "DuckDB*" toString fallback and the plain-object walk.
+ */
+function normalizeDuckDBValue(value: unknown): unknown {
+  if (typeof value === "bigint") {
+    return Number(value)
+  }
+  if (value instanceof Date) {
+    return value.toISOString()
+  }
+  if (value instanceof Map) {
+    return Object.fromEntries(
+      Array.from(value.entries()).map(([key, mapValue]) => [String(key), normalizeDuckDBValue(mapValue)])
+    )
+  }
+  if (Array.isArray(value)) {
+    return value.map(normalizeDuckDBValue)
+  }
+  if (value && typeof value === "object") {
+    // Structural view of the wrapper; wrappers are identified by constructor
+    // name rather than instanceof.
+    const duckValue = value as {
+      constructor?: { name?: string }
+      entries?: unknown
+      items?: unknown
+      scale?: unknown
+      value?: unknown
+      toString?: () => string
+    }
+    const constructorName = duckValue.constructor?.name ?? ""
+    // Struct: `entries` is an object of field name to value.
+    if (constructorName === "DuckDBStructValue" && duckValue.entries && typeof duckValue.entries === "object") {
+      return normalizeDuckDBValue(duckValue.entries)
+    }
+    // List / fixed-size array: `items` is the element array.
+    if (
+      (constructorName === "DuckDBListValue" || constructorName === "DuckDBArrayValue") &&
+      Array.isArray(duckValue.items)
+    ) {
+      return duckValue.items.map(normalizeDuckDBValue)
+    }
+    // Map: `entries` is an array of { key, value } pairs.
+    if (constructorName === "DuckDBMapValue" && Array.isArray(duckValue.entries)) {
+      return Object.fromEntries(
+        duckValue.entries.map((entry) => {
+          const pair = entry as { key: unknown; value: unknown }
+          return [String(pair.key), normalizeDuckDBValue(pair.value)]
+        })
+      )
+    }
+    // Decimal: parsed from its string form into a number.
+    if (constructorName === "DuckDBDecimalValue" && typeof duckValue.toString === "function") {
+      return Number(duckValue.toString())
+    }
+    // Any other DuckDB wrapper is represented by its string form.
+    if (constructorName.startsWith("DuckDB") && typeof duckValue.toString === "function") {
+      return duckValue.toString()
+    }
+    // Plain object: normalize each own enumerable property.
+    return Object.fromEntries(
+      Object.entries(value).map(([key, objectValue]) => [key, normalizeDuckDBValue(objectValue)])
+    )
+  }
+  return value
+}
+/**
+ * Run a query on the shared DuckDB connection and return its rows as plain
+ * JavaScript objects, each passed through normalizeDuckDBValue.
+ *
+ * @param sql SQL text, optionally containing positional placeholders.
+ * @param params Values for the placeholders; not passed along when empty.
+ */
+async function readRows<T = Row>(sql: string, params: unknown[] = []): Promise<T[]> {
+  const connection = await getConnection()
+  const pendingReader = params.length > 0
+    ? connection.runAndReadAll(sql, params as any[])
+    : connection.runAndReadAll(sql)
+  const reader = await pendingReader
+  const normalized: T[] = []
+  for (const row of reader.getRowObjects()) {
+    normalized.push(normalizeDuckDBValue(row) as T)
+  }
+  return normalized
+}
+/**
+ * Coerce a value to a finite number, or return `fallback`.
+ *
+ * Finite numbers pass through, bigints are converted, and non-blank strings
+ * are parsed with Number(); everything else yields the fallback.
+ */
+function asNumber(value: unknown, fallback = 0) {
+  switch (typeof value) {
+    case "number":
+      return Number.isFinite(value) ? value : fallback
+    case "bigint":
+      return Number(value)
+    case "string": {
+      if (value.trim() === "") return fallback
+      const parsed = Number(value)
+      return Number.isFinite(parsed) ? parsed : fallback
+    }
+    default:
+      return fallback
+  }
+}
+/** Like asNumber, but yields undefined for null/undefined or non-numeric input. */
+function optionalNumber(value: unknown) {
+  if (value === null || value === undefined) return undefined
+  const candidate = asNumber(value, Number.NaN)
+  if (!Number.isFinite(candidate)) return undefined
+  return candidate
+}
+/** Return the value when it is a string, otherwise `fallback`. */
+function asString(value: unknown, fallback = "") {
+  if (typeof value === "string") return value
+  return fallback
+}
+/** Return the value when it is a non-empty string, otherwise undefined. */
+function optionalString(value: unknown) {
+  if (typeof value !== "string" || value === "") return undefined
+  return value
+}
+/** Return the value cast to T[] when it is an array, otherwise a new empty array. */
+function asArray<T>(value: unknown): T[] {
+  if (!Array.isArray(value)) return []
+  return value as T[]
+}
+/** Map a value onto a known evaluation category, defaulting to "General". */
+function normalizeCategory(value: unknown): CategoryType {
+  const candidate = value as CategoryType
+  if (EVALUATION_CATEGORIES.includes(candidate)) return candidate
+  return "General"
+}
+/** Create a record holding a fresh empty list for every evaluation category. */
+function emptyEvaluationsByCategory(): Record<CategoryType, BenchmarkEvaluation[]> {
+  const buckets = {} as Record<CategoryType, BenchmarkEvaluation[]>
+  for (const category of EVALUATION_CATEGORIES) {
+    buckets[category] = []
+  }
+  return buckets
+}
+/**
+ * Take the row's `source_metadata` object when present; otherwise build a
+ * placeholder from `latest_source_name`.
+ */
+function sourceMetadataFromRow(row: Row): SourceMetadata {
+  const provided = row.source_metadata
+  const hasProvided = Boolean(provided) && typeof provided === "object"
+  if (hasProvided) {
+    return provided as SourceMetadata
+  }
+  const organization = asString(row.latest_source_name, "Unknown")
+  return {
+    source_type: "documentation",
+    source_organization_name: organization,
+    evaluator_relationship: "other",
+  }
+}
+/**
+ * Take the row's `source_data` (or the joined `eval_source_data`) when
+ * present; otherwise synthesize a minimal record named after the evaluation.
+ */
+function sourceDataFromRow(row: Row): BenchmarkEvaluation["source_data"] {
+  const provided = row.source_data ?? row.eval_source_data
+  if (!provided) {
+    const datasetName = row.eval_evaluation_name ?? row.evaluation_name ?? row.benchmark_id
+    return {
+      dataset_name: asString(datasetName, "Unknown dataset"),
+    } satisfies SourceData
+  }
+  return provided as BenchmarkEvaluation["source_data"]
+}
+/**
+ * Build ScoreDetails from the row: the `score_details` object (if any) with
+ * `score` forced to a number taken from the details or from `row.score`.
+ */
+function scoreDetailsFromRow(row: Row): ScoreDetails {
+  const raw = row.score_details
+  const details: Partial<ScoreDetails> = raw && typeof raw === "object" ? raw : {}
+  const numericScore = asNumber(details.score ?? row.score)
+  return { ...details, score: numericScore } as ScoreDetails
+}
+/**
+ * Assemble a MetricConfig from the row's `metric_config` (or the joined
+ * `eval_metric_config`), filling gaps from flat row columns. Any score type
+ * other than "binary" or "discrete" is reported as "continuous".
+ */
+function metricConfigFromRow(row: Row): MetricConfig {
+  const config = (row.metric_config ?? row.eval_metric_config ?? {}) as Partial<MetricConfig>
+  let scoreType: "binary" | "discrete" | "continuous" = "continuous"
+  if (config.score_type === "binary" || config.score_type === "discrete") {
+    scoreType = config.score_type
+  }
+  const description =
+    config.evaluation_description ??
+    row.metric_description ??
+    row.metric_display_name ??
+    row.eval_evaluation_name ??
+    row.evaluation_name
+  const lowerIsBetter = row.lower_is_better ?? config.lower_is_better ?? false
+  return {
+    evaluation_description: asString(description, ""),
+    lower_is_better: Boolean(lowerIsBetter),
+    score_type: scoreType,
+    min_score: optionalNumber(config.min_score ?? row.min_score),
+    max_score: optionalNumber(config.max_score ?? row.max_score),
+    unit: optionalString(row.metric_unit ?? config.unit),
+  }
+}
+/**
+ * Build a ModelInfo from flat row columns. The display name and id each
+ * fall back through several candidate columns before a fixed placeholder.
+ */
+function modelInfoFromModelRow(row: Row): ModelInfo {
+  const displayName = row.model_name ?? row.model_family_name ?? row.model_id ?? row.model_key
+  const identifier = row.model_key ?? row.model_id ?? row.id ?? row.route_id
+  const modalities = {
+    input: asArray<string>(row.input_modalities),
+    output: asArray<string>(row.output_modalities),
+  }
+  return {
+    name: asString(displayName, "Unknown model"),
+    id: asString(identifier, "unknown-model"),
+    developer: optionalString(row.developer),
+    inference_platform: optionalString(row.inference_platform),
+    inference_engine: optionalString(row.inference_engine),
+    architecture: optionalString(row.architecture),
+    parameter_count: optionalString(row.params),
+    release_date: optionalString(row.release_date),
+    model_url: optionalString(row.model_url),
+    additional_details: { params_billions: row.params_billions },
+    modalities,
+  }
+}
+/** Convert one joined result row into an EvaluationResult. */
+function resultFromCell(row: Row): EvaluationResult {
+  const metricLabel = optionalString(row.metric_display_name)
+  const evalcards = row.evalcards_annotations
+    ? { annotations: row.evalcards_annotations }
+    : undefined
+  return {
+    evaluation_name: asString(row.metric_display_name ?? row.eval_evaluation_name ?? row.metric_id, "Score"),
+    display_name: metricLabel,
+    canonical_display_name: metricLabel,
+    metric_summary_id: optionalString(row.metric_summary_id),
+    metric_key: optionalString(row.metric_id),
+    evaluation_timestamp: asString(row.evaluation_timestamp, ""),
+    source_data: sourceDataFromRow(row),
+    metric_config: metricConfigFromRow(row),
+    score_details: scoreDetailsFromRow(row),
+    generation_config: row.generation_config as GenerationConfig | undefined,
+    detailed_evaluation_results_url: optionalString(row.instance_file_path),
+    evalcards,
+  }
+}
+/**
+ * Reshape one joined result row into the nested ModelResultForBenchmark
+ * shape used on benchmark pages.
+ */
+function reshapeCellToModelResult(row: Row): ModelResultForBenchmark {
+  type AggregateComponent = NonNullable<ModelResultForBenchmark["aggregate_components"]>[number]
+  const details = scoreDetailsFromRow(row)
+  const modelInfo = (row.model_info ?? modelInfoFromModelRow(row)) as ModelInfo
+  const components = asArray<AggregateComponent>(row.aggregate_components)
+  return {
+    model_info: modelInfo,
+    model_route_id: optionalString(row.model_route_id),
+    score: details.score,
+    score_details: details,
+    evaluation_timestamp: asString(row.evaluation_timestamp, ""),
+    source_metadata: sourceMetadataFromRow(row),
+    source_data: sourceDataFromRow(row),
+    source_record_url: optionalString(row.source_record_url),
+    aggregate_components: components,
+    result: resultFromCell(row),
+  }
+}
+/**
+ * Reshape one joined result row into a BenchmarkEvaluation that carries a
+ * single evaluation result.
+ */
+function reshapeCellToBenchmarkEvaluation(row: Row): BenchmarkEvaluation {
+  const evalName = optionalString(row.eval_evaluation_name)
+  const compositeName = optionalString(row.eval_composite_benchmark_name)
+  const timestamp = asString(row.evaluation_timestamp, "")
+  return {
+    schema_version: "1.0",
+    eval_summary_id: optionalString(row.evaluation_id),
+    evaluation_id: asString(row.evaluation_id ?? row.benchmark_id, "unknown-evaluation"),
+    retrieved_timestamp: timestamp,
+    benchmark: optionalString(row.eval_evaluation_name ?? row.benchmark_id),
+    display_name: evalName,
+    canonical_display_name: optionalString(row.eval_canonical_display_name),
+    category: normalizeCategory(row.eval_category ?? row.category),
+    benchmark_family_key: optionalString(row.eval_benchmark_family_key),
+    benchmark_family_name: compositeName,
+    benchmark_parent_key: optionalString(row.eval_composite_benchmark_key),
+    benchmark_parent_name: compositeName,
+    benchmark_leaf_key: optionalString(row.eval_benchmark_leaf_key),
+    benchmark_leaf_name: evalName,
+    is_summary_score: Boolean(row.eval_is_summary_score ?? row.is_summary_score),
+    source_data: sourceDataFromRow(row),
+    source_metadata: sourceMetadataFromRow(row),
+    eval_library: row.eval_library,
+    model_info: (row.model_info ?? modelInfoFromModelRow(row)) as ModelInfo,
+    generation_config: row.generation_config,
+    evaluation_results: [resultFromCell(row)],
+  }
+}
+/**
+ * Assemble a ModelEvaluationSummary from one models_view row and the
+ * model's joined result rows.
+ *
+ * @param modelRow Row describing the model (counts, summaries, variants).
+ * @param cellRows Joined result rows; each becomes one BenchmarkEvaluation
+ *   filed under its normalized category.
+ */
+function modelSummaryFromRows(modelRow: Row, cellRows: Row[]): ModelEvaluationSummary {
+  const evaluationsByCategory = emptyEvaluationsByCategory()
+  for (const cellRow of cellRows) {
+    const evaluation = reshapeCellToBenchmarkEvaluation(cellRow)
+    const category = normalizeCategory(evaluation.category)
+    evaluationsByCategory[category].push(evaluation)
+  }
+  // Keep only categories the schema knows about.
+  const categoriesCovered = asArray<CategoryType>(modelRow.categories).filter((category) =>
+    EVALUATION_CATEGORIES.includes(category)
+  )
+  const modelInfo = (modelRow.model_info ?? modelInfoFromModelRow(modelRow)) as ModelInfo
+  const totalEvaluations = asNumber(modelRow.total_evaluations ?? modelRow.evaluations_count)
+  const lastUpdated = asString(modelRow.last_updated ?? modelRow.latest_timestamp, "")
+  const rawModelIds = asArray<string>(modelRow.raw_model_ids)
+  // Fields shared by the family-level summary and by every variant.
+  const core = {
+    model_info: modelInfo,
+    evaluations_by_category: evaluationsByCategory,
+    total_evaluations: totalEvaluations,
+    last_updated: lastUpdated,
+    // When the row lists no known categories, derive them from the cells.
+    categories_covered: categoriesCovered.length > 0
+      ? categoriesCovered
+      : EVALUATION_CATEGORIES.filter((category) => evaluationsByCategory[category].length > 0),
+    reproducibility_summary: modelRow.reproducibility_summary,
+    provenance_summary: modelRow.provenance_summary,
+    comparability_summary: modelRow.comparability_summary,
+  }
+  // Spread order: core first, then the variant's own fields, then the
+  // explicitly normalized fields below, which win over both.
+  const variants = asArray<Row>(modelRow.variants).map((variant, index) => ({
+    ...core,
+    ...variant,
+    variant_id: asString(variant.variant_id ?? variant.variant_key, `variant-${index}`),
+    variant_key: asString(variant.variant_key, `variant-${index}`),
+    variant_label: asString(variant.variant_label ?? variant.variant_display_name, "Default"),
+    variant_display_name: asString(variant.variant_display_name ?? variant.variant_label ?? modelRow.model_name, modelRow.model_name),
+    raw_model_ids: asArray<string>(variant.raw_model_ids),
+    family_id: asString(variant.family_id ?? modelRow.model_family_id, modelRow.model_family_id),
+    family_name: asString(variant.family_name ?? modelRow.model_family_name, modelRow.model_family_name),
+    total_evaluations: asNumber(variant.total_evaluations ?? totalEvaluations),
+    last_updated: asString(variant.last_updated ?? lastUpdated, lastUpdated),
+    categories_covered: asArray<CategoryType>(variant.categories_covered).length > 0
+      ? asArray<CategoryType>(variant.categories_covered)
+      : core.categories_covered,
+    model_info: {
+      ...modelInfo,
+      name: asString(variant.variant_display_name ?? variant.variant_label ?? modelInfo.name, modelInfo.name),
+    },
+  })) as ModelVariantSummary[]
+  return {
+    ...core,
+    model_family_id: asString(modelRow.model_family_id ?? modelRow.model_key ?? modelRow.model_id, modelRow.model_key ?? modelRow.model_id),
+    model_route_id: asString(modelRow.model_route_id ?? modelRow.route_id, modelRow.route_id),
+    model_family_name: asString(modelRow.model_family_name ?? modelRow.model_name, modelRow.model_name),
+    // Fall back to the model's own key/id when the row lists no raw ids.
+    raw_model_ids: rawModelIds.length > 0 ? rawModelIds : [asString(modelRow.model_key ?? modelRow.model_id, "")].filter(Boolean),
+    variants,
+  }
+}
+/**
+ * Fetch every scored result row for one model, joined with its evaluation,
+ * ordered by category and then by percentile (highest first, NULLs last).
+ */
+async function getModelEvaluationRows(modelKey: string): Promise<Row[]> {
+  // Filter on model_key, the producer's addressable identifier: it is set for
+  // resolved and unresolved models alike (unresolved ones fall back to the
+  // raw source name), whereas model_id alone would miss unresolved models.
+  const sql =
+    `SELECT ${CELL_JOIN_COLUMNS}
+     FROM eval_results_view r
+     LEFT JOIN evals_view e ON r.evaluation_id = e.evaluation_id
+     WHERE r.model_key = ?
+       AND r.score IS NOT NULL
+     ORDER BY r.category, r.percentile DESC NULLS LAST`
+  const rows = await readRows<Row>(sql, [modelKey])
+  return rows
+}
+/** List every model card, most recently updated first (NULL timestamps last). */
+export async function getModelCards(): Promise<EvaluationCardData[]> {
+  const sql =
+    `SELECT ${MODEL_CARD_COLUMNS}
+     FROM models_view
+     ORDER BY latest_timestamp DESC NULLS LAST`
+  const cards = await readRows<EvaluationCardData>(sql)
+  return cards
+}
+/**
+ * List every model card ordered by benchmark count, then evaluation count
+ * (both descending, NULLs last), then model name.
+ */
+export async function getModelCardsLite(): Promise<EvaluationCardData[]> {
+  const sql =
+    `SELECT ${MODEL_CARD_COLUMNS}
+     FROM models_view
+     ORDER BY benchmarks_count DESC NULLS LAST, evaluations_count DESC NULLS LAST, model_name ASC`
+  const cards = await readRows<EvaluationCardData>(sql)
+  return cards
+}
+/**
+ * Load the evaluation list (sorted by name) together with the total number
+ * of models. Both queries are issued before either is awaited.
+ */
+export async function getEvalListData(): Promise<{
+  evals: BenchmarkEvalListItem[]
+  totalModels: number
+}> {
+  const evalsQuery = readRows<BenchmarkEvalListItem>(
+    `SELECT ${EVAL_LIST_COLUMNS}
+       FROM evals_view
+       ORDER BY evaluation_name ASC`
+  )
+  const countQuery = readRows<{ n: number }>("SELECT COUNT(*) AS n FROM models_view")
+  const [evals, countRows] = await Promise.all([evalsQuery, countQuery])
+  const totalModels = asNumber(countRows[0]?.n)
+  return { evals, totalModels }
+}
+/** Lite variant of the evaluation list; currently identical to getEvalListData. */
+export async function getEvalListLiteData(): Promise<{
+  evals: BenchmarkEvalListItem[]
+  totalModels: number
+}> {
+  const listData = await getEvalListData()
+  return listData
+}
+/** Return only the evaluation rows from getEvalListData. */
+export async function getEvalList() {
+  const listData = await getEvalListData()
+  return listData.evals
+}
+/** Load the model cards and the evaluation list together for the dashboard. */
+export async function getDashboardData() {
+  const modelsQuery = getModelCards()
+  const evalsQuery = getEvalList()
+  const [models, evals] = await Promise.all([modelsQuery, evalsQuery])
+  return { models, evals }
+}
+/**
+ * Look up one model by any of its identifiers and build its full summary.
+ *
+ * @param routeId Identifier from the URL; matched against model_key,
+ *   route_id, model_route_id, model_family_id and, last, model_id.
+ * @returns the summary, or null when no models_view row matches.
+ */
+export async function getModelSummaryById(routeId: string): Promise<ModelEvaluationSummary | null> {
+  // Lookups use the addressable identifier (`model_key`/`route_id`/
+  // `model_route_id`/`model_family_id`) so unresolved models — whose
+  // `model_id` is NULL — are still findable. `model_id` is kept in the
+  // OR chain as a back-compat fallback for old links.
+  //
+  // The ORDER BY enforces that priority: with a bare LIMIT 1, a row matching
+  // only by model_family_id or model_id could be returned ahead of the row
+  // whose model_key matches, and the pick could vary between runs.
+  const rows = await readRows<Row>(
+    `SELECT *
+     FROM models_view
+     WHERE model_key = ? OR route_id = ? OR model_route_id = ? OR model_family_id = ? OR model_id = ?
+     ORDER BY
+       CASE
+         WHEN model_key = ? THEN 0
+         WHEN route_id = ? THEN 1
+         WHEN model_route_id = ? THEN 2
+         WHEN model_family_id = ? THEN 3
+         ELSE 4
+       END,
+       model_key
+     LIMIT 1`,
+    // Five placeholders in WHERE plus four in the CASE, all the same value.
+    Array.from({ length: 9 }, () => routeId)
+  )
+  const modelRow = rows[0]
+  if (!modelRow) return null
+  const cellRows = await getModelEvaluationRows(asString(modelRow.model_key ?? modelRow.model_id, routeId))
+  return modelSummaryFromRows(modelRow, cellRows)
+}
+/**
+ * Load one evaluation with its per-model results.
+ *
+ * Results are first restricted to the evaluation's primary metric; when that
+ * yields nothing, every scored result for the evaluation is used instead.
+ *
+ * @returns the summary, or null when the evaluation id is unknown.
+ */
+export async function getEvalSummaryById(evalId: string): Promise<BenchmarkEvalSummary | null> {
+  const matches = await readRows<Row>(
+    "SELECT * FROM evals_view WHERE evaluation_id = ? LIMIT 1",
+    [evalId]
+  )
+  const evalRow = matches[0]
+  if (!evalRow) return null
+  const primaryMetricCells = await readRows<Row>(
+    `SELECT ${CELL_JOIN_COLUMNS}
+     FROM eval_results_view r
+     LEFT JOIN evals_view e ON r.evaluation_id = e.evaluation_id
+     WHERE r.evaluation_id = ?
+       AND r.metric_id = (SELECT primary_metric_id FROM evals_view WHERE evaluation_id = ?)
+       AND r.score IS NOT NULL
+     ORDER BY r.position ASC NULLS LAST`,
+    [evalId, evalId]
+  )
+  const cells = primaryMetricCells.length > 0
+    ? primaryMetricCells
+    : await readRows<Row>(
+      `SELECT ${CELL_JOIN_COLUMNS}
+       FROM eval_results_view r
+       LEFT JOIN evals_view e ON r.evaluation_id = e.evaluation_id
+       WHERE r.evaluation_id = ?
+         AND r.score IS NOT NULL
+       ORDER BY r.position ASC NULLS LAST`,
+      [evalId]
+    )
+  const modelResults = cells.map((cell) => reshapeCellToModelResult(cell))
+  return { ...evalRow, model_results: modelResults } as BenchmarkEvalSummary
+}
+/** Return the developers from the headline sidecar, sorted by name, as a new array. */
+export async function getDeveloperList(): Promise<DeveloperListEntry[]> {
+  const { developers } = await fetchHeadline()
+  const sorted = Array.from(developers ?? [])
+  sorted.sort((left, right) => left.developer.localeCompare(right.developer))
+  return sorted
+}
+/**
+ * Find a developer by route id and attach that developer's model cards.
+ *
+ * @returns the developer entry plus `models`, or null when no entry matches.
+ */
+export async function getDeveloperSummaryById(routeId: string) {
+  const allDevelopers = await getDeveloperList()
+  const developer = allDevelopers.find(({ route_id }) => route_id === routeId)
+  if (!developer) return null
+  const sql =
+    `SELECT ${MODEL_CARD_COLUMNS}
+     FROM models_view
+     WHERE developer = ?
+     ORDER BY benchmarks_count DESC NULLS LAST, evaluations_count DESC NULLS LAST, model_name ASC`
+  const models = await readRows<EvaluationCardData>(sql, [developer.developer])
+  return { ...developer, models }
+}
+/**
+ * Build a lookup of benchmark cards keyed by evaluation id, evaluation name,
+ * composite benchmark key and the card's own benchmark name. When several
+ * rows share a key, the last row read wins.
+ */
+export async function getBenchmarkMetadataMap(): Promise<Record<string, BenchmarkCard>> {
+  const rows = await readRows<Row>(
+    `SELECT evaluation_id, evaluation_name, composite_benchmark_key, benchmark_card
+     FROM evals_view
+     WHERE benchmark_card IS NOT NULL`
+  )
+  const cardsByKey: Record<string, BenchmarkCard> = {}
+  rows.forEach((row) => {
+    const card = row.benchmark_card as BenchmarkCard | null | undefined
+    if (!card) return
+    const candidates: unknown[] = [
+      row.evaluation_id,
+      row.evaluation_name,
+      row.composite_benchmark_key,
+      card.benchmark_details?.name,
+    ]
+    for (const candidate of candidates) {
+      // Skip anything that is not a non-empty string.
+      if (typeof candidate === "string" && candidate.length > 0) {
+        cardsByKey[candidate] = card
+      }
+    }
+  })
+  return cardsByKey
+}

notes/backend-v2-migration.md ADDED Viewed

	@@ -0,0 +1,616 @@

+# Frontend migration to backend v2 (Stage J view layer)
+> **Status:** spec, drafted 2026-05-03 against `eval_card_backend`'s
+> Stage J view-layer contract.
+>
+> **Sources:**
+> - Backend spec (the contract this consumes):
+>   `../eval_card_backend/notes/08-frontend-view-layer.md`
+> - Canonical schema (audit/debug only; not in hot path):
+>   `../eval_card_backend/notes/01-schema-from-frontend.md`
+---
+## Context
+The legacy producer (`eval_cards_backend_pipeline`) emitted ten
+parquets where each row carried a `payload_json` VARCHAR with the
+post-TS-adapter shape baked in. The frontend's "DuckDB backend"
+(`lib/duckdb-data.ts`) read these blobs and `JSON.parse`d them — column
+projection, filter pushdown, and type contracts were all forfeited.
+The new producer (`eval_card_backend`) emits a typed view layer over
+its canonical normalised tables. Three Parquet files cover every page
+shape, three small JSON sidecars cover corpus-level scalars and the
+hierarchy tree. Column names match the frontend's TS interfaces
+field-for-field, so the row→object cast is a typed spread for most
+accessors. Two interfaces (`ModelResultForBenchmark` and the
+`evaluations_by_category` body of `ModelEvaluationSummary`) require a
+small mechanical reshape over the row, since one nests fields that the
+view stores flat — see the per-accessor sections below. No
+HF-record-to-display adapter logic survives.
+This document specifies what changes in `general-eval-card` once
+backend v2 is faithfully implemented. **The visual frontend, page
+renderers, and TS interface shapes do not change.** Only the I/O
+boundary moves.
+---
+## What changes (overview)
+| layer | before (v1) | after (v2) |
+|---|---|---|
+| Distribution | `LOCAL_PIPELINE_OUTPUT` env var pointing at a producer output dir; `duckdb/v1/` subpath; implicit "warehouse/latest/" coupling | `SNAPSHOT_URL` env var (file:// or HF dataset URL); one snapshot pinned per deploy |
+| Storage shape | 10 parquets each with one `payload_json` column | 3 typed-column view parquets + 3 JSON sidecars |
+| Read pattern | `SELECT payload_json FROM read_parquet(?) WHERE id = ?`, then `JSON.parse` | `SELECT col1, col2, ... FROM <view> WHERE id = ?`, typed row spread |
+| List vs detail | Separate `*_lite.parquet` files | Column projection on the same parquet |
+| Suite/aggregate dispatch | Eval id prefix (`aggregate__`, `matrix__`) → different parquet | `is_summary_score` flag + `parent_benchmark_id` on `evals_view` |
+| Slug rule | Custom `replace('/', '__')` escapes; per-page slug helpers | Producer-owned RFC 3986 percent-encoded `route_id` / `evaluation_id` / `metric_summary_id`; frontend decodes only on `<Link>` href |
+| Corpus aggregates | `corpus-aggregates.json` over HF JSON loader | `headline.json` sidecar in the snapshot dir |
+| Hierarchy | Synthesised in the producer's `eval_hierarchy` JSON | `hierarchy.json` sidecar |
+| Backend manifest | `manifest.json` fetched from upstream HF dataset root via `lib/hf-data.ts` | `manifest.json` sidecar inside the snapshot dir, read via `SNAPSHOT_URL` |
+The TS interfaces (`EvaluationCardData`, `BenchmarkEvalSummary`,
+`ModelEvaluationSummary`, `ModelResultForBenchmark`, `CorpusAggregates`,
+`EvalHierarchy`, `BackendManifest`) stay as-is — the producer agreed to
+emit columns under those exact names.
+---
+## What does not change
+- All page components under `app/`. The renderer trees are unchanged.
+- TS interface declarations in `lib/benchmark-schema.ts`,
+  `lib/eval-processing.ts`, `lib/backend-artifacts.ts`. These are now
+  the contract surface — column names match field names by agreement
+  with the producer.
+- Component files under `components/`.
+- `lib/glossary.ts`, `lib/known-issues.ts`, `lib/utils.ts`,
+  `lib/na-utils.ts` — these are pure presentation helpers.
+- `app/api/*/route.ts` handlers stay as thin pass-throughs to
+  `lib/data-backend.ts`.
+---
+## Distribution: `SNAPSHOT_URL`
+The frontend reads `SNAPSHOT_URL` from the environment at process start.
+One deploy = one snapshot. The URL points at a directory containing the six
+artifacts the frontend reads:
+```
+$SNAPSHOT_URL/
+├── models_view.parquet
+├── evals_view.parquet
+├── eval_results_view.parquet
+├── headline.json
+├── hierarchy.json
+└── manifest.json
+```
+Examples:
+- Local dev: `SNAPSHOT_URL=file:///path/to/eval_card_backend/warehouse/2026-05-03T15-48-59Z`
+- Production (pinned snapshot): `SNAPSHOT_URL=https://huggingface.co/datasets/evaleval/eval-cards-data/resolve/<rev>/warehouse/<snapshot_id>`
+- Production (rolling): `SNAPSHOT_URL=https://huggingface.co/datasets/evaleval/eval-cards-data/resolve/main/warehouse/latest`
+`LOCAL_PIPELINE_OUTPUT` is removed. The `duckdb/v1/` subpath is
+removed. The producer maintains a `warehouse/latest/` alias that
+points at the most recent snapshot, so deploys can pin either to a
+timestamped snapshot (immutable, redeploy required to roll forward)
+or to `latest` (auto-rolls forward on the next Space rebuild). Within
+a running process the snapshot is still effectively constant — sidecar
+caches in `lib/sidecars.ts` are first-write-wins per process.
+---
+## DuckDB connection lifecycle
+`lib/duckdb.ts` (new file; replaces the connection-management portion
+of `lib/duckdb-data.ts`):
+```ts
+import "server-only"
+import { DuckDBConnection } from "@duckdb/node-api"
+let connectionPromise: Promise<DuckDBConnection> | null = null
+const SNAPSHOT_URL = process.env.SNAPSHOT_URL
+if (!SNAPSHOT_URL) {
+  throw new Error("SNAPSHOT_URL must be set; see notes/backend-v2-migration.md")
+}
+const VIEWS = {
+  models_view:       `${SNAPSHOT_URL}/models_view.parquet`,
+  evals_view:        `${SNAPSHOT_URL}/evals_view.parquet`,
+  eval_results_view: `${SNAPSHOT_URL}/eval_results_view.parquet`,
+} as const
+export async function getConnection(): Promise<DuckDBConnection> {
+  if (!connectionPromise) {
+    connectionPromise = (async () => {
+      const conn = await DuckDBConnection.create()
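+      // httpfs is a core extension that DuckDB autoloads on the first
+      // http(s):// read. NOTE(review): confirm the runtime image can reach
+      // the extension repository, or pre-install httpfs at build time.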
+      // Register each parquet as a view so callers write `FROM models_view`,
+      // not the full URL.
+      for (const [name, path] of Object.entries(VIEWS)) {
+        await conn.run(
+          `CREATE OR REPLACE VIEW ${name} AS SELECT * FROM read_parquet(?)`,
+          [path]
+        )
+      }
+      return conn
+    })()
+  }
+  return connectionPromise
+}
+```
+One connection per Node process. Views are registered once, lazily on
+the first `getConnection()` call; subsequent queries write
+`FROM models_view` rather than re-passing the parquet URL. DuckDB's
+column projection means the cost
+of `SELECT route_id, model_name FROM models_view` is independent of
+how wide `models_view` is.
+---
+## Per-accessor mapping
+`lib/data-backend.ts` keeps its current export names. `lib/duckdb-data.ts`
+gets gutted; each function becomes a thin typed `SELECT`. The mapping
+below uses the column names spec'd in
+`../eval_card_backend/notes/08-frontend-view-layer.md` — the row
+returned by DuckDB casts directly to the TS interface.
+### Models
+```ts
+// getModelCards / getModelCardsLite — list pages
+export async function getModelCards(): Promise<EvaluationCardData[]> {
+  const conn = await getConnection()
+  const reader = await conn.runAndReadAll(`
+    SELECT id, route_id, model_name, model_id, canonical_model_name, developer,
+           evaluations_count, benchmarks_count, variant_count,
+           categories, category_stats, latest_timestamp,
+           evaluator_count, evaluator_names, source_type_count, source_types,
+           evidence_count, missing_generation_config_count,
+           third_party_eval_count, independent_verification_ratio,
+           reproducibility_status, eval_libraries, latest_source_name,
+           params_billions, benchmark_names, score_summary,
+           reproducibility_summary, provenance_summary, comparability_summary,
+           top_scores, source_urls, detail_urls,
+           model_url, release_date, input_modalities, output_modalities,
+           architecture, params, inference_engine, inference_platform
+    FROM models_view
+    ORDER BY latest_timestamp DESC
+  `)
+  return reader.getRowObjects() as EvaluationCardData[]
+}
+// "Lite" is just narrower projection — same parquet, fewer columns.
+export async function getModelCardsLite(): Promise<EvaluationCardData[]> {
+  const conn = await getConnection()
+  const reader = await conn.runAndReadAll(`
+    SELECT id, route_id, model_name, model_id, developer,
+           evaluations_count, benchmarks_count, categories,
+           latest_timestamp, third_party_eval_count,
+           independent_verification_ratio, reproducibility_status,
+           latest_source_name, params_billions
+    FROM models_view
+    ORDER BY benchmarks_count DESC, evaluations_count DESC, model_name ASC
+  `)
+  return reader.getRowObjects() as EvaluationCardData[]
+}
+// getModelSummaryById — detail page.
+//
+// The row carries the metadata shell (variants[], categories,
+// category_stats, signal summaries, model_family_id, raw_model_ids,
+// total_evaluations, last_updated). The full `ModelEvaluationSummary`
+// also requires `evaluations_by_category: Record<CategoryType,
+// BenchmarkEvaluation[]>`, which is a heavyweight per-cell breakdown —
+// produced by a separate join over `eval_results_view`, see
+// `getModelEvaluationCells` below.
+//
+// Returning a `ModelSummaryShell` (Omit-ed type, defined alongside the
+// existing TS interface) makes the contract explicit and stops the cast
+// from lying. The model-detail page composes the full
+// `ModelEvaluationSummary` from `shell` + `cells`.
+export type ModelSummaryShell = Omit<
+  ModelEvaluationSummary,
+  "evaluations_by_category"
+>
+export async function getModelSummaryById(routeId: string): Promise<ModelSummaryShell | null> {
+  const conn = await getConnection()
+  const reader = await conn.runAndReadAll(
+    `SELECT * FROM models_view WHERE route_id = ? OR model_family_id = ? LIMIT 1`,
+    [routeId, routeId]
+  )
+  const rows = reader.getRowObjects()
+  if (rows.length === 0) return null
+  return rows[0] as unknown as ModelSummaryShell
+}
+// Per-cell reshape helper. `eval_results_view` rows carry the per-cell
+// fields scattered (model_info, score_details, evaluation_timestamp,
+// source_metadata, source_data, metric_*, etc.) rather than under a
+// nested `result: EvaluationResult` STRUCT. Reshape into the
+// `ModelResultForBenchmark` shape the leaderboard / model-detail
+// renderers expect. Single helper; reused by getEvalSummaryById and
+// getModelEvaluationCells. No HF-record-to-display logic survives.
+function reshapeCellToModelResult(row: Record<string, any>): ModelResultForBenchmark {
+  return {
+    model_info:           row.model_info,
+    model_route_id:       row.model_route_id,
+    score:                row.score,
+    score_details:        row.score_details,
+    evaluation_timestamp: row.evaluation_timestamp,
+    source_metadata:      row.source_metadata,
+    source_data:          row.source_data,
+    source_record_url:    row.source_record_url,
+    aggregate_components: row.aggregate_components,
+    result: {
+      evaluation_name:      row.metric_display_name,
+      metric_summary_id:    row.metric_summary_id,
+      metric_key:           row.metric_id,
+      evaluation_timestamp: row.evaluation_timestamp,
+      metric_config:        { lower_is_better: row.lower_is_better, unit: row.metric_unit, /* …denormalised meta… */ },
+      score_details:        row.score_details,
+      evalcards:            row.evalcards_annotations ? { annotations: row.evalcards_annotations } : undefined,
+    },
+  }
+}
+// Helper for the model-detail page's evaluations_by_category body.
+// The page groups by `category` in TS after this returns.
+export async function getModelEvaluationCells(modelId: string): Promise<ModelResultForBenchmark[]> {
+  const conn = await getConnection()
+  const reader = await conn.runAndReadAll(
+    `SELECT * FROM eval_results_view WHERE model_id = ? ORDER BY category, percentile DESC`,
+    [modelId]
+  )
+  return reader.getRowObjects().map(reshapeCellToModelResult)
+}
+```
+### Evals
+```ts
+// getEvalListData / getEvalListLiteData — list pages
+export async function getEvalListData(): Promise<{
+  evals: BenchmarkEvalListItem[]
+  totalModels: number
+}> {
+  const conn = await getConnection()
+  const [evalsReader, modelsReader] = await Promise.all([
+    conn.runAndReadAll(`
+      SELECT evaluation_id, evaluation_name, canonical_display_name,
+             composite_benchmark_key, composite_benchmark_name,
+             benchmark_family_key, benchmark_leaf_key, category,
+             metric_config, models_count, evaluator_names, source_types,
+             latest_source_name, third_party_ratio,
+             missing_generation_config_count, best_model, worst_model,
+             avg_score, avg_score_norm, has_card,
+             is_aggregated, aggregate_sources, tags,
+             metrics_count, metric_names, instance_data, top_score,
+             subtasks_count, is_summary_score, summary_eval_ids,
+             root_metrics, subtasks, leaderboard_metrics,
+             reproducibility_summary, provenance_summary, comparability_summary,
+             source_data
+      FROM evals_view
+      ORDER BY evaluation_name ASC
+    `),
+    conn.runAndReadAll(`SELECT COUNT(*) AS n FROM models_view`),
+  ])
+  return {
+    evals: evalsReader.getRowObjects() as BenchmarkEvalListItem[],
+    totalModels: Number(modelsReader.getRowObjects()[0].n),
+  }
+}
+// getEvalSummaryById — detail page.
+//
+// No more aggregate__/matrix__ id-prefix dispatch — `evals_view` is the
+// single source for all eval shapes. Suite-vs-leaf is a column
+// (`is_summary_score`, `is_aggregated`) on the same parquet.
+//
+// `model_results[]` rows go through the same reshape helper as
+// `getModelEvaluationCells` (defined above, under Models) — they share the
+// ModelResultForBenchmark target shape, so the eval/metric/cell
+// → BenchmarkEvaluation reshape is one helper, two callers.
+export async function getEvalSummaryById(evalId: string): Promise<BenchmarkEvalSummary | null> {
+  const conn = await getConnection()
+  const [evalReader, cellsReader] = await Promise.all([
+    conn.runAndReadAll(
+      `SELECT * FROM evals_view WHERE evaluation_id = ? LIMIT 1`,
+      [evalId]
+    ),
+    conn.runAndReadAll(
+      `SELECT * FROM eval_results_view
+       WHERE evaluation_id = ?
+         AND metric_id     = (SELECT primary_metric_id FROM evals_view WHERE evaluation_id = ?)
+       ORDER BY position ASC`,
+      [evalId, evalId]
+    ),
+  ])
+  const evalRows = evalReader.getRowObjects()
+  if (evalRows.length === 0) return null
+  return {
+    ...(evalRows[0] as Omit<BenchmarkEvalSummary, "model_results">),
+    model_results: cellsReader.getRowObjects().map(reshapeCellToModelResult),
+  } as BenchmarkEvalSummary
+}
+```
+### Developers
+```ts
+// getDeveloperList — list page; reads from headline.json (precomputed,
+// including producer-owned route_id, model/benchmark/evaluation counts,
+// and popular_evals). DeveloperListEntry is satisfied directly by the
+// headline entry shape.
+export async function getDeveloperList(): Promise<DeveloperListEntry[]> {
+  const headline = await fetchHeadline()
+  return headline.developers as DeveloperListEntry[]
+}
+// getDeveloperSummaryById — detail page; reads models_view filtered by developer.
+// The route_id on headline.developers[] is the canonical lookup key — we don't
+// re-derive `developer` from the URL slug, since percent-decoding may not
+// round-trip exactly to the producer's source string.
+export async function getDeveloperSummaryById(routeId: string) {
+  const headline = await fetchHeadline()
+  const headlineEntry = headline.developers.find((d) => d.route_id === routeId)
+  if (!headlineEntry) return null
+  const conn = await getConnection()
+  const reader = await conn.runAndReadAll(
+    `SELECT * FROM models_view WHERE developer = ?`,
+    [headlineEntry.developer]
+  )
+  const models = reader.getRowObjects() as EvaluationCardData[]
+  return { ...headlineEntry, models }
+}
+```
+### Dashboard convenience accessor
+```ts
+// Was: { models, evals } over both legacy parquets; same shape, new sources.
+export async function getDashboardData() {
+  const [models, evalListData] = await Promise.all([
+    getModelCards(),
+    getEvalListData(),
+  ])
+  return { models, evals: evalListData.evals }
+}
+```
+---
+## Sidecar fetchers (replace `lib/hf-data.ts` corpus calls)
+Three small JSON files live in the snapshot dir alongside the
+parquets. New module `lib/sidecars.ts` exposes typed fetchers.
+`lib/hf-data.ts`'s `fetchCorpusAggregates`, `fetchEvalHierarchy`,
+`fetchBackendManifest`, and `fetchBackendManifestStatus` get their
+implementations replaced — same export names, new sources.
+```ts
+// lib/sidecars.ts
+import "server-only"
+import type {
+  CorpusAggregates,
+  EvalHierarchy,
+  BackendManifest,
+} from "@/lib/backend-artifacts"
+const SNAPSHOT_URL = process.env.SNAPSHOT_URL!
+let cache: {
+  manifest?: Promise<BackendManifest>
+  headline?: Promise<CorpusAggregates>
+  hierarchy?: Promise<EvalHierarchy>
+} = {}
+async function fetchJson<T>(name: string): Promise<T> {
+  const url = `${SNAPSHOT_URL}/${name}`
+  const res = url.startsWith("file://")
+    ? await import("fs/promises").then((fs) => fs.readFile(new URL(url), "utf8"))
+    : await fetch(url, { next: { revalidate: 3600 } }).then((r) => r.text())
+  return JSON.parse(typeof res === "string" ? res : res.toString()) as T
+}
+export function fetchManifest(): Promise<BackendManifest> {
+  return (cache.manifest ??= fetchJson<BackendManifest>("manifest.json"))
+}
+export function fetchHeadline(): Promise<CorpusAggregates> {
+  return (cache.headline ??= fetchJson<CorpusAggregates>("headline.json"))
+}
+export function fetchHierarchy(): Promise<EvalHierarchy> {
+  return (cache.hierarchy ??= fetchJson<EvalHierarchy>("hierarchy.json"))
+}
+```
+Then in `lib/hf-data.ts`:
+```ts
+// fetchBackendManifest: was a fetchHFJsonSafe call; now reads the snapshot sidecar.
+export const fetchBackendManifest = fetchManifest
+export const fetchCorpusAggregates = fetchHeadline
+export const fetchEvalHierarchy = fetchHierarchy
+// fetchBackendManifestStatus: simplified — single snapshot pin, no "latest" comparison.
+export async function fetchBackendManifestStatus(): Promise<BackendManifestStatus> {
+  const m = await fetchManifest()
+  return {
+    currentManifest: m,
+    latestManifest: m,                          // no separate "latest" — snapshot is pinned
+    currentManifestSignature: m.generated_at,
+    latestManifestSignature: m.generated_at,
+    updateAvailable: false,
+    refreshing: false,
+    pendingRefreshCount: 0,
+  }
+}
+```
+---
+## What deletes
+After v2 is live, the following code is dead and can be removed in a
+follow-up cleanup:
+- `lib/duckdb-data.ts` — replaced by typed SELECTs split between
+  `lib/duckdb.ts` (connection) and `lib/data-backend.ts` (queries).
+- The `payload_json` parser helpers (`parsePayload`, `readPayloads`,
+  `readPayloadById`, `assertDeveloperListShape`) — no JSON blobs to
+  parse.
+- The `aggregate__` / `matrix__` eval-id prefix dispatch in
+  `getEvalSummaryByIdFromDuckDB` — the typed view is the only path.
+- `lib/model-data.ts` — most of its functions exist to convert HF
+  JSON records into `BenchmarkEvaluation` / `EvaluationCardData`. Once
+  the producer emits those shapes directly, the adapter logic deletes.
+  Keep only the helpers that don't touch HF records (slug parsing,
+  display formatters).
+- `lib/eval-processing.ts` — the `groupEvaluationsByModel`,
+  `createModelSummary`, `createBenchmarkEvalSummary`, and
+  `inferCategoryFromBenchmark` adapter functions are no longer called
+  in the data path. The exported types stay.
+- `scripts/audit-adapters.mjs`, `scripts/dump-adapter-outputs.mts`,
+  `scripts/compare-data-backends.mjs`, `scripts/refresh-fixtures.mjs`,
+  `scripts/cache-hf-data.mjs` — adapter / parity-check tooling for the
+  legacy pipeline. Delete once v1 is retired.
+- `data/models/`, `data/developers/`, `data/benchmarks.json`,
+  `data/models.json`, `data/developers.json` — bundled snapshots of
+  v1 output for fixture tests. Replace with v2 fixtures if needed.
+- `LOCAL_PIPELINE_OUTPUT` env var, `duckdb/v1/` subpath conventions,
+  and the parity-emitter expectations documented in
+  `lib/duckdb-data.ts`'s preamble.
+- `inferCategoryFromBenchmark` regex chain in
+  `lib/benchmark-schema.ts` — producer is the source of truth for
+  category. Keep the `EVALUATION_CATEGORIES` const + `CategoryType`
+  type; delete the inference function and `BENCHMARK_PRIORITY_RULES`.
+---
+## Slug rule
+Producer emits all URL-bearing identifiers in
+RFC 3986 percent-encoded form (`route_id`, `evaluation_id`,
+`metric_summary_id`). Frontend treats them as opaque except for
+`<Link>` href construction:
+```tsx
+// Old: href={`/models/${model.route_id}`}  // route_id was already escaped via __ rule
+// New: href={`/models/${model.route_id}`}  // same code; route_id is now percent-encoded
+```
+No decoding happens in the page component when looking up by slug; the encoded path segment is passed straight through:
+```ts
+// app/models/[id]/page.tsx
+export default async function ModelDetailPage({ params }: { params: { id: string } }) {
+  const summary = await getModelSummaryById(params.id)  // pass encoded form straight through
+  ...
+}
+```
+`getModelSummaryById` looks up by `route_id = ?` (falling back to
+`model_family_id = ?`) directly without decoding — the producer's
+`route_id` column matches the URL path segment byte-for-byte. The
+legacy `replace('/', '__')` and
+`replace(/\//g, ...)` helpers in `lib/utils.ts` and `lib/model-family.ts`
+become dead code; remove them in the cleanup pass.
+---
+## Migration strategy
+A feature flag gates v1 vs v2 during the transition:
+```ts
+// lib/data-backend.ts
+const BACKEND_VERSION = process.env.DATA_BACKEND ?? "v1"
+export const getModelCards =
+  BACKEND_VERSION === "v2"
+    ? (await import("@/lib/duckdb")).getModelCards
+    : (await import("@/lib/duckdb-data")).getModelCardsFromDuckDB
+// ... same pattern for other accessors
+```
+Phase plan:
+1. **Producer ships Stage J.** `eval_card_backend` emits the six
+   v2 artifacts in `warehouse/<snapshot_id>/`. Existing canonical
+   parquets stay alongside.
+2. **Frontend lands `lib/duckdb.ts` + `lib/sidecars.ts`** behind the
+   `DATA_BACKEND=v2` flag. CI builds both backends; default stays v1.
+3. **Smoke test in dev with `DATA_BACKEND=v2`,
+   `SNAPSHOT_URL=file://...`.** Verify each page renders identical
+   bytes (modulo source-of-data labels). Where they diverge, file
+   producer issues — do not patch the frontend to paper over.
+4. **Flip the production default to v2.** Keep v1 path compilable but
+   unreachable. Monitor for a release.
+5. **Delete v1 path** (the "What deletes" list above).
+The flag is intentionally process-wide, not per-accessor. Mixing
+backends within one render produces inconsistent snapshots.
+---
+## What doesn't move
+- **Instance-level data fetching** (`fetchInstanceLevelData` in
+  `lib/hf-data.ts`). Instance JSONL is referenced by URL in
+  `eval_results_view.instance_file_path`; the lazy-load stays. Pointer
+  shape on the row is unchanged from v1.
+- **Benchmark card metadata** lives inside `evals_view.benchmark_card`
+  STRUCT now, not a separate `benchmark_card_*.json` per file. The
+  page reads it from the eval row directly. Adapter-style readers
+  (`fetchBenchmarkMetadataMap`) become a `SELECT benchmark_id, benchmark_card
+  FROM evals_view` aggregation if anything still calls them — most
+  callers should fold into `getEvalSummaryById`.
+- **EvalCards annotations** (`evalcards.annotations`) live on
+  `eval_results_view.evalcards_annotations` per-row. The eval-detail
+  page reads them inline; no separate fetcher.
+---
+## Open questions / risks
+- **httpfs cold-start latency.** First query against an HF-hosted
+  parquet pays a round trip per file. Mitigate by calling
+  `getConnection()` eagerly at process start — the snippet above
+  registers the views lazily on first use — so the first user query
+  hits warm metadata. Measure on the production HF Space; if too slow,
+  consider downloading the snapshot to local disk at container start
+  (~MB per snapshot).
+- **Connection lifetime in serverless.** Vercel's serverless
+  runtime tears down the Node process per request; the
+  `connectionPromise` cache doesn't help. The HF Space deployment
+  (Docker, long-lived) is unaffected. If we ever target serverless,
+  switch to `duckdb-wasm` in the browser or a separate serving
+  process.
+- **`aggregate_components[]` on `eval_results_view`.** This array is
+  the per-suite-component breakdown for rollup rows. For non-rollup
+  rows it's always empty. If suite rollups grow common, the storage
+  cost of trailing-empty arrays is non-trivial; consider splitting
+  into a dedicated parquet at that point.
+- **Category drift.** Producer's `category_mapping.json` will lag real
+  benchmark tag changes. The mapping is producer-owned, so the
+  frontend can't patch around drift — this is a feature, not a bug,
+  but it requires operator discipline. Surface "uncategorised
+  benchmark count" in the producer's run summary and the home-page
+  manifest banner.
+- **Type widening for `score_summary` etc.** The producer emits these
+  as DuckDB STRUCTs; the TS interface declares them as nested
+  `{ count, min, max, average }`. `runAndReadAll` returns nested
+  STRUCTs as plain JS objects, so the cast works — but if duckdb-node
+  changes its STRUCT serialisation, audit the `as` casts here. Add a
+  dev-only validator that runs `EvaluationCardData`'s shape check at
+  the row level on the first `getModelCards()` call after process
+  start.

notes/merge-cheatsheet-backend-v2.md ADDED Viewed

	@@ -0,0 +1,193 @@

+# Merge cheatsheet: pulling `main` into `feat/use-new-backend-data`
+> Drafted 2026-05-04, before pulling. Companion to `backend-v2-migration.md`
+> (which is the design doc). This file is just a per-file conflict guide.
+>
+> Branch: `feat/use-new-backend-data` (2 commits ahead of `main`:
+> `7635aee` Integrate with test backend data, `bfce8f2` Drop
+> input/output_modalities from MODEL_CARD_COLUMNS).
+## Triage at a glance
+| File | Risk | Strategy |
+|---|---|---|
+| `lib/data-backend.ts` | **High** | Keep ours wholesale; re-port any new accessors main added |
+| `lib/backend-artifacts.ts` | **High** | Keep our schema renames; reconcile any *new* main-side fields against producer output |
+| `components/signals/corpus-dashboard.tsx` | **Med** | Keep main's UI structure; rewire data fields to v2 names |
+| `components/signals/corpus-signals-strip.tsx` | **Med** | Same as above |
+| `lib/hf-data.ts` | **Med** | Keep `useViewLayerBackend()` short-circuits at top of 5 fetchers |
+| `Dockerfile` | **Med** | Keep our `DATA_BACKEND=v2` + `SNAPSHOT_URL` wiring; layer main's other changes on top |
+| `lib/benchmark-schema.ts` | **Low** | Trivial 1-line addition (`num_few_shot?`) |
+| `app/page.tsx` | **Low** | One-line copy change (`corpus-aggregates.json` → `headline.json`) |
+New files (no conflict possible): `lib/view-data.ts`, `lib/duckdb.ts`,
+`lib/sidecars.ts`, `tests/view-data.test.ts`,
+`notes/backend-v2-migration.md`.
+---
+## `lib/data-backend.ts` — High
+**What we did:** Replaced static re-exports from `lib/duckdb-data` with
+a `BACKEND_VERSION` env-flag dispatcher. Each accessor now branches on
+`useViewLayerBackend()` (true when `DATA_BACKEND=v2` or `stage-j`) and
+lazy-imports either `@/lib/view-data` or `@/lib/duckdb-data`.
+Manifest/hierarchy accessors branch between `@/lib/sidecars` and
+`@/lib/hf-data`.
+**Reconcile:**
+- Conflict almost certain if main touched any export wiring here.
+- **Keep our file as-is.** The dispatcher pattern is load-bearing.
+- If main added a new accessor (e.g. `getFooBar`), add a new dispatcher
+  function following the same pattern — only the legacy branch needs
+  to be wired immediately; v2 branch can throw `Not implemented` until
+  `lib/view-data.ts` adds it.
+---
+## `lib/backend-artifacts.ts` — High
+**What we did:** Renamed corpus-block fields to match what the v2
+producer emits:
+| Block | v1 (main) | v2 (ours) |
+|---|---|---|
+| Completeness | `total_benchmarks`, `completeness_score_mean`, `completeness_score_median`, `per_field_population{}` | `total_triples`, `completeness_avg`, `completeness_min`, `completeness_max` |
+| Provenance | `multi_source_groups`, `multi_source_rate`, `first_party_only_groups`, `first_party_only_rate`, `total_groups` | `multi_source_triples`, `first_party_only_triples`, `total_triples` (rates dropped — derived in components via local `rate()` helper) |
+| Comparability | `variant_eligible_groups`, `variant_divergent_groups`, `variant_divergence_rate`, `cross_party_eligible_groups`, `cross_party_divergent_groups`, `cross_party_divergence_rate`, `total_groups` | `total_triples`, `variant_divergent_count`, `cross_party_divergent_count`, `groups_with_variant_check`, `groups_with_cross_party_check` |
+Also added: `DeveloperListEntry` interface, optional
+`developers/families/categories` arrays on `CorpusAggregates`,
+optional `eval_hierarchy` key in `BackendManifest.summary_artifacts`.
+**Reconcile:**
+- Producer is the source of truth for v2 field names — do **not** add
+  back v1 names to satisfy a main-side change. If main added a field
+  the v2 producer doesn't emit, either drop it or check
+  `eval_card_backend/notes/08-frontend-view-layer.md` first.
+- Keep all three new optional sections on `CorpusAggregates`
+  (developers, families, categories) — they back the new
+  developer-list path.
+- The `summary_artifacts.eval_hierarchy` key is additive; safe to keep
+  alongside whatever main added there.
+---
+## `components/signals/corpus-dashboard.tsx` — Medium
+**What we did:** Mechanical rewrite of every field reference in this
+file to use the v2 names from `lib/backend-artifacts.ts` (above).
+Removed the `per_field_population` per-field grid and replaced it with
+a `min / avg / max` MiniMetric trio. Added a local `rate(num, denom)`
+helper (returns null if either side is null/zero) since v2 stores
+counts, not pre-computed rates. Title-cased `CATEGORY_ORDER`
+(`"Agentic"`, `"General"`, …) and made the keys-to-render set extend
+gracefully to unknown categories.
+**Reconcile:**
+- If main touched this file for design/UX reasons, **prefer main's
+  visual structure** — but keep our field accessors. The recipe is:
+  - Anywhere main reads `multi_source_rate`, replace with `rate(prov.multi_source_triples, prov.total_triples)`.
+  - Anywhere main reads `completeness_score_mean`, replace with `comp.completeness_avg`.
+  - Anywhere main reads `*_eligible_groups` / `*_divergent_groups`, swap to `groups_with_*_check` / `*_divergent_count`.
+  - Drop any new code that reads `per_field_population` — gone in v2.
+- Keep the local `rate()` helper at the bottom of the file.
+- Category lookup must use the new title-cased keys (or stay tolerant
+  via the `available` set logic we added).
+---
+## `components/signals/corpus-signals-strip.tsx` — Medium
+**What we did:** Same field renames as above, same local `rate()`
+helper added. Headline copy updated from "groups" → "triples" where
+the underlying unit changed.
+**Reconcile:** Apply the same recipe as `corpus-dashboard.tsx`. The
+two files share field names and the `rate()` helper.
+---
+## `lib/hf-data.ts` — Medium
+**What we did:** Added an early-return guard at the top of five
+functions:
+- `fetchBackendManifestStatus` — synthesizes a status from the v2 manifest sidecar
+- `fetchBenchmarkMetadataMap` — delegates to `view-data.getBenchmarkMetadataMap`
+- `fetchBackendManifest` — delegates to `sidecars.fetchManifest`
+- `fetchEvalHierarchy` — delegates to `sidecars.fetchHierarchy` (still wraps in `adaptEvalHierarchy`)
+- `fetchCorpusAggregates` — delegates to `sidecars.fetchHeadline`
+Plus a module-level `useViewLayerBackend()` helper and a lazy
+`fetchSnapshotSidecars()` importer near the top of the file.
+**Reconcile:**
+- These are all additive guards at the start of existing functions —
+  conflicts are likely only if main re-shaped the same function
+  bodies.
+- Pattern: `if (useViewLayerBackend()) { return <v2 path> }` then fall
+  through to the existing v1 implementation untouched.
+- If main renamed one of these functions, port the guard into the
+  renamed version. Don't drop the guard.
+---
+## `Dockerfile` — Medium
+**What we did:**
+- Default `ARG DATA_BACKEND` flipped from `duckdb` → `v2` in **both**
+  stages (builder and runner).
+- Added `ARG SNAPSHOT_URL` + `ENV SNAPSHOT_URL` in both stages,
+  defaulting to a pinned `evaleval/eval-cards-data` warehouse path.
+- Comment block rewritten to reflect v2 + legacy coexistence.
+- Kept legacy `LOCAL_PIPELINE_OUTPUT`, `HF_DATA_LOCAL_DIR`,
+  `HF_DATA_OFFLINE=1` envs intact (legacy backend still compilable).
+**Uncommitted tweak (working tree):** `SNAPSHOT_URL` default points at
+`j-chim/temp_evalcard_backend` instead of `evaleval/eval-cards-data` —
+this is the dev/test dataset for the temp HF Space deploy. Do **not**
+commit this override; revert before merging to main, or keep it only
+on local working copy.
+**Reconcile:**
+- Keep our `DATA_BACKEND=v2` default and `SNAPSHOT_URL` plumbing.
+- Layer main's non-data changes (base image bumps, `pnpm` version,
+  build commands) on top.
+---
+## `lib/benchmark-schema.ts` — Low
+**What we did:** Added one optional field, `num_few_shot?: number`, on
+`GenerationConfig`. That's it.
+**Reconcile:** Trivially additive. Keep our line; merge tool should
+handle it cleanly unless main touched the same struct.
+---
+## `app/page.tsx` — Low
+**What we did:** One-line copy change in the empty-state banner —
+`corpus-aggregates.json` → `headline.json` (the v2 sidecar name).
+**Reconcile:** Trivial. Keep ours.
+---
+## Order of operations after `git pull`
+1. Resolve `lib/backend-artifacts.ts` first — it's the schema source
+   of truth that the components depend on.
+2. Resolve `lib/data-backend.ts` and `lib/hf-data.ts` — backend wiring.
+3. Resolve the two `components/signals/*` files using the rename recipe.
+4. Resolve `Dockerfile` — keep our v2 envs.
+5. `app/page.tsx` and `lib/benchmark-schema.ts` — should auto-merge or
+   be trivial.
+6. Run `pnpm tsc --noEmit` (or whatever the project's typecheck is) to
+   catch any v1 field references main introduced that didn't conflict
+   textually but break against our renamed types.
+7. Run `pnpm test` — `tests/view-data.test.ts` and
+   `tests/duckdb-data.test.ts` should both still pass.
+8. Smoke test with `DATA_BACKEND=v2 SNAPSHOT_URL=file://…` and again
+   without (legacy path) — both must render.

scripts/cache-hf-data.mjs CHANGED Viewed

@@ -18,6 +18,13 @@ import { promisify } from "util"
 const root = path.resolve(new URL(import.meta.url).pathname, "..", "..")
 const cacheDir = path.join(root, ".cache", "hf-data")
 const publicDir = path.join(root, "public")
 const HF_DATASET_REPO = process.env.HF_DATASET_REPO?.trim()
   || "https://huggingface.co/datasets/evaleval/card_backend"
 const HF_RESOLVE_BASE = `${HF_DATASET_REPO}/resolve/main`

 const root = path.resolve(new URL(import.meta.url).pathname, "..", "..")
 const cacheDir = path.join(root, ".cache", "hf-data")
 const publicDir = path.join(root, "public")
+const dataBackend = process.env.DATA_BACKEND?.trim().toLowerCase()
+if (dataBackend === "v2" || dataBackend === "stage-j") {
+  await fs.mkdir(cacheDir, { recursive: true })
+  console.log("[cache-hf-data] DATA_BACKEND=v2: skipping legacy HF cache; runtime reads SNAPSHOT_URL")
+  process.exit(0)
+}
 const HF_DATASET_REPO = process.env.HF_DATASET_REPO?.trim()
   || "https://huggingface.co/datasets/evaleval/card_backend"
 const HF_RESOLVE_BASE = `${HF_DATASET_REPO}/resolve/main`

tests/duckdb-data.test.ts CHANGED Viewed

@@ -12,27 +12,13 @@ function sqlString(value: string) {
 }
 async function writeParquetPayload(outputDir: string, fileName: string, payloads: unknown[]) {
-  const parquetDir = path.join(outputDir, "experimental", "parquet")
   await mkdir(parquetDir, { recursive: true })
   const selects = payloads
-    .map((payload, index) => {
-      const record = payload as Record<string, unknown>
       const payloadJson = JSON.stringify(payload)
-      return [
-        `SELECT 'model_card_lite' AS record_type`,
-        `${sqlString(String(record.model_route_id ?? index))} AS model_route_id`,
-        `${sqlString(String(record.model_family_id ?? ""))} AS model_family_id`,
-        `${sqlString(String(record.developer ?? ""))} AS developer`,
-        `NULL AS eval_summary_id`,
-        `NULL AS developer_route_id`,
-        `NULL AS category`,
-        `NULL AS benchmark_family_key`,
-        `${Number(record.benchmark_family_count ?? 0)} AS models_count`,
-        `${Number(record.total_evaluations ?? 0)} AS total_evaluations`,
-        `${sqlString(String(record.last_updated ?? ""))} AS last_updated`,
-        `${sqlString(payloadJson)} AS payload_json`,
-      ].join(", ")
     })
     .join(" UNION ALL ")
@@ -49,22 +35,36 @@ describe("DuckDB local data backend", () => {
       process.env.LOCAL_PIPELINE_OUTPUT = outputDir
       await writeParquetPayload(outputDir, "model_cards_lite.parquet", [
         {
-          model_family_id: "openai/gpt-5",
-          model_route_id: "openai__gpt-5",
-          model_family_name: "GPT 5",
-          developer: "openai",
           params_billions: 100,
-          total_evaluations: 3,
-          benchmark_count: 2,
-          benchmark_family_count: 2,
-          categories_covered: ["reasoning"],
-          last_updated: "2026-01-01T00:00:00Z",
-          variants: [],
           score_summary: { count: 1, min: 0.7, max: 0.9, average: 0.8 },
           benchmark_names: ["mmlu"],
-          top_benchmark_scores: [
             { benchmark: "mmlu", score: 0.9, metric: "accuracy" },
           ],
         },
       ])
@@ -93,7 +93,7 @@ describe("DuckDB local data backend", () => {
     try {
       process.env.LOCAL_PIPELINE_OUTPUT = outputDir
       await expect(getModelCardsLiteFromDuckDB()).rejects.toThrow(
-        /EXPORT_EXPERIMENTAL_PARQUET=1/
       )
     } finally {
       if (previousOutput == null) {

 }
 async function writeParquetPayload(outputDir: string, fileName: string, payloads: unknown[]) {
+  const parquetDir = path.join(outputDir, "duckdb", "v1")
   await mkdir(parquetDir, { recursive: true })
   const selects = payloads
+    .map((payload) => {
       const payloadJson = JSON.stringify(payload)
+      return `SELECT ${sqlString(payloadJson)} AS payload_json`
     })
     .join(" UNION ALL ")
       process.env.LOCAL_PIPELINE_OUTPUT = outputDir
       await writeParquetPayload(outputDir, "model_cards_lite.parquet", [
         {
+          id: "openai/gpt-5",
+          route_id: "openai__gpt-5",
+          model_name: "GPT 5",
+          model_id: "openai/gpt-5",
+          canonical_model_name: "GPT 5",
+          developer: "OpenAI",
+          evaluations_count: 3,
+          benchmarks_count: 2,
+          variant_count: 1,
+          categories: ["Reasoning"],
+          category_stats: { General: 0, Reasoning: 2, Agentic: 0, Safety: 0, Knowledge: 0 },
+          latest_timestamp: "2026-01-01T00:00:00Z",
+          evaluator_count: 1,
+          evaluator_names: ["OpenAI"],
+          source_type_count: 1,
+          source_types: ["documentation"],
+          evidence_count: 3,
+          missing_generation_config_count: 0,
+          third_party_eval_count: 0,
+          independent_verification_ratio: 0,
+          reproducibility_status: "complete",
+          eval_libraries: [],
           params_billions: 100,
           score_summary: { count: 1, min: 0.7, max: 0.9, average: 0.8 },
           benchmark_names: ["mmlu"],
+          top_scores: [
             { benchmark: "mmlu", score: 0.9, metric: "accuracy" },
           ],
+          source_urls: [],
+          detail_urls: [],
         },
       ])
     try {
       process.env.LOCAL_PIPELINE_OUTPUT = outputDir
       await expect(getModelCardsLiteFromDuckDB()).rejects.toThrow(
+        /duckdb\/v1\/model_cards_lite\.parquet/
       )
     } finally {
       if (previousOutput == null) {

tests/view-data.test.ts ADDED Viewed

	@@ -0,0 +1,466 @@

+import { mkdir, mkdtemp, rm, writeFile } from "fs/promises"
+import os from "os"
+import path from "path"
+import { DuckDBConnection } from "@duckdb/node-api"
+import { describe, expect, it } from "vitest"
+/**
+ * Renders `value` as a single-quoted SQL string literal.
+ * Every embedded single quote is doubled so the literal cannot be terminated early.
+ */
+function sqlString(value: string) {
+  const escaped = value.split("'").join("''")
+  return "'" + escaped + "'"
+}
+/**
+ * Runs `sql` on `connection` inside a DuckDB `COPY (...) TO ... (FORMAT parquet)`
+ * statement, writing the query result to `outputPath`.
+ * The output path is quoted with `sqlString`; `sql` is interpolated verbatim.
+ */
+async function copyParquet(connection: DuckDBConnection, sql: string, outputPath: string) {
+  const quotedTarget = sqlString(outputPath)
+  const statement = `COPY (${sql}) TO ${quotedTarget} (FORMAT parquet)`
+  await connection.run(statement)
+}
+/**
+ * Writes a minimal synthetic snapshot into `snapshotDir` (created if missing).
+ *
+ * The snapshot consists of three single-row Parquet files (`models_view.parquet`,
+ * `evals_view.parquet`, `eval_results_view.parquet`) plus three JSON files
+ * (`manifest.json`, `headline.json`, `hierarchy.json`).
+ *
+ * The DuckDB connection used to produce the Parquet files is closed before the
+ * function returns, including when one of the COPY statements throws.
+ */
+async function writeSyntheticStageJSnapshot(snapshotDir: string) {
+  await mkdir(snapshotDir, { recursive: true })
+  const connection = await DuckDBConnection.create()
+  try {
+    await writeSyntheticViewParquets(connection, snapshotDir)
+  } finally {
+    // Release the connection explicitly instead of leaving it open after the fixture is written.
+    connection.closeSync()
+  }
+  await writeSyntheticSummaryArtifacts(snapshotDir)
+}
+/**
+ * Writes the three Parquet view files, in order: models, evals, eval results.
+ * Each file holds one row describing model `openai/gpt-5` on evaluation `mmlu`.
+ */
+async function writeSyntheticViewParquets(connection: DuckDBConnection, snapshotDir: string) {
+  await copyParquet(
+    connection,
+    `
+      SELECT
+        TIMESTAMP '2026-05-03 00:00:00' AS snapshot_id,
+        'openai/gpt-5' AS model_key,
+        'openai/gpt-5' AS model_id,
+        'openai/gpt-5' AS id,
+        'openai%2Fgpt-5' AS route_id,
+        'openai%2Fgpt-5' AS model_route_id,
+        'openai/gpt-5' AS model_family_id,
+        'GPT 5' AS model_name,
+        'GPT 5' AS canonical_model_name,
+        'GPT 5' AS model_family_name,
+        'OpenAI' AS developer,
+        DATE '2026-01-01' AS release_date,
+        'https://example.test/model' AS model_url,
+        'transformer' AS architecture,
+        '100B' AS params,
+        100.0 AS params_billions,
+        ['text']::VARCHAR[] AS input_modalities,
+        ['text']::VARCHAR[] AS output_modalities,
+        'engine' AS inference_engine,
+        'platform' AS inference_platform,
+        1::BIGINT AS evaluations_count,
+        1::BIGINT AS benchmarks_count,
+        1::INTEGER AS variant_count,
+        1::BIGINT AS evaluator_count,
+        ['OpenAI']::VARCHAR[] AS evaluator_names,
+        1::INTEGER AS source_type_count,
+        ['documentation']::VARCHAR[] AS source_types,
+        0::BIGINT AS third_party_eval_count,
+        0.0 AS independent_verification_ratio,
+        1::BIGINT AS evidence_count,
+        0::INTEGER AS missing_generation_config_count,
+        TIMESTAMP '2026-05-03 00:00:00' AS latest_timestamp,
+        'OpenAI' AS latest_source_name,
+        ['MMLU']::VARCHAR[] AS benchmark_names,
+        ['Reasoning']::VARCHAR[] AS categories,
+        struct_pack("General" := 0, "Reasoning" := 1, "Agentic" := 0, "Safety" := 0, "Knowledge" := 0) AS category_stats,
+        'complete' AS reproducibility_status,
+        struct_pack(results_total := 1, has_reproducibility_gap_count := 0, populated_ratio_avg := 1.0) AS reproducibility_summary,
+        struct_pack(
+          total_results := 1,
+          total_groups := 1,
+          multi_source_groups := 0,
+          first_party_only_groups := 1,
+          source_type_distribution := struct_pack(first_party := 1, third_party := 0, collaborative := 0, unspecified := 0)
+        ) AS provenance_summary,
+        struct_pack(
+          total_groups := 1,
+          groups_with_variant_check := 0,
+          groups_with_cross_party_check := 0,
+          variant_divergent_count := 0,
+          cross_party_divergent_count := 0
+        ) AS comparability_summary,
+        [struct_pack(name := 'openai-evals', version := '1.0', fork := NULL::VARCHAR)] AS eval_libraries,
+        struct_pack(count := 1, min := 0.8, max := 0.8, average := 0.8) AS score_summary,
+        [struct_pack(benchmark := 'MMLU', benchmarkKey := 'mmlu', score := 0.8, metric := 'accuracy')] AS top_scores,
+        ['https://example.test/source']::VARCHAR[] AS source_urls,
+        []::VARCHAR[] AS detail_urls,
+        [struct_pack(
+          variant_id := 'default',
+          variant_key := 'default',
+          variant_label := 'Default',
+          variant_display_name := 'GPT 5',
+          raw_model_ids := ['openai/gpt-5']::VARCHAR[],
+          family_id := 'openai/gpt-5',
+          family_name := 'GPT 5',
+          version_date := NULL::VARCHAR,
+          version_qualifier := NULL::VARCHAR,
+          total_evaluations := 1,
+          last_updated := TIMESTAMP '2026-05-03 00:00:00',
+          categories_covered := ['Reasoning']::VARCHAR[]
+        )] AS variants,
+        ['openai/gpt-5']::VARCHAR[] AS raw_model_ids
+    `,
+    path.join(snapshotDir, "models_view.parquet")
+  )
+  await copyParquet(
+    connection,
+    `
+      SELECT
+        TIMESTAMP '2026-05-03 00:00:00' AS snapshot_id,
+        'mmlu' AS evaluation_id,
+        'mmlu' AS benchmark_id,
+        'accuracy' AS primary_metric_id,
+        'MMLU' AS evaluation_name,
+        'MMLU' AS canonical_display_name,
+        'mmlu' AS composite_benchmark_key,
+        'MMLU' AS composite_benchmark_name,
+        'mmlu' AS benchmark_family_key,
+        'mmlu' AS benchmark_leaf_key,
+        'Reasoning' AS category,
+        struct_pack(
+          evaluation_description := 'Accuracy on MMLU',
+          lower_is_better := false,
+          score_type := 'continuous',
+          min_score := 0.0,
+          max_score := 1.0,
+          unit := 'proportion'
+        ) AS metric_config,
+        1::BIGINT AS models_count,
+        ['OpenAI']::VARCHAR[] AS evaluator_names,
+        ['documentation']::VARCHAR[] AS source_types,
+        'OpenAI' AS latest_source_name,
+        0.0 AS third_party_ratio,
+        0::INTEGER AS missing_generation_config_count,
+        struct_pack(name := 'GPT 5', score := 0.8) AS best_model,
+        struct_pack(name := 'GPT 5', score := 0.8) AS worst_model,
+        0.8 AS avg_score,
+        0.8 AS avg_score_norm,
+        0.8 AS top_score,
+        false AS has_card,
+        NULL AS benchmark_card,
+        false AS is_aggregated,
+        [] AS aggregate_sources,
+        false AS is_summary_score,
+        []::VARCHAR[] AS summary_eval_ids,
+        struct_pack(domains := ['knowledge']::VARCHAR[], languages := ['en']::VARCHAR[], tasks := ['qa']::VARCHAR[]) AS tags,
+        struct_pack(
+          dataset_name := 'MMLU',
+          source_type := 'documentation',
+          hf_repo := NULL::VARCHAR,
+          hf_split := NULL::VARCHAR,
+          samples_number := 10,
+          url := ['https://example.test/mmlu']::VARCHAR[],
+          dataset_url := 'https://example.test/mmlu',
+          dataset_version := 'v1'
+        ) AS source_data,
+        struct_pack(results_total := 1, has_reproducibility_gap_count := 0, populated_ratio_avg := 1.0) AS reproducibility_summary,
+        struct_pack(
+          total_results := 1,
+          total_groups := 1,
+          multi_source_groups := 0,
+          first_party_only_groups := 1,
+          source_type_distribution := struct_pack(first_party := 1, third_party := 0, collaborative := 0, unspecified := 0)
+        ) AS provenance_summary,
+        struct_pack(
+          total_groups := 1,
+          groups_with_variant_check := 0,
+          groups_with_cross_party_check := 0,
+          variant_divergent_count := 0,
+          cross_party_divergent_count := 0
+        ) AS comparability_summary,
+        struct_pack(available := false, url_count := 0::BIGINT, sample_urls := []::VARCHAR[], models_with_loaded_instances := 0) AS instance_data,
+        1::INTEGER AS metrics_count,
+        ['Accuracy']::VARCHAR[] AS metric_names,
+        [struct_pack(
+          column_key := 'root:accuracy',
+          metric_summary_id := 'mmlu%3Aaccuracy',
+          metric_id := 'accuracy',
+          metric_name := 'accuracy',
+          display_name := 'Accuracy',
+          canonical_display_name := 'Accuracy',
+          lower_is_better := false,
+          unit := 'proportion',
+          scope := 'root',
+          subtask_key := NULL::VARCHAR,
+          subtask_name := NULL::VARCHAR
+        )] AS leaderboard_metrics,
+        [] AS leaderboard_rows,
+        [struct_pack(
+          metric_summary_id := 'mmlu%3Aaccuracy',
+          metric_name := 'accuracy',
+          display_name := 'Accuracy',
+          canonical_display_name := 'Accuracy',
+          metric_key := 'accuracy',
+          lower_is_better := false,
+          models_count := 1,
+          top_score := 0.8,
+          unit := 'proportion'
+        )] AS root_metrics,
+        [] AS subtasks,
+        0::INTEGER AS subtasks_count
+    `,
+    path.join(snapshotDir, "evals_view.parquet")
+  )
+  await copyParquet(
+    connection,
+    `
+      SELECT
+        TIMESTAMP '2026-05-03 00:00:00' AS snapshot_id,
+        'mmlu' AS evaluation_id,
+        'mmlu%3Aaccuracy' AS metric_summary_id,
+        'mmlu' AS benchmark_id,
+        'accuracy' AS metric_id,
+        'openai/gpt-5' AS model_key,
+        'openai/gpt-5' AS model_id,
+        'openai%2Fgpt-5' AS model_route_id,
+        struct_pack(
+          name := 'GPT 5',
+          id := 'openai/gpt-5',
+          developer := 'OpenAI',
+          inference_platform := 'platform',
+          inference_engine := 'engine',
+          model_version := NULL::VARCHAR,
+          architecture := 'transformer',
+          parameter_count := '100B',
+          release_date := '2026-01-01',
+          model_url := 'https://example.test/model',
+          modalities := struct_pack(input := ['text']::VARCHAR[], output := ['text']::VARCHAR[])
+        ) AS model_info,
+        'Accuracy' AS metric_display_name,
+        'proportion' AS metric_unit,
+        false AS lower_is_better,
+        'Reasoning' AS category,
+        0.8 AS score,
+        struct_pack(
+          score := 0.8,
+          standard_error := 0.01,
+          sample_size := 10,
+          confidence_interval := struct_pack(lower := 0.7, upper := 0.9, confidence_level := 0.95)
+        ) AS score_details,
+        1::INTEGER AS fact_row_count,
+        1::INTEGER AS position,
+        1::INTEGER AS total,
+        1.0 AS percentile,
+        TIMESTAMP '2026-05-03 00:00:00' AS evaluation_timestamp,
+        struct_pack(
+          source_name := 'OpenAI report',
+          source_type := 'documentation',
+          source_organization_name := 'OpenAI',
+          source_organization_url := 'https://example.test',
+          evaluator_relationship := 'first_party',
+          source_url := 'https://example.test/report',
+          publication_date := DATE '2026-05-03'
+        ) AS source_metadata,
+        struct_pack(
+          dataset_name := 'MMLU',
+          source_type := 'documentation',
+          hf_repo := NULL::VARCHAR,
+          hf_split := NULL::VARCHAR,
+          samples_number := 10,
+          url := ['https://example.test/mmlu']::VARCHAR[],
+          dataset_url := 'https://example.test/mmlu',
+          dataset_version := 'v1'
+        ) AS source_data,
+        'https://example.test/record.json' AS source_record_url,
+        struct_pack(name := 'openai-evals', version := '1.0', fork := NULL::VARCHAR) AS eval_library,
+        ['first_party']::VARCHAR[] AS evaluator_relationships,
+        true AS has_first_party,
+        false AS has_third_party,
+        'self' AS coverage_cell,
+        ['OpenAI']::VARCHAR[] AS reporting_orgs,
+        map(['OpenAI'], [0.8]) AS scores_by_organization,
+        false AS is_summary_score,
+        NULL::VARCHAR AS summary_score_for,
+        [] AS aggregate_components,
+        false AS has_reproducibility_gap,
+        1.0 AS completeness_score,
+        false AS is_multi_source,
+        true AS first_party_only,
+        false AS has_variant_divergence,
+        false AS has_cross_party_divergence,
+        NULL AS evalcards_annotations,
+        NULL::VARCHAR AS instance_file_path,
+        NULL::VARCHAR AS instance_file_format,
+        0::INTEGER AS instance_rows
+    `,
+    path.join(snapshotDir, "eval_results_view.parquet")
+  )
+}
+/**
+ * Writes the JSON side of the snapshot: `manifest.json`, whose `summary_artifacts`
+ * names the other two files, then `headline.json` and `hierarchy.json`.
+ */
+async function writeSyntheticSummaryArtifacts(snapshotDir: string) {
+  await writeFile(
+    path.join(snapshotDir, "manifest.json"),
+    JSON.stringify({
+      generated_at: "2026-05-03T00:00:00Z",
+      config_version: 2,
+      skipped_configs: [],
+      model_count: 1,
+      eval_count: 1,
+      metric_eval_count: 1,
+      source_config_count: 1,
+      skipped_config_count: 0,
+      summary_artifacts: {
+        corpus_aggregates: "headline.json",
+        eval_hierarchy: "hierarchy.json",
+      },
+    })
+  )
+  // Each block below is reused for both `overall` and `by_category.Reasoning` in headline.json.
+  const reproducibilityBlock = {
+    total_triples: 1,
+    triples_with_reproducibility_gap: 0,
+    reproducibility_gap_rate: 0,
+    agentic_triples: 0,
+    per_field_missingness: {
+      temperature: {
+        missing_count: 0,
+        missing_rate: 0,
+        denominator: "all_triples",
+        denominator_count: 1,
+      },
+    },
+  }
+  const completenessBlock = {
+    total_triples: 1,
+    completeness_avg: 0.75,
+    completeness_min: 0.75,
+    completeness_max: 0.75,
+  }
+  const provenanceBlock = {
+    total_triples: 1,
+    multi_source_triples: 0,
+    first_party_only_triples: 1,
+    source_type_distribution: {
+      first_party: 1,
+      third_party: 0,
+      collaborative: 0,
+      unspecified: 0,
+    },
+  }
+  const comparabilityBlock = {
+    total_triples: 1,
+    variant_divergent_count: 0,
+    cross_party_divergent_count: 0,
+    groups_with_variant_check: 1,
+    groups_with_cross_party_check: 0,
+  }
+  await writeFile(
+    path.join(snapshotDir, "headline.json"),
+    JSON.stringify({
+      generated_at: "2026-05-03T00:00:00Z",
+      signal_version: "1.0",
+      stratification_dimensions: ["category"],
+      reproducibility: {
+        overall: reproducibilityBlock,
+        by_category: { Reasoning: reproducibilityBlock },
+      },
+      completeness: {
+        overall: completenessBlock,
+        by_category: { Reasoning: completenessBlock },
+      },
+      provenance: {
+        overall: provenanceBlock,
+        by_category: { Reasoning: provenanceBlock },
+      },
+      comparability: {
+        overall: comparabilityBlock,
+        by_category: { Reasoning: comparabilityBlock },
+      },
+      developers: [
+        {
+          developer: "OpenAI",
+          route_id: "OpenAI",
+          model_count: 1,
+          benchmark_count: 1,
+          evaluation_count: 1,
+          popular_evals: [{ benchmark: "MMLU", model_count: 1 }],
+        },
+      ],
+    })
+  )
+  await writeFile(
+    path.join(snapshotDir, "hierarchy.json"),
+    JSON.stringify({
+      stats: {
+        family_count: 1,
+        composite_count: 0,
+        standalone_benchmark_count: 1,
+        single_benchmark_count: 1,
+        slice_count: 0,
+        metric_count: 1,
+        metric_rows_scanned: 1,
+      },
+      families: [],
+    })
+  )
+}
+describe("Stage J view-layer backend", () => {
+  // Puts one environment variable back to the value captured before the test
+  // (removing it entirely when it was not set).
+  const restoreEnv = (name: "DATA_BACKEND" | "SNAPSHOT_URL", previous: string | undefined) => {
+    if (previous == null) {
+      delete process.env[name]
+      return
+    }
+    process.env[name] = previous
+  }
+  // End-to-end check: write a synthetic snapshot to a temp directory, point
+  // DATA_BACKEND/SNAPSHOT_URL at it, then read it back through the accessors.
+  it("reads a pinned snapshot through the v2 accessors", async () => {
+    const snapshotDir = await mkdtemp(path.join(os.tmpdir(), "eval-card-stage-j-"))
+    const savedBackend = process.env.DATA_BACKEND
+    const savedSnapshotUrl = process.env.SNAPSHOT_URL
+    try {
+      await writeSyntheticStageJSnapshot(snapshotDir)
+      process.env.DATA_BACKEND = "v2"
+      process.env.SNAPSHOT_URL = `file://${snapshotDir}`
+      // The modules are imported only after the environment has been set.
+      const backend = await import("../lib/data-backend")
+      const hf = await import("../lib/hf-data")
+      const [
+        modelCards,
+        evalList,
+        modelDetail,
+        evalDetail,
+        developerList,
+        developerDetail,
+        manifestData,
+        hierarchyData,
+        corpusAggregates,
+      ] = await Promise.all([
+        backend.getModelCardsLite(),
+        backend.getEvalListLiteData(),
+        backend.getModelSummaryById("openai%2Fgpt-5"),
+        backend.getEvalSummaryById("mmlu"),
+        backend.getDeveloperList(),
+        backend.getDeveloperSummaryById("OpenAI"),
+        backend.getBackendManifestData(),
+        backend.getEvalHierarchyData(),
+        hf.fetchCorpusAggregates(),
+      ])
+      // Model and evaluation listings.
+      expect(modelCards[0]).toMatchObject({
+        route_id: "openai%2Fgpt-5",
+        model_name: "GPT 5",
+        evaluations_count: 1,
+      })
+      expect(evalList).toMatchObject({
+        totalModels: 1,
+        evals: [{ evaluation_id: "mmlu", evaluation_name: "MMLU", models_count: 1 }],
+      })
+      // Per-model and per-evaluation detail lookups.
+      expect(modelDetail?.evaluations_by_category.Reasoning).toHaveLength(1)
+      expect(evalDetail?.model_results[0]).toMatchObject({
+        model_route_id: "openai%2Fgpt-5",
+        score: 0.8,
+        result: { metric_summary_id: "mmlu%3Aaccuracy" },
+      })
+      // Developer views.
+      expect(developerList[0]).toMatchObject({ developer: "OpenAI", route_id: "OpenAI" })
+      expect(developerDetail?.models).toHaveLength(1)
+      // Manifest, hierarchy and corpus aggregates read from the JSON files.
+      expect(manifestData.model_count).toBe(1)
+      expect(hierarchyData.stats?.metric_rows_scanned).toBe(1)
+      expect(corpusAggregates?.completeness.overall).toMatchObject({
+        total_triples: 1,
+        completeness_avg: 0.75,
+      })
+      expect(corpusAggregates?.provenance.overall).toMatchObject({
+        total_triples: 1,
+        first_party_only_triples: 1,
+      })
+      expect(corpusAggregates?.comparability.overall).toMatchObject({
+        groups_with_variant_check: 1,
+        variant_divergent_count: 0,
+      })
+      expect(corpusAggregates?.comparability.by_category.Reasoning).toBeDefined()
+    } finally {
+      // Undo the environment changes first, then remove the temp snapshot.
+      restoreEnv("DATA_BACKEND", savedBackend)
+      restoreEnv("SNAPSHOT_URL", savedSnapshotUrl)
+      await rm(snapshotDir, { recursive: true, force: true })
+    }
+  })
+})