Spaces:

evaleval
/

general-eval-card

Running

evijit HF Staff Claude Opus 4.7 (1M context) committed on Apr 27

Commit

bca888a

1 Parent(s): 431b0cc

Add interpretive signals, corpus dashboard, and slice browser

Surfaces reproducibility, reporting completeness, provenance, and
comparability signals from the backend across eval detail, model compare,
eval list cards, and a new /corpus dashboard. Adapts to the upstream
2-level hierarchy (family → leaf), caps the leaderboard at 24 default
columns and replaces the slice tabs with a search dialog when a
benchmark has more than 5 slices.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

Files changed (29) hide show

app/api/corpus-aggregates/route.ts +16 -0
app/corpus/page.tsx +36 -0
app/evals/page.tsx +308 -19
components/benchmark-detail.tsx +45 -53
components/benchmark-evaluation-card.tsx +19 -0
components/eval-card.tsx +11 -6
components/eval-detail.tsx +274 -29
components/model-compare-dialog.tsx +15 -0
components/navigation.tsx +6 -0
components/signals/comparability-panel.tsx +193 -0
components/signals/completeness-panel.tsx +147 -0
components/signals/corpus-dashboard.tsx +442 -0
components/signals/cross-party-divergence-badge.tsx +46 -0
components/signals/provenance-badge.tsx +124 -0
components/signals/reproducibility-badge.tsx +46 -0
components/signals/reproducibility-panel.tsx +60 -0
components/signals/signal-tooltip.tsx +31 -0
components/signals/signal-utils.ts +105 -0
components/signals/signals-row-badges.tsx +68 -0
components/signals/variant-divergence-badge.tsx +46 -0
docs/INTERPRETIVE_SIGNALS.md +622 -0
lib/backend-artifacts.ts +230 -7
lib/benchmark-schema.ts +8 -1
lib/dashboard-data-client.ts +5 -1
lib/eval-processing.ts +7 -1
lib/hf-data.ts +118 -6
lib/model-data.ts +51 -28
public/peer-ranks.json +0 -0
scripts/cache-hf-data.mjs +2 -0

app/api/corpus-aggregates/route.ts ADDED Viewed

	@@ -0,0 +1,16 @@

+import { NextResponse } from "next/server"
+import { fetchCorpusAggregates } from "@/lib/hf-data"
+/**
+ * GET handler for the corpus-aggregates API route.
+ *
+ * Awaits `fetchCorpusAggregates()` and returns its result as a JSON response.
+ * When the result is falsy, responds with a JSON error body and HTTP status
+ * 404 instead.
+ */
+export async function GET() {
+  const aggregates = await fetchCorpusAggregates()
+  // A falsy result is surfaced to the client as "not available" (404).
+  if (!aggregates) {
+    return NextResponse.json(
+      { error: "Corpus aggregates not available" },
+      { status: 404 }
+    )
+  }
+  return NextResponse.json(aggregates)
+}

app/corpus/page.tsx ADDED Viewed

	@@ -0,0 +1,36 @@

+import { CorpusDashboard } from "@/components/signals/corpus-dashboard"
+import { Navigation } from "@/components/navigation"
+import { fetchCorpusAggregates, fetchEvalListLite } from "@/lib/hf-data"
+/**
+ * Async page component for the corpus dashboard.
+ *
+ * Loads the corpus aggregates and the lightweight eval list in parallel,
+ * extracts the per-eval reporting-completeness scores, and renders
+ * `CorpusDashboard` when aggregates are present. When `aggregates` is falsy it
+ * renders a placeholder section explaining that the data is not available yet.
+ */
+export default async function CorpusPage() {
+  // The eval-list fetch falls back to an empty list on rejection, so the page
+  // can still render without completeness scores.
+  // NOTE(review): `fetchCorpusAggregates()` has no equivalent `.catch`, so a
+  // rejection there would reject the whole `Promise.all` — confirm whether it
+  // can throw or always resolves to a falsy value when data is missing.
+  const [aggregates, evalList] = await Promise.all([
+    fetchCorpusAggregates(),
+    fetchEvalListLite().catch(() => ({ evals: [] })),
+  ])
+  // Keep only finite numeric completeness scores; entries without one are dropped.
+  const completenessScores = evalList.evals
+    .map((entry) => entry.evalcards?.annotations?.reporting_completeness?.completeness_score)
+    .filter((score): score is number => typeof score === "number" && Number.isFinite(score))
+  return (
+    <div className="min-h-screen bg-background">
+      <Navigation />
+      <main className="container mx-auto px-4 py-8">
+        {aggregates ? (
+          <CorpusDashboard aggregates={aggregates} completenessScores={completenessScores} />
+        ) : (
+          <section className="rounded-2xl border border-dashed border-border/70 bg-card p-8 text-center">
+            <div className="text-[11px] font-semibold uppercase tracking-[0.22em] text-muted-foreground">
+              Interpretive signals
+            </div>
+            <h1 className="mt-2 text-2xl font-semibold tracking-tight">Corpus aggregates are not available yet</h1>
+            <p className="mx-auto mt-3 max-w-2xl text-sm leading-6 text-muted-foreground">
+              The frontend is ready for `corpus-aggregates.json`, but this cached backend snapshot does not include it yet.
+              Once the dataset ships the file, this page will render reproducibility, completeness, provenance, and comparability rollups.
+            </p>
+          </section>
+        )}
+      </main>
+    </div>
+  )
+}

app/evals/page.tsx CHANGED Viewed

@@ -11,7 +11,7 @@ import { PageHeader } from "@/components/page-header"
 import { Button } from "@/components/ui/button"
 import { Collapsible, CollapsibleContent, CollapsibleTrigger } from "@/components/ui/collapsible"
 import { Input } from "@/components/ui/input"
-import type { EvalHierarchy } from "@/lib/backend-artifacts"
 import type { BenchmarkCard, CategoryType } from "@/lib/benchmark-schema"
 import type { BenchmarkEvalListItem } from "@/lib/eval-processing"
 import { fetchBenchmarkMetadata, fetchEvalHierarchy, fetchEvalList } from "@/lib/dashboard-data-client"
@@ -251,7 +251,7 @@ interface EvalBrowserMatrixPreviewRow {
   value: string
 }
-interface EvalBrowserNode {
   id: string
   parentId: string | null
   kind: EvalBrowserNodeKind
@@ -261,6 +261,7 @@ interface EvalBrowserNode {
   description: string
   category: CategoryType
   domains: string[]
   dataType?: string
   license?: string
   card?: BenchmarkCard
@@ -272,6 +273,8 @@ interface EvalBrowserNode {
   childIds: string[]
   href?: string
   scopeKeys: string[]
   matrixPreview?: {
     columnLabel: string
     rows: EvalBrowserMatrixPreviewRow[]
@@ -336,6 +339,86 @@ function summarizeNodeStats(
     0
   )
   return {
     category: getDominantCategory(summaries, fallbackCategory),
     modelsCount,
@@ -346,6 +429,10 @@ function summarizeNodeStats(
       summaries[0]?.source_data?.hf_repo ??
       summaries[0]?.source_data?.dataset_name ??
       "Hierarchy summary",
   }
 }
@@ -435,6 +522,94 @@ function getNodeCard(
   return undefined
 }
 function looksLikeLanguageSplit(value: string) {
   const normalized = normalizeBenchmarkKey(value)
   const languageLike = new Set([
@@ -528,6 +703,7 @@ export default function EvalsPage() {
   const [totalModels, setTotalModels] = useState(0)
   const [searchQuery, setSearchQuery] = useState("")
   const [selectedDomain, setSelectedDomain] = useState<string | null>(null)
   const [selectedCategory, setSelectedCategory] = useState<string | null>(null)
   const [selectedNodeKind, setSelectedNodeKind] = useState<EvalBrowserNodeKind | null>(null)
   const [currentNodeId, setCurrentNodeId] = useState<string | null>(null)
@@ -558,10 +734,12 @@ export default function EvalsPage() {
       const params = new URLSearchParams(window.location.search)
       const incomingSearch = params.get("search") ?? ""
       const incomingDomain = params.get("domain")
       const incomingCategory = params.get("category")
       const incomingNode = params.get("node")
       setSearchQuery(incomingSearch)
       setSelectedDomain(incomingDomain)
       setSelectedCategory(incomingCategory)
       setCurrentNodeId(incomingNode)
     }
@@ -586,6 +764,9 @@ export default function EvalsPage() {
     if (selectedDomain) {
       params.set("domain", selectedDomain)
     }
     if (selectedCategory) {
       params.set("category", selectedCategory)
     }
@@ -607,7 +788,7 @@ export default function EvalsPage() {
     }
     pendingHistoryActionRef.current = "replace"
-  }, [currentNodeId, searchQuery, selectedCategory, selectedDomain])
   const summariesWithCards = useMemo(() => {
     return summaries.map((summary) => {
@@ -665,6 +846,7 @@ export default function EvalsPage() {
       suiteLabel,
       category,
       domains,
       summaries,
       card,
       sourceLabel,
@@ -682,6 +864,7 @@ export default function EvalsPage() {
       suiteLabel?: string
       category: CategoryType
       domains: string[]
       summaries: BenchmarkEvalListItem[]
       card?: BenchmarkCard
       sourceLabel?: string
@@ -692,6 +875,7 @@ export default function EvalsPage() {
       descriptionFallback: string
     }) => {
       const stats = summarizeNodeStats(summaries, category)
       addNode({
         id,
         parentId,
@@ -702,6 +886,7 @@ export default function EvalsPage() {
         description: buildDescription(title, card, descriptionFallback),
         category: stats.category,
         domains: Array.from(new Set(domains.flatMap((domain) => normalizeDomainList(domain)))),
         dataType: card?.benchmark_details?.data_type,
         license: card?.ethical_and_legal_considerations?.data_licensing,
         card,
@@ -714,6 +899,10 @@ export default function EvalsPage() {
         href,
         scopeKeys,
         matrixPreview,
       })
     }
@@ -799,6 +988,7 @@ export default function EvalsPage() {
       slices = [],
       metrics = [],
       scopeKeys,
     }: {
       parentId: string | null
       familyLabel?: string
@@ -812,6 +1002,8 @@ export default function EvalsPage() {
       slices?: Array<{ key: string; display_name: string; metrics: Array<{ key: string; display_name: string }> }>
       metrics?: Array<{ key: string; display_name: string }>
       scopeKeys: string[]
     }) => {
       const benchmarkId = `${parentId ?? "root"}::benchmark:${normalizeBenchmarkKey(benchmarkKey)}`
       const card = summary?.benchmark_card ?? getNodeCard(benchmarkCards, ...cardCandidates)
@@ -821,6 +1013,13 @@ export default function EvalsPage() {
         !summary && metrics.length > 0
           ? scopeKeys.map((scopeKey) => pickSummaryForKey(summariesWithCards, scopeKey, scopeKeys)).find(Boolean)
           : undefined
       const isParentRollupBenchmark =
         Boolean(parentId) && scopeKeys.some((scopeKey) => isSameHierarchyKey(scopeKey, benchmarkKey))
@@ -829,10 +1028,10 @@ export default function EvalsPage() {
         if (drilldownSlices.length > 0) {
           createSliceNodes(parentId, parentLabel, summary, drilldownSlices, category, scopeKeys)
-        } else if (summary) {
           const parent = nodes.get(parentId)
           if (parent && !parent.href) {
-            parent.href = `/evals/${summary.evaluation_id}`
           }
         }
         return
@@ -849,14 +1048,7 @@ export default function EvalsPage() {
         domains,
         summaries: summary ? [summary] : [],
         card,
-        href:
-          drilldownSlices.length === 0
-            ? summary
-              ? `/evals/${summary.evaluation_id}`
-              : fallbackSummary
-                ? `/evals/${fallbackSummary.evaluation_id}`
-                : undefined
-            : undefined,
         scopeKeys,
         descriptionFallback: `Browse the {label} benchmark and its lower-level breakdowns.`,
       })
@@ -1038,6 +1230,7 @@ export default function EvalsPage() {
             slices: standalone.slices ?? [],
             metrics: standalone.metrics ?? [],
             scopeKeys: familyScopeKeys,
           })
         }
@@ -1199,6 +1392,8 @@ export default function EvalsPage() {
               })),
         metrics: benchmarkSource?.metrics ?? family.metrics ?? [],
         scopeKeys: familyScopeKeys,
       })
     }
@@ -1243,6 +1438,7 @@ export default function EvalsPage() {
         node.description,
         node.sourceLabel,
         ...node.domains,
       ]
       return haystacks.some((value) => value?.toLowerCase().includes(query))
@@ -1261,6 +1457,12 @@ export default function EvalsPage() {
       domainCandidates = domainCandidates.filter((node) => node.category === selectedCategory)
     }
     for (const node of domainCandidates) {
       for (const domain of node.domains) {
         domainSet.add(domain)
@@ -1268,7 +1470,34 @@ export default function EvalsPage() {
     }
     return Array.from(domainSet).sort((a, b) => a.localeCompare(b))
-  }, [nodesMatchingSearch, selectedCategory])
   const allCategories = useMemo(() => {
     const categorySet = new Set<string>()
@@ -1284,12 +1513,18 @@ export default function EvalsPage() {
       )
     }
     for (const node of categoryCandidates) {
       categorySet.add(node.category)
     }
     return Array.from(categorySet).sort((a, b) => a.localeCompare(b))
-  }, [nodesMatchingSearch, selectedDomain])
   const filtered = useMemo(() => {
     let list = [...nodesMatchingSearch]
@@ -1306,13 +1541,21 @@ export default function EvalsPage() {
       )
     }
     if (selectedCategory) {
       list = list.filter((node) => node.category === selectedCategory)
     }
     list.sort((a, b) => a.title.localeCompare(b.title, undefined, { sensitivity: "base" }))
     return list
-  }, [nodesMatchingSearch, selectedCategory, selectedDomain, selectedNodeKind])
   useEffect(() => {
     if (selectedDomain && !allDomains.includes(selectedDomain)) {
@@ -1320,6 +1563,12 @@ export default function EvalsPage() {
     }
   }, [allDomains, selectedDomain])
   useEffect(() => {
     if (selectedCategory && !allCategories.includes(selectedCategory)) {
       setSelectedCategory(null)
@@ -1328,7 +1577,7 @@ export default function EvalsPage() {
   useEffect(() => {
     setPage(1)
-  }, [currentNodeId, searchQuery, selectedCategory, selectedDomain, selectedNodeKind])
   const pagedNodes = useMemo(
     () => filtered.slice((page - 1) * PAGE_SIZE, page * PAGE_SIZE),
@@ -1336,7 +1585,7 @@ export default function EvalsPage() {
   )
   const currentLevelKinds = Array.from(new Set(currentLevelNodes.map((node) => node.kind)))
-  const activeFilterCount = [searchQuery.trim(), selectedDomain, selectedCategory, selectedNodeKind].filter(Boolean).length
   const currentLevelLabel =
     currentNodeId === null
       ? "Rollout entry level"
@@ -1480,6 +1729,7 @@ export default function EvalsPage() {
                   onClick={() => {
                     setSearchQuery("")
                     setSelectedDomain(null)
                     setSelectedCategory(null)
                     setSelectedNodeKind(null)
                   }}
@@ -1545,7 +1795,7 @@ export default function EvalsPage() {
                 </div>
               </div>
-              {hierarchy && (
                 <div className="flex flex-wrap gap-2 text-sm">
                   <span className="rounded-full border border-stone-200/80 bg-stone-50/80 px-3 py-1.5 font-medium text-stone-700 dark:border-stone-700/80 dark:bg-stone-900/70 dark:text-stone-200">
                     {hierarchy.stats.family_count} families
@@ -1669,6 +1919,43 @@ export default function EvalsPage() {
                   </div>
                 )}
                 {allCategories.length > 0 && (
                   <div className="mt-4 space-y-1.5">
                     <div className="text-[11px] font-semibold uppercase tracking-[0.2em] text-stone-500 dark:text-stone-400">
@@ -1794,6 +2081,8 @@ export default function EvalsPage() {
                     {node.title}
                   </h3>
                   {node.description && (
                     <p className="mb-4 flex-1 text-sm leading-6 text-stone-600 line-clamp-3 dark:text-stone-300">
                       {node.description}

 import { Button } from "@/components/ui/button"
 import { Collapsible, CollapsibleContent, CollapsibleTrigger } from "@/components/ui/collapsible"
 import { Input } from "@/components/ui/input"
+import type { EvalHierarchy, SignalSummaries } from "@/lib/backend-artifacts"
 import type { BenchmarkCard, CategoryType } from "@/lib/benchmark-schema"
 import type { BenchmarkEvalListItem } from "@/lib/eval-processing"
 import { fetchBenchmarkMetadata, fetchEvalHierarchy, fetchEvalList } from "@/lib/dashboard-data-client"
   value: string
 }
+interface EvalBrowserNode extends SignalSummaries {
   id: string
   parentId: string | null
   kind: EvalBrowserNodeKind
   description: string
   category: CategoryType
   domains: string[]
+  tasks: string[]
   dataType?: string
   license?: string
   card?: BenchmarkCard
   childIds: string[]
   href?: string
   scopeKeys: string[]
+  /** Reporting completeness score in [0, 1] when known, otherwise undefined. */
+  completenessScore?: number
   matrixPreview?: {
     columnLabel: string
     rows: EvalBrowserMatrixPreviewRow[]
     0
   )
+  // Aggregate signals across all summaries under this node so a family card
+  // can show signals that span its children.
+  const reproducibilitySummaries = summaries
+    .map((s) => s.reproducibility_summary)
+    .filter((value): value is NonNullable<typeof value> => Boolean(value))
+  const provenanceSummaries = summaries
+    .map((s) => s.provenance_summary)
+    .filter((value): value is NonNullable<typeof value> => Boolean(value))
+  const comparabilitySummaries = summaries
+    .map((s) => s.comparability_summary)
+    .filter((value): value is NonNullable<typeof value> => Boolean(value))
+  const reproducibility_summary = reproducibilitySummaries.length
+    ? reproducibilitySummaries.reduce(
+        (acc, item) => ({
+          results_total: acc.results_total + item.results_total,
+          has_reproducibility_gap_count:
+            acc.has_reproducibility_gap_count + item.has_reproducibility_gap_count,
+          populated_ratio_avg: null,
+        }),
+        { results_total: 0, has_reproducibility_gap_count: 0, populated_ratio_avg: null as number | null }
+      )
+    : undefined
+  const provenance_summary = provenanceSummaries.length
+    ? provenanceSummaries.reduce(
+        (acc, item) => {
+          for (const key of ["first_party", "third_party", "collaborative", "unspecified"] as const) {
+            acc.source_type_distribution[key] += item.source_type_distribution[key] ?? 0
+          }
+          return {
+            total_results: acc.total_results + item.total_results,
+            total_groups: acc.total_groups + item.total_groups,
+            multi_source_groups: acc.multi_source_groups + item.multi_source_groups,
+            first_party_only_groups: acc.first_party_only_groups + item.first_party_only_groups,
+            source_type_distribution: acc.source_type_distribution,
+          }
+        },
+        {
+          total_results: 0,
+          total_groups: 0,
+          multi_source_groups: 0,
+          first_party_only_groups: 0,
+          source_type_distribution: {
+            first_party: 0,
+            third_party: 0,
+            collaborative: 0,
+            unspecified: 0,
+          },
+        }
+      )
+    : undefined
+  const comparability_summary = comparabilitySummaries.length
+    ? comparabilitySummaries.reduce(
+        (acc, item) => ({
+          total_groups: acc.total_groups + item.total_groups,
+          groups_with_variant_check: acc.groups_with_variant_check + item.groups_with_variant_check,
+          groups_with_cross_party_check: acc.groups_with_cross_party_check + item.groups_with_cross_party_check,
+          variant_divergent_count: acc.variant_divergent_count + item.variant_divergent_count,
+          cross_party_divergent_count: acc.cross_party_divergent_count + item.cross_party_divergent_count,
+        }),
+        {
+          total_groups: 0,
+          groups_with_variant_check: 0,
+          groups_with_cross_party_check: 0,
+          variant_divergent_count: 0,
+          cross_party_divergent_count: 0,
+        }
+      )
+    : undefined
+  // Average completeness score across summaries that report one.
+  const completenessScores = summaries
+    .map((s) => s.evalcards?.annotations?.reporting_completeness?.completeness_score)
+    .filter((v): v is number => typeof v === "number" && Number.isFinite(v))
+  const completenessScore = completenessScores.length
+    ? completenessScores.reduce((sum, value) => sum + value, 0) / completenessScores.length
+    : undefined
   return {
     category: getDominantCategory(summaries, fallbackCategory),
     modelsCount,
       summaries[0]?.source_data?.hf_repo ??
       summaries[0]?.source_data?.dataset_name ??
       "Hierarchy summary",
+    reproducibility_summary,
+    provenance_summary,
+    comparability_summary,
+    completenessScore,
   }
 }
   return undefined
 }
+/**
+ * Compact signal indicators for a node card. Shown alongside (or instead of)
+ * the benchmark-card-derived metadata so that nodes lacking a benchmark card
+ * still surface useful interpretive context.
+ *
+ * Chips and their display rules:
+ * - documentation completeness: whenever a completeness score is known
+ *   (emerald at >= 50%, amber below);
+ * - setup gaps: when more than 0% of results have a reproducibility gap;
+ * - 1st-party only: when at least 50% of groups have only first-party reports;
+ * - setup divergences / source disagreements: when the respective count is > 0.
+ *
+ * Returns `null` when none of the chips would be displayed, so the card does
+ * not get an empty container (and its bottom margin).
+ */
+function NodeSignalChips({ node }: { node: EvalBrowserNode }) {
+  const repro = node.reproducibility_summary
+  const prov = node.provenance_summary
+  const comparability = node.comparability_summary
+  const completeness = node.completenessScore
+  // Percentages are null when the summary is missing or has a zero denominator.
+  const reproPercent =
+    repro && repro.results_total > 0
+      ? Math.round((repro.has_reproducibility_gap_count / repro.results_total) * 100)
+      : null
+  const firstPartyPercent =
+    prov && prov.total_groups > 0
+      ? Math.round((prov.first_party_only_groups / prov.total_groups) * 100)
+      : null
+  const variantDivergent = comparability?.variant_divergent_count ?? 0
+  const crossPartyDivergent = comparability?.cross_party_divergent_count ?? 0
+  const completenessPercent = completeness != null ? Math.round(completeness * 100) : null
+  // Visibility flags are computed once and shared by the empty-state guard and
+  // the render below, so the guard cannot claim "something to show" for a chip
+  // that is then filtered out by its own threshold.
+  const showCompleteness = completenessPercent !== null
+  const showReproGaps = reproPercent !== null && reproPercent > 0
+  const showFirstParty = firstPartyPercent !== null && firstPartyPercent >= 50
+  const showVariantDivergence = variantDivergent > 0
+  const showCrossPartyDivergence = crossPartyDivergent > 0
+  const hasAny =
+    showCompleteness ||
+    showReproGaps ||
+    showFirstParty ||
+    showVariantDivergence ||
+    showCrossPartyDivergence
+  if (!hasAny) {
+    return null
+  }
+  return (
+    <div className="mb-3 flex flex-wrap gap-1.5">
+      {completenessPercent !== null && (
+        <span
+          className={cn(
+            "inline-flex items-center gap-1 rounded-full border px-2.5 py-0.5 text-[10px] font-semibold",
+            completenessPercent >= 50
+              ? "border-emerald-200 bg-emerald-50 text-emerald-800 dark:border-emerald-900/50 dark:bg-emerald-950/30 dark:text-emerald-200"
+              : "border-amber-200 bg-amber-50 text-amber-800 dark:border-amber-900/50 dark:bg-amber-950/30 dark:text-amber-200"
+          )}
+          title={`Documentation completeness: ${completenessPercent}% of EvalCards fields populated.`}
+        >
+          {completenessPercent}% documented
+        </span>
+      )}
+      {showReproGaps && (
+        <span
+          className="inline-flex items-center gap-1 rounded-full border border-amber-200 bg-amber-50 px-2.5 py-0.5 text-[10px] font-semibold text-amber-800 dark:border-amber-900/50 dark:bg-amber-950/30 dark:text-amber-200"
+          title={`${repro?.has_reproducibility_gap_count.toLocaleString()} of ${repro?.results_total.toLocaleString()} reported scores missing setup details.`}
+        >
+          {reproPercent}% setup gaps
+        </span>
+      )}
+      {showFirstParty && (
+        <span
+          className="inline-flex items-center gap-1 rounded-full border border-amber-200 bg-amber-50 px-2.5 py-0.5 text-[10px] font-semibold text-amber-800 dark:border-amber-900/50 dark:bg-amber-950/30 dark:text-amber-200"
+          title={`${firstPartyPercent}% of (model, metric) groups have only first-party reports — no independent replication.`}
+        >
+          {firstPartyPercent}% 1st-party only
+        </span>
+      )}
+      {showVariantDivergence && (
+        <span
+          className="inline-flex items-center gap-1 rounded-full border border-rose-200 bg-rose-50 px-2.5 py-0.5 text-[10px] font-semibold text-rose-800 dark:border-rose-900/50 dark:bg-rose-950/30 dark:text-rose-200"
+          title={`${variantDivergent} group${variantDivergent === 1 ? "" : "s"} where setup variations produced diverging scores.`}
+        >
+          {variantDivergent} setup divergence{variantDivergent === 1 ? "" : "s"}
+        </span>
+      )}
+      {showCrossPartyDivergence && (
+        <span
+          className="inline-flex items-center gap-1 rounded-full border border-violet-200 bg-violet-50 px-2.5 py-0.5 text-[10px] font-semibold text-violet-800 dark:border-violet-900/50 dark:bg-violet-950/30 dark:text-violet-200"
+          title={`${crossPartyDivergent} group${crossPartyDivergent === 1 ? "" : "s"} where different organizations reported diverging scores.`}
+        >
+          {crossPartyDivergent} source disagreement{crossPartyDivergent === 1 ? "" : "s"}
+        </span>
+      )}
+    </div>
+  )
+}
 function looksLikeLanguageSplit(value: string) {
   const normalized = normalizeBenchmarkKey(value)
   const languageLike = new Set([
   const [totalModels, setTotalModels] = useState(0)
   const [searchQuery, setSearchQuery] = useState("")
   const [selectedDomain, setSelectedDomain] = useState<string | null>(null)
+  const [selectedTask, setSelectedTask] = useState<string | null>(null)
   const [selectedCategory, setSelectedCategory] = useState<string | null>(null)
   const [selectedNodeKind, setSelectedNodeKind] = useState<EvalBrowserNodeKind | null>(null)
   const [currentNodeId, setCurrentNodeId] = useState<string | null>(null)
       const params = new URLSearchParams(window.location.search)
       const incomingSearch = params.get("search") ?? ""
       const incomingDomain = params.get("domain")
+      const incomingTask = params.get("task")
       const incomingCategory = params.get("category")
       const incomingNode = params.get("node")
       setSearchQuery(incomingSearch)
       setSelectedDomain(incomingDomain)
+      setSelectedTask(incomingTask)
       setSelectedCategory(incomingCategory)
       setCurrentNodeId(incomingNode)
     }
     if (selectedDomain) {
       params.set("domain", selectedDomain)
     }
+    if (selectedTask) {
+      params.set("task", selectedTask)
+    }
     if (selectedCategory) {
       params.set("category", selectedCategory)
     }
     }
     pendingHistoryActionRef.current = "replace"
+  }, [currentNodeId, searchQuery, selectedCategory, selectedDomain, selectedTask])
   const summariesWithCards = useMemo(() => {
     return summaries.map((summary) => {
       suiteLabel,
       category,
       domains,
+      tasks,
       summaries,
       card,
       sourceLabel,
       suiteLabel?: string
       category: CategoryType
       domains: string[]
+      tasks?: string[]
       summaries: BenchmarkEvalListItem[]
       card?: BenchmarkCard
       sourceLabel?: string
       descriptionFallback: string
     }) => {
       const stats = summarizeNodeStats(summaries, category)
+      const summaryTasks = summaries.flatMap((summary) => summary.tags?.tasks ?? [])
       addNode({
         id,
         parentId,
         description: buildDescription(title, card, descriptionFallback),
         category: stats.category,
         domains: Array.from(new Set(domains.flatMap((domain) => normalizeDomainList(domain)))),
+        tasks: Array.from(new Set([...(tasks ?? []), ...summaryTasks].map((task) => task.trim()).filter(Boolean))),
         dataType: card?.benchmark_details?.data_type,
         license: card?.ethical_and_legal_considerations?.data_licensing,
         card,
         href,
         scopeKeys,
         matrixPreview,
+        reproducibility_summary: stats.reproducibility_summary,
+        provenance_summary: stats.provenance_summary,
+        comparability_summary: stats.comparability_summary,
+        completenessScore: stats.completenessScore,
       })
     }
       slices = [],
       metrics = [],
       scopeKeys,
+      fallbackEvalId,
     }: {
       parentId: string | null
       familyLabel?: string
       slices?: Array<{ key: string; display_name: string; metrics: Array<{ key: string; display_name: string }> }>
       metrics?: Array<{ key: string; display_name: string }>
       scopeKeys: string[]
+      /** Final-resort eval id when no summary or fallback summary matches; comes from leaf.eval_summary_ids */
+      fallbackEvalId?: string
     }) => {
       const benchmarkId = `${parentId ?? "root"}::benchmark:${normalizeBenchmarkKey(benchmarkKey)}`
       const card = summary?.benchmark_card ?? getNodeCard(benchmarkCards, ...cardCandidates)
         !summary && metrics.length > 0
           ? scopeKeys.map((scopeKey) => pickSummaryForKey(summariesWithCards, scopeKey, scopeKeys)).find(Boolean)
           : undefined
+      const resolvedHref = summary
+        ? `/evals/${summary.evaluation_id}`
+        : fallbackSummary
+          ? `/evals/${fallbackSummary.evaluation_id}`
+          : fallbackEvalId
+            ? `/evals/${fallbackEvalId}`
+            : undefined
       const isParentRollupBenchmark =
         Boolean(parentId) && scopeKeys.some((scopeKey) => isSameHierarchyKey(scopeKey, benchmarkKey))
         if (drilldownSlices.length > 0) {
           createSliceNodes(parentId, parentLabel, summary, drilldownSlices, category, scopeKeys)
+        } else if (resolvedHref) {
           const parent = nodes.get(parentId)
           if (parent && !parent.href) {
+            parent.href = resolvedHref
           }
         }
         return
         domains,
         summaries: summary ? [summary] : [],
         card,
+        href: drilldownSlices.length === 0 ? resolvedHref : undefined,
         scopeKeys,
         descriptionFallback: `Browse the {label} benchmark and its lower-level breakdowns.`,
       })
             slices: standalone.slices ?? [],
             metrics: standalone.metrics ?? [],
             scopeKeys: familyScopeKeys,
+            fallbackEvalId: standalone.summary_eval_ids?.[0],
           })
         }
               })),
         metrics: benchmarkSource?.metrics ?? family.metrics ?? [],
         scopeKeys: familyScopeKeys,
+        fallbackEvalId:
+          benchmarkSource?.summary_eval_ids?.[0] ?? family.eval_summary_ids?.[0],
       })
     }
         node.description,
         node.sourceLabel,
         ...node.domains,
+        ...node.tasks,
       ]
       return haystacks.some((value) => value?.toLowerCase().includes(query))
       domainCandidates = domainCandidates.filter((node) => node.category === selectedCategory)
     }
+    if (selectedTask) {
+      domainCandidates = domainCandidates.filter((node) =>
+        node.tasks.some((task) => task.toLowerCase() === selectedTask.toLowerCase())
+      )
+    }
     for (const node of domainCandidates) {
       for (const domain of node.domains) {
         domainSet.add(domain)
     }
     return Array.from(domainSet).sort((a, b) => a.localeCompare(b))
+  }, [nodesMatchingSearch, selectedCategory, selectedNodeKind, selectedTask])
+  // Task-type filter options: every task found on the nodes that match the
+  // search and the other active filters (kind, category, domain), sorted
+  // alphabetically and capped so the chip list stays manageable.
+  const allTasks = useMemo(() => {
+    const maxTaskChips = 40
+    const taskSet = new Set<string>()
+    let taskCandidates = nodesMatchingSearch
+    if (selectedNodeKind) {
+      taskCandidates = taskCandidates.filter((node) => node.kind === selectedNodeKind)
+    }
+    if (selectedCategory) {
+      taskCandidates = taskCandidates.filter((node) => node.category === selectedCategory)
+    }
+    if (selectedDomain) {
+      taskCandidates = taskCandidates.filter((node) =>
+        node.domains.some((domain) => domain.toLowerCase() === selectedDomain.toLowerCase())
+      )
+    }
+    for (const node of taskCandidates) {
+      for (const task of node.tasks) {
+        taskSet.add(task)
+      }
+    }
+    const sortedTasks = Array.from(taskSet).sort((a, b) => a.localeCompare(b))
+    const visibleTasks = sortedTasks.slice(0, maxTaskChips)
+    // The selected task is reset whenever it is missing from this list, so a
+    // still-valid selection that sorts past the cap must be kept; otherwise
+    // it would be cleared purely because of truncation.
+    if (selectedTask && sortedTasks.includes(selectedTask) && !visibleTasks.includes(selectedTask)) {
+      return [...visibleTasks, selectedTask].sort((a, b) => a.localeCompare(b))
+    }
+    return visibleTasks
+  }, [nodesMatchingSearch, selectedCategory, selectedDomain, selectedNodeKind, selectedTask])
   const allCategories = useMemo(() => {
     const categorySet = new Set<string>()
       )
     }
+    if (selectedTask) {
+      categoryCandidates = categoryCandidates.filter((node) =>
+        node.tasks.some((task) => task.toLowerCase() === selectedTask.toLowerCase())
+      )
+    }
     for (const node of categoryCandidates) {
       categorySet.add(node.category)
     }
     return Array.from(categorySet).sort((a, b) => a.localeCompare(b))
+  }, [nodesMatchingSearch, selectedDomain, selectedNodeKind, selectedTask])
   const filtered = useMemo(() => {
     let list = [...nodesMatchingSearch]
       )
     }
+    if (selectedTask) {
+      list = list.filter((node) =>
+        node.tasks.some(
+          (task) => task.toLowerCase() === selectedTask.toLowerCase()
+        )
+      )
+    }
     if (selectedCategory) {
       list = list.filter((node) => node.category === selectedCategory)
     }
     list.sort((a, b) => a.title.localeCompare(b.title, undefined, { sensitivity: "base" }))
     return list
+  }, [nodesMatchingSearch, selectedCategory, selectedDomain, selectedNodeKind, selectedTask])
   useEffect(() => {
     if (selectedDomain && !allDomains.includes(selectedDomain)) {
     }
   }, [allDomains, selectedDomain])
+  useEffect(() => {
     // Clear the task filter once the selected task is no longer present in allTasks.
+    if (selectedTask && !allTasks.includes(selectedTask)) {
+      setSelectedTask(null)
+    }
+  }, [allTasks, selectedTask])
   useEffect(() => {
     if (selectedCategory && !allCategories.includes(selectedCategory)) {
       setSelectedCategory(null)
   useEffect(() => {
     setPage(1)
+  }, [currentNodeId, searchQuery, selectedCategory, selectedDomain, selectedNodeKind, selectedTask])
   const pagedNodes = useMemo(
     () => filtered.slice((page - 1) * PAGE_SIZE, page * PAGE_SIZE),
   )
   const currentLevelKinds = Array.from(new Set(currentLevelNodes.map((node) => node.kind)))
+  const activeFilterCount = [searchQuery.trim(), selectedDomain, selectedTask, selectedCategory, selectedNodeKind].filter(Boolean).length
   const currentLevelLabel =
     currentNodeId === null
       ? "Rollout entry level"
                   onClick={() => {
                     setSearchQuery("")
                     setSelectedDomain(null)
+                    setSelectedTask(null)
                     setSelectedCategory(null)
                     setSelectedNodeKind(null)
                   }}
                 </div>
               </div>
+              {hierarchy?.stats && (
                 <div className="flex flex-wrap gap-2 text-sm">
                   <span className="rounded-full border border-stone-200/80 bg-stone-50/80 px-3 py-1.5 font-medium text-stone-700 dark:border-stone-700/80 dark:bg-stone-900/70 dark:text-stone-200">
                     {hierarchy.stats.family_count} families
                   </div>
                 )}
+                {allTasks.length > 0 && (
+                  <div className="mt-4 space-y-1.5">
+                    <div className="text-[11px] font-semibold uppercase tracking-[0.2em] text-stone-500 dark:text-stone-400">
+                      Task type
+                    </div>
+                    <div className="flex max-h-40 flex-wrap items-center gap-1.5 overflow-y-auto pr-1">
+                      <button
+                        type="button"
+                        onClick={() => setSelectedTask(null)}
+                        className={cn(
+                          "shrink-0 rounded-full border px-3 py-1.5 text-xs font-medium transition-colors",
+                          selectedTask === null
+                            ? "border-stone-950 bg-stone-950 text-stone-50 dark:border-stone-100 dark:bg-stone-100 dark:text-stone-950"
+                            : "border-stone-200/80 bg-stone-50/80 text-stone-600 hover:bg-stone-100 dark:border-stone-700/80 dark:bg-stone-900/70 dark:text-stone-300 dark:hover:bg-stone-800"
+                        )}
+                      >
+                        All
+                      </button>
+                      {allTasks.map((task) => (
+                        <button
+                          key={task}
+                          type="button"
+                          onClick={() => setSelectedTask(selectedTask === task ? null : task)}
+                          className={cn(
+                            "shrink-0 rounded-full border px-3 py-1.5 text-xs font-medium transition-colors capitalize",
+                            selectedTask === task
+                              ? "border-emerald-300 bg-emerald-50 text-emerald-800 dark:border-emerald-800 dark:bg-emerald-950/50 dark:text-emerald-200"
+                              : "border-stone-200/80 bg-white text-stone-600 hover:bg-stone-50 dark:border-stone-700/80 dark:bg-stone-900 dark:text-stone-300 dark:hover:bg-stone-800"
+                          )}
+                        >
+                          {task}
+                        </button>
+                      ))}
+                    </div>
+                  </div>
+                )}
                 {allCategories.length > 0 && (
                   <div className="mt-4 space-y-1.5">
                     <div className="text-[11px] font-semibold uppercase tracking-[0.2em] text-stone-500 dark:text-stone-400">
                     {node.title}
                   </h3>
+                  <NodeSignalChips node={node} />
                   {node.description && (
                     <p className="mb-4 flex-1 text-sm leading-6 text-stone-600 line-clamp-3 dark:text-stone-300">
                       {node.description}

components/benchmark-detail.tsx CHANGED Viewed

@@ -15,6 +15,12 @@ import { Collapsible, CollapsibleContent, CollapsibleTrigger } from "@/component
 import { Dialog, DialogContent, DialogDescription, DialogHeader, DialogTitle } from "@/components/ui/dialog"
 import { Input } from "@/components/ui/input"
 import { Table, TableBody, TableCell, TableHead, TableHeader, TableRow } from "@/components/ui/table"
 import {
   DropdownMenu,
   DropdownMenuContent,
@@ -259,48 +265,6 @@ function getOrganizationDisplayName(value: string | null | undefined) {
   return normalizeDisplayLabel(value) || "Unknown Organization"
 }
-function getRelationshipDisplayName(value: string | null | undefined) {
-  return normalizeDisplayLabel(value?.replace(/_/g, " ")) || "Unknown"
-}
-/**
- * Short, badge-friendly label for evaluator relationships.
- * Unknown / "other" values fall back to the normalized full name.
- */
-function getRelationshipShortLabel(value: string | null | undefined) {
-  switch ((value ?? "").toLowerCase()) {
-    case "first_party":
-      return "1st party"
-    case "third_party":
-      return "3rd party"
-    case "collaborative":
-      return "Collaborative"
-    case "other":
-      return "Other"
-    default:
-      return getRelationshipDisplayName(value)
-  }
-}
-/**
- * Tone classes for the relationship badge so readers can scan first-party
- * vs third-party reports at a glance without reading the text.
- */
-function getRelationshipBadgeTone(value: string | null | undefined): string {
-  switch ((value ?? "").toLowerCase()) {
-    case "first_party":
-      // Self-reported by the model's developer — caution tone.
-      return "border-amber-300 bg-amber-50 text-amber-900 dark:border-amber-900/60 dark:bg-amber-950/40 dark:text-amber-100"
-    case "third_party":
-      // Independently evaluated — confidence tone.
-      return "border-emerald-300 bg-emerald-50 text-emerald-900 dark:border-emerald-900/60 dark:bg-emerald-950/40 dark:text-emerald-100"
-    case "collaborative":
-      return "border-sky-300 bg-sky-50 text-sky-900 dark:border-sky-900/60 dark:bg-sky-950/40 dark:text-sky-100"
-    default:
-      return "border-border/70 bg-muted/40 text-muted-foreground"
-  }
-}
 /**
  * Display label for a source type: underscores are replaced with spaces and the
  * result is passed through normalizeDisplayLabel; a falsy label becomes "Unknown".
  */
 function getSourceTypeDisplayName(value: string | null | undefined) {
   const spacedValue = value?.replace(/_/g, " ")
   const label = normalizeDisplayLabel(spacedValue)
   return label || "Unknown"
 }
@@ -1798,6 +1762,10 @@ export function BenchmarkDetail({
       thirdPartyEvaluations,
     }
   }, [allEvaluations])
   const allCategoryResults = useMemo(
     () =>
@@ -1868,14 +1836,14 @@ export function BenchmarkDetail({
     }
     const reproducibilityCopy =
-      reportingStats.missingGenerationConfigs === 0
         ? null
-        : reportingStats.missingGenerationConfigs === summary.total_evaluations
           ? "How this model was prompted during testing is not documented. Scores cannot be independently confirmed."
-          : "How this model was prompted during testing is missing for some reported results. Score differences may not be fully attributable to model capability alone."
     const comparabilityCopy =
-      reportingStats.missingGenerationConfigs > 0
         ? `${benchmarkCount > 0 ? `These results cover ${benchmarkCount} benchmark${benchmarkCount === 1 ? "" : "s"},` : "These results"} but missing prompting details mean apparent score gaps may partly reflect setup differences as well as capability.`
         : "Shared benchmark coverage helps, but evaluator choices, benchmark mix, and model size can still limit direct apples-to-apples comparison."
@@ -1898,9 +1866,10 @@ export function BenchmarkDetail({
     allCategoryResults,
     allEvaluations.length,
     reportingStats,
     summary.model_info.additional_details?.params_billions,
     summary.model_info.name,
-    summary.total_evaluations,
   ])
   const benchmarkGroups = useMemo(
@@ -3283,6 +3252,14 @@ export function BenchmarkDetail({
                 Mixed scale · renormalized
               </span>
             )}
           </div>
           {/* Hero: title + developer + stat strip */}
@@ -4663,6 +4640,10 @@ function AggregatedBenchmarkCard({
                                   Score
                                 </div>
                                 <div className="mt-1 text-lg font-semibold tracking-tight">{variant.displayScore}</div>
                               </div>
                               <div className="min-w-0">
@@ -5200,7 +5181,10 @@ function BenchmarkDeepDiveDialogPanel({
                             )}
                           </div>
                         </TableCell>
-                        <TableCell className="px-4 py-3 text-right align-top font-semibold tabular-nums">{variant.displayScore}</TableCell>
                         <TableCell className="px-4 py-3 text-right align-top tabular-nums text-muted-foreground">
                           {(variant.rankPosition != null || resolvedRank)
                             ? `#${resolvedRank?.position ?? variant.rankPosition}${(resolvedRank?.total ?? variant.rankTotal) ? `/${resolvedRank?.total ?? variant.rankTotal}` : ""}`
@@ -5265,7 +5249,10 @@ function BenchmarkDeepDiveDialogPanel({
                         )}
                       </div>
                     </TableCell>
-                    <TableCell className="px-4 py-3 text-right align-top font-semibold tabular-nums">{variant.displayScore}</TableCell>
                     <TableCell className="px-4 py-3 text-right align-top tabular-nums text-muted-foreground">
                       {(variant.rankPosition != null || resolvedRank)
                         ? `#${resolvedRank?.position ?? variant.rankPosition}${(resolvedRank?.total ?? variant.rankTotal) ? `/${resolvedRank?.total ?? variant.rankTotal}` : ""}`
@@ -5379,10 +5366,15 @@ function VariantExpandedDetail({
             <Badge variant="outline" className="font-normal">
               {group.title}
             </Badge>
-            <Badge variant="secondary" className="font-normal">
-              {variant.displayScore}
-            </Badge>
-          </div>
           <div className="text-sm text-muted-foreground">{variant.result.metric_config.evaluation_description}</div>
         </div>

 import { Dialog, DialogContent, DialogDescription, DialogHeader, DialogTitle } from "@/components/ui/dialog"
 import { Input } from "@/components/ui/input"
 import { Table, TableBody, TableCell, TableHead, TableHeader, TableRow } from "@/components/ui/table"
+import {
+  getRelationshipBadgeTone,
+  getRelationshipDisplayName,
+  getRelationshipShortLabel,
+} from "@/components/signals/provenance-badge"
+import { SignalsRowBadges } from "@/components/signals/signals-row-badges"
 import {
   DropdownMenu,
   DropdownMenuContent,
   return normalizeDisplayLabel(value) || "Unknown Organization"
 }
 /**
  * Display label for a source type: underscores are replaced with spaces and the
  * result is passed through normalizeDisplayLabel; a falsy label becomes "Unknown".
  */
 function getSourceTypeDisplayName(value: string | null | undefined) {
   const spacedValue = value?.replace(/_/g, " ")
   const label = normalizeDisplayLabel(spacedValue)
   return label || "Unknown"
 }
       thirdPartyEvaluations,
     }
   }, [allEvaluations])
+  const reproducibilityGapCount =
     // Prefer the reproducibility summary's gap count; otherwise use reportingStats.missingGenerationConfigs.
+    summary.reproducibility_summary?.has_reproducibility_gap_count ?? reportingStats.missingGenerationConfigs
+  const reproducibilityResultsTotal =
     // Denominator for the "N of M" copy: the summary's results_total, else summary.total_evaluations.
+    summary.reproducibility_summary?.results_total ?? summary.total_evaluations
   const allCategoryResults = useMemo(
     () =>
     }
     const reproducibilityCopy =
+      reproducibilityGapCount === 0
         ? null
+        : reproducibilityGapCount === reproducibilityResultsTotal
           ? "How this model was prompted during testing is not documented. Scores cannot be independently confirmed."
+          : `${reproducibilityGapCount} of ${reproducibilityResultsTotal} reported scores are missing enough setup detail to be re-run as-is.`
     const comparabilityCopy =
+      reproducibilityGapCount > 0
         ? `${benchmarkCount > 0 ? `These results cover ${benchmarkCount} benchmark${benchmarkCount === 1 ? "" : "s"},` : "These results"} but missing prompting details mean apparent score gaps may partly reflect setup differences as well as capability.`
         : "Shared benchmark coverage helps, but evaluator choices, benchmark mix, and model size can still limit direct apples-to-apples comparison."
     allCategoryResults,
     allEvaluations.length,
     reportingStats,
+    reproducibilityGapCount,
+    reproducibilityResultsTotal,
     summary.model_info.additional_details?.params_billions,
     summary.model_info.name,
   ])
   const benchmarkGroups = useMemo(
                 Mixed scale · renormalized
               </span>
             )}
+            {reproducibilityGapCount > 0 && (
+              <span
+                className="ml-1 inline-flex items-center rounded-full border border-amber-300 bg-amber-50 px-2 py-0.5 text-[10px] tracking-[0.12em] text-amber-900 dark:border-amber-900/60 dark:bg-amber-950/40 dark:text-amber-100"
+                title={`${reproducibilityGapCount} of ${reproducibilityResultsTotal} reported scores are not fully documented.`}
+              >
+                Setup gaps
+              </span>
+            )}
           </div>
           {/* Hero: title + developer + stat strip */}
                                   Score
                                 </div>
                                 <div className="mt-1 text-lg font-semibold tracking-tight">{variant.displayScore}</div>
+                                <SignalsRowBadges
+                                  annotations={variant.result.evalcards?.annotations}
+                                  className="justify-start"
+                                />
                               </div>
                               <div className="min-w-0">
                             )}
                           </div>
                         </TableCell>
+                        <TableCell className="px-4 py-3 text-right align-top font-semibold tabular-nums">
+                          <div>{variant.displayScore}</div>
+                          <SignalsRowBadges annotations={variant.result.evalcards?.annotations} />
+                        </TableCell>
                         <TableCell className="px-4 py-3 text-right align-top tabular-nums text-muted-foreground">
                           {(variant.rankPosition != null || resolvedRank)
                             ? `#${resolvedRank?.position ?? variant.rankPosition}${(resolvedRank?.total ?? variant.rankTotal) ? `/${resolvedRank?.total ?? variant.rankTotal}` : ""}`
                         )}
                       </div>
                     </TableCell>
+                    <TableCell className="px-4 py-3 text-right align-top font-semibold tabular-nums">
+                      <div>{variant.displayScore}</div>
+                      <SignalsRowBadges annotations={variant.result.evalcards?.annotations} />
+                    </TableCell>
                     <TableCell className="px-4 py-3 text-right align-top tabular-nums text-muted-foreground">
                       {(variant.rankPosition != null || resolvedRank)
                         ? `#${resolvedRank?.position ?? variant.rankPosition}${(resolvedRank?.total ?? variant.rankTotal) ? `/${resolvedRank?.total ?? variant.rankTotal}` : ""}`
             <Badge variant="outline" className="font-normal">
               {group.title}
             </Badge>
+          <Badge variant="secondary" className="font-normal">
+            {variant.displayScore}
+          </Badge>
+          <SignalsRowBadges
+            annotations={variant.result.evalcards?.annotations}
+            className="mt-0 justify-start"
+            hideOnMobile={false}
+          />
+        </div>
           <div className="text-sm text-muted-foreground">{variant.result.metric_config.evaluation_description}</div>
         </div>

components/benchmark-evaluation-card.tsx CHANGED Viewed

@@ -5,6 +5,7 @@ import { useMemo } from "react"
 import { useAudienceMode } from "@/components/audience-mode-provider"
 import { useRouter } from "next/navigation"
 import {
   Award,
   ChevronDown,
   ChevronRight,
@@ -14,6 +15,7 @@ import {
 } from "lucide-react"
 import type { CategoryType } from "@/lib/benchmark-schema"
 import { getCategoryColor } from "@/lib/benchmark-schema"
 import type { BenchmarkCard } from "@/lib/benchmark-schema"
 import { lookupBenchmarkCard } from "@/lib/benchmark-metadata-utils"
@@ -59,6 +61,9 @@ export type BenchmarkEvaluationCardData = {
     max: number
     average: number | null
   }
   top_scores: Array<{
     benchmark: string
@@ -262,6 +267,8 @@ export function BenchmarkEvaluationCard({
   const scoreRange = [formatScoreValue(data.score_summary?.min), formatScoreValue(data.score_summary?.max)]
     .filter((value): value is string => Boolean(value))
     .join(" to ")
   return (
     <Card
@@ -297,6 +304,12 @@ export function BenchmarkEvaluationCard({
               {paramsBillions && <Badge variant="secondary">{paramsBillions} parameters</Badge>}
               <Badge variant="outline">{data.benchmarks_count} benchmark suites</Badge>
               <Badge variant="outline">{data.evaluations_count} reported results</Badge>
             </div>
           </div>
@@ -447,6 +460,12 @@ export function BenchmarkEvaluationCard({
               {data.source_types.length > 0 && (
                 <KeyValueRow label="Artifact type" value={data.source_types.map((s) => s.replace(/_/g, " ")).join(", ")} />
               )}
             </div>
           </CollapsibleContent>
         </Collapsible>

 import { useAudienceMode } from "@/components/audience-mode-provider"
 import { useRouter } from "next/navigation"
 import {
+  AlertTriangle,
   Award,
   ChevronDown,
   ChevronRight,
 } from "lucide-react"
 import type { CategoryType } from "@/lib/benchmark-schema"
+import type { SignalSummaries } from "@/lib/backend-artifacts"
 import { getCategoryColor } from "@/lib/benchmark-schema"
 import type { BenchmarkCard } from "@/lib/benchmark-schema"
 import { lookupBenchmarkCard } from "@/lib/benchmark-metadata-utils"
     max: number
     average: number | null
   }
+  reproducibility_summary?: SignalSummaries["reproducibility_summary"]
+  provenance_summary?: SignalSummaries["provenance_summary"]
+  comparability_summary?: SignalSummaries["comparability_summary"]
   top_scores: Array<{
     benchmark: string
   const scoreRange = [formatScoreValue(data.score_summary?.min), formatScoreValue(data.score_summary?.max)]
     .filter((value): value is string => Boolean(value))
     .join(" to ")
+  const reproducibilityGapCount = data.reproducibility_summary?.has_reproducibility_gap_count ?? 0
+  const reproducibilityTotal = data.reproducibility_summary?.results_total ?? data.evaluations_count
   return (
     <Card
               {paramsBillions && <Badge variant="secondary">{paramsBillions} parameters</Badge>}
               <Badge variant="outline">{data.benchmarks_count} benchmark suites</Badge>
               <Badge variant="outline">{data.evaluations_count} reported results</Badge>
+              {/* Amber badge shown only when at least one reported score has a setup gap; the noun is singular for a count of exactly 1. */}
+              {reproducibilityGapCount > 0 && (
+                <Badge className="border-amber-300 bg-amber-50 text-amber-900 hover:bg-amber-50 dark:border-amber-900/60 dark:bg-amber-950/40 dark:text-amber-100">
+                  <AlertTriangle className="h-3 w-3" />
+                  {reproducibilityGapCount} setup {reproducibilityGapCount === 1 ? "gap" : "gaps"}
+                </Badge>
+              )}
             </div>
           </div>
               {data.source_types.length > 0 && (
                 <KeyValueRow label="Artifact type" value={data.source_types.map((s) => s.replace(/_/g, " ")).join(", ")} />
               )}
+              {reproducibilityGapCount > 0 && (
+                <KeyValueRow
+                  label="Re-runnability"
+                  value={`${reproducibilityGapCount} of ${reproducibilityTotal} reported scores are not fully documented`}
+                />
+              )}
             </div>
           </CollapsibleContent>
         </Collapsible>

components/eval-card.tsx CHANGED Viewed

@@ -79,6 +79,11 @@ export function EvalCard({ summary, delayMs = 0 }: EvalCardProps) {
   const domainPreview = domains.slice(0, 2)
   // Source provenance pulled from the pipeline's source_data
   const sourceData = summary.source_data
   const datasetName = sourceData?.dataset_name
   const datasetUrl =
     sourceData?.dataset_url ??
@@ -129,10 +134,10 @@ export function EvalCard({ summary, delayMs = 0 }: EvalCardProps) {
               Independently evaluated
             </Badge>
           )}
-          {summary.missing_generation_config_count > 0 && (
             <Badge className="bg-amber-500 text-amber-950 hover:bg-amber-500">
               <AlertTriangle className="mr-1 h-3 w-3" />
-              Partial config
             </Badge>
           )}
         </div>
@@ -182,8 +187,8 @@ export function EvalCard({ summary, delayMs = 0 }: EvalCardProps) {
                 <DataRow
                   label="Config"
                   value={
-                    summary.missing_generation_config_count > 0
-                      ? `${summary.missing_generation_config_count} result${summary.missing_generation_config_count !== 1 ? "s" : ""} without config`
                       : "Fully documented"
                   }
                 />
@@ -245,9 +250,9 @@ export function EvalCard({ summary, delayMs = 0 }: EvalCardProps) {
               <div className="space-y-1.5 text-sm">
                 <DataRow label="Avg score" value={scorePercent} />
                 <DataRow label="Reported by" value={summary.evaluator_names.join(", ") || "Unknown"} />
-                {summary.missing_generation_config_count > 0 && (
                   <p className="pt-1 text-xs text-muted-foreground">
-                    Some results lack generation settings; compare scores with care.
                   </p>
                 )}
               </div>

   const domainPreview = domains.slice(0, 2)
   // Source provenance pulled from the pipeline's source_data
   const sourceData = summary.source_data
+  const reproducibilitySummary = summary.reproducibility_summary
   // Gap count: taken from the reproducibility summary when present, otherwise from
   // summary.missing_generation_config_count.
+  const reproducibilityGapCount =
+    reproducibilitySummary?.has_reproducibility_gap_count ?? summary.missing_generation_config_count
   // NOTE(review): the fallback denominator is summary.models_count, while the copy that uses it
   // reads "N of M scores" — confirm a model count is the intended total when the summary is absent.
+  const reproducibilityResultsTotal =
+    reproducibilitySummary?.results_total ?? summary.models_count
   const datasetName = sourceData?.dataset_name
   const datasetUrl =
     sourceData?.dataset_url ??
               Independently evaluated
             </Badge>
           )}
+          {reproducibilityGapCount > 0 && (
             <Badge className="bg-amber-500 text-amber-950 hover:bg-amber-500">
               <AlertTriangle className="mr-1 h-3 w-3" />
+              Documentation gaps
             </Badge>
           )}
         </div>
                 <DataRow
                   label="Config"
                   value={
+                    reproducibilityGapCount > 0
+                      ? `${reproducibilityGapCount} of ${reproducibilityResultsTotal} scores have setup gaps`
                       : "Fully documented"
                   }
                 />
               <div className="space-y-1.5 text-sm">
                 <DataRow label="Avg score" value={scorePercent} />
                 <DataRow label="Reported by" value={summary.evaluator_names.join(", ") || "Unknown"} />
+                {reproducibilityGapCount > 0 && (
                   <p className="pt-1 text-xs text-muted-foreground">
+                    {reproducibilityGapCount} of {reproducibilityResultsTotal} reported scores are not fully documented.
                   </p>
                 )}
               </div>

components/eval-detail.tsx CHANGED Viewed

@@ -5,8 +5,22 @@ import { Fragment, useEffect, useMemo, useState } from "react"
 import Link from "next/link"
 import { Badge } from "@/components/ui/badge"
 import { Button } from "@/components/ui/button"
 import { Card, CardContent, CardDescription, CardHeader, CardTitle } from "@/components/ui/card"
 import { Collapsible, CollapsibleContent, CollapsibleTrigger } from "@/components/ui/collapsible"
 import {
   DropdownMenu,
   DropdownMenuCheckboxItem,
@@ -32,9 +46,11 @@ import {
   Globe,
   Medal,
   Scale,
   Shield,
   SlidersHorizontal,
   Tag,
 } from "lucide-react"
 import type { BenchmarkCard } from "@/lib/benchmark-schema"
 import type { BenchmarkEvalSummary, ModelResultForBenchmark } from "@/lib/eval-processing"
@@ -53,6 +69,212 @@ interface LeaderboardRow {
 type LeaderboardMetric = NonNullable<BenchmarkEvalSummary["leaderboard_metrics"]>[number]
 type LeaderboardMatrixRow = NonNullable<BenchmarkEvalSummary["leaderboard_rows"]>[number]
 const PARAM_RANGE_VALUES = [1, 2, 3, 4, 6, 8, 10, 12, 16, 24, 32, 48, 64, 96, 128, 192, 256, 384, 500] as const
 const PARAM_RANGE_MARKERS = [
   { label: "< 1B", step: 0 },
@@ -400,6 +622,11 @@ export function EvalDetail({ summary }: EvalDetailProps) {
     : summary.is_aggregated
       ? "Averaged model results across the contributing composite benchmarks, with drill-down to each component score."
       : "Model results with benchmark context, source dataset detail, and optional instance-data links."
   const toggleRow = (key: string) =>
     setExpandedRows((current) => ({
@@ -430,6 +657,15 @@ export function EvalDetail({ summary }: EvalDetailProps) {
                       ? `${summary.metrics_count ?? summary.leaderboard_metrics?.length ?? 1} measures`
                       : `${summary.metrics_count ?? 1} ${(summary.metrics_count ?? 1) === 1 ? "measure" : "measures"}`}
                   </Badge>
                 </div>
               </div>
               {overviewOpen ? (
@@ -580,6 +816,12 @@ export function EvalDetail({ summary }: EvalDetailProps) {
                 </dl>
               </div>
               {!hasMultiMetricLeaderboard && (summary.root_metrics?.length || summary.subtasks?.length) ? (
                 <section className="rounded-2xl border bg-muted/5 p-3.5">
                   <div className="space-y-1">
@@ -812,10 +1054,14 @@ export function EvalDetail({ summary }: EvalDetailProps) {
                   const samples = Array.isArray(modelResult.source_data)
                     ? undefined
                     : modelResult.source_data.samples_number
                   return (
                     <Fragment key={key}>
-                      <TableRow className={cn("group", isExpanded && "bg-muted/15")}>
                         <TableCell className="px-4">
                           <div
                             className={cn(
@@ -868,6 +1114,7 @@ export function EvalDetail({ summary }: EvalDetailProps) {
                         <TableCell className="text-right">
                           <div className="text-xl font-semibold tabular-nums">{formatRawScore(modelResult.score, summary.metric_config.unit)}</div>
                         </TableCell>
                         {isResearchView ? (
@@ -997,6 +1244,8 @@ export function EvalDetail({ summary }: EvalDetailProps) {
                                   )}
                                 </DetailPanel>
                                 <DetailPanel
                                   title={isResearchView ? "Score Breakdown" : "Metric Summary"}
                                   subtitle={
@@ -1183,7 +1432,14 @@ function MultiMetricLeaderboard({
   const leaderboardMetrics = summary.leaderboard_metrics ?? []
   const leaderboardRows = summary.leaderboard_rows ?? []
   const allMetricKeys = useMemo(() => leaderboardMetrics.map((metric) => metric.column_key), [leaderboardMetrics])
-  const [visibleMetricKeys, setVisibleMetricKeys] = useState<string[]>(() => leaderboardMetrics.map((metric) => metric.column_key))
   const maxParamStepIndex = PARAM_RANGE_VALUES.length - 1
   const leaderboardMetricMap = useMemo(
     () => new Map(leaderboardMetrics.map((metric) => [metric.column_key, metric])),
@@ -1333,8 +1589,8 @@ function MultiMetricLeaderboard({
   }, [maxParamStep, minParamStep, sortDirection, sortKey])
   useEffect(() => {
-    setVisibleMetricKeys(allMetricKeys)
-  }, [allMetricKeys, summary.evaluation_id])
   useEffect(() => {
     setActiveSubtaskTab("all")
@@ -1521,30 +1777,11 @@ function MultiMetricLeaderboard({
       <CardContent className="p-0">
         {hasSubtaskTabs && (
           <div className="border-b bg-background px-5 py-3 sm:px-6">
-            <div className="mb-2 text-[11px] font-semibold uppercase tracking-[0.16em] text-muted-foreground">
-              Benchmark slices
-            </div>
-            <div className="flex flex-wrap gap-2">
-              <Button
-                type="button"
-                size="sm"
-                variant={activeSubtaskTab === "all" ? "default" : "outline"}
-                onClick={() => setActiveSubtaskTab("all")}
-              >
-                All slices
-              </Button>
-              {singleMetricSubtaskTabs.map((tab) => (
-                <Button
-                  key={tab.key}
-                  type="button"
-                  size="sm"
-                  variant={activeSubtaskTab === tab.key ? "default" : "outline"}
-                  onClick={() => setActiveSubtaskTab(tab.key)}
-                >
-                  {tab.label}
-                </Button>
-              ))}
-            </div>
           </div>
         )}
@@ -1739,6 +1976,12 @@ function MultiMetricLeaderboard({
                         )}
                         <span className="lg:hidden">{row.model_info.developer ?? "Unknown developer"}</span>
                       </div>
                     </div>
                   </TableCell>
@@ -1754,6 +1997,7 @@ function MultiMetricLeaderboard({
                   {visibleMetrics.map((metric) => {
                     const score = row.values[metric.column_key]
                     return (
                       <TableCell
                         key={metric.column_key}
@@ -1762,7 +2006,8 @@ function MultiMetricLeaderboard({
                           !isNumericScore(score) && "text-muted-foreground"
                         )}
                       >
-                        {isNumericScore(score) ? formatRawScore(score, metric.unit) : "—"}
                       </TableCell>
                     )
                   })}

 import Link from "next/link"
 import { Badge } from "@/components/ui/badge"
 import { Button } from "@/components/ui/button"
+import { CompletenessPanel } from "@/components/signals/completeness-panel"
+import { ComparabilityPanel } from "@/components/signals/comparability-panel"
+import { ReproducibilityPanel } from "@/components/signals/reproducibility-panel"
+import { SignalsRowBadges } from "@/components/signals/signals-row-badges"
+import { SignalTooltip } from "@/components/signals/signal-tooltip"
+import { getCompletenessPopulatedCount } from "@/components/signals/signal-utils"
 import { Card, CardContent, CardDescription, CardHeader, CardTitle } from "@/components/ui/card"
 import { Collapsible, CollapsibleContent, CollapsibleTrigger } from "@/components/ui/collapsible"
+import {
+  Dialog,
+  DialogContent,
+  DialogDescription,
+  DialogHeader,
+  DialogTitle,
+} from "@/components/ui/dialog"
+import { Input } from "@/components/ui/input"
 import {
   DropdownMenu,
   DropdownMenuCheckboxItem,
   Globe,
   Medal,
   Scale,
+  Search,
   Shield,
   SlidersHorizontal,
   Tag,
+  X,
 } from "lucide-react"
 import type { BenchmarkCard } from "@/lib/benchmark-schema"
 import type { BenchmarkEvalSummary, ModelResultForBenchmark } from "@/lib/eval-processing"
 // One element of BenchmarkEvalSummary["leaderboard_metrics"] (NonNullable strips the nullable/optional wrapper first).
 type LeaderboardMetric = NonNullable<BenchmarkEvalSummary["leaderboard_metrics"]>[number]
 // One element of BenchmarkEvalSummary["leaderboard_rows"] (NonNullable strips the nullable/optional wrapper first).
 type LeaderboardMatrixRow = NonNullable<BenchmarkEvalSummary["leaderboard_rows"]>[number]
+/**
+ * Choose a single annotation bundle to represent a whole leaderboard row.
+ *
+ * Walks `visibleMetrics` in the order given and returns the first truthy entry
+ * of `row.annotations_by_metric`, looked up by each metric's `column_key`.
+ * The row-level badge strip shows this one bundle so that signals which are
+ * typically identical for every metric of a (model, benchmark) pair, such as
+ * reproducibility and provenance, are not repeated in every cell.
+ *
+ * @param row - Matrix row whose per-metric annotations are inspected.
+ * @param visibleMetrics - Metrics whose `column_key`s are tried, in order.
+ * @returns The first annotation bundle found, or `null` when the row has no
+ *   annotation map or no visible metric has an entry in it.
+ */
+function getRowLevelAnnotations(
+  row: LeaderboardMatrixRow,
+  visibleMetrics: LeaderboardMetric[]
+) {
+  const perMetric = row.annotations_by_metric
+  if (perMetric) {
+    for (const { column_key: columnKey } of visibleMetrics) {
+      const candidate = perMetric[columnKey]
+      if (candidate) {
+        return candidate
+      }
+    }
+  }
+  return null
+}
+/** Largest slice count still rendered as individual pills; above this the searchable browser is used. */
+const SLICE_PILL_THRESHOLD = 5
+/** One selectable benchmark slice: `key` is the value passed to `onChange`, `label` is the text shown. */
+interface SliceTab {
+  key: string
+  label: string
+}
+/**
+ * Slice picker that adapts to slice count.
+ *
+ * - <= SLICE_PILL_THRESHOLD: render every slice as a pill (current familiar UX).
+ * - > SLICE_PILL_THRESHOLD: render "All slices" + currently-selected pill +
+ *   a "Browse N slices" button that opens a searchable dialog. Hundreds of
+ *   subtasks (e.g. AIRBench's 374) fit cleanly.
+ *
+ * The special key `"all"` means "no slice filter".
+ *
+ * @param activeSubtaskTab - Key of the selected slice, or `"all"`.
+ * @param onChange - Called with the newly selected key (or `"all"` to clear).
+ * @param tabs - Every slice that can be selected.
+ */
+function SliceSelector({
+  activeSubtaskTab,
+  onChange,
+  tabs,
+}: {
+  activeSubtaskTab: string
+  onChange: (key: string) => void
+  tabs: SliceTab[]
+}) {
+  const [browserOpen, setBrowserOpen] = useState(false)
+  const [search, setSearch] = useState("")
+  const useBrowser = tabs.length > SLICE_PILL_THRESHOLD
+  const activeTab = tabs.find((tab) => tab.key === activeSubtaskTab)
+  // Case-insensitive substring match on the label; an empty query keeps every slice.
+  const filteredTabs = useMemo(() => {
+    const query = search.trim().toLowerCase()
+    if (!query) return tabs
+    return tabs.filter((tab) => tab.label.toLowerCase().includes(query))
+  }, [search, tabs])
+  // Single entry point for opening/closing the dialog. The dialog only invokes
+  // `onOpenChange` for closes it initiates itself (Escape, overlay, close button),
+  // so programmatic closes must go through here too or the search text would
+  // survive and reappear the next time the browser is opened.
+  const handleBrowserOpenChange = (open: boolean) => {
+    setBrowserOpen(open)
+    if (!open) setSearch("")
+  }
+  // Apply a selection made inside the dialog, then close it and reset the search.
+  const selectFromBrowser = (key: string) => {
+    onChange(key)
+    handleBrowserOpenChange(false)
+  }
+  if (!useBrowser) {
+    return (
+      <div>
+        <div className="mb-2 text-[11px] font-semibold uppercase tracking-[0.16em] text-muted-foreground">
+          Benchmark slices
+        </div>
+        <div className="flex flex-wrap gap-2">
+          <Button
+            type="button"
+            size="sm"
+            variant={activeSubtaskTab === "all" ? "default" : "outline"}
+            onClick={() => onChange("all")}
+          >
+            All slices
+          </Button>
+          {tabs.map((tab) => (
+            <Button
+              key={tab.key}
+              type="button"
+              size="sm"
+              variant={activeSubtaskTab === tab.key ? "default" : "outline"}
+              onClick={() => onChange(tab.key)}
+            >
+              {tab.label}
+            </Button>
+          ))}
+        </div>
+      </div>
+    )
+  }
+  return (
+    <div>
+      <div className="mb-2 flex items-center justify-between gap-2">
+        <div className="text-[11px] font-semibold uppercase tracking-[0.16em] text-muted-foreground">
+          Benchmark slices
+        </div>
+        <span className="text-xs text-muted-foreground">{tabs.length} total</span>
+      </div>
+      <div className="flex flex-wrap items-center gap-2">
+        <Button
+          type="button"
+          size="sm"
+          variant={activeSubtaskTab === "all" ? "default" : "outline"}
+          onClick={() => onChange("all")}
+        >
+          All slices
+        </Button>
+        {/* Pill for the current selection; clicking it clears the filter. */}
+        {activeTab && (
+          <Button
+            type="button"
+            size="sm"
+            variant="default"
+            onClick={() => onChange("all")}
+            className="max-w-[18rem] truncate"
+            title={`Active: ${activeTab.label}. Click to clear.`}
+          >
+            {activeTab.label}
+            <X className="ml-1.5 h-3 w-3 shrink-0" />
+          </Button>
+        )}
+        <Button
+          type="button"
+          size="sm"
+          variant="outline"
+          onClick={() => handleBrowserOpenChange(true)}
+          className="gap-1.5"
+        >
+          <Search className="h-3.5 w-3.5" />
+          {activeTab ? "Change slice" : `Browse ${tabs.length} slices`}
+        </Button>
+      </div>
+      <Dialog open={browserOpen} onOpenChange={handleBrowserOpenChange}>
+        <DialogContent className="max-w-2xl">
+          <DialogHeader>
+            <DialogTitle>Browse benchmark slices</DialogTitle>
+            <DialogDescription>
+              {tabs.length} slices in this benchmark. Pick one to filter the leaderboard,
+              or close to keep showing all slices.
+            </DialogDescription>
+          </DialogHeader>
+          <Input
+            value={search}
+            onChange={(event) => setSearch(event.target.value)}
+            placeholder="Search slices..."
+            autoFocus
+          />
+          <div className="max-h-[60vh] overflow-y-auto rounded-md border">
+            {/* The "all" entry is always listed, regardless of the search query. */}
+            <button
+              type="button"
+              onClick={() => selectFromBrowser("all")}
+              className={cn(
+                "flex w-full items-center justify-between border-b px-4 py-2.5 text-left text-sm transition-colors hover:bg-muted/40",
+                activeSubtaskTab === "all" && "bg-muted/40 font-semibold"
+              )}
+            >
+              <span>All slices (no filter)</span>
+              {activeSubtaskTab === "all" && <span className="text-xs text-muted-foreground">selected</span>}
+            </button>
+            {filteredTabs.length === 0 ? (
+              <div className="px-4 py-6 text-center text-sm text-muted-foreground">
+                No slices match "{search}".
+              </div>
+            ) : (
+              filteredTabs.map((tab) => (
+                <button
+                  key={tab.key}
+                  type="button"
+                  onClick={() => selectFromBrowser(tab.key)}
+                  className={cn(
+                    "flex w-full items-center justify-between border-b px-4 py-2 text-left text-sm transition-colors hover:bg-muted/40 last:border-b-0",
+                    activeSubtaskTab === tab.key && "bg-muted/40 font-semibold"
+                  )}
+                >
+                  <span className="min-w-0 truncate pr-2">{tab.label}</span>
+                  {activeSubtaskTab === tab.key && (
+                    <span className="shrink-0 text-xs text-muted-foreground">selected</span>
+                  )}
+                </button>
+              ))
+            )}
+          </div>
+        </DialogContent>
+      </Dialog>
+    </div>
+  )
+}
 const PARAM_RANGE_VALUES = [1, 2, 3, 4, 6, 8, 10, 12, 16, 24, 32, 48, 64, 96, 128, 192, 256, 384, 500] as const
 const PARAM_RANGE_MARKERS = [
   { label: "< 1B", step: 0 },
     : summary.is_aggregated
       ? "Averaged model results across the contributing composite benchmarks, with drill-down to each component score."
       : "Model results with benchmark context, source dataset detail, and optional instance-data links."
+  const reportingCompleteness = summary.evalcards?.annotations?.reporting_completeness
+  const benchmarkComparability = summary.evalcards?.annotations?.benchmark_comparability
+  const documentationPopulatedCount = reportingCompleteness
+    ? getCompletenessPopulatedCount(reportingCompleteness)
+    : null
   const toggleRow = (key: string) =>
     setExpandedRows((current) => ({
                       ? `${summary.metrics_count ?? summary.leaderboard_metrics?.length ?? 1} measures`
                       : `${summary.metrics_count ?? 1} ${(summary.metrics_count ?? 1) === 1 ? "measure" : "measures"}`}
                   </Badge>
+                  {reportingCompleteness && (
+                    <SignalTooltip
+                      content={`${documentationPopulatedCount} of ${reportingCompleteness.total_fields_evaluated} EvalCards documentation fields populated for this benchmark.`}
+                    >
+                      <Badge variant="outline" className="border-emerald-200 bg-emerald-50 text-emerald-800 dark:border-emerald-900/50 dark:bg-emerald-950/30 dark:text-emerald-200">
+                        Documentation {Math.round(reportingCompleteness.completeness_score * 100)}%
+                      </Badge>
+                    </SignalTooltip>
+                  )}
                 </div>
               </div>
               {overviewOpen ? (
                 </dl>
               </div>
+              <CompletenessPanel completeness={reportingCompleteness} />
+              <ComparabilityPanel
+                comparability={benchmarkComparability}
+                summary={summary.comparability_summary}
+              />
               {!hasMultiMetricLeaderboard && (summary.root_metrics?.length || summary.subtasks?.length) ? (
                 <section className="rounded-2xl border bg-muted/5 p-3.5">
                   <div className="space-y-1">
                   const samples = Array.isArray(modelResult.source_data)
                     ? undefined
                     : modelResult.source_data.samples_number
+                  const rowAnnotations = modelResult.result.evalcards?.annotations
                   return (
                     <Fragment key={key}>
+                      <TableRow
+                        id={modelResult.model_route_id ? `row-${modelResult.model_route_id}` : undefined}
+                        className={cn("group", isExpanded && "bg-muted/15")}
+                      >
                         <TableCell className="px-4">
                           <div
                             className={cn(
                         <TableCell className="text-right">
                           <div className="text-xl font-semibold tabular-nums">{formatRawScore(modelResult.score, summary.metric_config.unit)}</div>
+                          <SignalsRowBadges annotations={rowAnnotations} />
                         </TableCell>
                         {isResearchView ? (
                                   )}
                                 </DetailPanel>
+                                <ReproducibilityPanel gap={rowAnnotations?.reproducibility_gap} />
                                 <DetailPanel
                                   title={isResearchView ? "Score Breakdown" : "Metric Summary"}
                                   subtitle={
   const leaderboardMetrics = summary.leaderboard_metrics ?? []
   const leaderboardRows = summary.leaderboard_rows ?? []
   const allMetricKeys = useMemo(() => leaderboardMetrics.map((metric) => metric.column_key), [leaderboardMetrics])
+  // Cap default visible columns to avoid hangs on benchmarks with hundreds of metrics
+  // (e.g. helm_air_bench has 374 subtask×metric pairs). Users can opt in to more.
+  const DEFAULT_VISIBLE_METRIC_CAP = 24
+  const defaultVisibleMetricKeys = useMemo(
+    () => allMetricKeys.slice(0, DEFAULT_VISIBLE_METRIC_CAP),
+    [allMetricKeys]
+  )
+  const [visibleMetricKeys, setVisibleMetricKeys] = useState<string[]>(() => defaultVisibleMetricKeys)
   const maxParamStepIndex = PARAM_RANGE_VALUES.length - 1
   const leaderboardMetricMap = useMemo(
     () => new Map(leaderboardMetrics.map((metric) => [metric.column_key, metric])),
   }, [maxParamStep, minParamStep, sortDirection, sortKey])
   useEffect(() => {
+    setVisibleMetricKeys(defaultVisibleMetricKeys)
+  }, [defaultVisibleMetricKeys, summary.evaluation_id])
   useEffect(() => {
     setActiveSubtaskTab("all")
       <CardContent className="p-0">
         {hasSubtaskTabs && (
           <div className="border-b bg-background px-5 py-3 sm:px-6">
+            <SliceSelector
+              activeSubtaskTab={activeSubtaskTab}
+              onChange={setActiveSubtaskTab}
+              tabs={singleMetricSubtaskTabs}
+            />
           </div>
         )}
                         )}
                         <span className="lg:hidden">{row.model_info.developer ?? "Unknown developer"}</span>
                       </div>
+                      <SignalsRowBadges
+                        annotations={getRowLevelAnnotations(row, visibleMetrics)}
+                        variant="row"
+                        className="mt-1 justify-start"
+                        hideOnMobile={false}
+                      />
                     </div>
                   </TableCell>
                   {visibleMetrics.map((metric) => {
                     const score = row.values[metric.column_key]
+                    const annotations = row.annotations_by_metric?.[metric.column_key]
                     return (
                       <TableCell
                         key={metric.column_key}
                           !isNumericScore(score) && "text-muted-foreground"
                         )}
                       >
+                        <div>{isNumericScore(score) ? formatRawScore(score, metric.unit) : "—"}</div>
+                        <SignalsRowBadges annotations={annotations} variant="cell" />
                       </TableCell>
                     )
                   })}

components/model-compare-dialog.tsx CHANGED Viewed

@@ -133,6 +133,7 @@ const CONTEXT_ROWS = [
   { key: "benchmarks", label: "Benchmark coverage" },
   { key: "variants", label: "Versions" },
   { key: "score_summary", label: "Score range" },
   { key: "latest", label: "Latest summary" },
   { key: "updated", label: "Updated" },
 ] as const
@@ -409,6 +410,20 @@ export function ModelCompareDialog({
                                     </div>
                                   </div>
                                 ) : null}
                                 {row.key === "latest" ? (
                                   <div className="flex items-center gap-2">
                                     <span>{model.latest_source_name || `${model.benchmarks_count} benchmark suites summarized`}</span>

   { key: "benchmarks", label: "Benchmark coverage" },
   { key: "variants", label: "Versions" },
   { key: "score_summary", label: "Score range" },
+  { key: "reproducibility", label: "Re-runnability" },
   { key: "latest", label: "Latest summary" },
   { key: "updated", label: "Updated" },
 ] as const
                                     </div>
                                   </div>
                                 ) : null}
+                                {row.key === "reproducibility" ? (
+                                  model.reproducibility_summary && model.reproducibility_summary.has_reproducibility_gap_count > 0 ? (
+                                    <div className="space-y-1">
+                                      <div className="font-medium">
+                                        {model.reproducibility_summary.has_reproducibility_gap_count} setup gaps
+                                      </div>
+                                      <div className="text-sm text-muted-foreground">
+                                        Out of {model.reproducibility_summary.results_total} reported scores
+                                      </div>
+                                    </div>
+                                  ) : (
+                                    <span className="text-muted-foreground">No setup gaps reported</span>
+                                  )
+                                ) : null}
                                 {row.key === "latest" ? (
                                   <div className="flex items-center gap-2">
                                     <span>{model.latest_source_name || `${model.benchmarks_count} benchmark suites summarized`}</span>

components/navigation.tsx CHANGED Viewed

@@ -38,6 +38,12 @@ export function Navigation() {
       icon: BarChart3,
       isActive: pathname === "/evals" || pathname?.startsWith("/evals/")
     },
     {
       href: "/survey",
       label: "Survey",

       icon: BarChart3,
       isActive: pathname === "/evals" || pathname?.startsWith("/evals/")
     },
+    {
+      href: "/corpus",
+      label: "Corpus",
+      icon: FlaskConical,
+      isActive: pathname === "/corpus" || pathname?.startsWith("/corpus/")
+    },
     {
       href: "/survey",
       label: "Survey",

components/signals/comparability-panel.tsx ADDED Viewed

	@@ -0,0 +1,193 @@

+"use client"
+import type { ReactNode } from "react"
+import { ChevronDown, GitCompareArrows, UsersRound } from "lucide-react"
+import { useAudienceMode } from "@/components/audience-mode-provider"
+import { Badge } from "@/components/ui/badge"
+import { Collapsible, CollapsibleContent, CollapsibleTrigger } from "@/components/ui/collapsible"
+import type { BenchmarkComparability, ComparabilitySummary, DifferingSetupField } from "@/lib/backend-artifacts"
+import {
+  formatFieldLabel,
+  formatSignalNumber,
+  formatSignalValue,
+} from "./signal-utils"
+/**
+ * Benchmark-level comparability section.
+ *
+ * Lists result groups whose scores diverge between setup variants and between
+ * reporting organizations, at most eight per kind (the count badge on each list
+ * still shows the full number). Heading and description copy switch on the
+ * audience mode (`"research"` vs. anything else).
+ *
+ * Renders nothing when there are no divergence groups of either kind and the
+ * summary does not report exactly zero cross-party checks. When the summary
+ * reports zero cross-party checks, a note says that no third-party reports are
+ * available.
+ *
+ * @param comparability - Divergence groups for the benchmark; may be absent or null.
+ * @param summary - Optional counts of variant / cross-party checks shown as badges.
+ */
+export function ComparabilityPanel({
+  comparability,
+  summary,
+}: {
+  comparability?: BenchmarkComparability | null
+  summary?: ComparabilitySummary
+}) {
+  const { mode } = useAudienceMode()
+  const isResearchView = mode === "research"
+  const variantGroups = comparability?.variant_divergence_groups ?? []
+  const crossPartyGroups = comparability?.cross_party_divergence_groups ?? []
+  const hasGroups = variantGroups.length > 0 || crossPartyGroups.length > 0
+  const showNoCrossPartyNote = summary?.groups_with_cross_party_check === 0
+  if (!hasGroups && !showNoCrossPartyNote) {
+    return null
+  }
+  return (
+    <section className="rounded-2xl border border-border/70 bg-background/70 p-4 sm:p-5">
+      <div className="flex flex-col gap-2 sm:flex-row sm:items-start sm:justify-between">
+        <div className="space-y-1">
+          <div className="flex items-center gap-2">
+            <GitCompareArrows className="h-4 w-4 text-primary" />
+            <h3 className="font-semibold">
+              {isResearchView ? "Comparability" : "Can these scores be compared directly?"}
+            </h3>
+          </div>
+          <p className="max-w-2xl text-sm text-muted-foreground">
+            {isResearchView
+              ? "Groups where reported scores diverge across setups or reporting organizations."
+              : "Flags cases where score differences may come from setup choices or different reporting sources."}
+          </p>
+        </div>
+        {summary && (
+          <div className="flex flex-wrap gap-2 text-xs">
+            <Badge variant="outline">{summary.groups_with_variant_check} setup checks</Badge>
+            <Badge variant="outline">{summary.groups_with_cross_party_check} source checks</Badge>
+          </div>
+        )}
+      </div>
+      {showNoCrossPartyNote && (
+        <div className="mt-4 rounded-xl border border-dashed border-border/70 bg-muted/10 px-3 py-2 text-sm text-muted-foreground">
+          No third-party reports are available for cross-party comparison.
+        </div>
+      )}
+      {/* Only emit the grid when it has content; an empty grid would still add its top margin below the note. */}
+      {hasGroups && (
+        <div className="mt-4 grid gap-3 lg:grid-cols-2">
+          {variantGroups.length > 0 && (
+            <GroupList
+              icon="variant"
+              title="Variant divergence"
+              count={variantGroups.length}
+            >
+              {variantGroups.slice(0, 8).map((group) => (
+                <DivergenceGroupItem
+                  key={group.group_id}
+                  modelRouteId={group.model_route_id}
+                  magnitude={group.divergence_magnitude}
+                  threshold={group.threshold_used}
+                  fields={group.differing_setup_fields}
+                />
+              ))}
+            </GroupList>
+          )}
+          {crossPartyGroups.length > 0 && (
+            <GroupList
+              icon="cross-party"
+              title="Cross-party divergence"
+              count={crossPartyGroups.length}
+            >
+              {crossPartyGroups.slice(0, 8).map((group) => (
+                <DivergenceGroupItem
+                  key={group.group_id}
+                  modelRouteId={group.model_route_id}
+                  magnitude={group.divergence_magnitude}
+                  threshold={group.threshold_used}
+                  fields={group.differing_setup_fields}
+                  scoresByOrganization={group.scores_by_organization}
+                />
+              ))}
+            </GroupList>
+          )}
+        </div>
+      )}
+    </section>
+  )
+}
+/**
+ * Collapsible list wrapper (open by default) for one kind of divergence group.
+ *
+ * The trigger row shows an icon chosen by `icon`, the `title`, and a badge with
+ * `count`; `children` are rendered inside the collapsible content.
+ */
+function GroupList(props: {
+  icon: "variant" | "cross-party"
+  title: string
+  count: number
+  children: ReactNode
+}) {
+  const { icon, title, count, children } = props
+  // "variant" gets the compare-arrows glyph; everything else the people glyph.
+  let Icon = UsersRound
+  if (icon === "variant") {
+    Icon = GitCompareArrows
+  }
+  const triggerLabel = (
+    <span className="flex items-center gap-2 text-sm font-semibold">
+      <Icon className="h-4 w-4 text-muted-foreground" />
+      {title}
+      <Badge variant="secondary">{count}</Badge>
+    </span>
+  )
+  return (
+    <Collapsible defaultOpen>
+      <CollapsibleTrigger asChild>
+        <button
+          type="button"
+          className="flex w-full items-center justify-between rounded-xl border border-border/70 bg-muted/10 px-3 py-2 text-left transition-colors hover:bg-muted/20"
+        >
+          {triggerLabel}
+          <ChevronDown className="h-4 w-4 text-muted-foreground" />
+        </button>
+      </CollapsibleTrigger>
+      <CollapsibleContent className="mt-2 space-y-2">
+        {children}
+      </CollapsibleContent>
+    </Collapsible>
+  )
+}
+/**
+ * One divergence group, rendered as an in-page link to `#row-<modelRouteId>`.
+ *
+ * Shows the model route id, the divergence magnitude and threshold, up to three
+ * differing setup fields with their values, and, when provided and non-empty,
+ * up to four per-organization scores.
+ */
+function DivergenceGroupItem(props: {
+  modelRouteId: string
+  magnitude: number
+  threshold: number
+  fields: DifferingSetupField[]
+  scoresByOrganization?: Record<string, number>
+}) {
+  const { modelRouteId, magnitude, threshold, fields, scoresByOrganization } = props
+  // Cap what is displayed; the caps mirror the limits applied at render time.
+  const shownFields = fields.slice(0, 3)
+  const shownScores = Object.entries(scoresByOrganization ?? {}).slice(0, 4)
+  const rowAnchor = `#row-${modelRouteId}`
+  return (
+    <a
+      href={rowAnchor}
+      className="block rounded-xl border border-border/60 bg-background px-3 py-2 text-sm transition-colors hover:bg-muted/20"
+    >
+      <div className="flex items-start justify-between gap-3">
+        <div className="min-w-0">
+          <div className="font-medium">{modelRouteId}</div>
+          <div className="mt-1 text-xs text-muted-foreground">
+            Divergence {formatSignalNumber(magnitude)}; threshold {formatSignalNumber(threshold)}
+          </div>
+        </div>
+        <span className="shrink-0 text-xs font-medium text-primary">Jump to row</span>
+      </div>
+      {shownFields.length > 0 && (
+        <div className="mt-2 space-y-1 text-xs text-muted-foreground">
+          {shownFields.map((setupField) => (
+            <div key={setupField.field}>
+              <span className="font-medium text-foreground">{formatFieldLabel(setupField.field)}:</span>{" "}
+              {setupField.values.map(formatSignalValue).join(", ")}
+            </div>
+          ))}
+        </div>
+      )}
+      {shownScores.length > 0 && (
+        <div className="mt-2 flex flex-wrap gap-1.5">
+          {shownScores.map(([organization, organizationScore]) => (
+            <span
+              key={organization}
+              className="rounded-full border border-border/60 bg-muted/20 px-2 py-0.5 text-[11px] text-muted-foreground"
+            >
+              {organization}: {formatSignalNumber(organizationScore)}
+            </span>
+          ))}
+        </div>
+      )}
+    </a>
+  )
+}

components/signals/completeness-panel.tsx ADDED Viewed

	@@ -0,0 +1,147 @@

+"use client"
+import type { ReactNode } from "react"
+import { ChevronDown, ClipboardCheck } from "lucide-react"
+import { useAudienceMode } from "@/components/audience-mode-provider"
+import { Badge } from "@/components/ui/badge"
+import { Collapsible, CollapsibleContent, CollapsibleTrigger } from "@/components/ui/collapsible"
+import { Progress } from "@/components/ui/progress"
+import type { ReportingCompleteness } from "@/lib/backend-artifacts"
+import {
+  formatFieldLabel,
+  formatPercent,
+  getCompletenessPopulatedCount,
+} from "./signal-utils"
+/**
+ * Documentation-completeness section for a benchmark.
+ *
+ * Shows the completeness score as a percentage and progress bar, the
+ * populated/total field counts, and, when any are recorded, collapsible lists
+ * of missing required fields and partially populated fields (first twelve of
+ * each). In research mode the raw field paths are shown next to the labels.
+ * Renders nothing when `completeness` is absent.
+ */
+export function CompletenessPanel({ completeness }: { completeness?: ReportingCompleteness | null }) {
+  // The hook runs before the early return so hook order stays stable across renders.
+  const audience = useAudienceMode()
+  const isResearchView = audience.mode === "research"
+  if (!completeness) {
+    return null
+  }
+  const populatedCount = getCompletenessPopulatedCount(completeness)
+  const total = completeness.total_fields_evaluated
+  const missingFields = completeness.missing_required_fields ?? []
+  const partialFields = completeness.partial_fields ?? []
+  const hasFieldDetails = missingFields.length > 0 || partialFields.length > 0
+
+  // Body of the "Missing required fields" collapsible.
+  const renderMissingFields = () => {
+    if (missingFields.length === 0) {
+      return <p className="text-sm text-muted-foreground">No missing required fields recorded.</p>
+    }
+    return (
+      <ul className="space-y-1.5 text-sm">
+        {missingFields.slice(0, 12).map((fieldPath) => (
+          <li key={fieldPath} className="rounded-lg border border-border/50 bg-background px-3 py-2">
+            <span className="font-medium">{formatFieldLabel(fieldPath)}</span>
+            {isResearchView && (
+              <span className="ml-2 text-xs text-muted-foreground">{fieldPath}</span>
+            )}
+          </li>
+        ))}
+      </ul>
+    )
+  }
+
+  // Body of the "Partially populated" collapsible.
+  const renderPartialFields = () => {
+    if (partialFields.length === 0) {
+      return <p className="text-sm text-muted-foreground">No partially populated fields recorded.</p>
+    }
+    return (
+      <ul className="space-y-1.5 text-sm">
+        {partialFields.slice(0, 12).map((partial) => (
+          <li key={partial.field_path} className="rounded-lg border border-border/50 bg-background px-3 py-2">
+            <div className="flex items-start justify-between gap-3">
+              <span className="font-medium">{formatFieldLabel(partial.field_path)}</span>
+              <span className="shrink-0 text-muted-foreground">
+                {partial.populated_subitems}/{partial.total_subitems}
+              </span>
+            </div>
+            {isResearchView && (
+              <div className="mt-1 text-xs text-muted-foreground">{partial.field_path}</div>
+            )}
+          </li>
+        ))}
+      </ul>
+    )
+  }
+
+  return (
+    <section className="rounded-2xl border border-border/70 bg-background/70 p-4 sm:p-5">
+      <div className="flex flex-col gap-4 lg:flex-row lg:items-start lg:justify-between">
+        <div className="space-y-1">
+          <div className="flex items-center gap-2">
+            <ClipboardCheck className="h-4 w-4 text-primary" />
+            <h3 className="font-semibold">
+              {isResearchView ? "Reporting completeness" : "How well is this benchmark documented?"}
+            </h3>
+          </div>
+          <p className="max-w-2xl text-sm text-muted-foreground">
+            {isResearchView
+              ? "Coverage of EvalCards-required documentation fields for this benchmark."
+              : "A quick read on how much supporting documentation is available before leaning on the scores."}
+          </p>
+        </div>
+        <div className="min-w-[14rem] rounded-xl border border-border/70 bg-muted/10 px-3 py-2">
+          <div className="flex items-baseline justify-between gap-3">
+            <span className="text-[11px] font-semibold uppercase tracking-[0.18em] text-muted-foreground">
+              Documentation
+            </span>
+            <span className="text-lg font-semibold tabular-nums">
+              {formatPercent(completeness.completeness_score)}
+            </span>
+          </div>
+          <Progress value={completeness.completeness_score * 100} className="mt-2 h-2" />
+          <div className="mt-2 text-xs text-muted-foreground">
+            {populatedCount} of {total} fields populated
+          </div>
+        </div>
+      </div>
+      {hasFieldDetails && (
+        <div className="mt-4 grid gap-3 lg:grid-cols-2">
+          <SignalListCollapsible
+            title="Missing required fields"
+            count={missingFields.length}
+          >
+            {renderMissingFields()}
+          </SignalListCollapsible>
+          <SignalListCollapsible
+            title="Partially populated"
+            count={partialFields.length}
+          >
+            {renderPartialFields()}
+          </SignalListCollapsible>
+        </div>
+      )}
+    </section>
+  )
+}
+/**
+ * Collapsible section (no `defaultOpen` is passed) whose trigger row shows
+ * `title` plus a badge with `count`; `children` form the collapsible content.
+ */
+function SignalListCollapsible(props: {
+  title: string
+  count: number
+  children: ReactNode
+}) {
+  const { title, count, children } = props
+  const triggerButton = (
+    <button
+      type="button"
+      className="flex w-full items-center justify-between rounded-xl border border-border/70 bg-muted/10 px-3 py-2 text-left transition-colors hover:bg-muted/20"
+    >
+      <span className="flex items-center gap-2 text-sm font-semibold">
+        {title}
+        <Badge variant="secondary">{count}</Badge>
+      </span>
+      <ChevronDown className="h-4 w-4 text-muted-foreground" />
+    </button>
+  )
+  return (
+    <Collapsible>
+      <CollapsibleTrigger asChild>{triggerButton}</CollapsibleTrigger>
+      <CollapsibleContent className="mt-2">
+        {children}
+      </CollapsibleContent>
+    </Collapsible>
+  )
+}

components/signals/corpus-dashboard.tsx ADDED Viewed

	@@ -0,0 +1,442 @@

+"use client"
+import type { ReactNode } from "react"
+import { useEffect, useMemo, useState } from "react"
+import { BarChart3, ClipboardCheck, GitCompareArrows, ShieldCheck } from "lucide-react"
+import { useAudienceMode } from "@/components/audience-mode-provider"
+import { Badge } from "@/components/ui/badge"
+import { Button } from "@/components/ui/button"
+import type {
+  ComparabilityCorpusBlock,
+  CompletenessCorpusBlock,
+  CorpusAggregates,
+  ProvenanceCorpusBlock,
+  ReproducibilityCorpusBlock,
+} from "@/lib/backend-artifacts"
+import { getCategoryColor } from "@/lib/benchmark-schema"
+import {
+  formatFieldLabel,
+  formatPercent,
+} from "./signal-utils"
+const CATEGORY_ORDER = ["agentic", "general", "knowledge", "reasoning", "safety", "other"] // fixed category ordering; CorpusDashboard filters this list down to categories present in any aggregate block
+const SOURCE_COLORS: Record<string, string> = { // background-color utility class per source-type key; NOTE(review): usage is outside this view, presumably provenance breakdowns, so confirm
+  first_party: "bg-amber-500",
+  third_party: "bg-emerald-500",
+  collaborative: "bg-sky-500",
+  unspecified: "bg-stone-400",
+}
/**
 * Top-level corpus dashboard.
 *
 * Renders a header (signal version, generation date, view toggle) followed by
 * either the four overall signal sections or one summary panel per category.
 * The view is re-synced whenever the audience mode changes ("research" selects
 * the per-category view, any other mode the overall view) and can also be
 * switched manually with the toggle.
 *
 * @param aggregates Corpus-level rollups for the four signals.
 * @param completenessScores Scores passed through to the completeness histogram.
 */
export function CorpusDashboard({
  aggregates,
  completenessScores,
}: {
  aggregates: CorpusAggregates
  completenessScores: number[]
}) {
  const { mode } = useAudienceMode()
  const [view, setView] = useState<"overall" | "category">("overall")

  useEffect(() => {
    const preferred = mode === "research" ? "category" : "overall"
    setView(preferred)
  }, [mode])

  // Keep only the known categories for which at least one signal has data.
  const categoryKeys = useMemo(() => {
    const { reproducibility, completeness, provenance, comparability } = aggregates
    return CATEGORY_ORDER.filter((category) =>
      Boolean(
        reproducibility.by_category[category] ||
          completeness.by_category[category] ||
          provenance.by_category[category] ||
          comparability.by_category[category]
      )
    )
  }, [aggregates])

  const viewOptions: Array<{ id: "overall" | "category"; label: string }> = [
    { id: "overall", label: "Overall" },
    { id: "category", label: "By category" },
  ]

  const overallView = (
    <div className="grid gap-6">
      <ReproducibilitySection block={aggregates.reproducibility.overall} />
      <CompletenessSection block={aggregates.completeness.overall} scores={completenessScores} />
      <ProvenanceSection block={aggregates.provenance.overall} />
      <ComparabilitySection block={aggregates.comparability.overall} />
    </div>
  )

  const categoryView = (
    <div className="grid gap-4 xl:grid-cols-2">
      {categoryKeys.map((category) => (
        <CategoryPanel
          key={category}
          category={category}
          reproducibility={aggregates.reproducibility.by_category[category]}
          completeness={aggregates.completeness.by_category[category]}
          provenance={aggregates.provenance.by_category[category]}
          comparability={aggregates.comparability.by_category[category]}
        />
      ))}
    </div>
  )

  return (
    <div className="space-y-6">
      <section className="rounded-2xl border border-border/70 bg-card p-5 shadow-sm">
        <div className="flex flex-col gap-4 lg:flex-row lg:items-start lg:justify-between">
          <div>
            <div className="text-[11px] font-semibold uppercase tracking-[0.22em] text-muted-foreground">
              Interpretive signals
            </div>
            <h1 className="mt-2 text-3xl font-semibold tracking-tight">Corpus Dashboard</h1>
            <p className="mt-2 max-w-3xl text-sm leading-6 text-muted-foreground">
              Corpus-level rollups for reproducibility, documentation completeness, source provenance, and comparability.
            </p>
          </div>
          <div className="flex flex-wrap items-center gap-2">
            <Badge variant="outline">Signals v{aggregates.signal_version}</Badge>
            <Badge variant="outline">Generated {formatGeneratedDate(aggregates.generated_at)}</Badge>
            <div className="inline-flex rounded-full border bg-muted/20 p-1">
              {viewOptions.map((option) => (
                <Button
                  key={option.id}
                  type="button"
                  size="sm"
                  variant={view === option.id ? "default" : "ghost"}
                  className="h-8 rounded-full"
                  onClick={() => setView(option.id)}
                >
                  {option.label}
                </Button>
              ))}
            </div>
          </div>
        </div>
      </section>
      {view === "overall" ? overallView : categoryView}
    </div>
  )
}
/**
 * Overall reproducibility rollup: the gap rate as headline, plus one bar per
 * setup field (first ten entries of `per_field_missingness`) showing its
 * missing rate and count.
 */
function ReproducibilitySection({ block }: { block: ReproducibilityCorpusBlock }) {
  const gapCount = block.triples_with_reproducibility_gap.toLocaleString()
  const tripleCount = block.total_triples.toLocaleString()
  // Only the first ten fields are listed.
  const fieldRows = Object.entries(block.per_field_missingness).slice(0, 10)

  return (
    <DashboardSection
      icon={<ShieldCheck className="h-5 w-5" />}
      title="Reproducibility"
      subtitle="Reported scores with enough setup documentation to re-run."
      headline={formatPercent(block.reproducibility_gap_rate)}
      headlineLabel={`${gapCount} of ${tripleCount} reported scores have gaps`}
    >
      <div className="grid gap-2">
        {fieldRows.map(([field, stats]) => {
          // The denominator tells the reader which population the rate is computed over.
          const scope = stats.denominator === "agentic_only" ? "agentic only" : "all scores"
          return (
            <MetricBar
              key={field}
              label={formatFieldLabel(field)}
              value={stats.missing_rate}
              detail={`${stats.missing_count.toLocaleString()} missing / ${scope}`}
            />
          )
        })}
      </div>
    </DashboardSection>
  )
}
/**
 * Overall reporting-completeness rollup: mean score as headline, an optional
 * histogram of `scores`, and per-field population bars for the first ten
 * entries of `per_field_population`.
 */
function CompletenessSection({
  block,
  scores,
}: {
  block: CompletenessCorpusBlock
  scores: number[]
}) {
  const fieldRows = Object.entries(block.per_field_population).slice(0, 10)
  const medianText = formatPercent(block.completeness_score_median)
  const benchmarkTotal = block.total_benchmarks.toLocaleString()

  return (
    <DashboardSection
      icon={<ClipboardCheck className="h-5 w-5" />}
      title="Reporting Completeness"
      subtitle="How much benchmark documentation is populated."
      headline={formatPercent(block.completeness_score_mean)}
      headlineLabel={`Median ${medianText} across ${benchmarkTotal} benchmarks`}
    >
      {/* The histogram is omitted entirely when no scores were supplied. */}
      {scores.length > 0 ? <Histogram scores={scores} /> : null}
      <div className="mt-4 grid gap-2">
        {fieldRows.map(([field, stats]) => (
          <div key={field} className="rounded-xl border border-border/60 bg-background px-3 py-2">
            <div className="flex items-start justify-between gap-3 text-sm">
              <span className="font-medium">{formatFieldLabel(field)}</span>
              <span className="shrink-0 tabular-nums text-muted-foreground">
                {formatPercent(stats.mean_score)}
              </span>
            </div>
            <div className="mt-2 grid gap-1.5">
              <MetricBar label="Any data" value={stats.populated_rate} compact />
              <MetricBar label="Fully populated" value={stats.fully_populated_rate} compact />
            </div>
          </div>
        ))}
      </div>
    </DashboardSection>
  )
}
/**
 * Overall provenance rollup: multi-source rate as headline, a stacked bar of
 * the source-type distribution, and tiles for multi-source and
 * first-party-only groups.
 */
function ProvenanceSection({ block }: { block: ProvenanceCorpusBlock }) {
  const segments = Object.entries(block.source_type_distribution)
  let total = 0
  for (const [, count] of segments) {
    total += count
  }

  return (
    <DashboardSection
      icon={<BarChart3 className="h-5 w-5" />}
      title="Provenance"
      subtitle="Who reported the scores, and whether groups have multiple sources."
      headline={formatPercent(block.multi_source_rate)}
      headlineLabel="of (model, benchmark, metric) groups have multiple reporting sources"
    >
      <div className="overflow-hidden rounded-full border border-border/70 bg-muted/30">
        <div className="flex h-4 w-full">
          {segments.map(([sourceType, count]) => {
            // Each segment's width is its share of the total; guard against an empty distribution.
            const share = total > 0 ? `${(count / total) * 100}%` : "0%"
            return (
              <div
                key={sourceType}
                className={SOURCE_COLORS[sourceType] ?? "bg-muted-foreground"}
                style={{ width: share }}
                title={`${sourceType.replace(/_/g, " ")}: ${count}`}
              />
            )
          })}
        </div>
      </div>
      <div className="mt-3 grid gap-2 sm:grid-cols-2">
        <RatioTile label="Multi-source groups" value={block.multi_source_rate} count={block.multi_source_groups} />
        <RatioTile label="First-party only groups" value={block.first_party_only_rate} count={block.first_party_only_groups} />
      </div>
    </DashboardSection>
  )
}
/**
 * Overall comparability rollup: the variant divergence rate as headline, plus
 * one rate card each for variant divergence and cross-party divergence.
 */
function ComparabilitySection({ block }: { block: ComparabilityCorpusBlock }) {
  const {
    variant_divergence_rate: variantRate,
    variant_eligible_groups: variantEligible,
    variant_divergent_groups: variantDivergent,
  } = block
  const headlineLabel = `${variantDivergent.toLocaleString()} of ${variantEligible.toLocaleString()} setup-eligible groups diverge`

  return (
    <DashboardSection
      icon={<GitCompareArrows className="h-5 w-5" />}
      title="Comparability"
      subtitle="Eligible groups where scores diverge across setups or reporting organizations."
      headline={formatNullableRate(variantRate)}
      headlineLabel={headlineLabel}
    >
      <div className="grid gap-3 md:grid-cols-2">
        <ComparabilityRateCard
          title="Variant divergence"
          rate={variantRate}
          eligible={variantEligible}
          divergent={variantDivergent}
        />
        <ComparabilityRateCard
          title="Cross-party divergence"
          rate={block.cross_party_divergence_rate}
          eligible={block.cross_party_eligible_groups}
          divergent={block.cross_party_divergent_groups}
        />
      </div>
    </DashboardSection>
  )
}
/**
 * Compact per-category summary: four headline metrics (one per signal) and,
 * when no cross-party divergence rate is available, a dashed "N/A" note.
 * Any of the four blocks may be absent for a category.
 */
function CategoryPanel({
  category,
  reproducibility,
  completeness,
  provenance,
  comparability,
}: {
  category: string
  reproducibility?: ReproducibilityCorpusBlock
  completeness?: CompletenessCorpusBlock
  provenance?: ProvenanceCorpusBlock
  comparability?: ComparabilityCorpusBlock
}) {
  // Capitalize the first letter of the category key for display.
  const categoryLabel = category.charAt(0).toUpperCase() + category.slice(1)
  const crossPartyUnavailable = comparability?.cross_party_divergence_rate == null
  const metrics: Array<{ label: string; value: string }> = [
    { label: "Reproducibility gaps", value: formatPercent(reproducibility?.reproducibility_gap_rate) },
    { label: "Documentation mean", value: formatPercent(completeness?.completeness_score_mean) },
    { label: "Multi-source groups", value: formatPercent(provenance?.multi_source_rate) },
    { label: "Variant divergence", value: formatNullableRate(comparability?.variant_divergence_rate) },
  ]

  return (
    <section className="rounded-2xl border border-border/70 bg-card p-4 shadow-sm">
      <div className="mb-4 flex items-center justify-between gap-3">
        <h2 className="font-semibold">{categoryLabel}</h2>
        <Badge className={getCategoryColor(categoryLabel)}>{categoryLabel}</Badge>
      </div>
      <div className="grid gap-3 sm:grid-cols-2">
        {metrics.map((metric) => (
          <MiniMetric key={metric.label} label={metric.label} value={metric.value} />
        ))}
      </div>
      {crossPartyUnavailable ? (
        <div className="mt-3 rounded-xl border border-dashed border-border/70 bg-muted/10 px-3 py-2 text-sm text-muted-foreground">
          Cross-party divergence: N/A - not enough multi-org coverage.
        </div>
      ) : null}
    </section>
  )
}
/**
 * Shared card layout for the overall signal sections: a left column with
 * icon, title, subtitle and a headline tile, and a right column holding
 * `children`.
 */
function DashboardSection(props: {
  icon: ReactNode
  title: string
  subtitle: string
  headline: string
  headlineLabel: string
  children: ReactNode
}) {
  const { icon, title, subtitle, headline, headlineLabel, children } = props

  // Headline tile: the large number with its explanatory caption underneath.
  const headlineTile = (
    <div className="mt-5 rounded-xl border border-border/70 bg-muted/10 px-3 py-3">
      <div className="text-3xl font-semibold tabular-nums">{headline}</div>
      <div className="mt-1 text-xs leading-5 text-muted-foreground">{headlineLabel}</div>
    </div>
  )

  return (
    <section className="rounded-2xl border border-border/70 bg-card p-5 shadow-sm">
      <div className="grid gap-5 lg:grid-cols-[minmax(0,18rem)_1fr]">
        <div>
          <div className="flex items-center gap-2 text-primary">
            {icon}
            <h2 className="font-semibold">{title}</h2>
          </div>
          <p className="mt-2 text-sm leading-6 text-muted-foreground">{subtitle}</p>
          {headlineTile}
        </div>
        <div>{children}</div>
      </div>
    </section>
  )
}
/**
 * Labelled horizontal bar for a rate in the range 0..1.
 *
 * @param label Text shown to the left of the percentage.
 * @param value Rate to display; `null` and non-finite values render as "N/A" with an empty bar.
 * @param detail Optional caption rendered under the bar.
 * @param compact When true, drops the bordered card chrome so bars can be stacked inside another card.
 */
function MetricBar({
  label,
  value,
  detail,
  compact = false,
}: {
  label: string
  value: number | null
  detail?: string
  compact?: boolean
}) {
  // formatPercent reports "N/A" for null and non-finite input, so the bar must
  // be empty in exactly those cases; otherwise NaN would yield an invalid
  // "NaN%" width and Infinity a full bar next to an "N/A" label.
  const hasValue = value != null && Number.isFinite(value)
  const percent = hasValue ? Math.max(0, Math.min(100, value * 100)) : 0
  return (
    <div className={compact ? "space-y-1" : "rounded-xl border border-border/60 bg-background px-3 py-2"}>
      <div className="flex items-center justify-between gap-3 text-sm">
        <span className="min-w-0 truncate font-medium">{label}</span>
        <span className="shrink-0 tabular-nums text-muted-foreground">{formatPercent(value)}</span>
      </div>
      <div className="mt-1.5 h-2 overflow-hidden rounded-full bg-muted">
        <div className="h-full rounded-full bg-primary/75" style={{ width: `${percent}%` }} />
      </div>
      {detail && <div className="mt-1 text-xs text-muted-foreground">{detail}</div>}
    </div>
  )
}
/**
 * Ten-bucket histogram of scores. Each score is multiplied by ten and floored
 * to pick a bucket, clamped into the first/last bucket when out of range;
 * non-finite scores are ignored. Bar heights are relative to the fullest
 * bucket, with a 4% minimum so empty buckets still show a sliver.
 */
function Histogram({ scores }: { scores: number[] }) {
  const bucketTotal = 10
  const counts: number[] = new Array(bucketTotal).fill(0)
  scores.forEach((score) => {
    if (!Number.isFinite(score)) {
      return
    }
    const slot = Math.min(bucketTotal - 1, Math.max(0, Math.floor(score * 10)))
    counts[slot] += 1
  })
  const buckets = counts.map((count, index) => ({
    label: `${index * 10}-${(index + 1) * 10}%`,
    count,
  }))
  // Floor of 1 avoids dividing by zero when every bucket is empty.
  const tallest = Math.max(...counts, 1)

  return (
    <div className="rounded-xl border border-border/60 bg-background px-3 py-3">
      <div className="mb-3 text-sm font-semibold">Benchmark completeness distribution</div>
      <div className="flex h-28 items-end gap-1.5">
        {buckets.map(({ label, count }) => {
          const heightPercent = Math.max(4, (count / tallest) * 100)
          return (
            <div key={label} className="flex min-w-0 flex-1 flex-col items-center gap-1">
              <div
                className="w-full rounded-t bg-primary/70"
                style={{ height: `${heightPercent}%` }}
                title={`${label}: ${count}`}
              />
              {/* Axis tick shows only the lower bound of the bucket. */}
              <span className="text-[9px] text-muted-foreground">{label.split("-")[0]}</span>
            </div>
          )
        })}
      </div>
    </div>
  )
}
/** Small tile showing a labelled rate as a percentage alongside the absolute group count. */
function RatioTile(props: { label: string; value: number | null; count: number }) {
  const { label, value, count } = props
  const rateText = formatPercent(value)
  return (
    <div className="rounded-xl border border-border/60 bg-background px-3 py-2">
      <div className="text-sm font-medium">{label}</div>
      <div className="mt-1 flex items-baseline justify-between gap-2">
        <span className="text-xl font-semibold tabular-nums">{rateText}</span>
        <span className="text-xs text-muted-foreground">{count.toLocaleString()} groups</span>
      </div>
    </div>
  )
}
/**
 * Card for one divergence rate. When `rate` is `null`/`undefined` a dashed
 * "N/A" placeholder is shown instead of the percentage and group counts.
 */
function ComparabilityRateCard(props: {
  title: string
  rate: number | null
  eligible: number
  divergent: number
}) {
  const { title, rate, eligible, divergent } = props
  const heading = <div className="font-semibold">{title}</div>

  if (rate != null) {
    return (
      <div className="rounded-xl border border-border/70 bg-background px-4 py-4">
        {heading}
        <div className="mt-3 text-2xl font-semibold tabular-nums">{formatPercent(rate)}</div>
        <div className="mt-1 text-sm text-muted-foreground">
          {divergent.toLocaleString()} of {eligible.toLocaleString()} eligible groups
        </div>
      </div>
    )
  }

  // No rate available: render the placeholder variant.
  return (
    <div className="rounded-xl border border-dashed border-border/70 bg-muted/10 px-4 py-5">
      {heading}
      <div className="mt-2 text-sm text-muted-foreground">
        N/A - not enough data to compute this rate.
      </div>
    </div>
  )
}
/** Compact label/value tile used inside the per-category panels. */
function MiniMetric(props: { label: string; value: string }) {
  const { label, value } = props
  return (
    <div className="rounded-xl border border-border/60 bg-muted/10 px-3 py-2">
      <div className="text-xs text-muted-foreground">{label}</div>
      <div className="mt-1 text-xl font-semibold tabular-nums">{value}</div>
    </div>
  )
}
/** Formats a rate as a percentage, or "N/A" when the rate is `null`/`undefined`. */
function formatNullableRate(value: number | null | undefined) {
  if (value == null) {
    return "N/A"
  }
  return formatPercent(value)
}
/**
 * Formats a timestamp string as a short en-US calendar date (e.g. "Apr 27, 2025").
 *
 * The date is rendered in UTC rather than the viewer's local time zone.
 * Date-only strings such as "2025-04-27" are parsed as UTC midnight, so local
 * formatting would show the previous day for viewers west of UTC, and could
 * produce different text on the server and in the browser.
 *
 * @param value Timestamp string accepted by the `Date` constructor.
 * @returns The formatted date, or `value` unchanged when it cannot be parsed.
 */
function formatGeneratedDate(value: string) {
  const parsed = new Date(value)
  if (Number.isNaN(parsed.getTime())) {
    // Unparseable input: show the raw string rather than "Invalid Date".
    return value
  }
  return parsed.toLocaleDateString("en-US", {
    year: "numeric",
    month: "short",
    day: "numeric",
    timeZone: "UTC",
  })
}

components/signals/cross-party-divergence-badge.tsx ADDED Viewed

	@@ -0,0 +1,46 @@

+"use client"
+import { UsersRound } from "lucide-react"
+import { useAudienceMode } from "@/components/audience-mode-provider"
+import { Badge } from "@/components/ui/badge"
+import type { CrossPartyDivergence } from "@/lib/backend-artifacts"
+import { cn } from "@/lib/utils"
+import { formatSignalNumber } from "./signal-utils"
+import { SignalTooltip } from "./signal-tooltip"
/**
 * Badge shown when different organizations reported diverging scores.
 * Renders nothing unless `divergence.has_cross_party_divergence` is truthy.
 * Label and tooltip wording depend on the audience mode ("research" versus
 * any other mode).
 */
export function CrossPartyDivergenceBadge({
  divergence,
  className,
}: {
  divergence?: CrossPartyDivergence | null
  className?: string
}) {
  // The hook must run before any early return.
  const { mode } = useAudienceMode()
  if (!divergence?.has_cross_party_divergence) {
    return null
  }

  const isResearchView = mode === "research"
  const orgCount = divergence.organization_count
  const plural = orgCount === 1 ? "" : "s"
  const magnitude = formatSignalNumber(divergence.divergence_magnitude)

  let tooltip: string
  let label: string
  if (isResearchView) {
    tooltip = `Reports diverge by ${magnitude} across ${orgCount} organization${plural}.`
    label = "Cross-party divergence"
  } else {
    tooltip = "Different organizations reported different scores for this same model on this same benchmark."
    label = "Sources disagree"
  }

  return (
    <SignalTooltip content={tooltip}>
      <Badge
        variant="outline"
        className={cn(
          "border-violet-300 bg-violet-50 text-violet-900 dark:border-violet-900/60 dark:bg-violet-950/40 dark:text-violet-100",
          className
        )}
      >
        <UsersRound className="h-3 w-3" />
        {label}
      </Badge>
    </SignalTooltip>
  )
}

components/signals/provenance-badge.tsx ADDED Viewed

	@@ -0,0 +1,124 @@

+"use client"
+import { AlertTriangle, BadgeCheck, Handshake, UserRoundCheck } from "lucide-react"
+import { useAudienceMode } from "@/components/audience-mode-provider"
+import { Badge } from "@/components/ui/badge"
+import type { Provenance, ProvenanceSourceType } from "@/lib/backend-artifacts"
+import { cn } from "@/lib/utils"
+import { SignalTooltip } from "./signal-tooltip"
/**
 * Turns a snake_case relationship value into a title-cased display name
 * (e.g. "first_party" becomes "First Party"). Missing or blank input yields "Unknown".
 */
export function getRelationshipDisplayName(value: string | null | undefined) {
  const cleaned = (value ?? "").replace(/_/g, " ").trim()
  if (cleaned === "") {
    return "Unknown"
  }
  const words = cleaned.split(/\s+/)
  return words
    .map((word) => word.charAt(0).toUpperCase() + word.slice(1).toLowerCase())
    .join(" ")
}
/**
 * Short badge label for a relationship value, matched case-insensitively.
 * In "policy" mode the three known source types get plain-language wording;
 * "other" is always "Other"; anything else falls back to the title-cased
 * display name.
 */
export function getRelationshipShortLabel(value: string | null | undefined, mode: "research" | "policy" = "research") {
  const key = (value ?? "").toLowerCase()
  const isPolicy = mode === "policy"
  if (key === "first_party") {
    return isPolicy ? "Reported by model developer" : "1st party"
  }
  if (key === "third_party") {
    return isPolicy ? "Independently reported" : "3rd party"
  }
  if (key === "collaborative") {
    return isPolicy ? "Joint report" : "Collaborative"
  }
  if (key === "other") {
    return "Other"
  }
  return getRelationshipDisplayName(value)
}
/**
 * Badge colour classes for a relationship value, matched case-insensitively:
 * amber for first-party, emerald for third-party, sky for collaborative, and
 * a muted tone for everything else.
 */
export function getRelationshipBadgeTone(value: string | null | undefined): string {
  const key = (value ?? "").toLowerCase()
  if (key === "first_party") {
    return "border-amber-300 bg-amber-50 text-amber-900 dark:border-amber-900/60 dark:bg-amber-950/40 dark:text-amber-100"
  }
  if (key === "third_party") {
    return "border-emerald-300 bg-emerald-50 text-emerald-900 dark:border-emerald-900/60 dark:bg-emerald-950/40 dark:text-emerald-100"
  }
  if (key === "collaborative") {
    return "border-sky-300 bg-sky-50 text-sky-900 dark:border-sky-900/60 dark:bg-sky-950/40 dark:text-sky-100"
  }
  return "border-border/70 bg-muted/40 text-muted-foreground"
}
/**
 * Maps a free-form relationship string onto a known source type, matching
 * case-insensitively. Returns the lower-cased value for the four provenance
 * source types, "other" for "other", and `null` for anything unrecognized
 * (including missing input).
 */
function normalizeSourceType(value: string | null | undefined): ProvenanceSourceType | "other" | null {
  const lowered = (value ?? "").toLowerCase()
  if (lowered === "other") {
    return "other"
  }
  const recognized = ["first_party", "third_party", "collaborative", "unspecified"]
  if (recognized.includes(lowered)) {
    return lowered as ProvenanceSourceType
  }
  return null
}
/**
 * Icon for a provenance badge: a check badge for third-party, a handshake for
 * collaborative, and the user-check icon for every other source type.
 */
function ProvenanceIcon({ sourceType }: { sourceType: ProvenanceSourceType | "other" }) {
  const iconClass = "h-3 w-3"
  switch (sourceType) {
    case "third_party":
      return <BadgeCheck className={iconClass} />
    case "collaborative":
      return <Handshake className={iconClass} />
    default:
      return <UserRoundCheck className={iconClass} />
  }
}
/**
 * Badge describing who reported a score.
 *
 * The source type comes from `provenance.source_type` when present, otherwise
 * from normalizing the free-form `relationship` string. Nothing is rendered
 * when the source type is unknown or "unspecified", or when it is "other" and
 * `showOther` is false. When `provenance.first_party_only` is true the badge
 * switches to a "first-party only" label and tooltip and gains a warning icon.
 *
 * @param sourceOrganizationName Used in the tooltip ("Reported by ...") when provided.
 */
export function ProvenanceBadge({
  provenance,
  relationship,
  sourceOrganizationName,
  showOther = false,
  className,
}: {
  provenance?: Provenance | null
  relationship?: string | null
  sourceOrganizationName?: string | null
  showOther?: boolean
  className?: string
}) {
  const { mode } = useAudienceMode()
  const sourceType = provenance?.source_type ?? normalizeSourceType(relationship)
  if (!sourceType || sourceType === "unspecified" || (sourceType === "other" && !showOther)) {
    return null
  }

  const isPolicy = mode === "policy"
  const firstPartyOnly = provenance?.first_party_only === true

  let label: string
  let tooltip: string
  if (firstPartyOnly) {
    label = isPolicy ? "Only model developer reported" : "1st party only"
    tooltip = isPolicy
      ? "Only the model developer reported this score; no independent replication is recorded."
      : "First-party only - no independent replication is recorded for this group."
  } else {
    label = getRelationshipShortLabel(sourceType, mode)
    tooltip = sourceOrganizationName
      ? `Reported by ${sourceOrganizationName}.`
      : getRelationshipDisplayName(sourceType)
  }

  return (
    <SignalTooltip content={tooltip}>
      <Badge
        variant="outline"
        className={cn(getRelationshipBadgeTone(sourceType), className)}
      >
        <ProvenanceIcon sourceType={sourceType} />
        {label}
        {firstPartyOnly ? <AlertTriangle className="h-3 w-3" /> : null}
      </Badge>
    </SignalTooltip>
  )
}

components/signals/reproducibility-badge.tsx ADDED Viewed

	@@ -0,0 +1,46 @@

+"use client"
+import { AlertTriangle } from "lucide-react"
+import { useAudienceMode } from "@/components/audience-mode-provider"
+import { Badge } from "@/components/ui/badge"
+import type { ReproducibilityGap } from "@/lib/backend-artifacts"
+import { cn } from "@/lib/utils"
+import { formatMissingField } from "./signal-utils"
+import { SignalTooltip } from "./signal-tooltip"
/**
 * Warning badge for a row whose setup is not fully documented. Renders
 * nothing unless `gap.has_reproducibility_gap` is truthy. In research mode the
 * tooltip lists the missing fields; in other modes it uses plain-language
 * wording. Both tooltips end with the recorded/required field count.
 */
export function ReproducibilityBadge({
  gap,
  className,
}: {
  gap?: ReproducibilityGap | null
  className?: string
}) {
  // The hook must run before any early return.
  const { mode } = useAudienceMode()
  if (!gap?.has_reproducibility_gap) {
    return null
  }

  const isResearchView = mode === "research"
  const countLine = `${gap.populated_field_count} of ${gap.required_field_count} setup fields recorded.`
  // An empty missing list still gets a readable placeholder in the tooltip.
  const missingText = gap.missing_fields.map(formatMissingField).join(", ") || "none listed"

  let tooltip: string
  let label: string
  if (isResearchView) {
    tooltip = `Setup not fully documented. Missing: ${missingText}. ${countLine}`
    label = "Reproducibility gap"
  } else {
    tooltip = `This score's setup is not fully documented, so it cannot be re-run as-is. ${countLine}`
    label = "Setup not documented"
  }

  return (
    <SignalTooltip content={tooltip}>
      <Badge
        variant="outline"
        className={cn(
          "border-amber-300 bg-amber-50 text-amber-900 dark:border-amber-900/60 dark:bg-amber-950/40 dark:text-amber-100",
          className
        )}
      >
        <AlertTriangle className="h-3 w-3" />
        {label}
      </Badge>
    </SignalTooltip>
  )
}

components/signals/reproducibility-panel.tsx ADDED Viewed

	@@ -0,0 +1,60 @@

+"use client"
+import { AlertTriangle } from "lucide-react"
+import { useAudienceMode } from "@/components/audience-mode-provider"
+import type { ReproducibilityGap } from "@/lib/backend-artifacts"
+import { formatMissingField } from "./signal-utils"
/**
 * Detail panel for a row's reproducibility data: a heading and description
 * (worded per audience mode), the recorded/required setup-field count, and
 * the list of missing fields when there are any. Renders nothing when `gap`
 * is absent.
 */
export function ReproducibilityPanel({
  gap,
}: {
  gap?: ReproducibilityGap | null
}) {
  // The hook must run before any early return.
  const { mode } = useAudienceMode()
  if (!gap) {
    return null
  }

  const isResearchView = mode === "research"
  const heading = isResearchView ? "Reproducibility" : "Re-runnability"
  const description = isResearchView
    ? "Whether the setup is documented well enough for someone else to re-run."
    : "Whether someone could re-run this evaluation with the information available."
  const recorded = `${gap.populated_field_count} of ${gap.required_field_count}`
  const hasMissing = gap.missing_fields.length > 0

  return (
    <div className="rounded-2xl border bg-background/70 p-4">
      <div className="mb-4 flex items-start gap-2">
        <AlertTriangle className="mt-0.5 h-4 w-4 shrink-0 text-amber-600 dark:text-amber-300" />
        <div>
          <div className="font-semibold">{heading}</div>
          <div className="text-sm text-muted-foreground">{description}</div>
        </div>
      </div>
      <div className="space-y-2.5 text-sm">
        <PanelRow label="Setup fields recorded" value={recorded} />
        {hasMissing ? (
          <PanelRow
            label="Missing"
            value={gap.missing_fields.map(formatMissingField).join(", ")}
          />
        ) : null}
      </div>
    </div>
  )
}
/** One label/value line in the panel: fixed-width muted label, wrapping value. */
function PanelRow(props: { label: string; value: string }) {
  const { label, value } = props
  return (
    <div className="flex gap-3">
      <span className="w-32 shrink-0 text-muted-foreground">{label}</span>
      <span className="min-w-0 flex-1 break-words font-medium">{value}</span>
    </div>
  )
}

components/signals/signal-tooltip.tsx ADDED Viewed

	@@ -0,0 +1,31 @@

+"use client"
+import type { ReactNode } from "react"
+import * as TooltipPrimitive from "@radix-ui/react-tooltip"
/**
 * Tooltip wrapper for signal badges. `children` becomes the trigger element
 * (via `asChild`) and `content` is rendered in a portal above it with an
 * arrow. Each instance brings its own provider with a 150 ms open delay.
 */
export function SignalTooltip(props: {
  children: ReactNode
  content: ReactNode
}) {
  const { children, content } = props
  const { Provider, Root, Trigger, Portal, Content, Arrow } = TooltipPrimitive
  const bubbleClass =
    "z-50 max-w-80 rounded-md border border-border/70 bg-popover px-3 py-2 text-xs leading-5 text-popover-foreground shadow-lg"

  return (
    <Provider delayDuration={150}>
      <Root>
        <Trigger asChild>{children}</Trigger>
        <Portal>
          <Content side="top" align="center" sideOffset={8} className={bubbleClass}>
            {content}
            <Arrow className="fill-popover" />
          </Content>
        </Portal>
      </Root>
    </Provider>
  )
}

components/signals/signal-utils.ts ADDED Viewed

	@@ -0,0 +1,105 @@

+import type { DifferingSetupField, ReportingCompleteness } from "@/lib/backend-artifacts"
/** Source prefixes removed from dotted field paths before they are displayed. */
const FIELD_PREFIXES = [
  "autobenchmarkcard.",
  "eee_eval.",
  "evalcards.",
]

/** Lower-cased tokens that are rendered with fixed casing instead of title case. */
const TOKEN_OVERRIDES: Record<string, string> = {
  api: "API",
  ai: "AI",
  eee: "EEE",
  hf: "HF",
  id: "ID",
  llm: "LLM",
  url: "URL",
}

/**
 * Title-cases one path segment: splits on whitespace, underscores and hyphens,
 * drops empty tokens, and joins the results with single spaces. Tokens listed
 * in `TOKEN_OVERRIDES` keep their fixed casing (e.g. "api_url" becomes "API URL").
 */
function titleCaseSegment(segment: string) {
  return segment
    .split(/[\s_-]+/)
    .filter(Boolean)
    .map((token) => {
      const key = token.toLowerCase()
      // Own-property check only: a bare `TOKEN_OVERRIDES[key]` lookup would also
      // find inherited members such as `constructor`, returning a function
      // instead of a label.
      if (Object.prototype.hasOwnProperty.call(TOKEN_OVERRIDES, key)) {
        return TOKEN_OVERRIDES[key]
      }
      return `${token.charAt(0).toUpperCase()}${token.slice(1).toLowerCase()}`
    })
    .join(" ")
}
/**
 * Formats a fraction as a percentage string (0.5 becomes "50%").
 *
 * @param value Fraction to format; `null`, `undefined` and non-finite values yield "N/A".
 * @param digits Number of decimal places, passed to `toFixed`.
 */
export function formatPercent(value: number | null | undefined, digits = 0) {
  return typeof value === "number" && Number.isFinite(value)
    ? `${(value * 100).toFixed(digits)}%`
    : "N/A"
}
/**
 * Formats a signal magnitude compactly.
 *
 * Values with an absolute value of 100 or more are shown with at most one
 * decimal place; smaller values with at most `digits` decimal places. Trailing
 * fractional zeros (and a dangling decimal point) are removed.
 *
 * @param value Number to format; `null`, `undefined` and non-finite values yield "N/A".
 * @param digits Maximum decimal places for values below 100 in magnitude.
 */
export function formatSignalNumber(value: number | null | undefined, digits = 3) {
  if (value == null || !Number.isFinite(value)) {
    return "N/A"
  }
  const fixed = Math.abs(value) >= 100 ? value.toFixed(1) : value.toFixed(digits)
  // Strip zeros only from the fractional part. Without the decimal-point
  // check, an integer string such as "10" (digits = 0) would lose its
  // significant trailing zero and become "1".
  const trimmed = fixed.includes(".") ? fixed.replace(/\.?0+$/, "") : fixed
  // Tiny negative values round to "-0"; show them as plain zero.
  return trimmed === "-0" ? "0" : trimmed
}
/**
 * Converts a dotted field path into a readable label: removes the first
 * matching entry of `FIELD_PREFIXES` from the start of the path, title-cases
 * each remaining dot-separated segment, and joins them with " / ".
 */
export function formatFieldLabel(path: string) {
  // At most one prefix is stripped: the first one in list order that matches.
  const matchedPrefix = FIELD_PREFIXES.find((prefix) => path.startsWith(prefix))
  const remainder = matchedPrefix === undefined ? path : path.slice(matchedPrefix.length)
  const segments = remainder.split(".").filter(Boolean)
  return segments.map(titleCaseSegment).join(" / ")
}
/** Display label for a single setup-field name; title-cases it as one segment. */
export function formatMissingField(field: string) {
  const label = titleCaseSegment(field)
  return label
}
/**
 * Renders an arbitrary signal value as display text. Always returns a string.
 *
 * - `null`/`undefined` become "(unspecified)".
 * - Strings are returned unchanged; numbers, booleans and bigints go through `String`.
 * - Everything else is JSON-encoded, falling back to `String(value)` when
 *   encoding throws or yields no text.
 */
export function formatSignalValue(value: unknown) {
  if (value == null) {
    return "(unspecified)"
  }
  switch (typeof value) {
    case "string":
      return value
    case "number":
    case "boolean":
    case "bigint":
      return String(value)
  }
  try {
    // JSON.stringify returns undefined (not a string) for functions and
    // symbols, so fall back to String() to keep the return type honest.
    return JSON.stringify(value) ?? String(value)
  } catch {
    // JSON.stringify threw (e.g. on a circular structure).
    return String(value)
  }
}
/**
 * Summarizes which setup fields differ: the first `limit` field labels joined
 * by commas, followed by "+N" when more fields remain. An empty list yields
 * the generic phrase "setup fields".
 */
export function formatDifferingFields(fields: DifferingSetupField[], limit = 2) {
  if (fields.length === 0) {
    return "setup fields"
  }
  const shown = fields.slice(0, limit).map((entry) => formatMissingField(entry.field))
  const listed = shown.join(", ")
  const hiddenCount = fields.length - shown.length
  if (hiddenCount > 0) {
    return `${listed} +${hiddenCount}`
  }
  return listed
}
/**
 * Estimates how many documentation fields are populated. When per-field
 * scores are available their sum is rounded; otherwise the count is derived
 * from the overall score times the number of fields evaluated.
 */
export function getCompletenessPopulatedCount(completeness: ReportingCompleteness) {
  const { field_scores: fieldScores } = completeness
  if (fieldScores.length > 0) {
    let total = 0
    for (const entry of fieldScores) {
      total += entry.score
    }
    return Math.round(total)
  }
  return Math.round(completeness.completeness_score * completeness.total_fields_evaluated)
}

components/signals/signals-row-badges.tsx ADDED Viewed

	@@ -0,0 +1,68 @@

+"use client"
+import type { RowAnnotations } from "@/lib/backend-artifacts"
+import { cn } from "@/lib/utils"
+import { CrossPartyDivergenceBadge } from "./cross-party-divergence-badge"
+import { ProvenanceBadge } from "./provenance-badge"
+import { ReproducibilityBadge } from "./reproducibility-badge"
+import { VariantDivergenceBadge } from "./variant-divergence-badge"
/**
 * Renders the signal badges for a single row.
 *
 * The `variant` prop selects which badges may appear:
 * - `"full"` (default): all four signals. Use for single-metric leaderboards,
 *   expanded row panels, and one-off contexts.
 * - `"cell"`: divergence signals only (variant + cross-party). Use inside
 *   multi-metric matrix cells, where reproducibility and provenance are
 *   constant across columns and would only add visual noise.
 * - `"row"`: reproducibility + provenance only, the signals that are constant
 *   per (model, benchmark). Pair with `"cell"` columns so each row shows its
 *   constant signals once at the row header.
 *
 * Returns `null` when there are no annotations or no selected signal has
 * anything to show. With `hideOnMobile` (default true) the badge row is hidden
 * below the `md` breakpoint.
 */
export function SignalsRowBadges({
  annotations,
  className,
  hideOnMobile = true,
  variant = "full",
}: {
  annotations?: RowAnnotations | null
  className?: string
  hideOnMobile?: boolean
  variant?: "full" | "cell" | "row"
}) {
  if (!annotations) {
    return null
  }

  const {
    reproducibility_gap: reproducibilityGap,
    provenance,
    variant_divergence: variantDivergence,
    cross_party_divergence: crossPartyDivergence,
  } = annotations
  const showRowLevel = ["full", "row"].includes(variant)
  const showCellLevel = ["full", "cell"].includes(variant)

  // Decide up front whether any badge will show, so no empty wrapper is emitted.
  const candidates = [
    showRowLevel && reproducibilityGap?.has_reproducibility_gap,
    showRowLevel && Boolean(provenance && provenance.source_type !== "unspecified"),
    showCellLevel && variantDivergence?.has_variant_divergence,
    showCellLevel && crossPartyDivergence?.has_cross_party_divergence,
  ]
  if (!candidates.some(Boolean)) {
    return null
  }

  const wrapperClass = cn(
    "mt-1.5 flex flex-wrap justify-end gap-1.5",
    hideOnMobile && "hidden md:flex",
    className
  )

  return (
    <div className={wrapperClass}>
      {showRowLevel && <ReproducibilityBadge gap={reproducibilityGap} />}
      {showRowLevel && <ProvenanceBadge provenance={provenance} />}
      {showCellLevel && <VariantDivergenceBadge divergence={variantDivergence} />}
      {showCellLevel && <CrossPartyDivergenceBadge divergence={crossPartyDivergence} />}
    </div>
  )
}

components/signals/variant-divergence-badge.tsx ADDED Viewed

	@@ -0,0 +1,46 @@

+"use client"
+import { GitCompareArrows } from "lucide-react"
+import { useAudienceMode } from "@/components/audience-mode-provider"
+import { Badge } from "@/components/ui/badge"
+import type { VariantDivergence } from "@/lib/backend-artifacts"
+import { cn } from "@/lib/utils"
+import { formatDifferingFields, formatSignalNumber } from "./signal-utils"
+import { SignalTooltip } from "./signal-tooltip"
/**
 * Badge shown when scores diverge across different setups. Renders nothing
 * unless `divergence.has_variant_divergence` is truthy. In research mode the
 * tooltip includes the divergence magnitude and the differing setup fields;
 * in other modes it uses plain-language wording.
 */
export function VariantDivergenceBadge({
  divergence,
  className,
}: {
  divergence?: VariantDivergence | null
  className?: string
}) {
  // The hook must run before any early return.
  const { mode } = useAudienceMode()
  if (!divergence?.has_variant_divergence) {
    return null
  }

  const isResearchView = mode === "research"
  const magnitude = formatSignalNumber(divergence.divergence_magnitude)
  const fields = formatDifferingFields(divergence.differing_setup_fields)

  let tooltip: string
  let label: string
  if (isResearchView) {
    tooltip = `Scores diverge by ${magnitude} across different setups: ${fields}.`
    label = "Variant divergence"
  } else {
    tooltip = "Different runs of this evaluation produced different scores, so the setup matters."
    label = "Score depends on setup"
  }

  return (
    <SignalTooltip content={tooltip}>
      <Badge
        variant="outline"
        className={cn(
          "border-rose-300 bg-rose-50 text-rose-900 dark:border-rose-900/60 dark:bg-rose-950/35 dark:text-rose-100",
          className
        )}
      >
        <GitCompareArrows className="h-3 w-3" />
        {label}
      </Badge>
    </SignalTooltip>
  )
}

docs/INTERPRETIVE_SIGNALS.md ADDED Viewed

	@@ -0,0 +1,622 @@

+# EvalCards interpretive signals — frontend implementation spec
+**Status:** ready to implement. Backend ships in `evaleval/eval_cards_backend_pipeline` PR #1 (merged `b05323c`). All field shapes below are stable and covered by the backend's test suite.
+**Companion docs:**
+- Spec source of truth: *EvalCards Interpretive Signals v1.0* (Anka Reuel, Stanford). Section refs (§3, §4, …) below point at that doc.
+- Open backend questions: [evaleval/eval_cards_backend_pipeline#2](https://github.com/evaleval/eval_cards_backend_pipeline/issues/2). None block frontend work — they may shift wording, not shape.
+---
+## 0. What this PR does at a glance
+The backend now annotates evaluation records with four interpretive signals:
+1. **Reproducibility gap** — *per row.* Was the evaluation documented well enough to be re-run? Surfaced as a missing-fields list (e.g. "missing `max_tokens`").
+2. **Reporting completeness** — *per benchmark.* What fraction of EvalCards-required documentation fields are populated? Surfaced as a `[0, 1]` score with a missing-field breakdown.
+3. **Provenance** — *per row.* Who reported this score (first-party / third-party / collaborative / unspecified), and is it the only source for this `(model, benchmark, metric)` group?
+4. **Comparability** — *per `(model, benchmark, metric)` group.* Two flavors: **variant divergence** (same model, same benchmark, different setups → diverging scores) and **cross-party divergence** (different orgs reporting → diverging scores).
+Plus a corpus-level rollup file (`corpus-aggregates.json`) for a stratified analytics page.
+The frontend's job: surface these signals **in three places** — row-level badges, per-eval / per-model summary panels, and a corpus dashboard view.
+---
+## 1. Where the new data lives
+All fields are new additions to existing artifacts. No artifact is removed or reshaped.
+| Artifact | New fields |
+|---|---|
+| `evals/{id}.json` (`HFEvalDetail`) | Per-row `evalcards.annotations` block on every `metrics[].model_results[]` and `subtasks[…].metrics[].model_results[]`. Plus eval-root `evalcards.annotations.reporting_completeness`, `evalcards.annotations.benchmark_comparability`, and three top-level summaries: `reproducibility_summary`, `provenance_summary`, `comparability_summary`. |
+| `models/{id}.json` (`HFModelDetail`) | Per-row `evalcards.annotations` block on every `hierarchy_by_category[*][*].metrics[].model_results[]`. Plus three top-level summaries scoped to that model. |
+| `eval-list.json` / `eval-list-lite.json` (`HFEvalListEntry`) | Three summaries per entry. |
+| `model-cards.json` / `model-cards-lite.json` (`HFModelCardEntry`) | Three summaries per entry. |
+| `eval-hierarchy.json` (`EvalHierarchy`) | Each family node and leaf node carries the three summaries (aggregated over evals under it). |
+| **`corpus-aggregates.json` (NEW FILE)** | Stratified rollups for paper / dashboard use. |
+| `manifest.json` | New entry in `summary_artifacts`: `corpus_aggregates: "corpus-aggregates.json"`. |
+`signal_version` (currently `"1.0"`) is present on every annotation. Treat it as opaque; surface only in admin/debug.
+---
+## 2. TypeScript types to add
+Add to `lib/backend-artifacts.ts` (preferred — these are pipeline contract types):
+```ts
+// Spec §3
+export interface ReproducibilityGap {
+  has_reproducibility_gap: boolean
+  missing_fields: string[]              // e.g. ["max_tokens"]
+  required_field_count: number          // 2 base + 2 if agentic on current runtime
+  populated_field_count: number
+  signal_version: string
+}
+// Spec §5
+export type ProvenanceSourceType =
+  | "first_party"
+  | "third_party"
+  | "collaborative"
+  | "unspecified"
+export interface Provenance {
+  source_type: ProvenanceSourceType
+  is_multi_source: boolean
+  first_party_only: boolean             // see §6.1 below for caveat
+  distinct_reporting_organizations: number
+  signal_version: string
+}
+// Spec §6.1
+export interface VariantDivergence {
+  has_variant_divergence: boolean
+  group_id: string                      // "{model_route_id}__{metric_summary_id}"
+  divergence_magnitude: number
+  threshold_used: number
+  threshold_basis:
+    | "proportion_or_continuous_normalized"
+    | "percent"
+    | "range_5pct"
+    | "fallback_default"
+  differing_setup_fields: Array<{ field: string; values: unknown[] }>
+  scores_in_group: number[]
+  this_triple_score: number | null      // this row's score within the group
+  triple_count_in_group: number
+  score_scale_anomaly: boolean
+  group_variant_breakdown: Array<{ variant_key: string; row_count: number }>
+  signal_version: string
+}
+// Spec §6.2
+export interface CrossPartyDivergence {
+  has_cross_party_divergence: boolean
+  group_id: string
+  divergence_magnitude: number
+  threshold_used: number
+  threshold_basis: VariantDivergence["threshold_basis"]
+  scores_by_organization: Record<string, number>   // display org name → score
+  differing_setup_fields: Array<{ field: string; values: unknown[] }>
+  organization_count: number
+  group_variant_breakdown: Array<{ variant_key: string; row_count: number }>
+  signal_version: string
+}
+// Per-row annotation block (carried on every model_result row)
+export interface RowAnnotations {
+  reproducibility_gap: ReproducibilityGap | null
+  provenance: Provenance | null
+  variant_divergence: VariantDivergence | null
+  cross_party_divergence: CrossPartyDivergence | null
+}
+// Spec §4
+export interface ReportingCompleteness {
+  completeness_score: number            // [0, 1]
+  total_fields_evaluated: number
+  missing_required_fields: string[]     // dotted paths
+  partial_fields: Array<{
+    field_path: string
+    score: number                       // (0, 1) — strictly between
+    populated_subitems: number
+    total_subitems: number
+  }>
+  field_scores: Array<{
+    field_path: string
+    coverage_type: "full" | "partial" | "reserved"
+    score: number                       // [0, 1]
+  }>
+  signal_version: string
+}
+export interface BenchmarkComparability {
+  variant_divergence_groups: Array<{
+    group_id: string
+    model_route_id: string
+    divergence_magnitude: number
+    threshold_used: number
+    threshold_basis: VariantDivergence["threshold_basis"]
+    differing_setup_fields: VariantDivergence["differing_setup_fields"]
+  }>
+  cross_party_divergence_groups: Array<{
+    group_id: string
+    model_route_id: string
+    divergence_magnitude: number
+    threshold_used: number
+    threshold_basis: VariantDivergence["threshold_basis"]
+    scores_by_organization: Record<string, number>
+    differing_setup_fields: VariantDivergence["differing_setup_fields"]
+  }>
+}
+// Eval-root or model-root annotation block
+export interface EvalcardsAnnotations {
+  reporting_completeness?: ReportingCompleteness
+  benchmark_comparability?: BenchmarkComparability
+}
+// Top-level summary blocks (present on eval-list / model-cards / eval / model / hierarchy nodes)
+export interface ReproducibilitySummary {
+  results_total: number
+  has_reproducibility_gap_count: number
+  populated_ratio_avg: number | null    // null when results_total == 0
+}
+export interface ProvenanceSummary {
+  total_results: number
+  total_groups: number
+  multi_source_groups: number
+  first_party_only_groups: number
+  source_type_distribution: Record<ProvenanceSourceType, number>
+}
+export interface ComparabilitySummary {
+  total_groups: number
+  groups_with_variant_check: number     // eligible groups (>=2 rows, differing setups, >=2 scored)
+  groups_with_cross_party_check: number // eligible groups (>=2 named orgs)
+  variant_divergent_count: number
+  cross_party_divergent_count: number
+}
+export interface SignalSummaries {
+  reproducibility_summary?: ReproducibilitySummary
+  provenance_summary?: ProvenanceSummary
+  comparability_summary?: ComparabilitySummary
+}
+// corpus-aggregates.json
+export interface CorpusAggregates {
+  generated_at: string
+  signal_version: string
+  stratification_dimensions: ["category"]
+  reproducibility: Stratified<ReproducibilityCorpusBlock>
+  completeness:   Stratified<CompletenessCorpusBlock>
+  provenance:     Stratified<ProvenanceCorpusBlock>
+  comparability:  Stratified<ComparabilityCorpusBlock>
+}
+export interface Stratified<T> {
+  overall: T
+  by_category: Record<string, T>        // categories: agentic | general | knowledge | reasoning | safety | other
+}
+export interface ReproducibilityCorpusBlock {
+  total_triples: number
+  triples_with_reproducibility_gap: number
+  reproducibility_gap_rate: number | null
+  agentic_triples: number
+  per_field_missingness: Record<string, {
+    missing_count: number
+    missing_rate: number | null
+    denominator: "all_triples" | "agentic_only"
+    denominator_count: number
+  }>
+}
+export interface CompletenessCorpusBlock {
+  total_benchmarks: number
+  completeness_score_mean: number | null
+  completeness_score_median: number | null
+  per_field_population: Record<string, {
+    mean_score: number
+    populated_rate: number
+    fully_populated_rate: number
+    benchmark_count: number
+  }>
+}
+export interface ProvenanceCorpusBlock {
+  total_triples: number
+  total_groups: number
+  multi_source_groups: number
+  multi_source_rate: number | null
+  first_party_only_groups: number
+  first_party_only_rate: number | null
+  source_type_distribution: Record<ProvenanceSourceType, number>
+}
+export interface ComparabilityCorpusBlock {
+  total_groups: number
+  variant_eligible_groups: number
+  variant_divergent_groups: number
+  variant_divergence_rate: number | null
+  cross_party_eligible_groups: number
+  cross_party_divergent_groups: number
+  cross_party_divergence_rate: number | null   // commonly null on current corpus
+}
+```
+Then in `lib/hf-data.ts`:
+- Extend `HFEvalModelResult` (line ~522) with `evalcards?: { annotations?: RowAnnotations }`.
+- Extend `HFEvalDetail` (line ~556) with `evalcards?: { annotations?: EvalcardsAnnotations }` plus the three summary fields from `SignalSummaries`.
+- Extend `HFEvalListEntry` (line ~475) with `SignalSummaries` fields.
+- Extend `HFModelCardEntry` (line ~439) with `SignalSummaries` fields.
+- Extend `HFModelDetail` (line ~571) with `SignalSummaries` fields.
+- Extend `HFModelHierarchyMetric` (line ~616) — `model_results` already typed as `HFEvalModelResult`, so the per-row annotations propagate automatically.
+In `EvalHierarchy` types (`lib/backend-artifacts.ts` line ~54), add `SignalSummaries` to both `HierarchyFamily` and `HierarchyBenchmark`.
+All fields are **optional** at the type level — older cached snapshots won't have them, and the frontend should render gracefully when they're absent.
+---
+## 3. Data plumbing
+### 3.1 New fetcher + API route for corpus aggregates
+In `lib/hf-data.ts`, add after the existing fetchers (~line 866):
+```ts
+export async function fetchCorpusAggregates(): Promise<CorpusAggregates | null> {
+  return fetchHFJsonSafe<CorpusAggregates>("corpus-aggregates.json")
+}
+```
+Add to `scripts/cache-hf-data.mjs` `CACHE_ROOT_FILES` array: `"corpus-aggregates.json"`. (Mark it optional in `OPTIONAL_CACHE_ROOT_FILES` if shipping while the HF dataset upload is still rolling — once the backend pipeline next runs against the dataset, the file will appear.)
+Create `app/api/corpus-aggregates/route.ts`:
+```ts
+import { NextResponse } from "next/server"
+import { fetchCorpusAggregates } from "@/lib/hf-data"
+export async function GET() {
+  const aggregates = await fetchCorpusAggregates()
+  if (!aggregates) {
+    return NextResponse.json({ error: "Corpus aggregates not available" }, { status: 404 })
+  }
+  return NextResponse.json(aggregates)
+}
+```
+### 3.2 Rest of plumbing is automatic
+Existing fetchers (`fetchEvalDetail`, `fetchModelDetail`, `fetchEvalList`, `fetchModelCardsList`, `fetchEvalHierarchy`) just pull the raw JSON, so the new fields propagate without code changes once the types above are widened.
+---
+## 4. UX components to build
+Build a small set of reusable signal components in `components/signals/`. Each takes one of the typed shapes above and renders a badge / panel. This keeps signal rendering consistent across `eval-detail.tsx`, `benchmark-detail.tsx`, `model-compare-dialog.tsx`, and the new corpus dashboard.
+```
+components/signals/
+├── reproducibility-badge.tsx
+├── provenance-badge.tsx          // already partially exists in benchmark-detail.tsx — see §4.2
+├── variant-divergence-badge.tsx
+├── cross-party-divergence-badge.tsx
+├── reproducibility-panel.tsx      // detail view — full missing-fields list
+├── completeness-panel.tsx         // detail view — score bar + missing-field list
+├── comparability-panel.tsx        // detail view — divergent groups list
+├── signals-row-badges.tsx         // composite: renders all four row-level badges with proper spacing
+└── signal-tooltip.tsx             // shared tooltip primitive
+```
+All badges should follow the existing tone conventions used by `getRelationshipBadgeTone` ([components/benchmark-detail.tsx:289](../components/benchmark-detail.tsx#L289)) and the `Badge` primitive in [components/ui/badge.tsx](../components/ui/badge.tsx).
+### 4.1 Row-level badges — placement
+Insert `<SignalsRowBadges annotations={modelResult.evalcards?.annotations} />` next to the score cell in:
+- **Eval detail leaderboard table** — [components/eval-detail.tsx:869-871](../components/eval-detail.tsx#L869-L871) (the `<TableCell className="text-right">` containing the score). Render badges below the score on a new line for desktop, hidden on mobile.
+- **Benchmark detail rows** — `components/benchmark-detail.tsx` renders score rows in several places (search for `formatRawScoreValue`); insert the same component.
+- **Model compare dialog** — [components/model-compare-dialog.tsx](../components/model-compare-dialog.tsx) score columns.
+**Display rules — only badge for actionable states.** Silence is meaningful here.
+| Signal | Show badge when | Hide when |
+|---|---|---|
+| Reproducibility | `has_reproducibility_gap === true` | gap=false, or annotation absent |
+| Provenance | `source_type` ∈ {`first_party`, `third_party`, `collaborative`} | `source_type === "unspecified"` |
+| Variant divergence | `variant_divergence !== null && has_variant_divergence === true` | null (not applicable) or false (checked, fine) |
+| Cross-party divergence | `cross_party_divergence !== null && has_cross_party_divergence === true` | null (almost always on current corpus) or false |
+`has_*: false` means "we checked and it's fine" — silent success. `null` means "not applicable / not enough data" — also silent. **Only divergent / gap-positive states warrant pixels.**
+**Dedup rule.** `variant_divergence` and `cross_party_divergence` are duplicated onto every row in the same group. If you render three rows from the same `group_id`, render the divergence badge on each row but the *expanded panel* (§4.4) only once at the group header.
+### 4.2 Provenance badge — reuse what's there
+[components/benchmark-detail.tsx:262-302](../components/benchmark-detail.tsx#L262-L302) already has `getRelationshipShortLabel` and `getRelationshipBadgeTone`. Extract these into `components/signals/provenance-badge.tsx` and import back into `benchmark-detail.tsx`. The new badge should **also** consume the new `Provenance` annotation when present (it carries `is_multi_source` and `first_party_only`, which the current implementation derives row-by-row from `source_metadata` alone).
+When `provenance.first_party_only === true`, show a small ⚠ subtle indicator on the first-party badge ("first-party only — no independent replication"). This is the headline use of the signal for policy-mode readers.
+### 4.3 Reproducibility badge — content rules
+Tooltip content depends on audience mode (`useAudienceMode()` from [components/audience-mode-provider.tsx:40](../components/audience-mode-provider.tsx#L40)):
+- Research mode: "Setup not fully documented. Missing: `max_tokens`, `eval_plan`."
+- Policy mode: "This score's setup isn't fully documented, so it can't be re-run as-is."
+Always include the count "{populated_field_count} of {required_field_count} setup fields recorded." Don't hardcode "4 fields" — the active runtime checks 2 base fields (`temperature`, `max_tokens`) plus 2 agentic fields (`eval_plan`, `eval_limits`) when the benchmark is agentic. Read counts off the annotation.
+### 4.4 Detail panels — placement
+#### Reproducibility panel
+The existing "Evaluation Provenance" panel in [components/eval-detail.tsx:952-998](../components/eval-detail.tsx#L952-L998) (rendered when a row is expanded) is the right place for the **per-row** reproducibility breakdown. Add a new `DetailPanel` adjacent to it:
+```tsx
+{rowAnnotations?.reproducibility_gap && (
+  <DetailPanel
+    title={isResearchView ? "Reproducibility" : "Re-runnability"}
+    subtitle={
+      isResearchView
+        ? "Whether the setup is documented well enough for someone else to re-run."
+        : "Whether someone could re-run this evaluation with the information available."
+    }
+  >
+    <MetaRow
+      label="Setup fields recorded"
+      value={`${rowAnnotations.reproducibility_gap.populated_field_count} of ${rowAnnotations.reproducibility_gap.required_field_count}`}
+    />
+    {rowAnnotations.reproducibility_gap.missing_fields.length > 0 && (
+      <MetaRow
+        label="Missing"
+        value={rowAnnotations.reproducibility_gap.missing_fields.join(", ")}
+      />
+    )}
+  </DetailPanel>
+)}
+```
+#### Completeness panel
+Render at the **eval-detail header level** (above the leaderboard, below the metric specification card). New `<CompletenessPanel completeness={detail.evalcards?.annotations?.reporting_completeness} />`. UI: progress bar showing `completeness_score`, label "{N} of {M} fields populated" where N = sum of `field_scores[].score` rounded, M = `total_fields_evaluated`. Below: collapsible accordions:
+- **Missing required fields** (count badge) — list of `missing_required_fields` with friendly labels (see §6.3 for label mapping).
+- **Partially populated** (count badge) — `partial_fields` rendered as "{field}: {populated_subitems}/{total_subitems}".
+In policy mode, don't show the dotted-path field names — show friendly labels only. In research mode, show both.
+#### Comparability panel
+Also at eval-detail header level. Sourced from `detail.evalcards?.annotations?.benchmark_comparability`. Render as two collapsibles — "Variant divergence ({count})" and "Cross-party divergence ({count})". Each item should link to the relevant model row (use `model_route_id` from each group entry as anchor — add `id={"row-" + model_route_id}` on the leaderboard row).
+When both arrays are empty, hide the panel entirely. When `comparability_summary.groups_with_cross_party_check === 0` (the common state), surface a small note: "No third-party reports available for cross-party comparison."
+### 4.5 Per-eval header chips
+On the eval-detail page header (next to existing "Measures" / "Source dataset" chips around [components/eval-detail.tsx:486-525](../components/eval-detail.tsx#L486-L525)), add a fourth chip when `evalcards.annotations.reporting_completeness` is present:
+> **Documentation**
+> {round(completeness_score * 100)}%
+Tooltip: "{N} of {M} EvalCards documentation fields populated for this benchmark."
+### 4.6 Per-model card chips
+On `components/eval-card.tsx` and the model card pages, add three chips driven by the model-level summaries. Replace the hand-written hint at [components/eval-card.tsx:250](../components/eval-card.tsx#L250) ("Some results lack generation settings; compare scores with care.") with a data-driven version:
+> {has_reproducibility_gap_count} of {results_total} reported scores aren't fully documented.
+Show only when `has_reproducibility_gap_count > 0`. The hand-written hint was a placeholder for exactly this signal — wire it up.
+---
+## 5. New page: corpus dashboard
+Add `app/corpus/page.tsx` (linked from main navigation [components/navigation.tsx](../components/navigation.tsx)). Server component that calls `fetchCorpusAggregates()` and renders four sections:
+### 5.1 Reproducibility section
+- Headline number: `reproducibility_gap_rate` rendered as percentage. Sub-label: "{triples_with_reproducibility_gap} of {total_triples} reported scores."
+- Per-field horizontal bar chart from `per_field_missingness`. **Bar denominator depends on `denominator` field**: agentic-only fields use `agentic_triples`, others use `total_triples`. Label each bar with the denominator type so users understand.
+- Toggle: `overall` ↔ `by_category` (rendered as a small-multiple grid, one panel per category).
+### 5.2 Completeness section
+- Headline: `completeness_score_mean` (and median) across `total_benchmarks`.
+- Histogram of per-benchmark scores (pull individual benchmark scores from `eval-list.json` `reporting_completeness.completeness_score`, since corpus-aggregates only carries mean/median).
+- Per-field bar chart from `per_field_population` — three bars per field: `mean_score`, `populated_rate`, `fully_populated_rate`. (See §6.7 for which one to highlight per coverage type.)
+### 5.3 Provenance section
+- Stacked bar of `source_type_distribution` (across all triples).
+- Two ratios: `multi_source_rate`, `first_party_only_rate`. Label both: "% of (model, benchmark, metric) groups."
+### 5.4 Comparability section
+- Two side-by-side panels: Variant divergence (eligible-aware rate) and Cross-party divergence (often null).
+- **When `cross_party_divergence_rate === null`:** show a "Not enough multi-org coverage to compute" empty state, not "0%". Same for `variant_divergence_rate === null`. This is critical — see §6.5.
+All sections support a category toggle (research mode shows category breakdowns by default; policy mode shows overall by default).
+---
+## 6. Caveats and edge cases (read these before implementing)
+### 6.1 `first_party_only` semantics
+A row can be `first_party_only: true` even when `is_multi_source: false`. The spec literal: a group with one *named* org reporting first-party gets the badge. **Don't read it as "exclusive coverage"** — read it as "no independent replication." The label suggestion is "First-party only" rather than "Sole source."
+If `distinct_reporting_organizations === 0` (all rows have null org), `first_party_only` is `false` even when `source_type === "first_party"`. Render the row's source as "First-party (org unspecified)" in research mode; suppress the first-party-only badge.
+### 6.2 Active reproducibility field set is reduced
+The spec describes four base fields (`temperature`, `top_p`, `max_tokens`, `prompt_template`); the active backend currently checks **only `temperature` and `max_tokens`** plus `eval_plan` / `eval_limits` for agentic benchmarks. **Don't hardcode "4 fields" anywhere.** Always read `required_field_count` off the annotation. This is a deliberate spec-author choice and may revert; the field count is the only stable interface.
+### 6.3 Missing-field path strings
+`missing_fields` for reproducibility uses bare names (e.g. `"max_tokens"`). `missing_required_fields` for completeness uses dotted paths (e.g. `"autobenchmarkcard.methodology.baseline_results"`). Different conventions, intentional. Build a small label map for completeness paths — paths come from [registry/completeness_fields.json](https://github.com/evaleval/eval_cards_backend_pipeline/blob/main/registry/completeness_fields.json) on the backend repo. Suggested label rules:
+- Drop the `autobenchmarkcard.` / `eee_eval.` / `evalcards.` prefix.
+- Replace dots with " / " and underscores with spaces, then capitalize the first letter of each segment.
+- Example: `autobenchmarkcard.methodology.baseline_results` → "Methodology / Baseline results".
+### 6.4 `differing_setup_fields[].values` may contain null and mixed types
+Per spec §6.1.4, `null` is a *distinct* value from any explicit setting (comparing "explicit 2048" to "unspecified" is meaningful). Render `null` as "(unspecified)" rather than the string "null". Numeric, string, boolean, and object values can all appear in the same array; render with `JSON.stringify` for objects, plain text otherwise.
+### 6.5 `null` rates in comparability are *not* zero
+Eligibility-aware denominators mean `variant_divergence_rate` and `cross_party_divergence_rate` are `null` when no groups were eligible. **Render as "N/A — not enough data" or an empty-state card, never as "0%".** On the current corpus, `cross_party_divergence_rate` will commonly be null (third-party reports are sparse). Treat this as a normal state, not a data-loading error.
+### 6.6 Score-scale anomaly flag
+`variant_divergence.score_scale_anomaly === true` indicates the metric was declared `proportion` but scores fell outside [0, 1] — usually a metric-normalization bug upstream. Surface as a small "data quality warning" annotation alongside the divergence number; the divergence is still computed but the threshold may not be apples-to-apples.
+### 6.7 `mean_score` vs `populated_rate` for completeness
+Per-field aggregates expose three numbers. Pick which to display based on `coverage_type`:
+- **`full` and `reserved` fields** — `mean_score` and `populated_rate` are equal. Show one number labeled "% of benchmarks populating this field."
+- **`partial` fields** — they diverge. `populated_rate` = % of benchmarks with *any* sub-item; `mean_score` = average sub-item population fraction. Show both: "{populated_rate}% have any data, {mean_score}% on average across sub-items."
+### 6.8 No `computed_at` on per-record annotations
+Only `signal_version` is on each annotation. For "last computed" UI text, use `manifest.json → generated_at` from the existing `BackendManifest`.
+### 6.9 Stratification categories
+`by_category` keys are: `agentic`, `general`, `knowledge`, `reasoning`, `safety`, `other`. Same set as the existing `category` field on evals — reuse whatever color scheme is currently keyed off `inferCategoryFromBenchmark` ([lib/benchmark-schema.ts](../lib/benchmark-schema.ts)).
+### 6.10 Annotation block can be `null` or absent
+`evalcards.annotations.{reproducibility_gap,provenance,variant_divergence,cross_party_divergence}` can each be `null` independently, and the entire `evalcards` block may be absent on older cached snapshots. Use optional chaining everywhere; never assume presence. The `RowAnnotations` type intentionally types each subfield as `T | null` (not `T | undefined`) because the backend writes explicit `null`.
+---
+## 7. Suggested implementation order
+1. **Types + plumbing** (1–2 hours): types in `backend-artifacts.ts` + `hf-data.ts`, the `fetchCorpusAggregates` fetcher, the API route, and adding `corpus-aggregates.json` to the cache script. No UI yet.
+2. **Row-level badges** (½ day): build `signals/` directory with the four badge components, the dedup-aware `signals-row-badges.tsx`, and wire into eval-detail and benchmark-detail. This is the most visible win.
+3. **Per-eval completeness panel + comparability panel** (½ day): single benchmark, easy to design around. New `CompletenessPanel` is the headline new UX in this set.
+4. **Per-row reproducibility detail panel** (1–2 hours): drops into the existing expanded row layout.
+5. **Per-eval / per-model header chips + replace the hand-written gap hint** (1–2 hours): wires the summary fields into existing card surfaces.
+6. **Corpus dashboard page** (1–2 days): new route, new components, biggest scope. Defer until 1–5 are live and reviewed.
+Each step is independently shippable. Steps 1–5 can land before the corpus dashboard is designed.
+---
+## 8. Out of scope (don't do these yet)
+- **Filter / sort the eval list by signal state** ("show only benchmarks with completeness > 0.5"). Wait for the dashboard view to land first; users will tell us which filters they actually want.
+- **Side-by-side score comparison with divergence overlay.** The data supports it (`scores_in_group`, `scores_by_organization`) but the design space is large. Hold off until we see the row-level badges in use.
+- **Recompute / verification UI for missing reproducibility fields.** Backend-side; out of scope here.
+- **Per-instance sample-level badges.** Signals operate at row / benchmark level; sample-level instance data is unaffected.
+---
+## 9. Reference: minimal real-shape examples
+Per-row `evalcards.annotations` with all four signals populated:
+```jsonc
+{
+  "reproducibility_gap": {
+    "has_reproducibility_gap": true,
+    "missing_fields": ["max_tokens"],
+    "required_field_count": 2,
+    "populated_field_count": 1,
+    "signal_version": "1.0"
+  },
+  "provenance": {
+    "source_type": "first_party",
+    "is_multi_source": false,
+    "first_party_only": true,
+    "distinct_reporting_organizations": 1,
+    "signal_version": "1.0"
+  },
+  "variant_divergence": null,
+  "cross_party_divergence": null
+}
+```
+Per-eval `evalcards.annotations` with completeness + comparability:
+```jsonc
+{
+  "reporting_completeness": {
+    "completeness_score": 0.62,
+    "total_fields_evaluated": 28,
+    "missing_required_fields": [
+      "autobenchmarkcard.methodology.baseline_results",
+      "autobenchmarkcard.methodology.validation",
+      "evalcards.preregistration_url"
+    ],
+    "partial_fields": [
+      { "field_path": "autobenchmarkcard.data", "score": 0.5, "populated_subitems": 2, "total_subitems": 4 }
+    ],
+    "field_scores": [/* 28 entries */],
+    "signal_version": "1.0"
+  },
+  "benchmark_comparability": {
+    "variant_divergence_groups": [
+      {
+        "group_id": "openai__gpt-5__hfopenllm_v2_bbh_accuracy",
+        "model_route_id": "openai__gpt-5",
+        "divergence_magnitude": 0.12,
+        "threshold_used": 0.05,
+        "threshold_basis": "proportion_or_continuous_normalized",
+        "differing_setup_fields": [
+          { "field": "max_tokens", "values": [2048, 4096, 8192] }
+        ]
+      }
+    ],
+    "cross_party_divergence_groups": []
+  }
+}
+```
+Top-level `provenance_summary` example:
+```jsonc
+{
+  "total_results": 142,
+  "total_groups": 47,
+  "multi_source_groups": 3,
+  "first_party_only_groups": 30,
+  "source_type_distribution": {
+    "first_party": 120,
+    "third_party": 18,
+    "collaborative": 0,
+    "unspecified": 4
+  }
+}
+```
+`corpus-aggregates.json` structure (top of file):
+```jsonc
+{
+  "generated_at": "2026-04-27T...",
+  "signal_version": "1.0",
+  "stratification_dimensions": ["category"],
+  "reproducibility": { "overall": {/* ReproducibilityCorpusBlock */}, "by_category": { "agentic": {...}, "general": {...}, ... } },
+  "completeness":   { "overall": {/* CompletenessCorpusBlock */},   "by_category": {...} },
+  "provenance":     { "overall": {/* ProvenanceCorpusBlock */},     "by_category": {...} },
+  "comparability":  { "overall": {/* ComparabilityCorpusBlock */},  "by_category": {...} }
+}
+```
+---
+## 10. Audience-mode wording cheatsheet
+| Element | Research mode | Policy mode |
+|---|---|---|
+| Reproducibility gap badge | "Reproducibility gap" | "Setup not documented" |
+| Reproducibility tooltip | "Setup not fully documented. Missing: {fields}." | "This score's setup isn't fully documented, so it can't be re-run as-is." |
+| Reproducibility panel title | "Reproducibility" | "Re-runnability" |
+| Completeness chip label | "Documentation" | "Documentation" |
+| Completeness panel title | "Reporting completeness" | "How well is this benchmark documented?" |
+| Provenance: first-party | "1st party" | "Reported by model developer" |
+| Provenance: first-party only | "1st party only — no replication" | "Only the model developer reported this score" |
+| Provenance: third-party | "3rd party" | "Independently reported" |
+| Provenance: collaborative | "Collaborative" | "Joint report" |
+| Variant divergence badge | "Variant divergence" | "Score depends on setup" |
+| Variant divergence tooltip | "Scores diverge by {magnitude} across different setups: {fields}." | "Different runs of this evaluation produced different scores — the setup matters." |
+| Cross-party divergence badge | "Cross-party divergence" | "Sources disagree" |
+| Cross-party divergence tooltip | "Reports diverge by {magnitude} across organizations." | "Different organizations reported different scores for this same model on this same benchmark." |
+Adjust tone but keep the underlying numbers identical across modes — the data is the same, only the framing changes.
+---
+*Last updated 2026-04-27. Maintainer: backend pipeline (eval_cards_backend_pipeline), frontend (general-eval-card). Questions on backend semantics → [eval_cards_backend_pipeline#2](https://github.com/evaleval/eval_cards_backend_pipeline/issues/2). Questions on UX → discuss with @anka-evals + frontend team.*

lib/backend-artifacts.ts CHANGED Viewed

@@ -2,6 +2,10 @@ export interface BackendManifest {
   generated_at: string
   config_version: number
   skipped_configs: string[]
 }
 export interface BackendManifestStatus {
@@ -14,6 +18,209 @@ export interface BackendManifestStatus {
   pendingRefreshCount: number
 }
 export interface HierarchyTags {
   domains: string[]
   languages: string[]
@@ -32,16 +239,17 @@ export interface HierarchySlice {
   metrics: HierarchyMetric[]
 }
-export interface HierarchyBenchmark {
   key: string
   display_name: string
   has_card: boolean
   tags: HierarchyTags
   slices: HierarchySlice[]
   metrics: HierarchyMetric[]
 }
-export interface HierarchyComposite {
   key: string
   display_name: string
   has_card: boolean
@@ -51,17 +259,32 @@ export interface HierarchyComposite {
   summary_eval_ids?: string[]
 }
-export interface HierarchyFamily {
   key: string
   display_name: string
-  has_card: boolean
   category: string
-  tags: HierarchyTags
   standalone_benchmarks?: HierarchyBenchmark[]
   composites?: HierarchyComposite[]
   benchmarks?: HierarchyBenchmark[]
   slices?: HierarchySlice[]
   metrics?: HierarchyMetric[]
 }
 export interface EvalHierarchyStats {
@@ -75,7 +298,7 @@ export interface EvalHierarchyStats {
 }
 export interface EvalHierarchy {
-  stats: EvalHierarchyStats
   families: HierarchyFamily[]
 }
@@ -159,4 +382,4 @@ export interface ComparisonIndex {
   metric_group_order: MetricGroup[]
   evals: Record<string, ComparisonEvalEntry>
   by_model: Record<string, Record<string, Record<string, ComparisonByModelEntry>>>
-}

   generated_at: string
   config_version: number
   skipped_configs: string[]
+  summary_artifacts?: {
+    corpus_aggregates?: string
+    [key: string]: string | undefined
+  }
 }
 export interface BackendManifestStatus {
   pendingRefreshCount: number
 }
+// ---------------------------------------------------------------------------
+// EvalCards interpretive signals v1.0
+// ---------------------------------------------------------------------------
+export interface ReproducibilityGap { // Reproducibility signal carried per result row (member of RowAnnotations).
+  has_reproducibility_gap: boolean // flag behind the "Reproducibility gap" / "Setup not documented" wording in docs/INTERPRETIVE_SIGNALS.md §10
+  missing_fields: string[] // field names interpolated into the tooltip text "Missing: {fields}"
+  required_field_count: number
+  populated_field_count: number // NOTE(review): presumably <= required_field_count — confirm against the backend pipeline
+  signal_version: string // version tag of the signal definition, as emitted by the backend
+}
+export type ProvenanceSourceType = // Who reported a score; UI labels below are from docs/INTERPRETIVE_SIGNALS.md §10 (research / policy mode).
+  | "first_party" // "1st party" / "Reported by model developer"
+  | "third_party" // "3rd party" / "Independently reported"
+  | "collaborative" // "Collaborative" / "Joint report"
+  | "unspecified" // no label is listed for this value in the wording cheatsheet
+export interface Provenance { // Provenance signal carried per result row (member of RowAnnotations).
+  source_type: ProvenanceSourceType
+  is_multi_source: boolean
+  first_party_only: boolean // cheatsheet wording: "1st party only — no replication"
+  distinct_reporting_organizations: number
+  signal_version: string
+}
+export type DivergenceThresholdBasis = // Label stored alongside `threshold_used` on divergence signals. NOTE(review): the meaning of each value is defined by the backend pipeline and is not visible here.
+  | "proportion_or_continuous_normalized"
+  | "percent"
+  | "range_5pct"
+  | "fallback_default"
+export interface DifferingSetupField { // One entry of `differing_setup_fields` on the divergence signals.
+  field: string // name of the setup field
+  values: unknown[] // presumably the distinct values observed for that field — element type is not constrained here
+}
+export interface VariantDivergence { // Per-row signal for the "Variant divergence" / "Score depends on setup" badge (docs/INTERPRETIVE_SIGNALS.md §10).
+  has_variant_divergence: boolean
+  group_id: string
+  divergence_magnitude: number // the {magnitude} in "Scores diverge by {magnitude} across different setups: {fields}."
+  threshold_used: number
+  threshold_basis: DivergenceThresholdBasis
+  differing_setup_fields: DifferingSetupField[] // the {fields} in the tooltip text above
+  scores_in_group: number[]
+  this_triple_score: number | null // nullable by type; NOTE(review): confirm when the backend emits null
+  triple_count_in_group: number
+  score_scale_anomaly: boolean
+  group_variant_breakdown: Array<{ variant_key: string; row_count: number }>
+  signal_version: string
+}
+export interface CrossPartyDivergence { // Per-row signal for the "Cross-party divergence" / "Sources disagree" badge (docs/INTERPRETIVE_SIGNALS.md §10).
+  has_cross_party_divergence: boolean
+  group_id: string
+  divergence_magnitude: number // the {magnitude} in "Reports diverge by {magnitude} across organizations."
+  threshold_used: number
+  threshold_basis: DivergenceThresholdBasis
+  scores_by_organization: Record<string, number> // presumably keyed by reporting organization name — confirm key format
+  differing_setup_fields: DifferingSetupField[]
+  organization_count: number
+  group_variant_breakdown: Array<{ variant_key: string; row_count: number }>
+  signal_version: string
+}
+export interface RowAnnotations { // Row-level signals; carried as `evalcards.annotations` on EvaluationResult / HFEvalModelResult and per column in `annotations_by_metric`.
+  reproducibility_gap: ReproducibilityGap | null // each member is required but may be null
+  provenance: Provenance | null
+  variant_divergence: VariantDivergence | null
+  cross_party_divergence: CrossPartyDivergence | null
+}
+export interface ReportingCompleteness { // Benchmark-level documentation signal (member of EvalcardsAnnotations); panel title "Reporting completeness" in docs §10.
+  completeness_score: number // NOTE(review): scale (0–1 vs 0–100) is not visible here — confirm before formatting
+  total_fields_evaluated: number
+  missing_required_fields: string[]
+  partial_fields: Array<{ // fields that are only partly populated, with sub-item counts
+    field_path: string
+    score: number
+    populated_subitems: number
+    total_subitems: number
+  }>
+  field_scores: Array<{ // one score entry per evaluated field path
+    field_path: string
+    coverage_type: "full" | "partial" | "reserved"
+    score: number
+  }>
+  signal_version: string
+}
+export interface BenchmarkComparability { // Benchmark-level list of divergence groups (member of EvalcardsAnnotations).
+  variant_divergence_groups: Array<{ // subset of the VariantDivergence fields plus the model the group belongs to
+    group_id: string
+    model_route_id: string
+    divergence_magnitude: number
+    threshold_used: number
+    threshold_basis: DivergenceThresholdBasis
+    differing_setup_fields: DifferingSetupField[]
+  }>
+  cross_party_divergence_groups: Array<{ // subset of the CrossPartyDivergence fields plus the model the group belongs to
+    group_id: string
+    model_route_id: string
+    divergence_magnitude: number
+    threshold_used: number
+    threshold_basis: DivergenceThresholdBasis
+    scores_by_organization: Record<string, number>
+    differing_setup_fields: DifferingSetupField[]
+  }>
+}
+export interface EvalcardsAnnotations { // Benchmark-level signals; carried as `evalcards.annotations` on BenchmarkEvaluation, BenchmarkEvalSummary, HFEvalListEntry and HFEvalDetail.
+  reporting_completeness?: ReportingCompleteness // both members are optional, so consumers must handle their absence
+  benchmark_comparability?: BenchmarkComparability
+}
+export interface ReproducibilitySummary { // Rolled-up reproducibility counts (member of SignalSummaries).
+  results_total: number
+  has_reproducibility_gap_count: number // presumably the number of results whose has_reproducibility_gap is true — confirm
+  populated_ratio_avg: number | null // nullable by type; NOTE(review): presumably null when there are no results
+}
+export interface ProvenanceSummary { // Rolled-up provenance counts (member of SignalSummaries).
+  total_results: number
+  total_groups: number
+  multi_source_groups: number
+  first_party_only_groups: number
+  source_type_distribution: Record<ProvenanceSourceType, number> // typed as having a count for every source type; NOTE(review): confirm the backend always emits all four keys
+}
+export interface ComparabilitySummary { // Rolled-up comparability counts (member of SignalSummaries).
+  total_groups: number
+  groups_with_variant_check: number
+  groups_with_cross_party_check: number
+  variant_divergent_count: number
+  cross_party_divergent_count: number
+}
+export interface SignalSummaries { // Mixin of optional roll-ups; extended by the hierarchy nodes, ModelSummaryCore, BenchmarkEvalSummary and the HF list/detail entry types.
+  reproducibility_summary?: ReproducibilitySummary // all three are optional, so older payloads without signals still type-check
+  provenance_summary?: ProvenanceSummary
+  comparability_summary?: ComparabilitySummary
+}
+export interface CorpusAggregates { // Shape that "corpus-aggregates.json" is parsed as by fetchCorpusAggregates.
+  generated_at: string
+  signal_version: string
+  stratification_dimensions: ["category"] // one-element tuple type: "category" is the only stratification modelled here
+  reproducibility: Stratified<ReproducibilityCorpusBlock> // each signal has an `overall` block plus a `by_category` breakdown
+  completeness: Stratified<CompletenessCorpusBlock>
+  provenance: Stratified<ProvenanceCorpusBlock>
+  comparability: Stratified<ComparabilityCorpusBlock>
+}
+export interface Stratified<T> { // Wrapper pairing a corpus-wide block with a per-category breakdown of the same block type.
+  overall: T
+  by_category: Record<string, T> // presumably keyed by category name — confirm key format against the backend
+}
+export interface ReproducibilityCorpusBlock { // Reproducibility block used inside Stratified<…> on CorpusAggregates.
+  total_triples: number
+  triples_with_reproducibility_gap: number
+  reproducibility_gap_rate: number | null // nullable by type; NOTE(review): presumably null when total_triples is 0
+  agentic_triples: number
+  per_field_missingness: Record<string, { // presumably keyed by setup field name — confirm
+    missing_count: number
+    missing_rate: number | null
+    denominator: "all_triples" | "agentic_only" // which population denominator_count refers to
+    denominator_count: number
+  }>
+}
+export interface CompletenessCorpusBlock { // Completeness block used inside Stratified<…> on CorpusAggregates.
+  total_benchmarks: number
+  completeness_score_mean: number | null // nullable by type; NOTE(review): presumably null when total_benchmarks is 0
+  completeness_score_median: number | null
+  per_field_population: Record<string, { // presumably keyed by field path — confirm
+    mean_score: number
+    populated_rate: number
+    fully_populated_rate: number
+    benchmark_count: number
+  }>
+}
+export interface ProvenanceCorpusBlock { // Provenance block used inside Stratified<…> on CorpusAggregates.
+  total_triples: number
+  total_groups: number
+  multi_source_groups: number
+  multi_source_rate: number | null // nullable by type; NOTE(review): presumably null when total_groups is 0
+  first_party_only_groups: number
+  first_party_only_rate: number | null
+  source_type_distribution: Record<ProvenanceSourceType, number>
+}
+export interface ComparabilityCorpusBlock { // Comparability block used inside Stratified<…> on CorpusAggregates.
+  total_groups: number
+  variant_eligible_groups: number
+  variant_divergent_groups: number
+  variant_divergence_rate: number | null // nullable by type; NOTE(review): presumably null when no group is eligible
+  cross_party_eligible_groups: number
+  cross_party_divergent_groups: number
+  cross_party_divergence_rate: number | null
+}
 export interface HierarchyTags {
   domains: string[]
   languages: string[]
   metrics: HierarchyMetric[]
 }
+export interface HierarchyBenchmark extends SignalSummaries { // Benchmark node of the legacy hierarchy tree; also the shape adaptEvalHierarchy synthesizes from leaves.
   key: string
   display_name: string
   has_card: boolean
   tags: HierarchyTags
   slices: HierarchySlice[]
   metrics: HierarchyMetric[]
+  summary_eval_ids?: string[] // for synthesized entries this is copied from the leaf's `eval_summary_ids`
 }
+export interface HierarchyComposite extends SignalSummaries {
   key: string
   display_name: string
   has_card: boolean
   summary_eval_ids?: string[]
 }
+export interface HierarchyLeaf extends SignalSummaries { // Leaf of the newer 2-level hierarchy (family → leaf); carries no slices or metrics of its own.
   key: string
   display_name: string
   category: string
+  evals_count?: number
+  eval_summary_ids?: string[] // becomes `summary_eval_ids` when a leaf is adapted into a HierarchyBenchmark
+  tags?: Partial<HierarchyTags> // any missing tag list is defaulted to [] by adaptEvalHierarchy
+  has_card?: boolean // defaulted to false by adaptEvalHierarchy when absent
+}
+export interface HierarchyFamily extends SignalSummaries { // Top-level hierarchy node; may arrive in the legacy nested shape or the newer leaf shape.
+  key: string
+  display_name: string
+  has_card?: boolean
+  category: string
+  tags?: Partial<HierarchyTags> // optional here; adaptEvalHierarchy only fills defaults on families it converts from leaves
+  evals_count?: number
+  eval_summary_ids?: string[]
+  // Legacy nested shape (composites + standalone benchmarks)
   standalone_benchmarks?: HierarchyBenchmark[]
   composites?: HierarchyComposite[]
   benchmarks?: HierarchyBenchmark[]
   slices?: HierarchySlice[]
   metrics?: HierarchyMetric[]
+  // Newer 2-level shape (family → leaf)
+  leaves?: HierarchyLeaf[] // ignored by adaptEvalHierarchy when any legacy list above is non-empty
 }
 export interface EvalHierarchyStats {
 }
 export interface EvalHierarchy {
+  stats?: EvalHierarchyStats
   families: HierarchyFamily[]
 }
   metric_group_order: MetricGroup[]
   evals: Record<string, ComparisonEvalEntry>
   by_model: Record<string, Record<string, Record<string, ComparisonByModelEntry>>>
+}

lib/benchmark-schema.ts CHANGED Viewed

@@ -3,6 +3,8 @@
  * Based on the evalevalai.com schema structure
  */
 export interface BenchmarkEvaluation {
   schema_version: string
   eval_summary_id?: string
@@ -31,6 +33,7 @@ export interface BenchmarkEvaluation {
   generation_config?: GenerationConfig
   evaluation_results: EvaluationResult[]
   detailed_evaluation_results_per_samples?: SampleResult[]
 }
 export interface EvalLibrary {
@@ -96,6 +99,7 @@ export interface EvaluationResult {
   score_details: ScoreDetails
   detailed_evaluation_results_url?: string
   generation_config?: GenerationConfig
 }
 export interface MetricConfig {
@@ -208,7 +212,7 @@ export function inferCategoryFromBenchmark(benchmarkName: string): CategoryType
 /**
  * Aggregate evaluations by model
  */
-export interface ModelSummaryCore {
   model_info: ModelInfo
   evaluations_by_category: Record<CategoryType, BenchmarkEvaluation[]>
   total_evaluations: number
@@ -275,6 +279,9 @@ export interface EvaluationCardData {
     max: number
     average: number | null
   }
   // Quick stats
   top_scores: Array<{

  * Based on the evalevalai.com schema structure
  */
+import type { EvalcardsAnnotations, RowAnnotations, SignalSummaries } from "@/lib/backend-artifacts"
 export interface BenchmarkEvaluation {
   schema_version: string
   eval_summary_id?: string
   generation_config?: GenerationConfig
   evaluation_results: EvaluationResult[]
   detailed_evaluation_results_per_samples?: SampleResult[]
+  evalcards?: { annotations?: EvalcardsAnnotations }
 }
 export interface EvalLibrary {
   score_details: ScoreDetails
   detailed_evaluation_results_url?: string
   generation_config?: GenerationConfig
+  evalcards?: { annotations?: RowAnnotations }
 }
 export interface MetricConfig {
 /**
  * Aggregate evaluations by model
  */
+export interface ModelSummaryCore extends SignalSummaries {
   model_info: ModelInfo
   evaluations_by_category: Record<CategoryType, BenchmarkEvaluation[]>
   total_evaluations: number
     max: number
     average: number | null
   }
+  reproducibility_summary?: SignalSummaries["reproducibility_summary"]
+  provenance_summary?: SignalSummaries["provenance_summary"]
+  comparability_summary?: SignalSummaries["comparability_summary"]
   // Quick stats
   top_scores: Array<{

lib/dashboard-data-client.ts CHANGED Viewed

@@ -1,4 +1,4 @@
-import type { BackendManifestStatus, ComparisonIndex, EvalHierarchy } from "@/lib/backend-artifacts"
 import type { BenchmarkEvaluationCardData } from "@/components/benchmark-evaluation-card"
 import type { HFEvalDetail } from "@/lib/hf-data"
 import type {
@@ -108,3 +108,7 @@ export function fetchEvalHierarchy() {
 export function fetchComparisonIndex() {
   return fetchJson<ComparisonIndex>("/api/comparison-index")
 }

+import type { BackendManifestStatus, ComparisonIndex, CorpusAggregates, EvalHierarchy } from "@/lib/backend-artifacts"
 import type { BenchmarkEvaluationCardData } from "@/components/benchmark-evaluation-card"
 import type { HFEvalDetail } from "@/lib/hf-data"
 import type {
 export function fetchComparisonIndex() {
   return fetchJson<ComparisonIndex>("/api/comparison-index")
 }
+export function fetchCorpusAggregates() { // Client-side fetch of the corpus aggregates through the app's "/api/corpus-aggregates" route.
+  return fetchJson<CorpusAggregates>("/api/corpus-aggregates") // NOTE(review): the server loader in lib/hf-data.ts is typed CorpusAggregates | null — confirm how the route reports a missing file
+}

lib/eval-processing.ts CHANGED Viewed

@@ -15,6 +15,7 @@ import type {
   MetricConfig,
   EvaluationResult,
 } from './benchmark-schema'
 import type { ModelEvaluationSummary } from './benchmark-schema'
 import type { ModelSummaryCore } from './benchmark-schema'
 import { inferCategoryFromBenchmark } from './benchmark-schema'
@@ -130,7 +131,7 @@ export interface ModelResultForBenchmark {
   }>
 }
-export interface BenchmarkEvalSummary {
   evaluation_name: string
   /** URL-safe slug derived from evaluation_name */
   evaluation_id: string
@@ -192,6 +193,7 @@ export interface BenchmarkEvalSummary {
   leaderboard_metrics?: BenchmarkLeaderboardMetric[]
   /** Matrix rows for multi-metric benchmark leaderboards */
   leaderboard_rows?: BenchmarkLeaderboardRow[]
 }
 export interface BenchmarkSummaryMetric {
@@ -234,6 +236,7 @@ export interface BenchmarkLeaderboardRow {
   source_metadata: SourceMetadata
   source_data: BenchmarkEvaluation["source_data"]
   values: Record<string, number | null>
   metrics_present: number
 }
@@ -727,6 +730,9 @@ export function createEvaluationCard(
     eval_libraries: Array.from(evalLibraries.values()).sort((a, b) => a.name.localeCompare(b.name)),
     latest_source_name: latestSourceName,
     params_billions: Number.isFinite(paramsBillions ?? NaN) ? paramsBillions : null,
     top_scores: topScores,
     source_urls: Array.from(sourceUrls),
     detail_urls: Array.from(detailUrls),

   MetricConfig,
   EvaluationResult,
 } from './benchmark-schema'
+import type { EvalcardsAnnotations, RowAnnotations, SignalSummaries } from './backend-artifacts'
 import type { ModelEvaluationSummary } from './benchmark-schema'
 import type { ModelSummaryCore } from './benchmark-schema'
 import { inferCategoryFromBenchmark } from './benchmark-schema'
   }>
 }
+export interface BenchmarkEvalSummary extends SignalSummaries {
   evaluation_name: string
   /** URL-safe slug derived from evaluation_name */
   evaluation_id: string
   leaderboard_metrics?: BenchmarkLeaderboardMetric[]
   /** Matrix rows for multi-metric benchmark leaderboards */
   leaderboard_rows?: BenchmarkLeaderboardRow[]
+  evalcards?: { annotations?: EvalcardsAnnotations }
 }
 export interface BenchmarkSummaryMetric {
   source_metadata: SourceMetadata
   source_data: BenchmarkEvaluation["source_data"]
   values: Record<string, number | null>
+  annotations_by_metric?: Record<string, RowAnnotations | null | undefined>
   metrics_present: number
 }
     eval_libraries: Array.from(evalLibraries.values()).sort((a, b) => a.name.localeCompare(b.name)),
     latest_source_name: latestSourceName,
     params_billions: Number.isFinite(paramsBillions ?? NaN) ? paramsBillions : null,
+    reproducibility_summary: summary.reproducibility_summary,
+    provenance_summary: summary.provenance_summary,
+    comparability_summary: summary.comparability_summary,
     top_scores: topScores,
     source_urls: Array.from(sourceUrls),
     detail_urls: Array.from(detailUrls),

lib/hf-data.ts CHANGED Viewed

@@ -3,7 +3,16 @@ import "server-only"
 import { promises as fs } from "fs"
 import path from "path"
-import type { BackendManifest, BackendManifestStatus, ComparisonIndex, EvalHierarchy } from "@/lib/backend-artifacts"
 import type {
   BenchmarkCard,
   BenchmarkEvaluation,
@@ -436,7 +445,7 @@ async function fetchHFJsonSafe<T>(relativePath: string): Promise<T | null> {
 // HF dataset types (shapes of JSON files in the HF repo)
 // ---------------------------------------------------------------------------
-export interface HFModelCardEntry {
   model_family_id: string
   model_route_id: string
   model_family_name: string
@@ -472,7 +481,7 @@ export interface HFModelCardEntry {
   }>
 }
-export interface HFEvalListEntry {
   eval_summary_id: string
   benchmark: string
   canonical_display_name?: string
@@ -517,6 +526,7 @@ export interface HFEvalListEntry {
     models_count: number
     top_score: number
   }>
 }
 export interface HFEvalModelResult {
@@ -538,6 +548,7 @@ export interface HFEvalModelResult {
   detailed_evaluation_results_meta?: unknown
   instance_level_data?: unknown
   passthrough_top_level_fields?: unknown
 }
 export interface HFEvalMetric {
@@ -553,7 +564,7 @@ export interface HFEvalMetric {
   model_results: HFEvalModelResult[]
 }
-export interface HFEvalDetail {
   eval_summary_id: string
   benchmark: string
   canonical_display_name?: string
@@ -566,9 +577,10 @@ export interface HFEvalDetail {
   benchmark_card: BenchmarkCard | null
   metrics: HFEvalMetric[]
   subtasks: unknown[]
 }
-export interface HFModelDetail {
   model_info: ModelInfo & {
     family_id?: string
     family_slug?: string
@@ -846,13 +858,112 @@ export async function fetchBackendManifest(): Promise<BackendManifest> {
 }
 export async function fetchEvalHierarchy(): Promise<EvalHierarchy> {
-  return fetchHFJson<EvalHierarchy>("eval-hierarchy.json")
 }
 export async function fetchComparisonIndex(): Promise<ComparisonIndex> {
   return fetchHFJson<ComparisonIndex>("comparison-index.json")
 }
 export async function fetchModelDetail(slug: string): Promise<HFModelDetail | null> {
   return fetchHFJsonSafe<HFModelDetail>(`models/${slug}.json`)
 }
@@ -1297,6 +1408,7 @@ function flattenHierarchyNode(
         detailed_evaluation_results_url: getCanonicalInstanceResultsUrl(
           result.detailed_evaluation_results
         ),
       }
       const existing = resultsByVariant.get(variantKey)

 import { promises as fs } from "fs"
 import path from "path"
+import type {
+  BackendManifest,
+  BackendManifestStatus,
+  ComparisonIndex,
+  CorpusAggregates,
+  EvalHierarchy,
+  EvalcardsAnnotations,
+  RowAnnotations,
+  SignalSummaries,
+} from "@/lib/backend-artifacts"
 import type {
   BenchmarkCard,
   BenchmarkEvaluation,
 // HF dataset types (shapes of JSON files in the HF repo)
 // ---------------------------------------------------------------------------
+export interface HFModelCardEntry extends SignalSummaries {
   model_family_id: string
   model_route_id: string
   model_family_name: string
   }>
 }
+export interface HFEvalListEntry extends SignalSummaries {
   eval_summary_id: string
   benchmark: string
   canonical_display_name?: string
     models_count: number
     top_score: number
   }>
+  evalcards?: { annotations?: EvalcardsAnnotations }
 }
 export interface HFEvalModelResult {
   detailed_evaluation_results_meta?: unknown
   instance_level_data?: unknown
   passthrough_top_level_fields?: unknown
+  evalcards?: { annotations?: RowAnnotations }
 }
 export interface HFEvalMetric {
   model_results: HFEvalModelResult[]
 }
+export interface HFEvalDetail extends SignalSummaries {
   eval_summary_id: string
   benchmark: string
   canonical_display_name?: string
   benchmark_card: BenchmarkCard | null
   metrics: HFEvalMetric[]
   subtasks: unknown[]
+  evalcards?: { annotations?: EvalcardsAnnotations }
 }
+export interface HFModelDetail extends SignalSummaries {
   model_info: ModelInfo & {
     family_id?: string
     family_slug?: string
 }
 export async function fetchEvalHierarchy(): Promise<EvalHierarchy> { // Loads "eval-hierarchy.json" and returns it after adaptation to the legacy-compatible shape.
+  const raw = await fetchHFJson<EvalHierarchy>("eval-hierarchy.json")
+  return adaptEvalHierarchy(raw) // synthesizes standalone_benchmarks from leaves and adds a fallback stats block when missing
+}
+/**
+ * Normalizes the hierarchy payload so callers can keep walking the legacy tree.
+ *
+ * The upstream pipeline migrated to a flat 2-level shape (family → leaf).
+ * The evals page still walks the older composites/standalone_benchmarks tree,
+ * so we synthesize the legacy view from `leaves` when the new shape is present.
+ * Also computes a fallback `stats` block when missing.
+ *
+ * Per family:
+ * - If `composites`, `standalone_benchmarks`, or `benchmarks` is non-empty, the
+ *   family is returned untouched (its `leaves`, if any, are ignored).
+ * - If it has no leaves either, it is returned untouched.
+ * - Otherwise each leaf becomes a `standalone_benchmarks` entry with empty
+ *   `slices` and `metrics`, and the family's `tags` are filled in with
+ *   empty-array defaults.
+ *
+ * Fallback `stats` (only when `raw.stats` is falsy) are counted from each
+ * composite's `benchmarks` and from `standalone_benchmarks`; the family-level
+ * `benchmarks`, `slices`, and `metrics` lists are not included, and
+ * `metric_rows_scanned` is always reported as 0.
+ *
+ * @param raw Hierarchy as parsed from the upstream JSON; not mutated.
+ * @returns A shallow copy of `raw` with adapted `families` and a `stats` block
+ *   (the upstream one when present, otherwise the computed fallback).
+ */
+function adaptEvalHierarchy(raw: EvalHierarchy): EvalHierarchy {
+  const families = (raw.families ?? []).map((family) => {
+    const hasLegacyTree =
+      (family.composites && family.composites.length > 0) ||
+      (family.standalone_benchmarks && family.standalone_benchmarks.length > 0) ||
+      (family.benchmarks && family.benchmarks.length > 0)
+    if (hasLegacyTree) {
+      return family // legacy data wins: pass through unchanged
+    }
+    const leaves = family.leaves ?? []
+    if (leaves.length === 0) {
+      return family // nothing to synthesize from
+    }
+    const standalone = leaves.map((leaf) => ({ // one HierarchyBenchmark-shaped entry per leaf
+      key: leaf.key,
+      display_name: leaf.display_name,
+      has_card: leaf.has_card ?? false,
+      tags: {
+        domains: leaf.tags?.domains ?? [],
+        languages: leaf.tags?.languages ?? [],
+        tasks: leaf.tags?.tasks ?? [],
+      },
+      slices: [], // HierarchyLeaf has no slice or metric lists, so synthesized benchmarks start empty
+      metrics: [],
+      reproducibility_summary: leaf.reproducibility_summary,
+      provenance_summary: leaf.provenance_summary,
+      comparability_summary: leaf.comparability_summary,
+      summary_eval_ids: leaf.eval_summary_ids,
+    }))
+    return {
+      ...family,
+      tags: { // replace possibly-partial tags with fully populated lists
+        domains: family.tags?.domains ?? [],
+        languages: family.tags?.languages ?? [],
+        tasks: family.tags?.tasks ?? [],
+      },
+      standalone_benchmarks: standalone,
+    }
+  })
+  if (raw.stats) {
+    return { ...raw, families } // upstream stats are kept as-is; the fallback below is skipped
+  }
+  let composite_count = 0
+  let standalone_benchmark_count = 0
+  let single_benchmark_count = 0
+  let slice_count = 0
+  let metric_count = 0
+  for (const family of families) {
+    composite_count += family.composites?.length ?? 0
+    const standalone = family.standalone_benchmarks ?? []
+    standalone_benchmark_count += standalone.length
+    if ((family.composites?.length ?? 0) === 0 && standalone.length === 1) { // family made up of exactly one standalone benchmark
+      single_benchmark_count += 1
+    }
+    for (const composite of family.composites ?? []) {
+      for (const benchmark of composite.benchmarks ?? []) {
+        slice_count += benchmark.slices?.length ?? 0
+        metric_count += benchmark.metrics?.length ?? 0
+      }
+    }
+    for (const benchmark of standalone) {
+      slice_count += benchmark.slices?.length ?? 0
+      metric_count += benchmark.metrics?.length ?? 0
+    }
+  }
+  return {
+    ...raw,
+    families,
+    stats: {
+      family_count: families.length,
+      composite_count,
+      standalone_benchmark_count,
+      single_benchmark_count,
+      slice_count,
+      metric_count,
+      metric_rows_scanned: 0, // no row scan happens in this fallback, so a fixed 0 is reported
+    },
+  }
 }
 export async function fetchComparisonIndex(): Promise<ComparisonIndex> {
   return fetchHFJson<ComparisonIndex>("comparison-index.json")
 }
+export async function fetchCorpusAggregates(): Promise<CorpusAggregates | null> { // Server-side loader for "corpus-aggregates.json"; the result may be null.
+  return fetchHFJsonSafe<CorpusAggregates>("corpus-aggregates.json") // NOTE(review): presumably null when the file is missing — the cache script lists it as optional; confirm fetchHFJsonSafe's failure behaviour
+}
 export async function fetchModelDetail(slug: string): Promise<HFModelDetail | null> {
   return fetchHFJsonSafe<HFModelDetail>(`models/${slug}.json`)
 }
         detailed_evaluation_results_url: getCanonicalInstanceResultsUrl(
           result.detailed_evaluation_results
         ),
+        evalcards: result.evalcards,
       }
       const existing = resultsByVariant.get(variantKey)

lib/model-data.ts CHANGED Viewed

@@ -27,6 +27,7 @@ import { getCanonicalModelIdentity, getModelFamilyRouteId } from "@/lib/model-fa
 import { getBenchmarkCard, normalizeBenchmarkKey } from "@/lib/benchmark-metadata"
 import {
   type HFEvalDetail,
   type HFEvalModelResult,
   type HFModelCardEntry,
   type HFModelDetail,
@@ -337,6 +338,24 @@ function parseParamsBillions(value: unknown): number | null {
   return Number.isFinite(numeric) && numeric > 0 ? numeric : null
 }
 // ---------------------------------------------------------------------------
 // HF model-cards.json → EvaluationCardData
 // ---------------------------------------------------------------------------
@@ -391,6 +410,9 @@ function hfModelCardToEvaluationCardData(entry: HFModelCardEntry): EvaluationCar
       ? `${entry.benchmark_names.length} benchmark${entry.benchmark_names.length === 1 ? "" : "s"}`
       : undefined,
     params_billions: parseParamsBillions(entry.params_billions),
     benchmark_names: (entry.benchmark_names ?? []).map((name) => getBenchmarkDisplayName(name)),
     score_summary: {
       count: entry.score_summary.count,
@@ -408,31 +430,7 @@ function hfModelCardToEvaluationCardData(entry: HFModelCardEntry): EvaluationCar
 // HF eval-list.json → BenchmarkEvalListItem
 // ---------------------------------------------------------------------------
-function hfEvalEntryToListItem(entry: {
-  eval_summary_id: string
-  benchmark: string
-  benchmark_family_key: string
-  benchmark_family_name: string
-  benchmark_parent_name?: string
-  benchmark_leaf_key: string
-  benchmark_leaf_name: string
-  evaluation_name?: string
-  display_name: string
-  is_summary_score?: boolean
-  summary_eval_ids?: string[]
-  category: string
-  tags: { domains: string[]; languages: string[]; tasks: string[] }
-  models_count: number
-  metrics_count: number
-  subtasks_count?: number
-  metric_names: string[]
-  primary_metric_name: string
-  benchmark_card: BenchmarkCard | null
-  source_data?: SourceData
-  top_score: number
-  instance_data: { available: boolean; url_count: number; sample_urls: string[]; models_with_loaded_instances: number }
-  metrics: Array<{ metric_summary_id: string; metric_name: string; lower_is_better: boolean; models_count: number; top_score: number }>
-}): BenchmarkEvalListItem {
   // Use the pipeline's category directly, mapped to our CategoryType
   const category = mapHFCategories([entry.category])[0] ?? "General" as CategoryType
@@ -486,6 +484,10 @@ function hfEvalEntryToListItem(entry: {
     subtasks_count: entry.subtasks_count ?? 0,
     is_summary_score: entry.is_summary_score ?? false,
     summary_eval_ids: entry.summary_eval_ids ?? [],
   }
 }
@@ -652,6 +654,7 @@ function buildBenchmarkLeaderboardMatrix(detail: HFEvalDetail) {
           source_metadata: sourceMetadata,
           source_data: sourceData,
           values: { [columnKey]: modelResult.score ?? null },
           metrics_present: 0,
           _timestampValue: nextTimestamp,
         })
@@ -659,6 +662,10 @@ function buildBenchmarkLeaderboardMatrix(detail: HFEvalDetail) {
       }
       existing.values[columnKey] = modelResult.score ?? null
       if (!existing.model_route_id && modelResult.model_route_id) {
         existing.model_route_id = modelResult.model_route_id
       }
@@ -725,6 +732,7 @@ function toModelResultsForMetric(
       detailed_evaluation_results_url: getCanonicalInstanceResultsUrl(
         mr.detailed_evaluation_results
       ),
     }
     return {
@@ -797,6 +805,11 @@ function hfEvalDetailToSummary(detail: HFEvalDetail): BenchmarkEvalSummary {
       subtasks,
       leaderboard_metrics: leaderboardMatrix.leaderboard_metrics,
       leaderboard_rows: leaderboardMatrix.leaderboard_rows,
     }
   }
@@ -847,6 +860,11 @@ function hfEvalDetailToSummary(detail: HFEvalDetail): BenchmarkEvalSummary {
     subtasks,
     leaderboard_metrics: leaderboardMatrix.leaderboard_metrics,
     leaderboard_rows: leaderboardMatrix.leaderboard_rows,
   }
 }
@@ -1140,6 +1158,7 @@ function buildSingleMetricSuiteMatrixSummary(
           source_metadata: sourceMetadata,
           source_data: sourceData,
           values: { [columnKey]: modelResult.score ?? null },
           metrics_present: 0,
           _timestampValue: nextTimestamp,
         })
@@ -1147,6 +1166,10 @@ function buildSingleMetricSuiteMatrixSummary(
       }
       existing.values[columnKey] = modelResult.score ?? null
       if (!existing.model_route_id && modelResult.model_route_id) {
         existing.model_route_id = modelResult.model_route_id
       }
@@ -1469,7 +1492,7 @@ export async function getModelSummaryById(modelId: string) {
     if (detail) {
       const evaluations = flattenModelEvaluations(detail)
       if (evaluations.length > 0) {
-        return createModelFamilySummary(evaluations)
       }
     }
   }
@@ -1489,7 +1512,7 @@ export async function getModelSummaryById(modelId: string) {
     if (detail) {
       const evaluations = flattenModelEvaluations(detail)
       if (evaluations.length > 0) {
-        return createModelFamilySummary(evaluations)
       }
     }
@@ -1501,7 +1524,7 @@ export async function getModelSummaryById(modelId: string) {
           if (variantDetail) {
             const evaluations = flattenModelEvaluations(variantDetail)
             if (evaluations.length > 0) {
-              return createModelFamilySummary(evaluations)
             }
           }
         }

 import { getBenchmarkCard, normalizeBenchmarkKey } from "@/lib/benchmark-metadata"
 import {
   type HFEvalDetail,
+  type HFEvalListEntry,
   type HFEvalModelResult,
   type HFModelCardEntry,
   type HFModelDetail,
   return Number.isFinite(numeric) && numeric > 0 ? numeric : null
 }
+function attachModelSignalSummaries<T extends ReturnType<typeof createModelFamilySummary>>( // Returns a copy of `summary` with the three signal summaries from `detail` attached.
+  summary: T, // output of createModelFamilySummary; not mutated (a spread copy is returned)
+  detail: HFModelDetail // source of the reproducibility / provenance / comparability summaries
+): T {
+  return {
+    ...summary,
+    reproducibility_summary: detail.reproducibility_summary,
+    provenance_summary: detail.provenance_summary,
+    comparability_summary: detail.comparability_summary,
+    variants: summary.variants.map((variant) => ({ // every variant receives the same detail-level summaries, not per-variant values
+      ...variant,
+      reproducibility_summary: detail.reproducibility_summary,
+      provenance_summary: detail.provenance_summary,
+      comparability_summary: detail.comparability_summary,
+    })),
+  }
+}
 // ---------------------------------------------------------------------------
 // HF model-cards.json → EvaluationCardData
 // ---------------------------------------------------------------------------
       ? `${entry.benchmark_names.length} benchmark${entry.benchmark_names.length === 1 ? "" : "s"}`
       : undefined,
     params_billions: parseParamsBillions(entry.params_billions),
+    reproducibility_summary: entry.reproducibility_summary,
+    provenance_summary: entry.provenance_summary,
+    comparability_summary: entry.comparability_summary,
     benchmark_names: (entry.benchmark_names ?? []).map((name) => getBenchmarkDisplayName(name)),
     score_summary: {
       count: entry.score_summary.count,
 // HF eval-list.json → BenchmarkEvalListItem
 // ---------------------------------------------------------------------------
+function hfEvalEntryToListItem(entry: HFEvalListEntry): BenchmarkEvalListItem {
   // Use the pipeline's category directly, mapped to our CategoryType
   const category = mapHFCategories([entry.category])[0] ?? "General" as CategoryType
     subtasks_count: entry.subtasks_count ?? 0,
     is_summary_score: entry.is_summary_score ?? false,
     summary_eval_ids: entry.summary_eval_ids ?? [],
+    evalcards: entry.evalcards,
+    reproducibility_summary: entry.reproducibility_summary,
+    provenance_summary: entry.provenance_summary,
+    comparability_summary: entry.comparability_summary,
   }
 }
           source_metadata: sourceMetadata,
           source_data: sourceData,
           values: { [columnKey]: modelResult.score ?? null },
+          annotations_by_metric: { [columnKey]: modelResult.evalcards?.annotations ?? null },
           metrics_present: 0,
           _timestampValue: nextTimestamp,
         })
       }
       existing.values[columnKey] = modelResult.score ?? null
+      existing.annotations_by_metric = {
+        ...(existing.annotations_by_metric ?? {}),
+        [columnKey]: modelResult.evalcards?.annotations ?? null,
+      }
       if (!existing.model_route_id && modelResult.model_route_id) {
         existing.model_route_id = modelResult.model_route_id
       }
       detailed_evaluation_results_url: getCanonicalInstanceResultsUrl(
         mr.detailed_evaluation_results
       ),
+      evalcards: mr.evalcards,
     }
     return {
       subtasks,
       leaderboard_metrics: leaderboardMatrix.leaderboard_metrics,
       leaderboard_rows: leaderboardMatrix.leaderboard_rows,
+      source_data: detail.source_data,
+      evalcards: detail.evalcards,
+      reproducibility_summary: detail.reproducibility_summary,
+      provenance_summary: detail.provenance_summary,
+      comparability_summary: detail.comparability_summary,
     }
   }
     subtasks,
     leaderboard_metrics: leaderboardMatrix.leaderboard_metrics,
     leaderboard_rows: leaderboardMatrix.leaderboard_rows,
+    source_data: detail.source_data,
+    evalcards: detail.evalcards,
+    reproducibility_summary: detail.reproducibility_summary,
+    provenance_summary: detail.provenance_summary,
+    comparability_summary: detail.comparability_summary,
   }
 }
           source_metadata: sourceMetadata,
           source_data: sourceData,
           values: { [columnKey]: modelResult.score ?? null },
+          annotations_by_metric: { [columnKey]: modelResult.evalcards?.annotations ?? null },
           metrics_present: 0,
           _timestampValue: nextTimestamp,
         })
       }
       existing.values[columnKey] = modelResult.score ?? null
+      existing.annotations_by_metric = {
+        ...(existing.annotations_by_metric ?? {}),
+        [columnKey]: modelResult.evalcards?.annotations ?? null,
+      }
       if (!existing.model_route_id && modelResult.model_route_id) {
         existing.model_route_id = modelResult.model_route_id
       }
     if (detail) {
       const evaluations = flattenModelEvaluations(detail)
       if (evaluations.length > 0) {
+        return attachModelSignalSummaries(createModelFamilySummary(evaluations), detail)
       }
     }
   }
     if (detail) {
       const evaluations = flattenModelEvaluations(detail)
       if (evaluations.length > 0) {
+        return attachModelSignalSummaries(createModelFamilySummary(evaluations), detail)
       }
     }
           if (variantDetail) {
             const evaluations = flattenModelEvaluations(variantDetail)
             if (evaluations.length > 0) {
+              return attachModelSignalSummaries(createModelFamilySummary(evaluations), variantDetail)
             }
           }
         }

public/peer-ranks.json CHANGED Viewed

The diff for this file is too large to render. See raw diff

scripts/cache-hf-data.mjs CHANGED Viewed

@@ -32,11 +32,13 @@ const CACHE_ROOT_FILES = [
   "benchmark-metadata.json",
   "eval-hierarchy.json",
   "comparison-index.json",
 ]
 const OPTIONAL_CACHE_ROOT_FILES = new Set([
   "model-cards-lite.json",
   "eval-list-lite.json",
 ])
 const CACHE_DIRECTORIES = ["developers", "evals", "models"]

   "benchmark-metadata.json",
   "eval-hierarchy.json",
   "comparison-index.json",
+  "corpus-aggregates.json",
 ]
 const OPTIONAL_CACHE_ROOT_FILES = new Set([
   "model-cards-lite.json",
   "eval-list-lite.json",
+  "corpus-aggregates.json",
 ])
 const CACHE_DIRECTORIES = ["developers", "evals", "models"]