evijit HF Staff Claude Opus 4.7 (1M context) committed on
Commit
bca888a
·
1 Parent(s): 431b0cc

Add interpretive signals, corpus dashboard, and slice browser

Browse files

Surfaces reproducibility, reporting completeness, provenance, and
comparability signals from the backend across eval detail, model compare,
eval list cards, and a new /corpus dashboard. Adapts to the upstream
2-level hierarchy (family → leaf), caps the leaderboard at 24 default
columns and replaces the slice tabs with a search dialog when a
benchmark has more than 5 slices.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

app/api/corpus-aggregates/route.ts ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { NextResponse } from "next/server"
2
+
3
+ import { fetchCorpusAggregates } from "@/lib/hf-data"
4
+
5
/**
 * Route handler for GET requests on `app/api/corpus-aggregates`.
 *
 * Awaits `fetchCorpusAggregates()` and replies with the resolved value as JSON.
 * When the resolved value is falsy, replies with a JSON error body and HTTP
 * status 404 instead.
 */
export async function GET() {
  const payload = await fetchCorpusAggregates()

  return payload
    ? NextResponse.json(payload)
    : NextResponse.json(
        { error: "Corpus aggregates not available" },
        { status: 404 }
      )
}
app/corpus/page.tsx ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { CorpusDashboard } from "@/components/signals/corpus-dashboard"
2
+ import { Navigation } from "@/components/navigation"
3
+ import { fetchCorpusAggregates, fetchEvalListLite } from "@/lib/hf-data"
4
+
5
/**
 * Async page component for the corpus dashboard.
 *
 * Loads the corpus aggregates and the lightweight eval list concurrently. A
 * failure of the eval-list fetch is tolerated (it degrades to an empty list);
 * a failure of the aggregates fetch is not caught here. Every finite numeric
 * `completeness_score` found in the eval list is forwarded to the dashboard.
 * When the aggregates value is falsy, a placeholder notice is rendered in
 * place of the dashboard.
 */
export default async function CorpusPage() {
  const aggregatesRequest = fetchCorpusAggregates()
  // The eval list only feeds the completeness scores, so fall back to an empty list on error.
  const evalListRequest = fetchEvalListLite().catch(() => ({ evals: [] }))
  const [aggregates, evalList] = await Promise.all([aggregatesRequest, evalListRequest])

  // Keep only scores that are real, finite numbers.
  const completenessScores: number[] = []
  for (const entry of evalList.evals) {
    const score = entry.evalcards?.annotations?.reporting_completeness?.completeness_score
    if (typeof score === "number" && Number.isFinite(score)) {
      completenessScores.push(score)
    }
  }

  // Placeholder shown while the backend snapshot has no aggregates file.
  const unavailableNotice = (
    <section className="rounded-2xl border border-dashed border-border/70 bg-card p-8 text-center">
      <div className="text-[11px] font-semibold uppercase tracking-[0.22em] text-muted-foreground">
        Interpretive signals
      </div>
      <h1 className="mt-2 text-2xl font-semibold tracking-tight">Corpus aggregates are not available yet</h1>
      <p className="mx-auto mt-3 max-w-2xl text-sm leading-6 text-muted-foreground">
        The frontend is ready for `corpus-aggregates.json`, but this cached backend snapshot does not include it yet.
        Once the dataset ships the file, this page will render reproducibility, completeness, provenance, and comparability rollups.
      </p>
    </section>
  )

  return (
    <div className="min-h-screen bg-background">
      <Navigation />
      <main className="container mx-auto px-4 py-8">
        {aggregates ? (
          <CorpusDashboard aggregates={aggregates} completenessScores={completenessScores} />
        ) : (
          unavailableNotice
        )}
      </main>
    </div>
  )
}
app/evals/page.tsx CHANGED
@@ -11,7 +11,7 @@ import { PageHeader } from "@/components/page-header"
11
  import { Button } from "@/components/ui/button"
12
  import { Collapsible, CollapsibleContent, CollapsibleTrigger } from "@/components/ui/collapsible"
13
  import { Input } from "@/components/ui/input"
14
- import type { EvalHierarchy } from "@/lib/backend-artifacts"
15
  import type { BenchmarkCard, CategoryType } from "@/lib/benchmark-schema"
16
  import type { BenchmarkEvalListItem } from "@/lib/eval-processing"
17
  import { fetchBenchmarkMetadata, fetchEvalHierarchy, fetchEvalList } from "@/lib/dashboard-data-client"
@@ -251,7 +251,7 @@ interface EvalBrowserMatrixPreviewRow {
251
  value: string
252
  }
253
 
254
- interface EvalBrowserNode {
255
  id: string
256
  parentId: string | null
257
  kind: EvalBrowserNodeKind
@@ -261,6 +261,7 @@ interface EvalBrowserNode {
261
  description: string
262
  category: CategoryType
263
  domains: string[]
 
264
  dataType?: string
265
  license?: string
266
  card?: BenchmarkCard
@@ -272,6 +273,8 @@ interface EvalBrowserNode {
272
  childIds: string[]
273
  href?: string
274
  scopeKeys: string[]
 
 
275
  matrixPreview?: {
276
  columnLabel: string
277
  rows: EvalBrowserMatrixPreviewRow[]
@@ -336,6 +339,86 @@ function summarizeNodeStats(
336
  0
337
  )
338
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
339
  return {
340
  category: getDominantCategory(summaries, fallbackCategory),
341
  modelsCount,
@@ -346,6 +429,10 @@ function summarizeNodeStats(
346
  summaries[0]?.source_data?.hf_repo ??
347
  summaries[0]?.source_data?.dataset_name ??
348
  "Hierarchy summary",
 
 
 
 
349
  }
350
  }
351
 
@@ -435,6 +522,94 @@ function getNodeCard(
435
  return undefined
436
  }
437
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
438
  function looksLikeLanguageSplit(value: string) {
439
  const normalized = normalizeBenchmarkKey(value)
440
  const languageLike = new Set([
@@ -528,6 +703,7 @@ export default function EvalsPage() {
528
  const [totalModels, setTotalModels] = useState(0)
529
  const [searchQuery, setSearchQuery] = useState("")
530
  const [selectedDomain, setSelectedDomain] = useState<string | null>(null)
 
531
  const [selectedCategory, setSelectedCategory] = useState<string | null>(null)
532
  const [selectedNodeKind, setSelectedNodeKind] = useState<EvalBrowserNodeKind | null>(null)
533
  const [currentNodeId, setCurrentNodeId] = useState<string | null>(null)
@@ -558,10 +734,12 @@ export default function EvalsPage() {
558
  const params = new URLSearchParams(window.location.search)
559
  const incomingSearch = params.get("search") ?? ""
560
  const incomingDomain = params.get("domain")
 
561
  const incomingCategory = params.get("category")
562
  const incomingNode = params.get("node")
563
  setSearchQuery(incomingSearch)
564
  setSelectedDomain(incomingDomain)
 
565
  setSelectedCategory(incomingCategory)
566
  setCurrentNodeId(incomingNode)
567
  }
@@ -586,6 +764,9 @@ export default function EvalsPage() {
586
  if (selectedDomain) {
587
  params.set("domain", selectedDomain)
588
  }
 
 
 
589
  if (selectedCategory) {
590
  params.set("category", selectedCategory)
591
  }
@@ -607,7 +788,7 @@ export default function EvalsPage() {
607
  }
608
 
609
  pendingHistoryActionRef.current = "replace"
610
- }, [currentNodeId, searchQuery, selectedCategory, selectedDomain])
611
 
612
  const summariesWithCards = useMemo(() => {
613
  return summaries.map((summary) => {
@@ -665,6 +846,7 @@ export default function EvalsPage() {
665
  suiteLabel,
666
  category,
667
  domains,
 
668
  summaries,
669
  card,
670
  sourceLabel,
@@ -682,6 +864,7 @@ export default function EvalsPage() {
682
  suiteLabel?: string
683
  category: CategoryType
684
  domains: string[]
 
685
  summaries: BenchmarkEvalListItem[]
686
  card?: BenchmarkCard
687
  sourceLabel?: string
@@ -692,6 +875,7 @@ export default function EvalsPage() {
692
  descriptionFallback: string
693
  }) => {
694
  const stats = summarizeNodeStats(summaries, category)
 
695
  addNode({
696
  id,
697
  parentId,
@@ -702,6 +886,7 @@ export default function EvalsPage() {
702
  description: buildDescription(title, card, descriptionFallback),
703
  category: stats.category,
704
  domains: Array.from(new Set(domains.flatMap((domain) => normalizeDomainList(domain)))),
 
705
  dataType: card?.benchmark_details?.data_type,
706
  license: card?.ethical_and_legal_considerations?.data_licensing,
707
  card,
@@ -714,6 +899,10 @@ export default function EvalsPage() {
714
  href,
715
  scopeKeys,
716
  matrixPreview,
 
 
 
 
717
  })
718
  }
719
 
@@ -799,6 +988,7 @@ export default function EvalsPage() {
799
  slices = [],
800
  metrics = [],
801
  scopeKeys,
 
802
  }: {
803
  parentId: string | null
804
  familyLabel?: string
@@ -812,6 +1002,8 @@ export default function EvalsPage() {
812
  slices?: Array<{ key: string; display_name: string; metrics: Array<{ key: string; display_name: string }> }>
813
  metrics?: Array<{ key: string; display_name: string }>
814
  scopeKeys: string[]
 
 
815
  }) => {
816
  const benchmarkId = `${parentId ?? "root"}::benchmark:${normalizeBenchmarkKey(benchmarkKey)}`
817
  const card = summary?.benchmark_card ?? getNodeCard(benchmarkCards, ...cardCandidates)
@@ -821,6 +1013,13 @@ export default function EvalsPage() {
821
  !summary && metrics.length > 0
822
  ? scopeKeys.map((scopeKey) => pickSummaryForKey(summariesWithCards, scopeKey, scopeKeys)).find(Boolean)
823
  : undefined
 
 
 
 
 
 
 
824
  const isParentRollupBenchmark =
825
  Boolean(parentId) && scopeKeys.some((scopeKey) => isSameHierarchyKey(scopeKey, benchmarkKey))
826
 
@@ -829,10 +1028,10 @@ export default function EvalsPage() {
829
 
830
  if (drilldownSlices.length > 0) {
831
  createSliceNodes(parentId, parentLabel, summary, drilldownSlices, category, scopeKeys)
832
- } else if (summary) {
833
  const parent = nodes.get(parentId)
834
  if (parent && !parent.href) {
835
- parent.href = `/evals/${summary.evaluation_id}`
836
  }
837
  }
838
  return
@@ -849,14 +1048,7 @@ export default function EvalsPage() {
849
  domains,
850
  summaries: summary ? [summary] : [],
851
  card,
852
- href:
853
- drilldownSlices.length === 0
854
- ? summary
855
- ? `/evals/${summary.evaluation_id}`
856
- : fallbackSummary
857
- ? `/evals/${fallbackSummary.evaluation_id}`
858
- : undefined
859
- : undefined,
860
  scopeKeys,
861
  descriptionFallback: `Browse the {label} benchmark and its lower-level breakdowns.`,
862
  })
@@ -1038,6 +1230,7 @@ export default function EvalsPage() {
1038
  slices: standalone.slices ?? [],
1039
  metrics: standalone.metrics ?? [],
1040
  scopeKeys: familyScopeKeys,
 
1041
  })
1042
  }
1043
 
@@ -1199,6 +1392,8 @@ export default function EvalsPage() {
1199
  })),
1200
  metrics: benchmarkSource?.metrics ?? family.metrics ?? [],
1201
  scopeKeys: familyScopeKeys,
 
 
1202
  })
1203
  }
1204
 
@@ -1243,6 +1438,7 @@ export default function EvalsPage() {
1243
  node.description,
1244
  node.sourceLabel,
1245
  ...node.domains,
 
1246
  ]
1247
 
1248
  return haystacks.some((value) => value?.toLowerCase().includes(query))
@@ -1261,6 +1457,12 @@ export default function EvalsPage() {
1261
  domainCandidates = domainCandidates.filter((node) => node.category === selectedCategory)
1262
  }
1263
 
 
 
 
 
 
 
1264
  for (const node of domainCandidates) {
1265
  for (const domain of node.domains) {
1266
  domainSet.add(domain)
@@ -1268,7 +1470,34 @@ export default function EvalsPage() {
1268
  }
1269
 
1270
  return Array.from(domainSet).sort((a, b) => a.localeCompare(b))
1271
- }, [nodesMatchingSearch, selectedCategory])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1272
 
1273
  const allCategories = useMemo(() => {
1274
  const categorySet = new Set<string>()
@@ -1284,12 +1513,18 @@ export default function EvalsPage() {
1284
  )
1285
  }
1286
 
 
 
 
 
 
 
1287
  for (const node of categoryCandidates) {
1288
  categorySet.add(node.category)
1289
  }
1290
 
1291
  return Array.from(categorySet).sort((a, b) => a.localeCompare(b))
1292
- }, [nodesMatchingSearch, selectedDomain])
1293
 
1294
  const filtered = useMemo(() => {
1295
  let list = [...nodesMatchingSearch]
@@ -1306,13 +1541,21 @@ export default function EvalsPage() {
1306
  )
1307
  }
1308
 
 
 
 
 
 
 
 
 
1309
  if (selectedCategory) {
1310
  list = list.filter((node) => node.category === selectedCategory)
1311
  }
1312
 
1313
  list.sort((a, b) => a.title.localeCompare(b.title, undefined, { sensitivity: "base" }))
1314
  return list
1315
- }, [nodesMatchingSearch, selectedCategory, selectedDomain, selectedNodeKind])
1316
 
1317
  useEffect(() => {
1318
  if (selectedDomain && !allDomains.includes(selectedDomain)) {
@@ -1320,6 +1563,12 @@ export default function EvalsPage() {
1320
  }
1321
  }, [allDomains, selectedDomain])
1322
 
 
 
 
 
 
 
1323
  useEffect(() => {
1324
  if (selectedCategory && !allCategories.includes(selectedCategory)) {
1325
  setSelectedCategory(null)
@@ -1328,7 +1577,7 @@ export default function EvalsPage() {
1328
 
1329
  useEffect(() => {
1330
  setPage(1)
1331
- }, [currentNodeId, searchQuery, selectedCategory, selectedDomain, selectedNodeKind])
1332
 
1333
  const pagedNodes = useMemo(
1334
  () => filtered.slice((page - 1) * PAGE_SIZE, page * PAGE_SIZE),
@@ -1336,7 +1585,7 @@ export default function EvalsPage() {
1336
  )
1337
 
1338
  const currentLevelKinds = Array.from(new Set(currentLevelNodes.map((node) => node.kind)))
1339
- const activeFilterCount = [searchQuery.trim(), selectedDomain, selectedCategory, selectedNodeKind].filter(Boolean).length
1340
  const currentLevelLabel =
1341
  currentNodeId === null
1342
  ? "Rollout entry level"
@@ -1480,6 +1729,7 @@ export default function EvalsPage() {
1480
  onClick={() => {
1481
  setSearchQuery("")
1482
  setSelectedDomain(null)
 
1483
  setSelectedCategory(null)
1484
  setSelectedNodeKind(null)
1485
  }}
@@ -1545,7 +1795,7 @@ export default function EvalsPage() {
1545
  </div>
1546
  </div>
1547
 
1548
- {hierarchy && (
1549
  <div className="flex flex-wrap gap-2 text-sm">
1550
  <span className="rounded-full border border-stone-200/80 bg-stone-50/80 px-3 py-1.5 font-medium text-stone-700 dark:border-stone-700/80 dark:bg-stone-900/70 dark:text-stone-200">
1551
  {hierarchy.stats.family_count} families
@@ -1669,6 +1919,43 @@ export default function EvalsPage() {
1669
  </div>
1670
  )}
1671
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1672
  {allCategories.length > 0 && (
1673
  <div className="mt-4 space-y-1.5">
1674
  <div className="text-[11px] font-semibold uppercase tracking-[0.2em] text-stone-500 dark:text-stone-400">
@@ -1794,6 +2081,8 @@ export default function EvalsPage() {
1794
  {node.title}
1795
  </h3>
1796
 
 
 
1797
  {node.description && (
1798
  <p className="mb-4 flex-1 text-sm leading-6 text-stone-600 line-clamp-3 dark:text-stone-300">
1799
  {node.description}
 
11
  import { Button } from "@/components/ui/button"
12
  import { Collapsible, CollapsibleContent, CollapsibleTrigger } from "@/components/ui/collapsible"
13
  import { Input } from "@/components/ui/input"
14
+ import type { EvalHierarchy, SignalSummaries } from "@/lib/backend-artifacts"
15
  import type { BenchmarkCard, CategoryType } from "@/lib/benchmark-schema"
16
  import type { BenchmarkEvalListItem } from "@/lib/eval-processing"
17
  import { fetchBenchmarkMetadata, fetchEvalHierarchy, fetchEvalList } from "@/lib/dashboard-data-client"
 
251
  value: string
252
  }
253
 
254
+ interface EvalBrowserNode extends SignalSummaries {
255
  id: string
256
  parentId: string | null
257
  kind: EvalBrowserNodeKind
 
261
  description: string
262
  category: CategoryType
263
  domains: string[]
264
+ tasks: string[]
265
  dataType?: string
266
  license?: string
267
  card?: BenchmarkCard
 
273
  childIds: string[]
274
  href?: string
275
  scopeKeys: string[]
276
+ /** Reporting completeness score in [0, 1] when known, otherwise undefined. */
277
+ completenessScore?: number
278
  matrixPreview?: {
279
  columnLabel: string
280
  rows: EvalBrowserMatrixPreviewRow[]
 
339
  0
340
  )
341
 
342
+ // Aggregate signals across all summaries under this node so a family card
343
+ // can show signals that span its children.
344
+ const reproducibilitySummaries = summaries
345
+ .map((s) => s.reproducibility_summary)
346
+ .filter((value): value is NonNullable<typeof value> => Boolean(value))
347
+ const provenanceSummaries = summaries
348
+ .map((s) => s.provenance_summary)
349
+ .filter((value): value is NonNullable<typeof value> => Boolean(value))
350
+ const comparabilitySummaries = summaries
351
+ .map((s) => s.comparability_summary)
352
+ .filter((value): value is NonNullable<typeof value> => Boolean(value))
353
+
354
+ const reproducibility_summary = reproducibilitySummaries.length
355
+ ? reproducibilitySummaries.reduce(
356
+ (acc, item) => ({
357
+ results_total: acc.results_total + item.results_total,
358
+ has_reproducibility_gap_count:
359
+ acc.has_reproducibility_gap_count + item.has_reproducibility_gap_count,
360
+ populated_ratio_avg: null,
361
+ }),
362
+ { results_total: 0, has_reproducibility_gap_count: 0, populated_ratio_avg: null as number | null }
363
+ )
364
+ : undefined
365
+
366
+ const provenance_summary = provenanceSummaries.length
367
+ ? provenanceSummaries.reduce(
368
+ (acc, item) => {
369
+ for (const key of ["first_party", "third_party", "collaborative", "unspecified"] as const) {
370
+ acc.source_type_distribution[key] += item.source_type_distribution[key] ?? 0
371
+ }
372
+ return {
373
+ total_results: acc.total_results + item.total_results,
374
+ total_groups: acc.total_groups + item.total_groups,
375
+ multi_source_groups: acc.multi_source_groups + item.multi_source_groups,
376
+ first_party_only_groups: acc.first_party_only_groups + item.first_party_only_groups,
377
+ source_type_distribution: acc.source_type_distribution,
378
+ }
379
+ },
380
+ {
381
+ total_results: 0,
382
+ total_groups: 0,
383
+ multi_source_groups: 0,
384
+ first_party_only_groups: 0,
385
+ source_type_distribution: {
386
+ first_party: 0,
387
+ third_party: 0,
388
+ collaborative: 0,
389
+ unspecified: 0,
390
+ },
391
+ }
392
+ )
393
+ : undefined
394
+
395
+ const comparability_summary = comparabilitySummaries.length
396
+ ? comparabilitySummaries.reduce(
397
+ (acc, item) => ({
398
+ total_groups: acc.total_groups + item.total_groups,
399
+ groups_with_variant_check: acc.groups_with_variant_check + item.groups_with_variant_check,
400
+ groups_with_cross_party_check: acc.groups_with_cross_party_check + item.groups_with_cross_party_check,
401
+ variant_divergent_count: acc.variant_divergent_count + item.variant_divergent_count,
402
+ cross_party_divergent_count: acc.cross_party_divergent_count + item.cross_party_divergent_count,
403
+ }),
404
+ {
405
+ total_groups: 0,
406
+ groups_with_variant_check: 0,
407
+ groups_with_cross_party_check: 0,
408
+ variant_divergent_count: 0,
409
+ cross_party_divergent_count: 0,
410
+ }
411
+ )
412
+ : undefined
413
+
414
+ // Average completeness score across summaries that report one.
415
+ const completenessScores = summaries
416
+ .map((s) => s.evalcards?.annotations?.reporting_completeness?.completeness_score)
417
+ .filter((v): v is number => typeof v === "number" && Number.isFinite(v))
418
+ const completenessScore = completenessScores.length
419
+ ? completenessScores.reduce((sum, value) => sum + value, 0) / completenessScores.length
420
+ : undefined
421
+
422
  return {
423
  category: getDominantCategory(summaries, fallbackCategory),
424
  modelsCount,
 
429
  summaries[0]?.source_data?.hf_repo ??
430
  summaries[0]?.source_data?.dataset_name ??
431
  "Hierarchy summary",
432
+ reproducibility_summary,
433
+ provenance_summary,
434
+ comparability_summary,
435
+ completenessScore,
436
  }
437
  }
438
 
 
522
  return undefined
523
  }
524
 
525
/**
 * Compact signal indicators for a node card. Shown alongside (or instead of)
 * the benchmark-card-derived metadata so that nodes lacking a benchmark card
 * still surface useful interpretive context.
 *
 * Chips and their display conditions:
 * - documentation completeness: whenever `node.completenessScore` is set
 *   (emerald at >= 50%, amber below);
 * - setup gaps: when the rounded share of results with a reproducibility gap is > 0%;
 * - first-party only: when the rounded share of first-party-only groups is >= 50%;
 * - setup divergences / source disagreements: when the respective count is > 0.
 *
 * Returns `null` when no chip would be displayed, so the card never gets an
 * empty wrapper (and its bottom margin) for nodes whose signals are all
 * below the display thresholds.
 */
function NodeSignalChips({ node }: { node: EvalBrowserNode }) {
  const repro = node.reproducibility_summary
  const prov = node.provenance_summary
  const comparability = node.comparability_summary
  const completeness = node.completenessScore

  // Percentages stay null when the denominator is missing or zero.
  const reproPercent =
    repro && repro.results_total > 0
      ? Math.round((repro.has_reproducibility_gap_count / repro.results_total) * 100)
      : null

  const firstPartyPercent =
    prov && prov.total_groups > 0
      ? Math.round((prov.first_party_only_groups / prov.total_groups) * 100)
      : null

  const variantDivergent = comparability?.variant_divergent_count ?? 0
  const crossPartyDivergent = comparability?.cross_party_divergent_count ?? 0

  const completenessPercent = completeness != null ? Math.round(completeness * 100) : null

  // Visibility flags are the single source of truth for both the early
  // return and the rendering below, so the two can never disagree.
  const showCompleteness = completenessPercent !== null
  const showReproGaps = reproPercent !== null && reproPercent > 0
  const showFirstParty = firstPartyPercent !== null && firstPartyPercent >= 50
  const showVariantDivergence = variantDivergent > 0
  const showCrossPartyDivergence = crossPartyDivergent > 0

  const hasAny =
    showCompleteness ||
    showReproGaps ||
    showFirstParty ||
    showVariantDivergence ||
    showCrossPartyDivergence

  if (!hasAny) {
    return null
  }

  return (
    <div className="mb-3 flex flex-wrap gap-1.5">
      {completenessPercent !== null && (
        <span
          className={cn(
            "inline-flex items-center gap-1 rounded-full border px-2.5 py-0.5 text-[10px] font-semibold",
            completenessPercent >= 50
              ? "border-emerald-200 bg-emerald-50 text-emerald-800 dark:border-emerald-900/50 dark:bg-emerald-950/30 dark:text-emerald-200"
              : "border-amber-200 bg-amber-50 text-amber-800 dark:border-amber-900/50 dark:bg-amber-950/30 dark:text-amber-200"
          )}
          title={`Documentation completeness: ${completenessPercent}% of EvalCards fields populated.`}
        >
          {completenessPercent}% documented
        </span>
      )}
      {showReproGaps && (
        <span
          className="inline-flex items-center gap-1 rounded-full border border-amber-200 bg-amber-50 px-2.5 py-0.5 text-[10px] font-semibold text-amber-800 dark:border-amber-900/50 dark:bg-amber-950/30 dark:text-amber-200"
          title={`${repro?.has_reproducibility_gap_count.toLocaleString()} of ${repro?.results_total.toLocaleString()} reported scores missing setup details.`}
        >
          {reproPercent}% setup gaps
        </span>
      )}
      {showFirstParty && (
        <span
          className="inline-flex items-center gap-1 rounded-full border border-amber-200 bg-amber-50 px-2.5 py-0.5 text-[10px] font-semibold text-amber-800 dark:border-amber-900/50 dark:bg-amber-950/30 dark:text-amber-200"
          title={`${firstPartyPercent}% of (model, metric) groups have only first-party reports — no independent replication.`}
        >
          {firstPartyPercent}% 1st-party only
        </span>
      )}
      {showVariantDivergence && (
        <span
          className="inline-flex items-center gap-1 rounded-full border border-rose-200 bg-rose-50 px-2.5 py-0.5 text-[10px] font-semibold text-rose-800 dark:border-rose-900/50 dark:bg-rose-950/30 dark:text-rose-200"
          title={`${variantDivergent} group${variantDivergent === 1 ? "" : "s"} where setup variations produced diverging scores.`}
        >
          {variantDivergent} setup divergence{variantDivergent === 1 ? "" : "s"}
        </span>
      )}
      {showCrossPartyDivergence && (
        <span
          className="inline-flex items-center gap-1 rounded-full border border-violet-200 bg-violet-50 px-2.5 py-0.5 text-[10px] font-semibold text-violet-800 dark:border-violet-900/50 dark:bg-violet-950/30 dark:text-violet-200"
          title={`${crossPartyDivergent} group${crossPartyDivergent === 1 ? "" : "s"} where different organizations reported diverging scores.`}
        >
          {crossPartyDivergent} source disagreement{crossPartyDivergent === 1 ? "" : "s"}
        </span>
      )}
    </div>
  )
}
612
+
613
  function looksLikeLanguageSplit(value: string) {
614
  const normalized = normalizeBenchmarkKey(value)
615
  const languageLike = new Set([
 
703
  const [totalModels, setTotalModels] = useState(0)
704
  const [searchQuery, setSearchQuery] = useState("")
705
  const [selectedDomain, setSelectedDomain] = useState<string | null>(null)
706
+ const [selectedTask, setSelectedTask] = useState<string | null>(null)
707
  const [selectedCategory, setSelectedCategory] = useState<string | null>(null)
708
  const [selectedNodeKind, setSelectedNodeKind] = useState<EvalBrowserNodeKind | null>(null)
709
  const [currentNodeId, setCurrentNodeId] = useState<string | null>(null)
 
734
  const params = new URLSearchParams(window.location.search)
735
  const incomingSearch = params.get("search") ?? ""
736
  const incomingDomain = params.get("domain")
737
+ const incomingTask = params.get("task")
738
  const incomingCategory = params.get("category")
739
  const incomingNode = params.get("node")
740
  setSearchQuery(incomingSearch)
741
  setSelectedDomain(incomingDomain)
742
+ setSelectedTask(incomingTask)
743
  setSelectedCategory(incomingCategory)
744
  setCurrentNodeId(incomingNode)
745
  }
 
764
  if (selectedDomain) {
765
  params.set("domain", selectedDomain)
766
  }
767
+ if (selectedTask) {
768
+ params.set("task", selectedTask)
769
+ }
770
  if (selectedCategory) {
771
  params.set("category", selectedCategory)
772
  }
 
788
  }
789
 
790
  pendingHistoryActionRef.current = "replace"
791
+ }, [currentNodeId, searchQuery, selectedCategory, selectedDomain, selectedTask])
792
 
793
  const summariesWithCards = useMemo(() => {
794
  return summaries.map((summary) => {
 
846
  suiteLabel,
847
  category,
848
  domains,
849
+ tasks,
850
  summaries,
851
  card,
852
  sourceLabel,
 
864
  suiteLabel?: string
865
  category: CategoryType
866
  domains: string[]
867
+ tasks?: string[]
868
  summaries: BenchmarkEvalListItem[]
869
  card?: BenchmarkCard
870
  sourceLabel?: string
 
875
  descriptionFallback: string
876
  }) => {
877
  const stats = summarizeNodeStats(summaries, category)
878
+ const summaryTasks = summaries.flatMap((summary) => summary.tags?.tasks ?? [])
879
  addNode({
880
  id,
881
  parentId,
 
886
  description: buildDescription(title, card, descriptionFallback),
887
  category: stats.category,
888
  domains: Array.from(new Set(domains.flatMap((domain) => normalizeDomainList(domain)))),
889
+ tasks: Array.from(new Set([...(tasks ?? []), ...summaryTasks].map((task) => task.trim()).filter(Boolean))),
890
  dataType: card?.benchmark_details?.data_type,
891
  license: card?.ethical_and_legal_considerations?.data_licensing,
892
  card,
 
899
  href,
900
  scopeKeys,
901
  matrixPreview,
902
+ reproducibility_summary: stats.reproducibility_summary,
903
+ provenance_summary: stats.provenance_summary,
904
+ comparability_summary: stats.comparability_summary,
905
+ completenessScore: stats.completenessScore,
906
  })
907
  }
908
 
 
988
  slices = [],
989
  metrics = [],
990
  scopeKeys,
991
+ fallbackEvalId,
992
  }: {
993
  parentId: string | null
994
  familyLabel?: string
 
1002
  slices?: Array<{ key: string; display_name: string; metrics: Array<{ key: string; display_name: string }> }>
1003
  metrics?: Array<{ key: string; display_name: string }>
1004
  scopeKeys: string[]
1005
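+ /** Final-resort eval id used when neither a summary nor a fallback summary matches; taken from the hierarchy entry's eval-id list. NOTE(review): call sites read both `summary_eval_ids` and `eval_summary_ids` — confirm which field name the hierarchy artifact exposes. */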
1006
+ fallbackEvalId?: string
1007
  }) => {
1008
  const benchmarkId = `${parentId ?? "root"}::benchmark:${normalizeBenchmarkKey(benchmarkKey)}`
1009
  const card = summary?.benchmark_card ?? getNodeCard(benchmarkCards, ...cardCandidates)
 
1013
  !summary && metrics.length > 0
1014
  ? scopeKeys.map((scopeKey) => pickSummaryForKey(summariesWithCards, scopeKey, scopeKeys)).find(Boolean)
1015
  : undefined
1016
+ const resolvedHref = summary
1017
+ ? `/evals/${summary.evaluation_id}`
1018
+ : fallbackSummary
1019
+ ? `/evals/${fallbackSummary.evaluation_id}`
1020
+ : fallbackEvalId
1021
+ ? `/evals/${fallbackEvalId}`
1022
+ : undefined
1023
  const isParentRollupBenchmark =
1024
  Boolean(parentId) && scopeKeys.some((scopeKey) => isSameHierarchyKey(scopeKey, benchmarkKey))
1025
 
 
1028
 
1029
  if (drilldownSlices.length > 0) {
1030
  createSliceNodes(parentId, parentLabel, summary, drilldownSlices, category, scopeKeys)
1031
+ } else if (resolvedHref) {
1032
  const parent = nodes.get(parentId)
1033
  if (parent && !parent.href) {
1034
+ parent.href = resolvedHref
1035
  }
1036
  }
1037
  return
 
1048
  domains,
1049
  summaries: summary ? [summary] : [],
1050
  card,
1051
+ href: drilldownSlices.length === 0 ? resolvedHref : undefined,
 
 
 
 
 
 
 
1052
  scopeKeys,
1053
  descriptionFallback: `Browse the {label} benchmark and its lower-level breakdowns.`,
1054
  })
 
1230
  slices: standalone.slices ?? [],
1231
  metrics: standalone.metrics ?? [],
1232
  scopeKeys: familyScopeKeys,
1233
+ fallbackEvalId: standalone.summary_eval_ids?.[0],
1234
  })
1235
  }
1236
 
 
1392
  })),
1393
  metrics: benchmarkSource?.metrics ?? family.metrics ?? [],
1394
  scopeKeys: familyScopeKeys,
1395
+ fallbackEvalId:
1396
+ benchmarkSource?.summary_eval_ids?.[0] ?? family.eval_summary_ids?.[0],
1397
  })
1398
  }
1399
 
 
1438
  node.description,
1439
  node.sourceLabel,
1440
  ...node.domains,
1441
+ ...node.tasks,
1442
  ]
1443
 
1444
  return haystacks.some((value) => value?.toLowerCase().includes(query))
 
1457
  domainCandidates = domainCandidates.filter((node) => node.category === selectedCategory)
1458
  }
1459
 
1460
+ if (selectedTask) {
1461
+ domainCandidates = domainCandidates.filter((node) =>
1462
+ node.tasks.some((task) => task.toLowerCase() === selectedTask.toLowerCase())
1463
+ )
1464
+ }
1465
+
1466
  for (const node of domainCandidates) {
1467
  for (const domain of node.domains) {
1468
  domainSet.add(domain)
 
1470
  }
1471
 
1472
  return Array.from(domainSet).sort((a, b) => a.localeCompare(b))
1473
+ }, [nodesMatchingSearch, selectedCategory, selectedNodeKind, selectedTask])
1474
+
1475
+ const allTasks = useMemo(() => {
1476
+ const taskSet = new Set<string>()
1477
+ let taskCandidates = nodesMatchingSearch
1478
+
1479
+ if (selectedNodeKind) {
1480
+ taskCandidates = taskCandidates.filter((node) => node.kind === selectedNodeKind)
1481
+ }
1482
+
1483
+ if (selectedCategory) {
1484
+ taskCandidates = taskCandidates.filter((node) => node.category === selectedCategory)
1485
+ }
1486
+
1487
+ if (selectedDomain) {
1488
+ taskCandidates = taskCandidates.filter((node) =>
1489
+ node.domains.some((domain) => domain.toLowerCase() === selectedDomain.toLowerCase())
1490
+ )
1491
+ }
1492
+
1493
+ for (const node of taskCandidates) {
1494
+ for (const task of node.tasks) {
1495
+ taskSet.add(task)
1496
+ }
1497
+ }
1498
+
1499
+ return Array.from(taskSet).sort((a, b) => a.localeCompare(b)).slice(0, 40)
1500
+ }, [nodesMatchingSearch, selectedCategory, selectedDomain, selectedNodeKind])
1501
 
1502
  const allCategories = useMemo(() => {
1503
  const categorySet = new Set<string>()
 
1513
  )
1514
  }
1515
 
1516
+ if (selectedTask) {
1517
+ categoryCandidates = categoryCandidates.filter((node) =>
1518
+ node.tasks.some((task) => task.toLowerCase() === selectedTask.toLowerCase())
1519
+ )
1520
+ }
1521
+
1522
  for (const node of categoryCandidates) {
1523
  categorySet.add(node.category)
1524
  }
1525
 
1526
  return Array.from(categorySet).sort((a, b) => a.localeCompare(b))
1527
+ }, [nodesMatchingSearch, selectedDomain, selectedNodeKind, selectedTask])
1528
 
1529
  const filtered = useMemo(() => {
1530
  let list = [...nodesMatchingSearch]
 
1541
  )
1542
  }
1543
 
1544
+ if (selectedTask) {
1545
+ list = list.filter((node) =>
1546
+ node.tasks.some(
1547
+ (task) => task.toLowerCase() === selectedTask.toLowerCase()
1548
+ )
1549
+ )
1550
+ }
1551
+
1552
  if (selectedCategory) {
1553
  list = list.filter((node) => node.category === selectedCategory)
1554
  }
1555
 
1556
  list.sort((a, b) => a.title.localeCompare(b.title, undefined, { sensitivity: "base" }))
1557
  return list
1558
+ }, [nodesMatchingSearch, selectedCategory, selectedDomain, selectedNodeKind, selectedTask])
1559
 
1560
  useEffect(() => {
1561
  if (selectedDomain && !allDomains.includes(selectedDomain)) {
 
1563
  }
1564
  }, [allDomains, selectedDomain])
1565
 
1566
+ useEffect(() => {
1567
+ if (selectedTask && !allTasks.includes(selectedTask)) {
1568
+ setSelectedTask(null)
1569
+ }
1570
+ }, [allTasks, selectedTask])
1571
+
1572
  useEffect(() => {
1573
  if (selectedCategory && !allCategories.includes(selectedCategory)) {
1574
  setSelectedCategory(null)
 
1577
 
1578
  useEffect(() => {
1579
  setPage(1)
1580
+ }, [currentNodeId, searchQuery, selectedCategory, selectedDomain, selectedNodeKind, selectedTask])
1581
 
1582
  const pagedNodes = useMemo(
1583
  () => filtered.slice((page - 1) * PAGE_SIZE, page * PAGE_SIZE),
 
1585
  )
1586
 
1587
  const currentLevelKinds = Array.from(new Set(currentLevelNodes.map((node) => node.kind)))
1588
+ const activeFilterCount = [searchQuery.trim(), selectedDomain, selectedTask, selectedCategory, selectedNodeKind].filter(Boolean).length
1589
  const currentLevelLabel =
1590
  currentNodeId === null
1591
  ? "Rollout entry level"
 
1729
  onClick={() => {
1730
  setSearchQuery("")
1731
  setSelectedDomain(null)
1732
+ setSelectedTask(null)
1733
  setSelectedCategory(null)
1734
  setSelectedNodeKind(null)
1735
  }}
 
1795
  </div>
1796
  </div>
1797
 
1798
+ {hierarchy?.stats && (
1799
  <div className="flex flex-wrap gap-2 text-sm">
1800
  <span className="rounded-full border border-stone-200/80 bg-stone-50/80 px-3 py-1.5 font-medium text-stone-700 dark:border-stone-700/80 dark:bg-stone-900/70 dark:text-stone-200">
1801
  {hierarchy.stats.family_count} families
 
1919
  </div>
1920
  )}
1921
 
1922
+ {allTasks.length > 0 && (
1923
+ <div className="mt-4 space-y-1.5">
1924
+ <div className="text-[11px] font-semibold uppercase tracking-[0.2em] text-stone-500 dark:text-stone-400">
1925
+ Task type
1926
+ </div>
1927
+ <div className="flex max-h-40 flex-wrap items-center gap-1.5 overflow-y-auto pr-1">
1928
+ <button
1929
+ type="button"
1930
+ onClick={() => setSelectedTask(null)}
1931
+ className={cn(
1932
+ "shrink-0 rounded-full border px-3 py-1.5 text-xs font-medium transition-colors",
1933
+ selectedTask === null
1934
+ ? "border-stone-950 bg-stone-950 text-stone-50 dark:border-stone-100 dark:bg-stone-100 dark:text-stone-950"
1935
+ : "border-stone-200/80 bg-stone-50/80 text-stone-600 hover:bg-stone-100 dark:border-stone-700/80 dark:bg-stone-900/70 dark:text-stone-300 dark:hover:bg-stone-800"
1936
+ )}
1937
+ >
1938
+ All
1939
+ </button>
1940
+ {allTasks.map((task) => (
1941
+ <button
1942
+ key={task}
1943
+ type="button"
1944
+ onClick={() => setSelectedTask(selectedTask === task ? null : task)}
1945
+ className={cn(
1946
+ "shrink-0 rounded-full border px-3 py-1.5 text-xs font-medium transition-colors capitalize",
1947
+ selectedTask === task
1948
+ ? "border-emerald-300 bg-emerald-50 text-emerald-800 dark:border-emerald-800 dark:bg-emerald-950/50 dark:text-emerald-200"
1949
+ : "border-stone-200/80 bg-white text-stone-600 hover:bg-stone-50 dark:border-stone-700/80 dark:bg-stone-900 dark:text-stone-300 dark:hover:bg-stone-800"
1950
+ )}
1951
+ >
1952
+ {task}
1953
+ </button>
1954
+ ))}
1955
+ </div>
1956
+ </div>
1957
+ )}
1958
+
1959
  {allCategories.length > 0 && (
1960
  <div className="mt-4 space-y-1.5">
1961
  <div className="text-[11px] font-semibold uppercase tracking-[0.2em] text-stone-500 dark:text-stone-400">
 
2081
  {node.title}
2082
  </h3>
2083
 
2084
+ <NodeSignalChips node={node} />
2085
+
2086
  {node.description && (
2087
  <p className="mb-4 flex-1 text-sm leading-6 text-stone-600 line-clamp-3 dark:text-stone-300">
2088
  {node.description}
components/benchmark-detail.tsx CHANGED
@@ -15,6 +15,12 @@ import { Collapsible, CollapsibleContent, CollapsibleTrigger } from "@/component
15
  import { Dialog, DialogContent, DialogDescription, DialogHeader, DialogTitle } from "@/components/ui/dialog"
16
  import { Input } from "@/components/ui/input"
17
  import { Table, TableBody, TableCell, TableHead, TableHeader, TableRow } from "@/components/ui/table"
 
 
 
 
 
 
18
  import {
19
  DropdownMenu,
20
  DropdownMenuContent,
@@ -259,48 +265,6 @@ function getOrganizationDisplayName(value: string | null | undefined) {
259
  return normalizeDisplayLabel(value) || "Unknown Organization"
260
  }
261
 
262
- function getRelationshipDisplayName(value: string | null | undefined) {
263
- return normalizeDisplayLabel(value?.replace(/_/g, " ")) || "Unknown"
264
- }
265
-
266
- /**
267
- * Short, badge-friendly label for evaluator relationships.
268
- * Unknown / "other" values fall back to the normalized full name.
269
- */
270
- function getRelationshipShortLabel(value: string | null | undefined) {
271
- switch ((value ?? "").toLowerCase()) {
272
- case "first_party":
273
- return "1st party"
274
- case "third_party":
275
- return "3rd party"
276
- case "collaborative":
277
- return "Collaborative"
278
- case "other":
279
- return "Other"
280
- default:
281
- return getRelationshipDisplayName(value)
282
- }
283
- }
284
-
285
- /**
286
- * Tone classes for the relationship badge so readers can scan first-party
287
- * vs third-party reports at a glance without reading the text.
288
- */
289
- function getRelationshipBadgeTone(value: string | null | undefined): string {
290
- switch ((value ?? "").toLowerCase()) {
291
- case "first_party":
292
- // Self-reported by the model's developer — caution tone.
293
- return "border-amber-300 bg-amber-50 text-amber-900 dark:border-amber-900/60 dark:bg-amber-950/40 dark:text-amber-100"
294
- case "third_party":
295
- // Independently evaluated — confidence tone.
296
- return "border-emerald-300 bg-emerald-50 text-emerald-900 dark:border-emerald-900/60 dark:bg-emerald-950/40 dark:text-emerald-100"
297
- case "collaborative":
298
- return "border-sky-300 bg-sky-50 text-sky-900 dark:border-sky-900/60 dark:bg-sky-950/40 dark:text-sky-100"
299
- default:
300
- return "border-border/70 bg-muted/40 text-muted-foreground"
301
- }
302
- }
303
-
304
  function getSourceTypeDisplayName(value: string | null | undefined) {
305
  return normalizeDisplayLabel(value?.replace(/_/g, " ")) || "Unknown"
306
  }
@@ -1798,6 +1762,10 @@ export function BenchmarkDetail({
1798
  thirdPartyEvaluations,
1799
  }
1800
  }, [allEvaluations])
 
 
 
 
1801
 
1802
  const allCategoryResults = useMemo(
1803
  () =>
@@ -1868,14 +1836,14 @@ export function BenchmarkDetail({
1868
  }
1869
 
1870
  const reproducibilityCopy =
1871
- reportingStats.missingGenerationConfigs === 0
1872
  ? null
1873
- : reportingStats.missingGenerationConfigs === summary.total_evaluations
1874
  ? "How this model was prompted during testing is not documented. Scores cannot be independently confirmed."
1875
- : "How this model was prompted during testing is missing for some reported results. Score differences may not be fully attributable to model capability alone."
1876
 
1877
  const comparabilityCopy =
1878
- reportingStats.missingGenerationConfigs > 0
1879
  ? `${benchmarkCount > 0 ? `These results cover ${benchmarkCount} benchmark${benchmarkCount === 1 ? "" : "s"},` : "These results"} but missing prompting details mean apparent score gaps may partly reflect setup differences as well as capability.`
1880
  : "Shared benchmark coverage helps, but evaluator choices, benchmark mix, and model size can still limit direct apples-to-apples comparison."
1881
 
@@ -1898,9 +1866,10 @@ export function BenchmarkDetail({
1898
  allCategoryResults,
1899
  allEvaluations.length,
1900
  reportingStats,
 
 
1901
  summary.model_info.additional_details?.params_billions,
1902
  summary.model_info.name,
1903
- summary.total_evaluations,
1904
  ])
1905
 
1906
  const benchmarkGroups = useMemo(
@@ -3283,6 +3252,14 @@ export function BenchmarkDetail({
3283
  Mixed scale · renormalized
3284
  </span>
3285
  )}
 
 
 
 
 
 
 
 
3286
  </div>
3287
 
3288
  {/* Hero: title + developer + stat strip */}
@@ -4663,6 +4640,10 @@ function AggregatedBenchmarkCard({
4663
  Score
4664
  </div>
4665
  <div className="mt-1 text-lg font-semibold tracking-tight">{variant.displayScore}</div>
 
 
 
 
4666
  </div>
4667
 
4668
  <div className="min-w-0">
@@ -5200,7 +5181,10 @@ function BenchmarkDeepDiveDialogPanel({
5200
  )}
5201
  </div>
5202
  </TableCell>
5203
- <TableCell className="px-4 py-3 text-right align-top font-semibold tabular-nums">{variant.displayScore}</TableCell>
 
 
 
5204
  <TableCell className="px-4 py-3 text-right align-top tabular-nums text-muted-foreground">
5205
  {(variant.rankPosition != null || resolvedRank)
5206
  ? `#${resolvedRank?.position ?? variant.rankPosition}${(resolvedRank?.total ?? variant.rankTotal) ? `/${resolvedRank?.total ?? variant.rankTotal}` : ""}`
@@ -5265,7 +5249,10 @@ function BenchmarkDeepDiveDialogPanel({
5265
  )}
5266
  </div>
5267
  </TableCell>
5268
- <TableCell className="px-4 py-3 text-right align-top font-semibold tabular-nums">{variant.displayScore}</TableCell>
 
 
 
5269
  <TableCell className="px-4 py-3 text-right align-top tabular-nums text-muted-foreground">
5270
  {(variant.rankPosition != null || resolvedRank)
5271
  ? `#${resolvedRank?.position ?? variant.rankPosition}${(resolvedRank?.total ?? variant.rankTotal) ? `/${resolvedRank?.total ?? variant.rankTotal}` : ""}`
@@ -5379,10 +5366,15 @@ function VariantExpandedDetail({
5379
  <Badge variant="outline" className="font-normal">
5380
  {group.title}
5381
  </Badge>
5382
- <Badge variant="secondary" className="font-normal">
5383
- {variant.displayScore}
5384
- </Badge>
5385
- </div>
 
 
 
 
 
5386
  <div className="text-sm text-muted-foreground">{variant.result.metric_config.evaluation_description}</div>
5387
  </div>
5388
 
 
15
  import { Dialog, DialogContent, DialogDescription, DialogHeader, DialogTitle } from "@/components/ui/dialog"
16
  import { Input } from "@/components/ui/input"
17
  import { Table, TableBody, TableCell, TableHead, TableHeader, TableRow } from "@/components/ui/table"
18
+ import {
19
+ getRelationshipBadgeTone,
20
+ getRelationshipDisplayName,
21
+ getRelationshipShortLabel,
22
+ } from "@/components/signals/provenance-badge"
23
+ import { SignalsRowBadges } from "@/components/signals/signals-row-badges"
24
  import {
25
  DropdownMenu,
26
  DropdownMenuContent,
 
265
  return normalizeDisplayLabel(value) || "Unknown Organization"
266
  }
267
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
268
  function getSourceTypeDisplayName(value: string | null | undefined) {
269
  return normalizeDisplayLabel(value?.replace(/_/g, " ")) || "Unknown"
270
  }
 
1762
  thirdPartyEvaluations,
1763
  }
1764
  }, [allEvaluations])
1765
+ const reproducibilityGapCount =
1766
+ summary.reproducibility_summary?.has_reproducibility_gap_count ?? reportingStats.missingGenerationConfigs
1767
+ const reproducibilityResultsTotal =
1768
+ summary.reproducibility_summary?.results_total ?? summary.total_evaluations
1769
 
1770
  const allCategoryResults = useMemo(
1771
  () =>
 
1836
  }
1837
 
1838
  const reproducibilityCopy =
1839
+ reproducibilityGapCount === 0
1840
  ? null
1841
+ : reproducibilityGapCount === reproducibilityResultsTotal
1842
  ? "How this model was prompted during testing is not documented. Scores cannot be independently confirmed."
1843
+ : `${reproducibilityGapCount} of ${reproducibilityResultsTotal} reported scores are missing enough setup detail to be re-run as-is.`
1844
 
1845
  const comparabilityCopy =
1846
+ reproducibilityGapCount > 0
1847
  ? `${benchmarkCount > 0 ? `These results cover ${benchmarkCount} benchmark${benchmarkCount === 1 ? "" : "s"},` : "These results"} but missing prompting details mean apparent score gaps may partly reflect setup differences as well as capability.`
1848
  : "Shared benchmark coverage helps, but evaluator choices, benchmark mix, and model size can still limit direct apples-to-apples comparison."
1849
 
 
1866
  allCategoryResults,
1867
  allEvaluations.length,
1868
  reportingStats,
1869
+ reproducibilityGapCount,
1870
+ reproducibilityResultsTotal,
1871
  summary.model_info.additional_details?.params_billions,
1872
  summary.model_info.name,
 
1873
  ])
1874
 
1875
  const benchmarkGroups = useMemo(
 
3252
  Mixed scale · renormalized
3253
  </span>
3254
  )}
3255
+ {reproducibilityGapCount > 0 && (
3256
+ <span
3257
+ className="ml-1 inline-flex items-center rounded-full border border-amber-300 bg-amber-50 px-2 py-0.5 text-[10px] tracking-[0.12em] text-amber-900 dark:border-amber-900/60 dark:bg-amber-950/40 dark:text-amber-100"
3258
+ title={`${reproducibilityGapCount} of ${reproducibilityResultsTotal} reported scores are not fully documented.`}
3259
+ >
3260
+ Setup gaps
3261
+ </span>
3262
+ )}
3263
  </div>
3264
 
3265
  {/* Hero: title + developer + stat strip */}
 
4640
  Score
4641
  </div>
4642
  <div className="mt-1 text-lg font-semibold tracking-tight">{variant.displayScore}</div>
4643
+ <SignalsRowBadges
4644
+ annotations={variant.result.evalcards?.annotations}
4645
+ className="justify-start"
4646
+ />
4647
  </div>
4648
 
4649
  <div className="min-w-0">
 
5181
  )}
5182
  </div>
5183
  </TableCell>
5184
+ <TableCell className="px-4 py-3 text-right align-top font-semibold tabular-nums">
5185
+ <div>{variant.displayScore}</div>
5186
+ <SignalsRowBadges annotations={variant.result.evalcards?.annotations} />
5187
+ </TableCell>
5188
  <TableCell className="px-4 py-3 text-right align-top tabular-nums text-muted-foreground">
5189
  {(variant.rankPosition != null || resolvedRank)
5190
  ? `#${resolvedRank?.position ?? variant.rankPosition}${(resolvedRank?.total ?? variant.rankTotal) ? `/${resolvedRank?.total ?? variant.rankTotal}` : ""}`
 
5249
  )}
5250
  </div>
5251
  </TableCell>
5252
+ <TableCell className="px-4 py-3 text-right align-top font-semibold tabular-nums">
5253
+ <div>{variant.displayScore}</div>
5254
+ <SignalsRowBadges annotations={variant.result.evalcards?.annotations} />
5255
+ </TableCell>
5256
  <TableCell className="px-4 py-3 text-right align-top tabular-nums text-muted-foreground">
5257
  {(variant.rankPosition != null || resolvedRank)
5258
  ? `#${resolvedRank?.position ?? variant.rankPosition}${(resolvedRank?.total ?? variant.rankTotal) ? `/${resolvedRank?.total ?? variant.rankTotal}` : ""}`
 
5366
  <Badge variant="outline" className="font-normal">
5367
  {group.title}
5368
  </Badge>
5369
+ <Badge variant="secondary" className="font-normal">
5370
+ {variant.displayScore}
5371
+ </Badge>
5372
+ <SignalsRowBadges
5373
+ annotations={variant.result.evalcards?.annotations}
5374
+ className="mt-0 justify-start"
5375
+ hideOnMobile={false}
5376
+ />
5377
+ </div>
5378
  <div className="text-sm text-muted-foreground">{variant.result.metric_config.evaluation_description}</div>
5379
  </div>
5380
 
components/benchmark-evaluation-card.tsx CHANGED
@@ -5,6 +5,7 @@ import { useMemo } from "react"
5
  import { useAudienceMode } from "@/components/audience-mode-provider"
6
  import { useRouter } from "next/navigation"
7
  import {
 
8
  Award,
9
  ChevronDown,
10
  ChevronRight,
@@ -14,6 +15,7 @@ import {
14
  } from "lucide-react"
15
 
16
  import type { CategoryType } from "@/lib/benchmark-schema"
 
17
  import { getCategoryColor } from "@/lib/benchmark-schema"
18
  import type { BenchmarkCard } from "@/lib/benchmark-schema"
19
  import { lookupBenchmarkCard } from "@/lib/benchmark-metadata-utils"
@@ -59,6 +61,9 @@ export type BenchmarkEvaluationCardData = {
59
  max: number
60
  average: number | null
61
  }
 
 
 
62
 
63
  top_scores: Array<{
64
  benchmark: string
@@ -262,6 +267,8 @@ export function BenchmarkEvaluationCard({
262
  const scoreRange = [formatScoreValue(data.score_summary?.min), formatScoreValue(data.score_summary?.max)]
263
  .filter((value): value is string => Boolean(value))
264
  .join(" to ")
 
 
265
 
266
  return (
267
  <Card
@@ -297,6 +304,12 @@ export function BenchmarkEvaluationCard({
297
  {paramsBillions && <Badge variant="secondary">{paramsBillions} parameters</Badge>}
298
  <Badge variant="outline">{data.benchmarks_count} benchmark suites</Badge>
299
  <Badge variant="outline">{data.evaluations_count} reported results</Badge>
 
 
 
 
 
 
300
  </div>
301
  </div>
302
 
@@ -447,6 +460,12 @@ export function BenchmarkEvaluationCard({
447
  {data.source_types.length > 0 && (
448
  <KeyValueRow label="Artifact type" value={data.source_types.map((s) => s.replace(/_/g, " ")).join(", ")} />
449
  )}
 
 
 
 
 
 
450
  </div>
451
  </CollapsibleContent>
452
  </Collapsible>
 
5
  import { useAudienceMode } from "@/components/audience-mode-provider"
6
  import { useRouter } from "next/navigation"
7
  import {
8
+ AlertTriangle,
9
  Award,
10
  ChevronDown,
11
  ChevronRight,
 
15
  } from "lucide-react"
16
 
17
  import type { CategoryType } from "@/lib/benchmark-schema"
18
+ import type { SignalSummaries } from "@/lib/backend-artifacts"
19
  import { getCategoryColor } from "@/lib/benchmark-schema"
20
  import type { BenchmarkCard } from "@/lib/benchmark-schema"
21
  import { lookupBenchmarkCard } from "@/lib/benchmark-metadata-utils"
 
61
  max: number
62
  average: number | null
63
  }
64
+ reproducibility_summary?: SignalSummaries["reproducibility_summary"]
65
+ provenance_summary?: SignalSummaries["provenance_summary"]
66
+ comparability_summary?: SignalSummaries["comparability_summary"]
67
 
68
  top_scores: Array<{
69
  benchmark: string
 
267
  const scoreRange = [formatScoreValue(data.score_summary?.min), formatScoreValue(data.score_summary?.max)]
268
  .filter((value): value is string => Boolean(value))
269
  .join(" to ")
270
+ const reproducibilityGapCount = data.reproducibility_summary?.has_reproducibility_gap_count ?? 0
271
+ const reproducibilityTotal = data.reproducibility_summary?.results_total ?? data.evaluations_count
272
 
273
  return (
274
  <Card
 
304
  {paramsBillions && <Badge variant="secondary">{paramsBillions} parameters</Badge>}
305
  <Badge variant="outline">{data.benchmarks_count} benchmark suites</Badge>
306
  <Badge variant="outline">{data.evaluations_count} reported results</Badge>
307
+ {reproducibilityGapCount > 0 && (
308
+ <Badge className="border-amber-300 bg-amber-50 text-amber-900 hover:bg-amber-50 dark:border-amber-900/60 dark:bg-amber-950/40 dark:text-amber-100">
309
+ <AlertTriangle className="h-3 w-3" />
310
+ {reproducibilityGapCount} setup gaps
311
+ </Badge>
312
+ )}
313
  </div>
314
  </div>
315
 
 
460
  {data.source_types.length > 0 && (
461
  <KeyValueRow label="Artifact type" value={data.source_types.map((s) => s.replace(/_/g, " ")).join(", ")} />
462
  )}
463
+ {reproducibilityGapCount > 0 && (
464
+ <KeyValueRow
465
+ label="Re-runnability"
466
+ value={`${reproducibilityGapCount} of ${reproducibilityTotal} reported scores are not fully documented`}
467
+ />
468
+ )}
469
  </div>
470
  </CollapsibleContent>
471
  </Collapsible>
components/eval-card.tsx CHANGED
@@ -79,6 +79,11 @@ export function EvalCard({ summary, delayMs = 0 }: EvalCardProps) {
79
  const domainPreview = domains.slice(0, 2)
80
  // Source provenance pulled from the pipeline's source_data
81
  const sourceData = summary.source_data
 
 
 
 
 
82
  const datasetName = sourceData?.dataset_name
83
  const datasetUrl =
84
  sourceData?.dataset_url ??
@@ -129,10 +134,10 @@ export function EvalCard({ summary, delayMs = 0 }: EvalCardProps) {
129
  Independently evaluated
130
  </Badge>
131
  )}
132
- {summary.missing_generation_config_count > 0 && (
133
  <Badge className="bg-amber-500 text-amber-950 hover:bg-amber-500">
134
  <AlertTriangle className="mr-1 h-3 w-3" />
135
- Partial config
136
  </Badge>
137
  )}
138
  </div>
@@ -182,8 +187,8 @@ export function EvalCard({ summary, delayMs = 0 }: EvalCardProps) {
182
  <DataRow
183
  label="Config"
184
  value={
185
- summary.missing_generation_config_count > 0
186
- ? `${summary.missing_generation_config_count} result${summary.missing_generation_config_count !== 1 ? "s" : ""} without config`
187
  : "Fully documented"
188
  }
189
  />
@@ -245,9 +250,9 @@ export function EvalCard({ summary, delayMs = 0 }: EvalCardProps) {
245
  <div className="space-y-1.5 text-sm">
246
  <DataRow label="Avg score" value={scorePercent} />
247
  <DataRow label="Reported by" value={summary.evaluator_names.join(", ") || "Unknown"} />
248
- {summary.missing_generation_config_count > 0 && (
249
  <p className="pt-1 text-xs text-muted-foreground">
250
- Some results lack generation settings; compare scores with care.
251
  </p>
252
  )}
253
  </div>
 
79
  const domainPreview = domains.slice(0, 2)
80
  // Source provenance pulled from the pipeline's source_data
81
  const sourceData = summary.source_data
82
+ const reproducibilitySummary = summary.reproducibility_summary
83
+ const reproducibilityGapCount =
84
+ reproducibilitySummary?.has_reproducibility_gap_count ?? summary.missing_generation_config_count
85
+ const reproducibilityResultsTotal =
86
+ reproducibilitySummary?.results_total ?? summary.models_count
87
  const datasetName = sourceData?.dataset_name
88
  const datasetUrl =
89
  sourceData?.dataset_url ??
 
134
  Independently evaluated
135
  </Badge>
136
  )}
137
+ {reproducibilityGapCount > 0 && (
138
  <Badge className="bg-amber-500 text-amber-950 hover:bg-amber-500">
139
  <AlertTriangle className="mr-1 h-3 w-3" />
140
+ Documentation gaps
141
  </Badge>
142
  )}
143
  </div>
 
187
  <DataRow
188
  label="Config"
189
  value={
190
+ reproducibilityGapCount > 0
191
+ ? `${reproducibilityGapCount} of ${reproducibilityResultsTotal} scores have setup gaps`
192
  : "Fully documented"
193
  }
194
  />
 
250
  <div className="space-y-1.5 text-sm">
251
  <DataRow label="Avg score" value={scorePercent} />
252
  <DataRow label="Reported by" value={summary.evaluator_names.join(", ") || "Unknown"} />
253
+ {reproducibilityGapCount > 0 && (
254
  <p className="pt-1 text-xs text-muted-foreground">
255
+ {reproducibilityGapCount} of {reproducibilityResultsTotal} reported scores are not fully documented.
256
  </p>
257
  )}
258
  </div>
components/eval-detail.tsx CHANGED
@@ -5,8 +5,22 @@ import { Fragment, useEffect, useMemo, useState } from "react"
5
  import Link from "next/link"
6
  import { Badge } from "@/components/ui/badge"
7
  import { Button } from "@/components/ui/button"
 
 
 
 
 
 
8
  import { Card, CardContent, CardDescription, CardHeader, CardTitle } from "@/components/ui/card"
9
  import { Collapsible, CollapsibleContent, CollapsibleTrigger } from "@/components/ui/collapsible"
 
 
 
 
 
 
 
 
10
  import {
11
  DropdownMenu,
12
  DropdownMenuCheckboxItem,
@@ -32,9 +46,11 @@ import {
32
  Globe,
33
  Medal,
34
  Scale,
 
35
  Shield,
36
  SlidersHorizontal,
37
  Tag,
 
38
  } from "lucide-react"
39
  import type { BenchmarkCard } from "@/lib/benchmark-schema"
40
  import type { BenchmarkEvalSummary, ModelResultForBenchmark } from "@/lib/eval-processing"
@@ -53,6 +69,212 @@ interface LeaderboardRow {
53
  type LeaderboardMetric = NonNullable<BenchmarkEvalSummary["leaderboard_metrics"]>[number]
54
  type LeaderboardMatrixRow = NonNullable<BenchmarkEvalSummary["leaderboard_rows"]>[number]
55
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  const PARAM_RANGE_VALUES = [1, 2, 3, 4, 6, 8, 10, 12, 16, 24, 32, 48, 64, 96, 128, 192, 256, 384, 500] as const
57
  const PARAM_RANGE_MARKERS = [
58
  { label: "< 1B", step: 0 },
@@ -400,6 +622,11 @@ export function EvalDetail({ summary }: EvalDetailProps) {
400
  : summary.is_aggregated
401
  ? "Averaged model results across the contributing composite benchmarks, with drill-down to each component score."
402
  : "Model results with benchmark context, source dataset detail, and optional instance-data links."
 
 
 
 
 
403
 
404
  const toggleRow = (key: string) =>
405
  setExpandedRows((current) => ({
@@ -430,6 +657,15 @@ export function EvalDetail({ summary }: EvalDetailProps) {
430
  ? `${summary.metrics_count ?? summary.leaderboard_metrics?.length ?? 1} measures`
431
  : `${summary.metrics_count ?? 1} ${(summary.metrics_count ?? 1) === 1 ? "measure" : "measures"}`}
432
  </Badge>
 
 
 
 
 
 
 
 
 
433
  </div>
434
  </div>
435
  {overviewOpen ? (
@@ -580,6 +816,12 @@ export function EvalDetail({ summary }: EvalDetailProps) {
580
  </dl>
581
  </div>
582
 
 
 
 
 
 
 
583
  {!hasMultiMetricLeaderboard && (summary.root_metrics?.length || summary.subtasks?.length) ? (
584
  <section className="rounded-2xl border bg-muted/5 p-3.5">
585
  <div className="space-y-1">
@@ -812,10 +1054,14 @@ export function EvalDetail({ summary }: EvalDetailProps) {
812
  const samples = Array.isArray(modelResult.source_data)
813
  ? undefined
814
  : modelResult.source_data.samples_number
 
815
 
816
  return (
817
  <Fragment key={key}>
818
- <TableRow className={cn("group", isExpanded && "bg-muted/15")}>
 
 
 
819
  <TableCell className="px-4">
820
  <div
821
  className={cn(
@@ -868,6 +1114,7 @@ export function EvalDetail({ summary }: EvalDetailProps) {
868
 
869
  <TableCell className="text-right">
870
  <div className="text-xl font-semibold tabular-nums">{formatRawScore(modelResult.score, summary.metric_config.unit)}</div>
 
871
  </TableCell>
872
 
873
  {isResearchView ? (
@@ -997,6 +1244,8 @@ export function EvalDetail({ summary }: EvalDetailProps) {
997
  )}
998
  </DetailPanel>
999
 
 
 
1000
  <DetailPanel
1001
  title={isResearchView ? "Score Breakdown" : "Metric Summary"}
1002
  subtitle={
@@ -1183,7 +1432,14 @@ function MultiMetricLeaderboard({
1183
  const leaderboardMetrics = summary.leaderboard_metrics ?? []
1184
  const leaderboardRows = summary.leaderboard_rows ?? []
1185
  const allMetricKeys = useMemo(() => leaderboardMetrics.map((metric) => metric.column_key), [leaderboardMetrics])
1186
- const [visibleMetricKeys, setVisibleMetricKeys] = useState<string[]>(() => leaderboardMetrics.map((metric) => metric.column_key))
 
 
 
 
 
 
 
1187
  const maxParamStepIndex = PARAM_RANGE_VALUES.length - 1
1188
  const leaderboardMetricMap = useMemo(
1189
  () => new Map(leaderboardMetrics.map((metric) => [metric.column_key, metric])),
@@ -1333,8 +1589,8 @@ function MultiMetricLeaderboard({
1333
  }, [maxParamStep, minParamStep, sortDirection, sortKey])
1334
 
1335
  useEffect(() => {
1336
- setVisibleMetricKeys(allMetricKeys)
1337
- }, [allMetricKeys, summary.evaluation_id])
1338
 
1339
  useEffect(() => {
1340
  setActiveSubtaskTab("all")
@@ -1521,30 +1777,11 @@ function MultiMetricLeaderboard({
1521
  <CardContent className="p-0">
1522
  {hasSubtaskTabs && (
1523
  <div className="border-b bg-background px-5 py-3 sm:px-6">
1524
- <div className="mb-2 text-[11px] font-semibold uppercase tracking-[0.16em] text-muted-foreground">
1525
- Benchmark slices
1526
- </div>
1527
- <div className="flex flex-wrap gap-2">
1528
- <Button
1529
- type="button"
1530
- size="sm"
1531
- variant={activeSubtaskTab === "all" ? "default" : "outline"}
1532
- onClick={() => setActiveSubtaskTab("all")}
1533
- >
1534
- All slices
1535
- </Button>
1536
- {singleMetricSubtaskTabs.map((tab) => (
1537
- <Button
1538
- key={tab.key}
1539
- type="button"
1540
- size="sm"
1541
- variant={activeSubtaskTab === tab.key ? "default" : "outline"}
1542
- onClick={() => setActiveSubtaskTab(tab.key)}
1543
- >
1544
- {tab.label}
1545
- </Button>
1546
- ))}
1547
- </div>
1548
  </div>
1549
  )}
1550
 
@@ -1739,6 +1976,12 @@ function MultiMetricLeaderboard({
1739
  )}
1740
  <span className="lg:hidden">{row.model_info.developer ?? "Unknown developer"}</span>
1741
  </div>
 
 
 
 
 
 
1742
  </div>
1743
  </TableCell>
1744
 
@@ -1754,6 +1997,7 @@ function MultiMetricLeaderboard({
1754
 
1755
  {visibleMetrics.map((metric) => {
1756
  const score = row.values[metric.column_key]
 
1757
  return (
1758
  <TableCell
1759
  key={metric.column_key}
@@ -1762,7 +2006,8 @@ function MultiMetricLeaderboard({
1762
  !isNumericScore(score) && "text-muted-foreground"
1763
  )}
1764
  >
1765
- {isNumericScore(score) ? formatRawScore(score, metric.unit) : "—"}
 
1766
  </TableCell>
1767
  )
1768
  })}
 
5
  import Link from "next/link"
6
  import { Badge } from "@/components/ui/badge"
7
  import { Button } from "@/components/ui/button"
8
+ import { CompletenessPanel } from "@/components/signals/completeness-panel"
9
+ import { ComparabilityPanel } from "@/components/signals/comparability-panel"
10
+ import { ReproducibilityPanel } from "@/components/signals/reproducibility-panel"
11
+ import { SignalsRowBadges } from "@/components/signals/signals-row-badges"
12
+ import { SignalTooltip } from "@/components/signals/signal-tooltip"
13
+ import { getCompletenessPopulatedCount } from "@/components/signals/signal-utils"
14
  import { Card, CardContent, CardDescription, CardHeader, CardTitle } from "@/components/ui/card"
15
  import { Collapsible, CollapsibleContent, CollapsibleTrigger } from "@/components/ui/collapsible"
16
+ import {
17
+ Dialog,
18
+ DialogContent,
19
+ DialogDescription,
20
+ DialogHeader,
21
+ DialogTitle,
22
+ } from "@/components/ui/dialog"
23
+ import { Input } from "@/components/ui/input"
24
  import {
25
  DropdownMenu,
26
  DropdownMenuCheckboxItem,
 
46
  Globe,
47
  Medal,
48
  Scale,
49
+ Search,
50
  Shield,
51
  SlidersHorizontal,
52
  Tag,
53
+ X,
54
  } from "lucide-react"
55
  import type { BenchmarkCard } from "@/lib/benchmark-schema"
56
  import type { BenchmarkEvalSummary, ModelResultForBenchmark } from "@/lib/eval-processing"
 
69
  type LeaderboardMetric = NonNullable<BenchmarkEvalSummary["leaderboard_metrics"]>[number]
70
  type LeaderboardMatrixRow = NonNullable<BenchmarkEvalSummary["leaderboard_rows"]>[number]
71
 
72
+ /**
73
+ * Pick a representative row-level annotation for the matrix view.
74
+ *
75
+ * Reproducibility and provenance are typically constant across all metrics for
76
+ * a given (model, benchmark) pair, so rendering them in every cell is just
77
+ * noise. This helper grabs the first non-null annotation across visible metrics
78
+ * and returns it for the row-level badge strip.
79
+ */
80
+ function getRowLevelAnnotations(
81
+ row: LeaderboardMatrixRow,
82
+ visibleMetrics: LeaderboardMetric[]
83
+ ) {
84
+ const annotationsByMetric = row.annotations_by_metric
85
+ if (!annotationsByMetric) {
86
+ return null
87
+ }
88
+
89
+ for (const metric of visibleMetrics) {
90
+ const annotations = annotationsByMetric[metric.column_key]
91
+ if (annotations) {
92
+ return annotations
93
+ }
94
+ }
95
+
96
+ return null
97
+ }
98
+
99
+ const SLICE_PILL_THRESHOLD = 5
100
+
101
+ interface SliceTab {
102
+ key: string
103
+ label: string
104
+ }
105
+
106
+ /**
107
+ * Slice picker that adapts to slice count.
108
+ *
109
+ * - <= SLICE_PILL_THRESHOLD: render every slice as a pill (current familiar UX).
110
+ * - > SLICE_PILL_THRESHOLD: render "All slices" + currently-selected pill +
111
+ * a "Browse N slices" button that opens a searchable dialog. Hundreds of
112
+ * subtasks (e.g. AIRBench's 374) fit cleanly.
113
+ */
114
+ function SliceSelector({
115
+ activeSubtaskTab,
116
+ onChange,
117
+ tabs,
118
+ }: {
119
+ activeSubtaskTab: string
120
+ onChange: (key: string) => void
121
+ tabs: SliceTab[]
122
+ }) {
123
+ const [browserOpen, setBrowserOpen] = useState(false)
124
+ const [search, setSearch] = useState("")
125
+
126
+ const useBrowser = tabs.length > SLICE_PILL_THRESHOLD
127
+ const activeTab = tabs.find((tab) => tab.key === activeSubtaskTab)
128
+
129
+ const filteredTabs = useMemo(() => {
130
+ const query = search.trim().toLowerCase()
131
+ if (!query) return tabs
132
+ return tabs.filter((tab) => tab.label.toLowerCase().includes(query))
133
+ }, [search, tabs])
134
+
135
+ if (!useBrowser) {
136
+ return (
137
+ <div>
138
+ <div className="mb-2 text-[11px] font-semibold uppercase tracking-[0.16em] text-muted-foreground">
139
+ Benchmark slices
140
+ </div>
141
+ <div className="flex flex-wrap gap-2">
142
+ <Button
143
+ type="button"
144
+ size="sm"
145
+ variant={activeSubtaskTab === "all" ? "default" : "outline"}
146
+ onClick={() => onChange("all")}
147
+ >
148
+ All slices
149
+ </Button>
150
+ {tabs.map((tab) => (
151
+ <Button
152
+ key={tab.key}
153
+ type="button"
154
+ size="sm"
155
+ variant={activeSubtaskTab === tab.key ? "default" : "outline"}
156
+ onClick={() => onChange(tab.key)}
157
+ >
158
+ {tab.label}
159
+ </Button>
160
+ ))}
161
+ </div>
162
+ </div>
163
+ )
164
+ }
165
+
166
+ return (
167
+ <div>
168
+ <div className="mb-2 flex items-center justify-between gap-2">
169
+ <div className="text-[11px] font-semibold uppercase tracking-[0.16em] text-muted-foreground">
170
+ Benchmark slices
171
+ </div>
172
+ <span className="text-xs text-muted-foreground">{tabs.length} total</span>
173
+ </div>
174
+ <div className="flex flex-wrap items-center gap-2">
175
+ <Button
176
+ type="button"
177
+ size="sm"
178
+ variant={activeSubtaskTab === "all" ? "default" : "outline"}
179
+ onClick={() => onChange("all")}
180
+ >
181
+ All slices
182
+ </Button>
183
+ {activeTab && (
184
+ <Button
185
+ type="button"
186
+ size="sm"
187
+ variant="default"
188
+ onClick={() => onChange("all")}
189
+ className="max-w-[18rem] truncate"
190
+ title={`Active: ${activeTab.label}. Click to clear.`}
191
+ >
192
+ {activeTab.label}
193
+ <X className="ml-1.5 h-3 w-3 shrink-0" />
194
+ </Button>
195
+ )}
196
+ <Button
197
+ type="button"
198
+ size="sm"
199
+ variant="outline"
200
+ onClick={() => setBrowserOpen(true)}
201
+ className="gap-1.5"
202
+ >
203
+ <Search className="h-3.5 w-3.5" />
204
+ {activeTab ? "Change slice" : `Browse ${tabs.length} slices`}
205
+ </Button>
206
+ </div>
207
+
208
+ <Dialog
209
+ open={browserOpen}
210
+ onOpenChange={(open) => {
211
+ setBrowserOpen(open)
212
+ if (!open) setSearch("")
213
+ }}
214
+ >
215
+ <DialogContent className="max-w-2xl">
216
+ <DialogHeader>
217
+ <DialogTitle>Browse benchmark slices</DialogTitle>
218
+ <DialogDescription>
219
+ {tabs.length} slices in this benchmark. Pick one to filter the leaderboard,
220
+ or close to keep showing all slices.
221
+ </DialogDescription>
222
+ </DialogHeader>
223
+
224
+ <Input
225
+ value={search}
226
+ onChange={(event) => setSearch(event.target.value)}
227
+ placeholder="Search slices..."
228
+ autoFocus
229
+ />
230
+
231
+ <div className="max-h-[60vh] overflow-y-auto rounded-md border">
232
+ <button
233
+ type="button"
234
+ onClick={() => {
235
+ onChange("all")
236
+ setBrowserOpen(false)
237
+ }}
238
+ className={cn(
239
+ "flex w-full items-center justify-between border-b px-4 py-2.5 text-left text-sm transition-colors hover:bg-muted/40",
240
+ activeSubtaskTab === "all" && "bg-muted/40 font-semibold"
241
+ )}
242
+ >
243
+ <span>All slices (no filter)</span>
244
+ {activeSubtaskTab === "all" && <span className="text-xs text-muted-foreground">selected</span>}
245
+ </button>
246
+ {filteredTabs.length === 0 ? (
247
+ <div className="px-4 py-6 text-center text-sm text-muted-foreground">
248
+ No slices match "{search}".
249
+ </div>
250
+ ) : (
251
+ filteredTabs.map((tab) => (
252
+ <button
253
+ key={tab.key}
254
+ type="button"
255
+ onClick={() => {
256
+ onChange(tab.key)
257
+ setBrowserOpen(false)
258
+ }}
259
+ className={cn(
260
+ "flex w-full items-center justify-between border-b px-4 py-2 text-left text-sm transition-colors hover:bg-muted/40 last:border-b-0",
261
+ activeSubtaskTab === tab.key && "bg-muted/40 font-semibold"
262
+ )}
263
+ >
264
+ <span className="min-w-0 truncate pr-2">{tab.label}</span>
265
+ {activeSubtaskTab === tab.key && (
266
+ <span className="shrink-0 text-xs text-muted-foreground">selected</span>
267
+ )}
268
+ </button>
269
+ ))
270
+ )}
271
+ </div>
272
+ </DialogContent>
273
+ </Dialog>
274
+ </div>
275
+ )
276
+ }
277
+
278
  const PARAM_RANGE_VALUES = [1, 2, 3, 4, 6, 8, 10, 12, 16, 24, 32, 48, 64, 96, 128, 192, 256, 384, 500] as const
279
  const PARAM_RANGE_MARKERS = [
280
  { label: "< 1B", step: 0 },
 
622
  : summary.is_aggregated
623
  ? "Averaged model results across the contributing composite benchmarks, with drill-down to each component score."
624
  : "Model results with benchmark context, source dataset detail, and optional instance-data links."
625
+ const reportingCompleteness = summary.evalcards?.annotations?.reporting_completeness
626
+ const benchmarkComparability = summary.evalcards?.annotations?.benchmark_comparability
627
+ const documentationPopulatedCount = reportingCompleteness
628
+ ? getCompletenessPopulatedCount(reportingCompleteness)
629
+ : null
630
 
631
  const toggleRow = (key: string) =>
632
  setExpandedRows((current) => ({
 
657
  ? `${summary.metrics_count ?? summary.leaderboard_metrics?.length ?? 1} measures`
658
  : `${summary.metrics_count ?? 1} ${(summary.metrics_count ?? 1) === 1 ? "measure" : "measures"}`}
659
  </Badge>
660
+ {reportingCompleteness && (
661
+ <SignalTooltip
662
+ content={`${documentationPopulatedCount} of ${reportingCompleteness.total_fields_evaluated} EvalCards documentation fields populated for this benchmark.`}
663
+ >
664
+ <Badge variant="outline" className="border-emerald-200 bg-emerald-50 text-emerald-800 dark:border-emerald-900/50 dark:bg-emerald-950/30 dark:text-emerald-200">
665
+ Documentation {Math.round(reportingCompleteness.completeness_score * 100)}%
666
+ </Badge>
667
+ </SignalTooltip>
668
+ )}
669
  </div>
670
  </div>
671
  {overviewOpen ? (
 
816
  </dl>
817
  </div>
818
 
819
+ <CompletenessPanel completeness={reportingCompleteness} />
820
+ <ComparabilityPanel
821
+ comparability={benchmarkComparability}
822
+ summary={summary.comparability_summary}
823
+ />
824
+
825
  {!hasMultiMetricLeaderboard && (summary.root_metrics?.length || summary.subtasks?.length) ? (
826
  <section className="rounded-2xl border bg-muted/5 p-3.5">
827
  <div className="space-y-1">
 
1054
  const samples = Array.isArray(modelResult.source_data)
1055
  ? undefined
1056
  : modelResult.source_data.samples_number
1057
+ const rowAnnotations = modelResult.result.evalcards?.annotations
1058
 
1059
  return (
1060
  <Fragment key={key}>
1061
+ <TableRow
1062
+ id={modelResult.model_route_id ? `row-${modelResult.model_route_id}` : undefined}
1063
+ className={cn("group", isExpanded && "bg-muted/15")}
1064
+ >
1065
  <TableCell className="px-4">
1066
  <div
1067
  className={cn(
 
1114
 
1115
  <TableCell className="text-right">
1116
  <div className="text-xl font-semibold tabular-nums">{formatRawScore(modelResult.score, summary.metric_config.unit)}</div>
1117
+ <SignalsRowBadges annotations={rowAnnotations} />
1118
  </TableCell>
1119
 
1120
  {isResearchView ? (
 
1244
  )}
1245
  </DetailPanel>
1246
 
1247
+ <ReproducibilityPanel gap={rowAnnotations?.reproducibility_gap} />
1248
+
1249
  <DetailPanel
1250
  title={isResearchView ? "Score Breakdown" : "Metric Summary"}
1251
  subtitle={
 
1432
  const leaderboardMetrics = summary.leaderboard_metrics ?? []
1433
  const leaderboardRows = summary.leaderboard_rows ?? []
1434
  const allMetricKeys = useMemo(() => leaderboardMetrics.map((metric) => metric.column_key), [leaderboardMetrics])
1435
+ // Cap default visible columns to avoid hangs on benchmarks with hundreds of metrics
1436
+ // (e.g. helm_air_bench has 374 subtask×metric pairs). Users can opt in to more.
1437
+ const DEFAULT_VISIBLE_METRIC_CAP = 24
1438
+ const defaultVisibleMetricKeys = useMemo(
1439
+ () => allMetricKeys.slice(0, DEFAULT_VISIBLE_METRIC_CAP),
1440
+ [allMetricKeys]
1441
+ )
1442
+ const [visibleMetricKeys, setVisibleMetricKeys] = useState<string[]>(() => defaultVisibleMetricKeys)
1443
  const maxParamStepIndex = PARAM_RANGE_VALUES.length - 1
1444
  const leaderboardMetricMap = useMemo(
1445
  () => new Map(leaderboardMetrics.map((metric) => [metric.column_key, metric])),
 
1589
  }, [maxParamStep, minParamStep, sortDirection, sortKey])
1590
 
1591
  useEffect(() => {
1592
+ setVisibleMetricKeys(defaultVisibleMetricKeys)
1593
+ }, [defaultVisibleMetricKeys, summary.evaluation_id])
1594
 
1595
  useEffect(() => {
1596
  setActiveSubtaskTab("all")
 
1777
  <CardContent className="p-0">
1778
  {hasSubtaskTabs && (
1779
  <div className="border-b bg-background px-5 py-3 sm:px-6">
1780
+ <SliceSelector
1781
+ activeSubtaskTab={activeSubtaskTab}
1782
+ onChange={setActiveSubtaskTab}
1783
+ tabs={singleMetricSubtaskTabs}
1784
+ />
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1785
  </div>
1786
  )}
1787
 
 
1976
  )}
1977
  <span className="lg:hidden">{row.model_info.developer ?? "Unknown developer"}</span>
1978
  </div>
1979
+ <SignalsRowBadges
1980
+ annotations={getRowLevelAnnotations(row, visibleMetrics)}
1981
+ variant="row"
1982
+ className="mt-1 justify-start"
1983
+ hideOnMobile={false}
1984
+ />
1985
  </div>
1986
  </TableCell>
1987
 
 
1997
 
1998
  {visibleMetrics.map((metric) => {
1999
  const score = row.values[metric.column_key]
2000
+ const annotations = row.annotations_by_metric?.[metric.column_key]
2001
  return (
2002
  <TableCell
2003
  key={metric.column_key}
 
2006
  !isNumericScore(score) && "text-muted-foreground"
2007
  )}
2008
  >
2009
+ <div>{isNumericScore(score) ? formatRawScore(score, metric.unit) : "—"}</div>
2010
+ <SignalsRowBadges annotations={annotations} variant="cell" />
2011
  </TableCell>
2012
  )
2013
  })}
components/model-compare-dialog.tsx CHANGED
@@ -133,6 +133,7 @@ const CONTEXT_ROWS = [
133
  { key: "benchmarks", label: "Benchmark coverage" },
134
  { key: "variants", label: "Versions" },
135
  { key: "score_summary", label: "Score range" },
 
136
  { key: "latest", label: "Latest summary" },
137
  { key: "updated", label: "Updated" },
138
  ] as const
@@ -409,6 +410,20 @@ export function ModelCompareDialog({
409
  </div>
410
  </div>
411
  ) : null}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
412
  {row.key === "latest" ? (
413
  <div className="flex items-center gap-2">
414
  <span>{model.latest_source_name || `${model.benchmarks_count} benchmark suites summarized`}</span>
 
133
  { key: "benchmarks", label: "Benchmark coverage" },
134
  { key: "variants", label: "Versions" },
135
  { key: "score_summary", label: "Score range" },
136
+ { key: "reproducibility", label: "Re-runnability" },
137
  { key: "latest", label: "Latest summary" },
138
  { key: "updated", label: "Updated" },
139
  ] as const
 
410
  </div>
411
  </div>
412
  ) : null}
413
+ {row.key === "reproducibility" ? (
414
+ model.reproducibility_summary && model.reproducibility_summary.has_reproducibility_gap_count > 0 ? (
415
+ <div className="space-y-1">
416
+ <div className="font-medium">
417
+ {model.reproducibility_summary.has_reproducibility_gap_count} setup gaps
418
+ </div>
419
+ <div className="text-sm text-muted-foreground">
420
+ Out of {model.reproducibility_summary.results_total} reported scores
421
+ </div>
422
+ </div>
423
+ ) : (
424
+ <span className="text-muted-foreground">No setup gaps reported</span>
425
+ )
426
+ ) : null}
427
  {row.key === "latest" ? (
428
  <div className="flex items-center gap-2">
429
  <span>{model.latest_source_name || `${model.benchmarks_count} benchmark suites summarized`}</span>
components/navigation.tsx CHANGED
@@ -38,6 +38,12 @@ export function Navigation() {
38
  icon: BarChart3,
39
  isActive: pathname === "/evals" || pathname?.startsWith("/evals/")
40
  },
 
 
 
 
 
 
41
  {
42
  href: "/survey",
43
  label: "Survey",
 
38
  icon: BarChart3,
39
  isActive: pathname === "/evals" || pathname?.startsWith("/evals/")
40
  },
41
+ {
42
+ href: "/corpus",
43
+ label: "Corpus",
44
+ icon: FlaskConical,
45
+ isActive: pathname === "/corpus" || pathname?.startsWith("/corpus/")
46
+ },
47
  {
48
  href: "/survey",
49
  label: "Survey",
components/signals/comparability-panel.tsx ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ "use client"
2
+
3
+ import type { ReactNode } from "react"
4
+ import { ChevronDown, GitCompareArrows, UsersRound } from "lucide-react"
5
+
6
+ import { useAudienceMode } from "@/components/audience-mode-provider"
7
+ import { Badge } from "@/components/ui/badge"
8
+ import { Collapsible, CollapsibleContent, CollapsibleTrigger } from "@/components/ui/collapsible"
9
+ import type { BenchmarkComparability, ComparabilitySummary, DifferingSetupField } from "@/lib/backend-artifacts"
10
+ import {
11
+ formatFieldLabel,
12
+ formatSignalNumber,
13
+ formatSignalValue,
14
+ } from "./signal-utils"
15
+
16
/**
 * Panel summarising comparability signals for a single benchmark.
 *
 * Lists the variant-divergence and cross-party-divergence groups found on
 * `comparability`, and shows the check counts from `summary` when provided.
 * Returns `null` when there are no groups of either kind and `summary` does
 * not report exactly zero cross-party checks.
 *
 * @param comparability - Benchmark comparability annotation; `null` or
 *   `undefined` is treated as having no divergence groups.
 * @param summary - Optional check counts. A `groups_with_cross_party_check`
 *   of `0` triggers the "no third-party reports" note.
 */
export function ComparabilityPanel({
  comparability,
  summary,
}: {
  comparability?: BenchmarkComparability | null
  summary?: ComparabilitySummary
}) {
  const { mode } = useAudienceMode()
  const isResearchView = mode === "research"
  const variantGroups = comparability?.variant_divergence_groups ?? []
  const crossPartyGroups = comparability?.cross_party_divergence_groups ?? []
  const showNoCrossPartyNote = summary?.groups_with_cross_party_check === 0
  const hasGroups = variantGroups.length > 0 || crossPartyGroups.length > 0
  // Each list renders at most this many groups; the badge still shows the full count.
  const maxVisibleGroups = 8

  if (!hasGroups && !showNoCrossPartyNote) {
    return null
  }

  return (
    <section className="rounded-2xl border border-border/70 bg-background/70 p-4 sm:p-5">
      <div className="flex flex-col gap-2 sm:flex-row sm:items-start sm:justify-between">
        <div className="space-y-1">
          <div className="flex items-center gap-2">
            <GitCompareArrows className="h-4 w-4 text-primary" />
            <h3 className="font-semibold">
              {isResearchView ? "Comparability" : "Can these scores be compared directly?"}
            </h3>
          </div>
          <p className="max-w-2xl text-sm text-muted-foreground">
            {isResearchView
              ? "Groups where reported scores diverge across setups or reporting organizations."
              : "Flags cases where score differences may come from setup choices or different reporting sources."}
          </p>
        </div>
        {summary && (
          <div className="flex flex-wrap gap-2 text-xs">
            <Badge variant="outline">{summary.groups_with_variant_check} setup checks</Badge>
            <Badge variant="outline">{summary.groups_with_cross_party_check} source checks</Badge>
          </div>
        )}
      </div>

      {showNoCrossPartyNote && (
        <div className="mt-4 rounded-xl border border-dashed border-border/70 bg-muted/10 px-3 py-2 text-sm text-muted-foreground">
          No third-party reports are available for cross-party comparison.
        </div>
      )}

      {/* Skip the grid wrapper entirely when only the note is shown, so it does not leave an empty margin box. */}
      {hasGroups && (
        <div className="mt-4 grid gap-3 lg:grid-cols-2">
          {variantGroups.length > 0 && (
            <GroupList
              icon="variant"
              title="Variant divergence"
              count={variantGroups.length}
            >
              {variantGroups.slice(0, maxVisibleGroups).map((group) => (
                <DivergenceGroupItem
                  key={group.group_id}
                  modelRouteId={group.model_route_id}
                  magnitude={group.divergence_magnitude}
                  threshold={group.threshold_used}
                  fields={group.differing_setup_fields}
                />
              ))}
              {/* Make the truncation explicit instead of silently dropping groups. */}
              {variantGroups.length > maxVisibleGroups && (
                <p className="px-1 text-xs text-muted-foreground">
                  Showing first {maxVisibleGroups} of {variantGroups.length} groups.
                </p>
              )}
            </GroupList>
          )}

          {crossPartyGroups.length > 0 && (
            <GroupList
              icon="cross-party"
              title="Cross-party divergence"
              count={crossPartyGroups.length}
            >
              {crossPartyGroups.slice(0, maxVisibleGroups).map((group) => (
                <DivergenceGroupItem
                  key={group.group_id}
                  modelRouteId={group.model_route_id}
                  magnitude={group.divergence_magnitude}
                  threshold={group.threshold_used}
                  fields={group.differing_setup_fields}
                  scoresByOrganization={group.scores_by_organization}
                />
              ))}
              {crossPartyGroups.length > maxVisibleGroups && (
                <p className="px-1 text-xs text-muted-foreground">
                  Showing first {maxVisibleGroups} of {crossPartyGroups.length} groups.
                </p>
              )}
            </GroupList>
          )}
        </div>
      )}
    </section>
  )
}
104
+
105
/**
 * Collapsible list wrapper (open by default) for one kind of divergence group.
 *
 * The trigger row shows an icon chosen by `icon`, the `title`, and a badge
 * with `count`; `children` are rendered inside the collapsible content.
 */
function GroupList(props: {
  icon: "variant" | "cross-party"
  title: string
  count: number
  children: ReactNode
}) {
  const { icon, title, count, children } = props

  // "variant" gets the compare-arrows glyph; anything else gets the users glyph.
  let Icon = UsersRound
  if (icon === "variant") {
    Icon = GitCompareArrows
  }

  const heading = (
    <span className="flex items-center gap-2 text-sm font-semibold">
      <Icon className="h-4 w-4 text-muted-foreground" />
      {title}
      <Badge variant="secondary">{count}</Badge>
    </span>
  )

  return (
    <Collapsible defaultOpen>
      <CollapsibleTrigger asChild>
        <button
          type="button"
          className="flex w-full items-center justify-between rounded-xl border border-border/70 bg-muted/10 px-3 py-2 text-left transition-colors hover:bg-muted/20"
        >
          {heading}
          <ChevronDown className="h-4 w-4 text-muted-foreground" />
        </button>
      </CollapsibleTrigger>
      <CollapsibleContent className="mt-2 space-y-2">{children}</CollapsibleContent>
    </Collapsible>
  )
}
139
+
140
/**
 * One divergence group, rendered as an in-page link to `#row-<modelRouteId>`.
 *
 * Shows the divergence magnitude and threshold, up to three differing setup
 * fields, and (when `scoresByOrganization` is non-empty) up to four
 * per-organization score chips.
 */
function DivergenceGroupItem(props: {
  modelRouteId: string
  magnitude: number
  threshold: number
  fields: DifferingSetupField[]
  scoresByOrganization?: Record<string, number>
}) {
  const { modelRouteId, magnitude, threshold, fields, scoresByOrganization } = props

  // Only the first few entries of each list are displayed.
  const shownFields = fields.slice(0, 3)
  const orgScorePairs = scoresByOrganization ? Object.entries(scoresByOrganization) : []
  const shownOrgScores = orgScorePairs.slice(0, 4)

  return (
    <a
      href={`#row-${modelRouteId}`}
      className="block rounded-xl border border-border/60 bg-background px-3 py-2 text-sm transition-colors hover:bg-muted/20"
    >
      <div className="flex items-start justify-between gap-3">
        <div className="min-w-0">
          <div className="font-medium">{modelRouteId}</div>
          <div className="mt-1 text-xs text-muted-foreground">
            Divergence {formatSignalNumber(magnitude)}; threshold {formatSignalNumber(threshold)}
          </div>
        </div>
        <span className="shrink-0 text-xs font-medium text-primary">Jump to row</span>
      </div>

      {shownFields.length > 0 && (
        <div className="mt-2 space-y-1 text-xs text-muted-foreground">
          {shownFields.map((entry) => (
            <div key={entry.field}>
              <span className="font-medium text-foreground">{formatFieldLabel(entry.field)}:</span>{" "}
              {entry.values.map(formatSignalValue).join(", ")}
            </div>
          ))}
        </div>
      )}

      {orgScorePairs.length > 0 && (
        <div className="mt-2 flex flex-wrap gap-1.5">
          {shownOrgScores.map(([organization, orgScore]) => (
            <span
              key={organization}
              className="rounded-full border border-border/60 bg-muted/20 px-2 py-0.5 text-[11px] text-muted-foreground"
            >
              {organization}: {formatSignalNumber(orgScore)}
            </span>
          ))}
        </div>
      )}
    </a>
  )
}
components/signals/completeness-panel.tsx ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ "use client"
2
+
3
+ import type { ReactNode } from "react"
4
+ import { ChevronDown, ClipboardCheck } from "lucide-react"
5
+
6
+ import { useAudienceMode } from "@/components/audience-mode-provider"
7
+ import { Badge } from "@/components/ui/badge"
8
+ import { Collapsible, CollapsibleContent, CollapsibleTrigger } from "@/components/ui/collapsible"
9
+ import { Progress } from "@/components/ui/progress"
10
+ import type { ReportingCompleteness } from "@/lib/backend-artifacts"
11
+ import {
12
+ formatFieldLabel,
13
+ formatPercent,
14
+ getCompletenessPopulatedCount,
15
+ } from "./signal-utils"
16
+
17
/**
 * Panel showing how completely a benchmark's documentation is populated.
 *
 * Renders the completeness score as a percentage and progress bar, the
 * populated/total field counts, and, when any exist, collapsible lists of
 * missing required fields and partially populated fields (first 12 of each).
 * Returns `null` when `completeness` is absent.
 */
export function CompletenessPanel({
  completeness,
}: {
  completeness?: ReportingCompleteness | null
}) {
  const { mode } = useAudienceMode()
  const isResearchView = mode === "research"

  if (!completeness) {
    return null
  }

  const populatedCount = getCompletenessPopulatedCount(completeness)
  const total = completeness.total_fields_evaluated
  const missingFields = completeness.missing_required_fields ?? []
  const partialFields = completeness.partial_fields ?? []
  const hasFieldIssues = missingFields.length > 0 || partialFields.length > 0

  const heading = isResearchView
    ? "Reporting completeness"
    : "How well is this benchmark documented?"
  const blurb = isResearchView
    ? "Coverage of EvalCards-required documentation fields for this benchmark."
    : "A quick read on how much supporting documentation is available before leaning on the scores."

  // Body of the "missing required fields" collapsible.
  const missingBody =
    missingFields.length === 0 ? (
      <p className="text-sm text-muted-foreground">No missing required fields recorded.</p>
    ) : (
      <ul className="space-y-1.5 text-sm">
        {missingFields.slice(0, 12).map((fieldPath) => (
          <li key={fieldPath} className="rounded-lg border border-border/50 bg-background px-3 py-2">
            <span className="font-medium">{formatFieldLabel(fieldPath)}</span>
            {isResearchView && (
              <span className="ml-2 text-xs text-muted-foreground">{fieldPath}</span>
            )}
          </li>
        ))}
      </ul>
    )

  // Body of the "partially populated" collapsible.
  const partialBody =
    partialFields.length === 0 ? (
      <p className="text-sm text-muted-foreground">No partially populated fields recorded.</p>
    ) : (
      <ul className="space-y-1.5 text-sm">
        {partialFields.slice(0, 12).map((partial) => (
          <li key={partial.field_path} className="rounded-lg border border-border/50 bg-background px-3 py-2">
            <div className="flex items-start justify-between gap-3">
              <span className="font-medium">{formatFieldLabel(partial.field_path)}</span>
              <span className="shrink-0 text-muted-foreground">
                {partial.populated_subitems}/{partial.total_subitems}
              </span>
            </div>
            {isResearchView && (
              <div className="mt-1 text-xs text-muted-foreground">{partial.field_path}</div>
            )}
          </li>
        ))}
      </ul>
    )

  return (
    <section className="rounded-2xl border border-border/70 bg-background/70 p-4 sm:p-5">
      <div className="flex flex-col gap-4 lg:flex-row lg:items-start lg:justify-between">
        <div className="space-y-1">
          <div className="flex items-center gap-2">
            <ClipboardCheck className="h-4 w-4 text-primary" />
            <h3 className="font-semibold">{heading}</h3>
          </div>
          <p className="max-w-2xl text-sm text-muted-foreground">{blurb}</p>
        </div>

        <div className="min-w-[14rem] rounded-xl border border-border/70 bg-muted/10 px-3 py-2">
          <div className="flex items-baseline justify-between gap-3">
            <span className="text-[11px] font-semibold uppercase tracking-[0.18em] text-muted-foreground">
              Documentation
            </span>
            <span className="text-lg font-semibold tabular-nums">
              {formatPercent(completeness.completeness_score)}
            </span>
          </div>
          <Progress value={completeness.completeness_score * 100} className="mt-2 h-2" />
          <div className="mt-2 text-xs text-muted-foreground">
            {populatedCount} of {total} fields populated
          </div>
        </div>
      </div>

      {hasFieldIssues && (
        <div className="mt-4 grid gap-3 lg:grid-cols-2">
          <SignalListCollapsible title="Missing required fields" count={missingFields.length}>
            {missingBody}
          </SignalListCollapsible>

          <SignalListCollapsible title="Partially populated" count={partialFields.length}>
            {partialBody}
          </SignalListCollapsible>
        </div>
      )}
    </section>
  )
}
118
+
119
/**
 * Collapsible section (closed by default) whose trigger shows `title` with a
 * `count` badge and whose content is `children`.
 */
function SignalListCollapsible(props: {
  title: string
  count: number
  children: ReactNode
}) {
  const { title, count, children } = props

  const triggerLabel = (
    <span className="flex items-center gap-2 text-sm font-semibold">
      {title}
      <Badge variant="secondary">{count}</Badge>
    </span>
  )

  return (
    <Collapsible>
      <CollapsibleTrigger asChild>
        <button
          type="button"
          className="flex w-full items-center justify-between rounded-xl border border-border/70 bg-muted/10 px-3 py-2 text-left transition-colors hover:bg-muted/20"
        >
          {triggerLabel}
          <ChevronDown className="h-4 w-4 text-muted-foreground" />
        </button>
      </CollapsibleTrigger>
      <CollapsibleContent className="mt-2">{children}</CollapsibleContent>
    </Collapsible>
  )
}
components/signals/corpus-dashboard.tsx ADDED
@@ -0,0 +1,442 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ "use client"
2
+
3
+ import type { ReactNode } from "react"
4
+ import { useEffect, useMemo, useState } from "react"
5
+ import { BarChart3, ClipboardCheck, GitCompareArrows, ShieldCheck } from "lucide-react"
6
+
7
+ import { useAudienceMode } from "@/components/audience-mode-provider"
8
+ import { Badge } from "@/components/ui/badge"
9
+ import { Button } from "@/components/ui/button"
10
+ import type {
11
+ ComparabilityCorpusBlock,
12
+ CompletenessCorpusBlock,
13
+ CorpusAggregates,
14
+ ProvenanceCorpusBlock,
15
+ ReproducibilityCorpusBlock,
16
+ } from "@/lib/backend-artifacts"
17
+ import { getCategoryColor } from "@/lib/benchmark-schema"
18
+ import {
19
+ formatFieldLabel,
20
+ formatPercent,
21
+ } from "./signal-utils"
22
+
23
/**
 * Category keys in the order the "By category" view displays them.
 * CorpusDashboard filters this list down to categories that have data.
 */
const CATEGORY_ORDER = [
  "agentic",
  "general",
  "knowledge",
  "reasoning",
  "safety",
  "other",
]

/**
 * Tailwind background class for each source type in the provenance
 * distribution bar. Source types missing from this map fall back to a
 * default class at the call site.
 */
const SOURCE_COLORS: Record<string, string> = {
  "first_party": "bg-amber-500",
  "third_party": "bg-emerald-500",
  "collaborative": "bg-sky-500",
  "unspecified": "bg-stone-400",
}
31
+
32
/**
 * Top-level corpus dashboard with an "Overall" / "By category" toggle.
 *
 * The view is re-synchronised whenever the audience mode changes: research
 * mode selects the per-category view, any other mode selects the overall
 * view. The category view only lists categories from `CATEGORY_ORDER` that
 * have a block in at least one of the four signal families.
 *
 * @param aggregates - Corpus-level signal rollups (overall and per category).
 * @param completenessScores - Per-benchmark completeness scores, forwarded to
 *   the completeness section.
 */
export function CorpusDashboard({
  aggregates,
  completenessScores,
}: {
  aggregates: CorpusAggregates
  completenessScores: number[]
}) {
  const { mode } = useAudienceMode()
  const [view, setView] = useState<"overall" | "category">("overall")

  useEffect(() => {
    if (mode === "research") {
      setView("category")
    } else {
      setView("overall")
    }
  }, [mode])

  const categoryKeys = useMemo(() => {
    const { reproducibility, completeness, provenance, comparability } = aggregates
    // Keep a category when any signal family has a block for it.
    return CATEGORY_ORDER.filter(
      (category) =>
        reproducibility.by_category[category] ||
        completeness.by_category[category] ||
        provenance.by_category[category] ||
        comparability.by_category[category]
    )
  }, [aggregates])

  // One toggle button; the active view is drawn with the "default" variant.
  const renderViewButton = (target: "overall" | "category", label: string) => (
    <Button
      type="button"
      size="sm"
      variant={view === target ? "default" : "ghost"}
      className="h-8 rounded-full"
      onClick={() => setView(target)}
    >
      {label}
    </Button>
  )

  const overallView = (
    <div className="grid gap-6">
      <ReproducibilitySection block={aggregates.reproducibility.overall} />
      <CompletenessSection block={aggregates.completeness.overall} scores={completenessScores} />
      <ProvenanceSection block={aggregates.provenance.overall} />
      <ComparabilitySection block={aggregates.comparability.overall} />
    </div>
  )

  const categoryView = (
    <div className="grid gap-4 xl:grid-cols-2">
      {categoryKeys.map((category) => (
        <CategoryPanel
          key={category}
          category={category}
          reproducibility={aggregates.reproducibility.by_category[category]}
          completeness={aggregates.completeness.by_category[category]}
          provenance={aggregates.provenance.by_category[category]}
          comparability={aggregates.comparability.by_category[category]}
        />
      ))}
    </div>
  )

  return (
    <div className="space-y-6">
      <section className="rounded-2xl border border-border/70 bg-card p-5 shadow-sm">
        <div className="flex flex-col gap-4 lg:flex-row lg:items-start lg:justify-between">
          <div>
            <div className="text-[11px] font-semibold uppercase tracking-[0.22em] text-muted-foreground">
              Interpretive signals
            </div>
            <h1 className="mt-2 text-3xl font-semibold tracking-tight">Corpus Dashboard</h1>
            <p className="mt-2 max-w-3xl text-sm leading-6 text-muted-foreground">
              Corpus-level rollups for reproducibility, documentation completeness, source provenance, and comparability.
            </p>
          </div>

          <div className="flex flex-wrap items-center gap-2">
            <Badge variant="outline">Signals v{aggregates.signal_version}</Badge>
            <Badge variant="outline">Generated {formatGeneratedDate(aggregates.generated_at)}</Badge>
            <div className="inline-flex rounded-full border bg-muted/20 p-1">
              {renderViewButton("overall", "Overall")}
              {renderViewButton("category", "By category")}
            </div>
          </div>
        </div>
      </section>

      {view === "overall" ? overallView : categoryView}
    </div>
  )
}
122
+
123
/**
 * Reproducibility section of the overall dashboard view.
 *
 * Headlines the reproducibility gap rate and lists the first ten entries of
 * `per_field_missingness` as bars showing each field's missing rate.
 */
function ReproducibilitySection({ block }: { block: ReproducibilityCorpusBlock }) {
  const gapCount = block.triples_with_reproducibility_gap.toLocaleString()
  const tripleCount = block.total_triples.toLocaleString()
  const missingnessRows = Object.entries(block.per_field_missingness).slice(0, 10)

  return (
    <DashboardSection
      icon={<ShieldCheck className="h-5 w-5" />}
      title="Reproducibility"
      subtitle="Reported scores with enough setup documentation to re-run."
      headline={formatPercent(block.reproducibility_gap_rate)}
      headlineLabel={`${gapCount} of ${tripleCount} reported scores have gaps`}
    >
      <div className="grid gap-2">
        {missingnessRows.map(([fieldName, stats]) => {
          // The denominator label tells the reader which population the rate is over.
          const scope = stats.denominator === "agentic_only" ? "agentic only" : "all scores"
          return (
            <MetricBar
              key={fieldName}
              label={formatFieldLabel(fieldName)}
              value={stats.missing_rate}
              detail={`${stats.missing_count.toLocaleString()} missing / ${scope}`}
            />
          )
        })}
      </div>
    </DashboardSection>
  )
}
145
+
146
/**
 * Reporting-completeness section of the overall dashboard view.
 *
 * Headlines the mean completeness score (with median and benchmark count),
 * draws a histogram when `scores` is non-empty, and lists the first ten
 * entries of `per_field_population` with their population rates.
 */
function CompletenessSection(props: {
  block: CompletenessCorpusBlock
  scores: number[]
}) {
  const { block, scores } = props
  const medianText = formatPercent(block.completeness_score_median)
  const benchmarkCount = block.total_benchmarks.toLocaleString()
  const populationRows = Object.entries(block.per_field_population).slice(0, 10)

  return (
    <DashboardSection
      icon={<ClipboardCheck className="h-5 w-5" />}
      title="Reporting Completeness"
      subtitle="How much benchmark documentation is populated."
      headline={formatPercent(block.completeness_score_mean)}
      headlineLabel={`Median ${medianText} across ${benchmarkCount} benchmarks`}
    >
      {scores.length > 0 && <Histogram scores={scores} />}
      <div className="mt-4 grid gap-2">
        {populationRows.map(([fieldName, stats]) => (
          <div key={fieldName} className="rounded-xl border border-border/60 bg-background px-3 py-2">
            <div className="flex items-start justify-between gap-3 text-sm">
              <span className="font-medium">{formatFieldLabel(fieldName)}</span>
              <span className="shrink-0 tabular-nums text-muted-foreground">
                {formatPercent(stats.mean_score)}
              </span>
            </div>
            <div className="mt-2 grid gap-1.5">
              <MetricBar label="Any data" value={stats.populated_rate} compact />
              <MetricBar label="Fully populated" value={stats.fully_populated_rate} compact />
            </div>
          </div>
        ))}
      </div>
    </DashboardSection>
  )
}
181
+
182
/**
 * Provenance section of the overall dashboard view.
 *
 * Headlines the multi-source rate, draws a stacked bar whose segment widths
 * are each source type's share of the summed distribution counts, and shows
 * tiles for multi-source and first-party-only groups.
 */
function ProvenanceSection({ block }: { block: ProvenanceCorpusBlock }) {
  const distribution = block.source_type_distribution

  let total = 0
  for (const count of Object.values(distribution)) {
    total += count
  }

  const segments = Object.entries(distribution).map(([sourceType, count]) => {
    // Guard the division so an all-zero distribution yields zero-width segments.
    const width = total > 0 ? `${(count / total) * 100}%` : "0%"
    return (
      <div
        key={sourceType}
        className={SOURCE_COLORS[sourceType] ?? "bg-muted-foreground"}
        style={{ width }}
        title={`${sourceType.replace(/_/g, " ")}: ${count}`}
      />
    )
  })

  return (
    <DashboardSection
      icon={<BarChart3 className="h-5 w-5" />}
      title="Provenance"
      subtitle="Who reported the scores, and whether groups have multiple sources."
      headline={formatPercent(block.multi_source_rate)}
      headlineLabel="of (model, benchmark, metric) groups have multiple reporting sources"
    >
      <div className="overflow-hidden rounded-full border border-border/70 bg-muted/30">
        <div className="flex h-4 w-full">{segments}</div>
      </div>

      <div className="mt-3 grid gap-2 sm:grid-cols-2">
        <RatioTile label="Multi-source groups" value={block.multi_source_rate} count={block.multi_source_groups} />
        <RatioTile label="First-party only groups" value={block.first_party_only_rate} count={block.first_party_only_groups} />
      </div>
    </DashboardSection>
  )
}
214
+
215
/**
 * Comparability section of the overall dashboard view.
 *
 * Headlines the variant divergence rate and shows one rate card each for
 * variant and cross-party divergence (rate, eligible groups, divergent groups).
 */
function ComparabilitySection({ block }: { block: ComparabilityCorpusBlock }) {
  const divergentText = block.variant_divergent_groups.toLocaleString()
  const eligibleText = block.variant_eligible_groups.toLocaleString()

  return (
    <DashboardSection
      icon={<GitCompareArrows className="h-5 w-5" />}
      title="Comparability"
      subtitle="Eligible groups where scores diverge across setups or reporting organizations."
      headline={formatNullableRate(block.variant_divergence_rate)}
      headlineLabel={`${divergentText} of ${eligibleText} setup-eligible groups diverge`}
    >
      <div className="grid gap-3 md:grid-cols-2">
        <ComparabilityRateCard
          title="Variant divergence"
          rate={block.variant_divergence_rate}
          eligible={block.variant_eligible_groups}
          divergent={block.variant_divergent_groups}
        />
        <ComparabilityRateCard
          title="Cross-party divergence"
          rate={block.cross_party_divergence_rate}
          eligible={block.cross_party_eligible_groups}
          divergent={block.cross_party_divergent_groups}
        />
      </div>
    </DashboardSection>
  )
}
241
+
242
/**
 * Summary card for one category in the "By category" view.
 *
 * Shows four headline metrics (reproducibility gap rate, documentation mean,
 * multi-source rate, variant divergence rate) plus a cross-party divergence
 * row. Any of the four blocks may be absent for a category.
 *
 * The cross-party row shows the rate when the backend supplies one; when the
 * rate is `null`/`undefined` it shows the "not enough multi-org coverage"
 * note. Previously only the N/A note existed, so an available cross-party
 * rate was never displayed.
 *
 * @param category - Category key; capitalised for display.
 */
function CategoryPanel({
  category,
  reproducibility,
  completeness,
  provenance,
  comparability,
}: {
  category: string
  reproducibility?: ReproducibilityCorpusBlock
  completeness?: CompletenessCorpusBlock
  provenance?: ProvenanceCorpusBlock
  comparability?: ComparabilityCorpusBlock
}) {
  const categoryLabel = `${category.charAt(0).toUpperCase()}${category.slice(1)}`
  const crossPartyRate = comparability?.cross_party_divergence_rate

  return (
    <section className="rounded-2xl border border-border/70 bg-card p-4 shadow-sm">
      <div className="mb-4 flex items-center justify-between gap-3">
        <h2 className="font-semibold">{categoryLabel}</h2>
        <Badge className={getCategoryColor(categoryLabel)}>{categoryLabel}</Badge>
      </div>
      <div className="grid gap-3 sm:grid-cols-2">
        <MiniMetric label="Reproducibility gaps" value={formatPercent(reproducibility?.reproducibility_gap_rate)} />
        <MiniMetric label="Documentation mean" value={formatPercent(completeness?.completeness_score_mean)} />
        <MiniMetric label="Multi-source groups" value={formatPercent(provenance?.multi_source_rate)} />
        <MiniMetric label="Variant divergence" value={formatNullableRate(comparability?.variant_divergence_rate)} />
      </div>
      {crossPartyRate == null ? (
        <div className="mt-3 rounded-xl border border-dashed border-border/70 bg-muted/10 px-3 py-2 text-sm text-muted-foreground">
          Cross-party divergence: N/A - not enough multi-org coverage.
        </div>
      ) : (
        <div className="mt-3 rounded-xl border border-border/70 bg-muted/10 px-3 py-2 text-sm text-muted-foreground">
          Cross-party divergence: <span className="font-medium tabular-nums text-foreground">{formatNullableRate(crossPartyRate)}</span>
        </div>
      )}
    </section>
  )
}
277
+
278
/**
 * Two-column dashboard section shell.
 *
 * The left column carries the icon + title, a subtitle, and a boxed headline
 * figure with its caption; the right column renders `children`.
 */
function DashboardSection({
  icon,
  title,
  subtitle,
  headline,
  headlineLabel,
  children,
}: {
  icon: ReactNode
  title: string
  subtitle: string
  headline: string
  headlineLabel: string
  children: ReactNode
}) {
  const titleRow = (
    <div className="flex items-center gap-2 text-primary">
      {icon}
      <h2 className="font-semibold">{title}</h2>
    </div>
  )

  const headlineBox = (
    <div className="mt-5 rounded-xl border border-border/70 bg-muted/10 px-3 py-3">
      <div className="text-3xl font-semibold tabular-nums">{headline}</div>
      <div className="mt-1 text-xs leading-5 text-muted-foreground">{headlineLabel}</div>
    </div>
  )

  return (
    <section className="rounded-2xl border border-border/70 bg-card p-5 shadow-sm">
      <div className="grid gap-5 lg:grid-cols-[minmax(0,18rem)_1fr]">
        <div>
          {titleRow}
          <p className="mt-2 text-sm leading-6 text-muted-foreground">{subtitle}</p>
          {headlineBox}
        </div>
        <div>{children}</div>
      </div>
    </section>
  )
}
312
+
313
/**
 * Labelled horizontal bar for a rate expressed as a fraction.
 *
 * @param label   Text shown on the left of the header row.
 * @param value   Fraction to display; the bar width is `value * 100`, clamped
 *                to 0-100. `null` and non-finite values render an empty bar.
 * @param detail  Optional caption rendered under the bar.
 * @param compact When true, drops the bordered card chrome.
 */
function MetricBar({
  label,
  value,
  detail,
  compact = false,
}: {
  label: string
  value: number | null
  detail?: string
  compact?: boolean
}) {
  // formatPercent() labels null/NaN/Infinity as "N/A"; keep the bar in step
  // with that instead of emitting an invalid "NaN%" width or a full bar.
  const fraction = value != null && Number.isFinite(value) ? value : 0
  const percent = Math.max(0, Math.min(100, fraction * 100))

  return (
    <div className={compact ? "space-y-1" : "rounded-xl border border-border/60 bg-background px-3 py-2"}>
      <div className="flex items-center justify-between gap-3 text-sm">
        <span className="min-w-0 truncate font-medium">{label}</span>
        <span className="shrink-0 tabular-nums text-muted-foreground">{formatPercent(value)}</span>
      </div>
      <div className="mt-1.5 h-2 overflow-hidden rounded-full bg-muted">
        <div className="h-full rounded-full bg-primary/75" style={{ width: `${percent}%` }} />
      </div>
      {detail && <div className="mt-1 text-xs text-muted-foreground">{detail}</div>}
    </div>
  )
}
339
+
340
/**
 * Ten-bucket histogram of completeness scores.
 *
 * Each score is multiplied by 10 and floored to pick a bucket, clamped into
 * buckets 0-9; non-finite scores are ignored. Bar heights are relative to the
 * fullest bucket, with a 4% floor so empty buckets remain visible.
 */
function Histogram({ scores }: { scores: number[] }) {
  const bucketCount = 10
  const counts = new Array<number>(bucketCount).fill(0)

  scores.forEach((score) => {
    if (!Number.isFinite(score)) {
      return
    }
    const index = Math.min(bucketCount - 1, Math.max(0, Math.floor(score * bucketCount)))
    counts[index] += 1
  })

  // The trailing 1 avoids dividing by zero when every bucket is empty.
  const tallest = Math.max(...counts, 1)

  return (
    <div className="rounded-xl border border-border/60 bg-background px-3 py-3">
      <div className="mb-3 text-sm font-semibold">Benchmark completeness distribution</div>
      <div className="flex h-28 items-end gap-1.5">
        {counts.map((count, index) => {
          const lowerBound = index * 10
          const label = `${lowerBound}-${lowerBound + 10}%`
          const heightPercent = Math.max(4, (count / tallest) * 100)

          return (
            <div key={label} className="flex min-w-0 flex-1 flex-col items-center gap-1">
              <div
                className="w-full rounded-t bg-primary/70"
                style={{ height: `${heightPercent}%` }}
                title={`${label}: ${count}`}
              />
              <span className="text-[9px] text-muted-foreground">{String(lowerBound)}</span>
            </div>
          )
        })}
      </div>
    </div>
  )
}
372
+
373
/** Small tile showing a labelled percentage next to a locale-formatted group count. */
function RatioTile(props: { label: string; value: number | null; count: number }) {
  const { label, value, count } = props
  const percentText = formatPercent(value)
  const countText = count.toLocaleString()

  return (
    <div className="rounded-xl border border-border/60 bg-background px-3 py-2">
      <div className="text-sm font-medium">{label}</div>
      <div className="mt-1 flex items-baseline justify-between gap-2">
        <span className="text-xl font-semibold tabular-nums">{percentText}</span>
        <span className="text-xs text-muted-foreground">{countText} groups</span>
      </div>
    </div>
  )
}
384
+
385
/**
 * Card for one comparability rate.
 *
 * Renders the percentage plus "divergent of eligible" counts when `rate` is
 * available, and a dashed "N/A" placeholder when it is null.
 */
function ComparabilityRateCard({
  title,
  rate,
  eligible,
  divergent,
}: {
  title: string
  rate: number | null
  eligible: number
  divergent: number
}) {
  if (rate != null) {
    return (
      <div className="rounded-xl border border-border/70 bg-background px-4 py-4">
        <div className="font-semibold">{title}</div>
        <div className="mt-3 text-2xl font-semibold tabular-nums">{formatPercent(rate)}</div>
        <div className="mt-1 text-sm text-muted-foreground">
          {divergent.toLocaleString()} of {eligible.toLocaleString()} eligible groups
        </div>
      </div>
    )
  }

  // No rate: show a placeholder rather than a misleading 0%.
  return (
    <div className="rounded-xl border border-dashed border-border/70 bg-muted/10 px-4 py-5">
      <div className="font-semibold">{title}</div>
      <div className="mt-2 text-sm text-muted-foreground">
        N/A - not enough data to compute this rate.
      </div>
    </div>
  )
}
417
+
418
/** Compact label-over-value tile; `value` is rendered verbatim. */
function MiniMetric(props: { label: string; value: string }) {
  const { label, value } = props

  return (
    <div className="rounded-xl border border-border/60 bg-muted/10 px-3 py-2">
      <div className="text-xs text-muted-foreground">{label}</div>
      <div className="mt-1 text-xl font-semibold tabular-nums">{value}</div>
    </div>
  )
}
426
+
427
/** Formats a rate as a percentage, returning "N/A" for null or undefined. */
function formatNullableRate(value: number | null | undefined) {
  if (value == null) {
    return "N/A"
  }

  return formatPercent(value)
}
430
+
431
/**
 * Formats a timestamp string as a short en-US date (e.g. "Jan 15, 2024").
 *
 * The date is rendered in UTC so the output does not depend on the runtime's
 * local time zone: date-only ISO strings are parsed as UTC midnight and would
 * otherwise show the previous day west of Greenwich, and server and browser
 * renders could disagree.
 *
 * @param value Date/time string accepted by the `Date` constructor.
 * @returns The formatted date, or `value` unchanged when it cannot be parsed.
 */
function formatGeneratedDate(value: string) {
  const date = new Date(value)
  if (Number.isNaN(date.getTime())) {
    // Unparseable input: show the raw string rather than "Invalid Date".
    return value
  }

  return date.toLocaleDateString("en-US", {
    year: "numeric",
    month: "short",
    day: "numeric",
    timeZone: "UTC",
  })
}
components/signals/cross-party-divergence-badge.tsx ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ "use client"
2
+
3
+ import { UsersRound } from "lucide-react"
4
+
5
+ import { useAudienceMode } from "@/components/audience-mode-provider"
6
+ import { Badge } from "@/components/ui/badge"
7
+ import type { CrossPartyDivergence } from "@/lib/backend-artifacts"
8
+ import { cn } from "@/lib/utils"
9
+ import { formatSignalNumber } from "./signal-utils"
10
+ import { SignalTooltip } from "./signal-tooltip"
11
+
12
/**
 * Violet badge shown when a row's cross-party divergence flag is set.
 *
 * Renders nothing unless `divergence.has_cross_party_divergence` is truthy.
 * The label and tooltip wording depend on the audience mode: the research
 * view includes the divergence magnitude and organization count.
 */
export function CrossPartyDivergenceBadge({
  divergence,
  className,
}: {
  divergence?: CrossPartyDivergence | null
  className?: string
}) {
  // The hook is called before any early return so hook order stays stable.
  const audience = useAudienceMode()

  if (!divergence?.has_cross_party_divergence) {
    return null
  }

  let label = "Sources disagree"
  let tooltip = "Different organizations reported different scores for this same model on this same benchmark."

  if (audience.mode === "research") {
    const magnitude = formatSignalNumber(divergence.divergence_magnitude)
    const orgCount = divergence.organization_count
    const pluralSuffix = orgCount === 1 ? "" : "s"
    label = "Cross-party divergence"
    tooltip = `Reports diverge by ${magnitude} across ${orgCount} organization${pluralSuffix}.`
  }

  const badgeClassName = cn(
    "border-violet-300 bg-violet-50 text-violet-900 dark:border-violet-900/60 dark:bg-violet-950/40 dark:text-violet-100",
    className
  )

  return (
    <SignalTooltip content={tooltip}>
      <Badge variant="outline" className={badgeClassName}>
        <UsersRound className="h-3 w-3" />
        {label}
      </Badge>
    </SignalTooltip>
  )
}
components/signals/provenance-badge.tsx ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ "use client"
2
+
3
+ import { AlertTriangle, BadgeCheck, Handshake, UserRoundCheck } from "lucide-react"
4
+
5
+ import { useAudienceMode } from "@/components/audience-mode-provider"
6
+ import { Badge } from "@/components/ui/badge"
7
+ import type { Provenance, ProvenanceSourceType } from "@/lib/backend-artifacts"
8
+ import { cn } from "@/lib/utils"
9
+ import { SignalTooltip } from "./signal-tooltip"
10
+
11
/**
 * Turns a snake_case relationship key into a Title Case display name.
 *
 * Underscores become spaces, surrounding whitespace is trimmed, and each word
 * is capitalised with the remainder lower-cased. Null, undefined, empty, or
 * whitespace-only input yields "Unknown".
 */
export function getRelationshipDisplayName(value: string | null | undefined) {
  const cleaned = value?.replace(/_/g, " ").trim()
  if (!cleaned) {
    return "Unknown"
  }

  const words = cleaned.split(/\s+/)
  const capitalized = words.map(
    (word) => word.charAt(0).toUpperCase() + word.slice(1).toLowerCase()
  )
  return capitalized.join(" ")
}
22
+
23
/**
 * Short badge label for a relationship key, worded per audience mode.
 *
 * Known keys (matched case-insensitively) get fixed labels; "policy" mode
 * uses plain-language phrasing. Any other key falls back to
 * `getRelationshipDisplayName`.
 */
export function getRelationshipShortLabel(value: string | null | undefined, mode: "research" | "policy" = "research") {
  const key = (value ?? "").toLowerCase()
  const isPolicy = mode === "policy"

  if (key === "first_party") {
    return isPolicy ? "Reported by model developer" : "1st party"
  }
  if (key === "third_party") {
    return isPolicy ? "Independently reported" : "3rd party"
  }
  if (key === "collaborative") {
    return isPolicy ? "Joint report" : "Collaborative"
  }
  if (key === "other") {
    return "Other"
  }

  return getRelationshipDisplayName(value)
}
37
+
38
/**
 * Tailwind class string for a relationship badge.
 *
 * First-party is amber, third-party emerald, collaborative sky; every other
 * key (including null/undefined) gets the neutral muted tone. Matching is
 * case-insensitive.
 */
export function getRelationshipBadgeTone(value: string | null | undefined): string {
  const key = (value ?? "").toLowerCase()

  if (key === "first_party") {
    return "border-amber-300 bg-amber-50 text-amber-900 dark:border-amber-900/60 dark:bg-amber-950/40 dark:text-amber-100"
  }
  if (key === "third_party") {
    return "border-emerald-300 bg-emerald-50 text-emerald-900 dark:border-emerald-900/60 dark:bg-emerald-950/40 dark:text-emerald-100"
  }
  if (key === "collaborative") {
    return "border-sky-300 bg-sky-50 text-sky-900 dark:border-sky-900/60 dark:bg-sky-950/40 dark:text-sky-100"
  }

  return "border-border/70 bg-muted/40 text-muted-foreground"
}
50
+
51
/**
 * Maps a free-form relationship string onto a known source type.
 *
 * Matching is case-insensitive. Returns the lower-cased key for the four
 * provenance source types, "other" for "other", and null for anything else
 * (including null/undefined/empty input).
 */
function normalizeSourceType(value: string | null | undefined): ProvenanceSourceType | "other" | null {
  const lowered = (value ?? "").toLowerCase()

  if (lowered === "other") {
    return "other"
  }

  const isKnownSourceType =
    lowered === "first_party" ||
    lowered === "third_party" ||
    lowered === "collaborative" ||
    lowered === "unspecified"

  return isKnownSourceType ? (lowered as ProvenanceSourceType) : null
}
64
+
65
/**
 * Icon for a provenance badge: a check badge for third-party, a handshake for
 * collaborative, and a user-check icon for every other source type.
 */
function ProvenanceIcon({ sourceType }: { sourceType: ProvenanceSourceType | "other" }) {
  switch (sourceType) {
    case "third_party":
      return <BadgeCheck className="h-3 w-3" />
    case "collaborative":
      return <Handshake className="h-3 w-3" />
    default:
      return <UserRoundCheck className="h-3 w-3" />
  }
}
76
+
77
/**
 * Badge describing who reported a score.
 *
 * The source type comes from `provenance.source_type` when present, otherwise
 * from the normalized `relationship` string. Nothing is rendered when the
 * source type is missing or "unspecified", or when it is "other" and
 * `showOther` is false. When `provenance.first_party_only` is true the badge
 * uses the "first-party only" wording and appends a warning icon.
 */
export function ProvenanceBadge({
  provenance,
  relationship,
  sourceOrganizationName,
  showOther = false,
  className,
}: {
  provenance?: Provenance | null
  relationship?: string | null
  sourceOrganizationName?: string | null
  showOther?: boolean
  className?: string
}) {
  const { mode } = useAudienceMode()
  const sourceType = provenance?.source_type ?? normalizeSourceType(relationship)

  if (!sourceType || sourceType === "unspecified" || (!showOther && sourceType === "other")) {
    return null
  }

  const isPolicy = mode === "policy"
  const firstPartyOnly = provenance?.first_party_only === true

  let label: string
  let tooltip: string
  if (firstPartyOnly) {
    label = isPolicy ? "Only model developer reported" : "1st party only"
    tooltip = isPolicy
      ? "Only the model developer reported this score; no independent replication is recorded."
      : "First-party only - no independent replication is recorded for this group."
  } else {
    label = getRelationshipShortLabel(sourceType, mode)
    // Prefer naming the reporting organization when one was supplied.
    tooltip = sourceOrganizationName
      ? `Reported by ${sourceOrganizationName}.`
      : getRelationshipDisplayName(sourceType)
  }

  return (
    <SignalTooltip content={tooltip}>
      <Badge
        variant="outline"
        className={cn(getRelationshipBadgeTone(sourceType), className)}
      >
        <ProvenanceIcon sourceType={sourceType} />
        {label}
        {firstPartyOnly && <AlertTriangle className="h-3 w-3" />}
      </Badge>
    </SignalTooltip>
  )
}
components/signals/reproducibility-badge.tsx ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ "use client"
2
+
3
+ import { AlertTriangle } from "lucide-react"
4
+
5
+ import { useAudienceMode } from "@/components/audience-mode-provider"
6
+ import { Badge } from "@/components/ui/badge"
7
+ import type { ReproducibilityGap } from "@/lib/backend-artifacts"
8
+ import { cn } from "@/lib/utils"
9
+ import { formatMissingField } from "./signal-utils"
10
+ import { SignalTooltip } from "./signal-tooltip"
11
+
12
/**
 * Amber badge shown when a row's reproducibility-gap flag is set.
 *
 * Renders nothing unless `gap.has_reproducibility_gap` is truthy. The tooltip
 * always reports how many setup fields were recorded; the research view also
 * lists the missing fields.
 */
export function ReproducibilityBadge({
  gap,
  className,
}: {
  gap?: ReproducibilityGap | null
  className?: string
}) {
  // The hook is called before any early return so hook order stays stable.
  const audience = useAudienceMode()

  if (!gap?.has_reproducibility_gap) {
    return null
  }

  const isResearchView = audience.mode === "research"
  const missingLabels = gap.missing_fields.map(formatMissingField)
  const countLine = `${gap.populated_field_count} of ${gap.required_field_count} setup fields recorded.`

  let tooltip: string
  if (isResearchView) {
    const missingText = missingLabels.join(", ") || "none listed"
    tooltip = `Setup not fully documented. Missing: ${missingText}. ${countLine}`
  } else {
    tooltip = `This score's setup is not fully documented, so it cannot be re-run as-is. ${countLine}`
  }

  const badgeClassName = cn(
    "border-amber-300 bg-amber-50 text-amber-900 dark:border-amber-900/60 dark:bg-amber-950/40 dark:text-amber-100",
    className
  )

  return (
    <SignalTooltip content={tooltip}>
      <Badge variant="outline" className={badgeClassName}>
        <AlertTriangle className="h-3 w-3" />
        {isResearchView ? "Reproducibility gap" : "Setup not documented"}
      </Badge>
    </SignalTooltip>
  )
}
components/signals/reproducibility-panel.tsx ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ "use client"
2
+
3
+ import { AlertTriangle } from "lucide-react"
4
+
5
+ import { useAudienceMode } from "@/components/audience-mode-provider"
6
+ import type { ReproducibilityGap } from "@/lib/backend-artifacts"
7
+ import { formatMissingField } from "./signal-utils"
8
+
9
/**
 * Detail panel for a reproducibility annotation.
 *
 * Renders nothing when `gap` is null/undefined. Otherwise it shows how many
 * setup fields were recorded and, if any are missing, the formatted list of
 * missing fields. Heading and description wording depend on the audience mode.
 */
export function ReproducibilityPanel({
  gap,
}: {
  gap?: ReproducibilityGap | null
}) {
  // The hook is called before any early return so hook order stays stable.
  const audience = useAudienceMode()

  if (!gap) {
    return null
  }

  const isResearchView = audience.mode === "research"
  const heading = isResearchView ? "Reproducibility" : "Re-runnability"
  const description = isResearchView
    ? "Whether the setup is documented well enough for someone else to re-run."
    : "Whether someone could re-run this evaluation with the information available."
  const hasMissingFields = gap.missing_fields.length > 0

  return (
    <div className="rounded-2xl border bg-background/70 p-4">
      <div className="mb-4 flex items-start gap-2">
        <AlertTriangle className="mt-0.5 h-4 w-4 shrink-0 text-amber-600 dark:text-amber-300" />
        <div>
          <div className="font-semibold">{heading}</div>
          <div className="text-sm text-muted-foreground">{description}</div>
        </div>
      </div>

      <div className="space-y-2.5 text-sm">
        <PanelRow
          label="Setup fields recorded"
          value={`${gap.populated_field_count} of ${gap.required_field_count}`}
        />
        {hasMissingFields && (
          <PanelRow
            label="Missing"
            value={gap.missing_fields.map(formatMissingField).join(", ")}
          />
        )}
      </div>
    </div>
  )
}
52
+
53
/** One label/value line inside the panel: fixed-width muted label, wrapping value. */
function PanelRow(props: { label: string; value: string }) {
  const { label, value } = props

  return (
    <div className="flex gap-3">
      <span className="w-32 shrink-0 text-muted-foreground">{label}</span>
      <span className="min-w-0 flex-1 break-words font-medium">{value}</span>
    </div>
  )
}
components/signals/signal-tooltip.tsx ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ "use client"
2
+
3
+ import type { ReactNode } from "react"
4
+ import * as TooltipPrimitive from "@radix-ui/react-tooltip"
5
+
6
/**
 * Wraps `children` in a Radix tooltip that shows `content` above the trigger.
 *
 * Each instance mounts its own provider with a 150 ms open delay; the
 * children are used as the trigger element via `asChild`.
 */
export function SignalTooltip({
  children,
  content,
}: {
  children: ReactNode
  content: ReactNode
}) {
  const { Provider, Root, Trigger, Portal, Content, Arrow } = TooltipPrimitive
  const contentClassName =
    "z-50 max-w-80 rounded-md border border-border/70 bg-popover px-3 py-2 text-xs leading-5 text-popover-foreground shadow-lg"

  return (
    <Provider delayDuration={150}>
      <Root>
        <Trigger asChild>{children}</Trigger>
        <Portal>
          <Content side="top" align="center" sideOffset={8} className={contentClassName}>
            {content}
            <Arrow className="fill-popover" />
          </Content>
        </Portal>
      </Root>
    </Provider>
  )
}
components/signals/signal-utils.ts ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import type { DifferingSetupField, ReportingCompleteness } from "@/lib/backend-artifacts"
2
+
3
/** Leading path prefixes that are stripped from dotted field paths before display. */
const FIELD_PREFIXES = [
  "autobenchmarkcard.",
  "eee_eval.",
  "evalcards.",
]

/** Lower-cased tokens that are rendered as fixed acronyms instead of Title Case. */
const TOKEN_OVERRIDES: Record<string, string> = {
  api: "API",
  ai: "AI",
  eee: "EEE",
  hf: "HF",
  id: "ID",
  llm: "LLM",
  url: "URL",
}

/**
 * Converts one path segment such as "max_tokens" into "Max Tokens".
 *
 * The segment is split on whitespace, underscores, and hyphens; each token is
 * either replaced by its acronym override or capitalised with the remainder
 * lower-cased, and the tokens are joined with single spaces.
 *
 * @param segment Raw segment text.
 * @returns The human-readable label ("" for an empty segment).
 */
function titleCaseSegment(segment: string) {
  return segment
    .split(/[\s_-]+/)
    .filter(Boolean)
    .map((token) => {
      const key = token.toLowerCase()
      // Own-property check: a plain index lookup would also find inherited
      // Object.prototype members (e.g. "constructor") and return a function.
      if (Object.prototype.hasOwnProperty.call(TOKEN_OVERRIDES, key)) {
        return TOKEN_OVERRIDES[key]
      }
      return `${token.charAt(0).toUpperCase()}${token.slice(1).toLowerCase()}`
    })
    .join(" ")
}
26
+
27
/**
 * Formats a fraction as a percentage string, e.g. 0.5 -> "50%".
 *
 * @param value  Fraction to format; null, undefined, NaN and +/-Infinity give "N/A".
 * @param digits Number of decimal places (default 0).
 */
export function formatPercent(value: number | null | undefined, digits = 0) {
  if (value != null && Number.isFinite(value)) {
    const scaled = value * 100
    return `${scaled.toFixed(digits)}%`
  }

  return "N/A"
}
34
+
35
/**
 * Formats a signal magnitude compactly.
 *
 * - null, undefined, NaN and +/-Infinity give "N/A".
 * - Values with |value| >= 100 use one decimal place, dropping a trailing ".0".
 * - Other values use `digits` decimal places with trailing fractional zeros
 *   (and a dangling decimal point) removed.
 *
 * @param value  Number to format.
 * @param digits Maximum decimal places for values below 100 (default 3).
 */
export function formatSignalNumber(value: number | null | undefined, digits = 3) {
  if (value == null || !Number.isFinite(value)) {
    return "N/A"
  }

  if (Math.abs(value) >= 100) {
    return value.toFixed(1).replace(/\.0$/, "")
  }

  const fixed = value.toFixed(digits)
  // Only strip zeros that sit after a decimal point; with digits = 0 the
  // string is an integer and "10" must not collapse to "1".
  const trimmed = fixed.includes(".") ? fixed.replace(/\.?0+$/, "") : fixed
  // Tiny negatives round to "-0"; show a plain zero instead.
  return trimmed === "-0" ? "0" : trimmed
}
46
+
47
/**
 * Turns a dotted field path into a readable label.
 *
 * The first matching entry of FIELD_PREFIXES (if any) is removed from the
 * start of the path; the remaining non-empty dot-separated segments are
 * title-cased and joined with " / ".
 */
export function formatFieldLabel(path: string) {
  const matchedPrefix = FIELD_PREFIXES.find((prefix) => path.startsWith(prefix))
  const withoutPrefix = matchedPrefix === undefined ? path : path.slice(matchedPrefix.length)

  const segments = withoutPrefix.split(".").filter(Boolean)
  return segments.map((segment) => titleCaseSegment(segment)).join(" / ")
}
62
+
63
/** Formats a single missing-field name for display by title-casing it as one segment. */
export function formatMissingField(field: string) {
  const label = titleCaseSegment(field)
  return label
}
66
+
67
/**
 * Renders an arbitrary setup value as display text.
 *
 * - null / undefined give "(unspecified)".
 * - Strings are returned unchanged.
 * - Numbers, booleans and bigints are converted with `String`.
 * - Everything else is JSON-encoded, falling back to `String(value)` when
 *   encoding throws (e.g. circular references) or produces no output.
 *
 * @param value Any value taken from a signal payload.
 * @returns Always a string.
 */
export function formatSignalValue(value: unknown) {
  if (value == null) {
    return "(unspecified)"
  }

  if (typeof value === "string") {
    return value
  }

  if (typeof value === "number" || typeof value === "boolean" || typeof value === "bigint") {
    return String(value)
  }

  try {
    // JSON.stringify returns undefined (not a string) for functions, symbols,
    // and objects whose toJSON yields undefined; fall back so callers always
    // receive text.
    return JSON.stringify(value) ?? String(value)
  } catch {
    return String(value)
  }
}
86
+
87
/**
 * Summarises which setup fields differ, e.g. "Max Tokens, Temperature +3".
 *
 * @param fields Differing-field records; only `field` is read.
 * @param limit  How many field names to spell out before collapsing the rest
 *               into a "+N" suffix (default 2).
 * @returns "setup fields" when the list is empty, otherwise the joined labels.
 */
export function formatDifferingFields(fields: DifferingSetupField[], limit = 2) {
  if (fields.length === 0) {
    return "setup fields"
  }

  const shown: string[] = []
  for (const item of fields.slice(0, limit)) {
    shown.push(formatMissingField(item.field))
  }

  const joined = shown.join(", ")
  const hiddenCount = fields.length - shown.length
  if (hiddenCount > 0) {
    return `${joined} +${hiddenCount}`
  }

  return joined
}
96
+
97
/**
 * Estimates how many documentation fields are populated.
 *
 * When per-field scores are present, their sum is rounded to the nearest
 * integer. Otherwise the count is derived from the overall score multiplied
 * by the number of fields evaluated.
 */
export function getCompletenessPopulatedCount(completeness: ReportingCompleteness) {
  const fieldScores = completeness.field_scores

  if (fieldScores.length > 0) {
    let total = 0
    for (const field of fieldScores) {
      total += field.score
    }
    return Math.round(total)
  }

  return Math.round(completeness.completeness_score * completeness.total_fields_evaluated)
}
components/signals/signals-row-badges.tsx ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ "use client"
2
+
3
+ import type { RowAnnotations } from "@/lib/backend-artifacts"
4
+ import { cn } from "@/lib/utils"
5
+ import { CrossPartyDivergenceBadge } from "./cross-party-divergence-badge"
6
+ import { ProvenanceBadge } from "./provenance-badge"
7
+ import { ReproducibilityBadge } from "./reproducibility-badge"
8
+ import { VariantDivergenceBadge } from "./variant-divergence-badge"
9
+
10
+ /**
11
+ * Renders the four signal badges for a single row.
12
+ *
13
+ * - `variant`="full" (default): shows all four signals. Use for single-metric
14
+ * leaderboards, expanded row panels, and one-off contexts.
15
+ * - `variant`="cell": only shows divergence signals (variant + cross-party).
16
+ * Use inside multi-metric matrix cells, where reproducibility and provenance
17
+ * are constant across columns and would just be visual noise.
18
+ * - `variant`="row": only shows reproducibility + provenance — the constant
19
+ * per-(model, benchmark) signals. Pair with `variant="cell"` columns so each
20
+ * row carries its constant signals once at the row header.
21
+ */
22
+ export function SignalsRowBadges({
23
+ annotations,
24
+ className,
25
+ hideOnMobile = true,
26
+ variant = "full",
27
+ }: {
28
+ annotations?: RowAnnotations | null
29
+ className?: string
30
+ hideOnMobile?: boolean
31
+ variant?: "full" | "cell" | "row"
32
+ }) {
33
+ if (!annotations) {
34
+ return null
35
+ }
36
+
37
+ const showRowLevel = variant === "full" || variant === "row"
38
+ const showCellLevel = variant === "full" || variant === "cell"
39
+
40
+ const hasReproducibility = showRowLevel && annotations.reproducibility_gap?.has_reproducibility_gap
41
+ const hasProvenance =
42
+ showRowLevel &&
43
+ Boolean(
44
+ annotations.provenance && annotations.provenance.source_type !== "unspecified"
45
+ )
46
+ const hasVariant = showCellLevel && annotations.variant_divergence?.has_variant_divergence
47
+ const hasCrossParty =
48
+ showCellLevel && annotations.cross_party_divergence?.has_cross_party_divergence
49
+
50
+ if (!hasReproducibility && !hasProvenance && !hasVariant && !hasCrossParty) {
51
+ return null
52
+ }
53
+
54
+ return (
55
+ <div
56
+ className={cn(
57
+ "mt-1.5 flex flex-wrap justify-end gap-1.5",
58
+ hideOnMobile && "hidden md:flex",
59
+ className
60
+ )}
61
+ >
62
+ {showRowLevel && <ReproducibilityBadge gap={annotations.reproducibility_gap} />}
63
+ {showRowLevel && <ProvenanceBadge provenance={annotations.provenance} />}
64
+ {showCellLevel && <VariantDivergenceBadge divergence={annotations.variant_divergence} />}
65
+ {showCellLevel && <CrossPartyDivergenceBadge divergence={annotations.cross_party_divergence} />}
66
+ </div>
67
+ )
68
+ }
components/signals/variant-divergence-badge.tsx ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ "use client"
2
+
3
+ import { GitCompareArrows } from "lucide-react"
4
+
5
+ import { useAudienceMode } from "@/components/audience-mode-provider"
6
+ import { Badge } from "@/components/ui/badge"
7
+ import type { VariantDivergence } from "@/lib/backend-artifacts"
8
+ import { cn } from "@/lib/utils"
9
+ import { formatDifferingFields, formatSignalNumber } from "./signal-utils"
10
+ import { SignalTooltip } from "./signal-tooltip"
11
+
12
/**
 * Rose badge shown when a row's variant-divergence flag is set.
 *
 * Renders nothing unless `divergence.has_variant_divergence` is truthy. The
 * research view's tooltip includes the divergence magnitude and a short list
 * of the differing setup fields; the other view uses plain-language wording.
 */
export function VariantDivergenceBadge({
  divergence,
  className,
}: {
  divergence?: VariantDivergence | null
  className?: string
}) {
  // The hook is called before any early return so hook order stays stable.
  const audience = useAudienceMode()

  if (!divergence?.has_variant_divergence) {
    return null
  }

  const isResearchView = audience.mode === "research"
  const magnitude = formatSignalNumber(divergence.divergence_magnitude)
  const fields = formatDifferingFields(divergence.differing_setup_fields)

  let tooltip: string
  let label: string
  if (isResearchView) {
    tooltip = `Scores diverge by ${magnitude} across different setups: ${fields}.`
    label = "Variant divergence"
  } else {
    tooltip = "Different runs of this evaluation produced different scores, so the setup matters."
    label = "Score depends on setup"
  }

  const badgeClassName = cn(
    "border-rose-300 bg-rose-50 text-rose-900 dark:border-rose-900/60 dark:bg-rose-950/35 dark:text-rose-100",
    className
  )

  return (
    <SignalTooltip content={tooltip}>
      <Badge variant="outline" className={badgeClassName}>
        <GitCompareArrows className="h-3 w-3" />
        {label}
      </Badge>
    </SignalTooltip>
  )
}
docs/INTERPRETIVE_SIGNALS.md ADDED
@@ -0,0 +1,622 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # EvalCards interpretive signals — frontend implementation spec
2
+
3
+ **Status:** ready to implement. Backend ships in `evaleval/eval_cards_backend_pipeline` PR #1 (merged `b05323c`). All field shapes below are stable and covered by the backend's test suite.
4
+
5
+ **Companion docs:**
6
+ - Spec source of truth: *EvalCards Interpretive Signals v1.0* (Anka Reuel, Stanford). Section refs (§3, §4, …) below point at that doc.
7
+ - Open backend questions: [evaleval/eval_cards_backend_pipeline#2](https://github.com/evaleval/eval_cards_backend_pipeline/issues/2). None block frontend work — they may shift wording, not shape.
8
+
9
+ ---
10
+
11
+ ## 0. What this PR does at a glance
12
+
13
+ The backend now annotates evaluation records with four interpretive signals:
14
+
15
+ 1. **Reproducibility gap** — *per row.* Was the evaluation documented well enough to be re-run? Surfaced as a missing-fields list (e.g. "missing `max_tokens`").
16
+ 2. **Reporting completeness** — *per benchmark.* What fraction of EvalCards-required documentation fields are populated? Surfaced as a `[0, 1]` score with a missing-field breakdown.
17
+ 3. **Provenance** — *per row.* Who reported this score (first-party / third-party / collaborative / unspecified), and is it the only source for this `(model, benchmark, metric)` group?
18
+ 4. **Comparability** — *per `(model, benchmark, metric)` group.* Two flavors: **variant divergence** (same model, same benchmark, different setups → diverging scores) and **cross-party divergence** (different orgs reporting → diverging scores).
19
+
20
+ Plus a corpus-level rollup file (`corpus-aggregates.json`) for a stratified analytics page.
21
+
22
+ The frontend's job: surface these signals **in three places** — row-level badges, per-eval / per-model summary panels, and a corpus dashboard view.
23
+
24
+ ---
25
+
26
+ ## 1. Where the new data lives
27
+
28
+ All fields are new additions to existing artifacts. No artifact is removed or reshaped.
29
+
30
+ | Artifact | New fields |
31
+ |---|---|
32
+ | `evals/{id}.json` (`HFEvalDetail`) | Per-row `evalcards.annotations` block on every `metrics[].model_results[]` and `subtasks[…].metrics[].model_results[]`. Plus eval-root `evalcards.annotations.reporting_completeness`, `evalcards.annotations.benchmark_comparability`, and three top-level summaries: `reproducibility_summary`, `provenance_summary`, `comparability_summary`. |
33
+ | `models/{id}.json` (`HFModelDetail`) | Per-row `evalcards.annotations` block on every `hierarchy_by_category[*][*].metrics[].model_results[]`. Plus three top-level summaries scoped to that model. |
34
+ | `eval-list.json` / `eval-list-lite.json` (`HFEvalListEntry`) | Three summaries per entry. |
35
+ | `model-cards.json` / `model-cards-lite.json` (`HFModelCardEntry`) | Three summaries per entry. |
36
+ | `eval-hierarchy.json` (`EvalHierarchy`) | Each family node and leaf node carries the three summaries (aggregated over evals under it). |
37
+ | **`corpus-aggregates.json` (NEW FILE)** | Stratified rollups for paper / dashboard use. |
38
+ | `manifest.json` | New entry in `summary_artifacts`: `corpus_aggregates: "corpus-aggregates.json"`. |
39
+
40
+ `signal_version` (currently `"1.0"`) is present on every annotation. Treat it as opaque; surface only in admin/debug.
41
+
42
+ ---
43
+
44
+ ## 2. TypeScript types to add
45
+
46
+ Add to `lib/backend-artifacts.ts` (preferred — these are pipeline contract types):
47
+
48
+ ```ts
49
+ // Spec §3
50
+ export interface ReproducibilityGap {
51
+ has_reproducibility_gap: boolean
52
+ missing_fields: string[] // e.g. ["max_tokens"]
53
+ required_field_count: number // 2 base + 2 if agentic on current runtime
54
+ populated_field_count: number
55
+ signal_version: string
56
+ }
57
+
58
+ // Spec §5
59
+ export type ProvenanceSourceType =
60
+ | "first_party"
61
+ | "third_party"
62
+ | "collaborative"
63
+ | "unspecified"
64
+
65
+ export interface Provenance {
66
+ source_type: ProvenanceSourceType
67
+ is_multi_source: boolean
68
+ first_party_only: boolean // see §6.1 below for caveat
69
+ distinct_reporting_organizations: number
70
+ signal_version: string
71
+ }
72
+
73
+ // Spec §6.1
74
+ export interface VariantDivergence {
75
+ has_variant_divergence: boolean
76
+ group_id: string // "{model_route_id}__{metric_summary_id}"
77
+ divergence_magnitude: number
78
+ threshold_used: number
79
+ threshold_basis:
80
+ | "proportion_or_continuous_normalized"
81
+ | "percent"
82
+ | "range_5pct"
83
+ | "fallback_default"
84
+ differing_setup_fields: Array<{ field: string; values: unknown[] }>
85
+ scores_in_group: number[]
86
+ this_triple_score: number | null // this row's score within the group
87
+ triple_count_in_group: number
88
+ score_scale_anomaly: boolean
89
+ group_variant_breakdown: Array<{ variant_key: string; row_count: number }>
90
+ signal_version: string
91
+ }
92
+
93
+ // Spec §6.2
94
+ export interface CrossPartyDivergence {
95
+ has_cross_party_divergence: boolean
96
+ group_id: string
97
+ divergence_magnitude: number
98
+ threshold_used: number
99
+ threshold_basis: VariantDivergence["threshold_basis"]
100
+ scores_by_organization: Record<string, number> // display org name → score
101
+ differing_setup_fields: Array<{ field: string; values: unknown[] }>
102
+ organization_count: number
103
+ group_variant_breakdown: Array<{ variant_key: string; row_count: number }>
104
+ signal_version: string
105
+ }
106
+
107
+ // Per-row annotation block (carried on every model_result row)
108
+ export interface RowAnnotations {
109
+ reproducibility_gap: ReproducibilityGap | null
110
+ provenance: Provenance | null
111
+ variant_divergence: VariantDivergence | null
112
+ cross_party_divergence: CrossPartyDivergence | null
113
+ }
114
+
115
+ // Spec §4
116
+ export interface ReportingCompleteness {
117
+ completeness_score: number // [0, 1]
118
+ total_fields_evaluated: number
119
+ missing_required_fields: string[] // dotted paths
120
+ partial_fields: Array<{
121
+ field_path: string
122
+ score: number // (0, 1) — strictly between
123
+ populated_subitems: number
124
+ total_subitems: number
125
+ }>
126
+ field_scores: Array<{
127
+ field_path: string
128
+ coverage_type: "full" | "partial" | "reserved"
129
+ score: number // [0, 1]
130
+ }>
131
+ signal_version: string
132
+ }
133
+
134
+ export interface BenchmarkComparability {
135
+ variant_divergence_groups: Array<{
136
+ group_id: string
137
+ model_route_id: string
138
+ divergence_magnitude: number
139
+ threshold_used: number
140
+ threshold_basis: VariantDivergence["threshold_basis"]
141
+ differing_setup_fields: VariantDivergence["differing_setup_fields"]
142
+ }>
143
+ cross_party_divergence_groups: Array<{
144
+ group_id: string
145
+ model_route_id: string
146
+ divergence_magnitude: number
147
+ threshold_used: number
148
+ threshold_basis: VariantDivergence["threshold_basis"]
149
+ scores_by_organization: Record<string, number>
150
+ differing_setup_fields: VariantDivergence["differing_setup_fields"]
151
+ }>
152
+ }
153
+
154
+ // Eval-root or model-root annotation block
155
+ export interface EvalcardsAnnotations {
156
+ reporting_completeness?: ReportingCompleteness
157
+ benchmark_comparability?: BenchmarkComparability
158
+ }
159
+
160
+ // Top-level summary blocks (present on eval-list / model-cards / eval / model / hierarchy nodes)
161
+ export interface ReproducibilitySummary {
162
+ results_total: number
163
+ has_reproducibility_gap_count: number
164
+ populated_ratio_avg: number | null // null when results_total == 0
165
+ }
166
+
167
+ export interface ProvenanceSummary {
168
+ total_results: number
169
+ total_groups: number
170
+ multi_source_groups: number
171
+ first_party_only_groups: number
172
+ source_type_distribution: Record<ProvenanceSourceType, number>
173
+ }
174
+
175
+ export interface ComparabilitySummary {
176
+ total_groups: number
177
+ groups_with_variant_check: number // eligible groups (>=2 rows, differing setups, >=2 scored)
178
+ groups_with_cross_party_check: number // eligible groups (>=2 named orgs)
179
+ variant_divergent_count: number
180
+ cross_party_divergent_count: number
181
+ }
182
+
183
+ export interface SignalSummaries {
184
+ reproducibility_summary?: ReproducibilitySummary
185
+ provenance_summary?: ProvenanceSummary
186
+ comparability_summary?: ComparabilitySummary
187
+ }
188
+
189
+ // corpus-aggregates.json
190
+ export interface CorpusAggregates {
191
+ generated_at: string
192
+ signal_version: string
193
+ stratification_dimensions: ["category"]
194
+ reproducibility: Stratified<ReproducibilityCorpusBlock>
195
+ completeness: Stratified<CompletenessCorpusBlock>
196
+ provenance: Stratified<ProvenanceCorpusBlock>
197
+ comparability: Stratified<ComparabilityCorpusBlock>
198
+ }
199
+
200
+ export interface Stratified<T> {
201
+ overall: T
202
+ by_category: Record<string, T> // categories: agentic | general | knowledge | reasoning | safety | other
203
+ }
204
+
205
+ export interface ReproducibilityCorpusBlock {
206
+ total_triples: number
207
+ triples_with_reproducibility_gap: number
208
+ reproducibility_gap_rate: number | null
209
+ agentic_triples: number
210
+ per_field_missingness: Record<string, {
211
+ missing_count: number
212
+ missing_rate: number | null
213
+ denominator: "all_triples" | "agentic_only"
214
+ denominator_count: number
215
+ }>
216
+ }
217
+
218
+ export interface CompletenessCorpusBlock {
219
+ total_benchmarks: number
220
+ completeness_score_mean: number | null
221
+ completeness_score_median: number | null
222
+ per_field_population: Record<string, {
223
+ mean_score: number
224
+ populated_rate: number
225
+ fully_populated_rate: number
226
+ benchmark_count: number
227
+ }>
228
+ }
229
+
230
+ export interface ProvenanceCorpusBlock {
231
+ total_triples: number
232
+ total_groups: number
233
+ multi_source_groups: number
234
+ multi_source_rate: number | null
235
+ first_party_only_groups: number
236
+ first_party_only_rate: number | null
237
+ source_type_distribution: Record<ProvenanceSourceType, number>
238
+ }
239
+
240
+ export interface ComparabilityCorpusBlock {
241
+ total_groups: number
242
+ variant_eligible_groups: number
243
+ variant_divergent_groups: number
244
+ variant_divergence_rate: number | null
245
+ cross_party_eligible_groups: number
246
+ cross_party_divergent_groups: number
247
+ cross_party_divergence_rate: number | null // commonly null on current corpus
248
+ }
249
+ ```
250
+
251
+ Then in `lib/hf-data.ts`:
252
+
253
+ - Extend `HFEvalModelResult` (line ~522) with `evalcards?: { annotations?: RowAnnotations }`.
254
+ - Extend `HFEvalDetail` (line ~556) with `evalcards?: { annotations?: EvalcardsAnnotations }` plus the three summary fields from `SignalSummaries`.
255
+ - Extend `HFEvalListEntry` (line ~475) with `SignalSummaries` fields.
256
+ - Extend `HFModelCardEntry` (line ~439) with `SignalSummaries` fields.
257
+ - Extend `HFModelDetail` (line ~571) with `SignalSummaries` fields.
258
+ - Extend `HFModelHierarchyMetric` (line ~616) — `model_results` already typed as `HFEvalModelResult`, so the per-row annotations propagate automatically.
259
+
260
+ In `EvalHierarchy` types (`lib/backend-artifacts.ts` line ~54), add `SignalSummaries` to both `HierarchyFamily` and `HierarchyBenchmark`.
261
+
262
+ All fields are **optional** at the type level — older cached snapshots won't have them, and the frontend should render gracefully when they're absent.
263
+
264
+ ---
265
+
266
+ ## 3. Data plumbing
267
+
268
+ ### 3.1 New fetcher + API route for corpus aggregates
269
+
270
+ In `lib/hf-data.ts`, add after the existing fetchers (~line 866):
271
+
272
+ ```ts
273
+ export async function fetchCorpusAggregates(): Promise<CorpusAggregates | null> {
274
+ return fetchHFJsonSafe<CorpusAggregates>("corpus-aggregates.json")
275
+ }
276
+ ```
277
+
278
+ Add to `scripts/cache-hf-data.mjs` `CACHE_ROOT_FILES` array: `"corpus-aggregates.json"`. (Mark it optional in `OPTIONAL_CACHE_ROOT_FILES` if shipping while the HF dataset upload is still rolling — once the backend pipeline next runs against the dataset, the file will appear.)
279
+
280
+ Create `app/api/corpus-aggregates/route.ts`:
281
+
282
+ ```ts
283
+ import { NextResponse } from "next/server"
284
+ import { fetchCorpusAggregates } from "@/lib/hf-data"
285
+
286
+ export async function GET() {
287
+ const aggregates = await fetchCorpusAggregates()
288
+ if (!aggregates) {
289
+ return NextResponse.json({ error: "Corpus aggregates not available" }, { status: 404 })
290
+ }
291
+ return NextResponse.json(aggregates)
292
+ }
293
+ ```
294
+
295
+ ### 3.2 Rest of plumbing is automatic
296
+
297
+ Existing fetchers (`fetchEvalDetail`, `fetchModelDetail`, `fetchEvalList`, `fetchModelCardsList`, `fetchEvalHierarchy`) just pull the raw JSON, so the new fields propagate without code changes once the types above are widened.
298
+
299
+ ---
300
+
301
+ ## 4. UX components to build
302
+
303
+ Build a small set of reusable signal components in `components/signals/`. Each takes one of the typed shapes above and renders a badge / panel. This keeps signal rendering consistent across `eval-detail.tsx`, `benchmark-detail.tsx`, `model-compare-dialog.tsx`, and the new corpus dashboard.
304
+
305
+ ```
306
+ components/signals/
307
+ ├── reproducibility-badge.tsx
308
+ ├── provenance-badge.tsx // already partially exists in benchmark-detail.tsx — see §4.2
309
+ ├── variant-divergence-badge.tsx
310
+ ├── cross-party-divergence-badge.tsx
311
+ ├── reproducibility-panel.tsx // detail view — full missing-fields list
312
+ ├── completeness-panel.tsx // detail view — score bar + missing-field list
313
+ ├── comparability-panel.tsx // detail view — divergent groups list
314
+ ├── signals-row-badges.tsx // composite: renders all four row-level badges with proper spacing
315
+ └── signal-tooltip.tsx // shared tooltip primitive
316
+ ```
317
+
318
+ All badges should follow the existing tone conventions used by `getRelationshipBadgeTone` ([components/benchmark-detail.tsx:289](../components/benchmark-detail.tsx#L289)) and the `Badge` primitive in [components/ui/badge.tsx](../components/ui/badge.tsx).
319
+
320
+ ### 4.1 Row-level badges — placement
321
+
322
+ Insert `<SignalsRowBadges annotations={modelResult.evalcards?.annotations} />` next to the score cell in:
323
+
324
+ - **Eval detail leaderboard table** — [components/eval-detail.tsx:869-871](../components/eval-detail.tsx#L869-L871) (the `<TableCell className="text-right">` containing the score). Render badges below the score on a new line for desktop, hidden on mobile.
325
+ - **Benchmark detail rows** — `components/benchmark-detail.tsx` renders score rows in several places (search for `formatRawScoreValue`); insert the same component.
326
+ - **Model compare dialog** — [components/model-compare-dialog.tsx](../components/model-compare-dialog.tsx) score columns.
327
+
328
+ **Display rules — only badge for actionable states.** Silence is meaningful here.
329
+
330
+ | Signal | Show badge when | Hide when |
331
+ |---|---|---|
332
+ | Reproducibility | `has_reproducibility_gap === true` | gap=false, or annotation absent |
333
+ | Provenance | `source_type` ∈ {`first_party`, `third_party`, `collaborative`} | `source_type === "unspecified"` |
334
+ | Variant divergence | `variant_divergence !== null && has_variant_divergence === true` | null (not applicable) or false (checked, fine) |
335
+ | Cross-party divergence | `cross_party_divergence !== null && has_cross_party_divergence === true` | null (almost always on current corpus) or false |
336
+
337
+ `has_*: false` means "we checked and it's fine" — silent success. `null` means "not applicable / not enough data" — also silent. **Only divergent / gap-positive states warrant pixels.**
338
+
339
+ **Dedup rule.** `variant_divergence` and `cross_party_divergence` are duplicated onto every row in the same group. If you render three rows from the same `group_id`, render the divergence badge on each row but the *expanded panel* (§4.4) only once at the group header.
340
+
341
+ ### 4.2 Provenance badge — reuse what's there
342
+
343
+ [components/benchmark-detail.tsx:262-302](../components/benchmark-detail.tsx#L262-L302) already has `getRelationshipShortLabel` and `getRelationshipBadgeTone`. Extract these into `components/signals/provenance-badge.tsx` and import back into `benchmark-detail.tsx`. The new badge should **also** consume the new `Provenance` annotation when present (it carries `is_multi_source` and `first_party_only`, which the current implementation derives row-by-row from `source_metadata` alone).
344
+
345
+ When `provenance.first_party_only === true`, show a small, subtle ⚠ indicator on the first-party badge ("first-party only — no independent replication"). This is the headline use of the signal for policy-mode readers.
346
+
347
+ ### 4.3 Reproducibility badge — content rules
348
+
349
+ Tooltip content depends on audience mode (`useAudienceMode()` from [components/audience-mode-provider.tsx:40](../components/audience-mode-provider.tsx#L40)):
350
+
351
+ - Research mode: "Setup not fully documented. Missing: `max_tokens`, `eval_plan`."
352
+ - Policy mode: "This score's setup isn't fully documented, so it can't be re-run as-is."
353
+
354
+ Always include the count "{populated_field_count} of {required_field_count} setup fields recorded." Don't hardcode "4 fields" — the active runtime checks 2 base fields (`temperature`, `max_tokens`) plus 2 agentic fields (`eval_plan`, `eval_limits`) when the benchmark is agentic. Read counts off the annotation.
355
+
356
+ ### 4.4 Detail panels — placement
357
+
358
+ #### Reproducibility panel
359
+ The existing "Evaluation Provenance" panel in [components/eval-detail.tsx:952-998](../components/eval-detail.tsx#L952-L998) (rendered when a row is expanded) is the right place for the **per-row** reproducibility breakdown. Add a new `DetailPanel` adjacent to it:
360
+
361
+ ```tsx
362
+ {rowAnnotations?.reproducibility_gap && (
363
+ <DetailPanel
364
+ title={isResearchView ? "Reproducibility" : "Re-runnability"}
365
+ subtitle={
366
+ isResearchView
367
+ ? "Whether the setup is documented well enough for someone else to re-run."
368
+ : "Whether someone could re-run this evaluation with the information available."
369
+ }
370
+ >
371
+ <MetaRow
372
+ label="Setup fields recorded"
373
+ value={`${rowAnnotations.reproducibility_gap.populated_field_count} of ${rowAnnotations.reproducibility_gap.required_field_count}`}
374
+ />
375
+ {rowAnnotations.reproducibility_gap.missing_fields.length > 0 && (
376
+ <MetaRow
377
+ label="Missing"
378
+ value={rowAnnotations.reproducibility_gap.missing_fields.join(", ")}
379
+ />
380
+ )}
381
+ </DetailPanel>
382
+ )}
383
+ ```
384
+
385
+ #### Completeness panel
386
+ Render at the **eval-detail header level** (above the leaderboard, below the metric specification card). New `<CompletenessPanel completeness={detail.evalcards?.annotations?.reporting_completeness} />`. UI: progress bar showing `completeness_score`, label "{N} of {M} fields populated" where N = sum of `field_scores[].score` rounded, M = `total_fields_evaluated`. Below: collapsible accordions:
387
+
388
+ - **Missing required fields** (count badge) — list of `missing_required_fields` with friendly labels (see §6.3 for label mapping).
389
+ - **Partially populated** (count badge) — `partial_fields` rendered as "{field}: {populated_subitems}/{total_subitems}".
390
+
391
+ In policy mode, don't show the dotted-path field names — show friendly labels only. In research mode, show both.
392
+
393
+ #### Comparability panel
394
+ Also at eval-detail header level. Sourced from `detail.evalcards?.annotations?.benchmark_comparability`. Render as two collapsibles — "Variant divergence ({count})" and "Cross-party divergence ({count})". Each item should link to the relevant model row (use `model_route_id` from each group entry as anchor — add `id={"row-" + model_route_id}` on the leaderboard row).
395
+
396
+ When both arrays are empty, hide the panel entirely. When `comparability_summary.groups_with_cross_party_check === 0` (the common state), surface a small note: "No third-party reports available for cross-party comparison."
397
+
398
+ ### 4.5 Per-eval header chips
399
+ On the eval-detail page header (next to existing "Measures" / "Source dataset" chips around [components/eval-detail.tsx:486-525](../components/eval-detail.tsx#L486-L525)), add a fourth chip when `evalcards.annotations.reporting_completeness` is present:
400
+
401
+ > **Documentation**
402
+ > {round(completeness_score * 100)}%
403
+
404
+ Tooltip: "{N} of {M} EvalCards documentation fields populated for this benchmark."
405
+
406
+ ### 4.6 Per-model card chips
407
+ On `components/eval-card.tsx` and the model card pages, add three chips driven by the model-level summaries. Replace the hand-written hint at [components/eval-card.tsx:250](../components/eval-card.tsx#L250) ("Some results lack generation settings; compare scores with care.") with a data-driven version:
408
+
409
+ > {has_reproducibility_gap_count} of {results_total} reported scores aren't fully documented.
410
+
411
+ Show only when `has_reproducibility_gap_count > 0`. The hand-written hint was a placeholder for exactly this signal — wire it up.
412
+
413
+ ---
414
+
415
+ ## 5. New page: corpus dashboard
416
+
417
+ Add `app/corpus/page.tsx` (linked from main navigation [components/navigation.tsx](../components/navigation.tsx)). Server component that calls `fetchCorpusAggregates()` and renders four sections:
418
+
419
+ ### 5.1 Reproducibility section
420
+ - Headline number: `reproducibility_gap_rate` rendered as percentage. Sub-label: "{triples_with_reproducibility_gap} of {total_triples} reported scores."
421
+ - Per-field horizontal bar chart from `per_field_missingness`. **Bar denominator depends on `denominator` field**: agentic-only fields use `agentic_triples`, others use `total_triples`. Label each bar with the denominator type so users understand.
422
+ - Toggle: `overall` ↔ `by_category` (rendered as a small-multiple grid, one panel per category).
423
+
424
+ ### 5.2 Completeness section
425
+ - Headline: `completeness_score_mean` (and median) across `total_benchmarks`.
426
+ - Histogram of per-benchmark scores (pull individual benchmark scores from `eval-list.json` `reporting_completeness.completeness_score`, since corpus-aggregates only carries mean/median).
427
+ - Per-field bar chart from `per_field_population` — three bars per field: `mean_score`, `populated_rate`, `fully_populated_rate`. (See §6.7 for which one to highlight per coverage type.)
428
+
429
+ ### 5.3 Provenance section
430
+ - Stacked bar of `source_type_distribution` (across all triples).
431
+ - Two ratios: `multi_source_rate`, `first_party_only_rate`. Label both: "% of (model, benchmark, metric) groups."
432
+
433
+ ### 5.4 Comparability section
434
+ - Two side-by-side panels: Variant divergence (eligible-aware rate) and Cross-party divergence (often null).
435
+ - **When `cross_party_divergence_rate === null`:** show a "Not enough multi-org coverage to compute" empty state, not "0%". Same for `variant_divergence_rate === null`. This is critical — see §6.5.
436
+
437
+ All sections support a category toggle (research mode shows category breakdowns by default; policy mode shows overall by default).
438
+
439
+ ---
440
+
441
+ ## 6. Caveats and edge cases (read these before implementing)
442
+
443
+ ### 6.1 `first_party_only` semantics
444
+ A row can be `first_party_only: true` even when `is_multi_source: false`. The spec literal: a group with one *named* org reporting first-party gets the badge. **Don't read it as "exclusive coverage"** — read it as "no independent replication." The label suggestion is "First-party only" rather than "Sole source."
445
+
446
+ If `distinct_reporting_organizations === 0` (all rows have null org), `first_party_only` is `false` even when `source_type === "first_party"`. Render the row's source as "First-party (org unspecified)" in research mode; suppress the first-party-only badge.
447
+
448
+ ### 6.2 Active reproducibility field set is reduced
449
+ The spec describes four base fields (`temperature`, `top_p`, `max_tokens`, `prompt_template`); the active backend currently checks **only `temperature` and `max_tokens`** plus `eval_plan` / `eval_limits` for agentic benchmarks. **Don't hardcode "4 fields" anywhere.** Always read `required_field_count` off the annotation. This is a deliberate spec-author choice and may revert; the field count is the only stable interface.
450
+
451
+ ### 6.3 Missing-field path strings
452
+ `missing_fields` for reproducibility uses bare names (e.g. `"max_tokens"`). `missing_required_fields` for completeness uses dotted paths (e.g. `"autobenchmarkcard.methodology.baseline_results"`). Different conventions, intentional. Build a small label map for completeness paths — paths come from [registry/completeness_fields.json](https://github.com/evaleval/eval_cards_backend_pipeline/blob/main/registry/completeness_fields.json) on the backend repo. Suggested label rules:
453
+
454
+ - Drop the `autobenchmarkcard.` / `eee_eval.` / `evalcards.` prefix.
455
+ - Replace dots with " / " and underscores with spaces, then capitalize the first letter of each segment.
456
+ - Example: `autobenchmarkcard.methodology.baseline_results` → "Methodology / Baseline results".
457
+
458
+ ### 6.4 `differing_setup_fields[].values` may contain null and mixed types
459
+ Per spec §6.1.4, `null` is a *distinct* value from any explicit setting (comparing "explicit 2048" to "unspecified" is meaningful). Render `null` as "(unspecified)" rather than the string "null". Numeric, string, boolean, and object values can all appear in the same array; render with `JSON.stringify` for objects, plain text otherwise.
460
+
461
+ ### 6.5 `null` rates in comparability are *not* zero
462
+ Eligibility-aware denominators mean `variant_divergence_rate` and `cross_party_divergence_rate` are `null` when no groups were eligible. **Render as "N/A — not enough data" or an empty-state card, never as "0%".** On the current corpus, `cross_party_divergence_rate` will commonly be null (third-party reports are sparse). Treat this as a normal state, not a data-loading error.
463
+
464
+ ### 6.6 Score-scale anomaly flag
465
+ `variant_divergence.score_scale_anomaly === true` indicates the metric was declared `proportion` but scores fell outside [0, 1] — usually a metric-normalization bug upstream. Surface as a small "data quality warning" annotation alongside the divergence number; the divergence is still computed but the threshold may not be apples-to-apples.
466
+
467
+ ### 6.7 `mean_score` vs `populated_rate` for completeness
468
+ Per-field aggregates expose three numbers. Pick which to display based on `coverage_type`:
469
+
470
+ - **`full` and `reserved` fields** — `mean_score` and `populated_rate` are equal. Show one number labeled "% of benchmarks populating this field."
471
+ - **`partial` fields** — they diverge. `populated_rate` = % of benchmarks with *any* sub-item; `mean_score` = average sub-item population fraction. Show both: "{populated_rate}% have any data, {mean_score}% on average across sub-items."
472
+
473
+ ### 6.8 No `computed_at` on per-record annotations
474
+ Only `signal_version` is on each annotation. For "last computed" UI text, use `manifest.json → generated_at` from the existing `BackendManifest`.
475
+
476
+ ### 6.9 Stratification categories
477
+ `by_category` keys are: `agentic`, `general`, `knowledge`, `reasoning`, `safety`, `other`. Same set as the existing `category` field on evals — reuse whatever color scheme is currently keyed off `inferCategoryFromBenchmark` ([lib/benchmark-schema.ts](../lib/benchmark-schema.ts)).
478
+
479
+ ### 6.10 Annotation block can be `null` or absent
480
+ `evalcards.annotations.{reproducibility_gap,provenance,variant_divergence,cross_party_divergence}` can each be `null` independently, and the entire `evalcards` block may be absent on older cached snapshots. Use optional chaining everywhere; never assume presence. The `RowAnnotations` type intentionally types each subfield as `T | null` (not `T | undefined`) because the backend writes explicit `null`.
481
+
482
+ ---
483
+
484
+ ## 7. Suggested implementation order
485
+
486
+ 1. **Types + plumbing** (1–2 hours): types in `backend-artifacts.ts` + `hf-data.ts`, the `fetchCorpusAggregates` fetcher, the API route, and adding `corpus-aggregates.json` to the cache script. No UI yet.
487
+ 2. **Row-level badges** (½ day): build `signals/` directory with the four badge components, the dedup-aware `signals-row-badges.tsx`, and wire into eval-detail and benchmark-detail. This is the most visible win.
488
+ 3. **Per-eval completeness panel + comparability panel** (½ day): single benchmark, easy to design around. New `CompletenessPanel` is the headline new UX in this set.
489
+ 4. **Per-row reproducibility detail panel** (1–2 hours): drops into the existing expanded row layout.
490
+ 5. **Per-eval / per-model header chips + replace the hand-written gap hint** (1–2 hours): wires the summary fields into existing card surfaces.
491
+ 6. **Corpus dashboard page** (1–2 days): new route, new components, biggest scope. Defer until 1–5 are live and reviewed.
492
+
493
+ Each step is independently shippable. Steps 1–5 can land before the corpus dashboard is designed.
494
+
495
+ ---
496
+
497
+ ## 8. Out of scope (don't do these yet)
498
+
499
+ - **Filter / sort the eval list by signal state** ("show only benchmarks with completeness > 0.5"). Wait for the dashboard view to land first; users will tell us which filters they actually want.
500
+ - **Side-by-side score comparison with divergence overlay.** The data supports it (`scores_in_group`, `scores_by_organization`) but the design space is large. Hold off until we see the row-level badges in use.
501
+ - **Recompute / verification UI for missing reproducibility fields.** Backend-side; out of scope here.
502
+ - **Per-instance sample-level badges.** Signals operate at row / benchmark level; sample-level instance data is unaffected.
503
+
504
+ ---
505
+
506
+ ## 9. Reference: minimal real-shape examples
507
+
508
+ Per-row `evalcards.annotations` with all four signal keys present (the two divergence signals are `null`, i.e. not applicable for this row):
509
+
510
+ ```jsonc
511
+ {
512
+ "reproducibility_gap": {
513
+ "has_reproducibility_gap": true,
514
+ "missing_fields": ["max_tokens"],
515
+ "required_field_count": 2,
516
+ "populated_field_count": 1,
517
+ "signal_version": "1.0"
518
+ },
519
+ "provenance": {
520
+ "source_type": "first_party",
521
+ "is_multi_source": false,
522
+ "first_party_only": true,
523
+ "distinct_reporting_organizations": 1,
524
+ "signal_version": "1.0"
525
+ },
526
+ "variant_divergence": null,
527
+ "cross_party_divergence": null
528
+ }
529
+ ```
530
+
531
+ Per-eval `evalcards.annotations` with completeness + comparability:
532
+
533
+ ```jsonc
534
+ {
535
+ "reporting_completeness": {
536
+ "completeness_score": 0.62,
537
+ "total_fields_evaluated": 28,
538
+ "missing_required_fields": [
539
+ "autobenchmarkcard.methodology.baseline_results",
540
+ "autobenchmarkcard.methodology.validation",
541
+ "evalcards.preregistration_url"
542
+ ],
543
+ "partial_fields": [
544
+ { "field_path": "autobenchmarkcard.data", "score": 0.5, "populated_subitems": 2, "total_subitems": 4 }
545
+ ],
546
+ "field_scores": [/* 28 entries */],
547
+ "signal_version": "1.0"
548
+ },
549
+ "benchmark_comparability": {
550
+ "variant_divergence_groups": [
551
+ {
552
+ "group_id": "openai__gpt-5__hfopenllm_v2_bbh_accuracy",
553
+ "model_route_id": "openai__gpt-5",
554
+ "divergence_magnitude": 0.12,
555
+ "threshold_used": 0.05,
556
+ "threshold_basis": "proportion_or_continuous_normalized",
557
+ "differing_setup_fields": [
558
+ { "field": "max_tokens", "values": [2048, 4096, 8192] }
559
+ ]
560
+ }
561
+ ],
562
+ "cross_party_divergence_groups": []
563
+ }
564
+ }
565
+ ```
566
+
567
+ Top-level `provenance_summary` example:
568
+
569
+ ```jsonc
570
+ {
571
+ "total_results": 142,
572
+ "total_groups": 47,
573
+ "multi_source_groups": 3,
574
+ "first_party_only_groups": 30,
575
+ "source_type_distribution": {
576
+ "first_party": 120,
577
+ "third_party": 18,
578
+ "collaborative": 0,
579
+ "unspecified": 4
580
+ }
581
+ }
582
+ ```
583
+
584
+ `corpus-aggregates.json` structure (top of file):
585
+
586
+ ```jsonc
587
+ {
588
+ "generated_at": "2026-04-27T...",
589
+ "signal_version": "1.0",
590
+ "stratification_dimensions": ["category"],
591
+ "reproducibility": { "overall": {/* ReproducibilityCorpusBlock */}, "by_category": { "agentic": {...}, "general": {...}, ... } },
592
+ "completeness": { "overall": {/* CompletenessCorpusBlock */}, "by_category": {...} },
593
+ "provenance": { "overall": {/* ProvenanceCorpusBlock */}, "by_category": {...} },
594
+ "comparability": { "overall": {/* ComparabilityCorpusBlock */}, "by_category": {...} }
595
+ }
596
+ ```
597
+
598
+ ---
599
+
600
+ ## 10. Audience-mode wording cheatsheet
601
+
602
+ | Element | Research mode | Policy mode |
603
+ |---|---|---|
604
+ | Reproducibility gap badge | "Reproducibility gap" | "Setup not documented" |
605
+ | Reproducibility tooltip | "Setup not fully documented. Missing: {fields}." | "This score's setup isn't fully documented, so it can't be re-run as-is." |
606
+ | Reproducibility panel title | "Reproducibility" | "Re-runnability" |
607
+ | Completeness chip label | "Documentation" | "Documentation" |
608
+ | Completeness panel title | "Reporting completeness" | "How well is this benchmark documented?" |
609
+ | Provenance: first-party | "1st party" | "Reported by model developer" |
610
+ | Provenance: first-party only | "1st party only — no replication" | "Only the model developer reported this score" |
611
+ | Provenance: third-party | "3rd party" | "Independently reported" |
612
+ | Provenance: collaborative | "Collaborative" | "Joint report" |
613
+ | Variant divergence badge | "Variant divergence" | "Score depends on setup" |
614
+ | Variant divergence tooltip | "Scores diverge by {magnitude} across different setups: {fields}." | "Different runs of this evaluation produced different scores — the setup matters." |
615
+ | Cross-party divergence badge | "Cross-party divergence" | "Sources disagree" |
616
+ | Cross-party divergence tooltip | "Reports diverge by {magnitude} across organizations." | "Different organizations reported different scores for this same model on this same benchmark." |
617
+
618
+ Adjust tone but keep the underlying numbers identical across modes — the data is the same, only the framing changes.
619
+
620
+ ---
621
+
622
+ *Last updated 2026-04-27. Maintainer: backend pipeline (eval_cards_backend_pipeline), frontend (general-eval-card). Questions on backend semantics → [eval_cards_backend_pipeline#2](https://github.com/evaleval/eval_cards_backend_pipeline/issues/2). Questions on UX → discuss with @anka-evals + frontend team.*
lib/backend-artifacts.ts CHANGED
@@ -2,6 +2,10 @@ export interface BackendManifest {
2
  generated_at: string
3
  config_version: number
4
  skipped_configs: string[]
 
 
 
 
5
  }
6
 
7
  export interface BackendManifestStatus {
@@ -14,6 +18,209 @@ export interface BackendManifestStatus {
14
  pendingRefreshCount: number
15
  }
16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  export interface HierarchyTags {
18
  domains: string[]
19
  languages: string[]
@@ -32,16 +239,17 @@ export interface HierarchySlice {
32
  metrics: HierarchyMetric[]
33
  }
34
 
35
- export interface HierarchyBenchmark {
36
  key: string
37
  display_name: string
38
  has_card: boolean
39
  tags: HierarchyTags
40
  slices: HierarchySlice[]
41
  metrics: HierarchyMetric[]
 
42
  }
43
 
44
- export interface HierarchyComposite {
45
  key: string
46
  display_name: string
47
  has_card: boolean
@@ -51,17 +259,32 @@ export interface HierarchyComposite {
51
  summary_eval_ids?: string[]
52
  }
53
 
54
- export interface HierarchyFamily {
55
  key: string
56
  display_name: string
57
- has_card: boolean
58
  category: string
59
- tags: HierarchyTags
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  standalone_benchmarks?: HierarchyBenchmark[]
61
  composites?: HierarchyComposite[]
62
  benchmarks?: HierarchyBenchmark[]
63
  slices?: HierarchySlice[]
64
  metrics?: HierarchyMetric[]
 
 
65
  }
66
 
67
  export interface EvalHierarchyStats {
@@ -75,7 +298,7 @@ export interface EvalHierarchyStats {
75
  }
76
 
77
  export interface EvalHierarchy {
78
- stats: EvalHierarchyStats
79
  families: HierarchyFamily[]
80
  }
81
 
@@ -159,4 +382,4 @@ export interface ComparisonIndex {
159
  metric_group_order: MetricGroup[]
160
  evals: Record<string, ComparisonEvalEntry>
161
  by_model: Record<string, Record<string, Record<string, ComparisonByModelEntry>>>
162
- }
 
2
  generated_at: string
3
  config_version: number
4
  skipped_configs: string[]
5
+ summary_artifacts?: {
6
+ corpus_aggregates?: string
7
+ [key: string]: string | undefined
8
+ }
9
  }
10
 
11
  export interface BackendManifestStatus {
 
18
  pendingRefreshCount: number
19
  }
20
 
21
+ // ---------------------------------------------------------------------------
22
+ // EvalCards interpretive signals v1.0
23
+ // ---------------------------------------------------------------------------
24
+
25
/**
 * Row-level reproducibility signal; carried on a result row through
 * `RowAnnotations.reproducibility_gap`.
 *
 * NOTE(review): the per-field notes are inferred from the field names —
 * confirm against the backend pipeline before relying on them.
 */
export interface ReproducibilityGap {
  has_reproducibility_gap: boolean
  /** Presumably the required fields that were not populated — TODO confirm. */
  missing_fields: string[]
  required_field_count: number
  populated_field_count: number
  signal_version: string
}
32
+
33
/**
 * Closed set of values accepted for `Provenance.source_type`; the same union
 * keys the `source_type_distribution` records.
 */
export type ProvenanceSourceType =
  | "first_party"
  | "third_party"
  | "collaborative"
  | "unspecified"
38
+
39
/**
 * Row-level provenance signal; carried on a result row through
 * `RowAnnotations.provenance`.
 *
 * NOTE(review): how `is_multi_source` / `first_party_only` relate to
 * `distinct_reporting_organizations` is decided by the backend and is not
 * visible here — confirm before deriving one from the other.
 */
export interface Provenance {
  source_type: ProvenanceSourceType
  is_multi_source: boolean
  first_party_only: boolean
  distinct_reporting_organizations: number
  signal_version: string
}
46
+
47
/**
 * Closed set of values for the `threshold_basis` field of the divergence
 * signals.
 *
 * NOTE(review): what each basis means numerically is defined by the backend
 * pipeline, not by this file.
 */
export type DivergenceThresholdBasis =
  | "proportion_or_continuous_normalized"
  | "percent"
  | "range_5pct"
  | "fallback_default"
52
+
53
/**
 * One entry of a `differing_setup_fields` list on the divergence signals.
 * `values` is left as `unknown[]`, so consumers must narrow each value before
 * rendering or comparing it.
 */
export interface DifferingSetupField {
  field: string
  values: unknown[]
}
57
+
58
/**
 * Row-level variant-divergence signal; carried on a result row through
 * `RowAnnotations.variant_divergence`.
 *
 * NOTE(review): per-field notes are inferred from names — confirm against the
 * backend pipeline.
 */
export interface VariantDivergence {
  has_variant_divergence: boolean
  group_id: string
  divergence_magnitude: number
  threshold_used: number
  /** Names how `threshold_used` was chosen (see the union's members). */
  threshold_basis: DivergenceThresholdBasis
  differing_setup_fields: DifferingSetupField[]
  scores_in_group: number[]
  /** Nullable by type; presumably `null` when this row has no score — TODO confirm. */
  this_triple_score: number | null
  triple_count_in_group: number
  score_scale_anomaly: boolean
  group_variant_breakdown: Array<{ variant_key: string; row_count: number }>
  signal_version: string
}
72
+
73
/**
 * Row-level cross-party divergence signal; carried on a result row through
 * `RowAnnotations.cross_party_divergence`.
 *
 * NOTE(review): per-field notes are inferred from names — confirm against the
 * backend pipeline.
 */
export interface CrossPartyDivergence {
  has_cross_party_divergence: boolean
  group_id: string
  divergence_magnitude: number
  threshold_used: number
  /** Names how `threshold_used` was chosen (see the union's members). */
  threshold_basis: DivergenceThresholdBasis
  /** One score per key; keys are presumably organization names — TODO confirm. */
  scores_by_organization: Record<string, number>
  differing_setup_fields: DifferingSetupField[]
  organization_count: number
  group_variant_breakdown: Array<{ variant_key: string; row_count: number }>
  signal_version: string
}
85
+
86
/**
 * Bundle of the four row-level interpretive signals. Every slot is required
 * but nullable, so consumers must null-check each signal individually.
 */
export interface RowAnnotations {
  reproducibility_gap: ReproducibilityGap | null
  provenance: Provenance | null
  variant_divergence: VariantDivergence | null
  cross_party_divergence: CrossPartyDivergence | null
}
92
+
93
/**
 * Benchmark-level reporting-completeness signal; exposed through
 * `EvalcardsAnnotations.reporting_completeness`.
 *
 * NOTE(review): score ranges and the exact meaning of each `coverage_type`
 * are defined by the backend pipeline — confirm before thresholding on them.
 */
export interface ReportingCompleteness {
  completeness_score: number
  total_fields_evaluated: number
  missing_required_fields: string[]
  /** Fields that are only partly filled in, with sub-item counts. */
  partial_fields: Array<{
    field_path: string
    score: number
    populated_subitems: number
    total_subitems: number
  }>
  /** Per-field score breakdown. */
  field_scores: Array<{
    field_path: string
    coverage_type: "full" | "partial" | "reserved"
    score: number
  }>
  signal_version: string
}
110
+
111
/**
 * Benchmark-level comparability signal; exposed through
 * `EvalcardsAnnotations.benchmark_comparability`. It lists divergence groups
 * of both kinds, each tied to a `model_route_id`.
 *
 * NOTE(review): whether these lists contain only divergent groups or every
 * checked group is not visible here — confirm with the backend pipeline.
 */
export interface BenchmarkComparability {
  variant_divergence_groups: Array<{
    group_id: string
    model_route_id: string
    divergence_magnitude: number
    threshold_used: number
    threshold_basis: DivergenceThresholdBasis
    differing_setup_fields: DifferingSetupField[]
  }>
  cross_party_divergence_groups: Array<{
    group_id: string
    model_route_id: string
    divergence_magnitude: number
    threshold_used: number
    threshold_basis: DivergenceThresholdBasis
    scores_by_organization: Record<string, number>
    differing_setup_fields: DifferingSetupField[]
  }>
}
130
+
131
/**
 * Benchmark-level annotation bundle. Both signals are optional, so a payload
 * without them still satisfies the type.
 */
export interface EvalcardsAnnotations {
  reporting_completeness?: ReportingCompleteness
  benchmark_comparability?: BenchmarkComparability
}
135
+
136
/**
 * Roll-up of the reproducibility signal; exposed through
 * `SignalSummaries.reproducibility_summary`.
 */
export interface ReproducibilitySummary {
  results_total: number
  has_reproducibility_gap_count: number
  /** Nullable by type; presumably `null` when there is nothing to average — TODO confirm. */
  populated_ratio_avg: number | null
}
141
+
142
/**
 * Roll-up of the provenance signal; exposed through
 * `SignalSummaries.provenance_summary`.
 */
export interface ProvenanceSummary {
  total_results: number
  total_groups: number
  multi_source_groups: number
  first_party_only_groups: number
  /** One count per source type; the type requires every `ProvenanceSourceType` key to be present. */
  source_type_distribution: Record<ProvenanceSourceType, number>
}
149
+
150
/**
 * Roll-up of the comparability signal; exposed through
 * `SignalSummaries.comparability_summary`.
 */
export interface ComparabilitySummary {
  total_groups: number
  groups_with_variant_check: number
  groups_with_cross_party_check: number
  variant_divergent_count: number
  cross_party_divergent_count: number
}
157
+
158
/**
 * Mixin holding the three optional signal roll-ups. Hierarchy nodes and other
 * summary types extend it so the roll-ups can ride along on any of them.
 */
export interface SignalSummaries {
  reproducibility_summary?: ReproducibilitySummary
  provenance_summary?: ProvenanceSummary
  comparability_summary?: ComparabilitySummary
}
163
+
164
/**
 * Corpus-wide aggregate payload: one stratified block per interpretive
 * signal, plus generation metadata.
 */
export interface CorpusAggregates {
  generated_at: string
  signal_version: string
  /** Typed as a one-element tuple: `"category"` is the only stratification this type admits. */
  stratification_dimensions: ["category"]
  reproducibility: Stratified<ReproducibilityCorpusBlock>
  completeness: Stratified<CompletenessCorpusBlock>
  provenance: Stratified<ProvenanceCorpusBlock>
  comparability: Stratified<ComparabilityCorpusBlock>
}
173
+
174
/**
 * Wraps an aggregate block `T` with an `overall` value and a per-category
 * breakdown keyed by category name.
 */
export interface Stratified<T> {
  overall: T
  by_category: Record<string, T>
}
178
+
179
/**
 * Reproducibility aggregate for one stratum of the corpus (used as the `T` of
 * `Stratified` in `CorpusAggregates.reproducibility`).
 */
export interface ReproducibilityCorpusBlock {
  total_triples: number
  triples_with_reproducibility_gap: number
  /** Nullable by type; presumably `null` when the denominator is zero — TODO confirm. */
  reproducibility_gap_rate: number | null
  agentic_triples: number
  /** Keyed by field name; `denominator` says which population `denominator_count` refers to. */
  per_field_missingness: Record<string, {
    missing_count: number
    missing_rate: number | null
    denominator: "all_triples" | "agentic_only"
    denominator_count: number
  }>
}
191
+
192
/**
 * Reporting-completeness aggregate for one stratum of the corpus (used as the
 * `T` of `Stratified` in `CorpusAggregates.completeness`).
 */
export interface CompletenessCorpusBlock {
  total_benchmarks: number
  /** Nullable by type; presumably `null` when no benchmark contributes a score — TODO confirm. */
  completeness_score_mean: number | null
  completeness_score_median: number | null
  /** Keyed by field path. */
  per_field_population: Record<string, {
    mean_score: number
    populated_rate: number
    fully_populated_rate: number
    benchmark_count: number
  }>
}
203
+
204
/**
 * Provenance aggregate for one stratum of the corpus (used as the `T` of
 * `Stratified` in `CorpusAggregates.provenance`).
 */
export interface ProvenanceCorpusBlock {
  total_triples: number
  total_groups: number
  multi_source_groups: number
  /** Nullable by type; presumably `null` when `total_groups` is zero — TODO confirm. */
  multi_source_rate: number | null
  first_party_only_groups: number
  first_party_only_rate: number | null
  /** One count per source type; every `ProvenanceSourceType` key is required. */
  source_type_distribution: Record<ProvenanceSourceType, number>
}
213
+
214
/**
 * Comparability aggregate for one stratum of the corpus (used as the `T` of
 * `Stratified` in `CorpusAggregates.comparability`).
 *
 * NOTE(review): the rates are nullable; presumably `null` when the matching
 * `*_eligible_groups` count is zero — confirm with the backend pipeline.
 */
export interface ComparabilityCorpusBlock {
  total_groups: number
  variant_eligible_groups: number
  variant_divergent_groups: number
  variant_divergence_rate: number | null
  cross_party_eligible_groups: number
  cross_party_divergent_groups: number
  cross_party_divergence_rate: number | null
}
223
+
224
  export interface HierarchyTags {
225
  domains: string[]
226
  languages: string[]
 
239
  metrics: HierarchyMetric[]
240
  }
241
 
242
/**
 * Benchmark node of the legacy (nested) eval hierarchy. Extends
 * `SignalSummaries`, so the optional signal roll-ups may be attached to it.
 */
export interface HierarchyBenchmark extends SignalSummaries {
  key: string
  display_name: string
  has_card: boolean
  tags: HierarchyTags
  slices: HierarchySlice[]
  metrics: HierarchyMetric[]
  /** Optional list of eval summary ids associated with this benchmark. */
  summary_eval_ids?: string[]
}
251
 
252
+ export interface HierarchyComposite extends SignalSummaries {
253
  key: string
254
  display_name: string
255
  has_card: boolean
 
259
  summary_eval_ids?: string[]
260
  }
261
 
262
/**
 * Leaf node of the newer 2-level eval hierarchy (family → leaf). Only `key`,
 * `display_name` and `category` are required; everything else may be absent
 * from the payload.
 */
export interface HierarchyLeaf extends SignalSummaries {
  key: string
  display_name: string
  category: string
  evals_count?: number
  eval_summary_ids?: string[]
  /** Any subset of the tag lists may be missing. */
  tags?: Partial<HierarchyTags>
  has_card?: boolean
}
271
+
272
/**
 * Top-level family node of the eval hierarchy. A family can arrive in either
 * of two shapes, so both sets of children are optional: the legacy nested
 * tree and the newer flat `leaves` list.
 */
export interface HierarchyFamily extends SignalSummaries {
  key: string
  display_name: string
  has_card?: boolean
  category: string
  tags?: Partial<HierarchyTags>
  evals_count?: number
  eval_summary_ids?: string[]
  // Legacy nested shape (composites + standalone benchmarks)
  standalone_benchmarks?: HierarchyBenchmark[]
  composites?: HierarchyComposite[]
  benchmarks?: HierarchyBenchmark[]
  slices?: HierarchySlice[]
  metrics?: HierarchyMetric[]
  // Newer 2-level shape (family → leaf)
  leaves?: HierarchyLeaf[]
}
289
 
290
  export interface EvalHierarchyStats {
 
298
  }
299
 
300
  export interface EvalHierarchy {
301
+ stats?: EvalHierarchyStats
302
  families: HierarchyFamily[]
303
  }
304
 
 
382
  metric_group_order: MetricGroup[]
383
  evals: Record<string, ComparisonEvalEntry>
384
  by_model: Record<string, Record<string, Record<string, ComparisonByModelEntry>>>
385
+ }
lib/benchmark-schema.ts CHANGED
@@ -3,6 +3,8 @@
3
  * Based on the evalevalai.com schema structure
4
  */
5
 
 
 
6
  export interface BenchmarkEvaluation {
7
  schema_version: string
8
  eval_summary_id?: string
@@ -31,6 +33,7 @@ export interface BenchmarkEvaluation {
31
  generation_config?: GenerationConfig
32
  evaluation_results: EvaluationResult[]
33
  detailed_evaluation_results_per_samples?: SampleResult[]
 
34
  }
35
 
36
  export interface EvalLibrary {
@@ -96,6 +99,7 @@ export interface EvaluationResult {
96
  score_details: ScoreDetails
97
  detailed_evaluation_results_url?: string
98
  generation_config?: GenerationConfig
 
99
  }
100
 
101
  export interface MetricConfig {
@@ -208,7 +212,7 @@ export function inferCategoryFromBenchmark(benchmarkName: string): CategoryType
208
  /**
209
  * Aggregate evaluations by model
210
  */
211
- export interface ModelSummaryCore {
212
  model_info: ModelInfo
213
  evaluations_by_category: Record<CategoryType, BenchmarkEvaluation[]>
214
  total_evaluations: number
@@ -275,6 +279,9 @@ export interface EvaluationCardData {
275
  max: number
276
  average: number | null
277
  }
 
 
 
278
 
279
  // Quick stats
280
  top_scores: Array<{
 
3
  * Based on the evalevalai.com schema structure
4
  */
5
 
6
+ import type { EvalcardsAnnotations, RowAnnotations, SignalSummaries } from "@/lib/backend-artifacts"
7
+
8
  export interface BenchmarkEvaluation {
9
  schema_version: string
10
  eval_summary_id?: string
 
33
  generation_config?: GenerationConfig
34
  evaluation_results: EvaluationResult[]
35
  detailed_evaluation_results_per_samples?: SampleResult[]
36
+ evalcards?: { annotations?: EvalcardsAnnotations }
37
  }
38
 
39
  export interface EvalLibrary {
 
99
  score_details: ScoreDetails
100
  detailed_evaluation_results_url?: string
101
  generation_config?: GenerationConfig
102
+ evalcards?: { annotations?: RowAnnotations }
103
  }
104
 
105
  export interface MetricConfig {
 
212
  /**
213
  * Aggregate evaluations by model
214
  */
215
+ export interface ModelSummaryCore extends SignalSummaries {
216
  model_info: ModelInfo
217
  evaluations_by_category: Record<CategoryType, BenchmarkEvaluation[]>
218
  total_evaluations: number
 
279
  max: number
280
  average: number | null
281
  }
282
+ reproducibility_summary?: SignalSummaries["reproducibility_summary"]
283
+ provenance_summary?: SignalSummaries["provenance_summary"]
284
+ comparability_summary?: SignalSummaries["comparability_summary"]
285
 
286
  // Quick stats
287
  top_scores: Array<{
lib/dashboard-data-client.ts CHANGED
@@ -1,4 +1,4 @@
1
- import type { BackendManifestStatus, ComparisonIndex, EvalHierarchy } from "@/lib/backend-artifacts"
2
  import type { BenchmarkEvaluationCardData } from "@/components/benchmark-evaluation-card"
3
  import type { HFEvalDetail } from "@/lib/hf-data"
4
  import type {
@@ -108,3 +108,7 @@ export function fetchEvalHierarchy() {
108
  export function fetchComparisonIndex() {
109
  return fetchJson<ComparisonIndex>("/api/comparison-index")
110
  }
 
 
 
 
 
1
+ import type { BackendManifestStatus, ComparisonIndex, CorpusAggregates, EvalHierarchy } from "@/lib/backend-artifacts"
2
  import type { BenchmarkEvaluationCardData } from "@/components/benchmark-evaluation-card"
3
  import type { HFEvalDetail } from "@/lib/hf-data"
4
  import type {
 
108
  export function fetchComparisonIndex() {
109
  return fetchJson<ComparisonIndex>("/api/comparison-index")
110
  }
111
+
112
/**
 * Client-side loader for the corpus aggregates: requests
 * `/api/corpus-aggregates` through the shared `fetchJson` helper.
 *
 * NOTE(review): how a non-2xx response (the route can answer 404) surfaces
 * depends on `fetchJson` — confirm that callers handle it.
 */
export function fetchCorpusAggregates() {
  return fetchJson<CorpusAggregates>("/api/corpus-aggregates")
}
lib/eval-processing.ts CHANGED
@@ -15,6 +15,7 @@ import type {
15
  MetricConfig,
16
  EvaluationResult,
17
  } from './benchmark-schema'
 
18
  import type { ModelEvaluationSummary } from './benchmark-schema'
19
  import type { ModelSummaryCore } from './benchmark-schema'
20
  import { inferCategoryFromBenchmark } from './benchmark-schema'
@@ -130,7 +131,7 @@ export interface ModelResultForBenchmark {
130
  }>
131
  }
132
 
133
- export interface BenchmarkEvalSummary {
134
  evaluation_name: string
135
  /** URL-safe slug derived from evaluation_name */
136
  evaluation_id: string
@@ -192,6 +193,7 @@ export interface BenchmarkEvalSummary {
192
  leaderboard_metrics?: BenchmarkLeaderboardMetric[]
193
  /** Matrix rows for multi-metric benchmark leaderboards */
194
  leaderboard_rows?: BenchmarkLeaderboardRow[]
 
195
  }
196
 
197
  export interface BenchmarkSummaryMetric {
@@ -234,6 +236,7 @@ export interface BenchmarkLeaderboardRow {
234
  source_metadata: SourceMetadata
235
  source_data: BenchmarkEvaluation["source_data"]
236
  values: Record<string, number | null>
 
237
  metrics_present: number
238
  }
239
 
@@ -727,6 +730,9 @@ export function createEvaluationCard(
727
  eval_libraries: Array.from(evalLibraries.values()).sort((a, b) => a.name.localeCompare(b.name)),
728
  latest_source_name: latestSourceName,
729
  params_billions: Number.isFinite(paramsBillions ?? NaN) ? paramsBillions : null,
 
 
 
730
  top_scores: topScores,
731
  source_urls: Array.from(sourceUrls),
732
  detail_urls: Array.from(detailUrls),
 
15
  MetricConfig,
16
  EvaluationResult,
17
  } from './benchmark-schema'
18
+ import type { EvalcardsAnnotations, RowAnnotations, SignalSummaries } from './backend-artifacts'
19
  import type { ModelEvaluationSummary } from './benchmark-schema'
20
  import type { ModelSummaryCore } from './benchmark-schema'
21
  import { inferCategoryFromBenchmark } from './benchmark-schema'
 
131
  }>
132
  }
133
 
134
+ export interface BenchmarkEvalSummary extends SignalSummaries {
135
  evaluation_name: string
136
  /** URL-safe slug derived from evaluation_name */
137
  evaluation_id: string
 
193
  leaderboard_metrics?: BenchmarkLeaderboardMetric[]
194
  /** Matrix rows for multi-metric benchmark leaderboards */
195
  leaderboard_rows?: BenchmarkLeaderboardRow[]
196
+ evalcards?: { annotations?: EvalcardsAnnotations }
197
  }
198
 
199
  export interface BenchmarkSummaryMetric {
 
236
  source_metadata: SourceMetadata
237
  source_data: BenchmarkEvaluation["source_data"]
238
  values: Record<string, number | null>
239
+ annotations_by_metric?: Record<string, RowAnnotations | null | undefined>
240
  metrics_present: number
241
  }
242
 
 
730
  eval_libraries: Array.from(evalLibraries.values()).sort((a, b) => a.name.localeCompare(b.name)),
731
  latest_source_name: latestSourceName,
732
  params_billions: Number.isFinite(paramsBillions ?? NaN) ? paramsBillions : null,
733
+ reproducibility_summary: summary.reproducibility_summary,
734
+ provenance_summary: summary.provenance_summary,
735
+ comparability_summary: summary.comparability_summary,
736
  top_scores: topScores,
737
  source_urls: Array.from(sourceUrls),
738
  detail_urls: Array.from(detailUrls),
lib/hf-data.ts CHANGED
@@ -3,7 +3,16 @@ import "server-only"
3
  import { promises as fs } from "fs"
4
  import path from "path"
5
 
6
- import type { BackendManifest, BackendManifestStatus, ComparisonIndex, EvalHierarchy } from "@/lib/backend-artifacts"
 
 
 
 
 
 
 
 
 
7
  import type {
8
  BenchmarkCard,
9
  BenchmarkEvaluation,
@@ -436,7 +445,7 @@ async function fetchHFJsonSafe<T>(relativePath: string): Promise<T | null> {
436
  // HF dataset types (shapes of JSON files in the HF repo)
437
  // ---------------------------------------------------------------------------
438
 
439
- export interface HFModelCardEntry {
440
  model_family_id: string
441
  model_route_id: string
442
  model_family_name: string
@@ -472,7 +481,7 @@ export interface HFModelCardEntry {
472
  }>
473
  }
474
 
475
- export interface HFEvalListEntry {
476
  eval_summary_id: string
477
  benchmark: string
478
  canonical_display_name?: string
@@ -517,6 +526,7 @@ export interface HFEvalListEntry {
517
  models_count: number
518
  top_score: number
519
  }>
 
520
  }
521
 
522
  export interface HFEvalModelResult {
@@ -538,6 +548,7 @@ export interface HFEvalModelResult {
538
  detailed_evaluation_results_meta?: unknown
539
  instance_level_data?: unknown
540
  passthrough_top_level_fields?: unknown
 
541
  }
542
 
543
  export interface HFEvalMetric {
@@ -553,7 +564,7 @@ export interface HFEvalMetric {
553
  model_results: HFEvalModelResult[]
554
  }
555
 
556
- export interface HFEvalDetail {
557
  eval_summary_id: string
558
  benchmark: string
559
  canonical_display_name?: string
@@ -566,9 +577,10 @@ export interface HFEvalDetail {
566
  benchmark_card: BenchmarkCard | null
567
  metrics: HFEvalMetric[]
568
  subtasks: unknown[]
 
569
  }
570
 
571
- export interface HFModelDetail {
572
  model_info: ModelInfo & {
573
  family_id?: string
574
  family_slug?: string
@@ -846,13 +858,112 @@ export async function fetchBackendManifest(): Promise<BackendManifest> {
846
  }
847
 
848
  export async function fetchEvalHierarchy(): Promise<EvalHierarchy> {
849
- return fetchHFJson<EvalHierarchy>("eval-hierarchy.json")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
850
  }
851
 
852
  export async function fetchComparisonIndex(): Promise<ComparisonIndex> {
853
  return fetchHFJson<ComparisonIndex>("comparison-index.json")
854
  }
855
 
 
 
 
 
856
  export async function fetchModelDetail(slug: string): Promise<HFModelDetail | null> {
857
  return fetchHFJsonSafe<HFModelDetail>(`models/${slug}.json`)
858
  }
@@ -1297,6 +1408,7 @@ function flattenHierarchyNode(
1297
  detailed_evaluation_results_url: getCanonicalInstanceResultsUrl(
1298
  result.detailed_evaluation_results
1299
  ),
 
1300
  }
1301
 
1302
  const existing = resultsByVariant.get(variantKey)
 
3
  import { promises as fs } from "fs"
4
  import path from "path"
5
 
6
+ import type {
7
+ BackendManifest,
8
+ BackendManifestStatus,
9
+ ComparisonIndex,
10
+ CorpusAggregates,
11
+ EvalHierarchy,
12
+ EvalcardsAnnotations,
13
+ RowAnnotations,
14
+ SignalSummaries,
15
+ } from "@/lib/backend-artifacts"
16
  import type {
17
  BenchmarkCard,
18
  BenchmarkEvaluation,
 
445
  // HF dataset types (shapes of JSON files in the HF repo)
446
  // ---------------------------------------------------------------------------
447
 
448
+ export interface HFModelCardEntry extends SignalSummaries {
449
  model_family_id: string
450
  model_route_id: string
451
  model_family_name: string
 
481
  }>
482
  }
483
 
484
+ export interface HFEvalListEntry extends SignalSummaries {
485
  eval_summary_id: string
486
  benchmark: string
487
  canonical_display_name?: string
 
526
  models_count: number
527
  top_score: number
528
  }>
529
+ evalcards?: { annotations?: EvalcardsAnnotations }
530
  }
531
 
532
  export interface HFEvalModelResult {
 
548
  detailed_evaluation_results_meta?: unknown
549
  instance_level_data?: unknown
550
  passthrough_top_level_fields?: unknown
551
+ evalcards?: { annotations?: RowAnnotations }
552
  }
553
 
554
  export interface HFEvalMetric {
 
564
  model_results: HFEvalModelResult[]
565
  }
566
 
567
+ export interface HFEvalDetail extends SignalSummaries {
568
  eval_summary_id: string
569
  benchmark: string
570
  canonical_display_name?: string
 
577
  benchmark_card: BenchmarkCard | null
578
  metrics: HFEvalMetric[]
579
  subtasks: unknown[]
580
+ evalcards?: { annotations?: EvalcardsAnnotations }
581
  }
582
 
583
+ export interface HFModelDetail extends SignalSummaries {
584
  model_info: ModelInfo & {
585
  family_id?: string
586
  family_slug?: string
 
858
  }
859
 
860
  export async function fetchEvalHierarchy(): Promise<EvalHierarchy> {
861
+ const raw = await fetchHFJson<EvalHierarchy>("eval-hierarchy.json")
862
+ return adaptEvalHierarchy(raw)
863
+ }
864
+
865
/**
 * Bridges the two shapes `eval-hierarchy.json` can arrive in.
 *
 * The upstream pipeline migrated to a flat 2-level shape (family → leaf),
 * while the evals page still walks the older composites/standalone_benchmarks
 * tree. Families that only carry `leaves` get a synthesized
 * `standalone_benchmarks` list; families that already have a legacy tree, or
 * that have no leaves, pass through untouched. When upstream omits `stats`, a
 * fallback block is computed from the adapted families.
 *
 * @param raw Hierarchy exactly as fetched.
 * @returns A shallow copy of `raw` with adapted `families` and a `stats` block.
 */
function adaptEvalHierarchy(raw: EvalHierarchy): EvalHierarchy {
  // `families` is required by the type, but the payload is external JSON.
  const families = (raw.families ?? []).map((family) => withLegacyFamilyView(family))

  // Upstream stats are kept as-is whenever they are present.
  if (raw.stats) {
    return { ...raw, families }
  }

  return { ...raw, families, stats: computeFallbackHierarchyStats(families) }
}

/** Returns a tag object in which every list is present (missing ones become empty). */
function fillHierarchyTags(tags?: { domains?: string[]; languages?: string[]; tasks?: string[] }) {
  return {
    domains: tags?.domains ?? [],
    languages: tags?.languages ?? [],
    tasks: tags?.tasks ?? [],
  }
}

/** Synthesizes `standalone_benchmarks` from `leaves` for a family that has no legacy tree. */
function withLegacyFamilyView(
  family: EvalHierarchy["families"][number]
): EvalHierarchy["families"][number] {
  const hasLegacyTree =
    (family.composites?.length ?? 0) > 0 ||
    (family.standalone_benchmarks?.length ?? 0) > 0 ||
    (family.benchmarks?.length ?? 0) > 0
  const leaves = family.leaves ?? []

  // Nothing to synthesize: either the legacy view already exists or there are no leaves.
  if (hasLegacyTree || leaves.length === 0) {
    return family
  }

  return {
    ...family,
    tags: fillHierarchyTags(family.tags),
    standalone_benchmarks: leaves.map((leaf) => ({
      key: leaf.key,
      display_name: leaf.display_name,
      has_card: leaf.has_card ?? false,
      tags: fillHierarchyTags(leaf.tags),
      // The leaf node carries no slice or metric detail, so both start empty.
      slices: [],
      metrics: [],
      reproducibility_summary: leaf.reproducibility_summary,
      provenance_summary: leaf.provenance_summary,
      comparability_summary: leaf.comparability_summary,
      // Field name differs between the two shapes.
      summary_eval_ids: leaf.eval_summary_ids,
    })),
  }
}

/**
 * Derives a `stats` block by walking composites and standalone benchmarks.
 *
 * NOTE(review): family-level `benchmarks`, `slices` and `metrics` are not
 * counted — confirm this matches how the pipeline computes its own stats.
 */
function computeFallbackHierarchyStats(
  families: EvalHierarchy["families"]
): NonNullable<EvalHierarchy["stats"]> {
  let compositeCount = 0
  let standaloneCount = 0
  let singleBenchmarkCount = 0
  let sliceCount = 0
  let metricCount = 0

  for (const family of families) {
    const composites = family.composites ?? []
    const standalone = family.standalone_benchmarks ?? []

    compositeCount += composites.length
    standaloneCount += standalone.length
    // A family made of exactly one standalone benchmark and no composites.
    if (composites.length === 0 && standalone.length === 1) {
      singleBenchmarkCount += 1
    }

    const benchmarks = [...composites.flatMap((composite) => composite.benchmarks ?? []), ...standalone]
    for (const benchmark of benchmarks) {
      sliceCount += benchmark.slices?.length ?? 0
      metricCount += benchmark.metrics?.length ?? 0
    }
  }

  return {
    family_count: families.length,
    composite_count: compositeCount,
    standalone_benchmark_count: standaloneCount,
    single_benchmark_count: singleBenchmarkCount,
    slice_count: sliceCount,
    metric_count: metricCount,
    // No row-level information is read here, so this is reported as 0.
    metric_rows_scanned: 0,
  }
}
958
 
959
  export async function fetchComparisonIndex(): Promise<ComparisonIndex> {
960
  return fetchHFJson<ComparisonIndex>("comparison-index.json")
961
  }
962
 
963
/**
 * Loads `corpus-aggregates.json` through `fetchHFJsonSafe`.
 *
 * @returns The parsed aggregates, or `null` when `fetchHFJsonSafe` yields
 *   `null`. Callers must handle the `null` case.
 */
export async function fetchCorpusAggregates(): Promise<CorpusAggregates | null> {
  return fetchHFJsonSafe<CorpusAggregates>("corpus-aggregates.json")
}
966
+
967
  export async function fetchModelDetail(slug: string): Promise<HFModelDetail | null> {
968
  return fetchHFJsonSafe<HFModelDetail>(`models/${slug}.json`)
969
  }
 
1408
  detailed_evaluation_results_url: getCanonicalInstanceResultsUrl(
1409
  result.detailed_evaluation_results
1410
  ),
1411
+ evalcards: result.evalcards,
1412
  }
1413
 
1414
  const existing = resultsByVariant.get(variantKey)
lib/model-data.ts CHANGED
@@ -27,6 +27,7 @@ import { getCanonicalModelIdentity, getModelFamilyRouteId } from "@/lib/model-fa
27
  import { getBenchmarkCard, normalizeBenchmarkKey } from "@/lib/benchmark-metadata"
28
  import {
29
  type HFEvalDetail,
 
30
  type HFEvalModelResult,
31
  type HFModelCardEntry,
32
  type HFModelDetail,
@@ -337,6 +338,24 @@ function parseParamsBillions(value: unknown): number | null {
337
  return Number.isFinite(numeric) && numeric > 0 ? numeric : null
338
  }
339
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
340
  // ---------------------------------------------------------------------------
341
  // HF model-cards.json → EvaluationCardData
342
  // ---------------------------------------------------------------------------
@@ -391,6 +410,9 @@ function hfModelCardToEvaluationCardData(entry: HFModelCardEntry): EvaluationCar
391
  ? `${entry.benchmark_names.length} benchmark${entry.benchmark_names.length === 1 ? "" : "s"}`
392
  : undefined,
393
  params_billions: parseParamsBillions(entry.params_billions),
 
 
 
394
  benchmark_names: (entry.benchmark_names ?? []).map((name) => getBenchmarkDisplayName(name)),
395
  score_summary: {
396
  count: entry.score_summary.count,
@@ -408,31 +430,7 @@ function hfModelCardToEvaluationCardData(entry: HFModelCardEntry): EvaluationCar
408
  // HF eval-list.json → BenchmarkEvalListItem
409
  // ---------------------------------------------------------------------------
410
 
411
- function hfEvalEntryToListItem(entry: {
412
- eval_summary_id: string
413
- benchmark: string
414
- benchmark_family_key: string
415
- benchmark_family_name: string
416
- benchmark_parent_name?: string
417
- benchmark_leaf_key: string
418
- benchmark_leaf_name: string
419
- evaluation_name?: string
420
- display_name: string
421
- is_summary_score?: boolean
422
- summary_eval_ids?: string[]
423
- category: string
424
- tags: { domains: string[]; languages: string[]; tasks: string[] }
425
- models_count: number
426
- metrics_count: number
427
- subtasks_count?: number
428
- metric_names: string[]
429
- primary_metric_name: string
430
- benchmark_card: BenchmarkCard | null
431
- source_data?: SourceData
432
- top_score: number
433
- instance_data: { available: boolean; url_count: number; sample_urls: string[]; models_with_loaded_instances: number }
434
- metrics: Array<{ metric_summary_id: string; metric_name: string; lower_is_better: boolean; models_count: number; top_score: number }>
435
- }): BenchmarkEvalListItem {
436
  // Use the pipeline's category directly, mapped to our CategoryType
437
  const category = mapHFCategories([entry.category])[0] ?? "General" as CategoryType
438
 
@@ -486,6 +484,10 @@ function hfEvalEntryToListItem(entry: {
486
  subtasks_count: entry.subtasks_count ?? 0,
487
  is_summary_score: entry.is_summary_score ?? false,
488
  summary_eval_ids: entry.summary_eval_ids ?? [],
 
 
 
 
489
  }
490
  }
491
 
@@ -652,6 +654,7 @@ function buildBenchmarkLeaderboardMatrix(detail: HFEvalDetail) {
652
  source_metadata: sourceMetadata,
653
  source_data: sourceData,
654
  values: { [columnKey]: modelResult.score ?? null },
 
655
  metrics_present: 0,
656
  _timestampValue: nextTimestamp,
657
  })
@@ -659,6 +662,10 @@ function buildBenchmarkLeaderboardMatrix(detail: HFEvalDetail) {
659
  }
660
 
661
  existing.values[columnKey] = modelResult.score ?? null
 
 
 
 
662
  if (!existing.model_route_id && modelResult.model_route_id) {
663
  existing.model_route_id = modelResult.model_route_id
664
  }
@@ -725,6 +732,7 @@ function toModelResultsForMetric(
725
  detailed_evaluation_results_url: getCanonicalInstanceResultsUrl(
726
  mr.detailed_evaluation_results
727
  ),
 
728
  }
729
 
730
  return {
@@ -797,6 +805,11 @@ function hfEvalDetailToSummary(detail: HFEvalDetail): BenchmarkEvalSummary {
797
  subtasks,
798
  leaderboard_metrics: leaderboardMatrix.leaderboard_metrics,
799
  leaderboard_rows: leaderboardMatrix.leaderboard_rows,
 
 
 
 
 
800
  }
801
  }
802
 
@@ -847,6 +860,11 @@ function hfEvalDetailToSummary(detail: HFEvalDetail): BenchmarkEvalSummary {
847
  subtasks,
848
  leaderboard_metrics: leaderboardMatrix.leaderboard_metrics,
849
  leaderboard_rows: leaderboardMatrix.leaderboard_rows,
 
 
 
 
 
850
  }
851
  }
852
 
@@ -1140,6 +1158,7 @@ function buildSingleMetricSuiteMatrixSummary(
1140
  source_metadata: sourceMetadata,
1141
  source_data: sourceData,
1142
  values: { [columnKey]: modelResult.score ?? null },
 
1143
  metrics_present: 0,
1144
  _timestampValue: nextTimestamp,
1145
  })
@@ -1147,6 +1166,10 @@ function buildSingleMetricSuiteMatrixSummary(
1147
  }
1148
 
1149
  existing.values[columnKey] = modelResult.score ?? null
 
 
 
 
1150
  if (!existing.model_route_id && modelResult.model_route_id) {
1151
  existing.model_route_id = modelResult.model_route_id
1152
  }
@@ -1469,7 +1492,7 @@ export async function getModelSummaryById(modelId: string) {
1469
  if (detail) {
1470
  const evaluations = flattenModelEvaluations(detail)
1471
  if (evaluations.length > 0) {
1472
- return createModelFamilySummary(evaluations)
1473
  }
1474
  }
1475
  }
@@ -1489,7 +1512,7 @@ export async function getModelSummaryById(modelId: string) {
1489
  if (detail) {
1490
  const evaluations = flattenModelEvaluations(detail)
1491
  if (evaluations.length > 0) {
1492
- return createModelFamilySummary(evaluations)
1493
  }
1494
  }
1495
 
@@ -1501,7 +1524,7 @@ export async function getModelSummaryById(modelId: string) {
1501
  if (variantDetail) {
1502
  const evaluations = flattenModelEvaluations(variantDetail)
1503
  if (evaluations.length > 0) {
1504
- return createModelFamilySummary(evaluations)
1505
  }
1506
  }
1507
  }
 
27
  import { getBenchmarkCard, normalizeBenchmarkKey } from "@/lib/benchmark-metadata"
28
  import {
29
  type HFEvalDetail,
30
+ type HFEvalListEntry,
31
  type HFEvalModelResult,
32
  type HFModelCardEntry,
33
  type HFModelDetail,
 
338
  return Number.isFinite(numeric) && numeric > 0 ? numeric : null
339
  }
340
 
341
+ function attachModelSignalSummaries<T extends ReturnType<typeof createModelFamilySummary>>(
342
+ summary: T,
343
+ detail: HFModelDetail
344
+ ): T {
345
+ return {
346
+ ...summary,
347
+ reproducibility_summary: detail.reproducibility_summary,
348
+ provenance_summary: detail.provenance_summary,
349
+ comparability_summary: detail.comparability_summary,
350
+ variants: summary.variants.map((variant) => ({
351
+ ...variant,
352
+ reproducibility_summary: detail.reproducibility_summary,
353
+ provenance_summary: detail.provenance_summary,
354
+ comparability_summary: detail.comparability_summary,
355
+ })),
356
+ }
357
+ }
358
+
359
  // ---------------------------------------------------------------------------
360
  // HF model-cards.json → EvaluationCardData
361
  // ---------------------------------------------------------------------------
 
410
  ? `${entry.benchmark_names.length} benchmark${entry.benchmark_names.length === 1 ? "" : "s"}`
411
  : undefined,
412
  params_billions: parseParamsBillions(entry.params_billions),
413
+ reproducibility_summary: entry.reproducibility_summary,
414
+ provenance_summary: entry.provenance_summary,
415
+ comparability_summary: entry.comparability_summary,
416
  benchmark_names: (entry.benchmark_names ?? []).map((name) => getBenchmarkDisplayName(name)),
417
  score_summary: {
418
  count: entry.score_summary.count,
 
430
  // HF eval-list.json → BenchmarkEvalListItem
431
  // ---------------------------------------------------------------------------
432
 
433
+ function hfEvalEntryToListItem(entry: HFEvalListEntry): BenchmarkEvalListItem {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
434
  // Use the pipeline's category directly, mapped to our CategoryType
435
  const category = mapHFCategories([entry.category])[0] ?? "General" as CategoryType
436
 
 
484
  subtasks_count: entry.subtasks_count ?? 0,
485
  is_summary_score: entry.is_summary_score ?? false,
486
  summary_eval_ids: entry.summary_eval_ids ?? [],
487
+ evalcards: entry.evalcards,
488
+ reproducibility_summary: entry.reproducibility_summary,
489
+ provenance_summary: entry.provenance_summary,
490
+ comparability_summary: entry.comparability_summary,
491
  }
492
  }
493
 
 
654
  source_metadata: sourceMetadata,
655
  source_data: sourceData,
656
  values: { [columnKey]: modelResult.score ?? null },
657
+ annotations_by_metric: { [columnKey]: modelResult.evalcards?.annotations ?? null },
658
  metrics_present: 0,
659
  _timestampValue: nextTimestamp,
660
  })
 
662
  }
663
 
664
  existing.values[columnKey] = modelResult.score ?? null
665
+ existing.annotations_by_metric = {
666
+ ...(existing.annotations_by_metric ?? {}),
667
+ [columnKey]: modelResult.evalcards?.annotations ?? null,
668
+ }
669
  if (!existing.model_route_id && modelResult.model_route_id) {
670
  existing.model_route_id = modelResult.model_route_id
671
  }
 
732
  detailed_evaluation_results_url: getCanonicalInstanceResultsUrl(
733
  mr.detailed_evaluation_results
734
  ),
735
+ evalcards: mr.evalcards,
736
  }
737
 
738
  return {
 
805
  subtasks,
806
  leaderboard_metrics: leaderboardMatrix.leaderboard_metrics,
807
  leaderboard_rows: leaderboardMatrix.leaderboard_rows,
808
+ source_data: detail.source_data,
809
+ evalcards: detail.evalcards,
810
+ reproducibility_summary: detail.reproducibility_summary,
811
+ provenance_summary: detail.provenance_summary,
812
+ comparability_summary: detail.comparability_summary,
813
  }
814
  }
815
 
 
860
  subtasks,
861
  leaderboard_metrics: leaderboardMatrix.leaderboard_metrics,
862
  leaderboard_rows: leaderboardMatrix.leaderboard_rows,
863
+ source_data: detail.source_data,
864
+ evalcards: detail.evalcards,
865
+ reproducibility_summary: detail.reproducibility_summary,
866
+ provenance_summary: detail.provenance_summary,
867
+ comparability_summary: detail.comparability_summary,
868
  }
869
  }
870
 
 
1158
  source_metadata: sourceMetadata,
1159
  source_data: sourceData,
1160
  values: { [columnKey]: modelResult.score ?? null },
1161
+ annotations_by_metric: { [columnKey]: modelResult.evalcards?.annotations ?? null },
1162
  metrics_present: 0,
1163
  _timestampValue: nextTimestamp,
1164
  })
 
1166
  }
1167
 
1168
  existing.values[columnKey] = modelResult.score ?? null
1169
+ existing.annotations_by_metric = {
1170
+ ...(existing.annotations_by_metric ?? {}),
1171
+ [columnKey]: modelResult.evalcards?.annotations ?? null,
1172
+ }
1173
  if (!existing.model_route_id && modelResult.model_route_id) {
1174
  existing.model_route_id = modelResult.model_route_id
1175
  }
 
1492
  if (detail) {
1493
  const evaluations = flattenModelEvaluations(detail)
1494
  if (evaluations.length > 0) {
1495
+ return attachModelSignalSummaries(createModelFamilySummary(evaluations), detail)
1496
  }
1497
  }
1498
  }
 
1512
  if (detail) {
1513
  const evaluations = flattenModelEvaluations(detail)
1514
  if (evaluations.length > 0) {
1515
+ return attachModelSignalSummaries(createModelFamilySummary(evaluations), detail)
1516
  }
1517
  }
1518
 
 
1524
  if (variantDetail) {
1525
  const evaluations = flattenModelEvaluations(variantDetail)
1526
  if (evaluations.length > 0) {
1527
+ return attachModelSignalSummaries(createModelFamilySummary(evaluations), variantDetail)
1528
  }
1529
  }
1530
  }
public/peer-ranks.json CHANGED
The diff for this file is too large to render. See raw diff
 
scripts/cache-hf-data.mjs CHANGED
@@ -32,11 +32,13 @@ const CACHE_ROOT_FILES = [
32
  "benchmark-metadata.json",
33
  "eval-hierarchy.json",
34
  "comparison-index.json",
 
35
  ]
36
 
37
  const OPTIONAL_CACHE_ROOT_FILES = new Set([
38
  "model-cards-lite.json",
39
  "eval-list-lite.json",
 
40
  ])
41
 
42
  const CACHE_DIRECTORIES = ["developers", "evals", "models"]
 
32
  "benchmark-metadata.json",
33
  "eval-hierarchy.json",
34
  "comparison-index.json",
35
+ "corpus-aggregates.json",
36
  ]
37
 
38
  const OPTIONAL_CACHE_ROOT_FILES = new Set([
39
  "model-cards-lite.json",
40
  "eval-list-lite.json",
41
+ "corpus-aggregates.json",
42
  ])
43
 
44
  const CACHE_DIRECTORIES = ["developers", "evals", "models"]