Spaces:
Running
Running
Add interpretive signals, corpus dashboard, and slice browser
Browse files
Surfaces reproducibility, reporting completeness, provenance, and
comparability signals from the backend across eval detail, model compare,
eval list cards, and a new /corpus dashboard. Adapts to the upstream
2-level hierarchy (family → leaf), caps the leaderboard at 24 default
columns and replaces the slice tabs with a search dialog when a
benchmark has more than 5 slices.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
- app/api/corpus-aggregates/route.ts +16 -0
- app/corpus/page.tsx +36 -0
- app/evals/page.tsx +308 -19
- components/benchmark-detail.tsx +45 -53
- components/benchmark-evaluation-card.tsx +19 -0
- components/eval-card.tsx +11 -6
- components/eval-detail.tsx +274 -29
- components/model-compare-dialog.tsx +15 -0
- components/navigation.tsx +6 -0
- components/signals/comparability-panel.tsx +193 -0
- components/signals/completeness-panel.tsx +147 -0
- components/signals/corpus-dashboard.tsx +442 -0
- components/signals/cross-party-divergence-badge.tsx +46 -0
- components/signals/provenance-badge.tsx +124 -0
- components/signals/reproducibility-badge.tsx +46 -0
- components/signals/reproducibility-panel.tsx +60 -0
- components/signals/signal-tooltip.tsx +31 -0
- components/signals/signal-utils.ts +105 -0
- components/signals/signals-row-badges.tsx +68 -0
- components/signals/variant-divergence-badge.tsx +46 -0
- docs/INTERPRETIVE_SIGNALS.md +622 -0
- lib/backend-artifacts.ts +230 -7
- lib/benchmark-schema.ts +8 -1
- lib/dashboard-data-client.ts +5 -1
- lib/eval-processing.ts +7 -1
- lib/hf-data.ts +118 -6
- lib/model-data.ts +51 -28
- public/peer-ranks.json +0 -0
- scripts/cache-hf-data.mjs +2 -0
app/api/corpus-aggregates/route.ts
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { NextResponse } from "next/server"
|
| 2 |
+
|
| 3 |
+
import { fetchCorpusAggregates } from "@/lib/hf-data"
|
| 4 |
+
|
| 5 |
+
/**
 * Route handler for GET requests on the corpus-aggregates endpoint.
 *
 * Awaits `fetchCorpusAggregates()` and returns its result as JSON. When the
 * result is falsy, responds with a JSON error body and HTTP status 404.
 */
export async function GET() {
  const payload = await fetchCorpusAggregates()

  return payload
    ? NextResponse.json(payload)
    : NextResponse.json({ error: "Corpus aggregates not available" }, { status: 404 })
}
|
app/corpus/page.tsx
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { CorpusDashboard } from "@/components/signals/corpus-dashboard"
|
| 2 |
+
import { Navigation } from "@/components/navigation"
|
| 3 |
+
import { fetchCorpusAggregates, fetchEvalListLite } from "@/lib/hf-data"
|
| 4 |
+
|
| 5 |
+
/**
 * Async page component for the corpus view.
 *
 * Loads the corpus aggregates and the lite eval list concurrently. A failed
 * eval-list fetch is treated as an empty list, so the page can still render.
 * When aggregates are present the dashboard is shown; otherwise a placeholder
 * section explains that the aggregates file is not available yet.
 */
export default async function CorpusPage() {
  // Start both requests before awaiting so they run in parallel.
  const aggregatesRequest = fetchCorpusAggregates()
  const evalListRequest = fetchEvalListLite().catch(() => ({ evals: [] }))
  const [aggregates, evalList] = await Promise.all([aggregatesRequest, evalListRequest])

  // Keep only finite numeric completeness scores; entries without one are skipped.
  const completenessScores: number[] = []
  for (const entry of evalList.evals) {
    const score = entry.evalcards?.annotations?.reporting_completeness?.completeness_score
    if (typeof score === "number" && Number.isFinite(score)) {
      completenessScores.push(score)
    }
  }

  const content = aggregates ? (
    <CorpusDashboard aggregates={aggregates} completenessScores={completenessScores} />
  ) : (
    <section className="rounded-2xl border border-dashed border-border/70 bg-card p-8 text-center">
      <div className="text-[11px] font-semibold uppercase tracking-[0.22em] text-muted-foreground">
        Interpretive signals
      </div>
      <h1 className="mt-2 text-2xl font-semibold tracking-tight">Corpus aggregates are not available yet</h1>
      <p className="mx-auto mt-3 max-w-2xl text-sm leading-6 text-muted-foreground">
        The frontend is ready for `corpus-aggregates.json`, but this cached backend snapshot does not include it yet.
        Once the dataset ships the file, this page will render reproducibility, completeness, provenance, and comparability rollups.
      </p>
    </section>
  )

  return (
    <div className="min-h-screen bg-background">
      <Navigation />
      <main className="container mx-auto px-4 py-8">{content}</main>
    </div>
  )
}
|
app/evals/page.tsx
CHANGED
|
@@ -11,7 +11,7 @@ import { PageHeader } from "@/components/page-header"
|
|
| 11 |
import { Button } from "@/components/ui/button"
|
| 12 |
import { Collapsible, CollapsibleContent, CollapsibleTrigger } from "@/components/ui/collapsible"
|
| 13 |
import { Input } from "@/components/ui/input"
|
| 14 |
-
import type { EvalHierarchy } from "@/lib/backend-artifacts"
|
| 15 |
import type { BenchmarkCard, CategoryType } from "@/lib/benchmark-schema"
|
| 16 |
import type { BenchmarkEvalListItem } from "@/lib/eval-processing"
|
| 17 |
import { fetchBenchmarkMetadata, fetchEvalHierarchy, fetchEvalList } from "@/lib/dashboard-data-client"
|
|
@@ -251,7 +251,7 @@ interface EvalBrowserMatrixPreviewRow {
|
|
| 251 |
value: string
|
| 252 |
}
|
| 253 |
|
| 254 |
-
interface EvalBrowserNode {
|
| 255 |
id: string
|
| 256 |
parentId: string | null
|
| 257 |
kind: EvalBrowserNodeKind
|
|
@@ -261,6 +261,7 @@ interface EvalBrowserNode {
|
|
| 261 |
description: string
|
| 262 |
category: CategoryType
|
| 263 |
domains: string[]
|
|
|
|
| 264 |
dataType?: string
|
| 265 |
license?: string
|
| 266 |
card?: BenchmarkCard
|
|
@@ -272,6 +273,8 @@ interface EvalBrowserNode {
|
|
| 272 |
childIds: string[]
|
| 273 |
href?: string
|
| 274 |
scopeKeys: string[]
|
|
|
|
|
|
|
| 275 |
matrixPreview?: {
|
| 276 |
columnLabel: string
|
| 277 |
rows: EvalBrowserMatrixPreviewRow[]
|
|
@@ -336,6 +339,86 @@ function summarizeNodeStats(
|
|
| 336 |
0
|
| 337 |
)
|
| 338 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 339 |
return {
|
| 340 |
category: getDominantCategory(summaries, fallbackCategory),
|
| 341 |
modelsCount,
|
|
@@ -346,6 +429,10 @@ function summarizeNodeStats(
|
|
| 346 |
summaries[0]?.source_data?.hf_repo ??
|
| 347 |
summaries[0]?.source_data?.dataset_name ??
|
| 348 |
"Hierarchy summary",
|
|
|
|
|
|
|
|
|
|
|
|
|
| 349 |
}
|
| 350 |
}
|
| 351 |
|
|
@@ -435,6 +522,94 @@ function getNodeCard(
|
|
| 435 |
return undefined
|
| 436 |
}
|
| 437 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 438 |
function looksLikeLanguageSplit(value: string) {
|
| 439 |
const normalized = normalizeBenchmarkKey(value)
|
| 440 |
const languageLike = new Set([
|
|
@@ -528,6 +703,7 @@ export default function EvalsPage() {
|
|
| 528 |
const [totalModels, setTotalModels] = useState(0)
|
| 529 |
const [searchQuery, setSearchQuery] = useState("")
|
| 530 |
const [selectedDomain, setSelectedDomain] = useState<string | null>(null)
|
|
|
|
| 531 |
const [selectedCategory, setSelectedCategory] = useState<string | null>(null)
|
| 532 |
const [selectedNodeKind, setSelectedNodeKind] = useState<EvalBrowserNodeKind | null>(null)
|
| 533 |
const [currentNodeId, setCurrentNodeId] = useState<string | null>(null)
|
|
@@ -558,10 +734,12 @@ export default function EvalsPage() {
|
|
| 558 |
const params = new URLSearchParams(window.location.search)
|
| 559 |
const incomingSearch = params.get("search") ?? ""
|
| 560 |
const incomingDomain = params.get("domain")
|
|
|
|
| 561 |
const incomingCategory = params.get("category")
|
| 562 |
const incomingNode = params.get("node")
|
| 563 |
setSearchQuery(incomingSearch)
|
| 564 |
setSelectedDomain(incomingDomain)
|
|
|
|
| 565 |
setSelectedCategory(incomingCategory)
|
| 566 |
setCurrentNodeId(incomingNode)
|
| 567 |
}
|
|
@@ -586,6 +764,9 @@ export default function EvalsPage() {
|
|
| 586 |
if (selectedDomain) {
|
| 587 |
params.set("domain", selectedDomain)
|
| 588 |
}
|
|
|
|
|
|
|
|
|
|
| 589 |
if (selectedCategory) {
|
| 590 |
params.set("category", selectedCategory)
|
| 591 |
}
|
|
@@ -607,7 +788,7 @@ export default function EvalsPage() {
|
|
| 607 |
}
|
| 608 |
|
| 609 |
pendingHistoryActionRef.current = "replace"
|
| 610 |
-
}, [currentNodeId, searchQuery, selectedCategory, selectedDomain])
|
| 611 |
|
| 612 |
const summariesWithCards = useMemo(() => {
|
| 613 |
return summaries.map((summary) => {
|
|
@@ -665,6 +846,7 @@ export default function EvalsPage() {
|
|
| 665 |
suiteLabel,
|
| 666 |
category,
|
| 667 |
domains,
|
|
|
|
| 668 |
summaries,
|
| 669 |
card,
|
| 670 |
sourceLabel,
|
|
@@ -682,6 +864,7 @@ export default function EvalsPage() {
|
|
| 682 |
suiteLabel?: string
|
| 683 |
category: CategoryType
|
| 684 |
domains: string[]
|
|
|
|
| 685 |
summaries: BenchmarkEvalListItem[]
|
| 686 |
card?: BenchmarkCard
|
| 687 |
sourceLabel?: string
|
|
@@ -692,6 +875,7 @@ export default function EvalsPage() {
|
|
| 692 |
descriptionFallback: string
|
| 693 |
}) => {
|
| 694 |
const stats = summarizeNodeStats(summaries, category)
|
|
|
|
| 695 |
addNode({
|
| 696 |
id,
|
| 697 |
parentId,
|
|
@@ -702,6 +886,7 @@ export default function EvalsPage() {
|
|
| 702 |
description: buildDescription(title, card, descriptionFallback),
|
| 703 |
category: stats.category,
|
| 704 |
domains: Array.from(new Set(domains.flatMap((domain) => normalizeDomainList(domain)))),
|
|
|
|
| 705 |
dataType: card?.benchmark_details?.data_type,
|
| 706 |
license: card?.ethical_and_legal_considerations?.data_licensing,
|
| 707 |
card,
|
|
@@ -714,6 +899,10 @@ export default function EvalsPage() {
|
|
| 714 |
href,
|
| 715 |
scopeKeys,
|
| 716 |
matrixPreview,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 717 |
})
|
| 718 |
}
|
| 719 |
|
|
@@ -799,6 +988,7 @@ export default function EvalsPage() {
|
|
| 799 |
slices = [],
|
| 800 |
metrics = [],
|
| 801 |
scopeKeys,
|
|
|
|
| 802 |
}: {
|
| 803 |
parentId: string | null
|
| 804 |
familyLabel?: string
|
|
@@ -812,6 +1002,8 @@ export default function EvalsPage() {
|
|
| 812 |
slices?: Array<{ key: string; display_name: string; metrics: Array<{ key: string; display_name: string }> }>
|
| 813 |
metrics?: Array<{ key: string; display_name: string }>
|
| 814 |
scopeKeys: string[]
|
|
|
|
|
|
|
| 815 |
}) => {
|
| 816 |
const benchmarkId = `${parentId ?? "root"}::benchmark:${normalizeBenchmarkKey(benchmarkKey)}`
|
| 817 |
const card = summary?.benchmark_card ?? getNodeCard(benchmarkCards, ...cardCandidates)
|
|
@@ -821,6 +1013,13 @@ export default function EvalsPage() {
|
|
| 821 |
!summary && metrics.length > 0
|
| 822 |
? scopeKeys.map((scopeKey) => pickSummaryForKey(summariesWithCards, scopeKey, scopeKeys)).find(Boolean)
|
| 823 |
: undefined
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 824 |
const isParentRollupBenchmark =
|
| 825 |
Boolean(parentId) && scopeKeys.some((scopeKey) => isSameHierarchyKey(scopeKey, benchmarkKey))
|
| 826 |
|
|
@@ -829,10 +1028,10 @@ export default function EvalsPage() {
|
|
| 829 |
|
| 830 |
if (drilldownSlices.length > 0) {
|
| 831 |
createSliceNodes(parentId, parentLabel, summary, drilldownSlices, category, scopeKeys)
|
| 832 |
-
} else if (
|
| 833 |
const parent = nodes.get(parentId)
|
| 834 |
if (parent && !parent.href) {
|
| 835 |
-
parent.href =
|
| 836 |
}
|
| 837 |
}
|
| 838 |
return
|
|
@@ -849,14 +1048,7 @@ export default function EvalsPage() {
|
|
| 849 |
domains,
|
| 850 |
summaries: summary ? [summary] : [],
|
| 851 |
card,
|
| 852 |
-
href:
|
| 853 |
-
drilldownSlices.length === 0
|
| 854 |
-
? summary
|
| 855 |
-
? `/evals/${summary.evaluation_id}`
|
| 856 |
-
: fallbackSummary
|
| 857 |
-
? `/evals/${fallbackSummary.evaluation_id}`
|
| 858 |
-
: undefined
|
| 859 |
-
: undefined,
|
| 860 |
scopeKeys,
|
| 861 |
descriptionFallback: `Browse the {label} benchmark and its lower-level breakdowns.`,
|
| 862 |
})
|
|
@@ -1038,6 +1230,7 @@ export default function EvalsPage() {
|
|
| 1038 |
slices: standalone.slices ?? [],
|
| 1039 |
metrics: standalone.metrics ?? [],
|
| 1040 |
scopeKeys: familyScopeKeys,
|
|
|
|
| 1041 |
})
|
| 1042 |
}
|
| 1043 |
|
|
@@ -1199,6 +1392,8 @@ export default function EvalsPage() {
|
|
| 1199 |
})),
|
| 1200 |
metrics: benchmarkSource?.metrics ?? family.metrics ?? [],
|
| 1201 |
scopeKeys: familyScopeKeys,
|
|
|
|
|
|
|
| 1202 |
})
|
| 1203 |
}
|
| 1204 |
|
|
@@ -1243,6 +1438,7 @@ export default function EvalsPage() {
|
|
| 1243 |
node.description,
|
| 1244 |
node.sourceLabel,
|
| 1245 |
...node.domains,
|
|
|
|
| 1246 |
]
|
| 1247 |
|
| 1248 |
return haystacks.some((value) => value?.toLowerCase().includes(query))
|
|
@@ -1261,6 +1457,12 @@ export default function EvalsPage() {
|
|
| 1261 |
domainCandidates = domainCandidates.filter((node) => node.category === selectedCategory)
|
| 1262 |
}
|
| 1263 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1264 |
for (const node of domainCandidates) {
|
| 1265 |
for (const domain of node.domains) {
|
| 1266 |
domainSet.add(domain)
|
|
@@ -1268,7 +1470,34 @@ export default function EvalsPage() {
|
|
| 1268 |
}
|
| 1269 |
|
| 1270 |
return Array.from(domainSet).sort((a, b) => a.localeCompare(b))
|
| 1271 |
-
}, [nodesMatchingSearch, selectedCategory])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1272 |
|
| 1273 |
const allCategories = useMemo(() => {
|
| 1274 |
const categorySet = new Set<string>()
|
|
@@ -1284,12 +1513,18 @@ export default function EvalsPage() {
|
|
| 1284 |
)
|
| 1285 |
}
|
| 1286 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1287 |
for (const node of categoryCandidates) {
|
| 1288 |
categorySet.add(node.category)
|
| 1289 |
}
|
| 1290 |
|
| 1291 |
return Array.from(categorySet).sort((a, b) => a.localeCompare(b))
|
| 1292 |
-
}, [nodesMatchingSearch, selectedDomain])
|
| 1293 |
|
| 1294 |
const filtered = useMemo(() => {
|
| 1295 |
let list = [...nodesMatchingSearch]
|
|
@@ -1306,13 +1541,21 @@ export default function EvalsPage() {
|
|
| 1306 |
)
|
| 1307 |
}
|
| 1308 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1309 |
if (selectedCategory) {
|
| 1310 |
list = list.filter((node) => node.category === selectedCategory)
|
| 1311 |
}
|
| 1312 |
|
| 1313 |
list.sort((a, b) => a.title.localeCompare(b.title, undefined, { sensitivity: "base" }))
|
| 1314 |
return list
|
| 1315 |
-
}, [nodesMatchingSearch, selectedCategory, selectedDomain, selectedNodeKind])
|
| 1316 |
|
| 1317 |
useEffect(() => {
|
| 1318 |
if (selectedDomain && !allDomains.includes(selectedDomain)) {
|
|
@@ -1320,6 +1563,12 @@ export default function EvalsPage() {
|
|
| 1320 |
}
|
| 1321 |
}, [allDomains, selectedDomain])
|
| 1322 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1323 |
useEffect(() => {
|
| 1324 |
if (selectedCategory && !allCategories.includes(selectedCategory)) {
|
| 1325 |
setSelectedCategory(null)
|
|
@@ -1328,7 +1577,7 @@ export default function EvalsPage() {
|
|
| 1328 |
|
| 1329 |
useEffect(() => {
|
| 1330 |
setPage(1)
|
| 1331 |
-
}, [currentNodeId, searchQuery, selectedCategory, selectedDomain, selectedNodeKind])
|
| 1332 |
|
| 1333 |
const pagedNodes = useMemo(
|
| 1334 |
() => filtered.slice((page - 1) * PAGE_SIZE, page * PAGE_SIZE),
|
|
@@ -1336,7 +1585,7 @@ export default function EvalsPage() {
|
|
| 1336 |
)
|
| 1337 |
|
| 1338 |
const currentLevelKinds = Array.from(new Set(currentLevelNodes.map((node) => node.kind)))
|
| 1339 |
-
const activeFilterCount = [searchQuery.trim(), selectedDomain, selectedCategory, selectedNodeKind].filter(Boolean).length
|
| 1340 |
const currentLevelLabel =
|
| 1341 |
currentNodeId === null
|
| 1342 |
? "Rollout entry level"
|
|
@@ -1480,6 +1729,7 @@ export default function EvalsPage() {
|
|
| 1480 |
onClick={() => {
|
| 1481 |
setSearchQuery("")
|
| 1482 |
setSelectedDomain(null)
|
|
|
|
| 1483 |
setSelectedCategory(null)
|
| 1484 |
setSelectedNodeKind(null)
|
| 1485 |
}}
|
|
@@ -1545,7 +1795,7 @@ export default function EvalsPage() {
|
|
| 1545 |
</div>
|
| 1546 |
</div>
|
| 1547 |
|
| 1548 |
-
{hierarchy && (
|
| 1549 |
<div className="flex flex-wrap gap-2 text-sm">
|
| 1550 |
<span className="rounded-full border border-stone-200/80 bg-stone-50/80 px-3 py-1.5 font-medium text-stone-700 dark:border-stone-700/80 dark:bg-stone-900/70 dark:text-stone-200">
|
| 1551 |
{hierarchy.stats.family_count} families
|
|
@@ -1669,6 +1919,43 @@ export default function EvalsPage() {
|
|
| 1669 |
</div>
|
| 1670 |
)}
|
| 1671 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1672 |
{allCategories.length > 0 && (
|
| 1673 |
<div className="mt-4 space-y-1.5">
|
| 1674 |
<div className="text-[11px] font-semibold uppercase tracking-[0.2em] text-stone-500 dark:text-stone-400">
|
|
@@ -1794,6 +2081,8 @@ export default function EvalsPage() {
|
|
| 1794 |
{node.title}
|
| 1795 |
</h3>
|
| 1796 |
|
|
|
|
|
|
|
| 1797 |
{node.description && (
|
| 1798 |
<p className="mb-4 flex-1 text-sm leading-6 text-stone-600 line-clamp-3 dark:text-stone-300">
|
| 1799 |
{node.description}
|
|
|
|
| 11 |
import { Button } from "@/components/ui/button"
|
| 12 |
import { Collapsible, CollapsibleContent, CollapsibleTrigger } from "@/components/ui/collapsible"
|
| 13 |
import { Input } from "@/components/ui/input"
|
| 14 |
+
import type { EvalHierarchy, SignalSummaries } from "@/lib/backend-artifacts"
|
| 15 |
import type { BenchmarkCard, CategoryType } from "@/lib/benchmark-schema"
|
| 16 |
import type { BenchmarkEvalListItem } from "@/lib/eval-processing"
|
| 17 |
import { fetchBenchmarkMetadata, fetchEvalHierarchy, fetchEvalList } from "@/lib/dashboard-data-client"
|
|
|
|
| 251 |
value: string
|
| 252 |
}
|
| 253 |
|
| 254 |
+
interface EvalBrowserNode extends SignalSummaries {
|
| 255 |
id: string
|
| 256 |
parentId: string | null
|
| 257 |
kind: EvalBrowserNodeKind
|
|
|
|
| 261 |
description: string
|
| 262 |
category: CategoryType
|
| 263 |
domains: string[]
|
| 264 |
+
tasks: string[]
|
| 265 |
dataType?: string
|
| 266 |
license?: string
|
| 267 |
card?: BenchmarkCard
|
|
|
|
| 273 |
childIds: string[]
|
| 274 |
href?: string
|
| 275 |
scopeKeys: string[]
|
| 276 |
+
/** Reporting completeness score in [0, 1] when known, otherwise undefined. */
|
| 277 |
+
completenessScore?: number
|
| 278 |
matrixPreview?: {
|
| 279 |
columnLabel: string
|
| 280 |
rows: EvalBrowserMatrixPreviewRow[]
|
|
|
|
| 339 |
0
|
| 340 |
)
|
| 341 |
|
| 342 |
+
// Aggregate signals across all summaries under this node so a family card
|
| 343 |
+
// can show signals that span its children.
|
| 344 |
+
const reproducibilitySummaries = summaries
|
| 345 |
+
.map((s) => s.reproducibility_summary)
|
| 346 |
+
.filter((value): value is NonNullable<typeof value> => Boolean(value))
|
| 347 |
+
const provenanceSummaries = summaries
|
| 348 |
+
.map((s) => s.provenance_summary)
|
| 349 |
+
.filter((value): value is NonNullable<typeof value> => Boolean(value))
|
| 350 |
+
const comparabilitySummaries = summaries
|
| 351 |
+
.map((s) => s.comparability_summary)
|
| 352 |
+
.filter((value): value is NonNullable<typeof value> => Boolean(value))
|
| 353 |
+
|
| 354 |
+
const reproducibility_summary = reproducibilitySummaries.length
|
| 355 |
+
? reproducibilitySummaries.reduce(
|
| 356 |
+
(acc, item) => ({
|
| 357 |
+
results_total: acc.results_total + item.results_total,
|
| 358 |
+
has_reproducibility_gap_count:
|
| 359 |
+
acc.has_reproducibility_gap_count + item.has_reproducibility_gap_count,
|
| 360 |
+
populated_ratio_avg: null,
|
| 361 |
+
}),
|
| 362 |
+
{ results_total: 0, has_reproducibility_gap_count: 0, populated_ratio_avg: null as number | null }
|
| 363 |
+
)
|
| 364 |
+
: undefined
|
| 365 |
+
|
| 366 |
+
const provenance_summary = provenanceSummaries.length
|
| 367 |
+
? provenanceSummaries.reduce(
|
| 368 |
+
(acc, item) => {
|
| 369 |
+
for (const key of ["first_party", "third_party", "collaborative", "unspecified"] as const) {
|
| 370 |
+
acc.source_type_distribution[key] += item.source_type_distribution[key] ?? 0
|
| 371 |
+
}
|
| 372 |
+
return {
|
| 373 |
+
total_results: acc.total_results + item.total_results,
|
| 374 |
+
total_groups: acc.total_groups + item.total_groups,
|
| 375 |
+
multi_source_groups: acc.multi_source_groups + item.multi_source_groups,
|
| 376 |
+
first_party_only_groups: acc.first_party_only_groups + item.first_party_only_groups,
|
| 377 |
+
source_type_distribution: acc.source_type_distribution,
|
| 378 |
+
}
|
| 379 |
+
},
|
| 380 |
+
{
|
| 381 |
+
total_results: 0,
|
| 382 |
+
total_groups: 0,
|
| 383 |
+
multi_source_groups: 0,
|
| 384 |
+
first_party_only_groups: 0,
|
| 385 |
+
source_type_distribution: {
|
| 386 |
+
first_party: 0,
|
| 387 |
+
third_party: 0,
|
| 388 |
+
collaborative: 0,
|
| 389 |
+
unspecified: 0,
|
| 390 |
+
},
|
| 391 |
+
}
|
| 392 |
+
)
|
| 393 |
+
: undefined
|
| 394 |
+
|
| 395 |
+
const comparability_summary = comparabilitySummaries.length
|
| 396 |
+
? comparabilitySummaries.reduce(
|
| 397 |
+
(acc, item) => ({
|
| 398 |
+
total_groups: acc.total_groups + item.total_groups,
|
| 399 |
+
groups_with_variant_check: acc.groups_with_variant_check + item.groups_with_variant_check,
|
| 400 |
+
groups_with_cross_party_check: acc.groups_with_cross_party_check + item.groups_with_cross_party_check,
|
| 401 |
+
variant_divergent_count: acc.variant_divergent_count + item.variant_divergent_count,
|
| 402 |
+
cross_party_divergent_count: acc.cross_party_divergent_count + item.cross_party_divergent_count,
|
| 403 |
+
}),
|
| 404 |
+
{
|
| 405 |
+
total_groups: 0,
|
| 406 |
+
groups_with_variant_check: 0,
|
| 407 |
+
groups_with_cross_party_check: 0,
|
| 408 |
+
variant_divergent_count: 0,
|
| 409 |
+
cross_party_divergent_count: 0,
|
| 410 |
+
}
|
| 411 |
+
)
|
| 412 |
+
: undefined
|
| 413 |
+
|
| 414 |
+
// Average completeness score across summaries that report one.
|
| 415 |
+
const completenessScores = summaries
|
| 416 |
+
.map((s) => s.evalcards?.annotations?.reporting_completeness?.completeness_score)
|
| 417 |
+
.filter((v): v is number => typeof v === "number" && Number.isFinite(v))
|
| 418 |
+
const completenessScore = completenessScores.length
|
| 419 |
+
? completenessScores.reduce((sum, value) => sum + value, 0) / completenessScores.length
|
| 420 |
+
: undefined
|
| 421 |
+
|
| 422 |
return {
|
| 423 |
category: getDominantCategory(summaries, fallbackCategory),
|
| 424 |
modelsCount,
|
|
|
|
| 429 |
summaries[0]?.source_data?.hf_repo ??
|
| 430 |
summaries[0]?.source_data?.dataset_name ??
|
| 431 |
"Hierarchy summary",
|
| 432 |
+
reproducibility_summary,
|
| 433 |
+
provenance_summary,
|
| 434 |
+
comparability_summary,
|
| 435 |
+
completenessScore,
|
| 436 |
}
|
| 437 |
}
|
| 438 |
|
|
|
|
| 522 |
return undefined
|
| 523 |
}
|
| 524 |
|
| 525 |
+
/**
 * Compact signal indicators for a node card. Shown alongside (or instead of)
 * the benchmark-card-derived metadata so that nodes lacking a benchmark card
 * still surface useful interpretive context.
 *
 * Each chip has its own display rule:
 * - completeness: shown whenever a score is present (green at >= 50%, amber below)
 * - setup gaps: shown only when the rounded gap percentage is above 0
 * - first-party only: shown only when the rounded percentage is at least 50
 * - setup divergences / source disagreements: shown only when the count is above 0
 *
 * Returns `null` when no chip passes its rule, so the card never receives an
 * empty wrapper that would still contribute bottom margin.
 *
 * @param node - Browser node whose signal summaries and completeness score are rendered.
 */
function NodeSignalChips({ node }: { node: EvalBrowserNode }) {
  const repro = node.reproducibility_summary
  const prov = node.provenance_summary
  const comparability = node.comparability_summary
  const completeness = node.completenessScore

  // has_reproducibility_gap_count as a rounded percentage of results_total; null when there are no results.
  const reproPercent =
    repro && repro.results_total > 0
      ? Math.round((repro.has_reproducibility_gap_count / repro.results_total) * 100)
      : null

  // first_party_only_groups as a rounded percentage of total_groups; null when there are no groups.
  const firstPartyPercent =
    prov && prov.total_groups > 0
      ? Math.round((prov.first_party_only_groups / prov.total_groups) * 100)
      : null

  const variantDivergent = comparability?.variant_divergent_count ?? 0
  const crossPartyDivergent = comparability?.cross_party_divergent_count ?? 0

  const completenessPercent = completeness != null ? Math.round(completeness * 100) : null

  // Visibility flags mirror the per-chip render conditions below. Deriving the
  // early return from these (rather than from "value is non-null") prevents an
  // empty container when, e.g., the gap percentage is 0 or first-party share is < 50.
  const showRepro = reproPercent !== null && reproPercent > 0
  const showFirstParty = firstPartyPercent !== null && firstPartyPercent >= 50
  const showVariant = variantDivergent > 0
  const showCrossParty = crossPartyDivergent > 0

  const hasAny =
    completenessPercent !== null || showRepro || showFirstParty || showVariant || showCrossParty

  if (!hasAny) {
    return null
  }

  return (
    <div className="mb-3 flex flex-wrap gap-1.5">
      {completenessPercent !== null && (
        <span
          className={cn(
            "inline-flex items-center gap-1 rounded-full border px-2.5 py-0.5 text-[10px] font-semibold",
            completenessPercent >= 50
              ? "border-emerald-200 bg-emerald-50 text-emerald-800 dark:border-emerald-900/50 dark:bg-emerald-950/30 dark:text-emerald-200"
              : "border-amber-200 bg-amber-50 text-amber-800 dark:border-amber-900/50 dark:bg-amber-950/30 dark:text-amber-200"
          )}
          title={`Documentation completeness: ${completenessPercent}% of EvalCards fields populated.`}
        >
          {completenessPercent}% documented
        </span>
      )}
      {showRepro && (
        <span
          className="inline-flex items-center gap-1 rounded-full border border-amber-200 bg-amber-50 px-2.5 py-0.5 text-[10px] font-semibold text-amber-800 dark:border-amber-900/50 dark:bg-amber-950/30 dark:text-amber-200"
          title={`${repro?.has_reproducibility_gap_count.toLocaleString()} of ${repro?.results_total.toLocaleString()} reported scores missing setup details.`}
        >
          {reproPercent}% setup gaps
        </span>
      )}
      {showFirstParty && (
        <span
          className="inline-flex items-center gap-1 rounded-full border border-amber-200 bg-amber-50 px-2.5 py-0.5 text-[10px] font-semibold text-amber-800 dark:border-amber-900/50 dark:bg-amber-950/30 dark:text-amber-200"
          title={`${firstPartyPercent}% of (model, metric) groups have only first-party reports — no independent replication.`}
        >
          {firstPartyPercent}% 1st-party only
        </span>
      )}
      {showVariant && (
        <span
          className="inline-flex items-center gap-1 rounded-full border border-rose-200 bg-rose-50 px-2.5 py-0.5 text-[10px] font-semibold text-rose-800 dark:border-rose-900/50 dark:bg-rose-950/30 dark:text-rose-200"
          title={`${variantDivergent} group${variantDivergent === 1 ? "" : "s"} where setup variations produced diverging scores.`}
        >
          {variantDivergent} setup divergence{variantDivergent === 1 ? "" : "s"}
        </span>
      )}
      {showCrossParty && (
        <span
          className="inline-flex items-center gap-1 rounded-full border border-violet-200 bg-violet-50 px-2.5 py-0.5 text-[10px] font-semibold text-violet-800 dark:border-violet-900/50 dark:bg-violet-950/30 dark:text-violet-200"
          title={`${crossPartyDivergent} group${crossPartyDivergent === 1 ? "" : "s"} where different organizations reported diverging scores.`}
        >
          {crossPartyDivergent} source disagreement{crossPartyDivergent === 1 ? "" : "s"}
        </span>
      )}
    </div>
  )
}
|
| 612 |
+
|
| 613 |
function looksLikeLanguageSplit(value: string) {
|
| 614 |
const normalized = normalizeBenchmarkKey(value)
|
| 615 |
const languageLike = new Set([
|
|
|
|
| 703 |
const [totalModels, setTotalModels] = useState(0)
|
| 704 |
const [searchQuery, setSearchQuery] = useState("")
|
| 705 |
const [selectedDomain, setSelectedDomain] = useState<string | null>(null)
|
| 706 |
+
const [selectedTask, setSelectedTask] = useState<string | null>(null)
|
| 707 |
const [selectedCategory, setSelectedCategory] = useState<string | null>(null)
|
| 708 |
const [selectedNodeKind, setSelectedNodeKind] = useState<EvalBrowserNodeKind | null>(null)
|
| 709 |
const [currentNodeId, setCurrentNodeId] = useState<string | null>(null)
|
|
|
|
| 734 |
const params = new URLSearchParams(window.location.search)
|
| 735 |
const incomingSearch = params.get("search") ?? ""
|
| 736 |
const incomingDomain = params.get("domain")
|
| 737 |
+
const incomingTask = params.get("task")
|
| 738 |
const incomingCategory = params.get("category")
|
| 739 |
const incomingNode = params.get("node")
|
| 740 |
setSearchQuery(incomingSearch)
|
| 741 |
setSelectedDomain(incomingDomain)
|
| 742 |
+
setSelectedTask(incomingTask)
|
| 743 |
setSelectedCategory(incomingCategory)
|
| 744 |
setCurrentNodeId(incomingNode)
|
| 745 |
}
|
|
|
|
| 764 |
if (selectedDomain) {
|
| 765 |
params.set("domain", selectedDomain)
|
| 766 |
}
|
| 767 |
+
if (selectedTask) {
|
| 768 |
+
params.set("task", selectedTask)
|
| 769 |
+
}
|
| 770 |
if (selectedCategory) {
|
| 771 |
params.set("category", selectedCategory)
|
| 772 |
}
|
|
|
|
| 788 |
}
|
| 789 |
|
| 790 |
pendingHistoryActionRef.current = "replace"
|
| 791 |
+
}, [currentNodeId, searchQuery, selectedCategory, selectedDomain, selectedTask])
|
| 792 |
|
| 793 |
const summariesWithCards = useMemo(() => {
|
| 794 |
return summaries.map((summary) => {
|
|
|
|
| 846 |
suiteLabel,
|
| 847 |
category,
|
| 848 |
domains,
|
| 849 |
+
tasks,
|
| 850 |
summaries,
|
| 851 |
card,
|
| 852 |
sourceLabel,
|
|
|
|
| 864 |
suiteLabel?: string
|
| 865 |
category: CategoryType
|
| 866 |
domains: string[]
|
| 867 |
+
tasks?: string[]
|
| 868 |
summaries: BenchmarkEvalListItem[]
|
| 869 |
card?: BenchmarkCard
|
| 870 |
sourceLabel?: string
|
|
|
|
| 875 |
descriptionFallback: string
|
| 876 |
}) => {
|
| 877 |
const stats = summarizeNodeStats(summaries, category)
|
| 878 |
+
const summaryTasks = summaries.flatMap((summary) => summary.tags?.tasks ?? [])
|
| 879 |
addNode({
|
| 880 |
id,
|
| 881 |
parentId,
|
|
|
|
| 886 |
description: buildDescription(title, card, descriptionFallback),
|
| 887 |
category: stats.category,
|
| 888 |
domains: Array.from(new Set(domains.flatMap((domain) => normalizeDomainList(domain)))),
|
| 889 |
+
tasks: Array.from(new Set([...(tasks ?? []), ...summaryTasks].map((task) => task.trim()).filter(Boolean))),
|
| 890 |
dataType: card?.benchmark_details?.data_type,
|
| 891 |
license: card?.ethical_and_legal_considerations?.data_licensing,
|
| 892 |
card,
|
|
|
|
| 899 |
href,
|
| 900 |
scopeKeys,
|
| 901 |
matrixPreview,
|
| 902 |
+
reproducibility_summary: stats.reproducibility_summary,
|
| 903 |
+
provenance_summary: stats.provenance_summary,
|
| 904 |
+
comparability_summary: stats.comparability_summary,
|
| 905 |
+
completenessScore: stats.completenessScore,
|
| 906 |
})
|
| 907 |
}
|
| 908 |
|
|
|
|
| 988 |
slices = [],
|
| 989 |
metrics = [],
|
| 990 |
scopeKeys,
|
| 991 |
+
fallbackEvalId,
|
| 992 |
}: {
|
| 993 |
parentId: string | null
|
| 994 |
familyLabel?: string
|
|
|
|
| 1002 |
slices?: Array<{ key: string; display_name: string; metrics: Array<{ key: string; display_name: string }> }>
|
| 1003 |
metrics?: Array<{ key: string; display_name: string }>
|
| 1004 |
scopeKeys: string[]
|
| 1005 |
+
/** Final-resort eval id when no summary or fallback summary matches; comes from leaf.eval_summary_ids */
|
| 1006 |
+
fallbackEvalId?: string
|
| 1007 |
}) => {
|
| 1008 |
const benchmarkId = `${parentId ?? "root"}::benchmark:${normalizeBenchmarkKey(benchmarkKey)}`
|
| 1009 |
const card = summary?.benchmark_card ?? getNodeCard(benchmarkCards, ...cardCandidates)
|
|
|
|
| 1013 |
!summary && metrics.length > 0
|
| 1014 |
? scopeKeys.map((scopeKey) => pickSummaryForKey(summariesWithCards, scopeKey, scopeKeys)).find(Boolean)
|
| 1015 |
: undefined
|
| 1016 |
+
const resolvedHref = summary
|
| 1017 |
+
? `/evals/${summary.evaluation_id}`
|
| 1018 |
+
: fallbackSummary
|
| 1019 |
+
? `/evals/${fallbackSummary.evaluation_id}`
|
| 1020 |
+
: fallbackEvalId
|
| 1021 |
+
? `/evals/${fallbackEvalId}`
|
| 1022 |
+
: undefined
|
| 1023 |
const isParentRollupBenchmark =
|
| 1024 |
Boolean(parentId) && scopeKeys.some((scopeKey) => isSameHierarchyKey(scopeKey, benchmarkKey))
|
| 1025 |
|
|
|
|
| 1028 |
|
| 1029 |
if (drilldownSlices.length > 0) {
|
| 1030 |
createSliceNodes(parentId, parentLabel, summary, drilldownSlices, category, scopeKeys)
|
| 1031 |
+
} else if (resolvedHref) {
|
| 1032 |
const parent = nodes.get(parentId)
|
| 1033 |
if (parent && !parent.href) {
|
| 1034 |
+
parent.href = resolvedHref
|
| 1035 |
}
|
| 1036 |
}
|
| 1037 |
return
|
|
|
|
| 1048 |
domains,
|
| 1049 |
summaries: summary ? [summary] : [],
|
| 1050 |
card,
|
| 1051 |
+
href: drilldownSlices.length === 0 ? resolvedHref : undefined,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1052 |
scopeKeys,
|
| 1053 |
descriptionFallback: `Browse the {label} benchmark and its lower-level breakdowns.`,
|
| 1054 |
})
|
|
|
|
| 1230 |
slices: standalone.slices ?? [],
|
| 1231 |
metrics: standalone.metrics ?? [],
|
| 1232 |
scopeKeys: familyScopeKeys,
|
| 1233 |
+
fallbackEvalId: standalone.summary_eval_ids?.[0],
|
| 1234 |
})
|
| 1235 |
}
|
| 1236 |
|
|
|
|
| 1392 |
})),
|
| 1393 |
metrics: benchmarkSource?.metrics ?? family.metrics ?? [],
|
| 1394 |
scopeKeys: familyScopeKeys,
|
| 1395 |
+
fallbackEvalId:
|
| 1396 |
+
benchmarkSource?.summary_eval_ids?.[0] ?? family.eval_summary_ids?.[0],
|
| 1397 |
})
|
| 1398 |
}
|
| 1399 |
|
|
|
|
| 1438 |
node.description,
|
| 1439 |
node.sourceLabel,
|
| 1440 |
...node.domains,
|
| 1441 |
+
...node.tasks,
|
| 1442 |
]
|
| 1443 |
|
| 1444 |
return haystacks.some((value) => value?.toLowerCase().includes(query))
|
|
|
|
| 1457 |
domainCandidates = domainCandidates.filter((node) => node.category === selectedCategory)
|
| 1458 |
}
|
| 1459 |
|
| 1460 |
+
if (selectedTask) {
|
| 1461 |
+
domainCandidates = domainCandidates.filter((node) =>
|
| 1462 |
+
node.tasks.some((task) => task.toLowerCase() === selectedTask.toLowerCase())
|
| 1463 |
+
)
|
| 1464 |
+
}
|
| 1465 |
+
|
| 1466 |
for (const node of domainCandidates) {
|
| 1467 |
for (const domain of node.domains) {
|
| 1468 |
domainSet.add(domain)
|
|
|
|
| 1470 |
}
|
| 1471 |
|
| 1472 |
return Array.from(domainSet).sort((a, b) => a.localeCompare(b))
|
| 1473 |
+
}, [nodesMatchingSearch, selectedCategory, selectedNodeKind, selectedTask])
|
| 1474 |
+
|
| 1475 |
+
const allTasks = useMemo(() => {
|
| 1476 |
+
const taskSet = new Set<string>()
|
| 1477 |
+
let taskCandidates = nodesMatchingSearch
|
| 1478 |
+
|
| 1479 |
+
if (selectedNodeKind) {
|
| 1480 |
+
taskCandidates = taskCandidates.filter((node) => node.kind === selectedNodeKind)
|
| 1481 |
+
}
|
| 1482 |
+
|
| 1483 |
+
if (selectedCategory) {
|
| 1484 |
+
taskCandidates = taskCandidates.filter((node) => node.category === selectedCategory)
|
| 1485 |
+
}
|
| 1486 |
+
|
| 1487 |
+
if (selectedDomain) {
|
| 1488 |
+
taskCandidates = taskCandidates.filter((node) =>
|
| 1489 |
+
node.domains.some((domain) => domain.toLowerCase() === selectedDomain.toLowerCase())
|
| 1490 |
+
)
|
| 1491 |
+
}
|
| 1492 |
+
|
| 1493 |
+
for (const node of taskCandidates) {
|
| 1494 |
+
for (const task of node.tasks) {
|
| 1495 |
+
taskSet.add(task)
|
| 1496 |
+
}
|
| 1497 |
+
}
|
| 1498 |
+
|
| 1499 |
+
return Array.from(taskSet).sort((a, b) => a.localeCompare(b)).slice(0, 40)
|
| 1500 |
+
}, [nodesMatchingSearch, selectedCategory, selectedDomain, selectedNodeKind])
|
| 1501 |
|
| 1502 |
const allCategories = useMemo(() => {
|
| 1503 |
const categorySet = new Set<string>()
|
|
|
|
| 1513 |
)
|
| 1514 |
}
|
| 1515 |
|
| 1516 |
+
if (selectedTask) {
|
| 1517 |
+
categoryCandidates = categoryCandidates.filter((node) =>
|
| 1518 |
+
node.tasks.some((task) => task.toLowerCase() === selectedTask.toLowerCase())
|
| 1519 |
+
)
|
| 1520 |
+
}
|
| 1521 |
+
|
| 1522 |
for (const node of categoryCandidates) {
|
| 1523 |
categorySet.add(node.category)
|
| 1524 |
}
|
| 1525 |
|
| 1526 |
return Array.from(categorySet).sort((a, b) => a.localeCompare(b))
|
| 1527 |
+
}, [nodesMatchingSearch, selectedDomain, selectedNodeKind, selectedTask])
|
| 1528 |
|
| 1529 |
const filtered = useMemo(() => {
|
| 1530 |
let list = [...nodesMatchingSearch]
|
|
|
|
| 1541 |
)
|
| 1542 |
}
|
| 1543 |
|
| 1544 |
+
if (selectedTask) {
|
| 1545 |
+
list = list.filter((node) =>
|
| 1546 |
+
node.tasks.some(
|
| 1547 |
+
(task) => task.toLowerCase() === selectedTask.toLowerCase()
|
| 1548 |
+
)
|
| 1549 |
+
)
|
| 1550 |
+
}
|
| 1551 |
+
|
| 1552 |
if (selectedCategory) {
|
| 1553 |
list = list.filter((node) => node.category === selectedCategory)
|
| 1554 |
}
|
| 1555 |
|
| 1556 |
list.sort((a, b) => a.title.localeCompare(b.title, undefined, { sensitivity: "base" }))
|
| 1557 |
return list
|
| 1558 |
+
}, [nodesMatchingSearch, selectedCategory, selectedDomain, selectedNodeKind, selectedTask])
|
| 1559 |
|
| 1560 |
useEffect(() => {
|
| 1561 |
if (selectedDomain && !allDomains.includes(selectedDomain)) {
|
|
|
|
| 1563 |
}
|
| 1564 |
}, [allDomains, selectedDomain])
|
| 1565 |
|
| 1566 |
+
useEffect(() => {
|
| 1567 |
+
if (selectedTask && !allTasks.includes(selectedTask)) {
|
| 1568 |
+
setSelectedTask(null)
|
| 1569 |
+
}
|
| 1570 |
+
}, [allTasks, selectedTask])
|
| 1571 |
+
|
| 1572 |
useEffect(() => {
|
| 1573 |
if (selectedCategory && !allCategories.includes(selectedCategory)) {
|
| 1574 |
setSelectedCategory(null)
|
|
|
|
| 1577 |
|
| 1578 |
useEffect(() => {
|
| 1579 |
setPage(1)
|
| 1580 |
+
}, [currentNodeId, searchQuery, selectedCategory, selectedDomain, selectedNodeKind, selectedTask])
|
| 1581 |
|
| 1582 |
const pagedNodes = useMemo(
|
| 1583 |
() => filtered.slice((page - 1) * PAGE_SIZE, page * PAGE_SIZE),
|
|
|
|
| 1585 |
)
|
| 1586 |
|
| 1587 |
const currentLevelKinds = Array.from(new Set(currentLevelNodes.map((node) => node.kind)))
|
| 1588 |
+
const activeFilterCount = [searchQuery.trim(), selectedDomain, selectedTask, selectedCategory, selectedNodeKind].filter(Boolean).length
|
| 1589 |
const currentLevelLabel =
|
| 1590 |
currentNodeId === null
|
| 1591 |
? "Rollout entry level"
|
|
|
|
| 1729 |
onClick={() => {
|
| 1730 |
setSearchQuery("")
|
| 1731 |
setSelectedDomain(null)
|
| 1732 |
+
setSelectedTask(null)
|
| 1733 |
setSelectedCategory(null)
|
| 1734 |
setSelectedNodeKind(null)
|
| 1735 |
}}
|
|
|
|
| 1795 |
</div>
|
| 1796 |
</div>
|
| 1797 |
|
| 1798 |
+
{hierarchy?.stats && (
|
| 1799 |
<div className="flex flex-wrap gap-2 text-sm">
|
| 1800 |
<span className="rounded-full border border-stone-200/80 bg-stone-50/80 px-3 py-1.5 font-medium text-stone-700 dark:border-stone-700/80 dark:bg-stone-900/70 dark:text-stone-200">
|
| 1801 |
{hierarchy.stats.family_count} families
|
|
|
|
| 1919 |
</div>
|
| 1920 |
)}
|
| 1921 |
|
| 1922 |
+
{allTasks.length > 0 && (
|
| 1923 |
+
<div className="mt-4 space-y-1.5">
|
| 1924 |
+
<div className="text-[11px] font-semibold uppercase tracking-[0.2em] text-stone-500 dark:text-stone-400">
|
| 1925 |
+
Task type
|
| 1926 |
+
</div>
|
| 1927 |
+
<div className="flex max-h-40 flex-wrap items-center gap-1.5 overflow-y-auto pr-1">
|
| 1928 |
+
<button
|
| 1929 |
+
type="button"
|
| 1930 |
+
onClick={() => setSelectedTask(null)}
|
| 1931 |
+
className={cn(
|
| 1932 |
+
"shrink-0 rounded-full border px-3 py-1.5 text-xs font-medium transition-colors",
|
| 1933 |
+
selectedTask === null
|
| 1934 |
+
? "border-stone-950 bg-stone-950 text-stone-50 dark:border-stone-100 dark:bg-stone-100 dark:text-stone-950"
|
| 1935 |
+
: "border-stone-200/80 bg-stone-50/80 text-stone-600 hover:bg-stone-100 dark:border-stone-700/80 dark:bg-stone-900/70 dark:text-stone-300 dark:hover:bg-stone-800"
|
| 1936 |
+
)}
|
| 1937 |
+
>
|
| 1938 |
+
All
|
| 1939 |
+
</button>
|
| 1940 |
+
{allTasks.map((task) => (
|
| 1941 |
+
<button
|
| 1942 |
+
key={task}
|
| 1943 |
+
type="button"
|
| 1944 |
+
onClick={() => setSelectedTask(selectedTask === task ? null : task)}
|
| 1945 |
+
className={cn(
|
| 1946 |
+
"shrink-0 rounded-full border px-3 py-1.5 text-xs font-medium transition-colors capitalize",
|
| 1947 |
+
selectedTask === task
|
| 1948 |
+
? "border-emerald-300 bg-emerald-50 text-emerald-800 dark:border-emerald-800 dark:bg-emerald-950/50 dark:text-emerald-200"
|
| 1949 |
+
: "border-stone-200/80 bg-white text-stone-600 hover:bg-stone-50 dark:border-stone-700/80 dark:bg-stone-900 dark:text-stone-300 dark:hover:bg-stone-800"
|
| 1950 |
+
)}
|
| 1951 |
+
>
|
| 1952 |
+
{task}
|
| 1953 |
+
</button>
|
| 1954 |
+
))}
|
| 1955 |
+
</div>
|
| 1956 |
+
</div>
|
| 1957 |
+
)}
|
| 1958 |
+
|
| 1959 |
{allCategories.length > 0 && (
|
| 1960 |
<div className="mt-4 space-y-1.5">
|
| 1961 |
<div className="text-[11px] font-semibold uppercase tracking-[0.2em] text-stone-500 dark:text-stone-400">
|
|
|
|
| 2081 |
{node.title}
|
| 2082 |
</h3>
|
| 2083 |
|
| 2084 |
+
<NodeSignalChips node={node} />
|
| 2085 |
+
|
| 2086 |
{node.description && (
|
| 2087 |
<p className="mb-4 flex-1 text-sm leading-6 text-stone-600 line-clamp-3 dark:text-stone-300">
|
| 2088 |
{node.description}
|
components/benchmark-detail.tsx
CHANGED
|
@@ -15,6 +15,12 @@ import { Collapsible, CollapsibleContent, CollapsibleTrigger } from "@/component
|
|
| 15 |
import { Dialog, DialogContent, DialogDescription, DialogHeader, DialogTitle } from "@/components/ui/dialog"
|
| 16 |
import { Input } from "@/components/ui/input"
|
| 17 |
import { Table, TableBody, TableCell, TableHead, TableHeader, TableRow } from "@/components/ui/table"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
import {
|
| 19 |
DropdownMenu,
|
| 20 |
DropdownMenuContent,
|
|
@@ -259,48 +265,6 @@ function getOrganizationDisplayName(value: string | null | undefined) {
|
|
| 259 |
return normalizeDisplayLabel(value) || "Unknown Organization"
|
| 260 |
}
|
| 261 |
|
| 262 |
-
function getRelationshipDisplayName(value: string | null | undefined) {
|
| 263 |
-
return normalizeDisplayLabel(value?.replace(/_/g, " ")) || "Unknown"
|
| 264 |
-
}
|
| 265 |
-
|
| 266 |
-
/**
|
| 267 |
-
* Short, badge-friendly label for evaluator relationships.
|
| 268 |
-
* Unknown / "other" values fall back to the normalized full name.
|
| 269 |
-
*/
|
| 270 |
-
function getRelationshipShortLabel(value: string | null | undefined) {
|
| 271 |
-
switch ((value ?? "").toLowerCase()) {
|
| 272 |
-
case "first_party":
|
| 273 |
-
return "1st party"
|
| 274 |
-
case "third_party":
|
| 275 |
-
return "3rd party"
|
| 276 |
-
case "collaborative":
|
| 277 |
-
return "Collaborative"
|
| 278 |
-
case "other":
|
| 279 |
-
return "Other"
|
| 280 |
-
default:
|
| 281 |
-
return getRelationshipDisplayName(value)
|
| 282 |
-
}
|
| 283 |
-
}
|
| 284 |
-
|
| 285 |
-
/**
|
| 286 |
-
* Tone classes for the relationship badge so readers can scan first-party
|
| 287 |
-
* vs third-party reports at a glance without reading the text.
|
| 288 |
-
*/
|
| 289 |
-
function getRelationshipBadgeTone(value: string | null | undefined): string {
|
| 290 |
-
switch ((value ?? "").toLowerCase()) {
|
| 291 |
-
case "first_party":
|
| 292 |
-
// Self-reported by the model's developer — caution tone.
|
| 293 |
-
return "border-amber-300 bg-amber-50 text-amber-900 dark:border-amber-900/60 dark:bg-amber-950/40 dark:text-amber-100"
|
| 294 |
-
case "third_party":
|
| 295 |
-
// Independently evaluated — confidence tone.
|
| 296 |
-
return "border-emerald-300 bg-emerald-50 text-emerald-900 dark:border-emerald-900/60 dark:bg-emerald-950/40 dark:text-emerald-100"
|
| 297 |
-
case "collaborative":
|
| 298 |
-
return "border-sky-300 bg-sky-50 text-sky-900 dark:border-sky-900/60 dark:bg-sky-950/40 dark:text-sky-100"
|
| 299 |
-
default:
|
| 300 |
-
return "border-border/70 bg-muted/40 text-muted-foreground"
|
| 301 |
-
}
|
| 302 |
-
}
|
| 303 |
-
|
| 304 |
function getSourceTypeDisplayName(value: string | null | undefined) {
|
| 305 |
return normalizeDisplayLabel(value?.replace(/_/g, " ")) || "Unknown"
|
| 306 |
}
|
|
@@ -1798,6 +1762,10 @@ export function BenchmarkDetail({
|
|
| 1798 |
thirdPartyEvaluations,
|
| 1799 |
}
|
| 1800 |
}, [allEvaluations])
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1801 |
|
| 1802 |
const allCategoryResults = useMemo(
|
| 1803 |
() =>
|
|
@@ -1868,14 +1836,14 @@ export function BenchmarkDetail({
|
|
| 1868 |
}
|
| 1869 |
|
| 1870 |
const reproducibilityCopy =
|
| 1871 |
-
|
| 1872 |
? null
|
| 1873 |
-
:
|
| 1874 |
? "How this model was prompted during testing is not documented. Scores cannot be independently confirmed."
|
| 1875 |
-
:
|
| 1876 |
|
| 1877 |
const comparabilityCopy =
|
| 1878 |
-
|
| 1879 |
? `${benchmarkCount > 0 ? `These results cover ${benchmarkCount} benchmark${benchmarkCount === 1 ? "" : "s"},` : "These results"} but missing prompting details mean apparent score gaps may partly reflect setup differences as well as capability.`
|
| 1880 |
: "Shared benchmark coverage helps, but evaluator choices, benchmark mix, and model size can still limit direct apples-to-apples comparison."
|
| 1881 |
|
|
@@ -1898,9 +1866,10 @@ export function BenchmarkDetail({
|
|
| 1898 |
allCategoryResults,
|
| 1899 |
allEvaluations.length,
|
| 1900 |
reportingStats,
|
|
|
|
|
|
|
| 1901 |
summary.model_info.additional_details?.params_billions,
|
| 1902 |
summary.model_info.name,
|
| 1903 |
-
summary.total_evaluations,
|
| 1904 |
])
|
| 1905 |
|
| 1906 |
const benchmarkGroups = useMemo(
|
|
@@ -3283,6 +3252,14 @@ export function BenchmarkDetail({
|
|
| 3283 |
Mixed scale · renormalized
|
| 3284 |
</span>
|
| 3285 |
)}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3286 |
</div>
|
| 3287 |
|
| 3288 |
{/* Hero: title + developer + stat strip */}
|
|
@@ -4663,6 +4640,10 @@ function AggregatedBenchmarkCard({
|
|
| 4663 |
Score
|
| 4664 |
</div>
|
| 4665 |
<div className="mt-1 text-lg font-semibold tracking-tight">{variant.displayScore}</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4666 |
</div>
|
| 4667 |
|
| 4668 |
<div className="min-w-0">
|
|
@@ -5200,7 +5181,10 @@ function BenchmarkDeepDiveDialogPanel({
|
|
| 5200 |
)}
|
| 5201 |
</div>
|
| 5202 |
</TableCell>
|
| 5203 |
-
<TableCell className="px-4 py-3 text-right align-top font-semibold tabular-nums">
|
|
|
|
|
|
|
|
|
|
| 5204 |
<TableCell className="px-4 py-3 text-right align-top tabular-nums text-muted-foreground">
|
| 5205 |
{(variant.rankPosition != null || resolvedRank)
|
| 5206 |
? `#${resolvedRank?.position ?? variant.rankPosition}${(resolvedRank?.total ?? variant.rankTotal) ? `/${resolvedRank?.total ?? variant.rankTotal}` : ""}`
|
|
@@ -5265,7 +5249,10 @@ function BenchmarkDeepDiveDialogPanel({
|
|
| 5265 |
)}
|
| 5266 |
</div>
|
| 5267 |
</TableCell>
|
| 5268 |
-
<TableCell className="px-4 py-3 text-right align-top font-semibold tabular-nums">
|
|
|
|
|
|
|
|
|
|
| 5269 |
<TableCell className="px-4 py-3 text-right align-top tabular-nums text-muted-foreground">
|
| 5270 |
{(variant.rankPosition != null || resolvedRank)
|
| 5271 |
? `#${resolvedRank?.position ?? variant.rankPosition}${(resolvedRank?.total ?? variant.rankTotal) ? `/${resolvedRank?.total ?? variant.rankTotal}` : ""}`
|
|
@@ -5379,10 +5366,15 @@ function VariantExpandedDetail({
|
|
| 5379 |
<Badge variant="outline" className="font-normal">
|
| 5380 |
{group.title}
|
| 5381 |
</Badge>
|
| 5382 |
-
|
| 5383 |
-
|
| 5384 |
-
|
| 5385 |
-
<
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5386 |
<div className="text-sm text-muted-foreground">{variant.result.metric_config.evaluation_description}</div>
|
| 5387 |
</div>
|
| 5388 |
|
|
|
|
| 15 |
import { Dialog, DialogContent, DialogDescription, DialogHeader, DialogTitle } from "@/components/ui/dialog"
|
| 16 |
import { Input } from "@/components/ui/input"
|
| 17 |
import { Table, TableBody, TableCell, TableHead, TableHeader, TableRow } from "@/components/ui/table"
|
| 18 |
+
import {
|
| 19 |
+
getRelationshipBadgeTone,
|
| 20 |
+
getRelationshipDisplayName,
|
| 21 |
+
getRelationshipShortLabel,
|
| 22 |
+
} from "@/components/signals/provenance-badge"
|
| 23 |
+
import { SignalsRowBadges } from "@/components/signals/signals-row-badges"
|
| 24 |
import {
|
| 25 |
DropdownMenu,
|
| 26 |
DropdownMenuContent,
|
|
|
|
| 265 |
return normalizeDisplayLabel(value) || "Unknown Organization"
|
| 266 |
}
|
| 267 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 268 |
function getSourceTypeDisplayName(value: string | null | undefined) {
|
| 269 |
return normalizeDisplayLabel(value?.replace(/_/g, " ")) || "Unknown"
|
| 270 |
}
|
|
|
|
| 1762 |
thirdPartyEvaluations,
|
| 1763 |
}
|
| 1764 |
}, [allEvaluations])
|
| 1765 |
+
const reproducibilityGapCount =
|
| 1766 |
+
summary.reproducibility_summary?.has_reproducibility_gap_count ?? reportingStats.missingGenerationConfigs
|
| 1767 |
+
const reproducibilityResultsTotal =
|
| 1768 |
+
summary.reproducibility_summary?.results_total ?? summary.total_evaluations
|
| 1769 |
|
| 1770 |
const allCategoryResults = useMemo(
|
| 1771 |
() =>
|
|
|
|
| 1836 |
}
|
| 1837 |
|
| 1838 |
const reproducibilityCopy =
|
| 1839 |
+
reproducibilityGapCount === 0
|
| 1840 |
? null
|
| 1841 |
+
: reproducibilityGapCount === reproducibilityResultsTotal
|
| 1842 |
? "How this model was prompted during testing is not documented. Scores cannot be independently confirmed."
|
| 1843 |
+
: `${reproducibilityGapCount} of ${reproducibilityResultsTotal} reported scores are missing enough setup detail to be re-run as-is.`
|
| 1844 |
|
| 1845 |
const comparabilityCopy =
|
| 1846 |
+
reproducibilityGapCount > 0
|
| 1847 |
? `${benchmarkCount > 0 ? `These results cover ${benchmarkCount} benchmark${benchmarkCount === 1 ? "" : "s"},` : "These results"} but missing prompting details mean apparent score gaps may partly reflect setup differences as well as capability.`
|
| 1848 |
: "Shared benchmark coverage helps, but evaluator choices, benchmark mix, and model size can still limit direct apples-to-apples comparison."
|
| 1849 |
|
|
|
|
| 1866 |
allCategoryResults,
|
| 1867 |
allEvaluations.length,
|
| 1868 |
reportingStats,
|
| 1869 |
+
reproducibilityGapCount,
|
| 1870 |
+
reproducibilityResultsTotal,
|
| 1871 |
summary.model_info.additional_details?.params_billions,
|
| 1872 |
summary.model_info.name,
|
|
|
|
| 1873 |
])
|
| 1874 |
|
| 1875 |
const benchmarkGroups = useMemo(
|
|
|
|
| 3252 |
Mixed scale · renormalized
|
| 3253 |
</span>
|
| 3254 |
)}
|
| 3255 |
+
{reproducibilityGapCount > 0 && (
|
| 3256 |
+
<span
|
| 3257 |
+
className="ml-1 inline-flex items-center rounded-full border border-amber-300 bg-amber-50 px-2 py-0.5 text-[10px] tracking-[0.12em] text-amber-900 dark:border-amber-900/60 dark:bg-amber-950/40 dark:text-amber-100"
|
| 3258 |
+
title={`${reproducibilityGapCount} of ${reproducibilityResultsTotal} reported scores are not fully documented.`}
|
| 3259 |
+
>
|
| 3260 |
+
Setup gaps
|
| 3261 |
+
</span>
|
| 3262 |
+
)}
|
| 3263 |
</div>
|
| 3264 |
|
| 3265 |
{/* Hero: title + developer + stat strip */}
|
|
|
|
| 4640 |
Score
|
| 4641 |
</div>
|
| 4642 |
<div className="mt-1 text-lg font-semibold tracking-tight">{variant.displayScore}</div>
|
| 4643 |
+
<SignalsRowBadges
|
| 4644 |
+
annotations={variant.result.evalcards?.annotations}
|
| 4645 |
+
className="justify-start"
|
| 4646 |
+
/>
|
| 4647 |
</div>
|
| 4648 |
|
| 4649 |
<div className="min-w-0">
|
|
|
|
| 5181 |
)}
|
| 5182 |
</div>
|
| 5183 |
</TableCell>
|
| 5184 |
+
<TableCell className="px-4 py-3 text-right align-top font-semibold tabular-nums">
|
| 5185 |
+
<div>{variant.displayScore}</div>
|
| 5186 |
+
<SignalsRowBadges annotations={variant.result.evalcards?.annotations} />
|
| 5187 |
+
</TableCell>
|
| 5188 |
<TableCell className="px-4 py-3 text-right align-top tabular-nums text-muted-foreground">
|
| 5189 |
{(variant.rankPosition != null || resolvedRank)
|
| 5190 |
? `#${resolvedRank?.position ?? variant.rankPosition}${(resolvedRank?.total ?? variant.rankTotal) ? `/${resolvedRank?.total ?? variant.rankTotal}` : ""}`
|
|
|
|
| 5249 |
)}
|
| 5250 |
</div>
|
| 5251 |
</TableCell>
|
| 5252 |
+
<TableCell className="px-4 py-3 text-right align-top font-semibold tabular-nums">
|
| 5253 |
+
<div>{variant.displayScore}</div>
|
| 5254 |
+
<SignalsRowBadges annotations={variant.result.evalcards?.annotations} />
|
| 5255 |
+
</TableCell>
|
| 5256 |
<TableCell className="px-4 py-3 text-right align-top tabular-nums text-muted-foreground">
|
| 5257 |
{(variant.rankPosition != null || resolvedRank)
|
| 5258 |
? `#${resolvedRank?.position ?? variant.rankPosition}${(resolvedRank?.total ?? variant.rankTotal) ? `/${resolvedRank?.total ?? variant.rankTotal}` : ""}`
|
|
|
|
| 5366 |
<Badge variant="outline" className="font-normal">
|
| 5367 |
{group.title}
|
| 5368 |
</Badge>
|
| 5369 |
+
<Badge variant="secondary" className="font-normal">
|
| 5370 |
+
{variant.displayScore}
|
| 5371 |
+
</Badge>
|
| 5372 |
+
<SignalsRowBadges
|
| 5373 |
+
annotations={variant.result.evalcards?.annotations}
|
| 5374 |
+
className="mt-0 justify-start"
|
| 5375 |
+
hideOnMobile={false}
|
| 5376 |
+
/>
|
| 5377 |
+
</div>
|
| 5378 |
<div className="text-sm text-muted-foreground">{variant.result.metric_config.evaluation_description}</div>
|
| 5379 |
</div>
|
| 5380 |
|
components/benchmark-evaluation-card.tsx
CHANGED
|
@@ -5,6 +5,7 @@ import { useMemo } from "react"
|
|
| 5 |
import { useAudienceMode } from "@/components/audience-mode-provider"
|
| 6 |
import { useRouter } from "next/navigation"
|
| 7 |
import {
|
|
|
|
| 8 |
Award,
|
| 9 |
ChevronDown,
|
| 10 |
ChevronRight,
|
|
@@ -14,6 +15,7 @@ import {
|
|
| 14 |
} from "lucide-react"
|
| 15 |
|
| 16 |
import type { CategoryType } from "@/lib/benchmark-schema"
|
|
|
|
| 17 |
import { getCategoryColor } from "@/lib/benchmark-schema"
|
| 18 |
import type { BenchmarkCard } from "@/lib/benchmark-schema"
|
| 19 |
import { lookupBenchmarkCard } from "@/lib/benchmark-metadata-utils"
|
|
@@ -59,6 +61,9 @@ export type BenchmarkEvaluationCardData = {
|
|
| 59 |
max: number
|
| 60 |
average: number | null
|
| 61 |
}
|
|
|
|
|
|
|
|
|
|
| 62 |
|
| 63 |
top_scores: Array<{
|
| 64 |
benchmark: string
|
|
@@ -262,6 +267,8 @@ export function BenchmarkEvaluationCard({
|
|
| 262 |
const scoreRange = [formatScoreValue(data.score_summary?.min), formatScoreValue(data.score_summary?.max)]
|
| 263 |
.filter((value): value is string => Boolean(value))
|
| 264 |
.join(" to ")
|
|
|
|
|
|
|
| 265 |
|
| 266 |
return (
|
| 267 |
<Card
|
|
@@ -297,6 +304,12 @@ export function BenchmarkEvaluationCard({
|
|
| 297 |
{paramsBillions && <Badge variant="secondary">{paramsBillions} parameters</Badge>}
|
| 298 |
<Badge variant="outline">{data.benchmarks_count} benchmark suites</Badge>
|
| 299 |
<Badge variant="outline">{data.evaluations_count} reported results</Badge>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 300 |
</div>
|
| 301 |
</div>
|
| 302 |
|
|
@@ -447,6 +460,12 @@ export function BenchmarkEvaluationCard({
|
|
| 447 |
{data.source_types.length > 0 && (
|
| 448 |
<KeyValueRow label="Artifact type" value={data.source_types.map((s) => s.replace(/_/g, " ")).join(", ")} />
|
| 449 |
)}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 450 |
</div>
|
| 451 |
</CollapsibleContent>
|
| 452 |
</Collapsible>
|
|
|
|
| 5 |
import { useAudienceMode } from "@/components/audience-mode-provider"
|
| 6 |
import { useRouter } from "next/navigation"
|
| 7 |
import {
|
| 8 |
+
AlertTriangle,
|
| 9 |
Award,
|
| 10 |
ChevronDown,
|
| 11 |
ChevronRight,
|
|
|
|
| 15 |
} from "lucide-react"
|
| 16 |
|
| 17 |
import type { CategoryType } from "@/lib/benchmark-schema"
|
| 18 |
+
import type { SignalSummaries } from "@/lib/backend-artifacts"
|
| 19 |
import { getCategoryColor } from "@/lib/benchmark-schema"
|
| 20 |
import type { BenchmarkCard } from "@/lib/benchmark-schema"
|
| 21 |
import { lookupBenchmarkCard } from "@/lib/benchmark-metadata-utils"
|
|
|
|
| 61 |
max: number
|
| 62 |
average: number | null
|
| 63 |
}
|
| 64 |
+
reproducibility_summary?: SignalSummaries["reproducibility_summary"]
|
| 65 |
+
provenance_summary?: SignalSummaries["provenance_summary"]
|
| 66 |
+
comparability_summary?: SignalSummaries["comparability_summary"]
|
| 67 |
|
| 68 |
top_scores: Array<{
|
| 69 |
benchmark: string
|
|
|
|
| 267 |
const scoreRange = [formatScoreValue(data.score_summary?.min), formatScoreValue(data.score_summary?.max)]
|
| 268 |
.filter((value): value is string => Boolean(value))
|
| 269 |
.join(" to ")
|
| 270 |
+
const reproducibilityGapCount = data.reproducibility_summary?.has_reproducibility_gap_count ?? 0
|
| 271 |
+
const reproducibilityTotal = data.reproducibility_summary?.results_total ?? data.evaluations_count
|
| 272 |
|
| 273 |
return (
|
| 274 |
<Card
|
|
|
|
| 304 |
{paramsBillions && <Badge variant="secondary">{paramsBillions} parameters</Badge>}
|
| 305 |
<Badge variant="outline">{data.benchmarks_count} benchmark suites</Badge>
|
| 306 |
<Badge variant="outline">{data.evaluations_count} reported results</Badge>
|
| 307 |
+
{reproducibilityGapCount > 0 && (
|
| 308 |
+
<Badge className="border-amber-300 bg-amber-50 text-amber-900 hover:bg-amber-50 dark:border-amber-900/60 dark:bg-amber-950/40 dark:text-amber-100">
|
| 309 |
+
<AlertTriangle className="h-3 w-3" />
|
| 310 |
+
{reproducibilityGapCount} setup gaps
|
| 311 |
+
</Badge>
|
| 312 |
+
)}
|
| 313 |
</div>
|
| 314 |
</div>
|
| 315 |
|
|
|
|
| 460 |
{data.source_types.length > 0 && (
|
| 461 |
<KeyValueRow label="Artifact type" value={data.source_types.map((s) => s.replace(/_/g, " ")).join(", ")} />
|
| 462 |
)}
|
| 463 |
+
{reproducibilityGapCount > 0 && (
|
| 464 |
+
<KeyValueRow
|
| 465 |
+
label="Re-runnability"
|
| 466 |
+
value={`${reproducibilityGapCount} of ${reproducibilityTotal} reported scores are not fully documented`}
|
| 467 |
+
/>
|
| 468 |
+
)}
|
| 469 |
</div>
|
| 470 |
</CollapsibleContent>
|
| 471 |
</Collapsible>
|
components/eval-card.tsx
CHANGED
|
@@ -79,6 +79,11 @@ export function EvalCard({ summary, delayMs = 0 }: EvalCardProps) {
|
|
| 79 |
const domainPreview = domains.slice(0, 2)
|
| 80 |
// Source provenance pulled from the pipeline's source_data
|
| 81 |
const sourceData = summary.source_data
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
const datasetName = sourceData?.dataset_name
|
| 83 |
const datasetUrl =
|
| 84 |
sourceData?.dataset_url ??
|
|
@@ -129,10 +134,10 @@ export function EvalCard({ summary, delayMs = 0 }: EvalCardProps) {
|
|
| 129 |
Independently evaluated
|
| 130 |
</Badge>
|
| 131 |
)}
|
| 132 |
-
{
|
| 133 |
<Badge className="bg-amber-500 text-amber-950 hover:bg-amber-500">
|
| 134 |
<AlertTriangle className="mr-1 h-3 w-3" />
|
| 135 |
-
|
| 136 |
</Badge>
|
| 137 |
)}
|
| 138 |
</div>
|
|
@@ -182,8 +187,8 @@ export function EvalCard({ summary, delayMs = 0 }: EvalCardProps) {
|
|
| 182 |
<DataRow
|
| 183 |
label="Config"
|
| 184 |
value={
|
| 185 |
-
|
| 186 |
-
? `${
|
| 187 |
: "Fully documented"
|
| 188 |
}
|
| 189 |
/>
|
|
@@ -245,9 +250,9 @@ export function EvalCard({ summary, delayMs = 0 }: EvalCardProps) {
|
|
| 245 |
<div className="space-y-1.5 text-sm">
|
| 246 |
<DataRow label="Avg score" value={scorePercent} />
|
| 247 |
<DataRow label="Reported by" value={summary.evaluator_names.join(", ") || "Unknown"} />
|
| 248 |
-
{
|
| 249 |
<p className="pt-1 text-xs text-muted-foreground">
|
| 250 |
-
|
| 251 |
</p>
|
| 252 |
)}
|
| 253 |
</div>
|
|
|
|
| 79 |
const domainPreview = domains.slice(0, 2)
|
| 80 |
// Source provenance pulled from the pipeline's source_data
|
| 81 |
const sourceData = summary.source_data
|
| 82 |
+
const reproducibilitySummary = summary.reproducibility_summary
|
| 83 |
+
const reproducibilityGapCount =
|
| 84 |
+
reproducibilitySummary?.has_reproducibility_gap_count ?? summary.missing_generation_config_count
|
| 85 |
+
const reproducibilityResultsTotal =
|
| 86 |
+
reproducibilitySummary?.results_total ?? summary.models_count
|
| 87 |
const datasetName = sourceData?.dataset_name
|
| 88 |
const datasetUrl =
|
| 89 |
sourceData?.dataset_url ??
|
|
|
|
| 134 |
Independently evaluated
|
| 135 |
</Badge>
|
| 136 |
)}
|
| 137 |
+
{reproducibilityGapCount > 0 && (
|
| 138 |
<Badge className="bg-amber-500 text-amber-950 hover:bg-amber-500">
|
| 139 |
<AlertTriangle className="mr-1 h-3 w-3" />
|
| 140 |
+
Documentation gaps
|
| 141 |
</Badge>
|
| 142 |
)}
|
| 143 |
</div>
|
|
|
|
| 187 |
<DataRow
|
| 188 |
label="Config"
|
| 189 |
value={
|
| 190 |
+
reproducibilityGapCount > 0
|
| 191 |
+
? `${reproducibilityGapCount} of ${reproducibilityResultsTotal} scores have setup gaps`
|
| 192 |
: "Fully documented"
|
| 193 |
}
|
| 194 |
/>
|
|
|
|
| 250 |
<div className="space-y-1.5 text-sm">
|
| 251 |
<DataRow label="Avg score" value={scorePercent} />
|
| 252 |
<DataRow label="Reported by" value={summary.evaluator_names.join(", ") || "Unknown"} />
|
| 253 |
+
{reproducibilityGapCount > 0 && (
|
| 254 |
<p className="pt-1 text-xs text-muted-foreground">
|
| 255 |
+
{reproducibilityGapCount} of {reproducibilityResultsTotal} reported scores are not fully documented.
|
| 256 |
</p>
|
| 257 |
)}
|
| 258 |
</div>
|
components/eval-detail.tsx
CHANGED
|
@@ -5,8 +5,22 @@ import { Fragment, useEffect, useMemo, useState } from "react"
|
|
| 5 |
import Link from "next/link"
|
| 6 |
import { Badge } from "@/components/ui/badge"
|
| 7 |
import { Button } from "@/components/ui/button"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
import { Card, CardContent, CardDescription, CardHeader, CardTitle } from "@/components/ui/card"
|
| 9 |
import { Collapsible, CollapsibleContent, CollapsibleTrigger } from "@/components/ui/collapsible"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
import {
|
| 11 |
DropdownMenu,
|
| 12 |
DropdownMenuCheckboxItem,
|
|
@@ -32,9 +46,11 @@ import {
|
|
| 32 |
Globe,
|
| 33 |
Medal,
|
| 34 |
Scale,
|
|
|
|
| 35 |
Shield,
|
| 36 |
SlidersHorizontal,
|
| 37 |
Tag,
|
|
|
|
| 38 |
} from "lucide-react"
|
| 39 |
import type { BenchmarkCard } from "@/lib/benchmark-schema"
|
| 40 |
import type { BenchmarkEvalSummary, ModelResultForBenchmark } from "@/lib/eval-processing"
|
|
@@ -53,6 +69,212 @@ interface LeaderboardRow {
|
|
| 53 |
type LeaderboardMetric = NonNullable<BenchmarkEvalSummary["leaderboard_metrics"]>[number]
|
| 54 |
type LeaderboardMatrixRow = NonNullable<BenchmarkEvalSummary["leaderboard_rows"]>[number]
|
| 55 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
const PARAM_RANGE_VALUES = [1, 2, 3, 4, 6, 8, 10, 12, 16, 24, 32, 48, 64, 96, 128, 192, 256, 384, 500] as const
|
| 57 |
const PARAM_RANGE_MARKERS = [
|
| 58 |
{ label: "< 1B", step: 0 },
|
|
@@ -400,6 +622,11 @@ export function EvalDetail({ summary }: EvalDetailProps) {
|
|
| 400 |
: summary.is_aggregated
|
| 401 |
? "Averaged model results across the contributing composite benchmarks, with drill-down to each component score."
|
| 402 |
: "Model results with benchmark context, source dataset detail, and optional instance-data links."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 403 |
|
| 404 |
const toggleRow = (key: string) =>
|
| 405 |
setExpandedRows((current) => ({
|
|
@@ -430,6 +657,15 @@ export function EvalDetail({ summary }: EvalDetailProps) {
|
|
| 430 |
? `${summary.metrics_count ?? summary.leaderboard_metrics?.length ?? 1} measures`
|
| 431 |
: `${summary.metrics_count ?? 1} ${(summary.metrics_count ?? 1) === 1 ? "measure" : "measures"}`}
|
| 432 |
</Badge>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 433 |
</div>
|
| 434 |
</div>
|
| 435 |
{overviewOpen ? (
|
|
@@ -580,6 +816,12 @@ export function EvalDetail({ summary }: EvalDetailProps) {
|
|
| 580 |
</dl>
|
| 581 |
</div>
|
| 582 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 583 |
{!hasMultiMetricLeaderboard && (summary.root_metrics?.length || summary.subtasks?.length) ? (
|
| 584 |
<section className="rounded-2xl border bg-muted/5 p-3.5">
|
| 585 |
<div className="space-y-1">
|
|
@@ -812,10 +1054,14 @@ export function EvalDetail({ summary }: EvalDetailProps) {
|
|
| 812 |
const samples = Array.isArray(modelResult.source_data)
|
| 813 |
? undefined
|
| 814 |
: modelResult.source_data.samples_number
|
|
|
|
| 815 |
|
| 816 |
return (
|
| 817 |
<Fragment key={key}>
|
| 818 |
-
<TableRow
|
|
|
|
|
|
|
|
|
|
| 819 |
<TableCell className="px-4">
|
| 820 |
<div
|
| 821 |
className={cn(
|
|
@@ -868,6 +1114,7 @@ export function EvalDetail({ summary }: EvalDetailProps) {
|
|
| 868 |
|
| 869 |
<TableCell className="text-right">
|
| 870 |
<div className="text-xl font-semibold tabular-nums">{formatRawScore(modelResult.score, summary.metric_config.unit)}</div>
|
|
|
|
| 871 |
</TableCell>
|
| 872 |
|
| 873 |
{isResearchView ? (
|
|
@@ -997,6 +1244,8 @@ export function EvalDetail({ summary }: EvalDetailProps) {
|
|
| 997 |
)}
|
| 998 |
</DetailPanel>
|
| 999 |
|
|
|
|
|
|
|
| 1000 |
<DetailPanel
|
| 1001 |
title={isResearchView ? "Score Breakdown" : "Metric Summary"}
|
| 1002 |
subtitle={
|
|
@@ -1183,7 +1432,14 @@ function MultiMetricLeaderboard({
|
|
| 1183 |
const leaderboardMetrics = summary.leaderboard_metrics ?? []
|
| 1184 |
const leaderboardRows = summary.leaderboard_rows ?? []
|
| 1185 |
const allMetricKeys = useMemo(() => leaderboardMetrics.map((metric) => metric.column_key), [leaderboardMetrics])
|
| 1186 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1187 |
const maxParamStepIndex = PARAM_RANGE_VALUES.length - 1
|
| 1188 |
const leaderboardMetricMap = useMemo(
|
| 1189 |
() => new Map(leaderboardMetrics.map((metric) => [metric.column_key, metric])),
|
|
@@ -1333,8 +1589,8 @@ function MultiMetricLeaderboard({
|
|
| 1333 |
}, [maxParamStep, minParamStep, sortDirection, sortKey])
|
| 1334 |
|
| 1335 |
useEffect(() => {
|
| 1336 |
-
setVisibleMetricKeys(
|
| 1337 |
-
}, [
|
| 1338 |
|
| 1339 |
useEffect(() => {
|
| 1340 |
setActiveSubtaskTab("all")
|
|
@@ -1521,30 +1777,11 @@ function MultiMetricLeaderboard({
|
|
| 1521 |
<CardContent className="p-0">
|
| 1522 |
{hasSubtaskTabs && (
|
| 1523 |
<div className="border-b bg-background px-5 py-3 sm:px-6">
|
| 1524 |
-
<
|
| 1525 |
-
|
| 1526 |
-
|
| 1527 |
-
|
| 1528 |
-
|
| 1529 |
-
type="button"
|
| 1530 |
-
size="sm"
|
| 1531 |
-
variant={activeSubtaskTab === "all" ? "default" : "outline"}
|
| 1532 |
-
onClick={() => setActiveSubtaskTab("all")}
|
| 1533 |
-
>
|
| 1534 |
-
All slices
|
| 1535 |
-
</Button>
|
| 1536 |
-
{singleMetricSubtaskTabs.map((tab) => (
|
| 1537 |
-
<Button
|
| 1538 |
-
key={tab.key}
|
| 1539 |
-
type="button"
|
| 1540 |
-
size="sm"
|
| 1541 |
-
variant={activeSubtaskTab === tab.key ? "default" : "outline"}
|
| 1542 |
-
onClick={() => setActiveSubtaskTab(tab.key)}
|
| 1543 |
-
>
|
| 1544 |
-
{tab.label}
|
| 1545 |
-
</Button>
|
| 1546 |
-
))}
|
| 1547 |
-
</div>
|
| 1548 |
</div>
|
| 1549 |
)}
|
| 1550 |
|
|
@@ -1739,6 +1976,12 @@ function MultiMetricLeaderboard({
|
|
| 1739 |
)}
|
| 1740 |
<span className="lg:hidden">{row.model_info.developer ?? "Unknown developer"}</span>
|
| 1741 |
</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1742 |
</div>
|
| 1743 |
</TableCell>
|
| 1744 |
|
|
@@ -1754,6 +1997,7 @@ function MultiMetricLeaderboard({
|
|
| 1754 |
|
| 1755 |
{visibleMetrics.map((metric) => {
|
| 1756 |
const score = row.values[metric.column_key]
|
|
|
|
| 1757 |
return (
|
| 1758 |
<TableCell
|
| 1759 |
key={metric.column_key}
|
|
@@ -1762,7 +2006,8 @@ function MultiMetricLeaderboard({
|
|
| 1762 |
!isNumericScore(score) && "text-muted-foreground"
|
| 1763 |
)}
|
| 1764 |
>
|
| 1765 |
-
{isNumericScore(score) ? formatRawScore(score, metric.unit) : "—"}
|
|
|
|
| 1766 |
</TableCell>
|
| 1767 |
)
|
| 1768 |
})}
|
|
|
|
| 5 |
import Link from "next/link"
|
| 6 |
import { Badge } from "@/components/ui/badge"
|
| 7 |
import { Button } from "@/components/ui/button"
|
| 8 |
+
import { CompletenessPanel } from "@/components/signals/completeness-panel"
|
| 9 |
+
import { ComparabilityPanel } from "@/components/signals/comparability-panel"
|
| 10 |
+
import { ReproducibilityPanel } from "@/components/signals/reproducibility-panel"
|
| 11 |
+
import { SignalsRowBadges } from "@/components/signals/signals-row-badges"
|
| 12 |
+
import { SignalTooltip } from "@/components/signals/signal-tooltip"
|
| 13 |
+
import { getCompletenessPopulatedCount } from "@/components/signals/signal-utils"
|
| 14 |
import { Card, CardContent, CardDescription, CardHeader, CardTitle } from "@/components/ui/card"
|
| 15 |
import { Collapsible, CollapsibleContent, CollapsibleTrigger } from "@/components/ui/collapsible"
|
| 16 |
+
import {
|
| 17 |
+
Dialog,
|
| 18 |
+
DialogContent,
|
| 19 |
+
DialogDescription,
|
| 20 |
+
DialogHeader,
|
| 21 |
+
DialogTitle,
|
| 22 |
+
} from "@/components/ui/dialog"
|
| 23 |
+
import { Input } from "@/components/ui/input"
|
| 24 |
import {
|
| 25 |
DropdownMenu,
|
| 26 |
DropdownMenuCheckboxItem,
|
|
|
|
| 46 |
Globe,
|
| 47 |
Medal,
|
| 48 |
Scale,
|
| 49 |
+
Search,
|
| 50 |
Shield,
|
| 51 |
SlidersHorizontal,
|
| 52 |
Tag,
|
| 53 |
+
X,
|
| 54 |
} from "lucide-react"
|
| 55 |
import type { BenchmarkCard } from "@/lib/benchmark-schema"
|
| 56 |
import type { BenchmarkEvalSummary, ModelResultForBenchmark } from "@/lib/eval-processing"
|
|
|
|
| 69 |
type LeaderboardMetric = NonNullable<BenchmarkEvalSummary["leaderboard_metrics"]>[number]
|
| 70 |
type LeaderboardMatrixRow = NonNullable<BenchmarkEvalSummary["leaderboard_rows"]>[number]
|
| 71 |
|
| 72 |
+
/**
 * Choose one annotation bundle to represent an entire leaderboard matrix row.
 *
 * Walks `visibleMetrics` in display order and returns the first truthy entry
 * stored in `row.annotations_by_metric` under that metric's `column_key`. The
 * matrix view feeds the result to its row-level badge strip, so signals that
 * tend to repeat across a row's metrics can be shown once for the row.
 *
 * @param row - Leaderboard matrix row whose per-metric annotations are searched.
 * @param visibleMetrics - Metrics currently displayed, in column order.
 * @returns The first truthy annotation entry, or `null` when the row carries
 *   no `annotations_by_metric` map or none of the visible metrics has an entry.
 */
function getRowLevelAnnotations(
  row: LeaderboardMatrixRow,
  visibleMetrics: LeaderboardMetric[]
) {
  const lookup = row.annotations_by_metric

  if (lookup) {
    for (const { column_key: metricKey } of visibleMetrics) {
      const candidate = lookup[metricKey]
      if (candidate) {
        return candidate
      }
    }
  }

  return null
}
|
| 98 |
+
|
| 99 |
+
/**
 * Largest slice count that is still rendered inline as pills. Benchmarks with
 * more slices than this get the searchable browser dialog instead.
 */
const SLICE_PILL_THRESHOLD = 5

/** A selectable benchmark slice. */
interface SliceTab {
  /** Value passed to `onChange` and compared against `activeSubtaskTab`. */
  key: string
  /** Text shown on pills and dialog rows; also what the dialog search matches. */
  label: string
}

/**
 * Slice picker that adapts to slice count.
 *
 * - <= SLICE_PILL_THRESHOLD: render every slice as a pill.
 * - > SLICE_PILL_THRESHOLD: render "All slices", a pill for the currently
 *   selected slice (if any), and a button that opens a searchable dialog
 *   listing every slice, so benchmarks with hundreds of slices stay usable.
 *
 * The literal key `"all"` means "no slice filter".
 *
 * @param activeSubtaskTab - Key of the selected slice, or `"all"`.
 * @param onChange - Called with the chosen slice key, or `"all"` to clear the filter.
 * @param tabs - Every slice that can be selected.
 */
function SliceSelector({
  activeSubtaskTab,
  onChange,
  tabs,
}: {
  activeSubtaskTab: string
  onChange: (key: string) => void
  tabs: SliceTab[]
}) {
  const [browserOpen, setBrowserOpen] = useState(false)
  const [search, setSearch] = useState("")

  const useBrowser = tabs.length > SLICE_PILL_THRESHOLD
  const activeTab = tabs.find((tab) => tab.key === activeSubtaskTab)

  // Case-insensitive substring match on the label; an empty query keeps every slice.
  const filteredTabs = useMemo(() => {
    const query = search.trim().toLowerCase()
    if (!query) return tabs
    return tabs.filter((tab) => tab.label.toLowerCase().includes(query))
  }, [search, tabs])

  // Closing always resets the query. Rows close the dialog by setting state
  // directly, which does not go through the Dialog's `onOpenChange`, so the
  // reset has to live here rather than only in that callback; otherwise the
  // dialog reopens with the previous search still applied.
  const closeBrowser = () => {
    setBrowserOpen(false)
    setSearch("")
  }

  // Apply a choice made inside the dialog, then dismiss it.
  const selectFromBrowser = (key: string) => {
    onChange(key)
    closeBrowser()
  }

  if (!useBrowser) {
    return (
      <div>
        <div className="mb-2 text-[11px] font-semibold uppercase tracking-[0.16em] text-muted-foreground">
          Benchmark slices
        </div>
        <div className="flex flex-wrap gap-2">
          <Button
            type="button"
            size="sm"
            variant={activeSubtaskTab === "all" ? "default" : "outline"}
            aria-pressed={activeSubtaskTab === "all"}
            onClick={() => onChange("all")}
          >
            All slices
          </Button>
          {tabs.map((tab) => (
            <Button
              key={tab.key}
              type="button"
              size="sm"
              variant={activeSubtaskTab === tab.key ? "default" : "outline"}
              aria-pressed={activeSubtaskTab === tab.key}
              onClick={() => onChange(tab.key)}
            >
              {tab.label}
            </Button>
          ))}
        </div>
      </div>
    )
  }

  return (
    <div>
      <div className="mb-2 flex items-center justify-between gap-2">
        <div className="text-[11px] font-semibold uppercase tracking-[0.16em] text-muted-foreground">
          Benchmark slices
        </div>
        <span className="text-xs text-muted-foreground">{tabs.length} total</span>
      </div>
      <div className="flex flex-wrap items-center gap-2">
        <Button
          type="button"
          size="sm"
          variant={activeSubtaskTab === "all" ? "default" : "outline"}
          aria-pressed={activeSubtaskTab === "all"}
          onClick={() => onChange("all")}
        >
          All slices
        </Button>
        {activeTab && (
          <Button
            type="button"
            size="sm"
            variant="default"
            aria-pressed
            onClick={() => onChange("all")}
            className="max-w-[18rem] truncate"
            title={`Active: ${activeTab.label}. Click to clear.`}
          >
            {/*
              The label gets its own shrinkable, truncating span (same pattern
              as the dialog rows below) so a long name ends in an ellipsis and
              the clear icon stays visible.
              NOTE(review): assumes Button lays its children out with flex — confirm.
            */}
            <span className="min-w-0 truncate">{activeTab.label}</span>
            <X className="ml-1.5 h-3 w-3 shrink-0" />
          </Button>
        )}
        <Button
          type="button"
          size="sm"
          variant="outline"
          onClick={() => setBrowserOpen(true)}
          className="gap-1.5"
        >
          <Search className="h-3.5 w-3.5" />
          {activeTab ? "Change slice" : `Browse ${tabs.length} slices`}
        </Button>
      </div>

      <Dialog
        open={browserOpen}
        onOpenChange={(open) => {
          if (open) {
            setBrowserOpen(true)
          } else {
            closeBrowser()
          }
        }}
      >
        <DialogContent className="max-w-2xl">
          <DialogHeader>
            <DialogTitle>Browse benchmark slices</DialogTitle>
            <DialogDescription>
              {tabs.length} slices in this benchmark. Pick one to filter the leaderboard,
              or close to keep showing all slices.
            </DialogDescription>
          </DialogHeader>

          <Input
            value={search}
            onChange={(event) => setSearch(event.target.value)}
            placeholder="Search slices..."
            aria-label="Search slices"
            autoFocus
          />

          <div className="max-h-[60vh] overflow-y-auto rounded-md border">
            {/* The "no filter" row is always offered, regardless of the search query. */}
            <button
              type="button"
              aria-pressed={activeSubtaskTab === "all"}
              onClick={() => selectFromBrowser("all")}
              className={cn(
                "flex w-full items-center justify-between border-b px-4 py-2.5 text-left text-sm transition-colors hover:bg-muted/40",
                activeSubtaskTab === "all" && "bg-muted/40 font-semibold"
              )}
            >
              <span>All slices (no filter)</span>
              {activeSubtaskTab === "all" && <span className="text-xs text-muted-foreground">selected</span>}
            </button>
            {filteredTabs.length === 0 ? (
              <div className="px-4 py-6 text-center text-sm text-muted-foreground">
                No slices match "{search}".
              </div>
            ) : (
              filteredTabs.map((tab) => (
                <button
                  key={tab.key}
                  type="button"
                  aria-pressed={activeSubtaskTab === tab.key}
                  onClick={() => selectFromBrowser(tab.key)}
                  className={cn(
                    "flex w-full items-center justify-between border-b px-4 py-2 text-left text-sm transition-colors hover:bg-muted/40 last:border-b-0",
                    activeSubtaskTab === tab.key && "bg-muted/40 font-semibold"
                  )}
                >
                  <span className="min-w-0 truncate pr-2">{tab.label}</span>
                  {activeSubtaskTab === tab.key && (
                    <span className="shrink-0 text-xs text-muted-foreground">selected</span>
                  )}
                </button>
              ))
            )}
          </div>
        </DialogContent>
      </Dialog>
    </div>
  )
}
|
| 277 |
+
|
| 278 |
const PARAM_RANGE_VALUES = [1, 2, 3, 4, 6, 8, 10, 12, 16, 24, 32, 48, 64, 96, 128, 192, 256, 384, 500] as const
|
| 279 |
const PARAM_RANGE_MARKERS = [
|
| 280 |
{ label: "< 1B", step: 0 },
|
|
|
|
| 622 |
: summary.is_aggregated
|
| 623 |
? "Averaged model results across the contributing composite benchmarks, with drill-down to each component score."
|
| 624 |
: "Model results with benchmark context, source dataset detail, and optional instance-data links."
|
| 625 |
+
const reportingCompleteness = summary.evalcards?.annotations?.reporting_completeness
|
| 626 |
+
const benchmarkComparability = summary.evalcards?.annotations?.benchmark_comparability
|
| 627 |
+
const documentationPopulatedCount = reportingCompleteness
|
| 628 |
+
? getCompletenessPopulatedCount(reportingCompleteness)
|
| 629 |
+
: null
|
| 630 |
|
| 631 |
const toggleRow = (key: string) =>
|
| 632 |
setExpandedRows((current) => ({
|
|
|
|
| 657 |
? `${summary.metrics_count ?? summary.leaderboard_metrics?.length ?? 1} measures`
|
| 658 |
: `${summary.metrics_count ?? 1} ${(summary.metrics_count ?? 1) === 1 ? "measure" : "measures"}`}
|
| 659 |
</Badge>
|
| 660 |
+
{reportingCompleteness && (
|
| 661 |
+
<SignalTooltip
|
| 662 |
+
content={`${documentationPopulatedCount} of ${reportingCompleteness.total_fields_evaluated} EvalCards documentation fields populated for this benchmark.`}
|
| 663 |
+
>
|
| 664 |
+
<Badge variant="outline" className="border-emerald-200 bg-emerald-50 text-emerald-800 dark:border-emerald-900/50 dark:bg-emerald-950/30 dark:text-emerald-200">
|
| 665 |
+
Documentation {Math.round(reportingCompleteness.completeness_score * 100)}%
|
| 666 |
+
</Badge>
|
| 667 |
+
</SignalTooltip>
|
| 668 |
+
)}
|
| 669 |
</div>
|
| 670 |
</div>
|
| 671 |
{overviewOpen ? (
|
|
|
|
| 816 |
</dl>
|
| 817 |
</div>
|
| 818 |
|
| 819 |
+
<CompletenessPanel completeness={reportingCompleteness} />
|
| 820 |
+
<ComparabilityPanel
|
| 821 |
+
comparability={benchmarkComparability}
|
| 822 |
+
summary={summary.comparability_summary}
|
| 823 |
+
/>
|
| 824 |
+
|
| 825 |
{!hasMultiMetricLeaderboard && (summary.root_metrics?.length || summary.subtasks?.length) ? (
|
| 826 |
<section className="rounded-2xl border bg-muted/5 p-3.5">
|
| 827 |
<div className="space-y-1">
|
|
|
|
| 1054 |
const samples = Array.isArray(modelResult.source_data)
|
| 1055 |
? undefined
|
| 1056 |
: modelResult.source_data.samples_number
|
| 1057 |
+
const rowAnnotations = modelResult.result.evalcards?.annotations
|
| 1058 |
|
| 1059 |
return (
|
| 1060 |
<Fragment key={key}>
|
| 1061 |
+
<TableRow
|
| 1062 |
+
id={modelResult.model_route_id ? `row-${modelResult.model_route_id}` : undefined}
|
| 1063 |
+
className={cn("group", isExpanded && "bg-muted/15")}
|
| 1064 |
+
>
|
| 1065 |
<TableCell className="px-4">
|
| 1066 |
<div
|
| 1067 |
className={cn(
|
|
|
|
| 1114 |
|
| 1115 |
<TableCell className="text-right">
|
| 1116 |
<div className="text-xl font-semibold tabular-nums">{formatRawScore(modelResult.score, summary.metric_config.unit)}</div>
|
| 1117 |
+
<SignalsRowBadges annotations={rowAnnotations} />
|
| 1118 |
</TableCell>
|
| 1119 |
|
| 1120 |
{isResearchView ? (
|
|
|
|
| 1244 |
)}
|
| 1245 |
</DetailPanel>
|
| 1246 |
|
| 1247 |
+
<ReproducibilityPanel gap={rowAnnotations?.reproducibility_gap} />
|
| 1248 |
+
|
| 1249 |
<DetailPanel
|
| 1250 |
title={isResearchView ? "Score Breakdown" : "Metric Summary"}
|
| 1251 |
subtitle={
|
|
|
|
| 1432 |
const leaderboardMetrics = summary.leaderboard_metrics ?? []
|
| 1433 |
const leaderboardRows = summary.leaderboard_rows ?? []
|
| 1434 |
const allMetricKeys = useMemo(() => leaderboardMetrics.map((metric) => metric.column_key), [leaderboardMetrics])
|
| 1435 |
+
// Cap default visible columns to avoid hangs on benchmarks with hundreds of metrics
|
| 1436 |
+
// (e.g. helm_air_bench has 374 subtask×metric pairs). Users can opt in to more.
|
| 1437 |
+
const DEFAULT_VISIBLE_METRIC_CAP = 24
|
| 1438 |
+
const defaultVisibleMetricKeys = useMemo(
|
| 1439 |
+
() => allMetricKeys.slice(0, DEFAULT_VISIBLE_METRIC_CAP),
|
| 1440 |
+
[allMetricKeys]
|
| 1441 |
+
)
|
| 1442 |
+
const [visibleMetricKeys, setVisibleMetricKeys] = useState<string[]>(() => defaultVisibleMetricKeys)
|
| 1443 |
const maxParamStepIndex = PARAM_RANGE_VALUES.length - 1
|
| 1444 |
const leaderboardMetricMap = useMemo(
|
| 1445 |
() => new Map(leaderboardMetrics.map((metric) => [metric.column_key, metric])),
|
|
|
|
| 1589 |
}, [maxParamStep, minParamStep, sortDirection, sortKey])
|
| 1590 |
|
| 1591 |
useEffect(() => {
|
| 1592 |
+
setVisibleMetricKeys(defaultVisibleMetricKeys)
|
| 1593 |
+
}, [defaultVisibleMetricKeys, summary.evaluation_id])
|
| 1594 |
|
| 1595 |
useEffect(() => {
|
| 1596 |
setActiveSubtaskTab("all")
|
|
|
|
| 1777 |
<CardContent className="p-0">
|
| 1778 |
{hasSubtaskTabs && (
|
| 1779 |
<div className="border-b bg-background px-5 py-3 sm:px-6">
|
| 1780 |
+
<SliceSelector
|
| 1781 |
+
activeSubtaskTab={activeSubtaskTab}
|
| 1782 |
+
onChange={setActiveSubtaskTab}
|
| 1783 |
+
tabs={singleMetricSubtaskTabs}
|
| 1784 |
+
/>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1785 |
</div>
|
| 1786 |
)}
|
| 1787 |
|
|
|
|
| 1976 |
)}
|
| 1977 |
<span className="lg:hidden">{row.model_info.developer ?? "Unknown developer"}</span>
|
| 1978 |
</div>
|
| 1979 |
+
<SignalsRowBadges
|
| 1980 |
+
annotations={getRowLevelAnnotations(row, visibleMetrics)}
|
| 1981 |
+
variant="row"
|
| 1982 |
+
className="mt-1 justify-start"
|
| 1983 |
+
hideOnMobile={false}
|
| 1984 |
+
/>
|
| 1985 |
</div>
|
| 1986 |
</TableCell>
|
| 1987 |
|
|
|
|
| 1997 |
|
| 1998 |
{visibleMetrics.map((metric) => {
|
| 1999 |
const score = row.values[metric.column_key]
|
| 2000 |
+
const annotations = row.annotations_by_metric?.[metric.column_key]
|
| 2001 |
return (
|
| 2002 |
<TableCell
|
| 2003 |
key={metric.column_key}
|
|
|
|
| 2006 |
!isNumericScore(score) && "text-muted-foreground"
|
| 2007 |
)}
|
| 2008 |
>
|
| 2009 |
+
<div>{isNumericScore(score) ? formatRawScore(score, metric.unit) : "—"}</div>
|
| 2010 |
+
<SignalsRowBadges annotations={annotations} variant="cell" />
|
| 2011 |
</TableCell>
|
| 2012 |
)
|
| 2013 |
})}
|
components/model-compare-dialog.tsx
CHANGED
|
@@ -133,6 +133,7 @@ const CONTEXT_ROWS = [
|
|
| 133 |
{ key: "benchmarks", label: "Benchmark coverage" },
|
| 134 |
{ key: "variants", label: "Versions" },
|
| 135 |
{ key: "score_summary", label: "Score range" },
|
|
|
|
| 136 |
{ key: "latest", label: "Latest summary" },
|
| 137 |
{ key: "updated", label: "Updated" },
|
| 138 |
] as const
|
|
@@ -409,6 +410,20 @@ export function ModelCompareDialog({
|
|
| 409 |
</div>
|
| 410 |
</div>
|
| 411 |
) : null}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 412 |
{row.key === "latest" ? (
|
| 413 |
<div className="flex items-center gap-2">
|
| 414 |
<span>{model.latest_source_name || `${model.benchmarks_count} benchmark suites summarized`}</span>
|
|
|
|
| 133 |
{ key: "benchmarks", label: "Benchmark coverage" },
|
| 134 |
{ key: "variants", label: "Versions" },
|
| 135 |
{ key: "score_summary", label: "Score range" },
|
| 136 |
+
{ key: "reproducibility", label: "Re-runnability" },
|
| 137 |
{ key: "latest", label: "Latest summary" },
|
| 138 |
{ key: "updated", label: "Updated" },
|
| 139 |
] as const
|
|
|
|
| 410 |
</div>
|
| 411 |
</div>
|
| 412 |
) : null}
|
| 413 |
+
{row.key === "reproducibility" ? (
|
| 414 |
+
model.reproducibility_summary && model.reproducibility_summary.has_reproducibility_gap_count > 0 ? (
|
| 415 |
+
<div className="space-y-1">
|
| 416 |
+
<div className="font-medium">
|
| 417 |
+
{model.reproducibility_summary.has_reproducibility_gap_count} setup gaps
|
| 418 |
+
</div>
|
| 419 |
+
<div className="text-sm text-muted-foreground">
|
| 420 |
+
Out of {model.reproducibility_summary.results_total} reported scores
|
| 421 |
+
</div>
|
| 422 |
+
</div>
|
| 423 |
+
) : (
|
| 424 |
+
<span className="text-muted-foreground">No setup gaps reported</span>
|
| 425 |
+
)
|
| 426 |
+
) : null}
|
| 427 |
{row.key === "latest" ? (
|
| 428 |
<div className="flex items-center gap-2">
|
| 429 |
<span>{model.latest_source_name || `${model.benchmarks_count} benchmark suites summarized`}</span>
|
components/navigation.tsx
CHANGED
|
@@ -38,6 +38,12 @@ export function Navigation() {
|
|
| 38 |
icon: BarChart3,
|
| 39 |
isActive: pathname === "/evals" || pathname?.startsWith("/evals/")
|
| 40 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
{
|
| 42 |
href: "/survey",
|
| 43 |
label: "Survey",
|
|
|
|
| 38 |
icon: BarChart3,
|
| 39 |
isActive: pathname === "/evals" || pathname?.startsWith("/evals/")
|
| 40 |
},
|
| 41 |
+
{
|
| 42 |
+
href: "/corpus",
|
| 43 |
+
label: "Corpus",
|
| 44 |
+
icon: FlaskConical,
|
| 45 |
+
isActive: pathname === "/corpus" || pathname?.startsWith("/corpus/")
|
| 46 |
+
},
|
| 47 |
{
|
| 48 |
href: "/survey",
|
| 49 |
label: "Survey",
|
components/signals/comparability-panel.tsx
ADDED
|
@@ -0,0 +1,193 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"use client"
|
| 2 |
+
|
| 3 |
+
import type { ReactNode } from "react"
|
| 4 |
+
import { ChevronDown, GitCompareArrows, UsersRound } from "lucide-react"
|
| 5 |
+
|
| 6 |
+
import { useAudienceMode } from "@/components/audience-mode-provider"
|
| 7 |
+
import { Badge } from "@/components/ui/badge"
|
| 8 |
+
import { Collapsible, CollapsibleContent, CollapsibleTrigger } from "@/components/ui/collapsible"
|
| 9 |
+
import type { BenchmarkComparability, ComparabilitySummary, DifferingSetupField } from "@/lib/backend-artifacts"
|
| 10 |
+
import {
|
| 11 |
+
formatFieldLabel,
|
| 12 |
+
formatSignalNumber,
|
| 13 |
+
formatSignalValue,
|
| 14 |
+
} from "./signal-utils"
|
| 15 |
+
|
| 16 |
+
/** Maximum number of divergence groups listed per section before the list is cut off. */
const MAX_DIVERGENCE_GROUPS_SHOWN = 8

/**
 * Panel that lists the divergence groups reported for a benchmark, split into
 * variant divergence and cross-party divergence sections.
 *
 * Renders nothing when there are no groups of either kind and the summary does
 * not report zero cross-party checks. When the summary reports exactly zero
 * cross-party checks, a note says that no third-party reports are available.
 * Headings and the intro sentence switch wording when the audience mode is
 * `"research"`.
 *
 * Each section shows at most `MAX_DIVERGENCE_GROUPS_SHOWN` groups; when more
 * exist, a short note states how many are shown so the list does not silently
 * disagree with the count badge in the section header.
 *
 * @param comparability - Benchmark-level comparability annotation holding the group lists.
 * @param summary - Optional counts of variant and cross-party checks, shown as badges.
 */
export function ComparabilityPanel({
  comparability,
  summary,
}: {
  comparability?: BenchmarkComparability | null
  summary?: ComparabilitySummary
}) {
  const { mode } = useAudienceMode()
  const isResearchView = mode === "research"
  const variantGroups = comparability?.variant_divergence_groups ?? []
  const crossPartyGroups = comparability?.cross_party_divergence_groups ?? []
  const showNoCrossPartyNote = summary?.groups_with_cross_party_check === 0
  const hasGroups = variantGroups.length > 0 || crossPartyGroups.length > 0

  if (!hasGroups && !showNoCrossPartyNote) {
    return null
  }

  return (
    <section className="rounded-2xl border border-border/70 bg-background/70 p-4 sm:p-5">
      <div className="flex flex-col gap-2 sm:flex-row sm:items-start sm:justify-between">
        <div className="space-y-1">
          <div className="flex items-center gap-2">
            <GitCompareArrows className="h-4 w-4 text-primary" />
            <h3 className="font-semibold">
              {isResearchView ? "Comparability" : "Can these scores be compared directly?"}
            </h3>
          </div>
          <p className="max-w-2xl text-sm text-muted-foreground">
            {isResearchView
              ? "Groups where reported scores diverge across setups or reporting organizations."
              : "Flags cases where score differences may come from setup choices or different reporting sources."}
          </p>
        </div>
        {summary && (
          <div className="flex flex-wrap gap-2 text-xs">
            <Badge variant="outline">{summary.groups_with_variant_check} setup checks</Badge>
            <Badge variant="outline">{summary.groups_with_cross_party_check} source checks</Badge>
          </div>
        )}
      </div>

      {showNoCrossPartyNote && (
        <div className="mt-4 rounded-xl border border-dashed border-border/70 bg-muted/10 px-3 py-2 text-sm text-muted-foreground">
          No third-party reports are available for cross-party comparison.
        </div>
      )}

      {/* Skip the grid entirely when only the note is shown, so no empty spaced container is rendered. */}
      {hasGroups && (
        <div className="mt-4 grid gap-3 lg:grid-cols-2">
          {variantGroups.length > 0 && (
            <GroupList
              icon="variant"
              title="Variant divergence"
              count={variantGroups.length}
            >
              {variantGroups.slice(0, MAX_DIVERGENCE_GROUPS_SHOWN).map((group) => (
                <DivergenceGroupItem
                  key={group.group_id}
                  modelRouteId={group.model_route_id}
                  magnitude={group.divergence_magnitude}
                  threshold={group.threshold_used}
                  fields={group.differing_setup_fields}
                />
              ))}
              {variantGroups.length > MAX_DIVERGENCE_GROUPS_SHOWN && (
                <p className="px-1 text-xs text-muted-foreground">
                  Showing {MAX_DIVERGENCE_GROUPS_SHOWN} of {variantGroups.length} groups.
                </p>
              )}
            </GroupList>
          )}

          {crossPartyGroups.length > 0 && (
            <GroupList
              icon="cross-party"
              title="Cross-party divergence"
              count={crossPartyGroups.length}
            >
              {crossPartyGroups.slice(0, MAX_DIVERGENCE_GROUPS_SHOWN).map((group) => (
                <DivergenceGroupItem
                  key={group.group_id}
                  modelRouteId={group.model_route_id}
                  magnitude={group.divergence_magnitude}
                  threshold={group.threshold_used}
                  fields={group.differing_setup_fields}
                  scoresByOrganization={group.scores_by_organization}
                />
              ))}
              {crossPartyGroups.length > MAX_DIVERGENCE_GROUPS_SHOWN && (
                <p className="px-1 text-xs text-muted-foreground">
                  Showing {MAX_DIVERGENCE_GROUPS_SHOWN} of {crossPartyGroups.length} groups.
                </p>
              )}
            </GroupList>
          )}
        </div>
      )}
    </section>
  )
}
|
| 104 |
+
|
| 105 |
+
/**
 * Collapsible section, open by default, that wraps one list of divergence groups.
 *
 * The trigger row shows an icon chosen by `icon`, the section `title`, and a
 * badge holding `count`; `children` are rendered inside the collapsible body.
 *
 * @param icon - Which header icon to use: `"variant"` or `"cross-party"`.
 * @param title - Section heading text.
 * @param count - Number shown in the header badge.
 * @param children - Content rendered inside the collapsible area.
 */
function GroupList({
  icon,
  title,
  count,
  children,
}: {
  icon: "variant" | "cross-party"
  title: string
  count: number
  children: ReactNode
}) {
  const HeaderIcon = icon === "cross-party" ? UsersRound : GitCompareArrows

  const heading = (
    <span className="flex items-center gap-2 text-sm font-semibold">
      <HeaderIcon className="h-4 w-4 text-muted-foreground" />
      {title}
      <Badge variant="secondary">{count}</Badge>
    </span>
  )

  return (
    <Collapsible defaultOpen>
      <CollapsibleTrigger asChild>
        <button
          type="button"
          className="flex w-full items-center justify-between rounded-xl border border-border/70 bg-muted/10 px-3 py-2 text-left transition-colors hover:bg-muted/20"
        >
          {heading}
          <ChevronDown className="h-4 w-4 text-muted-foreground" />
        </button>
      </CollapsibleTrigger>
      <CollapsibleContent className="mt-2 space-y-2">{children}</CollapsibleContent>
    </Collapsible>
  )
}
|
| 139 |
+
|
| 140 |
+
/**
 * One divergence group, rendered as an in-page link to `#row-<modelRouteId>`.
 *
 * Shows the model route id, the divergence magnitude and the threshold it was
 * compared against, up to three differing setup fields with their values, and
 * (when provided) up to four per-organization scores as small chips.
 *
 * @param modelRouteId - Model identifier; shown as the title and used in the link hash.
 * @param magnitude - Divergence magnitude for the group.
 * @param threshold - Threshold used for the divergence check.
 * @param fields - Setup fields whose values differ within the group.
 * @param scoresByOrganization - Optional map from organization name to score.
 */
function DivergenceGroupItem({
  modelRouteId,
  magnitude,
  threshold,
  fields,
  scoresByOrganization,
}: {
  modelRouteId: string
  magnitude: number
  threshold: number
  fields: DifferingSetupField[]
  scoresByOrganization?: Record<string, number>
}) {
  // Only the leading entries are displayed; the rest are omitted.
  const shownFields = fields.slice(0, 3)
  const shownScores = scoresByOrganization
    ? Object.entries(scoresByOrganization).slice(0, 4)
    : []

  return (
    <a
      href={`#row-${modelRouteId}`}
      className="block rounded-xl border border-border/60 bg-background px-3 py-2 text-sm transition-colors hover:bg-muted/20"
    >
      <div className="flex items-start justify-between gap-3">
        <div className="min-w-0">
          <div className="font-medium">{modelRouteId}</div>
          <div className="mt-1 text-xs text-muted-foreground">
            Divergence {formatSignalNumber(magnitude)}; threshold {formatSignalNumber(threshold)}
          </div>
        </div>
        <span className="shrink-0 text-xs font-medium text-primary">Jump to row</span>
      </div>

      {shownFields.length > 0 ? (
        <div className="mt-2 space-y-1 text-xs text-muted-foreground">
          {shownFields.map((entry) => (
            <div key={entry.field}>
              <span className="font-medium text-foreground">{formatFieldLabel(entry.field)}:</span>{" "}
              {entry.values.map(formatSignalValue).join(", ")}
            </div>
          ))}
        </div>
      ) : null}

      {shownScores.length > 0 ? (
        <div className="mt-2 flex flex-wrap gap-1.5">
          {shownScores.map(([org, score]) => (
            <span
              key={org}
              className="rounded-full border border-border/60 bg-muted/20 px-2 py-0.5 text-[11px] text-muted-foreground"
            >
              {org}: {formatSignalNumber(score)}
            </span>
          ))}
        </div>
      ) : null}
    </a>
  )
}
|
components/signals/completeness-panel.tsx
ADDED
|
@@ -0,0 +1,147 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"use client"
|
| 2 |
+
|
| 3 |
+
import type { ReactNode } from "react"
|
| 4 |
+
import { ChevronDown, ClipboardCheck } from "lucide-react"
|
| 5 |
+
|
| 6 |
+
import { useAudienceMode } from "@/components/audience-mode-provider"
|
| 7 |
+
import { Badge } from "@/components/ui/badge"
|
| 8 |
+
import { Collapsible, CollapsibleContent, CollapsibleTrigger } from "@/components/ui/collapsible"
|
| 9 |
+
import { Progress } from "@/components/ui/progress"
|
| 10 |
+
import type { ReportingCompleteness } from "@/lib/backend-artifacts"
|
| 11 |
+
import {
|
| 12 |
+
formatFieldLabel,
|
| 13 |
+
formatPercent,
|
| 14 |
+
getCompletenessPopulatedCount,
|
| 15 |
+
} from "./signal-utils"
|
| 16 |
+
|
| 17 |
+
/**
 * Panel summarising a benchmark's reporting completeness.
 *
 * Shows the overall completeness score (as a percentage and a progress bar),
 * the populated-vs-evaluated field count, and - when either list is non-empty -
 * two collapsible lists: missing required fields and partially populated
 * fields. Each list shows at most its first 12 entries.
 *
 * Wording switches on the audience mode: the "research" mode uses the
 * technical heading and additionally prints the raw field paths.
 *
 * @param completeness Completeness signal for the benchmark; when it is
 *   null/undefined the panel renders nothing.
 */
export function CompletenessPanel({
  completeness,
}: {
  completeness?: ReportingCompleteness | null
}) {
  // The hook runs before the guard so hook order is identical on every render.
  const { mode } = useAudienceMode()

  if (!completeness) {
    return null
  }

  const isResearchView = mode === "research"
  const heading = isResearchView ? "Reporting completeness" : "How well is this benchmark documented?"
  const blurb = isResearchView
    ? "Coverage of EvalCards-required documentation fields for this benchmark."
    : "A quick read on how much supporting documentation is available before leaning on the scores."

  const populated = getCompletenessPopulatedCount(completeness)
  const evaluated = completeness.total_fields_evaluated
  const missing = completeness.missing_required_fields ?? []
  const partial = completeness.partial_fields ?? []
  const showBreakdown = missing.length > 0 || partial.length > 0

  return (
    <section className="rounded-2xl border border-border/70 bg-background/70 p-4 sm:p-5">
      <div className="flex flex-col gap-4 lg:flex-row lg:items-start lg:justify-between">
        <div className="space-y-1">
          <div className="flex items-center gap-2">
            <ClipboardCheck className="h-4 w-4 text-primary" />
            <h3 className="font-semibold">{heading}</h3>
          </div>
          <p className="max-w-2xl text-sm text-muted-foreground">{blurb}</p>
        </div>

        <div className="min-w-[14rem] rounded-xl border border-border/70 bg-muted/10 px-3 py-2">
          <div className="flex items-baseline justify-between gap-3">
            <span className="text-[11px] font-semibold uppercase tracking-[0.18em] text-muted-foreground">
              Documentation
            </span>
            <span className="text-lg font-semibold tabular-nums">
              {formatPercent(completeness.completeness_score)}
            </span>
          </div>
          {/* The score is multiplied by 100 before being handed to the progress bar. */}
          <Progress value={completeness.completeness_score * 100} className="mt-2 h-2" />
          <div className="mt-2 text-xs text-muted-foreground">
            {populated} of {evaluated} fields populated
          </div>
        </div>
      </div>

      {showBreakdown ? (
        <div className="mt-4 grid gap-3 lg:grid-cols-2">
          <SignalListCollapsible title="Missing required fields" count={missing.length}>
            {missing.length > 0 ? (
              <ul className="space-y-1.5 text-sm">
                {missing.slice(0, 12).map((fieldPath) => (
                  <li key={fieldPath} className="rounded-lg border border-border/50 bg-background px-3 py-2">
                    <span className="font-medium">{formatFieldLabel(fieldPath)}</span>
                    {isResearchView ? (
                      <span className="ml-2 text-xs text-muted-foreground">{fieldPath}</span>
                    ) : null}
                  </li>
                ))}
              </ul>
            ) : (
              <p className="text-sm text-muted-foreground">No missing required fields recorded.</p>
            )}
          </SignalListCollapsible>

          <SignalListCollapsible title="Partially populated" count={partial.length}>
            {partial.length > 0 ? (
              <ul className="space-y-1.5 text-sm">
                {partial.slice(0, 12).map((entry) => (
                  <li key={entry.field_path} className="rounded-lg border border-border/50 bg-background px-3 py-2">
                    <div className="flex items-start justify-between gap-3">
                      <span className="font-medium">{formatFieldLabel(entry.field_path)}</span>
                      <span className="shrink-0 text-muted-foreground">
                        {entry.populated_subitems}/{entry.total_subitems}
                      </span>
                    </div>
                    {isResearchView ? (
                      <div className="mt-1 text-xs text-muted-foreground">{entry.field_path}</div>
                    ) : null}
                  </li>
                ))}
              </ul>
            ) : (
              <p className="text-sm text-muted-foreground">No partially populated fields recorded.</p>
            )}
          </SignalListCollapsible>
        </div>
      ) : null}
    </section>
  )
}
|
| 118 |
+
|
| 119 |
+
/**
 * Collapsible list wrapper used by the completeness panel.
 *
 * The trigger is a full-width button showing the title, a count badge and a
 * chevron; the children are rendered inside the collapsible content area.
 *
 * @param title Text shown on the trigger button.
 * @param count Number rendered in the badge next to the title.
 * @param children Content revealed when the collapsible is open.
 */
function SignalListCollapsible(props: {
  title: string
  count: number
  children: ReactNode
}) {
  const { title, count, children } = props

  const trigger = (
    <button
      type="button"
      className="flex w-full items-center justify-between rounded-xl border border-border/70 bg-muted/10 px-3 py-2 text-left transition-colors hover:bg-muted/20"
    >
      <span className="flex items-center gap-2 text-sm font-semibold">
        {title}
        <Badge variant="secondary">{count}</Badge>
      </span>
      <ChevronDown className="h-4 w-4 text-muted-foreground" />
    </button>
  )

  return (
    <Collapsible>
      {/* asChild: the button above is passed as the trigger's child element. */}
      <CollapsibleTrigger asChild>{trigger}</CollapsibleTrigger>
      <CollapsibleContent className="mt-2">{children}</CollapsibleContent>
    </Collapsible>
  )
}
|
components/signals/corpus-dashboard.tsx
ADDED
|
@@ -0,0 +1,442 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"use client"
|
| 2 |
+
|
| 3 |
+
import type { ReactNode } from "react"
|
| 4 |
+
import { useEffect, useMemo, useState } from "react"
|
| 5 |
+
import { BarChart3, ClipboardCheck, GitCompareArrows, ShieldCheck } from "lucide-react"
|
| 6 |
+
|
| 7 |
+
import { useAudienceMode } from "@/components/audience-mode-provider"
|
| 8 |
+
import { Badge } from "@/components/ui/badge"
|
| 9 |
+
import { Button } from "@/components/ui/button"
|
| 10 |
+
import type {
|
| 11 |
+
ComparabilityCorpusBlock,
|
| 12 |
+
CompletenessCorpusBlock,
|
| 13 |
+
CorpusAggregates,
|
| 14 |
+
ProvenanceCorpusBlock,
|
| 15 |
+
ReproducibilityCorpusBlock,
|
| 16 |
+
} from "@/lib/backend-artifacts"
|
| 17 |
+
import { getCategoryColor } from "@/lib/benchmark-schema"
|
| 18 |
+
import {
|
| 19 |
+
formatFieldLabel,
|
| 20 |
+
formatPercent,
|
| 21 |
+
} from "./signal-utils"
|
| 22 |
+
|
| 23 |
+
// Category keys in the order they are checked and displayed in the
// "By category" view of the dashboard.
const CATEGORY_ORDER = [
  "agentic",
  "general",
  "knowledge",
  "reasoning",
  "safety",
  "other",
]

// Background colour class for each source type in the provenance
// distribution bar.
const SOURCE_COLORS: Record<string, string> = {
  first_party: "bg-amber-500",
  third_party: "bg-emerald-500",
  collaborative: "bg-sky-500",
  unspecified: "bg-stone-400",
}
|
| 31 |
+
|
| 32 |
+
/**
 * Top-level corpus dashboard for the interpretive signals.
 *
 * Renders a header (title, signal version, generation date and a view toggle)
 * followed by either the four overall signal sections or one summary panel per
 * category. The selected view is reset whenever the audience mode changes:
 * "research" selects the per-category view, any other mode the overall view.
 *
 * @param aggregates Corpus-level aggregates with `overall` and `by_category`
 *   blocks for each signal.
 * @param completenessScores Scores passed to the completeness section, which
 *   draws them as a histogram.
 */
export function CorpusDashboard({
  aggregates,
  completenessScores,
}: {
  aggregates: CorpusAggregates
  completenessScores: number[]
}) {
  const { mode } = useAudienceMode()
  const [view, setView] = useState<"overall" | "category">("overall")

  // Follow the audience mode; the user can still override it with the toggle.
  useEffect(() => {
    const nextView = mode === "research" ? "category" : "overall"
    setView(nextView)
  }, [mode])

  // Keep only the known categories that have data for at least one signal.
  const categoryKeys = useMemo(() => {
    const { reproducibility, completeness, provenance, comparability } = aggregates
    return CATEGORY_ORDER.filter((category) =>
      Boolean(
        reproducibility.by_category[category] ||
          completeness.by_category[category] ||
          provenance.by_category[category] ||
          comparability.by_category[category]
      )
    )
  }, [aggregates])

  const viewOptions: Array<{ id: "overall" | "category"; label: string }> = [
    { id: "overall", label: "Overall" },
    { id: "category", label: "By category" },
  ]

  const content =
    view === "overall" ? (
      <div className="grid gap-6">
        <ReproducibilitySection block={aggregates.reproducibility.overall} />
        <CompletenessSection block={aggregates.completeness.overall} scores={completenessScores} />
        <ProvenanceSection block={aggregates.provenance.overall} />
        <ComparabilitySection block={aggregates.comparability.overall} />
      </div>
    ) : (
      <div className="grid gap-4 xl:grid-cols-2">
        {categoryKeys.map((category) => (
          <CategoryPanel
            key={category}
            category={category}
            reproducibility={aggregates.reproducibility.by_category[category]}
            completeness={aggregates.completeness.by_category[category]}
            provenance={aggregates.provenance.by_category[category]}
            comparability={aggregates.comparability.by_category[category]}
          />
        ))}
      </div>
    )

  return (
    <div className="space-y-6">
      <section className="rounded-2xl border border-border/70 bg-card p-5 shadow-sm">
        <div className="flex flex-col gap-4 lg:flex-row lg:items-start lg:justify-between">
          <div>
            <div className="text-[11px] font-semibold uppercase tracking-[0.22em] text-muted-foreground">
              Interpretive signals
            </div>
            <h1 className="mt-2 text-3xl font-semibold tracking-tight">Corpus Dashboard</h1>
            <p className="mt-2 max-w-3xl text-sm leading-6 text-muted-foreground">
              Corpus-level rollups for reproducibility, documentation completeness, source provenance, and comparability.
            </p>
          </div>

          <div className="flex flex-wrap items-center gap-2">
            <Badge variant="outline">Signals v{aggregates.signal_version}</Badge>
            <Badge variant="outline">Generated {formatGeneratedDate(aggregates.generated_at)}</Badge>
            <div className="inline-flex rounded-full border bg-muted/20 p-1">
              {viewOptions.map((option) => (
                <Button
                  key={option.id}
                  type="button"
                  size="sm"
                  variant={view === option.id ? "default" : "ghost"}
                  className="h-8 rounded-full"
                  onClick={() => setView(option.id)}
                >
                  {option.label}
                </Button>
              ))}
            </div>
          </div>
        </div>
      </section>

      {content}
    </div>
  )
}
|
| 122 |
+
|
| 123 |
+
/**
 * Overall reproducibility section.
 *
 * The headline is the reproducibility gap rate, labelled with the gap count
 * over the total triple count. Below it, one bar per field shows that field's
 * missing rate for the first 10 entries of `per_field_missingness`.
 *
 * @param block Reproducibility aggregates to display.
 */
function ReproducibilitySection({ block }: { block: ReproducibilityCorpusBlock }) {
  const gapCount = block.triples_with_reproducibility_gap.toLocaleString()
  const tripleCount = block.total_triples.toLocaleString()
  const leadingFields = Object.entries(block.per_field_missingness).slice(0, 10)

  return (
    <DashboardSection
      icon={<ShieldCheck className="h-5 w-5" />}
      title="Reproducibility"
      subtitle="Reported scores with enough setup documentation to re-run."
      headline={formatPercent(block.reproducibility_gap_rate)}
      headlineLabel={`${gapCount} of ${tripleCount} reported scores have gaps`}
    >
      <div className="grid gap-2">
        {leadingFields.map(([field, stats]) => {
          // Describes which denominator the missing rate was computed against.
          const scope = stats.denominator === "agentic_only" ? "agentic only" : "all scores"
          return (
            <MetricBar
              key={field}
              label={formatFieldLabel(field)}
              value={stats.missing_rate}
              detail={`${stats.missing_count.toLocaleString()} missing / ${scope}`}
            />
          )
        })}
      </div>
    </DashboardSection>
  )
}
|
| 145 |
+
|
| 146 |
+
/**
 * Overall reporting-completeness section.
 *
 * The headline is the mean completeness score, labelled with the median and
 * the benchmark count. The body shows a histogram of `scores` (only when the
 * array is non-empty) and, for the first 10 entries of `per_field_population`,
 * the field's mean score plus "Any data" and "Fully populated" rate bars.
 *
 * @param block Completeness aggregates to display.
 * @param scores Per-benchmark completeness scores for the histogram.
 */
function CompletenessSection({
  block,
  scores,
}: {
  block: CompletenessCorpusBlock
  scores: number[]
}) {
  const medianText = formatPercent(block.completeness_score_median)
  const benchmarkCount = block.total_benchmarks.toLocaleString()
  const leadingFields = Object.entries(block.per_field_population).slice(0, 10)

  return (
    <DashboardSection
      icon={<ClipboardCheck className="h-5 w-5" />}
      title="Reporting Completeness"
      subtitle="How much benchmark documentation is populated."
      headline={formatPercent(block.completeness_score_mean)}
      headlineLabel={`Median ${medianText} across ${benchmarkCount} benchmarks`}
    >
      {scores.length > 0 ? <Histogram scores={scores} /> : null}
      <div className="mt-4 grid gap-2">
        {leadingFields.map(([field, stats]) => (
          <div key={field} className="rounded-xl border border-border/60 bg-background px-3 py-2">
            <div className="flex items-start justify-between gap-3 text-sm">
              <span className="font-medium">{formatFieldLabel(field)}</span>
              <span className="shrink-0 tabular-nums text-muted-foreground">
                {formatPercent(stats.mean_score)}
              </span>
            </div>
            <div className="mt-2 grid gap-1.5">
              <MetricBar label="Any data" value={stats.populated_rate} compact />
              <MetricBar label="Fully populated" value={stats.fully_populated_rate} compact />
            </div>
          </div>
        ))}
      </div>
    </DashboardSection>
  )
}
|
| 181 |
+
|
| 182 |
+
/**
 * Overall provenance section.
 *
 * The headline is the multi-source rate. The body draws a stacked horizontal
 * bar with one segment per source type, sized by its share of the summed
 * counts, followed by tiles for multi-source and first-party-only groups.
 *
 * @param block Provenance aggregates to display.
 */
function ProvenanceSection({ block }: { block: ProvenanceCorpusBlock }) {
  const segments = Object.entries(block.source_type_distribution)

  let grandTotal = 0
  for (const [, count] of segments) {
    grandTotal += count
  }

  // Avoid dividing by zero when the distribution is empty or sums to zero.
  const shareOf = (count: number) => (grandTotal > 0 ? `${(count / grandTotal) * 100}%` : "0%")

  return (
    <DashboardSection
      icon={<BarChart3 className="h-5 w-5" />}
      title="Provenance"
      subtitle="Who reported the scores, and whether groups have multiple sources."
      headline={formatPercent(block.multi_source_rate)}
      headlineLabel="of (model, benchmark, metric) groups have multiple reporting sources"
    >
      <div className="overflow-hidden rounded-full border border-border/70 bg-muted/30">
        <div className="flex h-4 w-full">
          {segments.map(([sourceType, count]) => (
            <div
              key={sourceType}
              // Source types without a configured colour use the fallback class.
              className={SOURCE_COLORS[sourceType] ?? "bg-muted-foreground"}
              style={{ width: shareOf(count) }}
              title={`${sourceType.replace(/_/g, " ")}: ${count}`}
            />
          ))}
        </div>
      </div>

      <div className="mt-3 grid gap-2 sm:grid-cols-2">
        <RatioTile
          label="Multi-source groups"
          value={block.multi_source_rate}
          count={block.multi_source_groups}
        />
        <RatioTile
          label="First-party only groups"
          value={block.first_party_only_rate}
          count={block.first_party_only_groups}
        />
      </div>
    </DashboardSection>
  )
}
|
| 214 |
+
|
| 215 |
+
/**
 * Overall comparability section.
 *
 * The headline is the variant divergence rate ("N/A" when it is null),
 * labelled with divergent-over-eligible group counts. The body shows one rate
 * card for variant divergence and one for cross-party divergence.
 *
 * @param block Comparability aggregates to display.
 */
function ComparabilitySection({ block }: { block: ComparabilityCorpusBlock }) {
  const divergentText = block.variant_divergent_groups.toLocaleString()
  const eligibleText = block.variant_eligible_groups.toLocaleString()

  const rateCards = [
    {
      title: "Variant divergence",
      rate: block.variant_divergence_rate,
      eligible: block.variant_eligible_groups,
      divergent: block.variant_divergent_groups,
    },
    {
      title: "Cross-party divergence",
      rate: block.cross_party_divergence_rate,
      eligible: block.cross_party_eligible_groups,
      divergent: block.cross_party_divergent_groups,
    },
  ]

  return (
    <DashboardSection
      icon={<GitCompareArrows className="h-5 w-5" />}
      title="Comparability"
      subtitle="Eligible groups where scores diverge across setups or reporting organizations."
      headline={formatNullableRate(block.variant_divergence_rate)}
      headlineLabel={`${divergentText} of ${eligibleText} setup-eligible groups diverge`}
    >
      <div className="grid gap-3 md:grid-cols-2">
        {rateCards.map((card) => (
          <ComparabilityRateCard
            key={card.title}
            title={card.title}
            rate={card.rate}
            eligible={card.eligible}
            divergent={card.divergent}
          />
        ))}
      </div>
    </DashboardSection>
  )
}
|
| 241 |
+
|
| 242 |
+
/**
 * Summary card for one category in the "By category" view.
 *
 * Shows the capitalised category name, a coloured badge, and four headline
 * metrics (one per signal). When the cross-party divergence rate is null or
 * undefined - including when no comparability block was supplied - a dashed
 * "N/A" note is appended.
 *
 * @param category Category key; its first character is upper-cased for display.
 * @param reproducibility Reproducibility aggregates for the category, if any.
 * @param completeness Completeness aggregates for the category, if any.
 * @param provenance Provenance aggregates for the category, if any.
 * @param comparability Comparability aggregates for the category, if any.
 */
function CategoryPanel({
  category,
  reproducibility,
  completeness,
  provenance,
  comparability,
}: {
  category: string
  reproducibility?: ReproducibilityCorpusBlock
  completeness?: CompletenessCorpusBlock
  provenance?: ProvenanceCorpusBlock
  comparability?: ComparabilityCorpusBlock
}) {
  const categoryLabel = category.charAt(0).toUpperCase() + category.slice(1)

  const metrics = [
    { label: "Reproducibility gaps", value: formatPercent(reproducibility?.reproducibility_gap_rate) },
    { label: "Documentation mean", value: formatPercent(completeness?.completeness_score_mean) },
    { label: "Multi-source groups", value: formatPercent(provenance?.multi_source_rate) },
    { label: "Variant divergence", value: formatNullableRate(comparability?.variant_divergence_rate) },
  ]

  const crossPartyUnavailable = comparability?.cross_party_divergence_rate == null

  return (
    <section className="rounded-2xl border border-border/70 bg-card p-4 shadow-sm">
      <div className="mb-4 flex items-center justify-between gap-3">
        <h2 className="font-semibold">{categoryLabel}</h2>
        <Badge className={getCategoryColor(categoryLabel)}>{categoryLabel}</Badge>
      </div>
      <div className="grid gap-3 sm:grid-cols-2">
        {metrics.map((metric) => (
          <MiniMetric key={metric.label} label={metric.label} value={metric.value} />
        ))}
      </div>
      {crossPartyUnavailable ? (
        <div className="mt-3 rounded-xl border border-dashed border-border/70 bg-muted/10 px-3 py-2 text-sm text-muted-foreground">
          Cross-party divergence: N/A - not enough multi-org coverage.
        </div>
      ) : null}
    </section>
  )
}
|
| 277 |
+
|
| 278 |
+
/**
 * Shared card layout for the overall signal sections.
 *
 * On large screens the card is a two-column grid: the first column holds the
 * icon, title, subtitle and a headline tile; the second holds `children`.
 *
 * @param icon Element rendered next to the title.
 * @param title Section heading.
 * @param subtitle Short description under the heading.
 * @param headline Large headline value (already formatted).
 * @param headlineLabel Caption shown under the headline value.
 * @param children Section body.
 */
function DashboardSection(props: {
  icon: ReactNode
  title: string
  subtitle: string
  headline: string
  headlineLabel: string
  children: ReactNode
}) {
  const { icon, title, subtitle, headline, headlineLabel, children } = props

  const summaryColumn = (
    <div>
      <div className="flex items-center gap-2 text-primary">
        {icon}
        <h2 className="font-semibold">{title}</h2>
      </div>
      <p className="mt-2 text-sm leading-6 text-muted-foreground">{subtitle}</p>
      <div className="mt-5 rounded-xl border border-border/70 bg-muted/10 px-3 py-3">
        <div className="text-3xl font-semibold tabular-nums">{headline}</div>
        <div className="mt-1 text-xs leading-5 text-muted-foreground">{headlineLabel}</div>
      </div>
    </div>
  )

  return (
    <section className="rounded-2xl border border-border/70 bg-card p-5 shadow-sm">
      <div className="grid gap-5 lg:grid-cols-[minmax(0,18rem)_1fr]">
        {summaryColumn}
        <div>{children}</div>
      </div>
    </section>
  )
}
|
| 312 |
+
|
| 313 |
+
/**
 * Labelled horizontal bar for a rate.
 *
 * The fill width is `value * 100`, clamped to the 0-100 range; a null value
 * draws an empty bar. The numeric text comes from `formatPercent(value)`.
 *
 * @param label Text shown on the left (truncated when too long).
 * @param value Rate to visualise, or null when unavailable.
 * @param detail Optional caption rendered under the bar.
 * @param compact When true, the bordered card styling is dropped.
 */
function MetricBar({
  label,
  value,
  detail,
  compact = false,
}: {
  label: string
  value: number | null
  detail?: string
  compact?: boolean
}) {
  let fillPercent = 0
  if (value != null) {
    fillPercent = Math.max(0, Math.min(100, value * 100))
  }

  const wrapperClass = compact
    ? "space-y-1"
    : "rounded-xl border border-border/60 bg-background px-3 py-2"

  return (
    <div className={wrapperClass}>
      <div className="flex items-center justify-between gap-3 text-sm">
        <span className="min-w-0 truncate font-medium">{label}</span>
        <span className="shrink-0 tabular-nums text-muted-foreground">{formatPercent(value)}</span>
      </div>
      <div className="mt-1.5 h-2 overflow-hidden rounded-full bg-muted">
        <div className="h-full rounded-full bg-primary/75" style={{ width: `${fillPercent}%` }} />
      </div>
      {detail ? <div className="mt-1 text-xs text-muted-foreground">{detail}</div> : null}
    </div>
  )
}
|
| 339 |
+
|
| 340 |
+
/**
 * Ten-bucket histogram of completeness scores.
 *
 * Each score is multiplied by 10 and floored to pick a bucket, then clamped
 * to the first/last bucket, so a score of exactly 1 lands in the last bucket
 * and out-of-range values land in an edge bucket. Non-finite scores are
 * skipped. Bar heights are relative to the fullest bucket, with a 4% minimum
 * so empty buckets remain visible.
 *
 * @param scores Scores to bucket; assumed to be fractions in 0-1 - TODO confirm
 *   against the caller.
 */
function Histogram({ scores }: { scores: number[] }) {
  const buckets = Array.from({ length: 10 }, (_, index) => ({
    label: `${index * 10}-${(index + 1) * 10}%`,
    count: 0,
  }))

  for (const score of scores) {
    if (!Number.isFinite(score)) continue
    const bucket = Math.min(9, Math.max(0, Math.floor(score * 10)))
    buckets[bucket].count += 1
  }

  // The trailing 1 keeps the divisor non-zero when every bucket is empty.
  const maxCount = Math.max(...buckets.map((bucket) => bucket.count), 1)

  return (
    <div className="rounded-xl border border-border/60 bg-background px-3 py-3">
      <div className="mb-3 text-sm font-semibold">Benchmark completeness distribution</div>
      <div className="flex h-28 items-end gap-1.5">
        {buckets.map((bucket) => (
          // h-full gives the column a definite height so the bar's percentage
          // height can resolve; with the parent's items-end alone the column is
          // content-sized and the percentage collapses to zero. justify-end
          // keeps the bar and its label anchored to the bottom.
          <div
            key={bucket.label}
            className="flex h-full min-w-0 flex-1 flex-col items-center justify-end gap-1"
          >
            <div
              className="w-full rounded-t bg-primary/70"
              style={{ height: `${Math.max(4, (bucket.count / maxCount) * 100)}%` }}
              title={`${bucket.label}: ${bucket.count}`}
            />
            {/* Only the lower bound of the bucket is shown as the axis label. */}
            <span className="text-[9px] text-muted-foreground">{bucket.label.split("-")[0]}</span>
          </div>
        ))}
      </div>
    </div>
  )
}
|
| 372 |
+
|
| 373 |
+
/**
 * Small tile showing a rate and the group count behind it.
 *
 * @param label Tile heading.
 * @param value Rate passed to `formatPercent`; may be null.
 * @param count Number of groups, rendered with locale separators.
 */
function RatioTile(props: { label: string; value: number | null; count: number }) {
  const { label, value, count } = props
  const rateText = formatPercent(value)
  const countText = count.toLocaleString()

  return (
    <div className="rounded-xl border border-border/60 bg-background px-3 py-2">
      <div className="text-sm font-medium">{label}</div>
      <div className="mt-1 flex items-baseline justify-between gap-2">
        <span className="text-xl font-semibold tabular-nums">{rateText}</span>
        <span className="text-xs text-muted-foreground">{countText} groups</span>
      </div>
    </div>
  )
}
|
| 384 |
+
|
| 385 |
+
/**
 * Card for a single divergence rate.
 *
 * When the rate is available it shows the formatted rate and the
 * divergent-over-eligible group counts; when the rate is null it shows a
 * dashed "N/A" placeholder instead.
 *
 * @param title Card heading.
 * @param rate Divergence rate, or null when it could not be computed.
 * @param eligible Number of eligible groups.
 * @param divergent Number of divergent groups.
 */
function ComparabilityRateCard({
  title,
  rate,
  eligible,
  divergent,
}: {
  title: string
  rate: number | null
  eligible: number
  divergent: number
}) {
  if (rate != null) {
    return (
      <div className="rounded-xl border border-border/70 bg-background px-4 py-4">
        <div className="font-semibold">{title}</div>
        <div className="mt-3 text-2xl font-semibold tabular-nums">{formatPercent(rate)}</div>
        <div className="mt-1 text-sm text-muted-foreground">
          {divergent.toLocaleString()} of {eligible.toLocaleString()} eligible groups
        </div>
      </div>
    )
  }

  // No rate available: render the placeholder variant.
  return (
    <div className="rounded-xl border border-dashed border-border/70 bg-muted/10 px-4 py-5">
      <div className="font-semibold">{title}</div>
      <div className="mt-2 text-sm text-muted-foreground">
        N/A - not enough data to compute this rate.
      </div>
    </div>
  )
}
|
| 417 |
+
|
| 418 |
+
/**
 * Compact label/value tile used inside the category panels.
 *
 * @param label Caption shown above the value.
 * @param value Pre-formatted value text.
 */
function MiniMetric(props: { label: string; value: string }) {
  const { label, value } = props

  return (
    <div className="rounded-xl border border-border/60 bg-muted/10 px-3 py-2">
      <div className="text-xs text-muted-foreground">{label}</div>
      <div className="mt-1 text-xl font-semibold tabular-nums">{value}</div>
    </div>
  )
}
|
| 426 |
+
|
| 427 |
+
/**
 * Formats a rate for display, using "N/A" when the rate is missing.
 *
 * @param value Rate to format; null and undefined both yield "N/A".
 * @returns "N/A" or the result of `formatPercent(value)`.
 */
function formatNullableRate(value: number | null | undefined) {
  if (value == null) {
    return "N/A"
  }

  return formatPercent(value)
}
|
| 430 |
+
|
| 431 |
+
/**
 * Formats a timestamp string as a short en-US date such as "Jan 1, 2024".
 *
 * The date is rendered in UTC rather than the host time zone. This keeps the
 * server-rendered and client-rendered text identical, and stops date-only ISO
 * strings (which are parsed as UTC midnight) from shifting to the previous day
 * for viewers west of UTC.
 *
 * @param value Date string to parse with the `Date` constructor.
 * @returns The formatted date, or `value` unchanged when it cannot be parsed.
 */
function formatGeneratedDate(value: string) {
  const date = new Date(value)
  if (Number.isNaN(date.getTime())) {
    // Unparseable input is shown as-is rather than as "Invalid Date".
    return value
  }

  return date.toLocaleDateString("en-US", {
    year: "numeric",
    month: "short",
    day: "numeric",
    timeZone: "UTC",
  })
}
|
components/signals/cross-party-divergence-badge.tsx
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"use client"
|
| 2 |
+
|
| 3 |
+
import { UsersRound } from "lucide-react"
|
| 4 |
+
|
| 5 |
+
import { useAudienceMode } from "@/components/audience-mode-provider"
|
| 6 |
+
import { Badge } from "@/components/ui/badge"
|
| 7 |
+
import type { CrossPartyDivergence } from "@/lib/backend-artifacts"
|
| 8 |
+
import { cn } from "@/lib/utils"
|
| 9 |
+
import { formatSignalNumber } from "./signal-utils"
|
| 10 |
+
import { SignalTooltip } from "./signal-tooltip"
|
| 11 |
+
|
| 12 |
+
/**
 * Badge flagging that reports for the same result diverge across
 * organizations.
 *
 * Renders nothing unless `divergence.has_cross_party_divergence` is truthy.
 * In "research" mode the label and tooltip are technical (magnitude and
 * organization count); in any other mode they use plain-language wording.
 *
 * @param divergence Cross-party divergence signal, if available.
 * @param className Extra classes merged onto the badge.
 */
export function CrossPartyDivergenceBadge({
  divergence,
  className,
}: {
  divergence?: CrossPartyDivergence | null
  className?: string
}) {
  // The hook runs before the guard so hook order is identical on every render.
  const { mode } = useAudienceMode()

  if (!divergence?.has_cross_party_divergence) {
    return null
  }

  const isResearchView = mode === "research"
  const magnitude = formatSignalNumber(divergence.divergence_magnitude)
  const orgCount = divergence.organization_count
  const pluralSuffix = orgCount === 1 ? "" : "s"

  let label = "Sources disagree"
  let tooltip = "Different organizations reported different scores for this same model on this same benchmark."
  if (isResearchView) {
    label = "Cross-party divergence"
    tooltip = `Reports diverge by ${magnitude} across ${orgCount} organization${pluralSuffix}.`
  }

  const badgeClass = cn(
    "border-violet-300 bg-violet-50 text-violet-900 dark:border-violet-900/60 dark:bg-violet-950/40 dark:text-violet-100",
    className
  )

  return (
    <SignalTooltip content={tooltip}>
      <Badge variant="outline" className={badgeClass}>
        <UsersRound className="h-3 w-3" />
        {label}
      </Badge>
    </SignalTooltip>
  )
}
|
components/signals/provenance-badge.tsx
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"use client"
|
| 2 |
+
|
| 3 |
+
import { AlertTriangle, BadgeCheck, Handshake, UserRoundCheck } from "lucide-react"
|
| 4 |
+
|
| 5 |
+
import { useAudienceMode } from "@/components/audience-mode-provider"
|
| 6 |
+
import { Badge } from "@/components/ui/badge"
|
| 7 |
+
import type { Provenance, ProvenanceSourceType } from "@/lib/backend-artifacts"
|
| 8 |
+
import { cn } from "@/lib/utils"
|
| 9 |
+
import { SignalTooltip } from "./signal-tooltip"
|
| 10 |
+
|
| 11 |
+
/**
 * Turns a snake_case relationship identifier into a title-cased display name.
 *
 * Underscores become spaces, surrounding whitespace is trimmed, and each word
 * is rendered with an upper-case first letter and lower-case remainder.
 *
 * @param value Raw relationship identifier; may be missing.
 * @returns The display name, or "Unknown" when the value is missing or blank.
 */
export function getRelationshipDisplayName(value: string | null | undefined) {
  const cleaned = value?.replace(/_/g, " ").trim()
  if (cleaned === undefined || cleaned === "") {
    return "Unknown"
  }

  const words: string[] = []
  for (const word of cleaned.split(/\s+/)) {
    words.push(word.charAt(0).toUpperCase() + word.slice(1).toLowerCase())
  }
  return words.join(" ")
}
|
| 22 |
+
|
| 23 |
+
/**
 * Returns the compact badge label for a reporting relationship.
 *
 * The identifier is matched case-insensitively. Known relationships get an
 * audience-specific label; anything else falls back to
 * `getRelationshipDisplayName`.
 *
 * @param value Raw relationship identifier; may be missing.
 * @param mode Audience mode selecting research or policy wording.
 * @returns The short label to show in the badge.
 */
export function getRelationshipShortLabel(value: string | null | undefined, mode: "research" | "policy" = "research") {
  const key = (value ?? "").toLowerCase()
  const isPolicy = mode === "policy"

  if (key === "first_party") {
    return isPolicy ? "Reported by model developer" : "1st party"
  }
  if (key === "third_party") {
    return isPolicy ? "Independently reported" : "3rd party"
  }
  if (key === "collaborative") {
    return isPolicy ? "Joint report" : "Collaborative"
  }
  if (key === "other") {
    return "Other"
  }

  return getRelationshipDisplayName(value)
}
|
| 37 |
+
|
| 38 |
+
/**
 * Picks the Tailwind class list that colors a relationship badge.
 *
 * The identifier is matched case-insensitively; unrecognized or missing
 * values get the neutral muted tone.
 *
 * @param value Raw relationship identifier; may be missing.
 * @returns Space-separated class names for border, background and text.
 */
export function getRelationshipBadgeTone(value: string | null | undefined): string {
  const key = (value ?? "").toLowerCase()

  if (key === "first_party") {
    return "border-amber-300 bg-amber-50 text-amber-900 dark:border-amber-900/60 dark:bg-amber-950/40 dark:text-amber-100"
  }
  if (key === "third_party") {
    return "border-emerald-300 bg-emerald-50 text-emerald-900 dark:border-emerald-900/60 dark:bg-emerald-950/40 dark:text-emerald-100"
  }
  if (key === "collaborative") {
    return "border-sky-300 bg-sky-50 text-sky-900 dark:border-sky-900/60 dark:bg-sky-950/40 dark:text-sky-100"
  }

  return "border-border/70 bg-muted/40 text-muted-foreground"
}
|
| 50 |
+
|
| 51 |
+
/**
 * Maps a free-form relationship string onto a known provenance source type.
 *
 * Matching is case-insensitive and the lower-cased identifier is returned.
 *
 * @param value Raw relationship identifier; may be missing.
 * @returns The normalized source type, "other", or null when unrecognized.
 */
function normalizeSourceType(value: string | null | undefined): ProvenanceSourceType | "other" | null {
  const lowered = (value ?? "").toLowerCase()
  if (lowered === "other") {
    return "other"
  }

  const recognized = ["first_party", "third_party", "collaborative", "unspecified"]
  return recognized.includes(lowered) ? (lowered as ProvenanceSourceType) : null
}
|
| 64 |
+
|
| 65 |
+
/**
 * Renders the small icon that accompanies a provenance badge.
 *
 * Third-party and collaborative sources have dedicated icons; every other
 * source type shares the same fallback icon.
 */
function ProvenanceIcon({ sourceType }: { sourceType: ProvenanceSourceType | "other" }) {
  switch (sourceType) {
    case "third_party":
      return <BadgeCheck className="h-3 w-3" />
    case "collaborative":
      return <Handshake className="h-3 w-3" />
    default:
      return <UserRoundCheck className="h-3 w-3" />
  }
}
|
| 76 |
+
|
| 77 |
+
/**
 * Badge describing who reported a score.
 *
 * The source type comes from `provenance.source_type` when present, otherwise
 * from the normalized `relationship` string. Nothing is rendered when the
 * source type is unknown, "unspecified", or "other" without `showOther`.
 * When the provenance is flagged `first_party_only`, the label and tooltip
 * switch to the first-party-only wording and a warning icon is appended.
 */
export function ProvenanceBadge({
  provenance,
  relationship,
  sourceOrganizationName,
  showOther = false,
  className,
}: {
  provenance?: Provenance | null
  relationship?: string | null
  sourceOrganizationName?: string | null
  showOther?: boolean
  className?: string
}) {
  // Hook is called before any early return so the hook order stays stable.
  const { mode } = useAudienceMode()
  const sourceType = provenance?.source_type ?? normalizeSourceType(relationship)

  if (!sourceType || sourceType === "unspecified" || (sourceType === "other" && !showOther)) {
    return null
  }

  const isPolicy = mode === "policy"
  const firstPartyOnly = provenance?.first_party_only === true

  let label: string
  let tooltip: string
  if (firstPartyOnly) {
    label = isPolicy ? "Only model developer reported" : "1st party only"
    tooltip = isPolicy
      ? "Only the model developer reported this score; no independent replication is recorded."
      : "First-party only - no independent replication is recorded for this group."
  } else {
    label = getRelationshipShortLabel(sourceType, mode)
    tooltip = sourceOrganizationName
      ? `Reported by ${sourceOrganizationName}.`
      : getRelationshipDisplayName(sourceType)
  }

  const badgeClasses = cn(getRelationshipBadgeTone(sourceType), className)

  return (
    <SignalTooltip content={tooltip}>
      <Badge variant="outline" className={badgeClasses}>
        <ProvenanceIcon sourceType={sourceType} />
        {label}
        {firstPartyOnly && <AlertTriangle className="h-3 w-3" />}
      </Badge>
    </SignalTooltip>
  )
}
|
components/signals/reproducibility-badge.tsx
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"use client"
|
| 2 |
+
|
| 3 |
+
import { AlertTriangle } from "lucide-react"
|
| 4 |
+
|
| 5 |
+
import { useAudienceMode } from "@/components/audience-mode-provider"
|
| 6 |
+
import { Badge } from "@/components/ui/badge"
|
| 7 |
+
import type { ReproducibilityGap } from "@/lib/backend-artifacts"
|
| 8 |
+
import { cn } from "@/lib/utils"
|
| 9 |
+
import { formatMissingField } from "./signal-utils"
|
| 10 |
+
import { SignalTooltip } from "./signal-tooltip"
|
| 11 |
+
|
| 12 |
+
/**
 * Badge shown when a row has a reproducibility gap.
 *
 * Renders nothing unless `gap.has_reproducibility_gap` is truthy. The tooltip
 * lists the missing setup fields in research mode and a plain-language
 * explanation otherwise; both end with the recorded-field count.
 */
export function ReproducibilityBadge({
  gap,
  className,
}: {
  gap?: ReproducibilityGap | null
  className?: string
}) {
  // Hook is called before the early return so the hook order stays stable.
  const { mode } = useAudienceMode()

  if (!gap?.has_reproducibility_gap) {
    return null
  }

  const isResearchView = mode === "research"
  const missingText = gap.missing_fields.map(formatMissingField).join(", ") || "none listed"
  const countLine = `${gap.populated_field_count} of ${gap.required_field_count} setup fields recorded.`

  let tooltip: string
  if (isResearchView) {
    tooltip = `Setup not fully documented. Missing: ${missingText}. ${countLine}`
  } else {
    tooltip = `This score's setup is not fully documented, so it cannot be re-run as-is. ${countLine}`
  }

  const badgeClasses = cn(
    "border-amber-300 bg-amber-50 text-amber-900 dark:border-amber-900/60 dark:bg-amber-950/40 dark:text-amber-100",
    className
  )

  return (
    <SignalTooltip content={tooltip}>
      <Badge variant="outline" className={badgeClasses}>
        <AlertTriangle className="h-3 w-3" />
        {isResearchView ? "Reproducibility gap" : "Setup not documented"}
      </Badge>
    </SignalTooltip>
  )
}
|
components/signals/reproducibility-panel.tsx
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"use client"
|
| 2 |
+
|
| 3 |
+
import { AlertTriangle } from "lucide-react"
|
| 4 |
+
|
| 5 |
+
import { useAudienceMode } from "@/components/audience-mode-provider"
|
| 6 |
+
import type { ReproducibilityGap } from "@/lib/backend-artifacts"
|
| 7 |
+
import { formatMissingField } from "./signal-utils"
|
| 8 |
+
|
| 9 |
+
/**
 * Summary panel for a row's reproducibility annotation.
 *
 * Renders nothing when no `gap` object is supplied. Otherwise shows the
 * recorded-versus-required setup field count and, when any are missing, the
 * formatted list of missing fields. Heading and description wording depend on
 * the audience mode.
 */
export function ReproducibilityPanel({
  gap,
}: {
  gap?: ReproducibilityGap | null
}) {
  // Hook is called before the early return so the hook order stays stable.
  const { mode } = useAudienceMode()

  if (!gap) {
    return null
  }

  const isResearchView = mode === "research"
  const heading = isResearchView ? "Reproducibility" : "Re-runnability"
  const description = isResearchView
    ? "Whether the setup is documented well enough for someone else to re-run."
    : "Whether someone could re-run this evaluation with the information available."
  const recorded = `${gap.populated_field_count} of ${gap.required_field_count}`
  const missingLabels = gap.missing_fields.map(formatMissingField)

  return (
    <div className="rounded-2xl border bg-background/70 p-4">
      <div className="mb-4 flex items-start gap-2">
        <AlertTriangle className="mt-0.5 h-4 w-4 shrink-0 text-amber-600 dark:text-amber-300" />
        <div>
          <div className="font-semibold">{heading}</div>
          <div className="text-sm text-muted-foreground">{description}</div>
        </div>
      </div>

      <div className="space-y-2.5 text-sm">
        <PanelRow label="Setup fields recorded" value={recorded} />
        {missingLabels.length > 0 && <PanelRow label="Missing" value={missingLabels.join(", ")} />}
      </div>
    </div>
  )
}
|
| 52 |
+
|
| 53 |
+
/** One label/value line inside the reproducibility panel. */
function PanelRow(props: { label: string; value: string }) {
  const { label, value } = props

  return (
    <div className="flex gap-3">
      <span className="w-32 shrink-0 text-muted-foreground">{label}</span>
      <span className="min-w-0 flex-1 break-words font-medium">{value}</span>
    </div>
  )
}
|
components/signals/signal-tooltip.tsx
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"use client"
|
| 2 |
+
|
| 3 |
+
import type { ReactNode } from "react"
|
| 4 |
+
import * as TooltipPrimitive from "@radix-ui/react-tooltip"
|
| 5 |
+
|
| 6 |
+
/**
 * Wraps `children` in a Radix tooltip that shows `content` above the trigger.
 *
 * The trigger is rendered with `asChild`, so the child element itself becomes
 * the tooltip trigger. The tooltip content is portalled and drawn with an
 * arrow.
 */
export function SignalTooltip({
  children,
  content,
}: {
  children: ReactNode
  content: ReactNode
}) {
  const Tip = TooltipPrimitive
  const contentClasses =
    "z-50 max-w-80 rounded-md border border-border/70 bg-popover px-3 py-2 text-xs leading-5 text-popover-foreground shadow-lg"

  return (
    <Tip.Provider delayDuration={150}>
      <Tip.Root>
        <Tip.Trigger asChild>{children}</Tip.Trigger>
        <Tip.Portal>
          <Tip.Content side="top" align="center" sideOffset={8} className={contentClasses}>
            {content}
            <Tip.Arrow className="fill-popover" />
          </Tip.Content>
        </Tip.Portal>
      </Tip.Root>
    </Tip.Provider>
  )
}
|
components/signals/signal-utils.ts
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import type { DifferingSetupField, ReportingCompleteness } from "@/lib/backend-artifacts"
|
| 2 |
+
|
| 3 |
+
/** Leading path prefixes that `formatFieldLabel` strips before display. */
const FIELD_PREFIXES = [
  "autobenchmarkcard.",
  "eee_eval.",
  "evalcards.",
]

/** Lower-cased tokens that are rendered with a fixed casing instead of title case. */
const TOKEN_OVERRIDES: Record<string, string> = {
  api: "API",
  ai: "AI",
  eee: "EEE",
  hf: "HF",
  id: "ID",
  llm: "LLM",
  url: "URL",
}

/**
 * Title-cases one path segment.
 *
 * The segment is split on whitespace, underscores and hyphens; empty tokens
 * are dropped; each token is either replaced by its `TOKEN_OVERRIDES` entry
 * or rendered with an upper-case first letter and lower-case remainder.
 *
 * @param segment Raw segment such as "max_tokens".
 * @returns The tokens joined with single spaces, e.g. "Max Tokens".
 */
function titleCaseSegment(segment: string) {
  return segment
    .split(/[\s_-]+/)
    .filter(Boolean)
    .map((token) => {
      const key = token.toLowerCase()
      // Only own entries count: a bare `TOKEN_OVERRIDES[key]` would also find
      // inherited members such as "constructor" and return a function.
      const override = Object.prototype.hasOwnProperty.call(TOKEN_OVERRIDES, key)
        ? TOKEN_OVERRIDES[key]
        : undefined
      return override ?? `${token.charAt(0).toUpperCase()}${token.slice(1).toLowerCase()}`
    })
    .join(" ")
}
|
| 26 |
+
|
| 27 |
+
/**
 * Formats a fraction as a percentage string.
 *
 * @param value Fraction to format (0.5 means 50%); may be missing.
 * @param digits Number of decimal places to keep.
 * @returns The percentage with a trailing "%", or "N/A" for missing or
 *   non-finite input.
 */
export function formatPercent(value: number | null | undefined, digits = 0) {
  if (typeof value !== "number" || !Number.isFinite(value)) {
    return "N/A"
  }

  const scaled = value * 100
  return scaled.toFixed(digits) + "%"
}
|
| 34 |
+
|
| 35 |
+
/**
 * Formats a signal magnitude compactly.
 *
 * Values with an absolute value of 100 or more keep at most one decimal.
 * Smaller values are fixed to `digits` decimals and then have trailing
 * fractional zeros removed.
 *
 * @param value Number to format; may be missing.
 * @param digits Maximum decimal places for values below 100.
 * @returns The formatted number, or "N/A" for missing or non-finite input.
 */
export function formatSignalNumber(value: number | null | undefined, digits = 3) {
  if (value == null || !Number.isFinite(value)) {
    return "N/A"
  }

  if (Math.abs(value) >= 100) {
    return value.toFixed(1).replace(/\.0$/, "")
  }

  const fixed = value.toFixed(digits)
  // Strip zeros only from a fractional part. Without this guard an integer
  // string such as "10" (digits = 0) would be truncated to "1".
  const trimmed = fixed.includes(".") ? fixed.replace(/0+$/, "").replace(/\.$/, "") : fixed
  // A tiny negative rounds to "-0"; show it as plain zero.
  return trimmed === "-0" ? "0" : trimmed
}
|
| 46 |
+
|
| 47 |
+
/**
 * Converts a dotted field path into a human-readable label.
 *
 * The first matching entry of `FIELD_PREFIXES` is removed from the start of
 * the path, the remainder is split on dots, empty segments are dropped, and
 * each segment is title-cased.
 *
 * @param path Dotted field path.
 * @returns The title-cased segments joined with " / ".
 */
export function formatFieldLabel(path: string) {
  const matchedPrefix = FIELD_PREFIXES.find((prefix) => path.startsWith(prefix))
  const remainder = matchedPrefix === undefined ? path : path.slice(matchedPrefix.length)

  const segments = remainder.split(".").filter(Boolean)
  return segments.map(titleCaseSegment).join(" / ")
}
|
| 62 |
+
|
| 63 |
+
/**
 * Formats a missing setup field name for display by title-casing it as a
 * single segment (dots are not treated as separators here).
 *
 * @param field Raw field name such as "max_tokens".
 * @returns The title-cased label.
 */
export function formatMissingField(field: string) {
  const label = titleCaseSegment(field)
  return label
}
|
| 66 |
+
|
| 67 |
+
/**
 * Renders an arbitrary setup value as display text.
 *
 * - `null` / `undefined` become "(unspecified)".
 * - Strings are returned unchanged.
 * - Numbers, booleans and bigints use `String(value)`.
 * - Everything else is JSON-serialized, falling back to `String(value)` when
 *   serialization throws or produces no output.
 *
 * @param value Value of unknown shape.
 * @returns A string in every case.
 */
export function formatSignalValue(value: unknown) {
  if (value == null) {
    return "(unspecified)"
  }

  switch (typeof value) {
    case "string":
      return value
    case "number":
    case "boolean":
    case "bigint":
      return String(value)
  }

  try {
    // JSON.stringify yields undefined for functions and symbols; fall back to
    // String() so the result is always a string.
    return JSON.stringify(value) ?? String(value)
  } catch {
    // Serialization can throw, e.g. on circular structures.
    return String(value)
  }
}
|
| 86 |
+
|
| 87 |
+
/**
 * Summarizes which setup fields differ, for use inside a tooltip sentence.
 *
 * @param fields Differing setup fields; only each entry's `field` is read.
 * @param limit Maximum number of field labels to spell out.
 * @returns "setup fields" when the list is empty; otherwise the first
 *   `limit` labels joined with ", ", followed by " +N" when N more exist.
 */
export function formatDifferingFields(fields: DifferingSetupField[], limit = 2) {
  if (fields.length === 0) {
    return "setup fields"
  }

  const shownLabels = fields.slice(0, limit).map((entry) => formatMissingField(entry.field))
  const summary = shownLabels.join(", ")
  const hiddenCount = fields.length - shownLabels.length

  if (hiddenCount > 0) {
    return `${summary} +${hiddenCount}`
  }
  return summary
}
|
| 96 |
+
|
| 97 |
+
/**
 * Estimates how many documentation fields are populated.
 *
 * When per-field scores are available they are summed and rounded. With no
 * per-field scores, the count is derived from the overall completeness score
 * multiplied by the number of fields evaluated.
 *
 * @param completeness Reporting-completeness annotation.
 * @returns The rounded populated-field count.
 */
export function getCompletenessPopulatedCount(completeness: ReportingCompleteness) {
  const { field_scores: fieldScores } = completeness

  if (fieldScores.length === 0) {
    return Math.round(completeness.completeness_score * completeness.total_fields_evaluated)
  }

  let total = 0
  for (const entry of fieldScores) {
    total += entry.score
  }
  return Math.round(total)
}
|
components/signals/signals-row-badges.tsx
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"use client"
|
| 2 |
+
|
| 3 |
+
import type { RowAnnotations } from "@/lib/backend-artifacts"
|
| 4 |
+
import { cn } from "@/lib/utils"
|
| 5 |
+
import { CrossPartyDivergenceBadge } from "./cross-party-divergence-badge"
|
| 6 |
+
import { ProvenanceBadge } from "./provenance-badge"
|
| 7 |
+
import { ReproducibilityBadge } from "./reproducibility-badge"
|
| 8 |
+
import { VariantDivergenceBadge } from "./variant-divergence-badge"
|
| 9 |
+
|
| 10 |
+
/**
 * Renders the signal badges for one result row.
 *
 * `variant` selects which badges are eligible:
 * - "full" (default): reproducibility, provenance, variant divergence and
 *   cross-party divergence.
 * - "cell": only the two divergence badges, for multi-metric matrix cells.
 * - "row": only reproducibility and provenance, for a row header paired with
 *   "cell" columns.
 *
 * Nothing is rendered when `annotations` is missing or when none of the
 * eligible signals is active. With `hideOnMobile` (default) the wrapper is
 * hidden below the `md` breakpoint.
 */
export function SignalsRowBadges({
  annotations,
  className,
  hideOnMobile = true,
  variant = "full",
}: {
  annotations?: RowAnnotations | null
  className?: string
  hideOnMobile?: boolean
  variant?: "full" | "cell" | "row"
}) {
  if (!annotations) {
    return null
  }

  const showRowLevel = ["full", "row"].includes(variant)
  const showCellLevel = ["full", "cell"].includes(variant)

  const {
    reproducibility_gap: reproducibilityGap,
    provenance,
    variant_divergence: variantDivergence,
    cross_party_divergence: crossPartyDivergence,
  } = annotations

  const rowSignalActive =
    showRowLevel &&
    (Boolean(reproducibilityGap?.has_reproducibility_gap) ||
      (provenance != null && provenance.source_type !== "unspecified"))
  const cellSignalActive =
    showCellLevel &&
    (Boolean(variantDivergence?.has_variant_divergence) ||
      Boolean(crossPartyDivergence?.has_cross_party_divergence))

  // Skip the wrapper entirely so an empty flex row adds no spacing.
  if (!rowSignalActive && !cellSignalActive) {
    return null
  }

  const wrapperClasses = cn(
    "mt-1.5 flex flex-wrap justify-end gap-1.5",
    hideOnMobile && "hidden md:flex",
    className
  )

  return (
    <div className={wrapperClasses}>
      {showRowLevel && <ReproducibilityBadge gap={reproducibilityGap} />}
      {showRowLevel && <ProvenanceBadge provenance={provenance} />}
      {showCellLevel && <VariantDivergenceBadge divergence={variantDivergence} />}
      {showCellLevel && <CrossPartyDivergenceBadge divergence={crossPartyDivergence} />}
    </div>
  )
}
|
components/signals/variant-divergence-badge.tsx
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"use client"
|
| 2 |
+
|
| 3 |
+
import { GitCompareArrows } from "lucide-react"
|
| 4 |
+
|
| 5 |
+
import { useAudienceMode } from "@/components/audience-mode-provider"
|
| 6 |
+
import { Badge } from "@/components/ui/badge"
|
| 7 |
+
import type { VariantDivergence } from "@/lib/backend-artifacts"
|
| 8 |
+
import { cn } from "@/lib/utils"
|
| 9 |
+
import { formatDifferingFields, formatSignalNumber } from "./signal-utils"
|
| 10 |
+
import { SignalTooltip } from "./signal-tooltip"
|
| 11 |
+
|
| 12 |
+
/**
 * Badge shown when scores for the same group diverge across setups.
 *
 * Renders nothing unless `divergence.has_variant_divergence` is truthy. In
 * research mode the tooltip reports the divergence magnitude and the
 * differing setup fields; otherwise it gives a plain-language explanation.
 */
export function VariantDivergenceBadge({
  divergence,
  className,
}: {
  divergence?: VariantDivergence | null
  className?: string
}) {
  // Hook is called before the early return so the hook order stays stable.
  const { mode } = useAudienceMode()

  if (!divergence?.has_variant_divergence) {
    return null
  }

  const isResearchView = mode === "research"
  const magnitude = formatSignalNumber(divergence.divergence_magnitude)
  const fields = formatDifferingFields(divergence.differing_setup_fields)

  let tooltip: string
  if (isResearchView) {
    tooltip = `Scores diverge by ${magnitude} across different setups: ${fields}.`
  } else {
    tooltip = "Different runs of this evaluation produced different scores, so the setup matters."
  }

  const badgeClasses = cn(
    "border-rose-300 bg-rose-50 text-rose-900 dark:border-rose-900/60 dark:bg-rose-950/35 dark:text-rose-100",
    className
  )

  return (
    <SignalTooltip content={tooltip}>
      <Badge variant="outline" className={badgeClasses}>
        <GitCompareArrows className="h-3 w-3" />
        {isResearchView ? "Variant divergence" : "Score depends on setup"}
      </Badge>
    </SignalTooltip>
  )
}
|
docs/INTERPRETIVE_SIGNALS.md
ADDED
|
@@ -0,0 +1,622 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# EvalCards interpretive signals — frontend implementation spec
|
| 2 |
+
|
| 3 |
+
**Status:** ready to implement. Backend ships in `evaleval/eval_cards_backend_pipeline` PR #1 (merged `b05323c`). All field shapes below are stable and covered by the backend's test suite.
|
| 4 |
+
|
| 5 |
+
**Companion docs:**
|
| 6 |
+
- Spec source of truth: *EvalCards Interpretive Signals v1.0* (Anka Reuel, Stanford). Section refs (§3, §4, …) below point at that doc.
|
| 7 |
+
- Open backend questions: [evaleval/eval_cards_backend_pipeline#2](https://github.com/evaleval/eval_cards_backend_pipeline/issues/2). None block frontend work — they may shift wording, not shape.
|
| 8 |
+
|
| 9 |
+
---
|
| 10 |
+
|
| 11 |
+
## 0. What this PR does at a glance
|
| 12 |
+
|
| 13 |
+
The backend now annotates evaluation records with four interpretive signals:
|
| 14 |
+
|
| 15 |
+
1. **Reproducibility gap** — *per row.* Was the evaluation documented well enough to be re-run? Surfaced as a missing-fields list (e.g. "missing `max_tokens`").
|
| 16 |
+
2. **Reporting completeness** — *per benchmark.* What fraction of EvalCards-required documentation fields are populated? Surfaced as a `[0, 1]` score with a missing-field breakdown.
|
| 17 |
+
3. **Provenance** — *per row.* Who reported this score (first-party / third-party / collaborative / unspecified), and is it the only source for this `(model, benchmark, metric)` group?
|
| 18 |
+
4. **Comparability** — *per `(model, benchmark, metric)` group.* Two flavors: **variant divergence** (same model, same benchmark, different setups → diverging scores) and **cross-party divergence** (different orgs reporting → diverging scores).
|
| 19 |
+
|
| 20 |
+
Plus a corpus-level rollup file (`corpus-aggregates.json`) for a stratified analytics page.
|
| 21 |
+
|
| 22 |
+
The frontend's job: surface these signals **in three places** — row-level badges, per-eval / per-model summary panels, and a corpus dashboard view.
|
| 23 |
+
|
| 24 |
+
---
|
| 25 |
+
|
| 26 |
+
## 1. Where the new data lives
|
| 27 |
+
|
| 28 |
+
All fields are new additions to existing artifacts. No artifact is removed or reshaped.
|
| 29 |
+
|
| 30 |
+
| Artifact | New fields |
|
| 31 |
+
|---|---|
|
| 32 |
+
| `evals/{id}.json` (`HFEvalDetail`) | Per-row `evalcards.annotations` block on every `metrics[].model_results[]` and `subtasks[…].metrics[].model_results[]`. Plus eval-root `evalcards.annotations.reporting_completeness`, `evalcards.annotations.benchmark_comparability`, and three top-level summaries: `reproducibility_summary`, `provenance_summary`, `comparability_summary`. |
|
| 33 |
+
| `models/{id}.json` (`HFModelDetail`) | Per-row `evalcards.annotations` block on every `hierarchy_by_category[*][*].metrics[].model_results[]`. Plus three top-level summaries scoped to that model. |
|
| 34 |
+
| `eval-list.json` / `eval-list-lite.json` (`HFEvalListEntry`) | Three summaries per entry. |
|
| 35 |
+
| `model-cards.json` / `model-cards-lite.json` (`HFModelCardEntry`) | Three summaries per entry. |
|
| 36 |
+
| `eval-hierarchy.json` (`EvalHierarchy`) | Each family node and leaf node carries the three summaries (aggregated over evals under it). |
|
| 37 |
+
| **`corpus-aggregates.json` (NEW FILE)** | Stratified rollups for paper / dashboard use. |
|
| 38 |
+
| `manifest.json` | New entry in `summary_artifacts`: `corpus_aggregates: "corpus-aggregates.json"`. |
|
| 39 |
+
|
| 40 |
+
`signal_version` (currently `"1.0"`) is present on every annotation. Treat it as opaque; surface only in admin/debug.
|
| 41 |
+
|
| 42 |
+
---
|
| 43 |
+
|
| 44 |
+
## 2. TypeScript types to add
|
| 45 |
+
|
| 46 |
+
Add to `lib/backend-artifacts.ts` (preferred — these are pipeline contract types):
|
| 47 |
+
|
| 48 |
+
```ts
|
| 49 |
+
// Spec §3
|
| 50 |
+
export interface ReproducibilityGap {
|
| 51 |
+
has_reproducibility_gap: boolean
|
| 52 |
+
missing_fields: string[] // e.g. ["max_tokens"]
|
| 53 |
+
required_field_count: number // 2 base + 2 if agentic on current runtime
|
| 54 |
+
populated_field_count: number
|
| 55 |
+
signal_version: string
|
| 56 |
+
}
|
| 57 |
+
|
| 58 |
+
// Spec §5
|
| 59 |
+
export type ProvenanceSourceType =
|
| 60 |
+
| "first_party"
|
| 61 |
+
| "third_party"
|
| 62 |
+
| "collaborative"
|
| 63 |
+
| "unspecified"
|
| 64 |
+
|
| 65 |
+
export interface Provenance {
|
| 66 |
+
source_type: ProvenanceSourceType
|
| 67 |
+
is_multi_source: boolean
|
| 68 |
+
first_party_only: boolean // see §6.1 below for caveat
|
| 69 |
+
distinct_reporting_organizations: number
|
| 70 |
+
signal_version: string
|
| 71 |
+
}
|
| 72 |
+
|
| 73 |
+
// Spec §6.1
|
| 74 |
+
export interface VariantDivergence {
|
| 75 |
+
has_variant_divergence: boolean
|
| 76 |
+
group_id: string // "{model_route_id}__{metric_summary_id}"
|
| 77 |
+
divergence_magnitude: number
|
| 78 |
+
threshold_used: number
|
| 79 |
+
threshold_basis:
|
| 80 |
+
| "proportion_or_continuous_normalized"
|
| 81 |
+
| "percent"
|
| 82 |
+
| "range_5pct"
|
| 83 |
+
| "fallback_default"
|
| 84 |
+
differing_setup_fields: Array<{ field: string; values: unknown[] }>
|
| 85 |
+
scores_in_group: number[]
|
| 86 |
+
this_triple_score: number | null // this row's score within the group
|
| 87 |
+
triple_count_in_group: number
|
| 88 |
+
score_scale_anomaly: boolean
|
| 89 |
+
group_variant_breakdown: Array<{ variant_key: string; row_count: number }>
|
| 90 |
+
signal_version: string
|
| 91 |
+
}
|
| 92 |
+
|
| 93 |
+
// Spec §6.2
|
| 94 |
+
export interface CrossPartyDivergence {
|
| 95 |
+
has_cross_party_divergence: boolean
|
| 96 |
+
group_id: string
|
| 97 |
+
divergence_magnitude: number
|
| 98 |
+
threshold_used: number
|
| 99 |
+
threshold_basis: VariantDivergence["threshold_basis"]
|
| 100 |
+
scores_by_organization: Record<string, number> // display org name → score
|
| 101 |
+
differing_setup_fields: Array<{ field: string; values: unknown[] }>
|
| 102 |
+
organization_count: number
|
| 103 |
+
group_variant_breakdown: Array<{ variant_key: string; row_count: number }>
|
| 104 |
+
signal_version: string
|
| 105 |
+
}
|
| 106 |
+
|
| 107 |
+
// Per-row annotation block (carried on every model_result row)
|
| 108 |
+
export interface RowAnnotations {
|
| 109 |
+
reproducibility_gap: ReproducibilityGap | null
|
| 110 |
+
provenance: Provenance | null
|
| 111 |
+
variant_divergence: VariantDivergence | null
|
| 112 |
+
cross_party_divergence: CrossPartyDivergence | null
|
| 113 |
+
}
|
| 114 |
+
|
| 115 |
+
// Spec §4
|
| 116 |
+
export interface ReportingCompleteness {
|
| 117 |
+
completeness_score: number // [0, 1]
|
| 118 |
+
total_fields_evaluated: number
|
| 119 |
+
missing_required_fields: string[] // dotted paths
|
| 120 |
+
partial_fields: Array<{
|
| 121 |
+
field_path: string
|
| 122 |
+
score: number // (0, 1) — strictly between
|
| 123 |
+
populated_subitems: number
|
| 124 |
+
total_subitems: number
|
| 125 |
+
}>
|
| 126 |
+
field_scores: Array<{
|
| 127 |
+
field_path: string
|
| 128 |
+
coverage_type: "full" | "partial" | "reserved"
|
| 129 |
+
score: number // [0, 1]
|
| 130 |
+
}>
|
| 131 |
+
signal_version: string
|
| 132 |
+
}
|
| 133 |
+
|
| 134 |
+
export interface BenchmarkComparability {
|
| 135 |
+
variant_divergence_groups: Array<{
|
| 136 |
+
group_id: string
|
| 137 |
+
model_route_id: string
|
| 138 |
+
divergence_magnitude: number
|
| 139 |
+
threshold_used: number
|
| 140 |
+
threshold_basis: VariantDivergence["threshold_basis"]
|
| 141 |
+
differing_setup_fields: VariantDivergence["differing_setup_fields"]
|
| 142 |
+
}>
|
| 143 |
+
cross_party_divergence_groups: Array<{
|
| 144 |
+
group_id: string
|
| 145 |
+
model_route_id: string
|
| 146 |
+
divergence_magnitude: number
|
| 147 |
+
threshold_used: number
|
| 148 |
+
threshold_basis: VariantDivergence["threshold_basis"]
|
| 149 |
+
scores_by_organization: Record<string, number>
|
| 150 |
+
differing_setup_fields: VariantDivergence["differing_setup_fields"]
|
| 151 |
+
}>
|
| 152 |
+
}
|
| 153 |
+
|
| 154 |
+
// Eval-root or model-root annotation block
|
| 155 |
+
export interface EvalcardsAnnotations {
|
| 156 |
+
reporting_completeness?: ReportingCompleteness
|
| 157 |
+
benchmark_comparability?: BenchmarkComparability
|
| 158 |
+
}
|
| 159 |
+
|
| 160 |
+
// Top-level summary blocks (present on eval-list / model-cards / eval / model / hierarchy nodes)
|
| 161 |
+
export interface ReproducibilitySummary {
|
| 162 |
+
results_total: number
|
| 163 |
+
has_reproducibility_gap_count: number
|
| 164 |
+
populated_ratio_avg: number | null // null when results_total == 0
|
| 165 |
+
}
|
| 166 |
+
|
| 167 |
+
export interface ProvenanceSummary {
|
| 168 |
+
total_results: number
|
| 169 |
+
total_groups: number
|
| 170 |
+
multi_source_groups: number
|
| 171 |
+
first_party_only_groups: number
|
| 172 |
+
source_type_distribution: Record<ProvenanceSourceType, number>
|
| 173 |
+
}
|
| 174 |
+
|
| 175 |
+
export interface ComparabilitySummary {
|
| 176 |
+
total_groups: number
|
| 177 |
+
groups_with_variant_check: number // eligible groups (>=2 rows, differing setups, >=2 scored)
|
| 178 |
+
groups_with_cross_party_check: number // eligible groups (>=2 named orgs)
|
| 179 |
+
variant_divergent_count: number
|
| 180 |
+
cross_party_divergent_count: number
|
| 181 |
+
}
|
| 182 |
+
|
| 183 |
+
export interface SignalSummaries {
|
| 184 |
+
reproducibility_summary?: ReproducibilitySummary
|
| 185 |
+
provenance_summary?: ProvenanceSummary
|
| 186 |
+
comparability_summary?: ComparabilitySummary
|
| 187 |
+
}
|
| 188 |
+
|
| 189 |
+
// corpus-aggregates.json
|
| 190 |
+
export interface CorpusAggregates {
|
| 191 |
+
generated_at: string
|
| 192 |
+
signal_version: string
|
| 193 |
+
stratification_dimensions: ["category"]
|
| 194 |
+
reproducibility: Stratified<ReproducibilityCorpusBlock>
|
| 195 |
+
completeness: Stratified<CompletenessCorpusBlock>
|
| 196 |
+
provenance: Stratified<ProvenanceCorpusBlock>
|
| 197 |
+
comparability: Stratified<ComparabilityCorpusBlock>
|
| 198 |
+
}
|
| 199 |
+
|
| 200 |
+
export interface Stratified<T> {
|
| 201 |
+
overall: T
|
| 202 |
+
by_category: Record<string, T> // categories: agentic | general | knowledge | reasoning | safety | other
|
| 203 |
+
}
|
| 204 |
+
|
| 205 |
+
export interface ReproducibilityCorpusBlock {
|
| 206 |
+
total_triples: number
|
| 207 |
+
triples_with_reproducibility_gap: number
|
| 208 |
+
reproducibility_gap_rate: number | null
|
| 209 |
+
agentic_triples: number
|
| 210 |
+
per_field_missingness: Record<string, {
|
| 211 |
+
missing_count: number
|
| 212 |
+
missing_rate: number | null
|
| 213 |
+
denominator: "all_triples" | "agentic_only"
|
| 214 |
+
denominator_count: number
|
| 215 |
+
}>
|
| 216 |
+
}
|
| 217 |
+
|
| 218 |
+
export interface CompletenessCorpusBlock {
|
| 219 |
+
total_benchmarks: number
|
| 220 |
+
completeness_score_mean: number | null
|
| 221 |
+
completeness_score_median: number | null
|
| 222 |
+
per_field_population: Record<string, {
|
| 223 |
+
mean_score: number
|
| 224 |
+
populated_rate: number
|
| 225 |
+
fully_populated_rate: number
|
| 226 |
+
benchmark_count: number
|
| 227 |
+
}>
|
| 228 |
+
}
|
| 229 |
+
|
| 230 |
+
export interface ProvenanceCorpusBlock {
|
| 231 |
+
total_triples: number
|
| 232 |
+
total_groups: number
|
| 233 |
+
multi_source_groups: number
|
| 234 |
+
multi_source_rate: number | null
|
| 235 |
+
first_party_only_groups: number
|
| 236 |
+
first_party_only_rate: number | null
|
| 237 |
+
source_type_distribution: Record<ProvenanceSourceType, number>
|
| 238 |
+
}
|
| 239 |
+
|
| 240 |
+
export interface ComparabilityCorpusBlock {
|
| 241 |
+
total_groups: number
|
| 242 |
+
variant_eligible_groups: number
|
| 243 |
+
variant_divergent_groups: number
|
| 244 |
+
variant_divergence_rate: number | null
|
| 245 |
+
cross_party_eligible_groups: number
|
| 246 |
+
cross_party_divergent_groups: number
|
| 247 |
+
cross_party_divergence_rate: number | null // commonly null on current corpus
|
| 248 |
+
}
|
| 249 |
+
```
|
| 250 |
+
|
| 251 |
+
Then in `lib/hf-data.ts`:
|
| 252 |
+
|
| 253 |
+
- Extend `HFEvalModelResult` (line ~522) with `evalcards?: { annotations?: RowAnnotations }`.
|
| 254 |
+
- Extend `HFEvalDetail` (line ~556) with `evalcards?: { annotations?: EvalcardsAnnotations }` plus the three summary fields from `SignalSummaries`.
|
| 255 |
+
- Extend `HFEvalListEntry` (line ~475) with `SignalSummaries` fields.
|
| 256 |
+
- Extend `HFModelCardEntry` (line ~439) with `SignalSummaries` fields.
|
| 257 |
+
- Extend `HFModelDetail` (line ~571) with `SignalSummaries` fields.
|
| 258 |
+
- `HFModelHierarchyMetric` (line ~616) needs no change — its `model_results` are already typed as `HFEvalModelResult`, so the per-row annotations propagate automatically once `HFEvalModelResult` is extended.
|
| 259 |
+
|
| 260 |
+
In `EvalHierarchy` types (`lib/backend-artifacts.ts` line ~54), add `SignalSummaries` to both `HierarchyFamily` and `HierarchyBenchmark`.
|
| 261 |
+
|
| 262 |
+
All fields are **optional** at the type level — older cached snapshots won't have them, and the frontend should render gracefully when they're absent.
|
| 263 |
+
|
| 264 |
+
---
|
| 265 |
+
|
| 266 |
+
## 3. Data plumbing
|
| 267 |
+
|
| 268 |
+
### 3.1 New fetcher + API route for corpus aggregates
|
| 269 |
+
|
| 270 |
+
In `lib/hf-data.ts`, add after the existing fetchers (~line 866):
|
| 271 |
+
|
| 272 |
+
```ts
|
| 273 |
+
export async function fetchCorpusAggregates(): Promise<CorpusAggregates | null> {
|
| 274 |
+
return fetchHFJsonSafe<CorpusAggregates>("corpus-aggregates.json")
|
| 275 |
+
}
|
| 276 |
+
```
|
| 277 |
+
|
| 278 |
+
Add to `scripts/cache-hf-data.mjs` `CACHE_ROOT_FILES` array: `"corpus-aggregates.json"`. (Mark it optional in `OPTIONAL_CACHE_ROOT_FILES` if shipping while the HF dataset upload is still rolling — once the backend pipeline next runs against the dataset, the file will appear.)
|
| 279 |
+
|
| 280 |
+
Create `app/api/corpus-aggregates/route.ts`:
|
| 281 |
+
|
| 282 |
+
```ts
|
| 283 |
+
import { NextResponse } from "next/server"
|
| 284 |
+
import { fetchCorpusAggregates } from "@/lib/hf-data"
|
| 285 |
+
|
| 286 |
+
export async function GET() {
|
| 287 |
+
const aggregates = await fetchCorpusAggregates()
|
| 288 |
+
if (!aggregates) {
|
| 289 |
+
return NextResponse.json({ error: "Corpus aggregates not available" }, { status: 404 })
|
| 290 |
+
}
|
| 291 |
+
return NextResponse.json(aggregates)
|
| 292 |
+
}
|
| 293 |
+
```
|
| 294 |
+
|
| 295 |
+
### 3.2 Rest of plumbing is automatic
|
| 296 |
+
|
| 297 |
+
Existing fetchers (`fetchEvalDetail`, `fetchModelDetail`, `fetchEvalList`, `fetchModelCardsList`, `fetchEvalHierarchy`) just pull the raw JSON, so the new fields propagate without code changes once the types above are widened.
|
| 298 |
+
|
| 299 |
+
---
|
| 300 |
+
|
| 301 |
+
## 4. UX components to build
|
| 302 |
+
|
| 303 |
+
Build a small set of reusable signal components in `components/signals/`. Each takes one of the typed shapes above and renders a badge / panel. This keeps signal rendering consistent across `eval-detail.tsx`, `benchmark-detail.tsx`, `model-compare-dialog.tsx`, and the new corpus dashboard.
|
| 304 |
+
|
| 305 |
+
```
|
| 306 |
+
components/signals/
|
| 307 |
+
├── reproducibility-badge.tsx
|
| 308 |
+
├── provenance-badge.tsx // already partially exists in benchmark-detail.tsx — see §4.2
|
| 309 |
+
├── variant-divergence-badge.tsx
|
| 310 |
+
├── cross-party-divergence-badge.tsx
|
| 311 |
+
├── reproducibility-panel.tsx // detail view — full missing-fields list
|
| 312 |
+
├── completeness-panel.tsx // detail view — score bar + missing-field list
|
| 313 |
+
├── comparability-panel.tsx // detail view — divergent groups list
|
| 314 |
+
├── signals-row-badges.tsx // composite: renders all four row-level badges with proper spacing
|
| 315 |
+
└── signal-tooltip.tsx // shared tooltip primitive
|
| 316 |
+
```
|
| 317 |
+
|
| 318 |
+
All badges should follow the existing tone conventions used by `getRelationshipBadgeTone` ([components/benchmark-detail.tsx:289](../components/benchmark-detail.tsx#L289)) and the `Badge` primitive in [components/ui/badge.tsx](../components/ui/badge.tsx).
|
| 319 |
+
|
| 320 |
+
### 4.1 Row-level badges — placement
|
| 321 |
+
|
| 322 |
+
Insert `<SignalsRowBadges annotations={modelResult.evalcards?.annotations} />` next to the score cell in:
|
| 323 |
+
|
| 324 |
+
- **Eval detail leaderboard table** — [components/eval-detail.tsx:869-871](../components/eval-detail.tsx#L869-L871) (the `<TableCell className="text-right">` containing the score). Render badges below the score on a new line for desktop, hidden on mobile.
|
| 325 |
+
- **Benchmark detail rows** — `components/benchmark-detail.tsx` renders score rows in several places (search for `formatRawScoreValue`); insert the same component.
|
| 326 |
+
- **Model compare dialog** — [components/model-compare-dialog.tsx](../components/model-compare-dialog.tsx) score columns.
|
| 327 |
+
|
| 328 |
+
**Display rules — only badge for actionable states.** Silence is meaningful here.
|
| 329 |
+
|
| 330 |
+
| Signal | Show badge when | Hide when |
|
| 331 |
+
|---|---|---|
|
| 332 |
+
| Reproducibility | `has_reproducibility_gap === true` | gap=false, or annotation absent |
|
| 333 |
+
| Provenance | `source_type` ∈ {`first_party`, `third_party`, `collaborative`} | `source_type === "unspecified"` |
|
| 334 |
+
| Variant divergence | `variant_divergence !== null && has_variant_divergence === true` | null (not applicable) or false (checked, fine) |
|
| 335 |
+
| Cross-party divergence | `cross_party_divergence !== null && has_cross_party_divergence === true` | null (almost always on current corpus) or false |
|
| 336 |
+
|
| 337 |
+
`has_*: false` means "we checked and it's fine" — silent success. `null` means "not applicable / not enough data" — also silent. **Only divergent / gap-positive states warrant pixels.**
|
| 338 |
+
|
| 339 |
+
**Dedup rule.** `variant_divergence` and `cross_party_divergence` are duplicated onto every row in the same group. If you render three rows from the same `group_id`, render the divergence badge on each row but the *expanded panel* (§4.4) only once at the group header.
|
| 340 |
+
|
| 341 |
+
### 4.2 Provenance badge — reuse what's there
|
| 342 |
+
|
| 343 |
+
[components/benchmark-detail.tsx:262-302](../components/benchmark-detail.tsx#L262-L302) already has `getRelationshipShortLabel` and `getRelationshipBadgeTone`. Extract these into `components/signals/provenance-badge.tsx` and import back into `benchmark-detail.tsx`. The new badge should **also** consume the new `Provenance` annotation when present (it carries `is_multi_source` and `first_party_only`, which the current implementation derives row-by-row from `source_metadata` alone).
|
| 344 |
+
|
| 345 |
+
When `provenance.first_party_only === true`, show a small, subtle ⚠ indicator on the first-party badge ("first-party only — no independent replication"). This is the headline use of the signal for policy-mode readers.
|
| 346 |
+
|
| 347 |
+
### 4.3 Reproducibility badge — content rules
|
| 348 |
+
|
| 349 |
+
Tooltip content depends on audience mode (`useAudienceMode()` from [components/audience-mode-provider.tsx:40](../components/audience-mode-provider.tsx#L40)):
|
| 350 |
+
|
| 351 |
+
- Research mode: "Setup not fully documented. Missing: `max_tokens`, `eval_plan`."
|
| 352 |
+
- Policy mode: "This score's setup isn't fully documented, so it can't be re-run as-is."
|
| 353 |
+
|
| 354 |
+
Always include the count "{populated_field_count} of {required_field_count} setup fields recorded." Don't hardcode "4 fields" — the active runtime checks 2 base fields (`temperature`, `max_tokens`) plus 2 agentic fields (`eval_plan`, `eval_limits`) when the benchmark is agentic. Read counts off the annotation.
|
| 355 |
+
|
| 356 |
+
### 4.4 Detail panels — placement
|
| 357 |
+
|
| 358 |
+
#### Reproducibility panel
|
| 359 |
+
The existing "Evaluation Provenance" panel in [components/eval-detail.tsx:952-998](../components/eval-detail.tsx#L952-L998) (rendered when a row is expanded) is the right place for the **per-row** reproducibility breakdown. Add a new `DetailPanel` adjacent to it:
|
| 360 |
+
|
| 361 |
+
```tsx
|
| 362 |
+
{rowAnnotations?.reproducibility_gap && (
|
| 363 |
+
<DetailPanel
|
| 364 |
+
title={isResearchView ? "Reproducibility" : "Re-runnability"}
|
| 365 |
+
subtitle={
|
| 366 |
+
isResearchView
|
| 367 |
+
? "Whether the setup is documented well enough for someone else to re-run."
|
| 368 |
+
: "Whether someone could re-run this evaluation with the information available."
|
| 369 |
+
}
|
| 370 |
+
>
|
| 371 |
+
<MetaRow
|
| 372 |
+
label="Setup fields recorded"
|
| 373 |
+
value={`${rowAnnotations.reproducibility_gap.populated_field_count} of ${rowAnnotations.reproducibility_gap.required_field_count}`}
|
| 374 |
+
/>
|
| 375 |
+
{rowAnnotations.reproducibility_gap.missing_fields.length > 0 && (
|
| 376 |
+
<MetaRow
|
| 377 |
+
label="Missing"
|
| 378 |
+
value={rowAnnotations.reproducibility_gap.missing_fields.join(", ")}
|
| 379 |
+
/>
|
| 380 |
+
)}
|
| 381 |
+
</DetailPanel>
|
| 382 |
+
)}
|
| 383 |
+
```
|
| 384 |
+
|
| 385 |
+
#### Completeness panel
|
| 386 |
+
Render at the **eval-detail header level** (above the leaderboard, below the metric specification card). New `<CompletenessPanel completeness={detail.evalcards?.annotations?.reporting_completeness} />`. UI: progress bar showing `completeness_score`, label "{N} of {M} fields populated" where N = sum of `field_scores[].score` rounded, M = `total_fields_evaluated`. Below: collapsible accordions:
|
| 387 |
+
|
| 388 |
+
- **Missing required fields** (count badge) — list of `missing_required_fields` with friendly labels (see §6.3 for label mapping).
|
| 389 |
+
- **Partially populated** (count badge) — `partial_fields` rendered as "{field}: {populated_subitems}/{total_subitems}".
|
| 390 |
+
|
| 391 |
+
In policy mode, don't show the dotted-path field names — show friendly labels only. In research mode, show both.
|
| 392 |
+
|
| 393 |
+
#### Comparability panel
|
| 394 |
+
Also at eval-detail header level. Sourced from `detail.evalcards?.annotations?.benchmark_comparability`. Render as two collapsibles — "Variant divergence ({count})" and "Cross-party divergence ({count})". Each item should link to the relevant model row (use `model_route_id` from each group entry as anchor — add `id={"row-" + model_route_id}` on the leaderboard row).
|
| 395 |
+
|
| 396 |
+
When both arrays are empty, hide the panel entirely. When `comparability_summary.groups_with_cross_party_check === 0` (the common state), surface a small note: "No third-party reports available for cross-party comparison."
|
| 397 |
+
|
| 398 |
+
### 4.5 Per-eval header chips
|
| 399 |
+
On the eval-detail page header (next to existing "Measures" / "Source dataset" chips around [components/eval-detail.tsx:486-525](../components/eval-detail.tsx#L486-L525)), add a fourth chip when `evalcards.annotations.reporting_completeness` is present:
|
| 400 |
+
|
| 401 |
+
> **Documentation**
|
| 402 |
+
> {round(completeness_score * 100)}%
|
| 403 |
+
|
| 404 |
+
Tooltip: "{N} of {M} EvalCards documentation fields populated for this benchmark."
|
| 405 |
+
|
| 406 |
+
### 4.6 Per-model card chips
|
| 407 |
+
On `components/eval-card.tsx` and the model card pages, add three chips driven by the model-level summaries. Replace the hand-written hint at [components/eval-card.tsx:250](../components/eval-card.tsx#L250) ("Some results lack generation settings; compare scores with care.") with a data-driven version:
|
| 408 |
+
|
| 409 |
+
> {has_reproducibility_gap_count} of {results_total} reported scores aren't fully documented.
|
| 410 |
+
|
| 411 |
+
Show only when `has_reproducibility_gap_count > 0`. The hand-written hint was a placeholder for exactly this signal — wire it up.
|
| 412 |
+
|
| 413 |
+
---
|
| 414 |
+
|
| 415 |
+
## 5. New page: corpus dashboard
|
| 416 |
+
|
| 417 |
+
Add `app/corpus/page.tsx` (linked from main navigation [components/navigation.tsx](../components/navigation.tsx)). Server component that calls `fetchCorpusAggregates()` and renders four sections:
|
| 418 |
+
|
| 419 |
+
### 5.1 Reproducibility section
|
| 420 |
+
- Headline number: `reproducibility_gap_rate` rendered as percentage. Sub-label: "{triples_with_reproducibility_gap} of {total_triples} reported scores."
|
| 421 |
+
- Per-field horizontal bar chart from `per_field_missingness`. **Bar denominator depends on `denominator` field**: agentic-only fields use `agentic_triples`, others use `total_triples`. Label each bar with the denominator type so users understand.
|
| 422 |
+
- Toggle: `overall` ↔ `by_category` (rendered as a small-multiple grid, one panel per category).
|
| 423 |
+
|
| 424 |
+
### 5.2 Completeness section
|
| 425 |
+
- Headline: `completeness_score_mean` (and median) across `total_benchmarks`.
|
| 426 |
+
- Histogram of per-benchmark scores (pull individual benchmark scores from `eval-list.json` `reporting_completeness.completeness_score`, since corpus-aggregates only carries mean/median).
|
| 427 |
+
- Per-field bar chart from `per_field_population` — three bars per field: `mean_score`, `populated_rate`, `fully_populated_rate`. (See §6.7 for which one to highlight per coverage type.)
|
| 428 |
+
|
| 429 |
+
### 5.3 Provenance section
|
| 430 |
+
- Stacked bar of `source_type_distribution` (across all triples).
|
| 431 |
+
- Two ratios: `multi_source_rate`, `first_party_only_rate`. Label both: "% of (model, benchmark, metric) groups."
|
| 432 |
+
|
| 433 |
+
### 5.4 Comparability section
|
| 434 |
+
- Two side-by-side panels: Variant divergence (eligible-aware rate) and Cross-party divergence (often null).
|
| 435 |
+
- **When `cross_party_divergence_rate === null`:** show a "Not enough multi-org coverage to compute" empty state, not "0%". Same for `variant_divergence_rate === null`. This is critical — see §6.5.
|
| 436 |
+
|
| 437 |
+
All sections support a category toggle (research mode shows category breakdowns by default; policy mode shows overall by default).
|
| 438 |
+
|
| 439 |
+
---
|
| 440 |
+
|
| 441 |
+
## 6. Caveats and edge cases (read these before implementing)
|
| 442 |
+
|
| 443 |
+
### 6.1 `first_party_only` semantics
|
| 444 |
+
A row can be `first_party_only: true` even when `is_multi_source: false`. The spec literal: a group with one *named* org reporting first-party gets the badge. **Don't read it as "exclusive coverage"** — read it as "no independent replication." The label suggestion is "First-party only" rather than "Sole source."
|
| 445 |
+
|
| 446 |
+
If `distinct_reporting_organizations === 0` (all rows have null org), `first_party_only` is `false` even when `source_type === "first_party"`. Render the row's source as "First-party (org unspecified)" in research mode; suppress the first-party-only badge.
|
| 447 |
+
|
| 448 |
+
### 6.2 Active reproducibility field set is reduced
|
| 449 |
+
The spec describes four base fields (`temperature`, `top_p`, `max_tokens`, `prompt_template`); the active backend currently checks **only `temperature` and `max_tokens`** plus `eval_plan` / `eval_limits` for agentic benchmarks. **Don't hardcode "4 fields" anywhere.** Always read `required_field_count` off the annotation. This is a deliberate spec-author choice and may revert; the field count is the only stable interface.
|
| 450 |
+
|
| 451 |
+
### 6.3 Missing-field path strings
|
| 452 |
+
`missing_fields` for reproducibility uses bare names (e.g. `"max_tokens"`). `missing_required_fields` for completeness uses dotted paths (e.g. `"autobenchmarkcard.methodology.baseline_results"`). Different conventions, intentional. Build a small label map for completeness paths — paths come from [registry/completeness_fields.json](https://github.com/evaleval/eval_cards_backend_pipeline/blob/main/registry/completeness_fields.json) on the backend repo. Suggested label rules:
|
| 453 |
+
|
| 454 |
+
- Drop the `autobenchmarkcard.` / `eee_eval.` / `evalcards.` prefix.
|
| 455 |
+
- Replace dots with " / " and underscores with spaces, then capitalize the first letter of each segment.
|
| 456 |
+
- Example: `autobenchmarkcard.methodology.baseline_results` → "Methodology / Baseline results".
|
| 457 |
+
|
| 458 |
+
### 6.4 `differing_setup_fields[].values` may contain null and mixed types
|
| 459 |
+
Per spec §6.1.4, `null` is a *distinct* value from any explicit setting (comparing "explicit 2048" to "unspecified" is meaningful). Render `null` as "(unspecified)" rather than the string "null". Numeric, string, boolean, and object values can all appear in the same array; render with `JSON.stringify` for objects, plain text otherwise.
|
| 460 |
+
|
| 461 |
+
### 6.5 `null` rates in comparability are *not* zero
|
| 462 |
+
Eligibility-aware denominators mean `variant_divergence_rate` and `cross_party_divergence_rate` are `null` when no groups were eligible. **Render as "N/A — not enough data" or an empty-state card, never as "0%".** On the current corpus, `cross_party_divergence_rate` will commonly be null (third-party reports are sparse). Treat this as a normal state, not a data-loading error.
|
| 463 |
+
|
| 464 |
+
### 6.6 Score-scale anomaly flag
|
| 465 |
+
`variant_divergence.score_scale_anomaly === true` indicates the metric was declared `proportion` but scores fell outside [0, 1] — usually a metric-normalization bug upstream. Surface as a small "data quality warning" annotation alongside the divergence number; the divergence is still computed but the threshold may not be apples-to-apples.
|
| 466 |
+
|
| 467 |
+
### 6.7 `mean_score` vs `populated_rate` for completeness
|
| 468 |
+
Per-field aggregates expose three numbers. Pick which to display based on `coverage_type`:
|
| 469 |
+
|
| 470 |
+
- **`full` and `reserved` fields** — `mean_score` and `populated_rate` are equal. Show one number labeled "% of benchmarks populating this field."
|
| 471 |
+
- **`partial` fields** — they diverge. `populated_rate` = % of benchmarks with *any* sub-item; `mean_score` = average sub-item population fraction. Show both: "{populated_rate}% have any data, {mean_score}% on average across sub-items."
|
| 472 |
+
|
| 473 |
+
### 6.8 No `computed_at` on per-record annotations
|
| 474 |
+
Only `signal_version` is on each annotation. For "last computed" UI text, use `manifest.json → generated_at` from the existing `BackendManifest`.
|
| 475 |
+
|
| 476 |
+
### 6.9 Stratification categories
|
| 477 |
+
`by_category` keys are: `agentic`, `general`, `knowledge`, `reasoning`, `safety`, `other`. Same set as the existing `category` field on evals — reuse whatever color scheme is currently keyed off `inferCategoryFromBenchmark` ([lib/benchmark-schema.ts](../lib/benchmark-schema.ts)).
|
| 478 |
+
|
| 479 |
+
### 6.10 Annotation block can be `null` or absent
|
| 480 |
+
`evalcards.annotations.{reproducibility_gap,provenance,variant_divergence,cross_party_divergence}` can each be `null` independently, and the entire `evalcards` block may be absent on older cached snapshots. Use optional chaining everywhere; never assume presence. The `RowAnnotations` type intentionally types each subfield as `T | null` (not `T | undefined`) because the backend writes explicit `null`.
|
| 481 |
+
|
| 482 |
+
---
|
| 483 |
+
|
| 484 |
+
## 7. Suggested implementation order
|
| 485 |
+
|
| 486 |
+
1. **Types + plumbing** (1–2 hours): types in `backend-artifacts.ts` + `hf-data.ts`, the `fetchCorpusAggregates` fetcher, the API route, and adding `corpus-aggregates.json` to the cache script. No UI yet.
|
| 487 |
+
2. **Row-level badges** (½ day): build `signals/` directory with the four badge components, the dedup-aware `signals-row-badges.tsx`, and wire into eval-detail and benchmark-detail. This is the most visible win.
|
| 488 |
+
3. **Per-eval completeness panel + comparability panel** (½ day): single benchmark, easy to design around. New `CompletenessPanel` is the headline new UX in this set.
|
| 489 |
+
4. **Per-row reproducibility detail panel** (1–2 hours): drops into the existing expanded row layout.
|
| 490 |
+
5. **Per-eval / per-model header chips + replace the hand-written gap hint** (1–2 hours): wires the summary fields into existing card surfaces.
|
| 491 |
+
6. **Corpus dashboard page** (1–2 days): new route, new components, biggest scope. Defer until 1–5 are live and reviewed.
|
| 492 |
+
|
| 493 |
+
Each step is independently shippable. Steps 1–5 can land before the corpus dashboard is designed.
|
| 494 |
+
|
| 495 |
+
---
|
| 496 |
+
|
| 497 |
+
## 8. Out of scope (don't do these yet)
|
| 498 |
+
|
| 499 |
+
- **Filter / sort the eval list by signal state** ("show only benchmarks with completeness > 0.5"). Wait for the dashboard view to land first; users will tell us which filters they actually want.
|
| 500 |
+
- **Side-by-side score comparison with divergence overlay.** The data supports it (`scores_in_group`, `scores_by_organization`) but the design space is large. Hold off until we see the row-level badges in use.
|
| 501 |
+
- **Recompute / verification UI for missing reproducibility fields.** Backend-side; out of scope here.
|
| 502 |
+
- **Per-instance sample-level badges.** Signals operate at row / benchmark level; sample-level instance data is unaffected.
|
| 503 |
+
|
| 504 |
+
---
|
| 505 |
+
|
| 506 |
+
## 9. Reference: minimal real-shape examples
|
| 507 |
+
|
| 508 |
+
Per-row `evalcards.annotations` with all four signal keys present (reproducibility and provenance populated; both divergence signals `null`, i.e. not applicable):
|
| 509 |
+
|
| 510 |
+
```jsonc
|
| 511 |
+
{
|
| 512 |
+
"reproducibility_gap": {
|
| 513 |
+
"has_reproducibility_gap": true,
|
| 514 |
+
"missing_fields": ["max_tokens"],
|
| 515 |
+
"required_field_count": 2,
|
| 516 |
+
"populated_field_count": 1,
|
| 517 |
+
"signal_version": "1.0"
|
| 518 |
+
},
|
| 519 |
+
"provenance": {
|
| 520 |
+
"source_type": "first_party",
|
| 521 |
+
"is_multi_source": false,
|
| 522 |
+
"first_party_only": true,
|
| 523 |
+
"distinct_reporting_organizations": 1,
|
| 524 |
+
"signal_version": "1.0"
|
| 525 |
+
},
|
| 526 |
+
"variant_divergence": null,
|
| 527 |
+
"cross_party_divergence": null
|
| 528 |
+
}
|
| 529 |
+
```
|
| 530 |
+
|
| 531 |
+
Per-eval `evalcards.annotations` with completeness + comparability:
|
| 532 |
+
|
| 533 |
+
```jsonc
|
| 534 |
+
{
|
| 535 |
+
"reporting_completeness": {
|
| 536 |
+
"completeness_score": 0.62,
|
| 537 |
+
"total_fields_evaluated": 28,
|
| 538 |
+
"missing_required_fields": [
|
| 539 |
+
"autobenchmarkcard.methodology.baseline_results",
|
| 540 |
+
"autobenchmarkcard.methodology.validation",
|
| 541 |
+
"evalcards.preregistration_url"
|
| 542 |
+
],
|
| 543 |
+
"partial_fields": [
|
| 544 |
+
{ "field_path": "autobenchmarkcard.data", "score": 0.5, "populated_subitems": 2, "total_subitems": 4 }
|
| 545 |
+
],
|
| 546 |
+
"field_scores": [/* 28 entries */],
|
| 547 |
+
"signal_version": "1.0"
|
| 548 |
+
},
|
| 549 |
+
"benchmark_comparability": {
|
| 550 |
+
"variant_divergence_groups": [
|
| 551 |
+
{
|
| 552 |
+
"group_id": "openai__gpt-5__hfopenllm_v2_bbh_accuracy",
|
| 553 |
+
"model_route_id": "openai__gpt-5",
|
| 554 |
+
"divergence_magnitude": 0.12,
|
| 555 |
+
"threshold_used": 0.05,
|
| 556 |
+
"threshold_basis": "proportion_or_continuous_normalized",
|
| 557 |
+
"differing_setup_fields": [
|
| 558 |
+
{ "field": "max_tokens", "values": [2048, 4096, 8192] }
|
| 559 |
+
]
|
| 560 |
+
}
|
| 561 |
+
],
|
| 562 |
+
"cross_party_divergence_groups": []
|
| 563 |
+
}
|
| 564 |
+
}
|
| 565 |
+
```
|
| 566 |
+
|
| 567 |
+
Top-level `provenance_summary` example:
|
| 568 |
+
|
| 569 |
+
```jsonc
|
| 570 |
+
{
|
| 571 |
+
"total_results": 142,
|
| 572 |
+
"total_groups": 47,
|
| 573 |
+
"multi_source_groups": 3,
|
| 574 |
+
"first_party_only_groups": 30,
|
| 575 |
+
"source_type_distribution": {
|
| 576 |
+
"first_party": 120,
|
| 577 |
+
"third_party": 18,
|
| 578 |
+
"collaborative": 0,
|
| 579 |
+
"unspecified": 4
|
| 580 |
+
}
|
| 581 |
+
}
|
| 582 |
+
```
|
| 583 |
+
|
| 584 |
+
`corpus-aggregates.json` structure (top of file):
|
| 585 |
+
|
| 586 |
+
```jsonc
|
| 587 |
+
{
|
| 588 |
+
"generated_at": "2026-04-27T...",
|
| 589 |
+
"signal_version": "1.0",
|
| 590 |
+
"stratification_dimensions": ["category"],
|
| 591 |
+
"reproducibility": { "overall": {/* ReproducibilityCorpusBlock */}, "by_category": { "agentic": {...}, "general": {...}, ... } },
|
| 592 |
+
"completeness": { "overall": {/* CompletenessCorpusBlock */}, "by_category": {...} },
|
| 593 |
+
"provenance": { "overall": {/* ProvenanceCorpusBlock */}, "by_category": {...} },
|
| 594 |
+
"comparability": { "overall": {/* ComparabilityCorpusBlock */}, "by_category": {...} }
|
| 595 |
+
}
|
| 596 |
+
```
|
| 597 |
+
|
| 598 |
+
---
|
| 599 |
+
|
| 600 |
+
## 10. Audience-mode wording cheatsheet
|
| 601 |
+
|
| 602 |
+
| Element | Research mode | Policy mode |
|
| 603 |
+
|---|---|---|
|
| 604 |
+
| Reproducibility gap badge | "Reproducibility gap" | "Setup not documented" |
|
| 605 |
+
| Reproducibility tooltip | "Setup not fully documented. Missing: {fields}." | "This score's setup isn't fully documented, so it can't be re-run as-is." |
|
| 606 |
+
| Reproducibility panel title | "Reproducibility" | "Re-runnability" |
|
| 607 |
+
| Completeness chip label | "Documentation" | "Documentation" |
|
| 608 |
+
| Completeness panel title | "Reporting completeness" | "How well is this benchmark documented?" |
|
| 609 |
+
| Provenance: first-party | "1st party" | "Reported by model developer" |
|
| 610 |
+
| Provenance: first-party only | "1st party only — no replication" | "Only the model developer reported this score" |
|
| 611 |
+
| Provenance: third-party | "3rd party" | "Independently reported" |
|
| 612 |
+
| Provenance: collaborative | "Collaborative" | "Joint report" |
|
| 613 |
+
| Variant divergence badge | "Variant divergence" | "Score depends on setup" |
|
| 614 |
+
| Variant divergence tooltip | "Scores diverge by {magnitude} across different setups: {fields}." | "Different runs of this evaluation produced different scores — the setup matters." |
|
| 615 |
+
| Cross-party divergence badge | "Cross-party divergence" | "Sources disagree" |
|
| 616 |
+
| Cross-party divergence tooltip | "Reports diverge by {magnitude} across organizations." | "Different organizations reported different scores for this same model on this same benchmark." |
|
| 617 |
+
|
| 618 |
+
Adjust tone but keep the underlying numbers identical across modes — the data is the same, only the framing changes.
|
| 619 |
+
|
| 620 |
+
---
|
| 621 |
+
|
| 622 |
+
*Last updated 2026-04-27. Maintainer: backend pipeline (eval_cards_backend_pipeline), frontend (general-eval-card). Questions on backend semantics → [eval_cards_backend_pipeline#2](https://github.com/evaleval/eval_cards_backend_pipeline/issues/2). Questions on UX → discuss with @anka-evals + frontend team.*
|
lib/backend-artifacts.ts
CHANGED
|
@@ -2,6 +2,10 @@ export interface BackendManifest {
|
|
| 2 |
generated_at: string
|
| 3 |
config_version: number
|
| 4 |
skipped_configs: string[]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
}
|
| 6 |
|
| 7 |
export interface BackendManifestStatus {
|
|
@@ -14,6 +18,209 @@ export interface BackendManifestStatus {
|
|
| 14 |
pendingRefreshCount: number
|
| 15 |
}
|
| 16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
export interface HierarchyTags {
|
| 18 |
domains: string[]
|
| 19 |
languages: string[]
|
|
@@ -32,16 +239,17 @@ export interface HierarchySlice {
|
|
| 32 |
metrics: HierarchyMetric[]
|
| 33 |
}
|
| 34 |
|
| 35 |
-
export interface HierarchyBenchmark {
|
| 36 |
key: string
|
| 37 |
display_name: string
|
| 38 |
has_card: boolean
|
| 39 |
tags: HierarchyTags
|
| 40 |
slices: HierarchySlice[]
|
| 41 |
metrics: HierarchyMetric[]
|
|
|
|
| 42 |
}
|
| 43 |
|
| 44 |
-
export interface HierarchyComposite {
|
| 45 |
key: string
|
| 46 |
display_name: string
|
| 47 |
has_card: boolean
|
|
@@ -51,17 +259,32 @@ export interface HierarchyComposite {
|
|
| 51 |
summary_eval_ids?: string[]
|
| 52 |
}
|
| 53 |
|
| 54 |
-
export interface
|
| 55 |
key: string
|
| 56 |
display_name: string
|
| 57 |
-
has_card: boolean
|
| 58 |
category: string
|
| 59 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
standalone_benchmarks?: HierarchyBenchmark[]
|
| 61 |
composites?: HierarchyComposite[]
|
| 62 |
benchmarks?: HierarchyBenchmark[]
|
| 63 |
slices?: HierarchySlice[]
|
| 64 |
metrics?: HierarchyMetric[]
|
|
|
|
|
|
|
| 65 |
}
|
| 66 |
|
| 67 |
export interface EvalHierarchyStats {
|
|
@@ -75,7 +298,7 @@ export interface EvalHierarchyStats {
|
|
| 75 |
}
|
| 76 |
|
| 77 |
export interface EvalHierarchy {
|
| 78 |
-
stats: EvalHierarchyStats
|
| 79 |
families: HierarchyFamily[]
|
| 80 |
}
|
| 81 |
|
|
@@ -159,4 +382,4 @@ export interface ComparisonIndex {
|
|
| 159 |
metric_group_order: MetricGroup[]
|
| 160 |
evals: Record<string, ComparisonEvalEntry>
|
| 161 |
by_model: Record<string, Record<string, Record<string, ComparisonByModelEntry>>>
|
| 162 |
-
}
|
|
|
|
| 2 |
generated_at: string
|
| 3 |
config_version: number
|
| 4 |
skipped_configs: string[]
|
| 5 |
+
summary_artifacts?: {
|
| 6 |
+
corpus_aggregates?: string
|
| 7 |
+
[key: string]: string | undefined
|
| 8 |
+
}
|
| 9 |
}
|
| 10 |
|
| 11 |
export interface BackendManifestStatus {
|
|
|
|
| 18 |
pendingRefreshCount: number
|
| 19 |
}
|
| 20 |
|
| 21 |
+
// ---------------------------------------------------------------------------
|
| 22 |
+
// EvalCards interpretive signals v1.0
|
| 23 |
+
// ---------------------------------------------------------------------------
|
| 24 |
+
|
| 25 |
+
export interface ReproducibilityGap {
|
| 26 |
+
has_reproducibility_gap: boolean
|
| 27 |
+
missing_fields: string[]
|
| 28 |
+
required_field_count: number
|
| 29 |
+
populated_field_count: number
|
| 30 |
+
signal_version: string
|
| 31 |
+
}
|
| 32 |
+
|
| 33 |
+
export type ProvenanceSourceType =
|
| 34 |
+
| "first_party"
|
| 35 |
+
| "third_party"
|
| 36 |
+
| "collaborative"
|
| 37 |
+
| "unspecified"
|
| 38 |
+
|
| 39 |
+
export interface Provenance {
|
| 40 |
+
source_type: ProvenanceSourceType
|
| 41 |
+
is_multi_source: boolean
|
| 42 |
+
first_party_only: boolean
|
| 43 |
+
distinct_reporting_organizations: number
|
| 44 |
+
signal_version: string
|
| 45 |
+
}
|
| 46 |
+
|
| 47 |
+
export type DivergenceThresholdBasis =
|
| 48 |
+
| "proportion_or_continuous_normalized"
|
| 49 |
+
| "percent"
|
| 50 |
+
| "range_5pct"
|
| 51 |
+
| "fallback_default"
|
| 52 |
+
|
| 53 |
+
export interface DifferingSetupField {
|
| 54 |
+
field: string
|
| 55 |
+
values: unknown[]
|
| 56 |
+
}
|
| 57 |
+
|
| 58 |
+
export interface VariantDivergence {
|
| 59 |
+
has_variant_divergence: boolean
|
| 60 |
+
group_id: string
|
| 61 |
+
divergence_magnitude: number
|
| 62 |
+
threshold_used: number
|
| 63 |
+
threshold_basis: DivergenceThresholdBasis
|
| 64 |
+
differing_setup_fields: DifferingSetupField[]
|
| 65 |
+
scores_in_group: number[]
|
| 66 |
+
this_triple_score: number | null
|
| 67 |
+
triple_count_in_group: number
|
| 68 |
+
score_scale_anomaly: boolean
|
| 69 |
+
group_variant_breakdown: Array<{ variant_key: string; row_count: number }>
|
| 70 |
+
signal_version: string
|
| 71 |
+
}
|
| 72 |
+
|
| 73 |
+
export interface CrossPartyDivergence {
|
| 74 |
+
has_cross_party_divergence: boolean
|
| 75 |
+
group_id: string
|
| 76 |
+
divergence_magnitude: number
|
| 77 |
+
threshold_used: number
|
| 78 |
+
threshold_basis: DivergenceThresholdBasis
|
| 79 |
+
scores_by_organization: Record<string, number>
|
| 80 |
+
differing_setup_fields: DifferingSetupField[]
|
| 81 |
+
organization_count: number
|
| 82 |
+
group_variant_breakdown: Array<{ variant_key: string; row_count: number }>
|
| 83 |
+
signal_version: string
|
| 84 |
+
}
|
| 85 |
+
|
| 86 |
+
export interface RowAnnotations {
|
| 87 |
+
reproducibility_gap: ReproducibilityGap | null
|
| 88 |
+
provenance: Provenance | null
|
| 89 |
+
variant_divergence: VariantDivergence | null
|
| 90 |
+
cross_party_divergence: CrossPartyDivergence | null
|
| 91 |
+
}
|
| 92 |
+
|
| 93 |
+
export interface ReportingCompleteness {
|
| 94 |
+
completeness_score: number
|
| 95 |
+
total_fields_evaluated: number
|
| 96 |
+
missing_required_fields: string[]
|
| 97 |
+
partial_fields: Array<{
|
| 98 |
+
field_path: string
|
| 99 |
+
score: number
|
| 100 |
+
populated_subitems: number
|
| 101 |
+
total_subitems: number
|
| 102 |
+
}>
|
| 103 |
+
field_scores: Array<{
|
| 104 |
+
field_path: string
|
| 105 |
+
coverage_type: "full" | "partial" | "reserved"
|
| 106 |
+
score: number
|
| 107 |
+
}>
|
| 108 |
+
signal_version: string
|
| 109 |
+
}
|
| 110 |
+
|
| 111 |
+
export interface BenchmarkComparability {
|
| 112 |
+
variant_divergence_groups: Array<{
|
| 113 |
+
group_id: string
|
| 114 |
+
model_route_id: string
|
| 115 |
+
divergence_magnitude: number
|
| 116 |
+
threshold_used: number
|
| 117 |
+
threshold_basis: DivergenceThresholdBasis
|
| 118 |
+
differing_setup_fields: DifferingSetupField[]
|
| 119 |
+
}>
|
| 120 |
+
cross_party_divergence_groups: Array<{
|
| 121 |
+
group_id: string
|
| 122 |
+
model_route_id: string
|
| 123 |
+
divergence_magnitude: number
|
| 124 |
+
threshold_used: number
|
| 125 |
+
threshold_basis: DivergenceThresholdBasis
|
| 126 |
+
scores_by_organization: Record<string, number>
|
| 127 |
+
differing_setup_fields: DifferingSetupField[]
|
| 128 |
+
}>
|
| 129 |
+
}
|
| 130 |
+
|
| 131 |
+
export interface EvalcardsAnnotations {
|
| 132 |
+
reporting_completeness?: ReportingCompleteness
|
| 133 |
+
benchmark_comparability?: BenchmarkComparability
|
| 134 |
+
}
|
| 135 |
+
|
| 136 |
+
export interface ReproducibilitySummary {
|
| 137 |
+
results_total: number
|
| 138 |
+
has_reproducibility_gap_count: number
|
| 139 |
+
populated_ratio_avg: number | null
|
| 140 |
+
}
|
| 141 |
+
|
| 142 |
+
export interface ProvenanceSummary {
|
| 143 |
+
total_results: number
|
| 144 |
+
total_groups: number
|
| 145 |
+
multi_source_groups: number
|
| 146 |
+
first_party_only_groups: number
|
| 147 |
+
source_type_distribution: Record<ProvenanceSourceType, number>
|
| 148 |
+
}
|
| 149 |
+
|
| 150 |
+
export interface ComparabilitySummary {
|
| 151 |
+
total_groups: number
|
| 152 |
+
groups_with_variant_check: number
|
| 153 |
+
groups_with_cross_party_check: number
|
| 154 |
+
variant_divergent_count: number
|
| 155 |
+
cross_party_divergent_count: number
|
| 156 |
+
}
|
| 157 |
+
|
| 158 |
+
export interface SignalSummaries {
|
| 159 |
+
reproducibility_summary?: ReproducibilitySummary
|
| 160 |
+
provenance_summary?: ProvenanceSummary
|
| 161 |
+
comparability_summary?: ComparabilitySummary
|
| 162 |
+
}
|
| 163 |
+
|
| 164 |
+
export interface CorpusAggregates {
|
| 165 |
+
generated_at: string
|
| 166 |
+
signal_version: string
|
| 167 |
+
stratification_dimensions: ["category"]
|
| 168 |
+
reproducibility: Stratified<ReproducibilityCorpusBlock>
|
| 169 |
+
completeness: Stratified<CompletenessCorpusBlock>
|
| 170 |
+
provenance: Stratified<ProvenanceCorpusBlock>
|
| 171 |
+
comparability: Stratified<ComparabilityCorpusBlock>
|
| 172 |
+
}
|
| 173 |
+
|
| 174 |
+
export interface Stratified<T> {
|
| 175 |
+
overall: T
|
| 176 |
+
by_category: Record<string, T>
|
| 177 |
+
}
|
| 178 |
+
|
| 179 |
+
export interface ReproducibilityCorpusBlock {
|
| 180 |
+
total_triples: number
|
| 181 |
+
triples_with_reproducibility_gap: number
|
| 182 |
+
reproducibility_gap_rate: number | null
|
| 183 |
+
agentic_triples: number
|
| 184 |
+
per_field_missingness: Record<string, {
|
| 185 |
+
missing_count: number
|
| 186 |
+
missing_rate: number | null
|
| 187 |
+
denominator: "all_triples" | "agentic_only"
|
| 188 |
+
denominator_count: number
|
| 189 |
+
}>
|
| 190 |
+
}
|
| 191 |
+
|
| 192 |
+
export interface CompletenessCorpusBlock {
|
| 193 |
+
total_benchmarks: number
|
| 194 |
+
completeness_score_mean: number | null
|
| 195 |
+
completeness_score_median: number | null
|
| 196 |
+
per_field_population: Record<string, {
|
| 197 |
+
mean_score: number
|
| 198 |
+
populated_rate: number
|
| 199 |
+
fully_populated_rate: number
|
| 200 |
+
benchmark_count: number
|
| 201 |
+
}>
|
| 202 |
+
}
|
| 203 |
+
|
| 204 |
+
export interface ProvenanceCorpusBlock {
|
| 205 |
+
total_triples: number
|
| 206 |
+
total_groups: number
|
| 207 |
+
multi_source_groups: number
|
| 208 |
+
multi_source_rate: number | null
|
| 209 |
+
first_party_only_groups: number
|
| 210 |
+
first_party_only_rate: number | null
|
| 211 |
+
source_type_distribution: Record<ProvenanceSourceType, number>
|
| 212 |
+
}
|
| 213 |
+
|
| 214 |
+
export interface ComparabilityCorpusBlock {
|
| 215 |
+
total_groups: number
|
| 216 |
+
variant_eligible_groups: number
|
| 217 |
+
variant_divergent_groups: number
|
| 218 |
+
variant_divergence_rate: number | null
|
| 219 |
+
cross_party_eligible_groups: number
|
| 220 |
+
cross_party_divergent_groups: number
|
| 221 |
+
cross_party_divergence_rate: number | null
|
| 222 |
+
}
|
| 223 |
+
|
| 224 |
export interface HierarchyTags {
|
| 225 |
domains: string[]
|
| 226 |
languages: string[]
|
|
|
|
| 239 |
metrics: HierarchyMetric[]
|
| 240 |
}
|
| 241 |
|
| 242 |
+
export interface HierarchyBenchmark extends SignalSummaries {
|
| 243 |
key: string
|
| 244 |
display_name: string
|
| 245 |
has_card: boolean
|
| 246 |
tags: HierarchyTags
|
| 247 |
slices: HierarchySlice[]
|
| 248 |
metrics: HierarchyMetric[]
|
| 249 |
+
summary_eval_ids?: string[]
|
| 250 |
}
|
| 251 |
|
| 252 |
+
export interface HierarchyComposite extends SignalSummaries {
|
| 253 |
key: string
|
| 254 |
display_name: string
|
| 255 |
has_card: boolean
|
|
|
|
| 259 |
summary_eval_ids?: string[]
|
| 260 |
}
|
| 261 |
|
| 262 |
+
export interface HierarchyLeaf extends SignalSummaries {
|
| 263 |
key: string
|
| 264 |
display_name: string
|
|
|
|
| 265 |
category: string
|
| 266 |
+
evals_count?: number
|
| 267 |
+
eval_summary_ids?: string[]
|
| 268 |
+
tags?: Partial<HierarchyTags>
|
| 269 |
+
has_card?: boolean
|
| 270 |
+
}
|
| 271 |
+
|
| 272 |
+
export interface HierarchyFamily extends SignalSummaries {
|
| 273 |
+
key: string
|
| 274 |
+
display_name: string
|
| 275 |
+
has_card?: boolean
|
| 276 |
+
category: string
|
| 277 |
+
tags?: Partial<HierarchyTags>
|
| 278 |
+
evals_count?: number
|
| 279 |
+
eval_summary_ids?: string[]
|
| 280 |
+
// Legacy nested shape (composites + standalone benchmarks)
|
| 281 |
standalone_benchmarks?: HierarchyBenchmark[]
|
| 282 |
composites?: HierarchyComposite[]
|
| 283 |
benchmarks?: HierarchyBenchmark[]
|
| 284 |
slices?: HierarchySlice[]
|
| 285 |
metrics?: HierarchyMetric[]
|
| 286 |
+
// Newer 2-level shape (family → leaf)
|
| 287 |
+
leaves?: HierarchyLeaf[]
|
| 288 |
}
|
| 289 |
|
| 290 |
export interface EvalHierarchyStats {
|
|
|
|
| 298 |
}
|
| 299 |
|
| 300 |
export interface EvalHierarchy {
|
| 301 |
+
stats?: EvalHierarchyStats
|
| 302 |
families: HierarchyFamily[]
|
| 303 |
}
|
| 304 |
|
|
|
|
| 382 |
metric_group_order: MetricGroup[]
|
| 383 |
evals: Record<string, ComparisonEvalEntry>
|
| 384 |
by_model: Record<string, Record<string, Record<string, ComparisonByModelEntry>>>
|
| 385 |
+
}
|
lib/benchmark-schema.ts
CHANGED
|
@@ -3,6 +3,8 @@
|
|
| 3 |
* Based on the evalevalai.com schema structure
|
| 4 |
*/
|
| 5 |
|
|
|
|
|
|
|
| 6 |
export interface BenchmarkEvaluation {
|
| 7 |
schema_version: string
|
| 8 |
eval_summary_id?: string
|
|
@@ -31,6 +33,7 @@ export interface BenchmarkEvaluation {
|
|
| 31 |
generation_config?: GenerationConfig
|
| 32 |
evaluation_results: EvaluationResult[]
|
| 33 |
detailed_evaluation_results_per_samples?: SampleResult[]
|
|
|
|
| 34 |
}
|
| 35 |
|
| 36 |
export interface EvalLibrary {
|
|
@@ -96,6 +99,7 @@ export interface EvaluationResult {
|
|
| 96 |
score_details: ScoreDetails
|
| 97 |
detailed_evaluation_results_url?: string
|
| 98 |
generation_config?: GenerationConfig
|
|
|
|
| 99 |
}
|
| 100 |
|
| 101 |
export interface MetricConfig {
|
|
@@ -208,7 +212,7 @@ export function inferCategoryFromBenchmark(benchmarkName: string): CategoryType
|
|
| 208 |
/**
|
| 209 |
* Aggregate evaluations by model
|
| 210 |
*/
|
| 211 |
-
export interface ModelSummaryCore {
|
| 212 |
model_info: ModelInfo
|
| 213 |
evaluations_by_category: Record<CategoryType, BenchmarkEvaluation[]>
|
| 214 |
total_evaluations: number
|
|
@@ -275,6 +279,9 @@ export interface EvaluationCardData {
|
|
| 275 |
max: number
|
| 276 |
average: number | null
|
| 277 |
}
|
|
|
|
|
|
|
|
|
|
| 278 |
|
| 279 |
// Quick stats
|
| 280 |
top_scores: Array<{
|
|
|
|
| 3 |
* Based on the evalevalai.com schema structure
|
| 4 |
*/
|
| 5 |
|
| 6 |
+
import type { EvalcardsAnnotations, RowAnnotations, SignalSummaries } from "@/lib/backend-artifacts"
|
| 7 |
+
|
| 8 |
export interface BenchmarkEvaluation {
|
| 9 |
schema_version: string
|
| 10 |
eval_summary_id?: string
|
|
|
|
| 33 |
generation_config?: GenerationConfig
|
| 34 |
evaluation_results: EvaluationResult[]
|
| 35 |
detailed_evaluation_results_per_samples?: SampleResult[]
|
| 36 |
+
evalcards?: { annotations?: EvalcardsAnnotations }
|
| 37 |
}
|
| 38 |
|
| 39 |
export interface EvalLibrary {
|
|
|
|
| 99 |
score_details: ScoreDetails
|
| 100 |
detailed_evaluation_results_url?: string
|
| 101 |
generation_config?: GenerationConfig
|
| 102 |
+
evalcards?: { annotations?: RowAnnotations }
|
| 103 |
}
|
| 104 |
|
| 105 |
export interface MetricConfig {
|
|
|
|
| 212 |
/**
|
| 213 |
* Aggregate evaluations by model
|
| 214 |
*/
|
| 215 |
+
export interface ModelSummaryCore extends SignalSummaries {
|
| 216 |
model_info: ModelInfo
|
| 217 |
evaluations_by_category: Record<CategoryType, BenchmarkEvaluation[]>
|
| 218 |
total_evaluations: number
|
|
|
|
| 279 |
max: number
|
| 280 |
average: number | null
|
| 281 |
}
|
| 282 |
+
reproducibility_summary?: SignalSummaries["reproducibility_summary"]
|
| 283 |
+
provenance_summary?: SignalSummaries["provenance_summary"]
|
| 284 |
+
comparability_summary?: SignalSummaries["comparability_summary"]
|
| 285 |
|
| 286 |
// Quick stats
|
| 287 |
top_scores: Array<{
|
lib/dashboard-data-client.ts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
import type { BackendManifestStatus, ComparisonIndex, EvalHierarchy } from "@/lib/backend-artifacts"
|
| 2 |
import type { BenchmarkEvaluationCardData } from "@/components/benchmark-evaluation-card"
|
| 3 |
import type { HFEvalDetail } from "@/lib/hf-data"
|
| 4 |
import type {
|
|
@@ -108,3 +108,7 @@ export function fetchEvalHierarchy() {
|
|
| 108 |
export function fetchComparisonIndex() {
|
| 109 |
return fetchJson<ComparisonIndex>("/api/comparison-index")
|
| 110 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import type { BackendManifestStatus, ComparisonIndex, CorpusAggregates, EvalHierarchy } from "@/lib/backend-artifacts"
|
| 2 |
import type { BenchmarkEvaluationCardData } from "@/components/benchmark-evaluation-card"
|
| 3 |
import type { HFEvalDetail } from "@/lib/hf-data"
|
| 4 |
import type {
|
|
|
|
| 108 |
export function fetchComparisonIndex() {
|
| 109 |
return fetchJson<ComparisonIndex>("/api/comparison-index")
|
| 110 |
}
|
| 111 |
+
|
| 112 |
+
export function fetchCorpusAggregates() {
|
| 113 |
+
return fetchJson<CorpusAggregates>("/api/corpus-aggregates")
|
| 114 |
+
}
|
lib/eval-processing.ts
CHANGED
|
@@ -15,6 +15,7 @@ import type {
|
|
| 15 |
MetricConfig,
|
| 16 |
EvaluationResult,
|
| 17 |
} from './benchmark-schema'
|
|
|
|
| 18 |
import type { ModelEvaluationSummary } from './benchmark-schema'
|
| 19 |
import type { ModelSummaryCore } from './benchmark-schema'
|
| 20 |
import { inferCategoryFromBenchmark } from './benchmark-schema'
|
|
@@ -130,7 +131,7 @@ export interface ModelResultForBenchmark {
|
|
| 130 |
}>
|
| 131 |
}
|
| 132 |
|
| 133 |
-
export interface BenchmarkEvalSummary {
|
| 134 |
evaluation_name: string
|
| 135 |
/** URL-safe slug derived from evaluation_name */
|
| 136 |
evaluation_id: string
|
|
@@ -192,6 +193,7 @@ export interface BenchmarkEvalSummary {
|
|
| 192 |
leaderboard_metrics?: BenchmarkLeaderboardMetric[]
|
| 193 |
/** Matrix rows for multi-metric benchmark leaderboards */
|
| 194 |
leaderboard_rows?: BenchmarkLeaderboardRow[]
|
|
|
|
| 195 |
}
|
| 196 |
|
| 197 |
export interface BenchmarkSummaryMetric {
|
|
@@ -234,6 +236,7 @@ export interface BenchmarkLeaderboardRow {
|
|
| 234 |
source_metadata: SourceMetadata
|
| 235 |
source_data: BenchmarkEvaluation["source_data"]
|
| 236 |
values: Record<string, number | null>
|
|
|
|
| 237 |
metrics_present: number
|
| 238 |
}
|
| 239 |
|
|
@@ -727,6 +730,9 @@ export function createEvaluationCard(
|
|
| 727 |
eval_libraries: Array.from(evalLibraries.values()).sort((a, b) => a.name.localeCompare(b.name)),
|
| 728 |
latest_source_name: latestSourceName,
|
| 729 |
params_billions: Number.isFinite(paramsBillions ?? NaN) ? paramsBillions : null,
|
|
|
|
|
|
|
|
|
|
| 730 |
top_scores: topScores,
|
| 731 |
source_urls: Array.from(sourceUrls),
|
| 732 |
detail_urls: Array.from(detailUrls),
|
|
|
|
| 15 |
MetricConfig,
|
| 16 |
EvaluationResult,
|
| 17 |
} from './benchmark-schema'
|
| 18 |
+
import type { EvalcardsAnnotations, RowAnnotations, SignalSummaries } from './backend-artifacts'
|
| 19 |
import type { ModelEvaluationSummary } from './benchmark-schema'
|
| 20 |
import type { ModelSummaryCore } from './benchmark-schema'
|
| 21 |
import { inferCategoryFromBenchmark } from './benchmark-schema'
|
|
|
|
| 131 |
}>
|
| 132 |
}
|
| 133 |
|
| 134 |
+
export interface BenchmarkEvalSummary extends SignalSummaries {
|
| 135 |
evaluation_name: string
|
| 136 |
/** URL-safe slug derived from evaluation_name */
|
| 137 |
evaluation_id: string
|
|
|
|
| 193 |
leaderboard_metrics?: BenchmarkLeaderboardMetric[]
|
| 194 |
/** Matrix rows for multi-metric benchmark leaderboards */
|
| 195 |
leaderboard_rows?: BenchmarkLeaderboardRow[]
|
| 196 |
+
evalcards?: { annotations?: EvalcardsAnnotations }
|
| 197 |
}
|
| 198 |
|
| 199 |
export interface BenchmarkSummaryMetric {
|
|
|
|
| 236 |
source_metadata: SourceMetadata
|
| 237 |
source_data: BenchmarkEvaluation["source_data"]
|
| 238 |
values: Record<string, number | null>
|
| 239 |
+
annotations_by_metric?: Record<string, RowAnnotations | null | undefined>
|
| 240 |
metrics_present: number
|
| 241 |
}
|
| 242 |
|
|
|
|
| 730 |
eval_libraries: Array.from(evalLibraries.values()).sort((a, b) => a.name.localeCompare(b.name)),
|
| 731 |
latest_source_name: latestSourceName,
|
| 732 |
params_billions: Number.isFinite(paramsBillions ?? NaN) ? paramsBillions : null,
|
| 733 |
+
reproducibility_summary: summary.reproducibility_summary,
|
| 734 |
+
provenance_summary: summary.provenance_summary,
|
| 735 |
+
comparability_summary: summary.comparability_summary,
|
| 736 |
top_scores: topScores,
|
| 737 |
source_urls: Array.from(sourceUrls),
|
| 738 |
detail_urls: Array.from(detailUrls),
|
lib/hf-data.ts
CHANGED
|
@@ -3,7 +3,16 @@ import "server-only"
|
|
| 3 |
import { promises as fs } from "fs"
|
| 4 |
import path from "path"
|
| 5 |
|
| 6 |
-
import type {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
import type {
|
| 8 |
BenchmarkCard,
|
| 9 |
BenchmarkEvaluation,
|
|
@@ -436,7 +445,7 @@ async function fetchHFJsonSafe<T>(relativePath: string): Promise<T | null> {
|
|
| 436 |
// HF dataset types (shapes of JSON files in the HF repo)
|
| 437 |
// ---------------------------------------------------------------------------
|
| 438 |
|
| 439 |
-
export interface HFModelCardEntry {
|
| 440 |
model_family_id: string
|
| 441 |
model_route_id: string
|
| 442 |
model_family_name: string
|
|
@@ -472,7 +481,7 @@ export interface HFModelCardEntry {
|
|
| 472 |
}>
|
| 473 |
}
|
| 474 |
|
| 475 |
-
export interface HFEvalListEntry {
|
| 476 |
eval_summary_id: string
|
| 477 |
benchmark: string
|
| 478 |
canonical_display_name?: string
|
|
@@ -517,6 +526,7 @@ export interface HFEvalListEntry {
|
|
| 517 |
models_count: number
|
| 518 |
top_score: number
|
| 519 |
}>
|
|
|
|
| 520 |
}
|
| 521 |
|
| 522 |
export interface HFEvalModelResult {
|
|
@@ -538,6 +548,7 @@ export interface HFEvalModelResult {
|
|
| 538 |
detailed_evaluation_results_meta?: unknown
|
| 539 |
instance_level_data?: unknown
|
| 540 |
passthrough_top_level_fields?: unknown
|
|
|
|
| 541 |
}
|
| 542 |
|
| 543 |
export interface HFEvalMetric {
|
|
@@ -553,7 +564,7 @@ export interface HFEvalMetric {
|
|
| 553 |
model_results: HFEvalModelResult[]
|
| 554 |
}
|
| 555 |
|
| 556 |
-
export interface HFEvalDetail {
|
| 557 |
eval_summary_id: string
|
| 558 |
benchmark: string
|
| 559 |
canonical_display_name?: string
|
|
@@ -566,9 +577,10 @@ export interface HFEvalDetail {
|
|
| 566 |
benchmark_card: BenchmarkCard | null
|
| 567 |
metrics: HFEvalMetric[]
|
| 568 |
subtasks: unknown[]
|
|
|
|
| 569 |
}
|
| 570 |
|
| 571 |
-
export interface HFModelDetail {
|
| 572 |
model_info: ModelInfo & {
|
| 573 |
family_id?: string
|
| 574 |
family_slug?: string
|
|
@@ -846,13 +858,112 @@ export async function fetchBackendManifest(): Promise<BackendManifest> {
|
|
| 846 |
}
|
| 847 |
|
| 848 |
export async function fetchEvalHierarchy(): Promise<EvalHierarchy> {
|
| 849 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 850 |
}
|
| 851 |
|
| 852 |
export async function fetchComparisonIndex(): Promise<ComparisonIndex> {
|
| 853 |
return fetchHFJson<ComparisonIndex>("comparison-index.json")
|
| 854 |
}
|
| 855 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 856 |
export async function fetchModelDetail(slug: string): Promise<HFModelDetail | null> {
|
| 857 |
return fetchHFJsonSafe<HFModelDetail>(`models/${slug}.json`)
|
| 858 |
}
|
|
@@ -1297,6 +1408,7 @@ function flattenHierarchyNode(
|
|
| 1297 |
detailed_evaluation_results_url: getCanonicalInstanceResultsUrl(
|
| 1298 |
result.detailed_evaluation_results
|
| 1299 |
),
|
|
|
|
| 1300 |
}
|
| 1301 |
|
| 1302 |
const existing = resultsByVariant.get(variantKey)
|
|
|
|
| 3 |
import { promises as fs } from "fs"
|
| 4 |
import path from "path"
|
| 5 |
|
| 6 |
+
import type {
|
| 7 |
+
BackendManifest,
|
| 8 |
+
BackendManifestStatus,
|
| 9 |
+
ComparisonIndex,
|
| 10 |
+
CorpusAggregates,
|
| 11 |
+
EvalHierarchy,
|
| 12 |
+
EvalcardsAnnotations,
|
| 13 |
+
RowAnnotations,
|
| 14 |
+
SignalSummaries,
|
| 15 |
+
} from "@/lib/backend-artifacts"
|
| 16 |
import type {
|
| 17 |
BenchmarkCard,
|
| 18 |
BenchmarkEvaluation,
|
|
|
|
| 445 |
// HF dataset types (shapes of JSON files in the HF repo)
|
| 446 |
// ---------------------------------------------------------------------------
|
| 447 |
|
| 448 |
+
export interface HFModelCardEntry extends SignalSummaries {
|
| 449 |
model_family_id: string
|
| 450 |
model_route_id: string
|
| 451 |
model_family_name: string
|
|
|
|
| 481 |
}>
|
| 482 |
}
|
| 483 |
|
| 484 |
+
export interface HFEvalListEntry extends SignalSummaries {
|
| 485 |
eval_summary_id: string
|
| 486 |
benchmark: string
|
| 487 |
canonical_display_name?: string
|
|
|
|
| 526 |
models_count: number
|
| 527 |
top_score: number
|
| 528 |
}>
|
| 529 |
+
evalcards?: { annotations?: EvalcardsAnnotations }
|
| 530 |
}
|
| 531 |
|
| 532 |
export interface HFEvalModelResult {
|
|
|
|
| 548 |
detailed_evaluation_results_meta?: unknown
|
| 549 |
instance_level_data?: unknown
|
| 550 |
passthrough_top_level_fields?: unknown
|
| 551 |
+
evalcards?: { annotations?: RowAnnotations }
|
| 552 |
}
|
| 553 |
|
| 554 |
export interface HFEvalMetric {
|
|
|
|
| 564 |
model_results: HFEvalModelResult[]
|
| 565 |
}
|
| 566 |
|
| 567 |
+
export interface HFEvalDetail extends SignalSummaries {
|
| 568 |
eval_summary_id: string
|
| 569 |
benchmark: string
|
| 570 |
canonical_display_name?: string
|
|
|
|
| 577 |
benchmark_card: BenchmarkCard | null
|
| 578 |
metrics: HFEvalMetric[]
|
| 579 |
subtasks: unknown[]
|
| 580 |
+
evalcards?: { annotations?: EvalcardsAnnotations }
|
| 581 |
}
|
| 582 |
|
| 583 |
+
export interface HFModelDetail extends SignalSummaries {
|
| 584 |
model_info: ModelInfo & {
|
| 585 |
family_id?: string
|
| 586 |
family_slug?: string
|
|
|
|
| 858 |
}
|
| 859 |
|
| 860 |
export async function fetchEvalHierarchy(): Promise<EvalHierarchy> {
|
| 861 |
+
const raw = await fetchHFJson<EvalHierarchy>("eval-hierarchy.json")
|
| 862 |
+
return adaptEvalHierarchy(raw)
|
| 863 |
+
}
|
| 864 |
+
|
| 865 |
+
/**
|
| 866 |
+
* The upstream pipeline migrated to a flat 2-level shape (family → leaf).
|
| 867 |
+
* The evals page still walks the older composites/standalone_benchmarks tree,
|
| 868 |
+
* so we synthesize the legacy view from `leaves` when the new shape is present.
|
| 869 |
+
* Also computes a fallback `stats` block when missing.
|
| 870 |
+
*/
|
| 871 |
+
function adaptEvalHierarchy(raw: EvalHierarchy): EvalHierarchy {
|
| 872 |
+
const families = (raw.families ?? []).map((family) => {
|
| 873 |
+
const hasLegacyTree =
|
| 874 |
+
(family.composites && family.composites.length > 0) ||
|
| 875 |
+
(family.standalone_benchmarks && family.standalone_benchmarks.length > 0) ||
|
| 876 |
+
(family.benchmarks && family.benchmarks.length > 0)
|
| 877 |
+
|
| 878 |
+
if (hasLegacyTree) {
|
| 879 |
+
return family
|
| 880 |
+
}
|
| 881 |
+
|
| 882 |
+
const leaves = family.leaves ?? []
|
| 883 |
+
if (leaves.length === 0) {
|
| 884 |
+
return family
|
| 885 |
+
}
|
| 886 |
+
|
| 887 |
+
const standalone = leaves.map((leaf) => ({
|
| 888 |
+
key: leaf.key,
|
| 889 |
+
display_name: leaf.display_name,
|
| 890 |
+
has_card: leaf.has_card ?? false,
|
| 891 |
+
tags: {
|
| 892 |
+
domains: leaf.tags?.domains ?? [],
|
| 893 |
+
languages: leaf.tags?.languages ?? [],
|
| 894 |
+
tasks: leaf.tags?.tasks ?? [],
|
| 895 |
+
},
|
| 896 |
+
slices: [],
|
| 897 |
+
metrics: [],
|
| 898 |
+
reproducibility_summary: leaf.reproducibility_summary,
|
| 899 |
+
provenance_summary: leaf.provenance_summary,
|
| 900 |
+
comparability_summary: leaf.comparability_summary,
|
| 901 |
+
summary_eval_ids: leaf.eval_summary_ids,
|
| 902 |
+
}))
|
| 903 |
+
|
| 904 |
+
return {
|
| 905 |
+
...family,
|
| 906 |
+
tags: {
|
| 907 |
+
domains: family.tags?.domains ?? [],
|
| 908 |
+
languages: family.tags?.languages ?? [],
|
| 909 |
+
tasks: family.tags?.tasks ?? [],
|
| 910 |
+
},
|
| 911 |
+
standalone_benchmarks: standalone,
|
| 912 |
+
}
|
| 913 |
+
})
|
| 914 |
+
|
| 915 |
+
if (raw.stats) {
|
| 916 |
+
return { ...raw, families }
|
| 917 |
+
}
|
| 918 |
+
|
| 919 |
+
let composite_count = 0
|
| 920 |
+
let standalone_benchmark_count = 0
|
| 921 |
+
let single_benchmark_count = 0
|
| 922 |
+
let slice_count = 0
|
| 923 |
+
let metric_count = 0
|
| 924 |
+
|
| 925 |
+
for (const family of families) {
|
| 926 |
+
composite_count += family.composites?.length ?? 0
|
| 927 |
+
const standalone = family.standalone_benchmarks ?? []
|
| 928 |
+
standalone_benchmark_count += standalone.length
|
| 929 |
+
if ((family.composites?.length ?? 0) === 0 && standalone.length === 1) {
|
| 930 |
+
single_benchmark_count += 1
|
| 931 |
+
}
|
| 932 |
+
for (const composite of family.composites ?? []) {
|
| 933 |
+
for (const benchmark of composite.benchmarks ?? []) {
|
| 934 |
+
slice_count += benchmark.slices?.length ?? 0
|
| 935 |
+
metric_count += benchmark.metrics?.length ?? 0
|
| 936 |
+
}
|
| 937 |
+
}
|
| 938 |
+
for (const benchmark of standalone) {
|
| 939 |
+
slice_count += benchmark.slices?.length ?? 0
|
| 940 |
+
metric_count += benchmark.metrics?.length ?? 0
|
| 941 |
+
}
|
| 942 |
+
}
|
| 943 |
+
|
| 944 |
+
return {
|
| 945 |
+
...raw,
|
| 946 |
+
families,
|
| 947 |
+
stats: {
|
| 948 |
+
family_count: families.length,
|
| 949 |
+
composite_count,
|
| 950 |
+
standalone_benchmark_count,
|
| 951 |
+
single_benchmark_count,
|
| 952 |
+
slice_count,
|
| 953 |
+
metric_count,
|
| 954 |
+
metric_rows_scanned: 0,
|
| 955 |
+
},
|
| 956 |
+
}
|
| 957 |
}
|
| 958 |
|
| 959 |
export async function fetchComparisonIndex(): Promise<ComparisonIndex> {
|
| 960 |
return fetchHFJson<ComparisonIndex>("comparison-index.json")
|
| 961 |
}
|
| 962 |
|
| 963 |
+
export async function fetchCorpusAggregates(): Promise<CorpusAggregates | null> {
|
| 964 |
+
return fetchHFJsonSafe<CorpusAggregates>("corpus-aggregates.json")
|
| 965 |
+
}
|
| 966 |
+
|
| 967 |
export async function fetchModelDetail(slug: string): Promise<HFModelDetail | null> {
|
| 968 |
return fetchHFJsonSafe<HFModelDetail>(`models/${slug}.json`)
|
| 969 |
}
|
|
|
|
| 1408 |
detailed_evaluation_results_url: getCanonicalInstanceResultsUrl(
|
| 1409 |
result.detailed_evaluation_results
|
| 1410 |
),
|
| 1411 |
+
evalcards: result.evalcards,
|
| 1412 |
}
|
| 1413 |
|
| 1414 |
const existing = resultsByVariant.get(variantKey)
|
lib/model-data.ts
CHANGED
|
@@ -27,6 +27,7 @@ import { getCanonicalModelIdentity, getModelFamilyRouteId } from "@/lib/model-fa
|
|
| 27 |
import { getBenchmarkCard, normalizeBenchmarkKey } from "@/lib/benchmark-metadata"
|
| 28 |
import {
|
| 29 |
type HFEvalDetail,
|
|
|
|
| 30 |
type HFEvalModelResult,
|
| 31 |
type HFModelCardEntry,
|
| 32 |
type HFModelDetail,
|
|
@@ -337,6 +338,24 @@ function parseParamsBillions(value: unknown): number | null {
|
|
| 337 |
return Number.isFinite(numeric) && numeric > 0 ? numeric : null
|
| 338 |
}
|
| 339 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 340 |
// ---------------------------------------------------------------------------
|
| 341 |
// HF model-cards.json → EvaluationCardData
|
| 342 |
// ---------------------------------------------------------------------------
|
|
@@ -391,6 +410,9 @@ function hfModelCardToEvaluationCardData(entry: HFModelCardEntry): EvaluationCar
|
|
| 391 |
? `${entry.benchmark_names.length} benchmark${entry.benchmark_names.length === 1 ? "" : "s"}`
|
| 392 |
: undefined,
|
| 393 |
params_billions: parseParamsBillions(entry.params_billions),
|
|
|
|
|
|
|
|
|
|
| 394 |
benchmark_names: (entry.benchmark_names ?? []).map((name) => getBenchmarkDisplayName(name)),
|
| 395 |
score_summary: {
|
| 396 |
count: entry.score_summary.count,
|
|
@@ -408,31 +430,7 @@ function hfModelCardToEvaluationCardData(entry: HFModelCardEntry): EvaluationCar
|
|
| 408 |
// HF eval-list.json → BenchmarkEvalListItem
|
| 409 |
// ---------------------------------------------------------------------------
|
| 410 |
|
| 411 |
-
function hfEvalEntryToListItem(entry: {
|
| 412 |
-
eval_summary_id: string
|
| 413 |
-
benchmark: string
|
| 414 |
-
benchmark_family_key: string
|
| 415 |
-
benchmark_family_name: string
|
| 416 |
-
benchmark_parent_name?: string
|
| 417 |
-
benchmark_leaf_key: string
|
| 418 |
-
benchmark_leaf_name: string
|
| 419 |
-
evaluation_name?: string
|
| 420 |
-
display_name: string
|
| 421 |
-
is_summary_score?: boolean
|
| 422 |
-
summary_eval_ids?: string[]
|
| 423 |
-
category: string
|
| 424 |
-
tags: { domains: string[]; languages: string[]; tasks: string[] }
|
| 425 |
-
models_count: number
|
| 426 |
-
metrics_count: number
|
| 427 |
-
subtasks_count?: number
|
| 428 |
-
metric_names: string[]
|
| 429 |
-
primary_metric_name: string
|
| 430 |
-
benchmark_card: BenchmarkCard | null
|
| 431 |
-
source_data?: SourceData
|
| 432 |
-
top_score: number
|
| 433 |
-
instance_data: { available: boolean; url_count: number; sample_urls: string[]; models_with_loaded_instances: number }
|
| 434 |
-
metrics: Array<{ metric_summary_id: string; metric_name: string; lower_is_better: boolean; models_count: number; top_score: number }>
|
| 435 |
-
}): BenchmarkEvalListItem {
|
| 436 |
// Use the pipeline's category directly, mapped to our CategoryType
|
| 437 |
const category = mapHFCategories([entry.category])[0] ?? "General" as CategoryType
|
| 438 |
|
|
@@ -486,6 +484,10 @@ function hfEvalEntryToListItem(entry: {
|
|
| 486 |
subtasks_count: entry.subtasks_count ?? 0,
|
| 487 |
is_summary_score: entry.is_summary_score ?? false,
|
| 488 |
summary_eval_ids: entry.summary_eval_ids ?? [],
|
|
|
|
|
|
|
|
|
|
|
|
|
| 489 |
}
|
| 490 |
}
|
| 491 |
|
|
@@ -652,6 +654,7 @@ function buildBenchmarkLeaderboardMatrix(detail: HFEvalDetail) {
|
|
| 652 |
source_metadata: sourceMetadata,
|
| 653 |
source_data: sourceData,
|
| 654 |
values: { [columnKey]: modelResult.score ?? null },
|
|
|
|
| 655 |
metrics_present: 0,
|
| 656 |
_timestampValue: nextTimestamp,
|
| 657 |
})
|
|
@@ -659,6 +662,10 @@ function buildBenchmarkLeaderboardMatrix(detail: HFEvalDetail) {
|
|
| 659 |
}
|
| 660 |
|
| 661 |
existing.values[columnKey] = modelResult.score ?? null
|
|
|
|
|
|
|
|
|
|
|
|
|
| 662 |
if (!existing.model_route_id && modelResult.model_route_id) {
|
| 663 |
existing.model_route_id = modelResult.model_route_id
|
| 664 |
}
|
|
@@ -725,6 +732,7 @@ function toModelResultsForMetric(
|
|
| 725 |
detailed_evaluation_results_url: getCanonicalInstanceResultsUrl(
|
| 726 |
mr.detailed_evaluation_results
|
| 727 |
),
|
|
|
|
| 728 |
}
|
| 729 |
|
| 730 |
return {
|
|
@@ -797,6 +805,11 @@ function hfEvalDetailToSummary(detail: HFEvalDetail): BenchmarkEvalSummary {
|
|
| 797 |
subtasks,
|
| 798 |
leaderboard_metrics: leaderboardMatrix.leaderboard_metrics,
|
| 799 |
leaderboard_rows: leaderboardMatrix.leaderboard_rows,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 800 |
}
|
| 801 |
}
|
| 802 |
|
|
@@ -847,6 +860,11 @@ function hfEvalDetailToSummary(detail: HFEvalDetail): BenchmarkEvalSummary {
|
|
| 847 |
subtasks,
|
| 848 |
leaderboard_metrics: leaderboardMatrix.leaderboard_metrics,
|
| 849 |
leaderboard_rows: leaderboardMatrix.leaderboard_rows,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 850 |
}
|
| 851 |
}
|
| 852 |
|
|
@@ -1140,6 +1158,7 @@ function buildSingleMetricSuiteMatrixSummary(
|
|
| 1140 |
source_metadata: sourceMetadata,
|
| 1141 |
source_data: sourceData,
|
| 1142 |
values: { [columnKey]: modelResult.score ?? null },
|
|
|
|
| 1143 |
metrics_present: 0,
|
| 1144 |
_timestampValue: nextTimestamp,
|
| 1145 |
})
|
|
@@ -1147,6 +1166,10 @@ function buildSingleMetricSuiteMatrixSummary(
|
|
| 1147 |
}
|
| 1148 |
|
| 1149 |
existing.values[columnKey] = modelResult.score ?? null
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1150 |
if (!existing.model_route_id && modelResult.model_route_id) {
|
| 1151 |
existing.model_route_id = modelResult.model_route_id
|
| 1152 |
}
|
|
@@ -1469,7 +1492,7 @@ export async function getModelSummaryById(modelId: string) {
|
|
| 1469 |
if (detail) {
|
| 1470 |
const evaluations = flattenModelEvaluations(detail)
|
| 1471 |
if (evaluations.length > 0) {
|
| 1472 |
-
return createModelFamilySummary(evaluations)
|
| 1473 |
}
|
| 1474 |
}
|
| 1475 |
}
|
|
@@ -1489,7 +1512,7 @@ export async function getModelSummaryById(modelId: string) {
|
|
| 1489 |
if (detail) {
|
| 1490 |
const evaluations = flattenModelEvaluations(detail)
|
| 1491 |
if (evaluations.length > 0) {
|
| 1492 |
-
return createModelFamilySummary(evaluations)
|
| 1493 |
}
|
| 1494 |
}
|
| 1495 |
|
|
@@ -1501,7 +1524,7 @@ export async function getModelSummaryById(modelId: string) {
|
|
| 1501 |
if (variantDetail) {
|
| 1502 |
const evaluations = flattenModelEvaluations(variantDetail)
|
| 1503 |
if (evaluations.length > 0) {
|
| 1504 |
-
return createModelFamilySummary(evaluations)
|
| 1505 |
}
|
| 1506 |
}
|
| 1507 |
}
|
|
|
|
| 27 |
import { getBenchmarkCard, normalizeBenchmarkKey } from "@/lib/benchmark-metadata"
|
| 28 |
import {
|
| 29 |
type HFEvalDetail,
|
| 30 |
+
type HFEvalListEntry,
|
| 31 |
type HFEvalModelResult,
|
| 32 |
type HFModelCardEntry,
|
| 33 |
type HFModelDetail,
|
|
|
|
| 338 |
return Number.isFinite(numeric) && numeric > 0 ? numeric : null
|
| 339 |
}
|
| 340 |
|
| 341 |
+
function attachModelSignalSummaries<T extends ReturnType<typeof createModelFamilySummary>>(
|
| 342 |
+
summary: T,
|
| 343 |
+
detail: HFModelDetail
|
| 344 |
+
): T {
|
| 345 |
+
return {
|
| 346 |
+
...summary,
|
| 347 |
+
reproducibility_summary: detail.reproducibility_summary,
|
| 348 |
+
provenance_summary: detail.provenance_summary,
|
| 349 |
+
comparability_summary: detail.comparability_summary,
|
| 350 |
+
variants: summary.variants.map((variant) => ({
|
| 351 |
+
...variant,
|
| 352 |
+
reproducibility_summary: detail.reproducibility_summary,
|
| 353 |
+
provenance_summary: detail.provenance_summary,
|
| 354 |
+
comparability_summary: detail.comparability_summary,
|
| 355 |
+
})),
|
| 356 |
+
}
|
| 357 |
+
}
|
| 358 |
+
|
| 359 |
// ---------------------------------------------------------------------------
|
| 360 |
// HF model-cards.json → EvaluationCardData
|
| 361 |
// ---------------------------------------------------------------------------
|
|
|
|
| 410 |
? `${entry.benchmark_names.length} benchmark${entry.benchmark_names.length === 1 ? "" : "s"}`
|
| 411 |
: undefined,
|
| 412 |
params_billions: parseParamsBillions(entry.params_billions),
|
| 413 |
+
reproducibility_summary: entry.reproducibility_summary,
|
| 414 |
+
provenance_summary: entry.provenance_summary,
|
| 415 |
+
comparability_summary: entry.comparability_summary,
|
| 416 |
benchmark_names: (entry.benchmark_names ?? []).map((name) => getBenchmarkDisplayName(name)),
|
| 417 |
score_summary: {
|
| 418 |
count: entry.score_summary.count,
|
|
|
|
| 430 |
// HF eval-list.json → BenchmarkEvalListItem
|
| 431 |
// ---------------------------------------------------------------------------
|
| 432 |
|
| 433 |
+
function hfEvalEntryToListItem(entry: HFEvalListEntry): BenchmarkEvalListItem {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 434 |
// Use the pipeline's category directly, mapped to our CategoryType
|
| 435 |
const category = mapHFCategories([entry.category])[0] ?? "General" as CategoryType
|
| 436 |
|
|
|
|
| 484 |
subtasks_count: entry.subtasks_count ?? 0,
|
| 485 |
is_summary_score: entry.is_summary_score ?? false,
|
| 486 |
summary_eval_ids: entry.summary_eval_ids ?? [],
|
| 487 |
+
evalcards: entry.evalcards,
|
| 488 |
+
reproducibility_summary: entry.reproducibility_summary,
|
| 489 |
+
provenance_summary: entry.provenance_summary,
|
| 490 |
+
comparability_summary: entry.comparability_summary,
|
| 491 |
}
|
| 492 |
}
|
| 493 |
|
|
|
|
| 654 |
source_metadata: sourceMetadata,
|
| 655 |
source_data: sourceData,
|
| 656 |
values: { [columnKey]: modelResult.score ?? null },
|
| 657 |
+
annotations_by_metric: { [columnKey]: modelResult.evalcards?.annotations ?? null },
|
| 658 |
metrics_present: 0,
|
| 659 |
_timestampValue: nextTimestamp,
|
| 660 |
})
|
|
|
|
| 662 |
}
|
| 663 |
|
| 664 |
existing.values[columnKey] = modelResult.score ?? null
|
| 665 |
+
existing.annotations_by_metric = {
|
| 666 |
+
...(existing.annotations_by_metric ?? {}),
|
| 667 |
+
[columnKey]: modelResult.evalcards?.annotations ?? null,
|
| 668 |
+
}
|
| 669 |
if (!existing.model_route_id && modelResult.model_route_id) {
|
| 670 |
existing.model_route_id = modelResult.model_route_id
|
| 671 |
}
|
|
|
|
| 732 |
detailed_evaluation_results_url: getCanonicalInstanceResultsUrl(
|
| 733 |
mr.detailed_evaluation_results
|
| 734 |
),
|
| 735 |
+
evalcards: mr.evalcards,
|
| 736 |
}
|
| 737 |
|
| 738 |
return {
|
|
|
|
| 805 |
subtasks,
|
| 806 |
leaderboard_metrics: leaderboardMatrix.leaderboard_metrics,
|
| 807 |
leaderboard_rows: leaderboardMatrix.leaderboard_rows,
|
| 808 |
+
source_data: detail.source_data,
|
| 809 |
+
evalcards: detail.evalcards,
|
| 810 |
+
reproducibility_summary: detail.reproducibility_summary,
|
| 811 |
+
provenance_summary: detail.provenance_summary,
|
| 812 |
+
comparability_summary: detail.comparability_summary,
|
| 813 |
}
|
| 814 |
}
|
| 815 |
|
|
|
|
| 860 |
subtasks,
|
| 861 |
leaderboard_metrics: leaderboardMatrix.leaderboard_metrics,
|
| 862 |
leaderboard_rows: leaderboardMatrix.leaderboard_rows,
|
| 863 |
+
source_data: detail.source_data,
|
| 864 |
+
evalcards: detail.evalcards,
|
| 865 |
+
reproducibility_summary: detail.reproducibility_summary,
|
| 866 |
+
provenance_summary: detail.provenance_summary,
|
| 867 |
+
comparability_summary: detail.comparability_summary,
|
| 868 |
}
|
| 869 |
}
|
| 870 |
|
|
|
|
| 1158 |
source_metadata: sourceMetadata,
|
| 1159 |
source_data: sourceData,
|
| 1160 |
values: { [columnKey]: modelResult.score ?? null },
|
| 1161 |
+
annotations_by_metric: { [columnKey]: modelResult.evalcards?.annotations ?? null },
|
| 1162 |
metrics_present: 0,
|
| 1163 |
_timestampValue: nextTimestamp,
|
| 1164 |
})
|
|
|
|
| 1166 |
}
|
| 1167 |
|
| 1168 |
existing.values[columnKey] = modelResult.score ?? null
|
| 1169 |
+
existing.annotations_by_metric = {
|
| 1170 |
+
...(existing.annotations_by_metric ?? {}),
|
| 1171 |
+
[columnKey]: modelResult.evalcards?.annotations ?? null,
|
| 1172 |
+
}
|
| 1173 |
if (!existing.model_route_id && modelResult.model_route_id) {
|
| 1174 |
existing.model_route_id = modelResult.model_route_id
|
| 1175 |
}
|
|
|
|
| 1492 |
if (detail) {
|
| 1493 |
const evaluations = flattenModelEvaluations(detail)
|
| 1494 |
if (evaluations.length > 0) {
|
| 1495 |
+
return attachModelSignalSummaries(createModelFamilySummary(evaluations), detail)
|
| 1496 |
}
|
| 1497 |
}
|
| 1498 |
}
|
|
|
|
| 1512 |
if (detail) {
|
| 1513 |
const evaluations = flattenModelEvaluations(detail)
|
| 1514 |
if (evaluations.length > 0) {
|
| 1515 |
+
return attachModelSignalSummaries(createModelFamilySummary(evaluations), detail)
|
| 1516 |
}
|
| 1517 |
}
|
| 1518 |
|
|
|
|
| 1524 |
if (variantDetail) {
|
| 1525 |
const evaluations = flattenModelEvaluations(variantDetail)
|
| 1526 |
if (evaluations.length > 0) {
|
| 1527 |
+
return attachModelSignalSummaries(createModelFamilySummary(evaluations), variantDetail)
|
| 1528 |
}
|
| 1529 |
}
|
| 1530 |
}
|
public/peer-ranks.json
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
scripts/cache-hf-data.mjs
CHANGED
|
@@ -32,11 +32,13 @@ const CACHE_ROOT_FILES = [
|
|
| 32 |
"benchmark-metadata.json",
|
| 33 |
"eval-hierarchy.json",
|
| 34 |
"comparison-index.json",
|
|
|
|
| 35 |
]
|
| 36 |
|
| 37 |
const OPTIONAL_CACHE_ROOT_FILES = new Set([
|
| 38 |
"model-cards-lite.json",
|
| 39 |
"eval-list-lite.json",
|
|
|
|
| 40 |
])
|
| 41 |
|
| 42 |
const CACHE_DIRECTORIES = ["developers", "evals", "models"]
|
|
|
|
| 32 |
"benchmark-metadata.json",
|
| 33 |
"eval-hierarchy.json",
|
| 34 |
"comparison-index.json",
|
| 35 |
+
"corpus-aggregates.json",
|
| 36 |
]
|
| 37 |
|
| 38 |
const OPTIONAL_CACHE_ROOT_FILES = new Set([
|
| 39 |
"model-cards-lite.json",
|
| 40 |
"eval-list-lite.json",
|
| 41 |
+
"corpus-aggregates.json",
|
| 42 |
])
|
| 43 |
|
| 44 |
const CACHE_DIRECTORIES = ["developers", "evals", "models"]
|