Add researcher join analysis to eval detail

#2
by yananlong - opened
app/api/research/eval-joins/route.ts ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { NextResponse } from "next/server"
2
+
3
+ import { getResearchJoinDataset } from "@/lib/research-joins"
4
+
5
/**
 * GET handler for the research join dataset of a single evaluation.
 *
 * Reads the evaluation id from the `id` query parameter and responds with:
 * - 400 when the parameter is missing or empty,
 * - 404 when `getResearchJoinDataset` yields a falsy result,
 * - the dataset as JSON, with a public cache header, otherwise.
 */
export async function GET(request: Request) {
  const evaluationId = new URL(request.url).searchParams.get("id")

  // `searchParams.get` returns null for an absent parameter; an empty string is rejected as well.
  if (evaluationId === null || evaluationId === "") {
    return NextResponse.json({ error: "Missing evaluation id" }, { status: 400 })
  }

  const joinDataset = await getResearchJoinDataset(evaluationId)

  if (joinDataset) {
    const cacheHeaders = {
      "Cache-Control": "public, max-age=300, stale-while-revalidate=1800",
    }
    return NextResponse.json(joinDataset, { headers: cacheHeaders })
  }

  return NextResponse.json({ error: "Evaluation join dataset not found" }, { status: 404 })
}
components/eval-detail.tsx CHANGED
@@ -1,6 +1,7 @@
1
  "use client"
2
 
3
  import { useAudienceMode } from "@/components/audience-mode-provider"
 
4
  import { Fragment, useEffect, useMemo, useState } from "react"
5
  import Link from "next/link"
6
  import { Badge } from "@/components/ui/badge"
@@ -914,46 +915,51 @@ export function EvalDetail({ summary }: EvalDetailProps) {
914
  </Card>
915
 
916
  {hasMultiMetricLeaderboard ? (
917
- <MultiMetricLeaderboard summary={summary} isResearchView={isResearchView} />
 
 
 
918
  ) : (
919
- <Card className="overflow-hidden">
920
- <CardHeader className="border-b bg-muted/10 space-y-3">
921
- <ApplesToApplesBanner
922
- summary={summary.comparability_summary}
923
- detailsAnchorId="comparability-panel"
924
- />
925
- <div className="flex flex-col gap-3 lg:flex-row lg:items-end lg:justify-between">
926
- <div className="space-y-2">
927
- <div className="flex items-center gap-2">
928
- <Medal className="h-5 w-5 text-primary" />
929
- <CardTitle className="text-xl">{leaderboardTitle}</CardTitle>
 
 
 
 
930
  </div>
931
- <CardDescription>{leaderboardDescription}</CardDescription>
932
- </div>
933
 
934
- <div className="flex flex-wrap items-center gap-2 text-xs text-muted-foreground">
935
- <Badge variant="secondary">
936
- {leaderboardRows.length === summary.models_count
937
- ? `${summary.models_count} models`
938
- : `${leaderboardRows.length} of ${summary.models_count} models`}
939
- </Badge>
940
- <Badge variant="outline">{scoreDirectionLabel}</Badge>
941
- {hasParameterData && (numericMinParams != null || numericMaxParams != null) && (
942
- <Badge variant="outline">
943
- Params {formatParamBoundLabel(minParamStep, "min")} to {formatParamBoundLabel(maxParamStep, "max")}
944
- </Badge>
945
- )}
946
- {isResearchView && (
947
- <Badge variant="outline">
948
- Scale {summary.metric_config.min_score ?? 0} - {summary.metric_config.max_score ?? 1}
949
  </Badge>
950
- )}
 
 
 
 
 
 
 
 
 
 
 
951
  </div>
952
- </div>
953
- </CardHeader>
954
 
955
- <CardContent className="p-0">
956
- {hasParameterData && (
957
  <div className="border-b bg-background px-5 py-4 sm:px-6">
958
  <div className="flex flex-col gap-3 lg:flex-row lg:items-center lg:justify-between">
959
  <div className="space-y-1">
@@ -1456,8 +1462,9 @@ export function EvalDetail({ summary }: EvalDetailProps) {
1456
  </Button>
1457
  </div>
1458
  )}
1459
- </CardContent>
1460
- </Card>
 
1461
  )}
1462
  </div>
1463
  )
 
1
  "use client"
2
 
3
  import { useAudienceMode } from "@/components/audience-mode-provider"
4
+ import { EvalJoinAnalysis } from "@/components/eval-join-analysis"
5
  import { Fragment, useEffect, useMemo, useState } from "react"
6
  import Link from "next/link"
7
  import { Badge } from "@/components/ui/badge"
 
915
  </Card>
916
 
917
  {hasMultiMetricLeaderboard ? (
918
+ <>
919
+ <EvalJoinAnalysis evalId={summary.evaluation_id} />
920
+ <MultiMetricLeaderboard summary={summary} isResearchView={isResearchView} />
921
+ </>
922
  ) : (
923
+ <>
924
+ <EvalJoinAnalysis evalId={summary.evaluation_id} />
925
+ <Card className="overflow-hidden">
926
+ <CardHeader className="border-b bg-muted/10 space-y-3">
927
+ <ApplesToApplesBanner
928
+ summary={summary.comparability_summary}
929
+ detailsAnchorId="comparability-panel"
930
+ />
931
+ <div className="flex flex-col gap-3 lg:flex-row lg:items-end lg:justify-between">
932
+ <div className="space-y-2">
933
+ <div className="flex items-center gap-2">
934
+ <Medal className="h-5 w-5 text-primary" />
935
+ <CardTitle className="text-xl">{leaderboardTitle}</CardTitle>
936
+ </div>
937
+ <CardDescription>{leaderboardDescription}</CardDescription>
938
  </div>
 
 
939
 
940
+ <div className="flex flex-wrap items-center gap-2 text-xs text-muted-foreground">
941
+ <Badge variant="secondary">
942
+ {leaderboardRows.length === summary.models_count
943
+ ? `${summary.models_count} models`
944
+ : `${leaderboardRows.length} of ${summary.models_count} models`}
 
 
 
 
 
 
 
 
 
 
945
  </Badge>
946
+ <Badge variant="outline">{scoreDirectionLabel}</Badge>
947
+ {hasParameterData && (numericMinParams != null || numericMaxParams != null) && (
948
+ <Badge variant="outline">
949
+ Params {formatParamBoundLabel(minParamStep, "min")} to {formatParamBoundLabel(maxParamStep, "max")}
950
+ </Badge>
951
+ )}
952
+ {isResearchView && (
953
+ <Badge variant="outline">
954
+ Scale {summary.metric_config.min_score ?? 0} - {summary.metric_config.max_score ?? 1}
955
+ </Badge>
956
+ )}
957
+ </div>
958
  </div>
959
+ </CardHeader>
 
960
 
961
+ <CardContent className="p-0">
962
+ {hasParameterData && (
963
  <div className="border-b bg-background px-5 py-4 sm:px-6">
964
  <div className="flex flex-col gap-3 lg:flex-row lg:items-center lg:justify-between">
965
  <div className="space-y-1">
 
1462
  </Button>
1463
  </div>
1464
  )}
1465
+ </CardContent>
1466
+ </Card>
1467
+ </>
1468
  )}
1469
  </div>
1470
  )
components/eval-join-analysis.tsx ADDED
@@ -0,0 +1,697 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ "use client"
2
+
3
+ import { useEffect, useMemo, useState } from "react"
4
+ import {
5
+ ClipboardList,
6
+ Download,
7
+ Filter,
8
+ GitBranch,
9
+ KeyRound,
10
+ Loader2,
11
+ Network,
12
+ TableProperties,
13
+ } from "lucide-react"
14
+
15
+ import { Badge } from "@/components/ui/badge"
16
+ import { Button } from "@/components/ui/button"
17
+ import { Card, CardContent, CardDescription, CardHeader, CardTitle } from "@/components/ui/card"
18
+ import { Input } from "@/components/ui/input"
19
+ import {
20
+ Select,
21
+ SelectContent,
22
+ SelectItem,
23
+ SelectTrigger,
24
+ SelectValue,
25
+ } from "@/components/ui/select"
26
+ import { Table, TableBody, TableCell, TableHead, TableHeader, TableRow } from "@/components/ui/table"
27
+ import { fetchEvalResearchJoins } from "@/lib/dashboard-data-client"
28
+ import type {
29
+ ResearchJoinColumn,
30
+ ResearchJoinColumnGroup,
31
+ ResearchJoinDataset,
32
+ ResearchJoinGrain,
33
+ ResearchJoinRow,
34
+ } from "@/lib/research-join-types"
35
+ import { cn } from "@/lib/utils"
36
+
37
// Column groups that the group-level toggle never deselects: toggleJoinGroup only removes a
// group's columns when that group is NOT listed here.
+ const CORE_GROUPS: ResearchJoinColumnGroup[] = ["identity", "metric", "model", "score"]
38
// Optional field groups rendered as toggle buttons in the "Joined Field Groups" panel.
// `group` is matched against each dataset column's `group`; `label` and `description`
// are the text shown on the button.
+ const JOIN_GROUPS: Array<{
39
+ group: ResearchJoinColumnGroup
40
+ label: string
41
+ description: string
42
+ }> = [
43
+ {
44
+ group: "hierarchy",
45
+ label: "Hierarchy",
46
+ description: "Family, composite, leaf, and component keys.",
47
+ },
48
+ {
49
+ group: "source",
50
+ label: "Source",
51
+ description: "Source buckets, organizations, and evaluator relationship.",
52
+ },
53
+ {
54
+ group: "instance",
55
+ label: "Instances",
56
+ description: "Sample-link status and detailed result URLs.",
57
+ },
58
+ {
59
+ group: "quality",
60
+ label: "Quality",
61
+ description: "Sample sizes, uncertainty, timestamps, and config presence.",
62
+ },
63
+ ]
64
+
65
/**
 * Turns an arbitrary cell value into the string shown in the table.
 *
 * - `null`, `undefined`, the empty string and non-finite numbers become "N/A".
 * - Booleans become "Yes" / "No".
 * - Numbers with magnitude >= 100 use the locale's default number formatting;
 *   smaller numbers are rounded to at most four decimals with trailing zeros dropped.
 * - Everything else is passed through `String(...)`.
 */
function formatCellValue(value: unknown) {
  switch (typeof value) {
    case "boolean":
      return value ? "Yes" : "No"

    case "number": {
      if (!Number.isFinite(value)) {
        return "N/A"
      }
      const magnitude = Math.abs(value)
      if (magnitude >= 100) {
        return value.toLocaleString()
      }
      // Round-trip through Number so "0.5000" collapses to "0.5".
      const rounded = Number(value.toFixed(4))
      return rounded.toString()
    }

    default:
      // Covers null (typeof "object"), undefined, strings and any other value.
      return value == null || value === "" ? "N/A" : String(value)
  }
}
83
+
84
/**
 * Looks up a single field of a join row by a column key given as a plain string.
 * The key is asserted to be a property name of the row type; no runtime check is made.
 */
function getRowValue(row: ResearchJoinRow, key: string) {
  const field = key as keyof ResearchJoinRow
  return row[field]
}
87
+
88
/**
 * Collects the keys of every column flagged `defaultVisible` into a Set.
 */
function getDefaultColumnKeys(columns: ResearchJoinColumn[]) {
  const defaultKeys = new Set<ResearchJoinColumn["key"]>()
  for (const column of columns) {
    if (column.defaultVisible) {
      defaultKeys.add(column.key)
    }
  }
  return defaultKeys
}
91
+
92
/**
 * Returns a sorted copy of `rows`, ordered by the field named `sortKey`.
 *
 * When both compared values are numbers they are ordered numerically. In every other
 * case the values are compared by their display text (see `formatCellValue`) using a
 * numeric-aware, case/accent-insensitive locale comparison. The input array is not mutated.
 */
function sortRows(rows: ResearchJoinRow[], sortKey: string, direction: "asc" | "desc") {
  const ascending = direction === "asc"
  const collation: Intl.CollatorOptions = { numeric: true, sensitivity: "base" }

  const compareRows = (left: ResearchJoinRow, right: ResearchJoinRow) => {
    const a = getRowValue(left, sortKey)
    const b = getRowValue(right, sortKey)

    if (typeof a === "number" && typeof b === "number") {
      return ascending ? a - b : b - a
    }

    const textOrder = formatCellValue(a).localeCompare(formatCellValue(b), undefined, collation)
    return ascending ? textOrder : -textOrder
  }

  // Copy first so the caller's array keeps its order.
  const sorted = rows.slice()
  sorted.sort(compareRows)
  return sorted
}
108
+
109
/**
 * Serializes one value as a CSV field.
 *
 * Finite numbers are written with their plain `String(...)` form so the export keeps
 * full precision and stays locale-independent (the display formatter rounds to four
 * decimals and inserts locale grouping separators, which corrupts numeric columns).
 * All other values use the same text as the on-screen table via `formatCellValue`.
 *
 * The field is wrapped in double quotes, with embedded quotes doubled, whenever it
 * contains a quote, a comma, or a line break. Carriage returns count as line breaks:
 * leaving a bare `\r` unquoted splits the record in many CSV readers.
 *
 * @param value Raw cell value (or a column key for the header row).
 * @returns The CSV-safe field text.
 */
function escapeCsvCell(value: unknown) {
  const text =
    typeof value === "number" && Number.isFinite(value) ? String(value) : formatCellValue(value)

  if (/[",\r\n]/.test(text)) {
    return `"${text.replace(/"/g, '""')}"`
  }
  return text
}
116
+
117
/**
 * Triggers a browser download of `text` as a file named `filename`.
 *
 * A Blob of the given MIME type is exposed through an object URL, a temporary
 * anchor element pointing at it is attached to the document and clicked, and
 * both the anchor and the object URL are released afterwards.
 */
function downloadText(filename: string, text: string, mimeType: string) {
  const objectUrl = URL.createObjectURL(new Blob([text], { type: mimeType }))

  const anchor = Object.assign(document.createElement("a"), {
    href: objectUrl,
    download: filename,
  })

  const { body } = document
  body.appendChild(anchor)
  anchor.click()
  body.removeChild(anchor)

  URL.revokeObjectURL(objectUrl)
}
128
+
129
/**
 * Builds the plain-text "join recipe" that the Copy recipe button puts on the clipboard.
 *
 * The recipe names the base grain and its join keys. When the requested grain is not
 * among the dataset's available grains, the raw grain value is used as the label and a
 * fixed three-key fallback is used for the join keys.
 */
function getJoinRecipe(dataset: ResearchJoinDataset, grain: ResearchJoinGrain) {
  const match = dataset.available_grains.find((candidate) => candidate.grain === grain)
  const grainLabel = match?.label ?? grain
  const joinKeys = match?.join_keys ?? ["eval_summary_id", "metric_summary_id", "model_route_id"]

  const recipeLines = [
    `Base grain: ${grainLabel}`,
    `Join keys: ${joinKeys.join(" + ")}`,
    "Keep source_name explicit; do not join across evaluation_name alone.",
    "Add hierarchy, source, instance, or quality fields, then export the filtered rows.",
  ]
  return recipeLines.join("\n")
}
140
+
141
+ export function EvalJoinAnalysis({ evalId }: { evalId: string }) {
142
+ const [dataset, setDataset] = useState<ResearchJoinDataset | null>(null)
143
+ const [loading, setLoading] = useState(true)
144
+ const [error, setError] = useState<string | null>(null)
145
+ const [selectedGrain, setSelectedGrain] = useState<ResearchJoinGrain>("model_metric_source")
146
+ const [metricFilter, setMetricFilter] = useState("all")
147
+ const [sourceFilter, setSourceFilter] = useState("all")
148
+ const [relationshipFilter, setRelationshipFilter] = useState("all")
149
+ const [instanceFilter, setInstanceFilter] = useState("all")
150
+ const [query, setQuery] = useState("")
151
+ const [sortKey, setSortKey] = useState("rank")
152
+ const [sortDirection, setSortDirection] = useState<"asc" | "desc">("asc")
153
+ const [selectedColumns, setSelectedColumns] = useState<Set<string>>(new Set())
154
+ const [copiedRecipe, setCopiedRecipe] = useState(false)
155
+
156
+ useEffect(() => {
157
+ let isCancelled = false
158
+
159
+ const load = async () => {
160
+ try {
161
+ setLoading(true)
162
+ setError(null)
163
+ const nextDataset = await fetchEvalResearchJoins(evalId)
164
+
165
+ if (isCancelled) {
166
+ return
167
+ }
168
+
169
+ setDataset(nextDataset)
170
+ const defaultGrain =
171
+ nextDataset.available_grains.find((grain) => grain.grain === "model_metric_source") ??
172
+ nextDataset.available_grains[0]
173
+ if (defaultGrain) {
174
+ setSelectedGrain(defaultGrain.grain)
175
+ }
176
+ setSelectedColumns(getDefaultColumnKeys(nextDataset.columns))
177
+ } catch (err) {
178
+ if (!isCancelled) {
179
+ setError(err instanceof Error ? err.message : "Failed to load research joins")
180
+ }
181
+ } finally {
182
+ if (!isCancelled) {
183
+ setLoading(false)
184
+ }
185
+ }
186
+ }
187
+
188
+ load()
189
+
190
+ return () => {
191
+ isCancelled = true
192
+ }
193
+ }, [evalId])
194
+
195
+ const visibleColumns = useMemo(() => {
196
+ if (!dataset) {
197
+ return []
198
+ }
199
+
200
+ return dataset.columns.filter((column) => selectedColumns.has(column.key))
201
+ }, [dataset, selectedColumns])
202
+
203
+ const selectedGrainOption = dataset?.available_grains.find((option) => option.grain === selectedGrain)
204
+ const selectedJoinKeys = selectedGrainOption?.join_keys ?? []
205
+
206
+ const filteredRows = useMemo(() => {
207
+ if (!dataset) {
208
+ return []
209
+ }
210
+
211
+ const normalizedQuery = query.trim().toLowerCase()
212
+ const rows = dataset.rows.filter((row) => {
213
+ const grainMatches =
214
+ selectedGrain === "model_metric_instance"
215
+ ? row.has_instance_data
216
+ : row.join_grain === selectedGrain
217
+
218
+ if (!grainMatches) {
219
+ return false
220
+ }
221
+
222
+ if (metricFilter !== "all" && row.metric_name !== metricFilter) {
223
+ return false
224
+ }
225
+
226
+ if (sourceFilter !== "all" && row.source_name !== sourceFilter) {
227
+ return false
228
+ }
229
+
230
+ if (relationshipFilter !== "all" && row.evaluator_relationship !== relationshipFilter) {
231
+ return false
232
+ }
233
+
234
+ if (instanceFilter === "linked" && !row.has_instance_data) {
235
+ return false
236
+ }
237
+
238
+ if (instanceFilter === "unlinked" && row.has_instance_data) {
239
+ return false
240
+ }
241
+
242
+ if (!normalizedQuery) {
243
+ return true
244
+ }
245
+
246
+ return [
247
+ row.model_name,
248
+ row.model_id,
249
+ row.developer,
250
+ row.metric_name,
251
+ row.source_name,
252
+ row.source_organization_name,
253
+ row.component_name,
254
+ ]
255
+ .filter(Boolean)
256
+ .some((value) => String(value).toLowerCase().includes(normalizedQuery))
257
+ })
258
+
259
+ return sortRows(rows, sortKey, sortDirection)
260
+ }, [
261
+ dataset,
262
+ instanceFilter,
263
+ metricFilter,
264
+ query,
265
+ relationshipFilter,
266
+ selectedGrain,
267
+ sortDirection,
268
+ sortKey,
269
+ sourceFilter,
270
+ ])
271
+
272
+ const previewRows = filteredRows.slice(0, 100)
273
+
274
+ const facets = useMemo(() => {
275
+ if (!dataset) {
276
+ return {
277
+ metrics: [],
278
+ sources: [],
279
+ relationships: [],
280
+ }
281
+ }
282
+
283
+ const byKey = new Map(dataset.facets.map((facet) => [facet.key, facet.values]))
284
+ return {
285
+ metrics: byKey.get("metric_name") ?? [],
286
+ sources: byKey.get("source_name") ?? [],
287
+ relationships: byKey.get("evaluator_relationship") ?? [],
288
+ }
289
+ }, [dataset])
290
+
291
// Flips a single column in the selection: removes it when present, adds it otherwise.
// A fresh Set is returned so React sees a new state value.
const toggleColumn = (key: string) => {
  setSelectedColumns((previous) => {
    const updated = new Set(previous)
    // Set.delete reports whether the key was present; if it was not, select it instead.
    const wasSelected = updated.delete(key)
    if (!wasSelected) {
      updated.add(key)
    }
    return updated
  })
}
302
+
303
// Toggles every column of one field group at once.
// If all of the group's columns are already selected and the group is not a core group,
// the whole group is deselected; in every other case all of its columns are selected.
const toggleJoinGroup = (group: ResearchJoinColumnGroup) => {
  if (!dataset) {
    return
  }

  const memberKeys: Array<ResearchJoinColumn["key"]> = []
  for (const column of dataset.columns) {
    if (column.group === group) {
      memberKeys.push(column.key)
    }
  }

  setSelectedColumns((previous) => {
    const updated = new Set(previous)
    // Decide once, against the selection as it was before this toggle.
    const fullySelected = memberKeys.every((key) => updated.has(key))
    const shouldDeselect = fullySelected && !CORE_GROUPS.includes(group)

    if (shouldDeselect) {
      memberKeys.forEach((key) => updated.delete(key))
    } else {
      memberKeys.forEach((key) => updated.add(key))
    }
    return updated
  })
}
325
+
326
// Downloads all currently filtered rows (not just the on-screen preview) as CSV or JSON.
// Only the visible columns are exported; if none are visible, the dataset's default-visible
// columns are used instead. The file name is built from the eval summary id and the grain.
const exportRows = (format: "csv" | "json") => {
  if (!dataset) {
    return
  }

  const exportColumns =
    visibleColumns.length === 0
      ? dataset.columns.filter((column) => column.defaultVisible)
      : visibleColumns
  const baseName = `${dataset.eval_summary_id}-${selectedGrain}-join`

  if (format !== "json") {
    const headerLine = exportColumns.map((column) => escapeCsvCell(column.key)).join(",")
    const rowLines = filteredRows.map((row) =>
      exportColumns.map((column) => escapeCsvCell(getRowValue(row, column.key))).join(",")
    )
    const bodyText = rowLines.join("\n")
    downloadText(`${baseName}.csv`, `${headerLine}\n${bodyText}`, "text/csv;charset=utf-8")
    return
  }

  // JSON export: one object per row, restricted to the exported columns, raw values.
  const records = filteredRows.map((row) => {
    const entries = exportColumns.map((column) => [column.key, getRowValue(row, column.key)])
    return Object.fromEntries(entries)
  })
  downloadText(`${baseName}.json`, JSON.stringify(records, null, 2), "application/json;charset=utf-8")
}
348
+
349
// Copies the join recipe for the selected grain to the clipboard and shows the
// "Copied" state for a short time. Any failure simply leaves the button in its normal state.
const copyRecipe = async () => {
  if (!dataset) {
    return
  }

  const resetDelayMs = 1800
  const clearCopied = () => setCopiedRecipe(false)

  try {
    const recipe = getJoinRecipe(dataset, selectedGrain)
    await navigator.clipboard.writeText(recipe)
    setCopiedRecipe(true)
    window.setTimeout(clearCopied, resetDelayMs)
  } catch {
    // Best effort: clipboard access can be denied or unavailable.
    clearCopied()
  }
}
362
+
363
+ if (loading) {
364
+ return (
365
+ <Card className="overflow-hidden">
366
+ <CardContent className="flex items-center gap-3 p-5 text-sm text-muted-foreground">
367
+ <Loader2 className="h-4 w-4 animate-spin" />
368
+ Loading research join dataset...
369
+ </CardContent>
370
+ </Card>
371
+ )
372
+ }
373
+
374
+ if (error || !dataset) {
375
+ return (
376
+ <Card className="overflow-hidden border-amber-200/70 bg-amber-50/30 dark:border-amber-900/50 dark:bg-amber-950/10">
377
+ <CardContent className="p-5 text-sm text-amber-900 dark:text-amber-100">
378
+ Research joins are unavailable for this benchmark.
379
+ </CardContent>
380
+ </Card>
381
+ )
382
+ }
383
+
384
+ return (
385
+ <Card className="overflow-hidden">
386
+ <CardHeader className="border-b bg-muted/10">
387
+ <div className="flex flex-col gap-4 xl:flex-row xl:items-start xl:justify-between">
388
+ <div className="space-y-2">
389
+ <div className="flex items-center gap-2">
390
+ <Network className="h-5 w-5 text-primary" />
391
+ <CardTitle className="text-xl">Research Join Builder</CardTitle>
392
+ </div>
393
+ <CardDescription>
394
+ Build source-safe tables from benchmark rows, metric identities, model routes, hierarchy keys, and instance links.
395
+ </CardDescription>
396
+ </div>
397
+
398
+ <div className="flex flex-wrap items-center gap-2">
399
+ <Badge variant={dataset.source === "query_api" ? "default" : "secondary"}>
400
+ {dataset.source === "query_api" ? "Query API" : "Artifact fallback"}
401
+ </Badge>
402
+ <Badge variant="outline">{filteredRows.length.toLocaleString()} joined rows</Badge>
403
+ </div>
404
+ </div>
405
+ </CardHeader>
406
+
407
+ <CardContent className="space-y-5 p-4 sm:p-5">
408
+ <section className="grid gap-3 lg:grid-cols-3">
409
+ {dataset.join_steps.map((step) => (
410
+ <div key={step.step} className="rounded-2xl border bg-background/70 p-4">
411
+ <div className="flex items-center gap-2">
412
+ <span className="flex h-7 w-7 items-center justify-center rounded-full bg-primary text-xs font-semibold text-primary-foreground">
413
+ {step.step}
414
+ </span>
415
+ <div className="font-semibold">{step.title}</div>
416
+ </div>
417
+ <p className="mt-2 text-sm leading-6 text-muted-foreground">{step.description}</p>
418
+ <div className="mt-3 flex flex-wrap gap-1.5">
419
+ {step.keys.map((key) => (
420
+ <span key={key} className="rounded-md border bg-muted/20 px-2 py-1 font-mono text-[11px]">
421
+ {key}
422
+ </span>
423
+ ))}
424
+ </div>
425
+ </div>
426
+ ))}
427
+ </section>
428
+
429
+ <section className="grid gap-4 xl:grid-cols-[minmax(0,0.95fr)_minmax(0,1.05fr)]">
430
+ <div className="space-y-4 rounded-2xl border bg-muted/5 p-4">
431
+ <div className="flex items-center gap-2">
432
+ <GitBranch className="h-4 w-4 text-muted-foreground" />
433
+ <div className="text-sm font-semibold">Base Grain</div>
434
+ </div>
435
+
436
+ <Select value={selectedGrain} onValueChange={(value) => setSelectedGrain(value as ResearchJoinGrain)}>
437
+ <SelectTrigger className="w-full">
438
+ <SelectValue />
439
+ </SelectTrigger>
440
+ <SelectContent>
441
+ {dataset.available_grains.map((grain) => (
442
+ <SelectItem key={grain.grain} value={grain.grain}>
443
+ {grain.label} ({grain.row_count.toLocaleString()})
444
+ </SelectItem>
445
+ ))}
446
+ </SelectContent>
447
+ </Select>
448
+
449
+ {selectedGrainOption && (
450
+ <div className="rounded-xl border bg-background p-3">
451
+ <div className="text-sm font-medium">{selectedGrainOption.description}</div>
452
+ <div className="mt-3 flex items-center gap-2 text-xs font-semibold uppercase tracking-[0.16em] text-muted-foreground">
453
+ <KeyRound className="h-3.5 w-3.5" />
454
+ Join Keys
455
+ </div>
456
+ <div className="mt-2 flex flex-wrap gap-1.5">
457
+ {selectedJoinKeys.map((key) => (
458
+ <span key={key} className="rounded-md border bg-muted/30 px-2 py-1 font-mono text-[11px]">
459
+ {key}
460
+ </span>
461
+ ))}
462
+ </div>
463
+ </div>
464
+ )}
465
+ </div>
466
+
467
+ <div className="space-y-4 rounded-2xl border bg-muted/5 p-4">
468
+ <div className="flex items-center gap-2">
469
+ <TableProperties className="h-4 w-4 text-muted-foreground" />
470
+ <div className="text-sm font-semibold">Joined Field Groups</div>
471
+ </div>
472
+
473
+ <div className="grid gap-2 sm:grid-cols-2">
474
+ {JOIN_GROUPS.map((joinGroup) => {
475
+ const groupColumns = dataset.columns.filter((column) => column.group === joinGroup.group)
476
+ const active = groupColumns.some((column) => selectedColumns.has(column.key))
477
+
478
+ return (
479
+ <button
480
+ key={joinGroup.group}
481
+ type="button"
482
+ onClick={() => toggleJoinGroup(joinGroup.group)}
483
+ className={cn(
484
+ "rounded-xl border p-3 text-left transition-colors",
485
+ active
486
+ ? "border-foreground/20 bg-background shadow-sm"
487
+ : "border-border/70 bg-muted/20 text-muted-foreground hover:bg-muted/30"
488
+ )}
489
+ >
490
+ <div className="text-sm font-semibold">{joinGroup.label}</div>
491
+ <div className="mt-1 text-xs leading-5 text-muted-foreground">{joinGroup.description}</div>
492
+ </button>
493
+ )
494
+ })}
495
+ </div>
496
+ </div>
497
+ </section>
498
+
499
+ <section className="space-y-3 rounded-2xl border bg-background/70 p-4">
500
+ <div className="flex flex-col gap-3 lg:flex-row lg:items-center lg:justify-between">
501
+ <div className="flex items-center gap-2">
502
+ <Filter className="h-4 w-4 text-muted-foreground" />
503
+ <div className="text-sm font-semibold">Filters and Export</div>
504
+ </div>
505
+ <div className="flex flex-wrap gap-2">
506
+ <Button variant="outline" size="sm" className="gap-2" onClick={copyRecipe}>
507
+ <ClipboardList className="h-4 w-4" />
508
+ {copiedRecipe ? "Copied" : "Copy recipe"}
509
+ </Button>
510
+ <Button variant="outline" size="sm" className="gap-2" onClick={() => exportRows("csv")}>
511
+ <Download className="h-4 w-4" />
512
+ CSV
513
+ </Button>
514
+ <Button variant="outline" size="sm" className="gap-2" onClick={() => exportRows("json")}>
515
+ <Download className="h-4 w-4" />
516
+ JSON
517
+ </Button>
518
+ </div>
519
+ </div>
520
+
521
+ <div className="grid gap-3 md:grid-cols-2 xl:grid-cols-6">
522
+ <Input
523
+ value={query}
524
+ onChange={(event) => setQuery(event.target.value)}
525
+ placeholder="Search model, metric, source..."
526
+ className="xl:col-span-2"
527
+ />
528
+
529
+ <Select value={metricFilter} onValueChange={setMetricFilter}>
530
+ <SelectTrigger className="w-full">
531
+ <SelectValue placeholder="Metric" />
532
+ </SelectTrigger>
533
+ <SelectContent>
534
+ <SelectItem value="all">All metrics</SelectItem>
535
+ {facets.metrics.map((facet) => (
536
+ <SelectItem key={facet.value} value={facet.value}>
537
+ {facet.label} ({facet.count})
538
+ </SelectItem>
539
+ ))}
540
+ </SelectContent>
541
+ </Select>
542
+
543
+ <Select value={sourceFilter} onValueChange={setSourceFilter}>
544
+ <SelectTrigger className="w-full">
545
+ <SelectValue placeholder="Source" />
546
+ </SelectTrigger>
547
+ <SelectContent>
548
+ <SelectItem value="all">All sources</SelectItem>
549
+ {facets.sources.map((facet) => (
550
+ <SelectItem key={facet.value} value={facet.value}>
551
+ {facet.label} ({facet.count})
552
+ </SelectItem>
553
+ ))}
554
+ </SelectContent>
555
+ </Select>
556
+
557
+ <Select value={relationshipFilter} onValueChange={setRelationshipFilter}>
558
+ <SelectTrigger className="w-full">
559
+ <SelectValue placeholder="Relationship" />
560
+ </SelectTrigger>
561
+ <SelectContent>
562
+ <SelectItem value="all">All relationships</SelectItem>
563
+ {facets.relationships.map((facet) => (
564
+ <SelectItem key={facet.value} value={facet.value}>
565
+ {facet.label} ({facet.count})
566
+ </SelectItem>
567
+ ))}
568
+ </SelectContent>
569
+ </Select>
570
+
571
+ <Select value={instanceFilter} onValueChange={setInstanceFilter}>
572
+ <SelectTrigger className="w-full">
573
+ <SelectValue placeholder="Instances" />
574
+ </SelectTrigger>
575
+ <SelectContent>
576
+ <SelectItem value="all">All instance states</SelectItem>
577
+ <SelectItem value="linked">Has instances</SelectItem>
578
+ <SelectItem value="unlinked">No instances</SelectItem>
579
+ </SelectContent>
580
+ </Select>
581
+ </div>
582
+
583
+ <div className="grid gap-3 md:grid-cols-2 xl:grid-cols-[minmax(0,1fr)_16rem_10rem]">
584
+ <div className="flex flex-wrap gap-1.5">
585
+ {dataset.columns.map((column) => (
586
+ <button
587
+ key={column.key}
588
+ type="button"
589
+ onClick={() => toggleColumn(column.key)}
590
+ title={column.description}
591
+ className={cn(
592
+ "rounded-md border px-2.5 py-1 text-xs font-medium transition-colors",
593
+ selectedColumns.has(column.key)
594
+ ? "border-foreground/20 bg-muted text-foreground"
595
+ : "border-border/70 bg-background text-muted-foreground hover:bg-muted/20"
596
+ )}
597
+ >
598
+ {column.label}
599
+ </button>
600
+ ))}
601
+ </div>
602
+
603
+ <Select value={sortKey} onValueChange={setSortKey}>
604
+ <SelectTrigger className="w-full">
605
+ <SelectValue placeholder="Sort" />
606
+ </SelectTrigger>
607
+ <SelectContent>
608
+ {dataset.columns.map((column) => (
609
+ <SelectItem key={column.key} value={column.key}>
610
+ Sort: {column.label}
611
+ </SelectItem>
612
+ ))}
613
+ </SelectContent>
614
+ </Select>
615
+
616
+ <Select value={sortDirection} onValueChange={(value) => setSortDirection(value as "asc" | "desc")}>
617
+ <SelectTrigger className="w-full">
618
+ <SelectValue />
619
+ </SelectTrigger>
620
+ <SelectContent>
621
+ <SelectItem value="asc">Ascending</SelectItem>
622
+ <SelectItem value="desc">Descending</SelectItem>
623
+ </SelectContent>
624
+ </Select>
625
+ </div>
626
+ </section>
627
+
628
+ {dataset.warnings.length > 0 && (
629
+ <div className="rounded-2xl border border-amber-200/80 bg-amber-50/70 p-3 text-sm leading-6 text-amber-950 dark:border-amber-900/50 dark:bg-amber-950/20 dark:text-amber-100">
630
+ {dataset.warnings.map((warning) => (
631
+ <div key={warning}>{warning}</div>
632
+ ))}
633
+ </div>
634
+ )}
635
+
636
+ <div className="overflow-hidden rounded-2xl border">
637
+ <Table className="min-w-[980px]">
638
+ <TableHeader>
639
+ <TableRow className="hover:bg-transparent">
640
+ {visibleColumns.map((column) => (
641
+ <TableHead key={column.key} className="px-3">
642
+ <span className={cn(column.isJoinKey && "inline-flex items-center gap-1")}>
643
+ {column.isJoinKey && <KeyRound className="h-3 w-3 text-muted-foreground" />}
644
+ {column.label}
645
+ </span>
646
+ </TableHead>
647
+ ))}
648
+ </TableRow>
649
+ </TableHeader>
650
+ <TableBody>
651
+ {previewRows.map((row) => (
652
+ <TableRow key={row.row_id}>
653
+ {visibleColumns.map((column) => {
654
+ const value = getRowValue(row, column.key)
655
+ const isUrl = column.type === "url" && typeof value === "string" && value.startsWith("http")
656
+
657
+ return (
658
+ <TableCell key={column.key} className="max-w-[22rem] whitespace-normal px-3 align-top">
659
+ {isUrl ? (
660
+ <a
661
+ href={value}
662
+ target="_blank"
663
+ rel="noreferrer"
664
+ className="text-primary underline-offset-4 hover:underline"
665
+ >
666
+ Open
667
+ </a>
668
+ ) : (
669
+ <span className={cn(column.isJoinKey && "font-mono text-xs")}>
670
+ {formatCellValue(value)}
671
+ </span>
672
+ )}
673
+ </TableCell>
674
+ )
675
+ })}
676
+ </TableRow>
677
+ ))}
678
+ {previewRows.length === 0 && (
679
+ <TableRow>
680
+ <TableCell colSpan={Math.max(visibleColumns.length, 1)} className="px-6 py-10 text-center text-sm text-muted-foreground">
681
+ No rows match the selected join filters.
682
+ </TableCell>
683
+ </TableRow>
684
+ )}
685
+ </TableBody>
686
+ </Table>
687
+ </div>
688
+
689
+ {filteredRows.length > previewRows.length && (
690
+ <div className="text-center text-xs text-muted-foreground">
691
+ Showing first {previewRows.length.toLocaleString()} rows. Export includes all {filteredRows.length.toLocaleString()} filtered rows.
692
+ </div>
693
+ )}
694
+ </CardContent>
695
+ </Card>
696
+ )
697
+ }
lib/benchmark-schema.ts CHANGED
@@ -88,16 +88,24 @@ export interface ModelInfo {
88
  }
89
 
90
  export interface EvaluationResult {
 
91
  evaluation_name: string
92
  display_name?: string
93
  canonical_display_name?: string
94
  metric_summary_id?: string
 
95
  metric_key?: string
 
 
 
 
96
  evaluation_timestamp: string
97
  source_data?: string[] | SourceData
 
98
  metric_config: MetricConfig
99
  score_details: ScoreDetails
100
  detailed_evaluation_results_url?: string
 
101
  generation_config?: GenerationConfig
102
  evalcards?: { annotations?: RowAnnotations }
103
  }
@@ -106,6 +114,10 @@ export interface MetricConfig {
106
  evaluation_description: string
107
  lower_is_better: boolean
108
  score_type: 'continuous' | 'discrete' | 'binary'
 
 
 
 
109
  min_score?: number
110
  max_score?: number
111
  unit?: string
 
88
  }
89
 
90
  // One evaluation result record. Only evaluation_name, evaluation_timestamp,
  // metric_config and score_details are required; every other field may be absent.
  export interface EvaluationResult {
91
+ evaluation_result_id?: string
92
  evaluation_name: string
93
  display_name?: string
94
  canonical_display_name?: string
95
  metric_summary_id?: string
96
  // NOTE(review): metric_id / metric_kind / metric_unit / metric_parameters are also declared
  // on MetricConfig; which copy is authoritative is not visible here — confirm with the producer.
+ metric_id?: string
97
  metric_key?: string
98
+ metric_name?: string
99
+ metric_kind?: string
100
+ metric_unit?: string
101
+ metric_parameters?: Record<string, any>
102
  evaluation_timestamp: string
103
  source_data?: string[] | SourceData
104
+ source_record_url?: string
105
  metric_config: MetricConfig
106
  score_details: ScoreDetails
107
  detailed_evaluation_results_url?: string
108
  // Typed `unknown`: consumers must narrow this value before reading anything from it.
+ detailed_evaluation_results_meta?: unknown
109
  generation_config?: GenerationConfig
110
  evalcards?: { annotations?: RowAnnotations }
111
  }
 
114
  evaluation_description: string
115
  lower_is_better: boolean
116
  score_type: 'continuous' | 'discrete' | 'binary'
117
+ metric_id?: string
118
+ metric_kind?: string
119
+ metric_unit?: string
120
+ metric_parameters?: Record<string, any>
121
  min_score?: number
122
  max_score?: number
123
  unit?: string
lib/dashboard-data-client.ts CHANGED
@@ -1,4 +1,5 @@
1
  import type { BackendManifestStatus, ComparisonIndex, CorpusAggregates, EvalHierarchy } from "@/lib/backend-artifacts"
 
2
  import type { BenchmarkEvaluationCardData } from "@/components/benchmark-evaluation-card"
3
  import type { HFEvalDetail } from "@/lib/hf-data"
4
  import type {
@@ -103,3 +104,9 @@ export function fetchComparisonIndex() {
103
  export function fetchCorpusAggregates() {
104
  return fetchJson<CorpusAggregates>("/api/corpus-aggregates")
105
  }
 
 
 
 
 
 
 
1
  import type { BackendManifestStatus, ComparisonIndex, CorpusAggregates, EvalHierarchy } from "@/lib/backend-artifacts"
2
+ import type { ResearchJoinDataset } from "@/lib/research-join-types"
3
  import type { BenchmarkEvaluationCardData } from "@/components/benchmark-evaluation-card"
4
  import type { HFEvalDetail } from "@/lib/hf-data"
5
  import type {
 
104
  export function fetchCorpusAggregates() {
105
  return fetchJson<CorpusAggregates>("/api/corpus-aggregates")
106
  }
107
+
108
/**
 * Requests the researcher join dataset for a single evaluation from the
 * `/api/research/eval-joins` route.
 *
 * @param evalId - Evaluation identifier; it is URI-encoded before being placed
 *   in the `id` query parameter.
 * @returns The result of `fetchJson` for that endpoint, typed as a
 *   `ResearchJoinDataset`.
 */
export function fetchEvalResearchJoins(evalId: string) {
  const encodedId = encodeURIComponent(evalId)
  const endpoint = `/api/research/eval-joins?id=${encodedId}`
  return fetchJson<ResearchJoinDataset>(endpoint)
}
lib/hf-data.ts CHANGED
@@ -544,6 +544,7 @@ export interface HFEvalListEntry extends SignalSummaries {
544
  }
545
 
546
  export interface HFEvalModelResult {
 
547
  model_id: string
548
  model_route_id: string
549
  model_name: string
@@ -569,6 +570,10 @@ export interface HFEvalMetric {
569
  legacy_eval_summary_id?: string
570
  evaluation_name?: string
571
  metric_name: string
 
 
 
 
572
  metric_key: string
573
  display_name?: string
574
  canonical_display_name?: string
@@ -1383,6 +1388,7 @@ function flattenHierarchyNode(
1383
  const modelInfo = buildModelInfoForVariant(detail, result, variantMeta)
1384
  const inlineSamples = parseInstanceLevelData(result.instance_level_data)
1385
  const evaluationResult: EvaluationResult = {
 
1386
  evaluation_name: metric.metric_name || metric.evaluation_name || metric.display_name,
1387
  display_name: metric.display_name || metric.metric_name || metric.evaluation_name,
1388
  canonical_display_name:
@@ -1390,9 +1396,17 @@ function flattenHierarchyNode(
1390
  metric.display_name ||
1391
  `${context.benchmark ?? context.display_name ?? "Benchmark"} / ${metric.metric_name}`,
1392
  metric_summary_id: metric.metric_summary_id,
 
1393
  metric_key: metric.metric_key,
 
 
 
 
 
 
1394
  evaluation_timestamp: result.retrieved_timestamp ?? detail.last_updated ?? "",
1395
  source_data: sourceData,
 
1396
  metric_config: metric.metric_config,
1397
  score_details: {
1398
  score: result.score,
@@ -1401,6 +1415,7 @@ function flattenHierarchyNode(
1401
  result.detailed_evaluation_results
1402
  ),
1403
  evalcards: result.evalcards,
 
1404
  }
1405
 
1406
  const existing = resultsByVariant.get(variantKey)
 
544
  }
545
 
546
  export interface HFEvalModelResult {
547
+ evaluation_result_id?: string
548
  model_id: string
549
  model_route_id: string
550
  model_name: string
 
570
  legacy_eval_summary_id?: string
571
  evaluation_name?: string
572
  metric_name: string
573
+ metric_id?: string | null
574
+ metric_kind?: string | null
575
+ metric_unit?: string | null
576
+ metric_parameters?: Record<string, unknown> | null
577
  metric_key: string
578
  display_name?: string
579
  canonical_display_name?: string
 
1388
  const modelInfo = buildModelInfoForVariant(detail, result, variantMeta)
1389
  const inlineSamples = parseInstanceLevelData(result.instance_level_data)
1390
  const evaluationResult: EvaluationResult = {
1391
+ evaluation_result_id: result.evaluation_result_id,
1392
  evaluation_name: metric.metric_name || metric.evaluation_name || metric.display_name,
1393
  display_name: metric.display_name || metric.metric_name || metric.evaluation_name,
1394
  canonical_display_name:
 
1396
  metric.display_name ||
1397
  `${context.benchmark ?? context.display_name ?? "Benchmark"} / ${metric.metric_name}`,
1398
  metric_summary_id: metric.metric_summary_id,
1399
+ metric_id: metric.metric_id ?? undefined,
1400
  metric_key: metric.metric_key,
1401
+ metric_name: metric.metric_name,
1402
+ metric_kind: metric.metric_kind ?? undefined,
1403
+ metric_unit:
1404
+ metric.metric_unit ??
1405
+ (typeof metric.metric_config?.unit === "string" ? metric.metric_config.unit : undefined),
1406
+ metric_parameters: metric.metric_parameters ?? undefined,
1407
  evaluation_timestamp: result.retrieved_timestamp ?? detail.last_updated ?? "",
1408
  source_data: sourceData,
1409
+ source_record_url: result.source_record_url,
1410
  metric_config: metric.metric_config,
1411
  score_details: {
1412
  score: result.score,
 
1415
  result.detailed_evaluation_results
1416
  ),
1417
  evalcards: result.evalcards,
1418
+ detailed_evaluation_results_meta: result.detailed_evaluation_results_meta,
1419
  }
1420
 
1421
  const existing = resultsByVariant.get(variantKey)
lib/model-data.ts CHANGED
@@ -729,18 +729,26 @@ function toModelResultsForMetric(
729
  }
730
 
731
  const evaluationResult: EvaluationResult = {
 
732
  evaluation_name: metric.metric_name || metric.evaluation_name || metric.display_name || "",
733
  display_name: metric.display_name || metric.metric_name || metric.evaluation_name,
734
  canonical_display_name: metric.canonical_display_name,
735
  metric_summary_id: metric.metric_summary_id,
 
736
  metric_key: metric.metric_key,
 
 
 
 
737
  evaluation_timestamp: evaluationTimestamp,
 
738
  metric_config: metricConfig,
739
  score_details: { score: mr.score ?? 0 },
740
  detailed_evaluation_results_url: getCanonicalInstanceResultsUrl(
741
  mr.detailed_evaluation_results
742
  ),
743
  evalcards: mr.evalcards,
 
744
  }
745
 
746
  return {
 
729
  }
730
 
731
  const evaluationResult: EvaluationResult = {
732
+ evaluation_result_id: mr.evaluation_result_id,
733
  evaluation_name: metric.metric_name || metric.evaluation_name || metric.display_name || "",
734
  display_name: metric.display_name || metric.metric_name || metric.evaluation_name,
735
  canonical_display_name: metric.canonical_display_name,
736
  metric_summary_id: metric.metric_summary_id,
737
+ metric_id: metric.metric_id ?? undefined,
738
  metric_key: metric.metric_key,
739
+ metric_name: metric.metric_name,
740
+ metric_kind: metric.metric_kind ?? undefined,
741
+ metric_unit: metric.metric_unit ?? metricConfig.unit,
742
+ metric_parameters: metric.metric_parameters ?? undefined,
743
  evaluation_timestamp: evaluationTimestamp,
744
+ source_record_url: mr.source_record_url,
745
  metric_config: metricConfig,
746
  score_details: { score: mr.score ?? 0 },
747
  detailed_evaluation_results_url: getCanonicalInstanceResultsUrl(
748
  mr.detailed_evaluation_results
749
  ),
750
  evalcards: mr.evalcards,
751
+ detailed_evaluation_results_meta: mr.detailed_evaluation_results_meta,
752
  }
753
 
754
  return {
lib/research-join-types.ts ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/** Origin of a join dataset: fetched from the live Query API, or built from frontend artifacts. */
+ export type ResearchJoinSource = "query_api" | "artifact"
2
+
3
/** Row shape ("grain") that a join row or a grain option refers to. */
+ export type ResearchJoinGrain =
4
+ | "model_metric"
5
+ | "model_metric_source"
6
+ | "model_metric_instance"
7
+ | "composite_component"
8
+
9
/** Field group a column belongs to; each column definition names exactly one group. */
+ export type ResearchJoinColumnGroup =
10
+ | "identity"
11
+ | "hierarchy"
12
+ | "metric"
13
+ | "model"
14
+ | "score"
15
+ | "source"
16
+ | "instance"
17
+ | "quality"
18
+
19
/** Metadata describing one column of the join table. */
+ export interface ResearchJoinColumn {
20
// Name of the row field this column reads (the column definitions use ResearchJoinRow property names).
+ key: string
21
+ label: string
22
+ group: ResearchJoinColumnGroup
23
+ description: string
24
+ defaultVisible: boolean
25
// Marks columns that act as join keys.
+ isJoinKey?: boolean
26
// Value kind of the column; left unset on plain text columns in the column definitions.
+ type?: "string" | "number" | "boolean" | "url" | "date"
27
+ }
28
+
29
/** A selectable row grain together with how many rows it covers and which keys join it. */
+ export interface ResearchJoinGrainOption {
30
+ grain: ResearchJoinGrain
31
+ label: string
32
+ description: string
33
// Number of dataset rows counted for this grain.
+ row_count: number
34
// Row field names that identify a row at this grain.
+ join_keys: string[]
35
+ }
36
+
37
/** One distinct value of a facet, with its display label and the number of rows carrying it. */
+ export interface ResearchJoinFacetValue {
38
+ value: string
39
+ label: string
40
+ count: number
41
+ }
42
+
43
/** A filterable facet: the row field it is computed from plus its distinct values. */
+ export interface ResearchJoinFacet {
44
+ key: string
45
+ label: string
46
+ values: ResearchJoinFacetValue[]
47
+ }
48
+
49
/** One numbered step of the join walkthrough, with the row keys relevant to that step. */
+ export interface ResearchJoinStep {
50
+ step: number
51
+ title: string
52
+ description: string
53
+ keys: string[]
54
+ }
55
+
56
/**
 * One denormalized row of the join table. Optional fields are nullable so a
 * producer can emit an explicit `null` when a value is not available.
 */
+ export interface ResearchJoinRow {
57
// Identifier of this row within the dataset.
+ row_id: string
58
+ join_grain: ResearchJoinGrain
59
+ eval_summary_id: string
60
+ evaluation_name: string
61
// Benchmark hierarchy keys.
+ benchmark_family_key?: string | null
62
+ benchmark_parent_key?: string | null
63
+ benchmark_leaf_key?: string | null
64
+ component_eval_summary_id?: string | null
65
+ component_name?: string | null
66
// Metric identity.
+ metric_summary_id?: string | null
67
+ metric_id?: string | null
68
+ metric_key?: string | null
69
+ metric_name: string
70
+ metric_kind?: string | null
71
+ metric_unit?: string | null
72
// Model identity.
+ model_route_id?: string | null
73
+ model_id: string
74
+ model_name: string
75
+ developer?: string | null
76
// Score and ranking.
+ score?: number | null
77
+ normalized_score?: number | null
78
+ rank?: number | null
79
+ rank_total?: number | null
80
+ lower_is_better?: boolean | null
81
// Source provenance. source_name is required; the artifact builder substitutes a
// missing-source bucket value when no name was reported.
+ source_name: string
82
+ source_type?: string | null
83
+ source_organization_name?: string | null
84
+ evaluator_relationship?: string | null
85
+ source_record_url?: string | null
86
+ source_dataset_name?: string | null
87
+ source_dataset_version?: string | null
88
+ source_hf_repo?: string | null
89
+ source_hf_split?: string | null
90
+ retrieved_at?: string | null
91
+ evaluation_timestamp?: string | null
92
// Instance linkage: "metric_exact" when the row has its own detailed results URL,
// "benchmark_available" when only benchmark-level instance data exists.
+ has_instance_data: boolean
93
+ instance_join_status: "metric_exact" | "benchmark_available" | "not_available"
94
+ detailed_evaluation_results_url?: string | null
95
// Quality / reproducibility signals.
+ sample_size?: number | null
96
+ standard_error?: number | null
97
// Pre-formatted text, not a structured interval.
+ confidence_interval?: string | null
98
+ generation_config_available?: boolean | null
99
+ }
100
+
101
/** Complete payload served for one evaluation: rows plus the metadata needed to browse them. */
+ export interface ResearchJoinDataset {
102
+ source: ResearchJoinSource
103
// ISO-8601 timestamp string when built from artifacts (`new Date().toISOString()`).
+ generated_at: string
104
+ eval_summary_id: string
105
+ eval_name: string
106
// Human-readable caveats about how the dataset was produced.
+ warnings: string[]
107
+ join_steps: ResearchJoinStep[]
108
+ available_grains: ResearchJoinGrainOption[]
109
+ columns: ResearchJoinColumn[]
110
+ facets: ResearchJoinFacet[]
111
+ rows: ResearchJoinRow[]
112
+ }
lib/research-joins.ts ADDED
@@ -0,0 +1,712 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import "server-only"
2
+
3
+ import type { SourceData } from "@/lib/benchmark-schema"
4
+ import type { BenchmarkEvalSummary, ModelResultForBenchmark } from "@/lib/eval-processing"
5
+ import { getModelFamilyRouteId } from "@/lib/model-family"
6
+ import { getEvalSummaryById } from "@/lib/model-data"
7
+ import type {
8
+ ResearchJoinColumn,
9
+ ResearchJoinDataset,
10
+ ResearchJoinFacet,
11
+ ResearchJoinGrain,
12
+ ResearchJoinGrainOption,
13
+ ResearchJoinRow,
14
+ } from "@/lib/research-join-types"
15
+
16
// Sentinel source name used when a result reports no (or a blank) source name;
// facets display it with the label "Missing source".
+ const MISSING_SOURCE_BUCKET = "__missing_source__"
17
+
18
/**
 * Column catalogue attached to every artifact-built join dataset.
 *
 * Array order is the order in which the columns are published. Each `key` is a
 * `ResearchJoinRow` field name. `isJoinKey` and `type` are only present on the
 * entries that need them.
 */
const COLUMN_DEFINITIONS: ResearchJoinColumn[] = [
  // Identity and primary join keys
  {
    key: "join_grain", label: "Grain", group: "identity",
    description: "The row shape selected for the join.",
    defaultVisible: false, isJoinKey: true,
  },
  {
    key: "eval_summary_id", label: "Eval ID", group: "identity",
    description: "Canonical benchmark detail identifier.",
    defaultVisible: false, isJoinKey: true,
  },
  {
    key: "metric_summary_id", label: "Metric ID", group: "metric",
    description: "Metric-level identifier used to align ranks, scores, and samples.",
    defaultVisible: true, isJoinKey: true,
  },
  {
    key: "model_route_id", label: "Model Route", group: "model",
    description: "Stable frontend route identifier for the model family.",
    defaultVisible: false, isJoinKey: true,
  },

  // Model and metric display fields
  {
    key: "model_name", label: "Model", group: "model",
    description: "Display name for the model row.",
    defaultVisible: true,
  },
  {
    key: "developer", label: "Developer", group: "model",
    description: "Reported developer or provider.",
    defaultVisible: true,
  },
  {
    key: "metric_name", label: "Metric", group: "metric",
    description: "Metric display name from the benchmark artifact.",
    defaultVisible: true,
  },

  // Score and ranking
  {
    key: "score", label: "Score", group: "score",
    description: "Raw reported score for the selected row grain.",
    defaultVisible: true, type: "number",
  },
  {
    key: "rank", label: "Rank", group: "score",
    description: "Rank within the selected benchmark metric.",
    defaultVisible: true, type: "number",
  },
  {
    key: "rank_total", label: "Rank Total", group: "score",
    description: "Number of scored rows in the ranking partition.",
    defaultVisible: false, type: "number",
  },

  // Source provenance
  {
    key: "source_name", label: "Source", group: "source",
    description: "Source name, or the explicit missing-source bucket.",
    defaultVisible: true, isJoinKey: true,
  },
  {
    key: "source_organization_name", label: "Organization", group: "source",
    description: "Organization associated with the reported result.",
    defaultVisible: true,
  },
  {
    key: "evaluator_relationship", label: "Relationship", group: "source",
    description: "First-party, third-party, collaborative, or other source relationship.",
    defaultVisible: true,
  },

  // Benchmark hierarchy
  {
    key: "benchmark_family_key", label: "Family", group: "hierarchy",
    description: "Backend-declared benchmark family key.",
    defaultVisible: false, isJoinKey: true,
  },
  {
    key: "benchmark_parent_key", label: "Composite", group: "hierarchy",
    description: "Backend-declared composite or parent benchmark key.",
    defaultVisible: false, isJoinKey: true,
  },
  {
    key: "benchmark_leaf_key", label: "Leaf", group: "hierarchy",
    description: "Backend-declared leaf benchmark key.",
    defaultVisible: false, isJoinKey: true,
  },
  {
    key: "component_name", label: "Component", group: "hierarchy",
    description: "Composite component name when viewing rollup score rows.",
    defaultVisible: false,
  },

  // Instance linkage
  {
    key: "has_instance_data", label: "Instances", group: "instance",
    description: "Whether this row has exact or benchmark-level instance linkage.",
    defaultVisible: true, type: "boolean",
  },
  {
    key: "instance_join_status", label: "Instance Join", group: "instance",
    description: "Whether instance linkage is exact, benchmark-level only, or unavailable.",
    defaultVisible: false,
  },
  {
    key: "detailed_evaluation_results_url", label: "Instance URL", group: "instance",
    description: "Metric-selective sample data URL when the artifact exposes it.",
    defaultVisible: false, type: "url",
  },

  // Quality and reproducibility
  {
    key: "sample_size", label: "Sample Size", group: "quality",
    description: "Reported sample size when available.",
    defaultVisible: false, type: "number",
  },
  {
    key: "standard_error", label: "Std. Error", group: "quality",
    description: "Reported standard error when available.",
    defaultVisible: false, type: "number",
  },
  {
    key: "generation_config_available", label: "Gen Config", group: "quality",
    description: "Whether generation config is present for reproducibility checks.",
    defaultVisible: false, type: "boolean",
  },
  {
    key: "retrieved_at", label: "Retrieved", group: "quality",
    description: "Retrieval timestamp preserved separately from evaluation time.",
    defaultVisible: false, type: "date",
  },
]
205
+
206
/**
 * Normalizes a reported source name so it can be used as a join key.
 *
 * @param value - Raw source name; may be absent.
 * @returns The name with surrounding whitespace removed, or
 *   `MISSING_SOURCE_BUCKET` when the name is missing, empty, or whitespace-only.
 */
function normalizeSourceName(value: string | null | undefined) {
  const name = (value ?? "").trim()
  return name.length > 0 ? name : MISSING_SOURCE_BUCKET
}
210
+
211
/**
 * Extracts the dataset-provenance columns from a result's `source_data`.
 *
 * @param sourceData - Structured source data, a plain array, or nothing.
 * @returns The four `source_*` dataset fields. Every field is `null` when
 *   `sourceData` is missing or is an array, and individually `null` when the
 *   structured record lacks that property.
 */
function getSourceDataFields(sourceData: ModelResultForBenchmark["source_data"] | SourceData | undefined) {
  // Only the structured (non-array) form carries dataset metadata.
  const record = sourceData && !Array.isArray(sourceData) ? sourceData : undefined

  return {
    source_dataset_name: record?.dataset_name ?? null,
    source_dataset_version: record?.dataset_version ?? null,
    source_hf_repo: record?.hf_repo ?? null,
    source_hf_split: record?.hf_split ?? null,
  }
}
228
+
229
/**
 * Renders a score's confidence interval as display text.
 *
 * @param result - Score details that may carry a `confidence_interval`.
 * @returns `"<lower> - <upper> (<confidence_level>%)"`, or `null` when no
 *   interval is present.
 */
function formatConfidenceInterval(result: ModelResultForBenchmark["score_details"]) {
  const interval = result.confidence_interval
  if (interval) {
    const { lower, upper, confidence_level: level } = interval
    return `${lower} - ${upper} (${level}%)`
  }

  return null
}
237
+
238
/**
 * Rescales a score onto the metric's declared range.
 *
 * @param score - Raw score; anything that is not a finite number yields `null`.
 * @param minScore - Lower bound of the metric range (defaults to 0).
 * @param maxScore - Upper bound of the metric range (defaults to 1).
 * @returns `(score - min) / (max - min)` when the range is positive; otherwise
 *   the score is returned unchanged. The result is not clamped.
 */
function normalizeScore(score: number | null | undefined, minScore?: number, maxScore?: number) {
  if (!isFiniteNumber(score)) {
    return null
  }

  const lowerBound = minScore ?? 0
  const span = (maxScore ?? 1) - lowerBound
  if (span > 0) {
    return (score - lowerBound) / span
  }

  // Degenerate or inverted range: fall back to the raw score.
  return score
}
249
+
250
/**
 * Type guard for usable numeric values.
 *
 * @param value - Any value.
 * @returns `true` only for values of type `number` that are neither `NaN` nor
 *   infinite; numeric strings are rejected.
 */
function isFiniteNumber(value: unknown): value is number {
  if (typeof value !== "number") {
    return false
  }
  return Number.isFinite(value)
}
253
+
254
/**
 * Assigns competition-style ranks (1, 1, 3, ...) to the entries that have a
 * finite score.
 *
 * @param entries - Candidate entries; the array itself is not reordered.
 * @param getScore - Accessor returning an entry's score.
 * @param lowerIsBetter - When true the smallest score ranks first, otherwise
 *   the largest.
 * @returns `ranks`, a map from entry (by identity) to its rank, and `total`,
 *   the number of scored entries. Entries without a finite score are absent
 *   from `ranks`.
 */
function rankScoredRows<T>(
  entries: T[],
  getScore: (entry: T) => number | null | undefined,
  lowerIsBetter: boolean
) {
  const ascending = (first: T, second: T) => (getScore(first) ?? 0) - (getScore(second) ?? 0)
  const descending = (first: T, second: T) => ascending(second, first)

  // `filter` yields a fresh array, so sorting does not disturb `entries`.
  const ordered = entries
    .filter((entry) => isFiniteNumber(getScore(entry)))
    .sort(lowerIsBetter ? ascending : descending)

  const ranks = new Map<T, number>()
  let tierScore: number | null = null
  let tierRank = 0

  for (const [position, entry] of ordered.entries()) {
    const score = getScore(entry) ?? 0
    // A new rank tier starts whenever the score differs from the score that
    // opened the current tier by more than the tolerance.
    const opensNewTier = tierScore === null || Math.abs(score - tierScore) > 1e-9
    if (opensNewTier) {
      tierRank = position + 1
      tierScore = score
    }
    ranks.set(entry, tierRank)
  }

  return { ranks, total: ordered.length }
}
285
+
286
/**
 * Collects the metric-identity columns for a result.
 *
 * @param result - Evaluation result attached to a model result.
 * @returns Metric identifiers with absent values mapped to `null`.
 *   `metric_name` prefers the display name over the evaluation name, and
 *   `metric_unit` falls back to the unit declared in the result's metric config.
 */
function getMetricIdentity(result: ModelResultForBenchmark["result"]) {
  const {
    metric_summary_id: summaryId,
    metric_id: metricId,
    metric_key: metricKey,
    metric_kind: metricKind,
    metric_unit: metricUnit,
    display_name: displayName,
    evaluation_name: evaluationName,
    metric_config: metricConfig,
  } = result

  return {
    metric_summary_id: summaryId ?? null,
    metric_id: metricId ?? null,
    metric_key: metricKey ?? null,
    metric_name: displayName ?? evaluationName,
    metric_kind: metricKind ?? null,
    metric_unit: metricUnit ?? metricConfig.unit ?? null,
  }
}
296
+
297
/**
 * Classifies how a result links to instance-level data.
 *
 * @param result - Evaluation result; a truthy `detailed_evaluation_results_url`
 *   means an exact, metric-level link.
 * @param summary - Benchmark summary; `instance_data.available` signals
 *   benchmark-level availability only.
 * @returns `has_instance_data` plus the matching `instance_join_status`. The
 *   exact link takes precedence over benchmark-level availability.
 */
function getInstanceStatus(result: ModelResultForBenchmark["result"], summary: BenchmarkEvalSummary) {
  const status: ResearchJoinRow["instance_join_status"] = result.detailed_evaluation_results_url
    ? "metric_exact"
    : summary.instance_data?.available
      ? "benchmark_available"
      : "not_available"

  return {
    has_instance_data: status !== "not_available",
    instance_join_status: status,
  }
}
317
+
318
/**
 * Builds every join-row field except `row_id` for one model result.
 *
 * @param summary - Benchmark summary supplying the evaluation identity,
 *   hierarchy keys, and the metric range/direction used for normalization.
 * @param modelResult - The model result being flattened into a row.
 * @param grain - Grain label stamped onto the row.
 * @param rank - Rank of this result, or `null` when it is unranked.
 * @param rankTotal - Size of the ranking partition, or `null`.
 * @returns The row without its `row_id`; the caller adds the identifier.
 */
function baseRowFields(
  summary: BenchmarkEvalSummary,
  modelResult: ModelResultForBenchmark,
  grain: ResearchJoinGrain,
  rank: number | null,
  rankTotal: number | null
): Omit<ResearchJoinRow, "row_id"> {
  const {
    result,
    model_info: modelInfo,
    source_metadata: sourceMetadata,
    score_details: scoreDetails,
  } = modelResult
  const summaryMetric = summary.metric_config

  // The result-level record URL wins over the source-level URL.
  const recordUrl = result.source_record_url ?? sourceMetadata.source_url ?? null
  const routeId = modelResult.model_route_id ?? getModelFamilyRouteId(modelInfo)
  const normalizedScore = normalizeScore(
    modelResult.score,
    summaryMetric.min_score,
    summaryMetric.max_score
  )

  return {
    join_grain: grain,
    eval_summary_id: summary.evaluation_id,
    evaluation_name: summary.evaluation_name,
    benchmark_family_key: summary.benchmark_family_key ?? null,
    benchmark_parent_key: summary.composite_benchmark_key ?? null,
    benchmark_leaf_key: summary.benchmark_leaf_key ?? null,
    component_eval_summary_id: null,
    component_name: null,
    ...getMetricIdentity(result),
    model_route_id: routeId,
    model_id: modelInfo.id,
    model_name: modelInfo.name,
    developer: modelInfo.developer ?? null,
    score: modelResult.score,
    normalized_score: normalizedScore,
    rank,
    rank_total: rankTotal,
    lower_is_better: summaryMetric.lower_is_better,
    source_name: normalizeSourceName(sourceMetadata.source_name),
    source_type: sourceMetadata.source_type,
    source_organization_name: sourceMetadata.source_organization_name,
    evaluator_relationship: sourceMetadata.evaluator_relationship,
    source_record_url: recordUrl,
    ...getSourceDataFields(modelResult.source_data),
    // The model result's timestamp is reported as retrieval time; the nested
    // result's timestamp is reported as evaluation time.
    retrieved_at: modelResult.evaluation_timestamp,
    evaluation_timestamp: result.evaluation_timestamp,
    ...getInstanceStatus(result, summary),
    detailed_evaluation_results_url: result.detailed_evaluation_results_url ?? null,
    sample_size: scoreDetails.sample_size ?? null,
    standard_error: scoreDetails.standard_error ?? null,
    confidence_interval: formatConfidenceInterval(scoreDetails),
    generation_config_available: Boolean(result.generation_config),
  }
}
372
+
373
/**
 * Flattens a single-metric benchmark summary into join rows.
 *
 * For every model result one `model_metric_source` row is emitted, followed by
 * one `composite_component` row per entry in its `aggregate_components`.
 *
 * @param summary - Benchmark summary whose `model_results` are flattened.
 * @returns Rows in model-result order, each parent row followed by its
 *   component rows.
 */
function buildRowsFromModelResults(summary: BenchmarkEvalSummary): ResearchJoinRow[] {
  const rankData = rankScoredRows(
    summary.model_results,
    (modelResult) => modelResult.score,
    summary.metric_config.lower_is_better
  )

  const rows: ResearchJoinRow[] = []

  for (const [index, modelResult] of summary.model_results.entries()) {
    const rank = rankData.ranks.get(modelResult) ?? null
    const base = baseRowFields(summary, modelResult, "model_metric_source", rank, rankData.total)
    const modelKey = base.model_route_id ?? base.model_id

    rows.push({
      ...base,
      // The result index keeps ids distinct when one model appears in several
      // results (for example, once per source).
      row_id: [
        "model_metric_source",
        summary.evaluation_id,
        base.metric_summary_id ?? base.metric_key ?? base.metric_name,
        modelKey,
        index,
      ].join("::"),
    })

    for (const [componentIndex, component] of (modelResult.aggregate_components ?? []).entries()) {
      rows.push({
        ...base,
        // Include the parent result index as well: `componentIndex` restarts at
        // zero for every model result, so without it two results for the same
        // model sharing a component would produce identical row ids.
        row_id: [
          "composite_component",
          component.evaluation_id,
          modelKey,
          index,
          componentIndex,
        ].join("::"),
        join_grain: "composite_component",
        component_eval_summary_id: component.evaluation_id,
        component_name: component.composite_benchmark_name,
        score: component.score,
        normalized_score: component.normalized_score,
        source_name: normalizeSourceName(component.source_name),
        source_type: component.source_type,
        source_organization_name: component.source_organization_name,
        evaluator_relationship: component.evaluator_relationship,
        retrieved_at: component.evaluation_timestamp,
        evaluation_timestamp: component.evaluation_timestamp,
        // NOTE(review): rank, rank_total, the metric identity, and the quality
        // fields are inherited from the parent row and describe the aggregate
        // score, not this component's score. Confirm that is intended.
      })
    }
  }

  return rows
}
423
+
424
/**
 * Flattens a multi-metric leaderboard matrix into `model_metric` rows.
 *
 * One row is produced per (metric, leaderboard row) pair whose cell holds a
 * finite score; cells without one are skipped. Ranks are computed per metric
 * column.
 *
 * @param summary - Benchmark summary carrying `leaderboard_metrics` and
 *   `leaderboard_rows`.
 * @returns Rows grouped by metric, in leaderboard-row order within each metric.
 */
function buildRowsFromMatrix(summary: BenchmarkEvalSummary): ResearchJoinRow[] {
  const metrics = summary.leaderboard_metrics ?? []
  const matrixRows = summary.leaderboard_rows ?? []

  // Matrix cells carry no per-metric sample link, so instance status is the
  // same benchmark-level signal for every row.
  const instancesAvailable = Boolean(summary.instance_data?.available)
  const instanceJoinStatus: ResearchJoinRow["instance_join_status"] = instancesAvailable
    ? "benchmark_available"
    : "not_available"

  const output: ResearchJoinRow[] = []

  for (const metric of metrics) {
    const columnKey = metric.column_key
    const { ranks, total } = rankScoredRows(
      matrixRows,
      (row) => row.values[columnKey],
      metric.lower_is_better
    )
    const componentName =
      metric.scope === "subtask"
        ? metric.subtask_name ?? metric.subtask_key ?? null
        : null
    const metricName = metric.display_name || metric.metric_name

    for (const row of matrixRows) {
      const score = row.values[columnKey]
      if (isFiniteNumber(score)) {
        const modelRouteId = row.model_route_id ?? getModelFamilyRouteId(row.model_info)
        const rowId = ["model_metric", summary.evaluation_id, columnKey, modelRouteId].join("::")

        output.push({
          row_id: rowId,
          join_grain: "model_metric",
          eval_summary_id: summary.evaluation_id,
          evaluation_name: summary.evaluation_name,
          benchmark_family_key: summary.benchmark_family_key ?? null,
          benchmark_parent_key: summary.composite_benchmark_key ?? null,
          benchmark_leaf_key: summary.benchmark_leaf_key ?? null,
          component_eval_summary_id: null,
          component_name: componentName,
          metric_summary_id: metric.metric_summary_id,
          metric_id: null,
          metric_key: null,
          metric_name: metricName,
          metric_kind: null,
          metric_unit: metric.unit ?? null,
          model_route_id: modelRouteId,
          model_id: row.model_info.id,
          model_name: row.model_info.name,
          developer: row.model_info.developer ?? null,
          score,
          normalized_score: null,
          rank: ranks.get(row) ?? null,
          rank_total: total,
          lower_is_better: metric.lower_is_better,
          source_name: normalizeSourceName(row.source_metadata.source_name),
          source_type: row.source_metadata.source_type,
          source_organization_name: row.source_metadata.source_organization_name,
          evaluator_relationship: row.source_metadata.evaluator_relationship,
          source_record_url: row.source_metadata.source_url ?? null,
          ...getSourceDataFields(row.source_data),
          retrieved_at: row.evaluation_timestamp,
          evaluation_timestamp: row.evaluation_timestamp,
          has_instance_data: instancesAvailable,
          instance_join_status: instanceJoinStatus,
          detailed_evaluation_results_url: null,
          sample_size: null,
          standard_error: null,
          confidence_interval: null,
          generation_config_available: null,
        })
      }
    }
  }

  return output
}
500
+
501
/**
 * Computes the metric, source, and relationship facets for a set of rows.
 *
 * @param rows - Rows to tally.
 * @returns One facet per tracked field. Values are ordered by descending row
 *   count, ties broken by `localeCompare` on the value. Rows whose field is
 *   null, undefined, or empty are not counted.
 */
function buildFacets(rows: ResearchJoinRow[]): ResearchJoinFacet[] {
  const facetSpecs = [
    { key: "metric_name", label: "Metric" },
    { key: "source_name", label: "Source" },
    { key: "evaluator_relationship", label: "Relationship" },
  ] as const

  const facets: ResearchJoinFacet[] = []

  for (const { key, label } of facetSpecs) {
    const tally = new Map<string, number>()
    for (const row of rows) {
      const value = String(row[key] ?? "")
      if (value) {
        tally.set(value, (tally.get(value) ?? 0) + 1)
      }
    }

    const values = [...tally.entries()]
      .sort(
        ([leftValue, leftCount], [rightValue, rightCount]) =>
          rightCount - leftCount || leftValue.localeCompare(rightValue)
      )
      .map(([value, count]) => ({
        value,
        // The sentinel bucket gets a readable label; all other values label themselves.
        label: value === MISSING_SOURCE_BUCKET ? "Missing source" : value,
        count,
      }))

    facets.push({ key, label, values })
  }

  return facets
}
532
+
533
/**
 * Lists the grains that have at least one row, with their row counts.
 *
 * The `model_metric_instance` grain is a filter rather than a row label: it
 * counts every row with `has_instance_data`, whatever that row's own
 * `join_grain` is. All other grains count rows labelled with that grain.
 *
 * @param rows - Rows of the dataset.
 * @returns Grain options in fixed catalogue order, omitting grains with no rows.
 */
function buildGrainOptions(rows: ResearchJoinRow[]): ResearchJoinGrainOption[] {
  const catalogue: Omit<ResearchJoinGrainOption, "row_count">[] = [
    {
      grain: "model_metric",
      label: "Model x metric",
      description: "One row per model and benchmark metric, best for score matrices.",
      join_keys: ["eval_summary_id", "metric_summary_id", "model_route_id"],
    },
    {
      grain: "model_metric_source",
      label: "Model x metric x source",
      description: "Adds source provenance to each model metric row.",
      join_keys: ["eval_summary_id", "metric_summary_id", "model_route_id", "source_name"],
    },
    {
      grain: "model_metric_instance",
      label: "Model x metric x instances",
      description: "Filters to rows with exact or benchmark-level sample links.",
      join_keys: ["eval_summary_id", "metric_summary_id", "model_route_id", "detailed_evaluation_results_url"],
    },
    {
      grain: "composite_component",
      label: "Composite components",
      description: "One row per model and component score inside a rollup benchmark.",
      join_keys: ["component_eval_summary_id", "model_route_id", "source_name"],
    },
  ]

  const belongsTo = (row: ResearchJoinRow, grain: ResearchJoinGrain) =>
    grain === "model_metric_instance" ? row.has_instance_data : row.join_grain === grain

  const options: ResearchJoinGrainOption[] = []
  for (const details of catalogue) {
    const rowCount = rows.filter((row) => belongsTo(row, details.grain)).length
    if (rowCount > 0) {
      options.push({ ...details, row_count: rowCount })
    }
  }

  return options
}
573
+
574
/**
 * Produces the three-step join walkthrough shown alongside a dataset.
 *
 * @param grainOptions - Available grains; the first one's join keys are used
 *   as the keys of step 1. When the list is empty, the model x metric keys are
 *   used instead.
 * @returns Steps numbered 1 to 3 with title, description, and relevant keys.
 */
function buildJoinSteps(grainOptions: ResearchJoinGrainOption[]) {
  const [firstGrain] = grainOptions
  const baseKeys = firstGrain?.join_keys ?? ["eval_summary_id", "metric_summary_id", "model_route_id"]

  const stages = [
    {
      title: "Choose the base row grain",
      description:
        "Start from model-metric rows, source-aware rows, instance-linked rows, or composite component rows.",
      keys: baseKeys,
    },
    {
      title: "Add field groups as joins",
      description:
        "Join hierarchy, source provenance, instance links, and quality fields by the keys shown for the selected grain.",
      keys: ["benchmark_family_key", "benchmark_parent_key", "source_name", "detailed_evaluation_results_url"],
    },
    {
      title: "Filter, inspect keys, and export",
      description:
        "Filter to the rows you need, keep source identity explicit, then export the denormalized join table.",
      keys: ["source_name", "evaluator_relationship", "metric_name"],
    },
  ]

  // Step numbers follow list position, starting at 1.
  return stages.map((stage, position) => ({ step: position + 1, ...stage }))
}
601
+
602
/**
 * Builds a join dataset purely from a benchmark summary (the artifact fallback).
 *
 * The leaderboard matrix is used when the summary has more than one
 * leaderboard metric and at least one leaderboard row; otherwise rows come
 * from `model_results`.
 *
 * @param summary - Benchmark summary to flatten.
 * @param warnings - Warnings to place ahead of the ones generated here.
 * @returns A dataset with `source: "artifact"` and a fresh `generated_at`.
 */
function buildArtifactResearchJoinDataset(summary: BenchmarkEvalSummary, warnings: string[] = []): ResearchJoinDataset {
  const metricCount = summary.leaderboard_metrics?.length ?? 0
  const matrixRowCount = summary.leaderboard_rows?.length ?? 0
  const useMatrix = metricCount > 1 && matrixRowCount > 0

  const rows = useMatrix ? buildRowsFromMatrix(summary) : buildRowsFromModelResults(summary)
  const grainOptions = buildGrainOptions(rows)

  const allWarnings = [
    ...warnings,
    "Artifact fallback keeps joins source-aware but cannot execute arbitrary SQL.",
  ]
  const hasBenchmarkLevelLinks = rows.some((row) => row.instance_join_status === "benchmark_available")
  if (hasBenchmarkLevelLinks) {
    allWarnings.push(
      "Some instance joins are benchmark-level availability signals, not exact metric-level sample links."
    )
  }

  return {
    source: "artifact",
    generated_at: new Date().toISOString(),
    eval_summary_id: summary.evaluation_id,
    eval_name: summary.evaluation_name,
    warnings: allWarnings,
    join_steps: buildJoinSteps(grainOptions),
    available_grains: grainOptions,
    columns: COLUMN_DEFINITIONS,
    facets: buildFacets(rows),
    rows,
  }
}
636
+
637
/**
 * Shallow structural check for a join dataset payload.
 *
 * Only verifies that `rows`, `columns`, and `available_grains` are arrays;
 * other fields and the array contents are not inspected.
 *
 * @param value - Untrusted value, typically parsed JSON.
 * @returns `true` when the three array fields are present.
 */
function isResearchJoinDataset(value: unknown): value is ResearchJoinDataset {
  if (typeof value !== "object" || value === null) {
    return false
  }

  const { rows, columns, available_grains: grains } = value as Partial<ResearchJoinDataset>
  return [rows, columns, grains].every((field) => Array.isArray(field))
}
645
+
646
/**
 * Tries to load the join dataset from the live Query API.
 *
 * The base URL comes from `QUERY_API_BASE_URL`, falling back to
 * `EVAL_QUERY_API_BASE_URL`. Two candidate endpoints are tried in order; the
 * first response that is OK and looks like a join dataset (either at the top
 * level or under `research_join_dataset`) wins. Failures are deliberately
 * swallowed so the caller can fall back to artifact data.
 *
 * @param evalId - Evaluation identifier; URI-encoded into the endpoint path.
 * @returns The dataset tagged `source: "query_api"`, or `null` when no base URL
 *   is configured or no candidate endpoint produced a usable payload.
 */
async function fetchLiveResearchJoinDataset(evalId: string): Promise<ResearchJoinDataset | null> {
  // `||` rather than `??`: a variable that is set but empty must not mask the
  // fallback variable. This matches the check in getResearchJoinDataset.
  const baseUrl = process.env.QUERY_API_BASE_URL || process.env.EVAL_QUERY_API_BASE_URL
  if (!baseUrl) {
    return null
  }

  const trimmedBase = baseUrl.replace(/\/+$/, "")
  const encodedId = encodeURIComponent(evalId)
  const candidates = [
    `${trimmedBase}/benchmarks/${encodedId}/research-joins`,
    `${trimmedBase}/research/benchmarks/${encodedId}/joins`,
  ]

  for (const url of candidates) {
    try {
      const response = await fetch(url, {
        cache: "no-store",
        headers: { Accept: "application/json" },
      })

      if (!response.ok) {
        continue
      }

      const payload: unknown = await response.json()
      const record = payload && typeof payload === "object" ? (payload as Record<string, unknown>) : {}
      const candidate =
        isResearchJoinDataset(payload)
          ? payload
          : isResearchJoinDataset(record.research_join_dataset)
            ? record.research_join_dataset
            : null

      if (candidate) {
        return {
          ...candidate,
          source: "query_api",
          // The structural guard only checks rows/columns/available_grains, so
          // the remaining list fields may be absent in the payload; default
          // them so consumers can iterate safely.
          warnings: candidate.warnings ?? [],
          join_steps: candidate.join_steps ?? [],
          facets: candidate.facets ?? [],
        }
      }
    } catch {
      // Best effort: a network or JSON error on one endpoint moves on to the next.
      continue
    }
  }

  return null
}
692
+
693
/**
 * Resolves the join dataset for an evaluation, preferring the live Query API
 * and falling back to a dataset built from frontend artifacts.
 *
 * @param evalId - Evaluation identifier.
 * @returns The live dataset when available; otherwise an artifact-built
 *   dataset, which carries an extra warning if a live endpoint was configured
 *   but yielded nothing. `null` when no evaluation summary exists for the id.
 */
export async function getResearchJoinDataset(evalId: string): Promise<ResearchJoinDataset | null> {
  const live = await fetchLiveResearchJoinDataset(evalId)
  if (live !== null) {
    return live
  }

  const summary = await getEvalSummaryById(evalId)
  if (!summary) {
    return null
  }

  const liveEndpointConfigured = Boolean(
    process.env.QUERY_API_BASE_URL || process.env.EVAL_QUERY_API_BASE_URL
  )
  const fallbackWarnings = liveEndpointConfigured
    ? ["Live Query API join endpoint was unavailable, so this dataset was built from frontend artifacts."]
    : []

  return buildArtifactResearchJoinDataset(summary, fallbackWarnings)
}
711
+
712
+ export { buildArtifactResearchJoinDataset }
tests/research-joins.test.ts ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { describe, expect, it, vi } from "vitest"
2
+ import type { BenchmarkEvalSummary, ModelResultForBenchmark } from "../lib/eval-processing"
3
+
4
+ vi.mock("server-only", () => ({}))
5
+
6
+ const { buildArtifactResearchJoinDataset } = await import("../lib/research-joins")
7
+
8
/** Metric configuration shared by the model-result and summary fixtures. */
const metricConfig = {
  evaluation_description: "Accuracy on examples",
  lower_is_better: false,
  score_type: "continuous" as const,
  min_score: 0,
  max_score: 1,
  unit: "%",
}

/**
 * Builds a model-result fixture populated with baseline values.
 *
 * @param overrides - Top-level fields that replace the baseline values (shallow merge).
 * @returns The baseline fixture with `overrides` spread over it.
 */
function makeModelResult(overrides: Partial<ModelResultForBenchmark> = {}): ModelResultForBenchmark {
  // Values that appear in more than one place in the fixture.
  const evaluatedAt = "2026-01-01T00:00:00Z"
  const headlineScore = 0.82
  const sampleCount = 100

  const baseline: ModelResultForBenchmark = {
    model_info: {
      id: "openai/example-model",
      name: "Example Model",
      developer: "OpenAI",
    },
    model_route_id: "openai__example-model",
    score: headlineScore,
    score_details: {
      score: headlineScore,
      sample_size: sampleCount,
      standard_error: 0.01,
      confidence_interval: {
        lower: 0.8,
        upper: 0.84,
        confidence_level: 95,
      },
    },
    evaluation_timestamp: evaluatedAt,
    source_metadata: {
      source_type: "evaluation_run",
      source_organization_name: "OpenAI",
      evaluator_relationship: "first_party",
    },
    source_data: {
      dataset_name: "Example Dataset",
      dataset_version: "v1",
      hf_repo: "example/dataset",
      hf_split: "test",
      samples_number: sampleCount,
    },
    result: {
      evaluation_result_id: "result-1",
      evaluation_name: "accuracy",
      display_name: "Accuracy",
      metric_summary_id: "metric-accuracy",
      metric_id: "accuracy",
      metric_key: "accuracy",
      metric_name: "accuracy",
      metric_kind: "score",
      metric_unit: "%",
      evaluation_timestamp: evaluatedAt,
      source_record_url: "https://example.test/records/result-1",
      metric_config: metricConfig,
      score_details: { score: headlineScore },
      detailed_evaluation_results_url:
        "https://huggingface.co/datasets/evaleval/card_backend/resolve/main/instances/example.jsonl",
    },
  }

  // Overrides are applied last so callers can replace any top-level field.
  return { ...baseline, ...overrides }
}
69
+
70
/**
 * Builds a benchmark evaluation summary fixture containing one model result.
 *
 * @param overrides - Top-level fields that replace the baseline values (shallow merge).
 * @returns The baseline summary with `overrides` spread over it.
 */
function makeSummary(overrides: Partial<BenchmarkEvalSummary> = {}): BenchmarkEvalSummary {
  // The single model is both the best and the worst, and defines the averages.
  const soleModelName = "Example Model"
  const soleModelScore = 0.82

  const baseline: BenchmarkEvalSummary = {
    evaluation_name: "Example Eval",
    evaluation_id: "example_eval",
    composite_benchmark_key: "example_suite",
    composite_benchmark_name: "Example Suite",
    category: "Reasoning",
    metric_config: metricConfig,
    model_results: [makeModelResult()],
    models_count: 1,
    evaluator_names: [],
    source_types: [],
    third_party_ratio: 0,
    missing_generation_config_count: 0,
    best_model: { name: soleModelName, score: soleModelScore },
    worst_model: { name: soleModelName, score: soleModelScore },
    avg_score: soleModelScore,
    avg_score_norm: soleModelScore,
    benchmark_family_key: "example_family",
    benchmark_leaf_key: "example_leaf",
  }

  return { ...baseline, ...overrides }
}
93
+
94
describe("buildArtifactResearchJoinDataset", () => {
  // Bucket value these tests expect on rows that carry no source name.
  const missingSourceBucket = "__missing_source__"

  it("preserves source-safe metric rows and uses an explicit missing-source bucket", () => {
    const { source, rows, facets } = buildArtifactResearchJoinDataset(makeSummary())

    expect(source).toBe("artifact")
    expect(rows).toHaveLength(1)

    const [onlyRow] = rows
    expect(onlyRow).toMatchObject({
      eval_summary_id: "example_eval",
      metric_summary_id: "metric-accuracy",
      metric_id: "accuracy",
      model_route_id: "openai__example-model",
      source_name: missingSourceBucket,
      source_record_url: "https://example.test/records/result-1",
      has_instance_data: true,
      instance_join_status: "metric_exact",
      sample_size: 100,
      standard_error: 0.01,
      confidence_interval: "0.8 - 0.84 (95%)",
    })

    const sourceNameFacet = facets.find((facet) => facet.key === "source_name")
    expect(sourceNameFacet?.values[0]).toMatchObject({
      value: missingSourceBucket,
      label: "Missing source",
      count: 1,
    })
  })

  it("offers instance-linked joins without duplicating artifact fallback rows", () => {
    const { available_grains: grains, rows } = buildArtifactResearchJoinDataset(makeSummary())

    const instanceGrain = grains.find((entry) => entry.grain === "model_metric_instance")
    expect(instanceGrain?.row_count).toBe(1)

    const rowsWithInstances = rows.filter((row) => row.has_instance_data)
    expect(rowsWithInstances).toHaveLength(1)
  })

  it("materializes composite component rows from aggregate components", () => {
    // Component A names its source; component B omits `source_name` entirely.
    const resultWithComponents = makeModelResult({
      aggregate_components: [
        {
          evaluation_id: "component_a",
          composite_benchmark_key: "suite",
          composite_benchmark_name: "Component A",
          score: 0.7,
          normalized_score: 0.7,
          evaluation_timestamp: "2026-01-02T00:00:00Z",
          source_name: "component-source",
          source_type: "leaderboard",
          source_organization_name: "Example Org",
          evaluator_relationship: "third_party",
        },
        {
          evaluation_id: "component_b",
          composite_benchmark_key: "suite",
          composite_benchmark_name: "Component B",
          score: 0.9,
          normalized_score: 0.9,
          evaluation_timestamp: "2026-01-03T00:00:00Z",
          source_type: "paper",
          source_organization_name: "Example Paper",
          evaluator_relationship: "third_party",
        },
      ],
    })
    const summary = makeSummary({ model_results: [resultWithComponents] })
    const { rows } = buildArtifactResearchJoinDataset(summary)

    const componentRows = rows.filter((row) => row.join_grain === "composite_component")
    expect(componentRows).toHaveLength(2)

    const [firstComponent, secondComponent] = componentRows
    expect(firstComponent).toMatchObject({
      component_eval_summary_id: "component_a",
      component_name: "Component A",
      source_name: "component-source",
      score: 0.7,
    })
    expect(secondComponent).toMatchObject({
      component_eval_summary_id: "component_b",
      component_name: "Component B",
      source_name: missingSourceBucket,
      score: 0.9,
    })
  })
})