Spaces:

evaleval
/

general-eval-card

Running

App Files Files Community

Add researcher join analysis to eval detail

by yananlong - opened 26 days ago

base: refs/heads/main

←

from: refs/pr/2

Discussion Files changed

+1804

-36

Files changed (10) hide show

app/api/research/eval-joins/route.ts +24 -0
components/eval-detail.tsx +43 -36
components/eval-join-analysis.tsx +697 -0
lib/benchmark-schema.ts +12 -0
lib/dashboard-data-client.ts +7 -0
lib/hf-data.ts +15 -0
lib/model-data.ts +8 -0
lib/research-join-types.ts +112 -0
lib/research-joins.ts +712 -0
tests/research-joins.test.ts +174 -0

app/api/research/eval-joins/route.ts ADDED Viewed

	@@ -0,0 +1,24 @@

+import { NextResponse } from "next/server"
+import { getResearchJoinDataset } from "@/lib/research-joins"
/**
 * GET handler for the research-join endpoint.
 *
 * Reads the `id` query parameter from the request URL, loads the matching
 * dataset through `getResearchJoinDataset`, and answers with JSON:
 * - 400 when `id` is absent or empty,
 * - 404 when the loader returns a falsy value,
 * - otherwise the dataset itself, sent with a public Cache-Control header
 *   (max-age 300 seconds, stale-while-revalidate 1800 seconds).
 */
export async function GET(request: Request) {
  const evaluationId = new URL(request.url).searchParams.get("id")
  if (evaluationId === null || evaluationId === "") {
    return NextResponse.json({ error: "Missing evaluation id" }, { status: 400 })
  }

  const joinDataset = await getResearchJoinDataset(evaluationId)
  if (joinDataset) {
    const cacheHeaders = {
      "Cache-Control": "public, max-age=300, stale-while-revalidate=1800",
    }
    return NextResponse.json(joinDataset, { headers: cacheHeaders })
  }

  return NextResponse.json({ error: "Evaluation join dataset not found" }, { status: 404 })
}

components/eval-detail.tsx CHANGED Viewed

@@ -1,6 +1,7 @@
 "use client"
 import { useAudienceMode } from "@/components/audience-mode-provider"
 import { Fragment, useEffect, useMemo, useState } from "react"
 import Link from "next/link"
 import { Badge } from "@/components/ui/badge"
@@ -914,46 +915,51 @@ export function EvalDetail({ summary }: EvalDetailProps) {
       </Card>
       {hasMultiMetricLeaderboard ? (
-        <MultiMetricLeaderboard summary={summary} isResearchView={isResearchView} />
       ) : (
-        <Card className="overflow-hidden">
-          <CardHeader className="border-b bg-muted/10 space-y-3">
-            <ApplesToApplesBanner
-              summary={summary.comparability_summary}
-              detailsAnchorId="comparability-panel"
-            />
-            <div className="flex flex-col gap-3 lg:flex-row lg:items-end lg:justify-between">
-              <div className="space-y-2">
-                <div className="flex items-center gap-2">
-                  <Medal className="h-5 w-5 text-primary" />
-                  <CardTitle className="text-xl">{leaderboardTitle}</CardTitle>
                 </div>
-                <CardDescription>{leaderboardDescription}</CardDescription>
-              </div>
-              <div className="flex flex-wrap items-center gap-2 text-xs text-muted-foreground">
-                <Badge variant="secondary">
-                  {leaderboardRows.length === summary.models_count
-                    ? `${summary.models_count} models`
-                    : `${leaderboardRows.length} of ${summary.models_count} models`}
-                </Badge>
-                <Badge variant="outline">{scoreDirectionLabel}</Badge>
-                {hasParameterData && (numericMinParams != null || numericMaxParams != null) && (
-                  <Badge variant="outline">
-                    Params {formatParamBoundLabel(minParamStep, "min")} to {formatParamBoundLabel(maxParamStep, "max")}
-                  </Badge>
-                )}
-                {isResearchView && (
-                  <Badge variant="outline">
-                    Scale {summary.metric_config.min_score ?? 0} - {summary.metric_config.max_score ?? 1}
                   </Badge>
-                )}
               </div>
-            </div>
-          </CardHeader>
-          <CardContent className="p-0">
-            {hasParameterData && (
               <div className="border-b bg-background px-5 py-4 sm:px-6">
                 <div className="flex flex-col gap-3 lg:flex-row lg:items-center lg:justify-between">
                   <div className="space-y-1">
@@ -1456,8 +1462,9 @@ export function EvalDetail({ summary }: EvalDetailProps) {
                 </Button>
               </div>
             )}
-          </CardContent>
-        </Card>
       )}
     </div>
   )

 "use client"
 import { useAudienceMode } from "@/components/audience-mode-provider"
+import { EvalJoinAnalysis } from "@/components/eval-join-analysis"
 import { Fragment, useEffect, useMemo, useState } from "react"
 import Link from "next/link"
 import { Badge } from "@/components/ui/badge"
       </Card>
       {hasMultiMetricLeaderboard ? (
+        <>
+          <EvalJoinAnalysis evalId={summary.evaluation_id} />
+          <MultiMetricLeaderboard summary={summary} isResearchView={isResearchView} />
+        </>
       ) : (
+        <>
+          <EvalJoinAnalysis evalId={summary.evaluation_id} />
+          <Card className="overflow-hidden">
+            <CardHeader className="border-b bg-muted/10 space-y-3">
+              <ApplesToApplesBanner
+                summary={summary.comparability_summary}
+                detailsAnchorId="comparability-panel"
+              />
+              <div className="flex flex-col gap-3 lg:flex-row lg:items-end lg:justify-between">
+                <div className="space-y-2">
+                  <div className="flex items-center gap-2">
+                    <Medal className="h-5 w-5 text-primary" />
+                    <CardTitle className="text-xl">{leaderboardTitle}</CardTitle>
+                  </div>
+                  <CardDescription>{leaderboardDescription}</CardDescription>
                 </div>
+                <div className="flex flex-wrap items-center gap-2 text-xs text-muted-foreground">
+                  <Badge variant="secondary">
+                    {leaderboardRows.length === summary.models_count
+                      ? `${summary.models_count} models`
+                      : `${leaderboardRows.length} of ${summary.models_count} models`}
                   </Badge>
+                  <Badge variant="outline">{scoreDirectionLabel}</Badge>
+                  {hasParameterData && (numericMinParams != null || numericMaxParams != null) && (
+                    <Badge variant="outline">
+                      Params {formatParamBoundLabel(minParamStep, "min")} to {formatParamBoundLabel(maxParamStep, "max")}
+                    </Badge>
+                  )}
+                  {isResearchView && (
+                    <Badge variant="outline">
+                      Scale {summary.metric_config.min_score ?? 0} - {summary.metric_config.max_score ?? 1}
+                    </Badge>
+                  )}
+                </div>
               </div>
+            </CardHeader>
+            <CardContent className="p-0">
+              {hasParameterData && (
               <div className="border-b bg-background px-5 py-4 sm:px-6">
                 <div className="flex flex-col gap-3 lg:flex-row lg:items-center lg:justify-between">
                   <div className="space-y-1">
                 </Button>
               </div>
             )}
+            </CardContent>
+          </Card>
+        </>
       )}
     </div>
   )

components/eval-join-analysis.tsx ADDED Viewed

	@@ -0,0 +1,697 @@

+"use client"
+import { useEffect, useMemo, useState } from "react"
+import {
+  ClipboardList,
+  Download,
+  Filter,
+  GitBranch,
+  KeyRound,
+  Loader2,
+  Network,
+  TableProperties,
+} from "lucide-react"
+import { Badge } from "@/components/ui/badge"
+import { Button } from "@/components/ui/button"
+import { Card, CardContent, CardDescription, CardHeader, CardTitle } from "@/components/ui/card"
+import { Input } from "@/components/ui/input"
+import {
+  Select,
+  SelectContent,
+  SelectItem,
+  SelectTrigger,
+  SelectValue,
+} from "@/components/ui/select"
+import { Table, TableBody, TableCell, TableHead, TableHeader, TableRow } from "@/components/ui/table"
+import { fetchEvalResearchJoins } from "@/lib/dashboard-data-client"
+import type {
+  ResearchJoinColumn,
+  ResearchJoinColumnGroup,
+  ResearchJoinDataset,
+  ResearchJoinGrain,
+  ResearchJoinRow,
+} from "@/lib/research-join-types"
+import { cn } from "@/lib/utils"
/** One toggleable field group offered in the "Joined Field Groups" panel. */
type JoinGroupOption = {
  group: ResearchJoinColumnGroup
  label: string
  description: string
}

/** Column groups whose columns a group toggle only ever adds, never removes. */
const CORE_GROUPS: ResearchJoinColumnGroup[] = ["identity", "metric", "model", "score"]

/** Optional field groups, in the order their toggle buttons are rendered. */
const JOIN_GROUPS: Array<JoinGroupOption> = [
  { group: "hierarchy", label: "Hierarchy", description: "Family, composite, leaf, and component keys." },
  {
    group: "source",
    label: "Source",
    description: "Source buckets, organizations, and evaluator relationship.",
  },
  { group: "instance", label: "Instances", description: "Sample-link status and detailed result URLs." },
  {
    group: "quality",
    label: "Quality",
    description: "Sample sizes, uncertainty, timestamps, and config presence.",
  },
]
/**
 * Turns an arbitrary cell value into the text shown in the table.
 *
 * - `null`, `undefined`, the empty string and non-finite numbers become "N/A".
 * - Booleans become "Yes" / "No".
 * - Numbers with magnitude of at least 100 use the runtime's locale formatting;
 *   smaller numbers are rounded to at most four decimals with trailing zeros dropped.
 * - Anything else goes through `String(...)`.
 */
function formatCellValue(value: unknown) {
  const placeholder = "N/A"
  if (value === null || value === undefined || value === "") {
    return placeholder
  }

  switch (typeof value) {
    case "boolean":
      return value ? "Yes" : "No"
    case "number": {
      if (!Number.isFinite(value)) {
        return placeholder
      }
      if (Math.abs(value) >= 100) {
        return value.toLocaleString()
      }
      // Round-trip through Number so "0.5000" collapses to "0.5".
      return Number(value.toFixed(4)).toString()
    }
    default:
      return String(value)
  }
}
/**
 * Reads one field from a join row by column key.
 * The key arrives as a plain string (column metadata), so it is cast to a row
 * property name before the lookup.
 */
function getRowValue(row: ResearchJoinRow, key: string) {
  const field = key as keyof ResearchJoinRow
  return row[field]
}
/**
 * Collects the keys of every column flagged `defaultVisible`.
 * Returned as a Set, with keys inserted in column order.
 */
function getDefaultColumnKeys(columns: ResearchJoinColumn[]) {
  const visibleByDefault = columns.flatMap((column) => (column.defaultVisible ? [column.key] : []))
  return new Set(visibleByDefault)
}
/**
 * Returns a sorted copy of `rows`; the input array is not mutated.
 *
 * Ordering rules:
 * - Missing values (`null`, `undefined`, the empty string, non-finite numbers —
 *   the same set the table renders as "N/A") always sort last, for both
 *   directions. Previously they were compared as the text "N/A", which put
 *   them at the top of every descending sort.
 * - Two numbers are compared numerically.
 * - Everything else is compared on its formatted text with numeric-aware,
 *   case-insensitive collation.
 *
 * @param rows      Rows to sort.
 * @param sortKey   Column key to sort by.
 * @param direction "asc" for ascending, "desc" for descending.
 */
function sortRows(rows: ResearchJoinRow[], sortKey: string, direction: "asc" | "desc") {
  const sign = direction === "asc" ? 1 : -1
  const isMissing = (value: unknown) =>
    value == null || value === "" || (typeof value === "number" && !Number.isFinite(value))

  return [...rows].sort((left, right) => {
    const leftValue = getRowValue(left, sortKey)
    const rightValue = getRowValue(right, sortKey)

    // Missing cells go to the end regardless of direction; this also keeps NaN
    // out of the numeric subtraction below, so the comparator stays consistent.
    const leftMissing = isMissing(leftValue)
    const rightMissing = isMissing(rightValue)
    if (leftMissing || rightMissing) {
      if (leftMissing && rightMissing) {
        return 0
      }
      return leftMissing ? 1 : -1
    }

    if (typeof leftValue === "number" && typeof rightValue === "number") {
      return sign * (leftValue - rightValue)
    }

    const comparison = formatCellValue(leftValue).localeCompare(formatCellValue(rightValue), undefined, {
      numeric: true,
      sensitivity: "base",
    })
    return sign * comparison
  })
}
/**
 * Serialises one value as a CSV field.
 *
 * The raw value is written rather than its on-screen formatting, so exported
 * data keeps full numeric precision and stays machine-readable:
 * - `null`, `undefined` and non-finite numbers become an empty field,
 * - numbers use `String(n)` (no rounding, no locale grouping),
 * - booleans become "true" / "false",
 * - objects and arrays are JSON-encoded,
 * - everything else goes through `String(...)`.
 *
 * Fields containing a double quote, comma, CR or LF are wrapped in double
 * quotes with embedded quotes doubled.
 *
 * NOTE(review): text starting with `=`, `+`, `-` or `@` is not neutralised, so
 * spreadsheet applications may interpret such cells as formulas.
 */
function escapeCsvCell(value: unknown) {
  let text: string
  if (value == null) {
    text = ""
  } else if (typeof value === "number") {
    text = Number.isFinite(value) ? String(value) : ""
  } else if (typeof value === "boolean") {
    text = value ? "true" : "false"
  } else if (typeof value === "object") {
    // JSON.stringify can yield undefined (e.g. a toJSON that returns undefined).
    text = JSON.stringify(value) ?? String(value)
  } else {
    text = String(value)
  }

  if (/[",\r\n]/.test(text)) {
    return `"${text.replace(/"/g, '""')}"`
  }
  return text
}
/**
 * Triggers a browser download of `text` as a file.
 *
 * Wraps the text in a Blob of the given MIME type, points a temporary anchor
 * element at an object URL for it, clicks the anchor, then removes the anchor
 * and revokes the URL.
 */
function downloadText(filename: string, text: string, mimeType: string) {
  const objectUrl = URL.createObjectURL(new Blob([text], { type: mimeType }))
  const anchor = Object.assign(document.createElement("a"), {
    href: objectUrl,
    download: filename,
  })

  document.body.append(anchor)
  anchor.click()
  anchor.remove()

  URL.revokeObjectURL(objectUrl)
}
/**
 * Builds the four-line plain-text "join recipe" for a grain.
 *
 * The label and join keys come from the dataset's matching grain option. When
 * the grain is not listed, the grain id itself is used as the label and the
 * keys fall back to eval_summary_id + metric_summary_id + model_route_id.
 */
function getJoinRecipe(dataset: ResearchJoinDataset, grain: ResearchJoinGrain) {
  const match = dataset.available_grains.find((candidate) => candidate.grain === grain)
  const grainLabel = match?.label ?? grain
  const joinKeys = match?.join_keys ?? ["eval_summary_id", "metric_summary_id", "model_route_id"]

  const recipeLines = [
    `Base grain: ${grainLabel}`,
    `Join keys: ${joinKeys.join(" + ")}`,
    "Keep source_name explicit; do not join across evaluation_name alone.",
    "Add hierarchy, source, instance, or quality fields, then export the filtered rows.",
  ]
  return recipeLines.join("\n")
}
/**
 * "Research Join Builder" card for a single evaluation.
 *
 * Loads the join dataset for `evalId` through `fetchEvalResearchJoins`, then
 * lets the user pick a base grain, toggle column groups and individual
 * columns, filter and sort rows, copy a plain-text join recipe, and export the
 * filtered rows as CSV or JSON. The on-screen table previews at most the first
 * 100 filtered rows; exports always contain every filtered row.
 *
 * While the request is in flight a loading card is rendered; if the request
 * fails or yields no dataset, a notice card is rendered instead.
 *
 * @param evalId Evaluation id passed to `fetchEvalResearchJoins`.
 */
export function EvalJoinAnalysis({ evalId }: { evalId: string }) {
  const [dataset, setDataset] = useState<ResearchJoinDataset | null>(null)
  const [loading, setLoading] = useState(true)
  const [error, setError] = useState<string | null>(null)
  const [selectedGrain, setSelectedGrain] = useState<ResearchJoinGrain>("model_metric_source")
  const [metricFilter, setMetricFilter] = useState("all")
  const [sourceFilter, setSourceFilter] = useState("all")
  const [relationshipFilter, setRelationshipFilter] = useState("all")
  const [instanceFilter, setInstanceFilter] = useState("all")
  const [query, setQuery] = useState("")
  const [sortKey, setSortKey] = useState("rank")
  const [sortDirection, setSortDirection] = useState<"asc" | "desc">("asc")
  const [selectedColumns, setSelectedColumns] = useState<Set<string>>(new Set())
  const [copiedRecipe, setCopiedRecipe] = useState(false)

  // (Re)load the dataset whenever the evaluation changes. The cancellation flag
  // stops a superseded or unmounted request from writing state.
  useEffect(() => {
    let isCancelled = false
    // Facet values belong to one dataset; a filter carried over from the
    // previous evaluation could silently match nothing in the next one.
    setMetricFilter("all")
    setSourceFilter("all")
    setRelationshipFilter("all")
    const load = async () => {
      try {
        setLoading(true)
        setError(null)
        const nextDataset = await fetchEvalResearchJoins(evalId)
        if (isCancelled) {
          return
        }
        setDataset(nextDataset)
        // Prefer the model_metric_source grain; otherwise take the first one offered.
        const defaultGrain =
          nextDataset.available_grains.find((grain) => grain.grain === "model_metric_source") ??
          nextDataset.available_grains[0]
        if (defaultGrain) {
          setSelectedGrain(defaultGrain.grain)
        }
        setSelectedColumns(getDefaultColumnKeys(nextDataset.columns))
      } catch (err) {
        if (!isCancelled) {
          setError(err instanceof Error ? err.message : "Failed to load research joins")
        }
      } finally {
        if (!isCancelled) {
          setLoading(false)
        }
      }
    }
    // load() handles its own failures, so the returned promise is intentionally not awaited.
    void load()
    return () => {
      isCancelled = true
    }
  }, [evalId])

  // Revert the "Copied" label after 1.8s; the cleanup clears the timer if the
  // component unmounts (or the flag changes) first.
  useEffect(() => {
    if (!copiedRecipe) {
      return
    }
    const timer = window.setTimeout(() => setCopiedRecipe(false), 1800)
    return () => {
      window.clearTimeout(timer)
    }
  }, [copiedRecipe])

  // Selected columns, kept in the dataset's column order.
  const visibleColumns = useMemo(() => {
    if (!dataset) {
      return []
    }
    return dataset.columns.filter((column) => selectedColumns.has(column.key))
  }, [dataset, selectedColumns])

  const selectedGrainOption = dataset?.available_grains.find((option) => option.grain === selectedGrain)
  const selectedJoinKeys = selectedGrainOption?.join_keys ?? []

  // Rows that pass the grain, facet, instance and free-text filters, sorted.
  const filteredRows = useMemo(() => {
    if (!dataset) {
      return []
    }
    const normalizedQuery = query.trim().toLowerCase()
    const rows = dataset.rows.filter((row) => {
      // The instance grain is matched by has_instance_data rather than by the
      // row's own join_grain.
      const grainMatches =
        selectedGrain === "model_metric_instance"
          ? row.has_instance_data
          : row.join_grain === selectedGrain
      if (!grainMatches) {
        return false
      }
      if (metricFilter !== "all" && row.metric_name !== metricFilter) {
        return false
      }
      if (sourceFilter !== "all" && row.source_name !== sourceFilter) {
        return false
      }
      if (relationshipFilter !== "all" && row.evaluator_relationship !== relationshipFilter) {
        return false
      }
      if (instanceFilter === "linked" && !row.has_instance_data) {
        return false
      }
      if (instanceFilter === "unlinked" && row.has_instance_data) {
        return false
      }
      if (!normalizedQuery) {
        return true
      }
      // Case-insensitive substring match against the searchable text fields.
      return [
        row.model_name,
        row.model_id,
        row.developer,
        row.metric_name,
        row.source_name,
        row.source_organization_name,
        row.component_name,
      ]
        .filter(Boolean)
        .some((value) => String(value).toLowerCase().includes(normalizedQuery))
    })
    return sortRows(rows, sortKey, sortDirection)
  }, [
    dataset,
    instanceFilter,
    metricFilter,
    query,
    relationshipFilter,
    selectedGrain,
    sortDirection,
    sortKey,
    sourceFilter,
  ])

  // Only the first 100 rows are rendered; exports use filteredRows in full.
  const previewRows = filteredRows.slice(0, 100)

  // Facet value lists for the three facet dropdowns, looked up by facet key.
  const facets = useMemo(() => {
    if (!dataset) {
      return {
        metrics: [],
        sources: [],
        relationships: [],
      }
    }
    const byKey = new Map(dataset.facets.map((facet) => [facet.key, facet.values]))
    return {
      metrics: byKey.get("metric_name") ?? [],
      sources: byKey.get("source_name") ?? [],
      relationships: byKey.get("evaluator_relationship") ?? [],
    }
  }, [dataset])

  /** Adds the column to the selection, or removes it if already selected. */
  const toggleColumn = (key: string) => {
    setSelectedColumns((current) => {
      const next = new Set(current)
      if (next.has(key)) {
        next.delete(key)
      } else {
        next.add(key)
      }
      return next
    })
  }

  /**
   * Toggles every column of a group at once: a fully selected non-core group
   * is cleared; in every other case all of the group's columns are selected.
   */
  const toggleJoinGroup = (group: ResearchJoinColumnGroup) => {
    if (!dataset) {
      return
    }
    const groupKeys = dataset.columns
      .filter((column) => column.group === group)
      .map((column) => column.key)
    const isCoreGroup = CORE_GROUPS.includes(group)
    setSelectedColumns((current) => {
      const next = new Set(current)
      const shouldRemove = !isCoreGroup && groupKeys.every((key) => next.has(key))
      for (const key of groupKeys) {
        if (shouldRemove) {
          next.delete(key)
        } else {
          next.add(key)
        }
      }
      return next
    })
  }

  /**
   * Downloads all filtered rows restricted to the visible columns; when no
   * column is selected, the default-visible columns are exported instead.
   */
  const exportRows = (format: "csv" | "json") => {
    if (!dataset) {
      return
    }
    const columns = visibleColumns.length > 0 ? visibleColumns : dataset.columns.filter((column) => column.defaultVisible)
    const filenameBase = `${dataset.eval_summary_id}-${selectedGrain}-join`
    if (format === "json") {
      const payload = filteredRows.map((row) =>
        Object.fromEntries(columns.map((column) => [column.key, getRowValue(row, column.key)]))
      )
      downloadText(`${filenameBase}.json`, JSON.stringify(payload, null, 2), "application/json;charset=utf-8")
      return
    }
    const header = columns.map((column) => escapeCsvCell(column.key)).join(",")
    const body = filteredRows
      .map((row) => columns.map((column) => escapeCsvCell(getRowValue(row, column.key))).join(","))
      .join("\n")
    downloadText(`${filenameBase}.csv`, `${header}\n${body}`, "text/csv;charset=utf-8")
  }

  /** Copies the join recipe for the selected grain to the clipboard. */
  const copyRecipe = async () => {
    if (!dataset) {
      return
    }
    try {
      await navigator.clipboard.writeText(getJoinRecipe(dataset, selectedGrain))
      // The effect above flips this back after 1.8 seconds.
      setCopiedRecipe(true)
    } catch {
      // Clipboard access can be refused; leave the button in its idle state.
      setCopiedRecipe(false)
    }
  }

  if (loading) {
    return (
      <Card className="overflow-hidden">
        <CardContent className="flex items-center gap-3 p-5 text-sm text-muted-foreground">
          <Loader2 className="h-4 w-4 animate-spin" />
          Loading research join dataset...
        </CardContent>
      </Card>
    )
  }

  if (error || !dataset) {
    return (
      <Card className="overflow-hidden border-amber-200/70 bg-amber-50/30 dark:border-amber-900/50 dark:bg-amber-950/10">
        <CardContent className="p-5 text-sm text-amber-900 dark:text-amber-100">
          Research joins are unavailable for this benchmark.
        </CardContent>
      </Card>
    )
  }

  return (
    <Card className="overflow-hidden">
      <CardHeader className="border-b bg-muted/10">
        <div className="flex flex-col gap-4 xl:flex-row xl:items-start xl:justify-between">
          <div className="space-y-2">
            <div className="flex items-center gap-2">
              <Network className="h-5 w-5 text-primary" />
              <CardTitle className="text-xl">Research Join Builder</CardTitle>
            </div>
            <CardDescription>
              Build source-safe tables from benchmark rows, metric identities, model routes, hierarchy keys, and instance links.
            </CardDescription>
          </div>
          <div className="flex flex-wrap items-center gap-2">
            <Badge variant={dataset.source === "query_api" ? "default" : "secondary"}>
              {dataset.source === "query_api" ? "Query API" : "Artifact fallback"}
            </Badge>
            <Badge variant="outline">{filteredRows.length.toLocaleString()} joined rows</Badge>
          </div>
        </div>
      </CardHeader>
      <CardContent className="space-y-5 p-4 sm:p-5">
        {/* Join steps supplied by the dataset */}
        <section className="grid gap-3 lg:grid-cols-3">
          {dataset.join_steps.map((step) => (
            <div key={step.step} className="rounded-2xl border bg-background/70 p-4">
              <div className="flex items-center gap-2">
                <span className="flex h-7 w-7 items-center justify-center rounded-full bg-primary text-xs font-semibold text-primary-foreground">
                  {step.step}
                </span>
                <div className="font-semibold">{step.title}</div>
              </div>
              <p className="mt-2 text-sm leading-6 text-muted-foreground">{step.description}</p>
              <div className="mt-3 flex flex-wrap gap-1.5">
                {step.keys.map((key) => (
                  <span key={key} className="rounded-md border bg-muted/20 px-2 py-1 font-mono text-[11px]">
                    {key}
                  </span>
                ))}
              </div>
            </div>
          ))}
        </section>
        {/* Base grain picker and field-group toggles */}
        <section className="grid gap-4 xl:grid-cols-[minmax(0,0.95fr)_minmax(0,1.05fr)]">
          <div className="space-y-4 rounded-2xl border bg-muted/5 p-4">
            <div className="flex items-center gap-2">
              <GitBranch className="h-4 w-4 text-muted-foreground" />
              <div className="text-sm font-semibold">Base Grain</div>
            </div>
            <Select value={selectedGrain} onValueChange={(value) => setSelectedGrain(value as ResearchJoinGrain)}>
              <SelectTrigger className="w-full">
                <SelectValue />
              </SelectTrigger>
              <SelectContent>
                {dataset.available_grains.map((grain) => (
                  <SelectItem key={grain.grain} value={grain.grain}>
                    {grain.label} ({grain.row_count.toLocaleString()})
                  </SelectItem>
                ))}
              </SelectContent>
            </Select>
            {selectedGrainOption && (
              <div className="rounded-xl border bg-background p-3">
                <div className="text-sm font-medium">{selectedGrainOption.description}</div>
                <div className="mt-3 flex items-center gap-2 text-xs font-semibold uppercase tracking-[0.16em] text-muted-foreground">
                  <KeyRound className="h-3.5 w-3.5" />
                  Join Keys
                </div>
                <div className="mt-2 flex flex-wrap gap-1.5">
                  {selectedJoinKeys.map((key) => (
                    <span key={key} className="rounded-md border bg-muted/30 px-2 py-1 font-mono text-[11px]">
                      {key}
                    </span>
                  ))}
                </div>
              </div>
            )}
          </div>
          <div className="space-y-4 rounded-2xl border bg-muted/5 p-4">
            <div className="flex items-center gap-2">
              <TableProperties className="h-4 w-4 text-muted-foreground" />
              <div className="text-sm font-semibold">Joined Field Groups</div>
            </div>
            <div className="grid gap-2 sm:grid-cols-2">
              {JOIN_GROUPS.map((joinGroup) => {
                const groupColumns = dataset.columns.filter((column) => column.group === joinGroup.group)
                // A group is shown as active as soon as any of its columns is selected.
                const active = groupColumns.some((column) => selectedColumns.has(column.key))
                return (
                  <button
                    key={joinGroup.group}
                    type="button"
                    aria-pressed={active}
                    onClick={() => toggleJoinGroup(joinGroup.group)}
                    className={cn(
                      "rounded-xl border p-3 text-left transition-colors",
                      active
                        ? "border-foreground/20 bg-background shadow-sm"
                        : "border-border/70 bg-muted/20 text-muted-foreground hover:bg-muted/30"
                    )}
                  >
                    <div className="text-sm font-semibold">{joinGroup.label}</div>
                    <div className="mt-1 text-xs leading-5 text-muted-foreground">{joinGroup.description}</div>
                  </button>
                )
              })}
            </div>
          </div>
        </section>
        {/* Filters, column toggles, sorting and export actions */}
        <section className="space-y-3 rounded-2xl border bg-background/70 p-4">
          <div className="flex flex-col gap-3 lg:flex-row lg:items-center lg:justify-between">
            <div className="flex items-center gap-2">
              <Filter className="h-4 w-4 text-muted-foreground" />
              <div className="text-sm font-semibold">Filters and Export</div>
            </div>
            <div className="flex flex-wrap gap-2">
              <Button variant="outline" size="sm" className="gap-2" onClick={copyRecipe}>
                <ClipboardList className="h-4 w-4" />
                {copiedRecipe ? "Copied" : "Copy recipe"}
              </Button>
              <Button variant="outline" size="sm" className="gap-2" onClick={() => exportRows("csv")}>
                <Download className="h-4 w-4" />
                CSV
              </Button>
              <Button variant="outline" size="sm" className="gap-2" onClick={() => exportRows("json")}>
                <Download className="h-4 w-4" />
                JSON
              </Button>
            </div>
          </div>
          <div className="grid gap-3 md:grid-cols-2 xl:grid-cols-6">
            <Input
              value={query}
              onChange={(event) => setQuery(event.target.value)}
              placeholder="Search model, metric, source..."
              className="xl:col-span-2"
            />
            <Select value={metricFilter} onValueChange={setMetricFilter}>
              <SelectTrigger className="w-full">
                <SelectValue placeholder="Metric" />
              </SelectTrigger>
              <SelectContent>
                <SelectItem value="all">All metrics</SelectItem>
                {facets.metrics.map((facet) => (
                  <SelectItem key={facet.value} value={facet.value}>
                    {facet.label} ({facet.count})
                  </SelectItem>
                ))}
              </SelectContent>
            </Select>
            <Select value={sourceFilter} onValueChange={setSourceFilter}>
              <SelectTrigger className="w-full">
                <SelectValue placeholder="Source" />
              </SelectTrigger>
              <SelectContent>
                <SelectItem value="all">All sources</SelectItem>
                {facets.sources.map((facet) => (
                  <SelectItem key={facet.value} value={facet.value}>
                    {facet.label} ({facet.count})
                  </SelectItem>
                ))}
              </SelectContent>
            </Select>
            <Select value={relationshipFilter} onValueChange={setRelationshipFilter}>
              <SelectTrigger className="w-full">
                <SelectValue placeholder="Relationship" />
              </SelectTrigger>
              <SelectContent>
                <SelectItem value="all">All relationships</SelectItem>
                {facets.relationships.map((facet) => (
                  <SelectItem key={facet.value} value={facet.value}>
                    {facet.label} ({facet.count})
                  </SelectItem>
                ))}
              </SelectContent>
            </Select>
            <Select value={instanceFilter} onValueChange={setInstanceFilter}>
              <SelectTrigger className="w-full">
                <SelectValue placeholder="Instances" />
              </SelectTrigger>
              <SelectContent>
                <SelectItem value="all">All instance states</SelectItem>
                <SelectItem value="linked">Has instances</SelectItem>
                <SelectItem value="unlinked">No instances</SelectItem>
              </SelectContent>
            </Select>
          </div>
          <div className="grid gap-3 md:grid-cols-2 xl:grid-cols-[minmax(0,1fr)_16rem_10rem]">
            <div className="flex flex-wrap gap-1.5">
              {dataset.columns.map((column) => {
                const isSelected = selectedColumns.has(column.key)
                return (
                  <button
                    key={column.key}
                    type="button"
                    aria-pressed={isSelected}
                    onClick={() => toggleColumn(column.key)}
                    title={column.description}
                    className={cn(
                      "rounded-md border px-2.5 py-1 text-xs font-medium transition-colors",
                      isSelected
                        ? "border-foreground/20 bg-muted text-foreground"
                        : "border-border/70 bg-background text-muted-foreground hover:bg-muted/20"
                    )}
                  >
                    {column.label}
                  </button>
                )
              })}
            </div>
            <Select value={sortKey} onValueChange={setSortKey}>
              <SelectTrigger className="w-full">
                <SelectValue placeholder="Sort" />
              </SelectTrigger>
              <SelectContent>
                {dataset.columns.map((column) => (
                  <SelectItem key={column.key} value={column.key}>
                    Sort: {column.label}
                  </SelectItem>
                ))}
              </SelectContent>
            </Select>
            <Select value={sortDirection} onValueChange={(value) => setSortDirection(value as "asc" | "desc")}>
              <SelectTrigger className="w-full">
                <SelectValue />
              </SelectTrigger>
              <SelectContent>
                <SelectItem value="asc">Ascending</SelectItem>
                <SelectItem value="desc">Descending</SelectItem>
              </SelectContent>
            </Select>
          </div>
        </section>
        {dataset.warnings.length > 0 && (
          <div className="rounded-2xl border border-amber-200/80 bg-amber-50/70 p-3 text-sm leading-6 text-amber-950 dark:border-amber-900/50 dark:bg-amber-950/20 dark:text-amber-100">
            {/* The index is part of the key so repeated warning texts stay unique. */}
            {dataset.warnings.map((warning, index) => (
              <div key={`${index}:${warning}`}>{warning}</div>
            ))}
          </div>
        )}
        {/* Preview table */}
        <div className="overflow-hidden rounded-2xl border">
          <Table className="min-w-[980px]">
            <TableHeader>
              <TableRow className="hover:bg-transparent">
                {visibleColumns.map((column) => (
                  <TableHead key={column.key} className="px-3">
                    <span className={cn(column.isJoinKey && "inline-flex items-center gap-1")}>
                      {column.isJoinKey && <KeyRound className="h-3 w-3 text-muted-foreground" />}
                      {column.label}
                    </span>
                  </TableHead>
                ))}
              </TableRow>
            </TableHeader>
            <TableBody>
              {previewRows.map((row) => (
                <TableRow key={row.row_id}>
                  {visibleColumns.map((column) => {
                    const value = getRowValue(row, column.key)
                    // Only string values starting with "http" are rendered as links.
                    const isUrl = column.type === "url" && typeof value === "string" && value.startsWith("http")
                    return (
                      <TableCell key={column.key} className="max-w-[22rem] whitespace-normal px-3 align-top">
                        {isUrl ? (
                          <a
                            href={value}
                            target="_blank"
                            rel="noreferrer"
                            className="text-primary underline-offset-4 hover:underline"
                          >
                            Open
                          </a>
                        ) : (
                          <span className={cn(column.isJoinKey && "font-mono text-xs")}>
                            {formatCellValue(value)}
                          </span>
                        )}
                      </TableCell>
                    )
                  })}
                </TableRow>
              ))}
              {previewRows.length === 0 && (
                <TableRow>
                  <TableCell colSpan={Math.max(visibleColumns.length, 1)} className="px-6 py-10 text-center text-sm text-muted-foreground">
                    No rows match the selected join filters.
                  </TableCell>
                </TableRow>
              )}
            </TableBody>
          </Table>
        </div>
        {filteredRows.length > previewRows.length && (
          <div className="text-center text-xs text-muted-foreground">
            Showing first {previewRows.length.toLocaleString()} rows. Export includes all {filteredRows.length.toLocaleString()} filtered rows.
          </div>
        )}
      </CardContent>
    </Card>
  )
}

lib/benchmark-schema.ts CHANGED Viewed

@@ -88,16 +88,24 @@ export interface ModelInfo {
 }
 export interface EvaluationResult {
   evaluation_name: string
   display_name?: string
   canonical_display_name?: string
   metric_summary_id?: string
   metric_key?: string
   evaluation_timestamp: string
   source_data?: string[] | SourceData
   metric_config: MetricConfig
   score_details: ScoreDetails
   detailed_evaluation_results_url?: string
   generation_config?: GenerationConfig
   evalcards?: { annotations?: RowAnnotations }
 }
@@ -106,6 +114,10 @@ export interface MetricConfig {
   evaluation_description: string
   lower_is_better: boolean
   score_type: 'continuous' | 'discrete' | 'binary'
   min_score?: number
   max_score?: number
   unit?: string

 }
 export interface EvaluationResult {
+  evaluation_result_id?: string
   evaluation_name: string
   display_name?: string
   canonical_display_name?: string
   metric_summary_id?: string
+  metric_id?: string
   metric_key?: string
+  metric_name?: string
+  metric_kind?: string
+  metric_unit?: string
+  metric_parameters?: Record<string, any>
   evaluation_timestamp: string
   source_data?: string[] | SourceData
+  source_record_url?: string
   metric_config: MetricConfig
   score_details: ScoreDetails
   detailed_evaluation_results_url?: string
+  detailed_evaluation_results_meta?: unknown
   generation_config?: GenerationConfig
   evalcards?: { annotations?: RowAnnotations }
 }
   evaluation_description: string
   lower_is_better: boolean
   score_type: 'continuous' | 'discrete' | 'binary'
+  metric_id?: string
+  metric_kind?: string
+  metric_unit?: string
+  metric_parameters?: Record<string, any>
   min_score?: number
   max_score?: number
   unit?: string

lib/dashboard-data-client.ts CHANGED Viewed

@@ -1,4 +1,5 @@
 import type { BackendManifestStatus, ComparisonIndex, CorpusAggregates, EvalHierarchy } from "@/lib/backend-artifacts"
 import type { BenchmarkEvaluationCardData } from "@/components/benchmark-evaluation-card"
 import type { HFEvalDetail } from "@/lib/hf-data"
 import type {
@@ -103,3 +104,9 @@ export function fetchComparisonIndex() {
 export function fetchCorpusAggregates() {
   return fetchJson<CorpusAggregates>("/api/corpus-aggregates")
 }

 import type { BackendManifestStatus, ComparisonIndex, CorpusAggregates, EvalHierarchy } from "@/lib/backend-artifacts"
+import type { ResearchJoinDataset } from "@/lib/research-join-types"
 import type { BenchmarkEvaluationCardData } from "@/components/benchmark-evaluation-card"
 import type { HFEvalDetail } from "@/lib/hf-data"
 import type {
 export function fetchCorpusAggregates() {
   return fetchJson<CorpusAggregates>("/api/corpus-aggregates")
 }
+export function fetchEvalResearchJoins(evalId: string) { // Requests the research join dataset for one evaluation from the /api/research/eval-joins route
+  return fetchJson<ResearchJoinDataset>(
+    `/api/research/eval-joins?id=${encodeURIComponent(evalId)}` // the id is percent-encoded and sent as the `id` query parameter
+  )
+}

lib/hf-data.ts CHANGED Viewed

@@ -544,6 +544,7 @@ export interface HFEvalListEntry extends SignalSummaries {
 }
 export interface HFEvalModelResult {
   model_id: string
   model_route_id: string
   model_name: string
@@ -569,6 +570,10 @@ export interface HFEvalMetric {
   legacy_eval_summary_id?: string
   evaluation_name?: string
   metric_name: string
   metric_key: string
   display_name?: string
   canonical_display_name?: string
@@ -1383,6 +1388,7 @@ function flattenHierarchyNode(
       const modelInfo = buildModelInfoForVariant(detail, result, variantMeta)
       const inlineSamples = parseInstanceLevelData(result.instance_level_data)
       const evaluationResult: EvaluationResult = {
         evaluation_name: metric.metric_name || metric.evaluation_name || metric.display_name,
         display_name: metric.display_name || metric.metric_name || metric.evaluation_name,
         canonical_display_name:
@@ -1390,9 +1396,17 @@ function flattenHierarchyNode(
           metric.display_name ||
           `${context.benchmark ?? context.display_name ?? "Benchmark"} / ${metric.metric_name}`,
         metric_summary_id: metric.metric_summary_id,
         metric_key: metric.metric_key,
         evaluation_timestamp: result.retrieved_timestamp ?? detail.last_updated ?? "",
         source_data: sourceData,
         metric_config: metric.metric_config,
         score_details: {
           score: result.score,
@@ -1401,6 +1415,7 @@ function flattenHierarchyNode(
           result.detailed_evaluation_results
         ),
         evalcards: result.evalcards,
       }
       const existing = resultsByVariant.get(variantKey)

 }
 export interface HFEvalModelResult {
+  evaluation_result_id?: string
   model_id: string
   model_route_id: string
   model_name: string
   legacy_eval_summary_id?: string
   evaluation_name?: string
   metric_name: string
+  metric_id?: string | null
+  metric_kind?: string | null
+  metric_unit?: string | null
+  metric_parameters?: Record<string, unknown> | null
   metric_key: string
   display_name?: string
   canonical_display_name?: string
       const modelInfo = buildModelInfoForVariant(detail, result, variantMeta)
       const inlineSamples = parseInstanceLevelData(result.instance_level_data)
       const evaluationResult: EvaluationResult = {
+        evaluation_result_id: result.evaluation_result_id,
         evaluation_name: metric.metric_name || metric.evaluation_name || metric.display_name,
         display_name: metric.display_name || metric.metric_name || metric.evaluation_name,
         canonical_display_name:
           metric.display_name ||
           `${context.benchmark ?? context.display_name ?? "Benchmark"} / ${metric.metric_name}`,
         metric_summary_id: metric.metric_summary_id,
+        metric_id: metric.metric_id ?? undefined,
         metric_key: metric.metric_key,
+        metric_name: metric.metric_name,
+        metric_kind: metric.metric_kind ?? undefined,
+        metric_unit:
+          metric.metric_unit ??
+          (typeof metric.metric_config?.unit === "string" ? metric.metric_config.unit : undefined),
+        metric_parameters: metric.metric_parameters ?? undefined,
         evaluation_timestamp: result.retrieved_timestamp ?? detail.last_updated ?? "",
         source_data: sourceData,
+        source_record_url: result.source_record_url,
         metric_config: metric.metric_config,
         score_details: {
           score: result.score,
           result.detailed_evaluation_results
         ),
         evalcards: result.evalcards,
+        detailed_evaluation_results_meta: result.detailed_evaluation_results_meta,
       }
       const existing = resultsByVariant.get(variantKey)

lib/model-data.ts CHANGED Viewed

@@ -729,18 +729,26 @@ function toModelResultsForMetric(
     }
     const evaluationResult: EvaluationResult = {
       evaluation_name: metric.metric_name || metric.evaluation_name || metric.display_name || "",
       display_name: metric.display_name || metric.metric_name || metric.evaluation_name,
       canonical_display_name: metric.canonical_display_name,
       metric_summary_id: metric.metric_summary_id,
       metric_key: metric.metric_key,
       evaluation_timestamp: evaluationTimestamp,
       metric_config: metricConfig,
       score_details: { score: mr.score ?? 0 },
       detailed_evaluation_results_url: getCanonicalInstanceResultsUrl(
         mr.detailed_evaluation_results
       ),
       evalcards: mr.evalcards,
     }
     return {

     }
     const evaluationResult: EvaluationResult = {
+      evaluation_result_id: mr.evaluation_result_id,
       evaluation_name: metric.metric_name || metric.evaluation_name || metric.display_name || "",
       display_name: metric.display_name || metric.metric_name || metric.evaluation_name,
       canonical_display_name: metric.canonical_display_name,
       metric_summary_id: metric.metric_summary_id,
+      metric_id: metric.metric_id ?? undefined,
       metric_key: metric.metric_key,
+      metric_name: metric.metric_name,
+      metric_kind: metric.metric_kind ?? undefined,
+      metric_unit: metric.metric_unit ?? metricConfig.unit,
+      metric_parameters: metric.metric_parameters ?? undefined,
       evaluation_timestamp: evaluationTimestamp,
+      source_record_url: mr.source_record_url,
       metric_config: metricConfig,
       score_details: { score: mr.score ?? 0 },
       detailed_evaluation_results_url: getCanonicalInstanceResultsUrl(
         mr.detailed_evaluation_results
       ),
       evalcards: mr.evalcards,
+      detailed_evaluation_results_meta: mr.detailed_evaluation_results_meta,
     }
     return {

lib/research-join-types.ts ADDED Viewed

	@@ -0,0 +1,112 @@

+export type ResearchJoinSource = "query_api" | "artifact" // "query_api": dataset returned by the live query API; "artifact": dataset built locally from the eval summary
+export type ResearchJoinGrain = // Row shape ("grain") of a join row
+  | "model_metric" // one row per model and benchmark metric
+  | "model_metric_source" // model x metric rows that also carry source provenance
+  | "model_metric_instance" // rows with exact or benchmark-level instance links
+  | "composite_component" // one row per model and component score inside a rollup benchmark
+export type ResearchJoinColumnGroup = // Field group a column is filed under (the `group` of a ResearchJoinColumn)
+  | "identity"
+  | "hierarchy"
+  | "metric"
+  | "model"
+  | "score"
+  | "source"
+  | "instance"
+  | "quality"
+export interface ResearchJoinColumn { // Describes one column of the join table
+  key: string // used to look up the cell value on a row when the table is rendered
+  label: string // human-readable column label
+  group: ResearchJoinColumnGroup
+  description: string
+  defaultVisible: boolean // presumably whether the column is shown before the user changes the selection — confirm in the component
+  isJoinKey?: boolean // join-key cells are rendered in a small monospace font
+  type?: "string" | "number" | "boolean" | "url" | "date" // "url" cells whose value starts with "http" are rendered as an "Open" link
+}
+export interface ResearchJoinGrainOption { // One selectable row grain together with its row count and join keys
+  grain: ResearchJoinGrain
+  label: string
+  description: string
+  row_count: number // number of dataset rows counted for this grain
+  join_keys: string[] // row field names that identify a row at this grain
+}
+export interface ResearchJoinFacetValue { // One distinct value of a facet field
+  value: string // raw field value as found on the rows
+  label: string // display text for the value
+  count: number // number of rows carrying this value
+}
+export interface ResearchJoinFacet { // Distinct values of one row field, with per-value row counts
+  key: string // row field the facet is computed over
+  label: string
+  values: ResearchJoinFacetValue[]
+}
+export interface ResearchJoinStep { // One numbered step of the join walkthrough
+  step: number // step number
+  title: string
+  description: string
+  keys: string[] // row field names highlighted for this step
+}
+export interface ResearchJoinRow { // One denormalized row of the research join table
+  row_id: string // row identifier; the preview table uses it as the React key
+  join_grain: ResearchJoinGrain
+  eval_summary_id: string
+  evaluation_name: string
+  benchmark_family_key?: string | null
+  benchmark_parent_key?: string | null // artifact-built rows fill this from the summary's composite benchmark key
+  benchmark_leaf_key?: string | null
+  component_eval_summary_id?: string | null // evaluation id of the component on composite_component rows
+  component_name?: string | null // component name, or the subtask name for subtask-scoped matrix metrics
+  metric_summary_id?: string | null
+  metric_id?: string | null
+  metric_key?: string | null
+  metric_name: string
+  metric_kind?: string | null
+  metric_unit?: string | null
+  model_route_id?: string | null
+  model_id: string
+  model_name: string
+  developer?: string | null
+  score?: number | null
+  normalized_score?: number | null
+  rank?: number | null // 1-based rank; tied scores share a rank
+  rank_total?: number | null // number of entries with a finite score in the ranking
+  lower_is_better?: boolean | null
+  source_name: string // artifact-built rows use the missing-source bucket instead of an empty name
+  source_type?: string | null
+  source_organization_name?: string | null
+  evaluator_relationship?: string | null
+  source_record_url?: string | null
+  source_dataset_name?: string | null
+  source_dataset_version?: string | null
+  source_hf_repo?: string | null
+  source_hf_split?: string | null
+  retrieved_at?: string | null
+  evaluation_timestamp?: string | null
+  has_instance_data: boolean
+  instance_join_status: "metric_exact" | "benchmark_available" | "not_available" // exact per-metric sample link, benchmark-level availability only, or none
+  detailed_evaluation_results_url?: string | null
+  sample_size?: number | null
+  standard_error?: number | null
+  confidence_interval?: string | null // preformatted text, e.g. "lower - upper (level%)" on artifact-built rows
+  generation_config_available?: boolean | null
+}
+export interface ResearchJoinDataset { // Full payload served for one evaluation's join analysis
+  source: ResearchJoinSource // where the dataset was produced
+  generated_at: string // ISO timestamp on artifact-built datasets
+  eval_summary_id: string
+  eval_name: string
+  warnings: string[] // caveats to surface alongside the data
+  join_steps: ResearchJoinStep[]
+  available_grains: ResearchJoinGrainOption[] // artifact-built datasets list only grains that have at least one row
+  columns: ResearchJoinColumn[]
+  facets: ResearchJoinFacet[]
+  rows: ResearchJoinRow[]
+}

lib/research-joins.ts ADDED Viewed

	@@ -0,0 +1,712 @@

+import "server-only"
+import type { SourceData } from "@/lib/benchmark-schema"
+import type { BenchmarkEvalSummary, ModelResultForBenchmark } from "@/lib/eval-processing"
+import { getModelFamilyRouteId } from "@/lib/model-family"
+import { getEvalSummaryById } from "@/lib/model-data"
+import type {
+  ResearchJoinColumn,
+  ResearchJoinDataset,
+  ResearchJoinFacet,
+  ResearchJoinGrain,
+  ResearchJoinGrainOption,
+  ResearchJoinRow,
+} from "@/lib/research-join-types"
+const MISSING_SOURCE_BUCKET = "__missing_source__" // sentinel source_name for rows without a usable source name; facets label it "Missing source"
+const COLUMN_DEFINITIONS: ResearchJoinColumn[] = [ // Column catalogue returned as `columns` of artifact-built datasets; every `key` is a ResearchJoinRow property name
+  {
+    key: "join_grain",
+    label: "Grain",
+    group: "identity",
+    description: "The row shape selected for the join.",
+    defaultVisible: false,
+    isJoinKey: true,
+  },
+  {
+    key: "eval_summary_id",
+    label: "Eval ID",
+    group: "identity",
+    description: "Canonical benchmark detail identifier.",
+    defaultVisible: false,
+    isJoinKey: true,
+  },
+  {
+    key: "metric_summary_id",
+    label: "Metric ID",
+    group: "metric",
+    description: "Metric-level identifier used to align ranks, scores, and samples.",
+    defaultVisible: true,
+    isJoinKey: true,
+  },
+  {
+    key: "model_route_id",
+    label: "Model Route",
+    group: "model",
+    description: "Stable frontend route identifier for the model family.",
+    defaultVisible: false,
+    isJoinKey: true,
+  },
+  {
+    key: "model_name",
+    label: "Model",
+    group: "model",
+    description: "Display name for the model row.",
+    defaultVisible: true,
+  },
+  {
+    key: "developer",
+    label: "Developer",
+    group: "model",
+    description: "Reported developer or provider.",
+    defaultVisible: true,
+  },
+  {
+    key: "metric_name",
+    label: "Metric",
+    group: "metric",
+    description: "Metric display name from the benchmark artifact.",
+    defaultVisible: true,
+  },
+  {
+    key: "score",
+    label: "Score",
+    group: "score",
+    description: "Raw reported score for the selected row grain.",
+    defaultVisible: true,
+    type: "number",
+  },
+  {
+    key: "rank",
+    label: "Rank",
+    group: "score",
+    description: "Rank within the selected benchmark metric.",
+    defaultVisible: true,
+    type: "number",
+  },
+  {
+    key: "rank_total",
+    label: "Rank Total",
+    group: "score",
+    description: "Number of scored rows in the ranking partition.",
+    defaultVisible: false,
+    type: "number",
+  },
+  {
+    key: "source_name",
+    label: "Source",
+    group: "source",
+    description: "Source name, or the explicit missing-source bucket.",
+    defaultVisible: true,
+    isJoinKey: true,
+  },
+  {
+    key: "source_organization_name",
+    label: "Organization",
+    group: "source",
+    description: "Organization associated with the reported result.",
+    defaultVisible: true,
+  },
+  {
+    key: "evaluator_relationship",
+    label: "Relationship",
+    group: "source",
+    description: "First-party, third-party, collaborative, or other source relationship.",
+    defaultVisible: true,
+  },
+  {
+    key: "benchmark_family_key",
+    label: "Family",
+    group: "hierarchy",
+    description: "Backend-declared benchmark family key.",
+    defaultVisible: false,
+    isJoinKey: true,
+  },
+  {
+    key: "benchmark_parent_key",
+    label: "Composite",
+    group: "hierarchy",
+    description: "Backend-declared composite or parent benchmark key.",
+    defaultVisible: false,
+    isJoinKey: true,
+  },
+  {
+    key: "benchmark_leaf_key",
+    label: "Leaf",
+    group: "hierarchy",
+    description: "Backend-declared leaf benchmark key.",
+    defaultVisible: false,
+    isJoinKey: true,
+  },
+  {
+    key: "component_name",
+    label: "Component",
+    group: "hierarchy",
+    description: "Composite component name when viewing rollup score rows.",
+    defaultVisible: false,
+  },
+  {
+    key: "has_instance_data",
+    label: "Instances",
+    group: "instance",
+    description: "Whether this row has exact or benchmark-level instance linkage.",
+    defaultVisible: true,
+    type: "boolean",
+  },
+  {
+    key: "instance_join_status",
+    label: "Instance Join",
+    group: "instance",
+    description: "Whether instance linkage is exact, benchmark-level only, or unavailable.",
+    defaultVisible: false,
+  },
+  {
+    key: "detailed_evaluation_results_url",
+    label: "Instance URL",
+    group: "instance",
+    description: "Metric-selective sample data URL when the artifact exposes it.",
+    defaultVisible: false,
+    type: "url", // the only "url" column; http(s) values are rendered as an "Open" link in the preview table
+  },
+  {
+    key: "sample_size",
+    label: "Sample Size",
+    group: "quality",
+    description: "Reported sample size when available.",
+    defaultVisible: false,
+    type: "number",
+  },
+  {
+    key: "standard_error",
+    label: "Std. Error",
+    group: "quality",
+    description: "Reported standard error when available.",
+    defaultVisible: false,
+    type: "number",
+  },
+  {
+    key: "generation_config_available",
+    label: "Gen Config",
+    group: "quality",
+    description: "Whether generation config is present for reproducibility checks.",
+    defaultVisible: false,
+    type: "boolean",
+  },
+  {
+    key: "retrieved_at",
+    label: "Retrieved",
+    group: "quality",
+    description: "Retrieval timestamp preserved separately from evaluation time.",
+    defaultVisible: false,
+    type: "date",
+  },
+]
+/**
+ * Returns the trimmed source name, or the missing-source bucket when the
+ * value is null, undefined, empty, or whitespace only.
+ */
+function normalizeSourceName(value: string | null | undefined) {
+  const cleaned = value == null ? "" : value.trim()
+  return cleaned.length > 0 ? cleaned : MISSING_SOURCE_BUCKET
+}
+/**
+ * Extracts the dataset provenance fields from structured source data.
+ *
+ * Missing source data and the array form both yield all-null fields; on the
+ * object form each absent field becomes null.
+ */
+function getSourceDataFields(sourceData: ModelResultForBenchmark["source_data"] | SourceData | undefined) {
+  // Only the object form carries named dataset fields.
+  const structured = sourceData && !Array.isArray(sourceData) ? sourceData : null
+  return {
+    source_dataset_name: structured?.dataset_name ?? null,
+    source_dataset_version: structured?.dataset_version ?? null,
+    source_hf_repo: structured?.hf_repo ?? null,
+    source_hf_split: structured?.hf_split ?? null,
+  }
+}
+/**
+ * Formats the confidence interval of a score as "lower - upper (level%)".
+ *
+ * @returns The formatted text, or null when no confidence interval is present.
+ */
+function formatConfidenceInterval(result: ModelResultForBenchmark["score_details"]) {
+  const { confidence_interval: interval } = result
+  return interval
+    ? `${interval.lower} - ${interval.upper} (${interval.confidence_level}%)`
+    : null
+}
+/**
+ * Scales a score into the [minScore, maxScore] range (defaults 0 and 1).
+ *
+ * @returns null for a non-finite or missing score; the raw score when the
+ *   range is not positive; otherwise (score - min) / (max - min).
+ */
+function normalizeScore(score: number | null | undefined, minScore?: number, maxScore?: number) {
+  if (!isFiniteNumber(score)) {
+    return null
+  }
+  const lowerBound = minScore ?? 0
+  const span = (maxScore ?? 1) - lowerBound
+  if (span > 0) {
+    return (score - lowerBound) / span
+  }
+  // Degenerate or inverted range: hand the score back unscaled.
+  return score
+}
+/** Type guard: true only for values of type number that are neither NaN nor infinite. */
+function isFiniteNumber(value: unknown): value is number {
+  if (typeof value !== "number") {
+    return false
+  }
+  return Number.isFinite(value)
+}
+/**
+ * Ranks the entries that have a finite score.
+ *
+ * Entries are ordered best-first (ascending when `lowerIsBetter`, otherwise
+ * descending). Ranks are 1-based; an entry whose score is within 1e-9 of the
+ * first score of the current tie group shares that group's rank, and the next
+ * distinct score takes its position in the ordering.
+ *
+ * @param entries Entries to rank; the input array is not reordered.
+ * @param getScore Reads the score of an entry.
+ * @param lowerIsBetter Whether smaller scores rank first.
+ * @returns `ranks` keyed by entry, and `total`, the number of scored entries.
+ */
+function rankScoredRows<T>(
+  entries: T[],
+  getScore: (entry: T) => number | null | undefined,
+  lowerIsBetter: boolean
+) {
+  const compareScores = lowerIsBetter
+    ? (left: number, right: number) => left - right
+    : (left: number, right: number) => right - left
+  // filter() returns a fresh array, so the in-place sort leaves `entries` alone.
+  const ordered = entries
+    .filter((entry) => isFiniteNumber(getScore(entry)))
+    .sort((a, b) => compareScores(getScore(a) ?? 0, getScore(b) ?? 0))
+  const ranks = new Map<T, number>()
+  let groupRank = 0
+  let groupScore: number | null = null
+  for (const [position, entry] of ordered.entries()) {
+    const score = getScore(entry) ?? 0
+    const startsNewGroup = groupScore === null || Math.abs(score - groupScore) > 1e-9
+    if (startsNewGroup) {
+      groupRank = position + 1
+      groupScore = score
+    }
+    ranks.set(entry, groupRank)
+  }
+  return {
+    ranks,
+    total: ordered.length,
+  }
+}
+/**
+ * Collects the metric identity fields of a result, with null for absent ids.
+ *
+ * The metric name prefers the display name over the evaluation name, and the
+ * unit falls back to the unit declared in the metric config.
+ */
+function getMetricIdentity(result: ModelResultForBenchmark["result"]) {
+  const { metric_summary_id, metric_id, metric_key, metric_kind, metric_unit } = result
+  const metricName = result.display_name ?? result.evaluation_name
+  const unit = metric_unit ?? result.metric_config.unit ?? null
+  return {
+    metric_summary_id: metric_summary_id ?? null,
+    metric_id: metric_id ?? null,
+    metric_key: metric_key ?? null,
+    metric_name: metricName,
+    metric_kind: metric_kind ?? null,
+    metric_unit: unit,
+  }
+}
+/**
+ * Classifies how a result links to instance-level data.
+ *
+ * A per-result detailed results URL gives "metric_exact"; otherwise
+ * benchmark-level availability on the summary gives "benchmark_available";
+ * otherwise "not_available". `has_instance_data` is true for the first two.
+ */
+function getInstanceStatus(result: ModelResultForBenchmark["result"], summary: BenchmarkEvalSummary) {
+  // Annotated so the literal union survives into the returned object.
+  const status: ResearchJoinRow["instance_join_status"] = result.detailed_evaluation_results_url
+    ? "metric_exact"
+    : summary.instance_data?.available
+      ? "benchmark_available"
+      : "not_available"
+  return {
+    has_instance_data: status !== "not_available",
+    instance_join_status: status,
+  }
+}
+function baseRowFields( // Builds every ResearchJoinRow field except row_id for one model result
+  summary: BenchmarkEvalSummary,
+  modelResult: ModelResultForBenchmark,
+  grain: ResearchJoinGrain,
+  rank: number | null,
+  rankTotal: number | null
+): Omit<ResearchJoinRow, "row_id"> {
+  const metric = getMetricIdentity(modelResult.result)
+  const sourceDataFields = getSourceDataFields(modelResult.source_data)
+  const instanceStatus = getInstanceStatus(modelResult.result, summary)
+  const sourceRecordUrl = // prefer the result-level record URL, then the source metadata URL, else null
+    modelResult.result.source_record_url ??
+    modelResult.source_metadata.source_url ??
+    null
+  return {
+    join_grain: grain,
+    eval_summary_id: summary.evaluation_id,
+    evaluation_name: summary.evaluation_name,
+    benchmark_family_key: summary.benchmark_family_key ?? null,
+    benchmark_parent_key: summary.composite_benchmark_key ?? null,
+    benchmark_leaf_key: summary.benchmark_leaf_key ?? null,
+    component_eval_summary_id: null, // component fields stay null here; component rows override them after spreading this base
+    component_name: null,
+    ...metric, // metric_summary_id, metric_id, metric_key, metric_name, metric_kind, metric_unit
+    model_route_id: modelResult.model_route_id ?? getModelFamilyRouteId(modelResult.model_info), // fall back to a route id derived from the model info
+    model_id: modelResult.model_info.id,
+    model_name: modelResult.model_info.name,
+    developer: modelResult.model_info.developer ?? null,
+    score: modelResult.score,
+    normalized_score: normalizeScore( // scaled with the summary-level min/max scores
+      modelResult.score,
+      summary.metric_config.min_score,
+      summary.metric_config.max_score
+    ),
+    rank,
+    rank_total: rankTotal,
+    lower_is_better: summary.metric_config.lower_is_better,
+    source_name: normalizeSourceName(modelResult.source_metadata.source_name), // never empty: falls back to the missing-source bucket
+    source_type: modelResult.source_metadata.source_type,
+    source_organization_name: modelResult.source_metadata.source_organization_name,
+    evaluator_relationship: modelResult.source_metadata.evaluator_relationship,
+    source_record_url: sourceRecordUrl,
+    ...sourceDataFields, // source_dataset_name, source_dataset_version, source_hf_repo, source_hf_split
+    retrieved_at: modelResult.evaluation_timestamp, // NOTE(review): retrieved_at is read from the model result's evaluation_timestamp — confirm that field holds the retrieval time
+    evaluation_timestamp: modelResult.result.evaluation_timestamp,
+    ...instanceStatus, // has_instance_data, instance_join_status
+    detailed_evaluation_results_url: modelResult.result.detailed_evaluation_results_url ?? null,
+    sample_size: modelResult.score_details.sample_size ?? null,
+    standard_error: modelResult.score_details.standard_error ?? null,
+    confidence_interval: formatConfidenceInterval(modelResult.score_details),
+    generation_config_available: Boolean(modelResult.result.generation_config),
+  }
+}
+/**
+ * Flattens `summary.model_results` into join rows.
+ *
+ * Each model result yields one "model_metric_source" row, immediately followed
+ * by one "composite_component" row per entry of its `aggregate_components`.
+ * Ranks are computed once over all model results by score, honouring
+ * `summary.metric_config.lower_is_better`; results without a finite score get
+ * a null rank.
+ *
+ * @param summary Benchmark summary whose model results are expanded.
+ * @returns Rows in model-result order, each parent row followed by its component rows.
+ */
+function buildRowsFromModelResults(summary: BenchmarkEvalSummary): ResearchJoinRow[] {
+  const rankData = rankScoredRows(
+    summary.model_results,
+    (modelResult) => modelResult.score,
+    summary.metric_config.lower_is_better
+  )
+  const rows: ResearchJoinRow[] = []
+  for (const [index, modelResult] of summary.model_results.entries()) {
+    const rank = rankData.ranks.get(modelResult) ?? null
+    const base = baseRowFields(summary, modelResult, "model_metric_source", rank, rankData.total)
+    const modelKey = base.model_route_id ?? base.model_id
+    rows.push({
+      ...base,
+      // The position in model_results keeps the id unique even when the same
+      // model and metric occur more than once.
+      row_id: [
+        "model_metric_source",
+        summary.evaluation_id,
+        base.metric_summary_id ?? base.metric_key ?? base.metric_name,
+        modelKey,
+        index,
+      ].join("::"),
+    })
+    for (const [componentIndex, component] of (modelResult.aggregate_components ?? []).entries()) {
+      rows.push({
+        ...base,
+        // The parent index is part of the id: componentIndex restarts at 0 for
+        // every model result, so two results for the same model with the same
+        // components would otherwise share a row_id (used as a React key).
+        row_id: [
+          "composite_component",
+          component.evaluation_id,
+          modelKey,
+          index,
+          componentIndex,
+        ].join("::"),
+        join_grain: "composite_component",
+        component_eval_summary_id: component.evaluation_id,
+        component_name: component.composite_benchmark_name,
+        // NOTE(review): rank, rank_total, and the instance/quality fields are
+        // inherited from the parent row while the score below is the
+        // component's own — confirm that this is intended.
+        score: component.score,
+        normalized_score: component.normalized_score,
+        source_name: normalizeSourceName(component.source_name),
+        source_type: component.source_type,
+        source_organization_name: component.source_organization_name,
+        evaluator_relationship: component.evaluator_relationship,
+        retrieved_at: component.evaluation_timestamp,
+        evaluation_timestamp: component.evaluation_timestamp,
+      })
+    }
+  }
+  return rows
+}
+function buildRowsFromMatrix(summary: BenchmarkEvalSummary): ResearchJoinRow[] { // Expands the leaderboard matrix into one "model_metric" row per (metric, matrix row) cell with a finite score
+  const rows: ResearchJoinRow[] = []
+  const metrics = summary.leaderboard_metrics ?? []
+  const matrixRows = summary.leaderboard_rows ?? []
+  for (const metric of metrics) {
+    const rankData = rankScoredRows( // ranks are computed separately for each metric column
+      matrixRows,
+      (row) => row.values[metric.column_key],
+      metric.lower_is_better
+    )
+    for (const row of matrixRows) {
+      const score = row.values[metric.column_key]
+      if (!isFiniteNumber(score)) { // cells without a finite score produce no row
+        continue
+      }
+      const sourceDataFields = getSourceDataFields(row.source_data)
+      const hasBenchmarkInstances = Boolean(summary.instance_data?.available)
+      const modelRouteId = row.model_route_id ?? getModelFamilyRouteId(row.model_info)
+      rows.push({
+        row_id: [ // NOTE(review): no row index in this id — it is unique only if each model route occurs once in leaderboard_rows; confirm
+          "model_metric",
+          summary.evaluation_id,
+          metric.column_key,
+          modelRouteId,
+        ].join("::"),
+        join_grain: "model_metric",
+        eval_summary_id: summary.evaluation_id,
+        evaluation_name: summary.evaluation_name,
+        benchmark_family_key: summary.benchmark_family_key ?? null,
+        benchmark_parent_key: summary.composite_benchmark_key ?? null,
+        benchmark_leaf_key: summary.benchmark_leaf_key ?? null,
+        component_eval_summary_id: null,
+        component_name: // only subtask-scoped metrics carry a component name
+          metric.scope === "subtask"
+            ? metric.subtask_name ?? metric.subtask_key ?? null
+            : null,
+        metric_summary_id: metric.metric_summary_id,
+        metric_id: null,
+        metric_key: null,
+        metric_name: metric.display_name || metric.metric_name,
+        metric_kind: null,
+        metric_unit: metric.unit ?? null,
+        model_route_id: modelRouteId,
+        model_id: row.model_info.id,
+        model_name: row.model_info.name,
+        developer: row.model_info.developer ?? null,
+        score: score ?? null, // score is already known to be finite here, so the fallback never applies
+        normalized_score: null, // not computed for matrix rows
+        rank: rankData.ranks.get(row) ?? null,
+        rank_total: rankData.total,
+        lower_is_better: metric.lower_is_better,
+        source_name: normalizeSourceName(row.source_metadata.source_name),
+        source_type: row.source_metadata.source_type,
+        source_organization_name: row.source_metadata.source_organization_name,
+        evaluator_relationship: row.source_metadata.evaluator_relationship,
+        source_record_url: row.source_metadata.source_url ?? null,
+        ...sourceDataFields,
+        retrieved_at: row.evaluation_timestamp, // both timestamps come from the same matrix-row field
+        evaluation_timestamp: row.evaluation_timestamp,
+        has_instance_data: hasBenchmarkInstances,
+        instance_join_status: hasBenchmarkInstances ? "benchmark_available" : "not_available", // matrix rows never report "metric_exact"
+        detailed_evaluation_results_url: null,
+        sample_size: null,
+        standard_error: null,
+        confidence_interval: null,
+        generation_config_available: null,
+      })
+    }
+  }
+  return rows
+}
+/**
+ * Builds the metric, source, and relationship facets for a set of rows.
+ *
+ * For each facet field, counts the rows per distinct non-empty value and
+ * lists the values by descending count, breaking ties by locale-aware name.
+ * The missing-source bucket is labelled "Missing source".
+ */
+function buildFacets(rows: ResearchJoinRow[]): ResearchJoinFacet[] {
+  const facetSpecs = [
+    { key: "metric_name", label: "Metric" },
+    { key: "source_name", label: "Source" },
+    { key: "evaluator_relationship", label: "Relationship" },
+  ] as const
+  const facets: ResearchJoinFacet[] = []
+  for (const { key, label } of facetSpecs) {
+    const tally = new Map<string, number>()
+    rows.forEach((row) => {
+      const text = String(row[key] ?? "")
+      if (text) {
+        tally.set(text, (tally.get(text) ?? 0) + 1)
+      }
+    })
+    const values = Array.from(tally.entries())
+      .sort(([leftName, leftCount], [rightName, rightCount]) =>
+        rightCount - leftCount || leftName.localeCompare(rightName)
+      )
+      .map(([value, count]) => ({
+        value,
+        label: value === MISSING_SOURCE_BUCKET ? "Missing source" : value,
+        count,
+      }))
+    facets.push({ key, label, values })
+  }
+  return facets
+}
+/**
+ * Lists the grains that have at least one row, with their row counts.
+ *
+ * "model_metric_instance" counts every row that has instance data, whatever
+ * its own grain; the other grains count rows whose `join_grain` matches.
+ * Options keep the declaration order of the grain table below.
+ */
+function buildGrainOptions(rows: ResearchJoinRow[]): ResearchJoinGrainOption[] {
+  const grainDetails: Record<ResearchJoinGrain, Omit<ResearchJoinGrainOption, "row_count">> = {
+    model_metric: {
+      grain: "model_metric",
+      label: "Model x metric",
+      description: "One row per model and benchmark metric, best for score matrices.",
+      join_keys: ["eval_summary_id", "metric_summary_id", "model_route_id"],
+    },
+    model_metric_source: {
+      grain: "model_metric_source",
+      label: "Model x metric x source",
+      description: "Adds source provenance to each model metric row.",
+      join_keys: ["eval_summary_id", "metric_summary_id", "model_route_id", "source_name"],
+    },
+    model_metric_instance: {
+      grain: "model_metric_instance",
+      label: "Model x metric x instances",
+      description: "Filters to rows with exact or benchmark-level sample links.",
+      join_keys: ["eval_summary_id", "metric_summary_id", "model_route_id", "detailed_evaluation_results_url"],
+    },
+    composite_component: {
+      grain: "composite_component",
+      label: "Composite components",
+      description: "One row per model and component score inside a rollup benchmark.",
+      join_keys: ["component_eval_summary_id", "model_route_id", "source_name"],
+    },
+  }
+  // Single pass: tally rows per own grain and, separately, rows with instance data.
+  const countByGrain = new Map<ResearchJoinGrain, number>()
+  let instanceLinkedCount = 0
+  for (const row of rows) {
+    countByGrain.set(row.join_grain, (countByGrain.get(row.join_grain) ?? 0) + 1)
+    if (row.has_instance_data) {
+      instanceLinkedCount += 1
+    }
+  }
+  const options: ResearchJoinGrainOption[] = []
+  for (const grain of Object.keys(grainDetails) as ResearchJoinGrain[]) {
+    const rowCount =
+      grain === "model_metric_instance" ? instanceLinkedCount : countByGrain.get(grain) ?? 0
+    if (rowCount > 0) {
+      options.push({ ...grainDetails[grain], row_count: rowCount })
+    }
+  }
+  return options
+}
+/**
+ * Produces the three-step join walkthrough.
+ *
+ * Step 1 shows the join keys of the first available grain, or the
+ * eval/metric/model key triple when no grain is available; steps 2 and 3 are
+ * fixed.
+ */
+function buildJoinSteps(grainOptions: ResearchJoinGrainOption[]) {
+  const fallbackKeys = ["eval_summary_id", "metric_summary_id", "model_route_id"]
+  const chooseGrain = {
+    step: 1,
+    title: "Choose the base row grain",
+    description:
+      "Start from model-metric rows, source-aware rows, instance-linked rows, or composite component rows.",
+    keys: grainOptions[0]?.join_keys ?? fallbackKeys,
+  }
+  const addFieldGroups = {
+    step: 2,
+    title: "Add field groups as joins",
+    description:
+      "Join hierarchy, source provenance, instance links, and quality fields by the keys shown for the selected grain.",
+    keys: ["benchmark_family_key", "benchmark_parent_key", "source_name", "detailed_evaluation_results_url"],
+  }
+  const filterAndExport = {
+    step: 3,
+    title: "Filter, inspect keys, and export",
+    description:
+      "Filter to the rows you need, keep source identity explicit, then export the denormalized join table.",
+    keys: ["source_name", "evaluator_relationship", "metric_name"],
+  }
+  return [chooseGrain, addFieldGroups, filterAndExport]
+}
+/**
+ * Builds the join dataset locally from a benchmark summary.
+ *
+ * Uses the leaderboard matrix when the summary has more than one leaderboard
+ * metric and at least one leaderboard row, otherwise the per-model results.
+ *
+ * @param summary Benchmark summary to expand into rows.
+ * @param warnings Warnings to list ahead of the ones added here.
+ * @returns A dataset with `source` set to "artifact".
+ */
+function buildArtifactResearchJoinDataset(summary: BenchmarkEvalSummary, warnings: string[] = []): ResearchJoinDataset {
+  const metricCount = summary.leaderboard_metrics?.length ?? 0
+  const matrixRowCount = summary.leaderboard_rows?.length ?? 0
+  const useMatrix = metricCount > 1 && matrixRowCount > 0
+  const rows = useMatrix ? buildRowsFromMatrix(summary) : buildRowsFromModelResults(summary)
+  const grainOptions = buildGrainOptions(rows)
+  const allWarnings = [
+    ...warnings,
+    "Artifact fallback keeps joins source-aware but cannot execute arbitrary SQL.",
+  ]
+  // Flag datasets where some rows only have benchmark-level instance links.
+  if (rows.some((row) => row.instance_join_status === "benchmark_available")) {
+    allWarnings.push(
+      "Some instance joins are benchmark-level availability signals, not exact metric-level sample links."
+    )
+  }
+  return {
+    source: "artifact",
+    generated_at: new Date().toISOString(),
+    eval_summary_id: summary.evaluation_id,
+    eval_name: summary.evaluation_name,
+    warnings: allWarnings,
+    join_steps: buildJoinSteps(grainOptions),
+    available_grains: grainOptions,
+    columns: COLUMN_DEFINITIONS,
+    facets: buildFacets(rows),
+    rows,
+  }
+}
+/**
+ * Shallow type guard for payloads coming back from the Query API.
+ *
+ * Only checks that `rows`, `columns`, and `available_grains` are arrays; the array contents and the
+ * remaining dataset fields are not inspected.
+ */
+function isResearchJoinDataset(value: unknown): value is ResearchJoinDataset {
+  if (typeof value !== "object" || value === null) {
+    return false
+  }
+  const { rows, columns, available_grains } = value as Partial<ResearchJoinDataset>
+  return [rows, columns, available_grains].every((field) => Array.isArray(field))
+}
+/**
+ * Tries to load a ready-made research join dataset from the live Query API.
+ *
+ * The base URL is read from `QUERY_API_BASE_URL`, falling back to `EVAL_QUERY_API_BASE_URL`. Two
+ * candidate endpoint paths are tried in order; the first OK response whose body (or whose
+ * `research_join_dataset` property) passes `isResearchJoinDataset` wins.
+ *
+ * @param evalId Evaluation id; URL-encoded before it is placed in the request path.
+ * @returns The dataset re-tagged with `source: "query_api"` and `warnings` defaulted to `[]`, or
+ *   `null` when no base URL is configured or no candidate yields a usable payload.
+ */
+async function fetchLiveResearchJoinDataset(evalId: string): Promise<ResearchJoinDataset | null> {
+  // `||`, not `??`: a variable that is set but empty must fall through to the alternate name.
+  // getResearchJoinDataset decides whether to report the live endpoint as unavailable with the same
+  // truthiness test, so both places now agree on what "configured" means.
+  const baseUrl = process.env.QUERY_API_BASE_URL || process.env.EVAL_QUERY_API_BASE_URL
+  if (!baseUrl) {
+    return null
+  }
+  // Strip trailing slashes so the joined paths never contain "//".
+  const trimmedBase = baseUrl.replace(/\/+$/, "")
+  const candidates = [
+    `${trimmedBase}/benchmarks/${encodeURIComponent(evalId)}/research-joins`,
+    `${trimmedBase}/research/benchmarks/${encodeURIComponent(evalId)}/joins`,
+  ]
+  for (const url of candidates) {
+    try {
+      // NOTE(review): no timeout/abort signal is passed, so this waits as long as the upstream does.
+      const response = await fetch(url, {
+        cache: "no-store",
+        headers: { Accept: "application/json" },
+      })
+      if (!response.ok) {
+        continue
+      }
+      const payload = await response.json()
+      const record = payload && typeof payload === "object" ? payload as Record<string, unknown> : {}
+      // Accept either a bare dataset or one wrapped under `research_join_dataset`.
+      const candidate =
+        isResearchJoinDataset(payload)
+          ? payload
+          : isResearchJoinDataset(record.research_join_dataset)
+            ? record.research_join_dataset
+            : null
+      if (candidate) {
+        return {
+          ...candidate,
+          source: "query_api",
+          warnings: candidate.warnings ?? [],
+        }
+      }
+    } catch {
+      // Best effort: a network or JSON-parse failure on one candidate just moves on to the next.
+      continue
+    }
+  }
+  return null
+}
+/**
+ * Resolves the research join dataset for an evaluation.
+ *
+ * The live Query API result is returned when there is one. Otherwise the dataset is rebuilt from the
+ * eval summary, with an extra warning when a Query API base URL is configured but produced nothing.
+ *
+ * @param evalId Evaluation id to look up.
+ * @returns The dataset, or `null` when there is no live result and no eval summary for `evalId`.
+ */
+export async function getResearchJoinDataset(evalId: string): Promise<ResearchJoinDataset | null> {
+  const fromQueryApi = await fetchLiveResearchJoinDataset(evalId)
+  if (fromQueryApi) {
+    return fromQueryApi
+  }
+  const summary = await getEvalSummaryById(evalId)
+  if (!summary) {
+    return null
+  }
+  const fallbackWarnings: string[] = []
+  const liveApiConfigured = Boolean(process.env.QUERY_API_BASE_URL || process.env.EVAL_QUERY_API_BASE_URL)
+  if (liveApiConfigured) {
+    fallbackWarnings.push(
+      "Live Query API join endpoint was unavailable, so this dataset was built from frontend artifacts."
+    )
+  }
+  return buildArtifactResearchJoinDataset(summary, fallbackWarnings)
+}
+export { buildArtifactResearchJoinDataset }

tests/research-joins.test.ts ADDED Viewed

	@@ -0,0 +1,174 @@

+import { describe, expect, it, vi } from "vitest"
+import type { BenchmarkEvalSummary, ModelResultForBenchmark } from "../lib/eval-processing"
+vi.mock("server-only", () => ({}))
+const { buildArtifactResearchJoinDataset } = await import("../lib/research-joins")
+const metricConfig = { // Metric config reused by the makeModelResult and makeSummary fixtures.
+  evaluation_description: "Accuracy on examples",
+  lower_is_better: false, // Higher scores are the better ones for this metric.
+  score_type: "continuous" as const, // `as const` keeps the literal type instead of widening to string.
+  min_score: 0,
+  max_score: 1,
+  unit: "%",
+}
+/**
+ * Test fixture: a fully populated model result for "Example Model".
+ *
+ * @param overrides Top-level fields to replace; spread last, so replacement is shallow.
+ */
+function makeModelResult(overrides: Partial<ModelResultForBenchmark> = {}): ModelResultForBenchmark {
+  // Values that recur across the fixture, named once so the copies cannot drift apart.
+  const score = 0.82
+  const sampleCount = 100
+  const evaluatedAt = "2026-01-01T00:00:00Z"
+  const organization = "OpenAI"
+  const metric = "accuracy"
+  return {
+    model_info: {
+      id: "openai/example-model",
+      name: "Example Model",
+      developer: organization,
+    },
+    model_route_id: "openai__example-model",
+    score,
+    score_details: {
+      score,
+      sample_size: sampleCount,
+      standard_error: 0.01,
+      confidence_interval: {
+        lower: 0.8,
+        upper: 0.84,
+        confidence_level: 95,
+      },
+    },
+    evaluation_timestamp: evaluatedAt,
+    // No source_name here on purpose: the tests expect the "__missing_source__" bucket by default.
+    source_metadata: {
+      source_type: "evaluation_run",
+      source_organization_name: organization,
+      evaluator_relationship: "first_party",
+    },
+    source_data: {
+      dataset_name: "Example Dataset",
+      dataset_version: "v1",
+      hf_repo: "example/dataset",
+      hf_split: "test",
+      samples_number: sampleCount,
+    },
+    result: {
+      evaluation_result_id: "result-1",
+      evaluation_name: metric,
+      display_name: "Accuracy",
+      metric_summary_id: "metric-accuracy",
+      metric_id: metric,
+      metric_key: metric,
+      metric_name: metric,
+      metric_kind: "score",
+      metric_unit: "%",
+      evaluation_timestamp: evaluatedAt,
+      source_record_url: "https://example.test/records/result-1",
+      metric_config: metricConfig,
+      score_details: { score },
+      detailed_evaluation_results_url:
+        "https://huggingface.co/datasets/evaleval/card_backend/resolve/main/instances/example.jsonl",
+    },
+    ...overrides,
+  }
+}
+/**
+ * Test fixture: an eval summary holding a single default model result.
+ *
+ * @param overrides Top-level fields to replace; spread last, so replacement is shallow.
+ */
+function makeSummary(overrides: Partial<BenchmarkEvalSummary> = {}): BenchmarkEvalSummary {
+  // With one model in the summary, best, worst, and average all share the same numbers.
+  const modelName = "Example Model"
+  const score = 0.82
+  return {
+    evaluation_name: "Example Eval",
+    evaluation_id: "example_eval",
+    composite_benchmark_key: "example_suite",
+    composite_benchmark_name: "Example Suite",
+    category: "Reasoning",
+    metric_config: metricConfig,
+    model_results: [makeModelResult()],
+    models_count: 1,
+    evaluator_names: [],
+    source_types: [],
+    third_party_ratio: 0,
+    missing_generation_config_count: 0,
+    best_model: { name: modelName, score },
+    worst_model: { name: modelName, score },
+    avg_score: score,
+    avg_score_norm: score,
+    benchmark_family_key: "example_family",
+    benchmark_leaf_key: "example_leaf",
+    ...overrides,
+  }
+}
+// Exercises the artifact fallback builder against the in-file fixtures only (no Query API involved).
+describe("buildArtifactResearchJoinDataset", () => {
+  it("preserves source-safe metric rows and uses an explicit missing-source bucket", () => {
+    const { source, rows, facets } = buildArtifactResearchJoinDataset(makeSummary())
+    expect(source).toBe("artifact")
+    expect(rows).toHaveLength(1)
+    // The default fixture has no source_name, so the row is expected in the explicit missing bucket.
+    const [onlyRow] = rows
+    expect(onlyRow).toMatchObject({
+      eval_summary_id: "example_eval",
+      metric_summary_id: "metric-accuracy",
+      metric_id: "accuracy",
+      model_route_id: "openai__example-model",
+      source_name: "__missing_source__",
+      source_record_url: "https://example.test/records/result-1",
+      has_instance_data: true,
+      instance_join_status: "metric_exact",
+      sample_size: 100,
+      standard_error: 0.01,
+      confidence_interval: "0.8 - 0.84 (95%)",
+    })
+    const sourceFacet = facets.find(({ key }) => key === "source_name")
+    expect(sourceFacet?.values[0]).toMatchObject({
+      value: "__missing_source__",
+      label: "Missing source",
+      count: 1,
+    })
+  })
+  it("offers instance-linked joins without duplicating artifact fallback rows", () => {
+    const { available_grains: grains, rows } = buildArtifactResearchJoinDataset(makeSummary())
+    const instanceGrain = grains.find(({ grain }) => grain === "model_metric_instance")
+    expect(instanceGrain?.row_count).toBe(1)
+    const rowsWithInstanceData = rows.filter((row) => row.has_instance_data)
+    expect(rowsWithInstanceData).toHaveLength(1)
+  })
+  it("materializes composite component rows from aggregate components", () => {
+    const summary = makeSummary({
+      model_results: [
+        makeModelResult({
+          aggregate_components: [
+            {
+              evaluation_id: "component_a",
+              composite_benchmark_key: "suite",
+              composite_benchmark_name: "Component A",
+              score: 0.7,
+              normalized_score: 0.7,
+              evaluation_timestamp: "2026-01-02T00:00:00Z",
+              source_name: "component-source",
+              source_type: "leaderboard",
+              source_organization_name: "Example Org",
+              evaluator_relationship: "third_party",
+            },
+            // Component B deliberately omits source_name.
+            {
+              evaluation_id: "component_b",
+              composite_benchmark_key: "suite",
+              composite_benchmark_name: "Component B",
+              score: 0.9,
+              normalized_score: 0.9,
+              evaluation_timestamp: "2026-01-03T00:00:00Z",
+              source_type: "paper",
+              source_organization_name: "Example Paper",
+              evaluator_relationship: "third_party",
+            },
+          ],
+        }),
+      ],
+    })
+    const { rows } = buildArtifactResearchJoinDataset(summary)
+    const componentRows = rows.filter(({ join_grain }) => join_grain === "composite_component")
+    expect(componentRows).toHaveLength(2)
+    const [firstComponent, secondComponent] = componentRows
+    expect(firstComponent).toMatchObject({
+      component_eval_summary_id: "component_a",
+      component_name: "Component A",
+      source_name: "component-source",
+      score: 0.7,
+    })
+    expect(secondComponent).toMatchObject({
+      component_eval_summary_id: "component_b",
+      component_name: "Component B",
+      source_name: "__missing_source__",
+      score: 0.9,
+    })
+  })
+})