Spaces:
Running
Running
Add researcher join analysis to eval detail
#2
by yananlong - opened
- app/api/research/eval-joins/route.ts +24 -0
- components/eval-detail.tsx +43 -36
- components/eval-join-analysis.tsx +697 -0
- lib/benchmark-schema.ts +12 -0
- lib/dashboard-data-client.ts +7 -0
- lib/hf-data.ts +15 -0
- lib/model-data.ts +8 -0
- lib/research-join-types.ts +112 -0
- lib/research-joins.ts +712 -0
- tests/research-joins.test.ts +174 -0
app/api/research/eval-joins/route.ts
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { NextResponse } from "next/server"
|
| 2 |
+
|
| 3 |
+
import { getResearchJoinDataset } from "@/lib/research-joins"
|
| 4 |
+
|
| 5 |
+
/**
 * GET handler for the eval-joins research endpoint.
 *
 * Reads the `id` query parameter from the request URL, loads the matching
 * research join dataset, and returns it as JSON with a public cache policy.
 *
 * @param request Incoming request; only its URL is inspected.
 * @returns 400 when `id` is missing or empty, 404 when no dataset is found,
 *          otherwise the dataset as a JSON response.
 */
export async function GET(request: Request) {
  const evaluationId = new URL(request.url).searchParams.get("id")
  if (!evaluationId) {
    return NextResponse.json({ error: "Missing evaluation id" }, { status: 400 })
  }

  const joinDataset = await getResearchJoinDataset(evaluationId)
  if (joinDataset) {
    const cacheHeaders = {
      "Cache-Control": "public, max-age=300, stale-while-revalidate=1800",
    }
    return NextResponse.json(joinDataset, { headers: cacheHeaders })
  }

  return NextResponse.json({ error: "Evaluation join dataset not found" }, { status: 404 })
}
|
components/eval-detail.tsx
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
"use client"
|
| 2 |
|
| 3 |
import { useAudienceMode } from "@/components/audience-mode-provider"
|
|
|
|
| 4 |
import { Fragment, useEffect, useMemo, useState } from "react"
|
| 5 |
import Link from "next/link"
|
| 6 |
import { Badge } from "@/components/ui/badge"
|
|
@@ -914,46 +915,51 @@ export function EvalDetail({ summary }: EvalDetailProps) {
|
|
| 914 |
</Card>
|
| 915 |
|
| 916 |
{hasMultiMetricLeaderboard ? (
|
| 917 |
-
<
|
|
|
|
|
|
|
|
|
|
| 918 |
) : (
|
| 919 |
-
<
|
| 920 |
-
<
|
| 921 |
-
|
| 922 |
-
|
| 923 |
-
|
| 924 |
-
|
| 925 |
-
|
| 926 |
-
|
| 927 |
-
|
| 928 |
-
|
| 929 |
-
<
|
|
|
|
|
|
|
|
|
|
|
|
|
| 930 |
</div>
|
| 931 |
-
<CardDescription>{leaderboardDescription}</CardDescription>
|
| 932 |
-
</div>
|
| 933 |
|
| 934 |
-
|
| 935 |
-
|
| 936 |
-
|
| 937 |
-
|
| 938 |
-
|
| 939 |
-
</Badge>
|
| 940 |
-
<Badge variant="outline">{scoreDirectionLabel}</Badge>
|
| 941 |
-
{hasParameterData && (numericMinParams != null || numericMaxParams != null) && (
|
| 942 |
-
<Badge variant="outline">
|
| 943 |
-
Params {formatParamBoundLabel(minParamStep, "min")} to {formatParamBoundLabel(maxParamStep, "max")}
|
| 944 |
-
</Badge>
|
| 945 |
-
)}
|
| 946 |
-
{isResearchView && (
|
| 947 |
-
<Badge variant="outline">
|
| 948 |
-
Scale {summary.metric_config.min_score ?? 0} - {summary.metric_config.max_score ?? 1}
|
| 949 |
</Badge>
|
| 950 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 951 |
</div>
|
| 952 |
-
</
|
| 953 |
-
</CardHeader>
|
| 954 |
|
| 955 |
-
|
| 956 |
-
|
| 957 |
<div className="border-b bg-background px-5 py-4 sm:px-6">
|
| 958 |
<div className="flex flex-col gap-3 lg:flex-row lg:items-center lg:justify-between">
|
| 959 |
<div className="space-y-1">
|
|
@@ -1456,8 +1462,9 @@ export function EvalDetail({ summary }: EvalDetailProps) {
|
|
| 1456 |
</Button>
|
| 1457 |
</div>
|
| 1458 |
)}
|
| 1459 |
-
|
| 1460 |
-
|
|
|
|
| 1461 |
)}
|
| 1462 |
</div>
|
| 1463 |
)
|
|
|
|
| 1 |
"use client"
|
| 2 |
|
| 3 |
import { useAudienceMode } from "@/components/audience-mode-provider"
|
| 4 |
+
import { EvalJoinAnalysis } from "@/components/eval-join-analysis"
|
| 5 |
import { Fragment, useEffect, useMemo, useState } from "react"
|
| 6 |
import Link from "next/link"
|
| 7 |
import { Badge } from "@/components/ui/badge"
|
|
|
|
| 915 |
</Card>
|
| 916 |
|
| 917 |
{hasMultiMetricLeaderboard ? (
|
| 918 |
+
<>
|
| 919 |
+
<EvalJoinAnalysis evalId={summary.evaluation_id} />
|
| 920 |
+
<MultiMetricLeaderboard summary={summary} isResearchView={isResearchView} />
|
| 921 |
+
</>
|
| 922 |
) : (
|
| 923 |
+
<>
|
| 924 |
+
<EvalJoinAnalysis evalId={summary.evaluation_id} />
|
| 925 |
+
<Card className="overflow-hidden">
|
| 926 |
+
<CardHeader className="border-b bg-muted/10 space-y-3">
|
| 927 |
+
<ApplesToApplesBanner
|
| 928 |
+
summary={summary.comparability_summary}
|
| 929 |
+
detailsAnchorId="comparability-panel"
|
| 930 |
+
/>
|
| 931 |
+
<div className="flex flex-col gap-3 lg:flex-row lg:items-end lg:justify-between">
|
| 932 |
+
<div className="space-y-2">
|
| 933 |
+
<div className="flex items-center gap-2">
|
| 934 |
+
<Medal className="h-5 w-5 text-primary" />
|
| 935 |
+
<CardTitle className="text-xl">{leaderboardTitle}</CardTitle>
|
| 936 |
+
</div>
|
| 937 |
+
<CardDescription>{leaderboardDescription}</CardDescription>
|
| 938 |
</div>
|
|
|
|
|
|
|
| 939 |
|
| 940 |
+
<div className="flex flex-wrap items-center gap-2 text-xs text-muted-foreground">
|
| 941 |
+
<Badge variant="secondary">
|
| 942 |
+
{leaderboardRows.length === summary.models_count
|
| 943 |
+
? `${summary.models_count} models`
|
| 944 |
+
: `${leaderboardRows.length} of ${summary.models_count} models`}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 945 |
</Badge>
|
| 946 |
+
<Badge variant="outline">{scoreDirectionLabel}</Badge>
|
| 947 |
+
{hasParameterData && (numericMinParams != null || numericMaxParams != null) && (
|
| 948 |
+
<Badge variant="outline">
|
| 949 |
+
Params {formatParamBoundLabel(minParamStep, "min")} to {formatParamBoundLabel(maxParamStep, "max")}
|
| 950 |
+
</Badge>
|
| 951 |
+
)}
|
| 952 |
+
{isResearchView && (
|
| 953 |
+
<Badge variant="outline">
|
| 954 |
+
Scale {summary.metric_config.min_score ?? 0} - {summary.metric_config.max_score ?? 1}
|
| 955 |
+
</Badge>
|
| 956 |
+
)}
|
| 957 |
+
</div>
|
| 958 |
</div>
|
| 959 |
+
</CardHeader>
|
|
|
|
| 960 |
|
| 961 |
+
<CardContent className="p-0">
|
| 962 |
+
{hasParameterData && (
|
| 963 |
<div className="border-b bg-background px-5 py-4 sm:px-6">
|
| 964 |
<div className="flex flex-col gap-3 lg:flex-row lg:items-center lg:justify-between">
|
| 965 |
<div className="space-y-1">
|
|
|
|
| 1462 |
</Button>
|
| 1463 |
</div>
|
| 1464 |
)}
|
| 1465 |
+
</CardContent>
|
| 1466 |
+
</Card>
|
| 1467 |
+
</>
|
| 1468 |
)}
|
| 1469 |
</div>
|
| 1470 |
)
|
components/eval-join-analysis.tsx
ADDED
|
@@ -0,0 +1,697 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"use client"
|
| 2 |
+
|
| 3 |
+
import { useEffect, useMemo, useState } from "react"
|
| 4 |
+
import {
|
| 5 |
+
ClipboardList,
|
| 6 |
+
Download,
|
| 7 |
+
Filter,
|
| 8 |
+
GitBranch,
|
| 9 |
+
KeyRound,
|
| 10 |
+
Loader2,
|
| 11 |
+
Network,
|
| 12 |
+
TableProperties,
|
| 13 |
+
} from "lucide-react"
|
| 14 |
+
|
| 15 |
+
import { Badge } from "@/components/ui/badge"
|
| 16 |
+
import { Button } from "@/components/ui/button"
|
| 17 |
+
import { Card, CardContent, CardDescription, CardHeader, CardTitle } from "@/components/ui/card"
|
| 18 |
+
import { Input } from "@/components/ui/input"
|
| 19 |
+
import {
|
| 20 |
+
Select,
|
| 21 |
+
SelectContent,
|
| 22 |
+
SelectItem,
|
| 23 |
+
SelectTrigger,
|
| 24 |
+
SelectValue,
|
| 25 |
+
} from "@/components/ui/select"
|
| 26 |
+
import { Table, TableBody, TableCell, TableHead, TableHeader, TableRow } from "@/components/ui/table"
|
| 27 |
+
import { fetchEvalResearchJoins } from "@/lib/dashboard-data-client"
|
| 28 |
+
import type {
|
| 29 |
+
ResearchJoinColumn,
|
| 30 |
+
ResearchJoinColumnGroup,
|
| 31 |
+
ResearchJoinDataset,
|
| 32 |
+
ResearchJoinGrain,
|
| 33 |
+
ResearchJoinRow,
|
| 34 |
+
} from "@/lib/research-join-types"
|
| 35 |
+
import { cn } from "@/lib/utils"
|
| 36 |
+
|
| 37 |
+
/** Column groups that the group toggle never deselects. */
const CORE_GROUPS: ResearchJoinColumnGroup[] = [
  "identity",
  "metric",
  "model",
  "score",
]

/**
 * Optional column groups offered as toggle buttons under "Joined Field Groups".
 * Each entry pairs a column group with the label and help text shown on its button.
 */
const JOIN_GROUPS: Array<{
  group: ResearchJoinColumnGroup
  label: string
  description: string
}> = [
  { group: "hierarchy", label: "Hierarchy", description: "Family, composite, leaf, and component keys." },
  { group: "source", label: "Source", description: "Source buckets, organizations, and evaluator relationship." },
  { group: "instance", label: "Instances", description: "Sample-link status and detailed result URLs." },
  { group: "quality", label: "Quality", description: "Sample sizes, uncertainty, timestamps, and config presence." },
]
|
| 64 |
+
|
| 65 |
+
/**
 * Turns an arbitrary cell value into the text shown in the table.
 *
 * - `null`, `undefined`, the empty string, and non-finite numbers become "N/A".
 * - Booleans become "Yes" / "No".
 * - Numbers with magnitude >= 100 use the runtime's locale formatting;
 *   smaller numbers are rounded to at most four decimals with trailing
 *   zeros dropped.
 * - Anything else is converted with `String`.
 */
function formatCellValue(value: unknown) {
  const placeholder = "N/A"

  if (value === null || value === undefined || value === "") {
    return placeholder
  }

  switch (typeof value) {
    case "boolean":
      return value ? "Yes" : "No"
    case "number": {
      if (!Number.isFinite(value)) {
        return placeholder
      }
      if (Math.abs(value) >= 100) {
        return value.toLocaleString()
      }
      // Round-trip through Number to strip the zero padding toFixed adds.
      const rounded = Number(value.toFixed(4))
      return rounded.toString()
    }
    default:
      return String(value)
  }
}
|
| 83 |
+
|
| 84 |
+
/**
 * Looks up a field on a join row by its column key.
 *
 * The key arrives as a plain string (column keys come from dataset metadata),
 * so it is cast to a row property name before indexing.
 */
function getRowValue(row: ResearchJoinRow, key: string) {
  const field = key as keyof ResearchJoinRow
  return row[field]
}
|
| 87 |
+
|
| 88 |
+
/**
 * Collects the keys of every column flagged `defaultVisible`.
 *
 * @param columns Column definitions to scan.
 * @returns A set of the default-visible column keys, in column order.
 */
function getDefaultColumnKeys(columns: ResearchJoinColumn[]) {
  const defaultKeys = new Set<ResearchJoinColumn["key"]>()
  for (const column of columns) {
    if (column.defaultVisible) {
      defaultKeys.add(column.key)
    }
  }
  return defaultKeys
}
|
| 91 |
+
|
| 92 |
+
/**
 * Returns a sorted copy of `rows`; the input array is left untouched.
 *
 * When both values under `sortKey` are numbers they are compared numerically.
 * Otherwise both are rendered with `formatCellValue` and compared as text,
 * case-insensitively and with embedded digit runs ordered numerically.
 *
 * @param rows Rows to sort.
 * @param sortKey Column key to sort by.
 * @param direction "asc" for ascending, "desc" for descending.
 */
function sortRows(rows: ResearchJoinRow[], sortKey: string, direction: "asc" | "desc") {
  // +1 keeps the natural order, -1 flips it.
  const sign = direction === "asc" ? 1 : -1
  const ordered = Array.from(rows)

  ordered.sort((a, b) => {
    const aValue = getRowValue(a, sortKey)
    const bValue = getRowValue(b, sortKey)

    if (typeof aValue === "number" && typeof bValue === "number") {
      return sign * (aValue - bValue)
    }

    const aText = formatCellValue(aValue)
    const bText = formatCellValue(bValue)
    const textOrder = aText.localeCompare(bText, undefined, {
      numeric: true,
      sensitivity: "base",
    })
    return sign * textOrder
  })

  return ordered
}
|
| 108 |
+
|
| 109 |
+
/**
 * Serializes one value as a CSV field.
 *
 * Finite numbers are written with `String(value)` so the export keeps full
 * precision and stays locale-independent. The table's display formatter
 * rounds to four decimals and inserts locale grouping separators, which is
 * lossy for data that will be re-parsed.
 *
 * Non-numeric values keep the table's conventions: "N/A" for null, undefined,
 * the empty string and non-finite numbers; "Yes" / "No" for booleans.
 *
 * The field is wrapped in double quotes, with inner quotes doubled, whenever
 * it contains a quote, a comma, or any line break character (CR or LF).
 *
 * @param value Raw cell value or header key.
 * @returns The CSV-safe field text.
 */
function escapeCsvCell(value: unknown) {
  let text: string
  if (value == null || value === "") {
    text = "N/A"
  } else if (typeof value === "boolean") {
    text = value ? "Yes" : "No"
  } else if (typeof value === "number") {
    // Raw numeric text, not the display format, so no precision is lost.
    text = Number.isFinite(value) ? String(value) : "N/A"
  } else {
    text = String(value)
  }

  // A bare carriage return also splits rows in many parsers, so quote on \r too.
  if (/[",\r\n]/.test(text)) {
    return `"${text.replace(/"/g, '""')}"`
  }
  return text
}
|
| 116 |
+
|
| 117 |
+
/**
 * Triggers a browser download of `text` as a file.
 *
 * Wraps the text in a Blob, points a temporary anchor element at an object
 * URL for it, clicks the anchor, then removes the anchor and revokes the URL.
 *
 * @param filename Name suggested for the saved file.
 * @param text File contents.
 * @param mimeType MIME type given to the Blob.
 */
function downloadText(filename: string, text: string, mimeType: string) {
  const objectUrl = URL.createObjectURL(new Blob([text], { type: mimeType }))

  const anchor = document.createElement("a")
  anchor.href = objectUrl
  anchor.download = filename

  // The anchor is attached to the document only for the duration of the click.
  const host = document.body
  host.appendChild(anchor)
  anchor.click()
  host.removeChild(anchor)

  URL.revokeObjectURL(objectUrl)
}
|
| 128 |
+
|
| 129 |
+
/**
 * Builds the plain-text "join recipe" that the copy button places on the clipboard.
 *
 * Looks up `grain` among the dataset's available grains. If it is found, its
 * label and join keys are used; otherwise the grain id itself stands in for
 * the label and a fixed three-key default is used for the join keys.
 *
 * @param dataset Loaded research join dataset.
 * @param grain Grain the recipe should describe.
 * @returns Four lines of text joined with "\n".
 */
function getJoinRecipe(dataset: ResearchJoinDataset, grain: ResearchJoinGrain) {
  const match = dataset.available_grains.find((candidate) => candidate.grain === grain)
  const grainLabel = match?.label ?? grain
  const joinKeys = match?.join_keys ?? ["eval_summary_id", "metric_summary_id", "model_route_id"]

  const recipeLines = [
    `Base grain: ${grainLabel}`,
    `Join keys: ${joinKeys.join(" + ")}`,
    "Keep source_name explicit; do not join across evaluation_name alone.",
    "Add hierarchy, source, instance, or quality fields, then export the filtered rows.",
  ]
  return recipeLines.join("\n")
}
|
| 140 |
+
|
| 141 |
+
export function EvalJoinAnalysis({ evalId }: { evalId: string }) {
|
| 142 |
+
const [dataset, setDataset] = useState<ResearchJoinDataset | null>(null)
|
| 143 |
+
const [loading, setLoading] = useState(true)
|
| 144 |
+
const [error, setError] = useState<string | null>(null)
|
| 145 |
+
const [selectedGrain, setSelectedGrain] = useState<ResearchJoinGrain>("model_metric_source")
|
| 146 |
+
const [metricFilter, setMetricFilter] = useState("all")
|
| 147 |
+
const [sourceFilter, setSourceFilter] = useState("all")
|
| 148 |
+
const [relationshipFilter, setRelationshipFilter] = useState("all")
|
| 149 |
+
const [instanceFilter, setInstanceFilter] = useState("all")
|
| 150 |
+
const [query, setQuery] = useState("")
|
| 151 |
+
const [sortKey, setSortKey] = useState("rank")
|
| 152 |
+
const [sortDirection, setSortDirection] = useState<"asc" | "desc">("asc")
|
| 153 |
+
const [selectedColumns, setSelectedColumns] = useState<Set<string>>(new Set())
|
| 154 |
+
const [copiedRecipe, setCopiedRecipe] = useState(false)
|
| 155 |
+
|
| 156 |
+
useEffect(() => {
|
| 157 |
+
let isCancelled = false
|
| 158 |
+
|
| 159 |
+
const load = async () => {
|
| 160 |
+
try {
|
| 161 |
+
setLoading(true)
|
| 162 |
+
setError(null)
|
| 163 |
+
const nextDataset = await fetchEvalResearchJoins(evalId)
|
| 164 |
+
|
| 165 |
+
if (isCancelled) {
|
| 166 |
+
return
|
| 167 |
+
}
|
| 168 |
+
|
| 169 |
+
setDataset(nextDataset)
|
| 170 |
+
const defaultGrain =
|
| 171 |
+
nextDataset.available_grains.find((grain) => grain.grain === "model_metric_source") ??
|
| 172 |
+
nextDataset.available_grains[0]
|
| 173 |
+
if (defaultGrain) {
|
| 174 |
+
setSelectedGrain(defaultGrain.grain)
|
| 175 |
+
}
|
| 176 |
+
setSelectedColumns(getDefaultColumnKeys(nextDataset.columns))
|
| 177 |
+
} catch (err) {
|
| 178 |
+
if (!isCancelled) {
|
| 179 |
+
setError(err instanceof Error ? err.message : "Failed to load research joins")
|
| 180 |
+
}
|
| 181 |
+
} finally {
|
| 182 |
+
if (!isCancelled) {
|
| 183 |
+
setLoading(false)
|
| 184 |
+
}
|
| 185 |
+
}
|
| 186 |
+
}
|
| 187 |
+
|
| 188 |
+
load()
|
| 189 |
+
|
| 190 |
+
return () => {
|
| 191 |
+
isCancelled = true
|
| 192 |
+
}
|
| 193 |
+
}, [evalId])
|
| 194 |
+
|
| 195 |
+
const visibleColumns = useMemo(() => {
|
| 196 |
+
if (!dataset) {
|
| 197 |
+
return []
|
| 198 |
+
}
|
| 199 |
+
|
| 200 |
+
return dataset.columns.filter((column) => selectedColumns.has(column.key))
|
| 201 |
+
}, [dataset, selectedColumns])
|
| 202 |
+
|
| 203 |
+
const selectedGrainOption = dataset?.available_grains.find((option) => option.grain === selectedGrain)
|
| 204 |
+
const selectedJoinKeys = selectedGrainOption?.join_keys ?? []
|
| 205 |
+
|
| 206 |
+
const filteredRows = useMemo(() => {
|
| 207 |
+
if (!dataset) {
|
| 208 |
+
return []
|
| 209 |
+
}
|
| 210 |
+
|
| 211 |
+
const normalizedQuery = query.trim().toLowerCase()
|
| 212 |
+
const rows = dataset.rows.filter((row) => {
|
| 213 |
+
const grainMatches =
|
| 214 |
+
selectedGrain === "model_metric_instance"
|
| 215 |
+
? row.has_instance_data
|
| 216 |
+
: row.join_grain === selectedGrain
|
| 217 |
+
|
| 218 |
+
if (!grainMatches) {
|
| 219 |
+
return false
|
| 220 |
+
}
|
| 221 |
+
|
| 222 |
+
if (metricFilter !== "all" && row.metric_name !== metricFilter) {
|
| 223 |
+
return false
|
| 224 |
+
}
|
| 225 |
+
|
| 226 |
+
if (sourceFilter !== "all" && row.source_name !== sourceFilter) {
|
| 227 |
+
return false
|
| 228 |
+
}
|
| 229 |
+
|
| 230 |
+
if (relationshipFilter !== "all" && row.evaluator_relationship !== relationshipFilter) {
|
| 231 |
+
return false
|
| 232 |
+
}
|
| 233 |
+
|
| 234 |
+
if (instanceFilter === "linked" && !row.has_instance_data) {
|
| 235 |
+
return false
|
| 236 |
+
}
|
| 237 |
+
|
| 238 |
+
if (instanceFilter === "unlinked" && row.has_instance_data) {
|
| 239 |
+
return false
|
| 240 |
+
}
|
| 241 |
+
|
| 242 |
+
if (!normalizedQuery) {
|
| 243 |
+
return true
|
| 244 |
+
}
|
| 245 |
+
|
| 246 |
+
return [
|
| 247 |
+
row.model_name,
|
| 248 |
+
row.model_id,
|
| 249 |
+
row.developer,
|
| 250 |
+
row.metric_name,
|
| 251 |
+
row.source_name,
|
| 252 |
+
row.source_organization_name,
|
| 253 |
+
row.component_name,
|
| 254 |
+
]
|
| 255 |
+
.filter(Boolean)
|
| 256 |
+
.some((value) => String(value).toLowerCase().includes(normalizedQuery))
|
| 257 |
+
})
|
| 258 |
+
|
| 259 |
+
return sortRows(rows, sortKey, sortDirection)
|
| 260 |
+
}, [
|
| 261 |
+
dataset,
|
| 262 |
+
instanceFilter,
|
| 263 |
+
metricFilter,
|
| 264 |
+
query,
|
| 265 |
+
relationshipFilter,
|
| 266 |
+
selectedGrain,
|
| 267 |
+
sortDirection,
|
| 268 |
+
sortKey,
|
| 269 |
+
sourceFilter,
|
| 270 |
+
])
|
| 271 |
+
|
| 272 |
+
const previewRows = filteredRows.slice(0, 100)
|
| 273 |
+
|
| 274 |
+
const facets = useMemo(() => {
|
| 275 |
+
if (!dataset) {
|
| 276 |
+
return {
|
| 277 |
+
metrics: [],
|
| 278 |
+
sources: [],
|
| 279 |
+
relationships: [],
|
| 280 |
+
}
|
| 281 |
+
}
|
| 282 |
+
|
| 283 |
+
const byKey = new Map(dataset.facets.map((facet) => [facet.key, facet.values]))
|
| 284 |
+
return {
|
| 285 |
+
metrics: byKey.get("metric_name") ?? [],
|
| 286 |
+
sources: byKey.get("source_name") ?? [],
|
| 287 |
+
relationships: byKey.get("evaluator_relationship") ?? [],
|
| 288 |
+
}
|
| 289 |
+
}, [dataset])
|
| 290 |
+
|
| 291 |
+
const toggleColumn = (key: string) => {
|
| 292 |
+
setSelectedColumns((current) => {
|
| 293 |
+
const next = new Set(current)
|
| 294 |
+
if (next.has(key)) {
|
| 295 |
+
next.delete(key)
|
| 296 |
+
} else {
|
| 297 |
+
next.add(key)
|
| 298 |
+
}
|
| 299 |
+
return next
|
| 300 |
+
})
|
| 301 |
+
}
|
| 302 |
+
|
| 303 |
+
const toggleJoinGroup = (group: ResearchJoinColumnGroup) => {
|
| 304 |
+
if (!dataset) {
|
| 305 |
+
return
|
| 306 |
+
}
|
| 307 |
+
|
| 308 |
+
const groupKeys = dataset.columns
|
| 309 |
+
.filter((column) => column.group === group)
|
| 310 |
+
.map((column) => column.key)
|
| 311 |
+
|
| 312 |
+
setSelectedColumns((current) => {
|
| 313 |
+
const next = new Set(current)
|
| 314 |
+
const allSelected = groupKeys.every((key) => next.has(key))
|
| 315 |
+
for (const key of groupKeys) {
|
| 316 |
+
if (allSelected && !CORE_GROUPS.includes(group)) {
|
| 317 |
+
next.delete(key)
|
| 318 |
+
} else {
|
| 319 |
+
next.add(key)
|
| 320 |
+
}
|
| 321 |
+
}
|
| 322 |
+
return next
|
| 323 |
+
})
|
| 324 |
+
}
|
| 325 |
+
|
| 326 |
+
const exportRows = (format: "csv" | "json") => {
|
| 327 |
+
if (!dataset) {
|
| 328 |
+
return
|
| 329 |
+
}
|
| 330 |
+
|
| 331 |
+
const columns = visibleColumns.length > 0 ? visibleColumns : dataset.columns.filter((column) => column.defaultVisible)
|
| 332 |
+
const filenameBase = `${dataset.eval_summary_id}-${selectedGrain}-join`
|
| 333 |
+
|
| 334 |
+
if (format === "json") {
|
| 335 |
+
const payload = filteredRows.map((row) =>
|
| 336 |
+
Object.fromEntries(columns.map((column) => [column.key, getRowValue(row, column.key)]))
|
| 337 |
+
)
|
| 338 |
+
downloadText(`${filenameBase}.json`, JSON.stringify(payload, null, 2), "application/json;charset=utf-8")
|
| 339 |
+
return
|
| 340 |
+
}
|
| 341 |
+
|
| 342 |
+
const header = columns.map((column) => escapeCsvCell(column.key)).join(",")
|
| 343 |
+
const body = filteredRows
|
| 344 |
+
.map((row) => columns.map((column) => escapeCsvCell(getRowValue(row, column.key))).join(","))
|
| 345 |
+
.join("\n")
|
| 346 |
+
downloadText(`${filenameBase}.csv`, `${header}\n${body}`, "text/csv;charset=utf-8")
|
| 347 |
+
}
|
| 348 |
+
|
| 349 |
+
const copyRecipe = async () => {
|
| 350 |
+
if (!dataset) {
|
| 351 |
+
return
|
| 352 |
+
}
|
| 353 |
+
|
| 354 |
+
try {
|
| 355 |
+
await navigator.clipboard.writeText(getJoinRecipe(dataset, selectedGrain))
|
| 356 |
+
setCopiedRecipe(true)
|
| 357 |
+
window.setTimeout(() => setCopiedRecipe(false), 1800)
|
| 358 |
+
} catch {
|
| 359 |
+
setCopiedRecipe(false)
|
| 360 |
+
}
|
| 361 |
+
}
|
| 362 |
+
|
| 363 |
+
if (loading) {
|
| 364 |
+
return (
|
| 365 |
+
<Card className="overflow-hidden">
|
| 366 |
+
<CardContent className="flex items-center gap-3 p-5 text-sm text-muted-foreground">
|
| 367 |
+
<Loader2 className="h-4 w-4 animate-spin" />
|
| 368 |
+
Loading research join dataset...
|
| 369 |
+
</CardContent>
|
| 370 |
+
</Card>
|
| 371 |
+
)
|
| 372 |
+
}
|
| 373 |
+
|
| 374 |
+
if (error || !dataset) {
|
| 375 |
+
return (
|
| 376 |
+
<Card className="overflow-hidden border-amber-200/70 bg-amber-50/30 dark:border-amber-900/50 dark:bg-amber-950/10">
|
| 377 |
+
<CardContent className="p-5 text-sm text-amber-900 dark:text-amber-100">
|
| 378 |
+
Research joins are unavailable for this benchmark.
|
| 379 |
+
</CardContent>
|
| 380 |
+
</Card>
|
| 381 |
+
)
|
| 382 |
+
}
|
| 383 |
+
|
| 384 |
+
return (
|
| 385 |
+
<Card className="overflow-hidden">
|
| 386 |
+
<CardHeader className="border-b bg-muted/10">
|
| 387 |
+
<div className="flex flex-col gap-4 xl:flex-row xl:items-start xl:justify-between">
|
| 388 |
+
<div className="space-y-2">
|
| 389 |
+
<div className="flex items-center gap-2">
|
| 390 |
+
<Network className="h-5 w-5 text-primary" />
|
| 391 |
+
<CardTitle className="text-xl">Research Join Builder</CardTitle>
|
| 392 |
+
</div>
|
| 393 |
+
<CardDescription>
|
| 394 |
+
Build source-safe tables from benchmark rows, metric identities, model routes, hierarchy keys, and instance links.
|
| 395 |
+
</CardDescription>
|
| 396 |
+
</div>
|
| 397 |
+
|
| 398 |
+
<div className="flex flex-wrap items-center gap-2">
|
| 399 |
+
<Badge variant={dataset.source === "query_api" ? "default" : "secondary"}>
|
| 400 |
+
{dataset.source === "query_api" ? "Query API" : "Artifact fallback"}
|
| 401 |
+
</Badge>
|
| 402 |
+
<Badge variant="outline">{filteredRows.length.toLocaleString()} joined rows</Badge>
|
| 403 |
+
</div>
|
| 404 |
+
</div>
|
| 405 |
+
</CardHeader>
|
| 406 |
+
|
| 407 |
+
<CardContent className="space-y-5 p-4 sm:p-5">
|
| 408 |
+
<section className="grid gap-3 lg:grid-cols-3">
|
| 409 |
+
{dataset.join_steps.map((step) => (
|
| 410 |
+
<div key={step.step} className="rounded-2xl border bg-background/70 p-4">
|
| 411 |
+
<div className="flex items-center gap-2">
|
| 412 |
+
<span className="flex h-7 w-7 items-center justify-center rounded-full bg-primary text-xs font-semibold text-primary-foreground">
|
| 413 |
+
{step.step}
|
| 414 |
+
</span>
|
| 415 |
+
<div className="font-semibold">{step.title}</div>
|
| 416 |
+
</div>
|
| 417 |
+
<p className="mt-2 text-sm leading-6 text-muted-foreground">{step.description}</p>
|
| 418 |
+
<div className="mt-3 flex flex-wrap gap-1.5">
|
| 419 |
+
{step.keys.map((key) => (
|
| 420 |
+
<span key={key} className="rounded-md border bg-muted/20 px-2 py-1 font-mono text-[11px]">
|
| 421 |
+
{key}
|
| 422 |
+
</span>
|
| 423 |
+
))}
|
| 424 |
+
</div>
|
| 425 |
+
</div>
|
| 426 |
+
))}
|
| 427 |
+
</section>
|
| 428 |
+
|
| 429 |
+
<section className="grid gap-4 xl:grid-cols-[minmax(0,0.95fr)_minmax(0,1.05fr)]">
|
| 430 |
+
<div className="space-y-4 rounded-2xl border bg-muted/5 p-4">
|
| 431 |
+
<div className="flex items-center gap-2">
|
| 432 |
+
<GitBranch className="h-4 w-4 text-muted-foreground" />
|
| 433 |
+
<div className="text-sm font-semibold">Base Grain</div>
|
| 434 |
+
</div>
|
| 435 |
+
|
| 436 |
+
<Select value={selectedGrain} onValueChange={(value) => setSelectedGrain(value as ResearchJoinGrain)}>
|
| 437 |
+
<SelectTrigger className="w-full">
|
| 438 |
+
<SelectValue />
|
| 439 |
+
</SelectTrigger>
|
| 440 |
+
<SelectContent>
|
| 441 |
+
{dataset.available_grains.map((grain) => (
|
| 442 |
+
<SelectItem key={grain.grain} value={grain.grain}>
|
| 443 |
+
{grain.label} ({grain.row_count.toLocaleString()})
|
| 444 |
+
</SelectItem>
|
| 445 |
+
))}
|
| 446 |
+
</SelectContent>
|
| 447 |
+
</Select>
|
| 448 |
+
|
| 449 |
+
{selectedGrainOption && (
|
| 450 |
+
<div className="rounded-xl border bg-background p-3">
|
| 451 |
+
<div className="text-sm font-medium">{selectedGrainOption.description}</div>
|
| 452 |
+
<div className="mt-3 flex items-center gap-2 text-xs font-semibold uppercase tracking-[0.16em] text-muted-foreground">
|
| 453 |
+
<KeyRound className="h-3.5 w-3.5" />
|
| 454 |
+
Join Keys
|
| 455 |
+
</div>
|
| 456 |
+
<div className="mt-2 flex flex-wrap gap-1.5">
|
| 457 |
+
{selectedJoinKeys.map((key) => (
|
| 458 |
+
<span key={key} className="rounded-md border bg-muted/30 px-2 py-1 font-mono text-[11px]">
|
| 459 |
+
{key}
|
| 460 |
+
</span>
|
| 461 |
+
))}
|
| 462 |
+
</div>
|
| 463 |
+
</div>
|
| 464 |
+
)}
|
| 465 |
+
</div>
|
| 466 |
+
|
| 467 |
+
<div className="space-y-4 rounded-2xl border bg-muted/5 p-4">
|
| 468 |
+
<div className="flex items-center gap-2">
|
| 469 |
+
<TableProperties className="h-4 w-4 text-muted-foreground" />
|
| 470 |
+
<div className="text-sm font-semibold">Joined Field Groups</div>
|
| 471 |
+
</div>
|
| 472 |
+
|
| 473 |
+
<div className="grid gap-2 sm:grid-cols-2">
|
| 474 |
+
{JOIN_GROUPS.map((joinGroup) => {
|
| 475 |
+
const groupColumns = dataset.columns.filter((column) => column.group === joinGroup.group)
|
| 476 |
+
const active = groupColumns.some((column) => selectedColumns.has(column.key))
|
| 477 |
+
|
| 478 |
+
return (
|
| 479 |
+
<button
|
| 480 |
+
key={joinGroup.group}
|
| 481 |
+
type="button"
|
| 482 |
+
onClick={() => toggleJoinGroup(joinGroup.group)}
|
| 483 |
+
className={cn(
|
| 484 |
+
"rounded-xl border p-3 text-left transition-colors",
|
| 485 |
+
active
|
| 486 |
+
? "border-foreground/20 bg-background shadow-sm"
|
| 487 |
+
: "border-border/70 bg-muted/20 text-muted-foreground hover:bg-muted/30"
|
| 488 |
+
)}
|
| 489 |
+
>
|
| 490 |
+
<div className="text-sm font-semibold">{joinGroup.label}</div>
|
| 491 |
+
<div className="mt-1 text-xs leading-5 text-muted-foreground">{joinGroup.description}</div>
|
| 492 |
+
</button>
|
| 493 |
+
)
|
| 494 |
+
})}
|
| 495 |
+
</div>
|
| 496 |
+
</div>
|
| 497 |
+
</section>
|
| 498 |
+
|
| 499 |
+
<section className="space-y-3 rounded-2xl border bg-background/70 p-4">
|
| 500 |
+
<div className="flex flex-col gap-3 lg:flex-row lg:items-center lg:justify-between">
|
| 501 |
+
<div className="flex items-center gap-2">
|
| 502 |
+
<Filter className="h-4 w-4 text-muted-foreground" />
|
| 503 |
+
<div className="text-sm font-semibold">Filters and Export</div>
|
| 504 |
+
</div>
|
| 505 |
+
<div className="flex flex-wrap gap-2">
|
| 506 |
+
<Button variant="outline" size="sm" className="gap-2" onClick={copyRecipe}>
|
| 507 |
+
<ClipboardList className="h-4 w-4" />
|
| 508 |
+
{copiedRecipe ? "Copied" : "Copy recipe"}
|
| 509 |
+
</Button>
|
| 510 |
+
<Button variant="outline" size="sm" className="gap-2" onClick={() => exportRows("csv")}>
|
| 511 |
+
<Download className="h-4 w-4" />
|
| 512 |
+
CSV
|
| 513 |
+
</Button>
|
| 514 |
+
<Button variant="outline" size="sm" className="gap-2" onClick={() => exportRows("json")}>
|
| 515 |
+
<Download className="h-4 w-4" />
|
| 516 |
+
JSON
|
| 517 |
+
</Button>
|
| 518 |
+
</div>
|
| 519 |
+
</div>
|
| 520 |
+
|
| 521 |
+
<div className="grid gap-3 md:grid-cols-2 xl:grid-cols-6">
|
| 522 |
+
<Input
|
| 523 |
+
value={query}
|
| 524 |
+
onChange={(event) => setQuery(event.target.value)}
|
| 525 |
+
placeholder="Search model, metric, source..."
|
| 526 |
+
className="xl:col-span-2"
|
| 527 |
+
/>
|
| 528 |
+
|
| 529 |
+
<Select value={metricFilter} onValueChange={setMetricFilter}>
|
| 530 |
+
<SelectTrigger className="w-full">
|
| 531 |
+
<SelectValue placeholder="Metric" />
|
| 532 |
+
</SelectTrigger>
|
| 533 |
+
<SelectContent>
|
| 534 |
+
<SelectItem value="all">All metrics</SelectItem>
|
| 535 |
+
{facets.metrics.map((facet) => (
|
| 536 |
+
<SelectItem key={facet.value} value={facet.value}>
|
| 537 |
+
{facet.label} ({facet.count})
|
| 538 |
+
</SelectItem>
|
| 539 |
+
))}
|
| 540 |
+
</SelectContent>
|
| 541 |
+
</Select>
|
| 542 |
+
|
| 543 |
+
<Select value={sourceFilter} onValueChange={setSourceFilter}>
|
| 544 |
+
<SelectTrigger className="w-full">
|
| 545 |
+
<SelectValue placeholder="Source" />
|
| 546 |
+
</SelectTrigger>
|
| 547 |
+
<SelectContent>
|
| 548 |
+
<SelectItem value="all">All sources</SelectItem>
|
| 549 |
+
{facets.sources.map((facet) => (
|
| 550 |
+
<SelectItem key={facet.value} value={facet.value}>
|
| 551 |
+
{facet.label} ({facet.count})
|
| 552 |
+
</SelectItem>
|
| 553 |
+
))}
|
| 554 |
+
</SelectContent>
|
| 555 |
+
</Select>
|
| 556 |
+
|
| 557 |
+
<Select value={relationshipFilter} onValueChange={setRelationshipFilter}>
|
| 558 |
+
<SelectTrigger className="w-full">
|
| 559 |
+
<SelectValue placeholder="Relationship" />
|
| 560 |
+
</SelectTrigger>
|
| 561 |
+
<SelectContent>
|
| 562 |
+
<SelectItem value="all">All relationships</SelectItem>
|
| 563 |
+
{facets.relationships.map((facet) => (
|
| 564 |
+
<SelectItem key={facet.value} value={facet.value}>
|
| 565 |
+
{facet.label} ({facet.count})
|
| 566 |
+
</SelectItem>
|
| 567 |
+
))}
|
| 568 |
+
</SelectContent>
|
| 569 |
+
</Select>
|
| 570 |
+
|
| 571 |
+
<Select value={instanceFilter} onValueChange={setInstanceFilter}>
|
| 572 |
+
<SelectTrigger className="w-full">
|
| 573 |
+
<SelectValue placeholder="Instances" />
|
| 574 |
+
</SelectTrigger>
|
| 575 |
+
<SelectContent>
|
| 576 |
+
<SelectItem value="all">All instance states</SelectItem>
|
| 577 |
+
<SelectItem value="linked">Has instances</SelectItem>
|
| 578 |
+
<SelectItem value="unlinked">No instances</SelectItem>
|
| 579 |
+
</SelectContent>
|
| 580 |
+
</Select>
|
| 581 |
+
</div>
|
| 582 |
+
|
| 583 |
+
<div className="grid gap-3 md:grid-cols-2 xl:grid-cols-[minmax(0,1fr)_16rem_10rem]">
|
| 584 |
+
<div className="flex flex-wrap gap-1.5">
|
| 585 |
+
{dataset.columns.map((column) => (
|
| 586 |
+
<button
|
| 587 |
+
key={column.key}
|
| 588 |
+
type="button"
|
| 589 |
+
onClick={() => toggleColumn(column.key)}
|
| 590 |
+
title={column.description}
|
| 591 |
+
className={cn(
|
| 592 |
+
"rounded-md border px-2.5 py-1 text-xs font-medium transition-colors",
|
| 593 |
+
selectedColumns.has(column.key)
|
| 594 |
+
? "border-foreground/20 bg-muted text-foreground"
|
| 595 |
+
: "border-border/70 bg-background text-muted-foreground hover:bg-muted/20"
|
| 596 |
+
)}
|
| 597 |
+
>
|
| 598 |
+
{column.label}
|
| 599 |
+
</button>
|
| 600 |
+
))}
|
| 601 |
+
</div>
|
| 602 |
+
|
| 603 |
+
<Select value={sortKey} onValueChange={setSortKey}>
|
| 604 |
+
<SelectTrigger className="w-full">
|
| 605 |
+
<SelectValue placeholder="Sort" />
|
| 606 |
+
</SelectTrigger>
|
| 607 |
+
<SelectContent>
|
| 608 |
+
{dataset.columns.map((column) => (
|
| 609 |
+
<SelectItem key={column.key} value={column.key}>
|
| 610 |
+
Sort: {column.label}
|
| 611 |
+
</SelectItem>
|
| 612 |
+
))}
|
| 613 |
+
</SelectContent>
|
| 614 |
+
</Select>
|
| 615 |
+
|
| 616 |
+
<Select value={sortDirection} onValueChange={(value) => setSortDirection(value as "asc" | "desc")}>
|
| 617 |
+
<SelectTrigger className="w-full">
|
| 618 |
+
<SelectValue />
|
| 619 |
+
</SelectTrigger>
|
| 620 |
+
<SelectContent>
|
| 621 |
+
<SelectItem value="asc">Ascending</SelectItem>
|
| 622 |
+
<SelectItem value="desc">Descending</SelectItem>
|
| 623 |
+
</SelectContent>
|
| 624 |
+
</Select>
|
| 625 |
+
</div>
|
| 626 |
+
</section>
|
| 627 |
+
|
| 628 |
+
{dataset.warnings.length > 0 && (
|
| 629 |
+
<div className="rounded-2xl border border-amber-200/80 bg-amber-50/70 p-3 text-sm leading-6 text-amber-950 dark:border-amber-900/50 dark:bg-amber-950/20 dark:text-amber-100">
|
| 630 |
+
{dataset.warnings.map((warning) => (
|
| 631 |
+
<div key={warning}>{warning}</div>
|
| 632 |
+
))}
|
| 633 |
+
</div>
|
| 634 |
+
)}
|
| 635 |
+
|
| 636 |
+
<div className="overflow-hidden rounded-2xl border">
|
| 637 |
+
<Table className="min-w-[980px]">
|
| 638 |
+
<TableHeader>
|
| 639 |
+
<TableRow className="hover:bg-transparent">
|
| 640 |
+
{visibleColumns.map((column) => (
|
| 641 |
+
<TableHead key={column.key} className="px-3">
|
| 642 |
+
<span className={cn(column.isJoinKey && "inline-flex items-center gap-1")}>
|
| 643 |
+
{column.isJoinKey && <KeyRound className="h-3 w-3 text-muted-foreground" />}
|
| 644 |
+
{column.label}
|
| 645 |
+
</span>
|
| 646 |
+
</TableHead>
|
| 647 |
+
))}
|
| 648 |
+
</TableRow>
|
| 649 |
+
</TableHeader>
|
| 650 |
+
<TableBody>
|
| 651 |
+
{previewRows.map((row) => (
|
| 652 |
+
<TableRow key={row.row_id}>
|
| 653 |
+
{visibleColumns.map((column) => {
|
| 654 |
+
const value = getRowValue(row, column.key)
|
| 655 |
+
const isUrl = column.type === "url" && typeof value === "string" && value.startsWith("http")
|
| 656 |
+
|
| 657 |
+
return (
|
| 658 |
+
<TableCell key={column.key} className="max-w-[22rem] whitespace-normal px-3 align-top">
|
| 659 |
+
{isUrl ? (
|
| 660 |
+
<a
|
| 661 |
+
href={value}
|
| 662 |
+
target="_blank"
|
| 663 |
+
rel="noreferrer"
|
| 664 |
+
className="text-primary underline-offset-4 hover:underline"
|
| 665 |
+
>
|
| 666 |
+
Open
|
| 667 |
+
</a>
|
| 668 |
+
) : (
|
| 669 |
+
<span className={cn(column.isJoinKey && "font-mono text-xs")}>
|
| 670 |
+
{formatCellValue(value)}
|
| 671 |
+
</span>
|
| 672 |
+
)}
|
| 673 |
+
</TableCell>
|
| 674 |
+
)
|
| 675 |
+
})}
|
| 676 |
+
</TableRow>
|
| 677 |
+
))}
|
| 678 |
+
{previewRows.length === 0 && (
|
| 679 |
+
<TableRow>
|
| 680 |
+
<TableCell colSpan={Math.max(visibleColumns.length, 1)} className="px-6 py-10 text-center text-sm text-muted-foreground">
|
| 681 |
+
No rows match the selected join filters.
|
| 682 |
+
</TableCell>
|
| 683 |
+
</TableRow>
|
| 684 |
+
)}
|
| 685 |
+
</TableBody>
|
| 686 |
+
</Table>
|
| 687 |
+
</div>
|
| 688 |
+
|
| 689 |
+
{filteredRows.length > previewRows.length && (
|
| 690 |
+
<div className="text-center text-xs text-muted-foreground">
|
| 691 |
+
Showing first {previewRows.length.toLocaleString()} rows. Export includes all {filteredRows.length.toLocaleString()} filtered rows.
|
| 692 |
+
</div>
|
| 693 |
+
)}
|
| 694 |
+
</CardContent>
|
| 695 |
+
</Card>
|
| 696 |
+
)
|
| 697 |
+
}
|
lib/benchmark-schema.ts
CHANGED
|
@@ -88,16 +88,24 @@ export interface ModelInfo {
|
|
| 88 |
}
|
| 89 |
|
| 90 |
export interface EvaluationResult {
|
|
|
|
| 91 |
evaluation_name: string
|
| 92 |
display_name?: string
|
| 93 |
canonical_display_name?: string
|
| 94 |
metric_summary_id?: string
|
|
|
|
| 95 |
metric_key?: string
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
evaluation_timestamp: string
|
| 97 |
source_data?: string[] | SourceData
|
|
|
|
| 98 |
metric_config: MetricConfig
|
| 99 |
score_details: ScoreDetails
|
| 100 |
detailed_evaluation_results_url?: string
|
|
|
|
| 101 |
generation_config?: GenerationConfig
|
| 102 |
evalcards?: { annotations?: RowAnnotations }
|
| 103 |
}
|
|
@@ -106,6 +114,10 @@ export interface MetricConfig {
|
|
| 106 |
evaluation_description: string
|
| 107 |
lower_is_better: boolean
|
| 108 |
score_type: 'continuous' | 'discrete' | 'binary'
|
|
|
|
|
|
|
|
|
|
|
|
|
| 109 |
min_score?: number
|
| 110 |
max_score?: number
|
| 111 |
unit?: string
|
|
|
|
| 88 |
}
|
| 89 |
|
| 90 |
export interface EvaluationResult {
|
| 91 |
+
evaluation_result_id?: string
|
| 92 |
evaluation_name: string
|
| 93 |
display_name?: string
|
| 94 |
canonical_display_name?: string
|
| 95 |
metric_summary_id?: string
|
| 96 |
+
metric_id?: string
|
| 97 |
metric_key?: string
|
| 98 |
+
metric_name?: string
|
| 99 |
+
metric_kind?: string
|
| 100 |
+
metric_unit?: string
|
| 101 |
+
metric_parameters?: Record<string, any>
|
| 102 |
evaluation_timestamp: string
|
| 103 |
source_data?: string[] | SourceData
|
| 104 |
+
source_record_url?: string
|
| 105 |
metric_config: MetricConfig
|
| 106 |
score_details: ScoreDetails
|
| 107 |
detailed_evaluation_results_url?: string
|
| 108 |
+
detailed_evaluation_results_meta?: unknown
|
| 109 |
generation_config?: GenerationConfig
|
| 110 |
evalcards?: { annotations?: RowAnnotations }
|
| 111 |
}
|
|
|
|
| 114 |
evaluation_description: string
|
| 115 |
lower_is_better: boolean
|
| 116 |
score_type: 'continuous' | 'discrete' | 'binary'
|
| 117 |
+
metric_id?: string
|
| 118 |
+
metric_kind?: string
|
| 119 |
+
metric_unit?: string
|
| 120 |
+
metric_parameters?: Record<string, any>
|
| 121 |
min_score?: number
|
| 122 |
max_score?: number
|
| 123 |
unit?: string
|
lib/dashboard-data-client.ts
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
import type { BackendManifestStatus, ComparisonIndex, CorpusAggregates, EvalHierarchy } from "@/lib/backend-artifacts"
|
|
|
|
| 2 |
import type { BenchmarkEvaluationCardData } from "@/components/benchmark-evaluation-card"
|
| 3 |
import type { HFEvalDetail } from "@/lib/hf-data"
|
| 4 |
import type {
|
|
@@ -103,3 +104,9 @@ export function fetchComparisonIndex() {
|
|
| 103 |
export function fetchCorpusAggregates() {
|
| 104 |
return fetchJson<CorpusAggregates>("/api/corpus-aggregates")
|
| 105 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import type { BackendManifestStatus, ComparisonIndex, CorpusAggregates, EvalHierarchy } from "@/lib/backend-artifacts"
|
| 2 |
+
import type { ResearchJoinDataset } from "@/lib/research-join-types"
|
| 3 |
import type { BenchmarkEvaluationCardData } from "@/components/benchmark-evaluation-card"
|
| 4 |
import type { HFEvalDetail } from "@/lib/hf-data"
|
| 5 |
import type {
|
|
|
|
| 104 |
export function fetchCorpusAggregates() {
|
| 105 |
return fetchJson<CorpusAggregates>("/api/corpus-aggregates")
|
| 106 |
}
|
| 107 |
+
|
| 108 |
+
/**
 * Requests the research-join dataset for one evaluation from the
 * `/api/research/eval-joins` route.
 *
 * @param evalId Evaluation identifier; URI-encoded and sent as the `id` query parameter.
 * @returns The result of `fetchJson<ResearchJoinDataset>` for that URL.
 */
export function fetchEvalResearchJoins(evalId: string) {
  const encodedId = encodeURIComponent(evalId)
  const endpoint = `/api/research/eval-joins?id=${encodedId}`

  return fetchJson<ResearchJoinDataset>(endpoint)
}
|
lib/hf-data.ts
CHANGED
|
@@ -544,6 +544,7 @@ export interface HFEvalListEntry extends SignalSummaries {
|
|
| 544 |
}
|
| 545 |
|
| 546 |
export interface HFEvalModelResult {
|
|
|
|
| 547 |
model_id: string
|
| 548 |
model_route_id: string
|
| 549 |
model_name: string
|
|
@@ -569,6 +570,10 @@ export interface HFEvalMetric {
|
|
| 569 |
legacy_eval_summary_id?: string
|
| 570 |
evaluation_name?: string
|
| 571 |
metric_name: string
|
|
|
|
|
|
|
|
|
|
|
|
|
| 572 |
metric_key: string
|
| 573 |
display_name?: string
|
| 574 |
canonical_display_name?: string
|
|
@@ -1383,6 +1388,7 @@ function flattenHierarchyNode(
|
|
| 1383 |
const modelInfo = buildModelInfoForVariant(detail, result, variantMeta)
|
| 1384 |
const inlineSamples = parseInstanceLevelData(result.instance_level_data)
|
| 1385 |
const evaluationResult: EvaluationResult = {
|
|
|
|
| 1386 |
evaluation_name: metric.metric_name || metric.evaluation_name || metric.display_name,
|
| 1387 |
display_name: metric.display_name || metric.metric_name || metric.evaluation_name,
|
| 1388 |
canonical_display_name:
|
|
@@ -1390,9 +1396,17 @@ function flattenHierarchyNode(
|
|
| 1390 |
metric.display_name ||
|
| 1391 |
`${context.benchmark ?? context.display_name ?? "Benchmark"} / ${metric.metric_name}`,
|
| 1392 |
metric_summary_id: metric.metric_summary_id,
|
|
|
|
| 1393 |
metric_key: metric.metric_key,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1394 |
evaluation_timestamp: result.retrieved_timestamp ?? detail.last_updated ?? "",
|
| 1395 |
source_data: sourceData,
|
|
|
|
| 1396 |
metric_config: metric.metric_config,
|
| 1397 |
score_details: {
|
| 1398 |
score: result.score,
|
|
@@ -1401,6 +1415,7 @@ function flattenHierarchyNode(
|
|
| 1401 |
result.detailed_evaluation_results
|
| 1402 |
),
|
| 1403 |
evalcards: result.evalcards,
|
|
|
|
| 1404 |
}
|
| 1405 |
|
| 1406 |
const existing = resultsByVariant.get(variantKey)
|
|
|
|
| 544 |
}
|
| 545 |
|
| 546 |
export interface HFEvalModelResult {
|
| 547 |
+
evaluation_result_id?: string
|
| 548 |
model_id: string
|
| 549 |
model_route_id: string
|
| 550 |
model_name: string
|
|
|
|
| 570 |
legacy_eval_summary_id?: string
|
| 571 |
evaluation_name?: string
|
| 572 |
metric_name: string
|
| 573 |
+
metric_id?: string | null
|
| 574 |
+
metric_kind?: string | null
|
| 575 |
+
metric_unit?: string | null
|
| 576 |
+
metric_parameters?: Record<string, unknown> | null
|
| 577 |
metric_key: string
|
| 578 |
display_name?: string
|
| 579 |
canonical_display_name?: string
|
|
|
|
| 1388 |
const modelInfo = buildModelInfoForVariant(detail, result, variantMeta)
|
| 1389 |
const inlineSamples = parseInstanceLevelData(result.instance_level_data)
|
| 1390 |
const evaluationResult: EvaluationResult = {
|
| 1391 |
+
evaluation_result_id: result.evaluation_result_id,
|
| 1392 |
evaluation_name: metric.metric_name || metric.evaluation_name || metric.display_name,
|
| 1393 |
display_name: metric.display_name || metric.metric_name || metric.evaluation_name,
|
| 1394 |
canonical_display_name:
|
|
|
|
| 1396 |
metric.display_name ||
|
| 1397 |
`${context.benchmark ?? context.display_name ?? "Benchmark"} / ${metric.metric_name}`,
|
| 1398 |
metric_summary_id: metric.metric_summary_id,
|
| 1399 |
+
metric_id: metric.metric_id ?? undefined,
|
| 1400 |
metric_key: metric.metric_key,
|
| 1401 |
+
metric_name: metric.metric_name,
|
| 1402 |
+
metric_kind: metric.metric_kind ?? undefined,
|
| 1403 |
+
metric_unit:
|
| 1404 |
+
metric.metric_unit ??
|
| 1405 |
+
(typeof metric.metric_config?.unit === "string" ? metric.metric_config.unit : undefined),
|
| 1406 |
+
metric_parameters: metric.metric_parameters ?? undefined,
|
| 1407 |
evaluation_timestamp: result.retrieved_timestamp ?? detail.last_updated ?? "",
|
| 1408 |
source_data: sourceData,
|
| 1409 |
+
source_record_url: result.source_record_url,
|
| 1410 |
metric_config: metric.metric_config,
|
| 1411 |
score_details: {
|
| 1412 |
score: result.score,
|
|
|
|
| 1415 |
result.detailed_evaluation_results
|
| 1416 |
),
|
| 1417 |
evalcards: result.evalcards,
|
| 1418 |
+
detailed_evaluation_results_meta: result.detailed_evaluation_results_meta,
|
| 1419 |
}
|
| 1420 |
|
| 1421 |
const existing = resultsByVariant.get(variantKey)
|
lib/model-data.ts
CHANGED
|
@@ -729,18 +729,26 @@ function toModelResultsForMetric(
|
|
| 729 |
}
|
| 730 |
|
| 731 |
const evaluationResult: EvaluationResult = {
|
|
|
|
| 732 |
evaluation_name: metric.metric_name || metric.evaluation_name || metric.display_name || "",
|
| 733 |
display_name: metric.display_name || metric.metric_name || metric.evaluation_name,
|
| 734 |
canonical_display_name: metric.canonical_display_name,
|
| 735 |
metric_summary_id: metric.metric_summary_id,
|
|
|
|
| 736 |
metric_key: metric.metric_key,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 737 |
evaluation_timestamp: evaluationTimestamp,
|
|
|
|
| 738 |
metric_config: metricConfig,
|
| 739 |
score_details: { score: mr.score ?? 0 },
|
| 740 |
detailed_evaluation_results_url: getCanonicalInstanceResultsUrl(
|
| 741 |
mr.detailed_evaluation_results
|
| 742 |
),
|
| 743 |
evalcards: mr.evalcards,
|
|
|
|
| 744 |
}
|
| 745 |
|
| 746 |
return {
|
|
|
|
| 729 |
}
|
| 730 |
|
| 731 |
const evaluationResult: EvaluationResult = {
|
| 732 |
+
evaluation_result_id: mr.evaluation_result_id,
|
| 733 |
evaluation_name: metric.metric_name || metric.evaluation_name || metric.display_name || "",
|
| 734 |
display_name: metric.display_name || metric.metric_name || metric.evaluation_name,
|
| 735 |
canonical_display_name: metric.canonical_display_name,
|
| 736 |
metric_summary_id: metric.metric_summary_id,
|
| 737 |
+
metric_id: metric.metric_id ?? undefined,
|
| 738 |
metric_key: metric.metric_key,
|
| 739 |
+
metric_name: metric.metric_name,
|
| 740 |
+
metric_kind: metric.metric_kind ?? undefined,
|
| 741 |
+
metric_unit: metric.metric_unit ?? metricConfig.unit,
|
| 742 |
+
metric_parameters: metric.metric_parameters ?? undefined,
|
| 743 |
evaluation_timestamp: evaluationTimestamp,
|
| 744 |
+
source_record_url: mr.source_record_url,
|
| 745 |
metric_config: metricConfig,
|
| 746 |
score_details: { score: mr.score ?? 0 },
|
| 747 |
detailed_evaluation_results_url: getCanonicalInstanceResultsUrl(
|
| 748 |
mr.detailed_evaluation_results
|
| 749 |
),
|
| 750 |
evalcards: mr.evalcards,
|
| 751 |
+
detailed_evaluation_results_meta: mr.detailed_evaluation_results_meta,
|
| 752 |
}
|
| 753 |
|
| 754 |
return {
|
lib/research-join-types.ts
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/** Origin tag carried in `ResearchJoinDataset.source`: either "query_api" or "artifact". */
export type ResearchJoinSource = "query_api" | "artifact"
|
| 2 |
+
|
| 3 |
+
/**
 * Row granularity of a research join. Used as `join_grain` on each row and as
 * `grain` on the selectable grain options.
 */
export type ResearchJoinGrain =
  "model_metric" | "model_metric_source" | "model_metric_instance" | "composite_component"
|
| 8 |
+
|
| 9 |
+
/**
 * Category a column belongs to (`ResearchJoinColumn.group`). The join UI
 * matches columns to its field-group toggles by this value.
 */
export type ResearchJoinColumnGroup =
  "identity" | "hierarchy" | "metric" | "model" |
  "score" | "source" | "instance" | "quality"
|
| 18 |
+
|
| 19 |
+
/** Describes one column that can be shown, sorted on, or exported for join rows. */
export interface ResearchJoinColumn {
  /** Row field name; the table reads each cell value from the row by this key. */
  key: string
  /** Text shown in the table header, column toggle, and sort menu. */
  label: string
  /** Field group this column is matched against by the group toggles. */
  group: ResearchJoinColumnGroup
  /** Explanatory text; shown as the tooltip of the column toggle button. */
  description: string
  // NOTE(review): presumably whether the column starts out selected — the consumer is not visible here.
  defaultVisible: boolean
  /** When true, the header shows a key icon and cell values render in a monospace font. */
  isJoinKey?: boolean
  /** Value kind; "url" values that start with "http" are rendered as links. */
  type?: "string" | "number" | "boolean" | "url" | "date"
}
|
| 28 |
+
|
| 29 |
+
/** One selectable base grain offered by a join dataset. */
export interface ResearchJoinGrainOption {
  /** Grain value; used as the select item's value and key. */
  grain: ResearchJoinGrain
  /** Text shown in the grain selector, followed by the row count. */
  label: string
  /** Longer explanation shown beneath the selector for the chosen grain. */
  description: string
  /** Number of rows at this grain; displayed next to the label. */
  row_count: number
  // NOTE(review): presumably the key columns that identify a row at this grain — confirm against the builder.
  join_keys: string[]
}
|
| 36 |
+
|
| 37 |
+
/** A single selectable value within a facet, with how many rows carry it. */
export interface ResearchJoinFacetValue {
  /** Raw value. */
  value: string
  /** Display text for the value. */
  label: string
  /** Count associated with the value. */
  count: number
}
|
| 42 |
+
|
| 43 |
+
/** A named facet together with its list of values. */
export interface ResearchJoinFacet {
  /** Facet identifier. */
  key: string
  /** Display text for the facet. */
  label: string
  /** Values available under this facet. */
  values: ResearchJoinFacetValue[]
}
|
| 48 |
+
|
| 49 |
+
/** One numbered step in a dataset's `join_steps` list. */
export interface ResearchJoinStep {
  /** Step number. */
  step: number
  /** Short heading for the step. */
  title: string
  /** Explanation of the step. */
  description: string
  /** Key names associated with the step. */
  keys: string[]
}
|
| 55 |
+
|
| 56 |
+
/**
 * One row of a research-join dataset. Field names double as column keys:
 * the table looks cell values up by `ResearchJoinColumn.key`.
 * Optional fields may be omitted or explicitly `null`.
 */
export interface ResearchJoinRow {
  // --- identity ---
  /** Row identifier; used as the key of each rendered table row. */
  row_id: string
  /** The row shape selected for the join. */
  join_grain: ResearchJoinGrain
  /** Canonical benchmark detail identifier. */
  eval_summary_id: string
  evaluation_name: string

  // --- benchmark hierarchy ---
  /** Backend-declared benchmark family key. */
  benchmark_family_key?: string | null
  /** Backend-declared composite or parent benchmark key. */
  benchmark_parent_key?: string | null
  /** Backend-declared leaf benchmark key. */
  benchmark_leaf_key?: string | null
  component_eval_summary_id?: string | null
  /** Composite component name when viewing rollup score rows. */
  component_name?: string | null

  // --- metric ---
  /** Metric-level identifier used to align ranks, scores, and samples. */
  metric_summary_id?: string | null
  metric_id?: string | null
  metric_key?: string | null
  /** Metric display name from the benchmark artifact. */
  metric_name: string
  metric_kind?: string | null
  metric_unit?: string | null

  // --- model ---
  /** Stable frontend route identifier for the model family. */
  model_route_id?: string | null
  model_id: string
  /** Display name for the model row. */
  model_name: string
  /** Reported developer or provider. */
  developer?: string | null

  // --- score ---
  /** Raw reported score for the selected row grain. */
  score?: number | null
  normalized_score?: number | null
  /** Rank within the selected benchmark metric. */
  rank?: number | null
  /** Number of scored rows in the ranking partition. */
  rank_total?: number | null
  lower_is_better?: boolean | null

  // --- source ---
  /** Source name, or the explicit missing-source bucket. */
  source_name: string
  source_type?: string | null
  /** Organization associated with the reported result. */
  source_organization_name?: string | null
  /** First-party, third-party, collaborative, or other source relationship. */
  evaluator_relationship?: string | null
  source_record_url?: string | null
  source_dataset_name?: string | null
  source_dataset_version?: string | null
  source_hf_repo?: string | null
  source_hf_split?: string | null
  /** Retrieval timestamp preserved separately from evaluation time. */
  retrieved_at?: string | null
  evaluation_timestamp?: string | null

  // --- instance linkage ---
  /** Whether this row has exact or benchmark-level instance linkage. */
  has_instance_data: boolean
  /** Whether instance linkage is exact, benchmark-level only, or unavailable. */
  instance_join_status: "metric_exact" | "benchmark_available" | "not_available"
  /** Metric-selective sample data URL when the artifact exposes it. */
  detailed_evaluation_results_url?: string | null

  // --- quality ---
  /** Reported sample size when available. */
  sample_size?: number | null
  /** Reported standard error when available. */
  standard_error?: number | null
  confidence_interval?: string | null
  /** Whether generation config is present for reproducibility checks. */
  generation_config_available?: boolean | null
}
|
| 100 |
+
|
| 101 |
+
/** Payload returned for one evaluation's research-join view. */
export interface ResearchJoinDataset {
  /** Origin tag for the dataset. */
  source: ResearchJoinSource
  generated_at: string
  eval_summary_id: string
  eval_name: string
  /** Messages rendered in a highlighted notice above the table when non-empty. */
  warnings: string[]
  join_steps: ResearchJoinStep[]
  /** Grains offered in the base-grain selector. */
  available_grains: ResearchJoinGrainOption[]
  /** Column definitions offered for toggling and sorting. */
  columns: ResearchJoinColumn[]
  facets: ResearchJoinFacet[]
  rows: ResearchJoinRow[]
}
|
lib/research-joins.ts
ADDED
|
@@ -0,0 +1,712 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import "server-only"
|
| 2 |
+
|
| 3 |
+
import type { SourceData } from "@/lib/benchmark-schema"
|
| 4 |
+
import type { BenchmarkEvalSummary, ModelResultForBenchmark } from "@/lib/eval-processing"
|
| 5 |
+
import { getModelFamilyRouteId } from "@/lib/model-family"
|
| 6 |
+
import { getEvalSummaryById } from "@/lib/model-data"
|
| 7 |
+
import type {
|
| 8 |
+
ResearchJoinColumn,
|
| 9 |
+
ResearchJoinDataset,
|
| 10 |
+
ResearchJoinFacet,
|
| 11 |
+
ResearchJoinGrain,
|
| 12 |
+
ResearchJoinGrainOption,
|
| 13 |
+
ResearchJoinRow,
|
| 14 |
+
} from "@/lib/research-join-types"
|
| 15 |
+
|
| 16 |
+
/** Placeholder that `normalizeSourceName` returns when a source name is null, undefined, or blank. */
const MISSING_SOURCE_BUCKET = "__missing_source__"
|
| 17 |
+
|
| 18 |
+
/**
 * Ordered catalogue of the columns a research-join row can expose.
 * Each entry names the row field (`key`), its display label, its field group,
 * a description, whether it is visible by default, and optionally whether it
 * is a join key or what value type it holds.
 */
// prettier-ignore
const COLUMN_DEFINITIONS: ResearchJoinColumn[] = [
  { key: "join_grain", label: "Grain", group: "identity", description: "The row shape selected for the join.", defaultVisible: false, isJoinKey: true },
  { key: "eval_summary_id", label: "Eval ID", group: "identity", description: "Canonical benchmark detail identifier.", defaultVisible: false, isJoinKey: true },
  { key: "metric_summary_id", label: "Metric ID", group: "metric", description: "Metric-level identifier used to align ranks, scores, and samples.", defaultVisible: true, isJoinKey: true },
  { key: "model_route_id", label: "Model Route", group: "model", description: "Stable frontend route identifier for the model family.", defaultVisible: false, isJoinKey: true },
  { key: "model_name", label: "Model", group: "model", description: "Display name for the model row.", defaultVisible: true },
  { key: "developer", label: "Developer", group: "model", description: "Reported developer or provider.", defaultVisible: true },
  { key: "metric_name", label: "Metric", group: "metric", description: "Metric display name from the benchmark artifact.", defaultVisible: true },
  { key: "score", label: "Score", group: "score", description: "Raw reported score for the selected row grain.", defaultVisible: true, type: "number" },
  { key: "rank", label: "Rank", group: "score", description: "Rank within the selected benchmark metric.", defaultVisible: true, type: "number" },
  { key: "rank_total", label: "Rank Total", group: "score", description: "Number of scored rows in the ranking partition.", defaultVisible: false, type: "number" },
  { key: "source_name", label: "Source", group: "source", description: "Source name, or the explicit missing-source bucket.", defaultVisible: true, isJoinKey: true },
  { key: "source_organization_name", label: "Organization", group: "source", description: "Organization associated with the reported result.", defaultVisible: true },
  { key: "evaluator_relationship", label: "Relationship", group: "source", description: "First-party, third-party, collaborative, or other source relationship.", defaultVisible: true },
  { key: "benchmark_family_key", label: "Family", group: "hierarchy", description: "Backend-declared benchmark family key.", defaultVisible: false, isJoinKey: true },
  { key: "benchmark_parent_key", label: "Composite", group: "hierarchy", description: "Backend-declared composite or parent benchmark key.", defaultVisible: false, isJoinKey: true },
  { key: "benchmark_leaf_key", label: "Leaf", group: "hierarchy", description: "Backend-declared leaf benchmark key.", defaultVisible: false, isJoinKey: true },
  { key: "component_name", label: "Component", group: "hierarchy", description: "Composite component name when viewing rollup score rows.", defaultVisible: false },
  { key: "has_instance_data", label: "Instances", group: "instance", description: "Whether this row has exact or benchmark-level instance linkage.", defaultVisible: true, type: "boolean" },
  { key: "instance_join_status", label: "Instance Join", group: "instance", description: "Whether instance linkage is exact, benchmark-level only, or unavailable.", defaultVisible: false },
  { key: "detailed_evaluation_results_url", label: "Instance URL", group: "instance", description: "Metric-selective sample data URL when the artifact exposes it.", defaultVisible: false, type: "url" },
  { key: "sample_size", label: "Sample Size", group: "quality", description: "Reported sample size when available.", defaultVisible: false, type: "number" },
  { key: "standard_error", label: "Std. Error", group: "quality", description: "Reported standard error when available.", defaultVisible: false, type: "number" },
  { key: "generation_config_available", label: "Gen Config", group: "quality", description: "Whether generation config is present for reproducibility checks.", defaultVisible: false, type: "boolean" },
  { key: "retrieved_at", label: "Retrieved", group: "quality", description: "Retrieval timestamp preserved separately from evaluation time.", defaultVisible: false, type: "date" },
]
|
| 205 |
+
|
| 206 |
+
/**
 * Turn a raw source name into a usable bucket key.
 *
 * @param value - Source name as found on the record; may be absent.
 * @returns The trimmed name, or `MISSING_SOURCE_BUCKET` when the input is
 *   null, undefined, or blank after trimming.
 */
function normalizeSourceName(value: string | null | undefined) {
  const cleaned = value == null ? "" : value.trim()
  if (cleaned.length === 0) {
    return MISSING_SOURCE_BUCKET
  }
  return cleaned
}
|
| 210 |
+
|
| 211 |
+
/**
 * Project the dataset provenance of a record onto the four flat
 * `source_*` row columns.
 *
 * @param sourceData - Source data attached to a model result or matrix row.
 * @returns An object with the four `source_*` keys; every value is `null`
 *   when `sourceData` is missing or is an array.
 */
function getSourceDataFields(sourceData: ModelResultForBenchmark["source_data"] | SourceData | undefined) {
  // Array-shaped or missing source data cannot be flattened into one row.
  const usable = sourceData && !Array.isArray(sourceData) ? sourceData : null

  return {
    source_dataset_name: usable?.dataset_name ?? null,
    source_dataset_version: usable?.dataset_version ?? null,
    source_hf_repo: usable?.hf_repo ?? null,
    source_hf_split: usable?.hf_split ?? null,
  }
}
|
| 228 |
+
|
| 229 |
+
/**
 * Render a confidence interval as display text.
 *
 * @param result - Score details that may carry a `confidence_interval`.
 * @returns `"<lower> - <upper> (<level>%)"`, or `null` when no interval is present.
 */
function formatConfidenceInterval(result: ModelResultForBenchmark["score_details"]) {
  const interval = result.confidence_interval
  return interval
    ? `${interval.lower} - ${interval.upper} (${interval.confidence_level}%)`
    : null
}
|
| 237 |
+
|
| 238 |
+
/**
 * Rescale a score onto the metric's declared range.
 *
 * @param score - Raw score; non-finite or missing values yield `null`.
 * @param minScore - Lower bound of the metric (defaults to 0).
 * @param maxScore - Upper bound of the metric (defaults to 1).
 * @returns `(score - min) / (max - min)` when the range is positive;
 *   otherwise the raw score is returned unchanged.
 */
function normalizeScore(score: number | null | undefined, minScore?: number, maxScore?: number) {
  if (typeof score !== "number" || !Number.isFinite(score)) {
    return null
  }

  const lowerBound = minScore ?? 0
  const upperBound = maxScore ?? 1
  const span = upperBound - lowerBound

  // A zero, negative, or NaN span cannot be divided by; pass the score through.
  if (span > 0) {
    return (score - lowerBound) / span
  }
  return score
}
|
| 249 |
+
|
| 250 |
+
/**
 * Type guard: true only for `number` values that are neither NaN nor infinite.
 */
function isFiniteNumber(value: unknown): value is number {
  if (typeof value !== "number") {
    return false
  }
  return Number.isFinite(value)
}
|
| 253 |
+
|
| 254 |
+
/**
 * Rank entries by score using competition ranking (ties share a rank and the
 * next distinct score skips ahead, e.g. 1, 1, 3).
 *
 * @param entries - Candidate entries; the input array is not reordered.
 * @param getScore - Accessor for an entry's score. Entries whose score is not
 *   a finite number are left out of the ranking entirely.
 * @param lowerIsBetter - When true, the smallest score gets rank 1.
 * @returns `ranks`, a map from entry (by identity) to 1-based rank, and
 *   `total`, the number of entries that were ranked.
 */
function rankScoredRows<T>(
  entries: T[],
  getScore: (entry: T) => number | null | undefined,
  lowerIsBetter: boolean
) {
  const scored: Array<{ entry: T; score: number }> = []
  for (const entry of entries) {
    const score = getScore(entry)
    if (typeof score === "number" && Number.isFinite(score)) {
      scored.push({ entry, score })
    }
  }
  scored.sort((a, b) => (lowerIsBetter ? a.score - b.score : b.score - a.score))

  const ranks = new Map<T, number>()
  let groupRank = 0
  // Score of the first entry in the current tie group.
  let groupScore: number | null = null

  for (const [position, { entry, score }] of scored.entries()) {
    const startsNewGroup = groupScore == null || Math.abs(score - groupScore) > 1e-9
    if (startsNewGroup) {
      groupRank = position + 1
      groupScore = score
    }
    ranks.set(entry, groupRank)
  }

  return {
    ranks,
    total: scored.length,
  }
}
|
| 285 |
+
|
| 286 |
+
/**
 * Extract the metric identity columns from a single evaluation result.
 *
 * Missing identifiers become `null`. The metric name prefers `display_name`
 * and falls back to `evaluation_name`; the unit prefers the result's own
 * `metric_unit` and falls back to `metric_config.unit`.
 */
function getMetricIdentity(result: ModelResultForBenchmark["result"]) {
  const metricName = result.display_name ?? result.evaluation_name
  const metricUnit = result.metric_unit ?? result.metric_config.unit ?? null

  return {
    metric_summary_id: result.metric_summary_id ?? null,
    metric_id: result.metric_id ?? null,
    metric_key: result.metric_key ?? null,
    metric_name: metricName,
    metric_kind: result.metric_kind ?? null,
    metric_unit: metricUnit,
  }
}
|
| 296 |
+
|
| 297 |
+
/**
 * Classify how a result row links to instance-level (per-sample) data.
 *
 * - `metric_exact`: the result carries its own `detailed_evaluation_results_url`.
 * - `benchmark_available`: no per-result URL, but the summary reports
 *   `instance_data.available`.
 * - `not_available`: neither signal is present.
 */
function getInstanceStatus(result: ModelResultForBenchmark["result"], summary: BenchmarkEvalSummary) {
  const hasExactLink = Boolean(result.detailed_evaluation_results_url)
  const hasBenchmarkLink = Boolean(summary.instance_data?.available)

  if (!hasExactLink && !hasBenchmarkLink) {
    return {
      has_instance_data: false,
      instance_join_status: "not_available" as const,
    }
  }

  // The per-result link wins over the benchmark-level availability flag.
  return {
    has_instance_data: true,
    instance_join_status: hasExactLink ? ("metric_exact" as const) : ("benchmark_available" as const),
  }
}
|
| 317 |
+
|
| 318 |
+
/**
 * Assemble every `ResearchJoinRow` column except `row_id` for one model result.
 *
 * @param summary - Benchmark summary the result belongs to; supplies the
 *   hierarchy keys and the metric config used for normalization.
 * @param modelResult - The model result being flattened.
 * @param grain - Join grain to stamp on the row.
 * @param rank - Precomputed rank of this result, or `null` when unranked.
 * @param rankTotal - Number of ranked results, or `null`.
 * @returns The row fields; the caller is expected to add `row_id`.
 */
function baseRowFields(
  summary: BenchmarkEvalSummary,
  modelResult: ModelResultForBenchmark,
  grain: ResearchJoinGrain,
  rank: number | null,
  rankTotal: number | null
): Omit<ResearchJoinRow, "row_id"> {
  const {
    result,
    model_info: model,
    source_metadata: provenance,
    score_details: details,
  } = modelResult
  const scoring = summary.metric_config

  return {
    join_grain: grain,
    eval_summary_id: summary.evaluation_id,
    evaluation_name: summary.evaluation_name,
    benchmark_family_key: summary.benchmark_family_key ?? null,
    benchmark_parent_key: summary.composite_benchmark_key ?? null,
    benchmark_leaf_key: summary.benchmark_leaf_key ?? null,
    component_eval_summary_id: null,
    component_name: null,
    ...getMetricIdentity(result),
    model_route_id: modelResult.model_route_id ?? getModelFamilyRouteId(model),
    model_id: model.id,
    model_name: model.name,
    developer: model.developer ?? null,
    score: modelResult.score,
    // Normalization uses the summary-level metric config, not the result's own.
    normalized_score: normalizeScore(modelResult.score, scoring.min_score, scoring.max_score),
    rank,
    rank_total: rankTotal,
    lower_is_better: scoring.lower_is_better,
    source_name: normalizeSourceName(provenance.source_name),
    source_type: provenance.source_type,
    source_organization_name: provenance.source_organization_name,
    evaluator_relationship: provenance.evaluator_relationship,
    // Prefer the per-result record URL; fall back to the source-level URL.
    source_record_url: result.source_record_url ?? provenance.source_url ?? null,
    ...getSourceDataFields(modelResult.source_data),
    // NOTE(review): retrieved_at is read from the model result's own
    // evaluation_timestamp, while evaluation_timestamp comes from the nested
    // result; confirm the former really is the retrieval time.
    retrieved_at: modelResult.evaluation_timestamp,
    evaluation_timestamp: result.evaluation_timestamp,
    ...getInstanceStatus(result, summary),
    detailed_evaluation_results_url: result.detailed_evaluation_results_url ?? null,
    sample_size: details.sample_size ?? null,
    standard_error: details.standard_error ?? null,
    confidence_interval: formatConfidenceInterval(details),
    generation_config_available: Boolean(result.generation_config),
  }
}
|
| 372 |
+
|
| 373 |
+
/**
 * Flatten `summary.model_results` into join rows.
 *
 * Each model result yields one `model_metric_source` row, followed by one
 * `composite_component` row per entry in its `aggregate_components`.
 * Component rows start from the parent's fields and override the score,
 * source provenance, and timestamps with the component's own values.
 *
 * @param summary - Benchmark summary whose model results are flattened.
 * @returns Rows in model-result order, each parent followed by its components.
 */
function buildRowsFromModelResults(summary: BenchmarkEvalSummary): ResearchJoinRow[] {
  const rankData = rankScoredRows(
    summary.model_results,
    (modelResult) => modelResult.score,
    summary.metric_config.lower_is_better
  )

  const rows: ResearchJoinRow[] = []

  for (const [index, modelResult] of summary.model_results.entries()) {
    const rank = rankData.ranks.get(modelResult) ?? null
    const base = baseRowFields(summary, modelResult, "model_metric_source", rank, rankData.total)
    const modelKey = base.model_route_id ?? base.model_id

    rows.push({
      ...base,
      row_id: [
        "model_metric_source",
        summary.evaluation_id,
        base.metric_summary_id ?? base.metric_key ?? base.metric_name,
        modelKey,
        // The positional index keeps ids unique when the same model and
        // metric appear in more than one result.
        index,
      ].join("::"),
    })

    for (const [componentIndex, component] of (modelResult.aggregate_components ?? []).entries()) {
      rows.push({
        // NOTE(review): rank, rank_total, and the quality/instance columns are
        // inherited from the parent aggregate result, not recomputed for the
        // component score — confirm that is the intended presentation.
        ...base,
        row_id: [
          "composite_component",
          component.evaluation_id,
          modelKey,
          // Include the parent result index: componentIndex restarts at 0 for
          // every parent, so two results for the same model would otherwise
          // emit identical component row ids.
          index,
          componentIndex,
        ].join("::"),
        join_grain: "composite_component",
        component_eval_summary_id: component.evaluation_id,
        component_name: component.composite_benchmark_name,
        score: component.score,
        normalized_score: component.normalized_score,
        source_name: normalizeSourceName(component.source_name),
        source_type: component.source_type,
        source_organization_name: component.source_organization_name,
        evaluator_relationship: component.evaluator_relationship,
        retrieved_at: component.evaluation_timestamp,
        evaluation_timestamp: component.evaluation_timestamp,
      })
    }
  }

  return rows
}
|
| 423 |
+
|
| 424 |
+
/**
 * Flatten the multi-metric leaderboard matrix into `model_metric` rows.
 *
 * One row is emitted per (metric, leaderboard row) pair whose cell holds a
 * finite number; other cells are skipped. Ranks are computed per metric
 * column. Fields the matrix does not carry (metric id/key/kind, normalized
 * score, per-result instance URL, quality stats) are set to `null`.
 *
 * @param summary - Benchmark summary providing `leaderboard_metrics` and
 *   `leaderboard_rows`; either may be absent, yielding no rows.
 * @returns Rows ordered by metric, then by leaderboard row.
 */
function buildRowsFromMatrix(summary: BenchmarkEvalSummary): ResearchJoinRow[] {
  const output: ResearchJoinRow[] = []
  const metricColumns = summary.leaderboard_metrics ?? []
  const leaderboardRows = summary.leaderboard_rows ?? []

  // Instance availability is a benchmark-level fact, identical for every row.
  const hasBenchmarkInstances = Boolean(summary.instance_data?.available)
  const instanceJoinStatus = hasBenchmarkInstances ? "benchmark_available" : "not_available"

  for (const metric of metricColumns) {
    const columnKey = metric.column_key
    const rankData = rankScoredRows(
      leaderboardRows,
      (row) => row.values[columnKey],
      metric.lower_is_better
    )
    // Only subtask-scoped metrics are labelled as a component.
    const componentName =
      metric.scope === "subtask"
        ? metric.subtask_name ?? metric.subtask_key ?? null
        : null

    for (const row of leaderboardRows) {
      const score = row.values[columnKey]
      if (!isFiniteNumber(score)) {
        continue
      }

      const sourceDataFields = getSourceDataFields(row.source_data)
      const modelRouteId = row.model_route_id ?? getModelFamilyRouteId(row.model_info)
      const provenance = row.source_metadata

      output.push({
        row_id: ["model_metric", summary.evaluation_id, columnKey, modelRouteId].join("::"),
        join_grain: "model_metric",
        eval_summary_id: summary.evaluation_id,
        evaluation_name: summary.evaluation_name,
        benchmark_family_key: summary.benchmark_family_key ?? null,
        benchmark_parent_key: summary.composite_benchmark_key ?? null,
        benchmark_leaf_key: summary.benchmark_leaf_key ?? null,
        component_eval_summary_id: null,
        component_name: componentName,
        metric_summary_id: metric.metric_summary_id,
        metric_id: null,
        metric_key: null,
        metric_name: metric.display_name || metric.metric_name,
        metric_kind: null,
        metric_unit: metric.unit ?? null,
        model_route_id: modelRouteId,
        model_id: row.model_info.id,
        model_name: row.model_info.name,
        developer: row.model_info.developer ?? null,
        score,
        normalized_score: null,
        rank: rankData.ranks.get(row) ?? null,
        rank_total: rankData.total,
        lower_is_better: metric.lower_is_better,
        source_name: normalizeSourceName(provenance.source_name),
        source_type: provenance.source_type,
        source_organization_name: provenance.source_organization_name,
        evaluator_relationship: provenance.evaluator_relationship,
        source_record_url: provenance.source_url ?? null,
        ...sourceDataFields,
        // The matrix carries a single timestamp, used for both columns.
        retrieved_at: row.evaluation_timestamp,
        evaluation_timestamp: row.evaluation_timestamp,
        has_instance_data: hasBenchmarkInstances,
        instance_join_status: instanceJoinStatus,
        detailed_evaluation_results_url: null,
        sample_size: null,
        standard_error: null,
        confidence_interval: null,
        generation_config_available: null,
      })
    }
  }

  return output
}
|
| 500 |
+
|
| 501 |
+
/**
 * Build the filter facets (metric, source, evaluator relationship) for a set
 * of rows.
 *
 * For each facet the distinct non-empty values are counted and returned
 * sorted by descending count, ties broken by `localeCompare` on the value.
 * The `MISSING_SOURCE_BUCKET` sentinel is labelled "Missing source".
 *
 * @param rows - Rows to tally.
 * @returns One facet per spec, in the fixed order metric / source / relationship.
 */
function buildFacets(rows: ResearchJoinRow[]): ResearchJoinFacet[] {
  const facetSpecs = [
    { key: "metric_name", label: "Metric" },
    { key: "source_name", label: "Source" },
    { key: "evaluator_relationship", label: "Relationship" },
  ] as const

  const facets: ResearchJoinFacet[] = []

  for (const { key, label } of facetSpecs) {
    const tally = new Map<string, number>()

    rows.forEach((row) => {
      // null/undefined stringify to "" and are left out of the facet.
      const bucket = String(row[key] ?? "")
      if (bucket) {
        tally.set(bucket, (tally.get(bucket) ?? 0) + 1)
      }
    })

    const values = Array.from(tally.entries())
      .sort(
        ([leftValue, leftCount], [rightValue, rightCount]) =>
          rightCount - leftCount || leftValue.localeCompare(rightValue)
      )
      .map(([value, count]) => ({
        value,
        label: value === MISSING_SOURCE_BUCKET ? "Missing source" : value,
        count,
      }))

    facets.push({ key, label, values })
  }

  return facets
}
|
| 532 |
+
|
| 533 |
+
/**
 * List the join grains that have at least one row, each with its row count.
 *
 * `model_metric_instance` is a derived view: it counts rows flagged
 * `has_instance_data` regardless of their own `join_grain`. Every other
 * grain counts rows whose `join_grain` matches.
 *
 * @param rows - All rows of the dataset.
 * @returns Grain options in the declaration order below, empty grains omitted.
 */
function buildGrainOptions(rows: ResearchJoinRow[]): ResearchJoinGrainOption[] {
  const grainDetails: Record<ResearchJoinGrain, Omit<ResearchJoinGrainOption, "row_count">> = {
    model_metric: {
      grain: "model_metric",
      label: "Model x metric",
      description: "One row per model and benchmark metric, best for score matrices.",
      join_keys: ["eval_summary_id", "metric_summary_id", "model_route_id"],
    },
    model_metric_source: {
      grain: "model_metric_source",
      label: "Model x metric x source",
      description: "Adds source provenance to each model metric row.",
      join_keys: ["eval_summary_id", "metric_summary_id", "model_route_id", "source_name"],
    },
    model_metric_instance: {
      grain: "model_metric_instance",
      label: "Model x metric x instances",
      description: "Filters to rows with exact or benchmark-level sample links.",
      join_keys: ["eval_summary_id", "metric_summary_id", "model_route_id", "detailed_evaluation_results_url"],
    },
    composite_component: {
      grain: "composite_component",
      label: "Composite components",
      description: "One row per model and component score inside a rollup benchmark.",
      join_keys: ["component_eval_summary_id", "model_route_id", "source_name"],
    },
  }

  const options: ResearchJoinGrainOption[] = []

  for (const grain of Object.keys(grainDetails) as ResearchJoinGrain[]) {
    let rowCount = 0
    for (const row of rows) {
      const belongs = grain === "model_metric_instance" ? row.has_instance_data : row.join_grain === grain
      if (belongs) {
        rowCount += 1
      }
    }

    if (rowCount > 0) {
      options.push({ ...grainDetails[grain], row_count: rowCount })
    }
  }

  return options
}
|
| 573 |
+
|
| 574 |
+
/**
 * Produce the three-step walkthrough shown alongside a join dataset.
 *
 * @param grainOptions - Available grains; the first one's `join_keys` are
 *   shown for step 1. When the list is empty, the model x metric keys are used.
 * @returns The three steps, numbered 1 to 3.
 */
function buildJoinSteps(grainOptions: ResearchJoinGrainOption[]) {
  const [firstOption] = grainOptions
  const baseGrainKeys = firstOption?.join_keys ?? ["eval_summary_id", "metric_summary_id", "model_route_id"]

  const chooseGrain = {
    step: 1,
    title: "Choose the base row grain",
    description:
      "Start from model-metric rows, source-aware rows, instance-linked rows, or composite component rows.",
    keys: baseGrainKeys,
  }

  const addFieldGroups = {
    step: 2,
    title: "Add field groups as joins",
    description:
      "Join hierarchy, source provenance, instance links, and quality fields by the keys shown for the selected grain.",
    keys: ["benchmark_family_key", "benchmark_parent_key", "source_name", "detailed_evaluation_results_url"],
  }

  const filterAndExport = {
    step: 3,
    title: "Filter, inspect keys, and export",
    description:
      "Filter to the rows you need, keep source identity explicit, then export the denormalized join table.",
    keys: ["source_name", "evaluator_relationship", "metric_name"],
  }

  return [chooseGrain, addFieldGroups, filterAndExport]
}
|
| 601 |
+
|
| 602 |
+
/**
 * Build a research join dataset purely from a frontend eval summary.
 *
 * The leaderboard matrix is used when the summary has more than one
 * leaderboard metric and at least one leaderboard row; otherwise rows come
 * from `model_results`.
 *
 * @param summary - Benchmark summary to flatten.
 * @param warnings - Caller-supplied warnings, listed before the built-in ones.
 * @returns A dataset tagged `source: "artifact"`, stamped with the current time.
 */
function buildArtifactResearchJoinDataset(summary: BenchmarkEvalSummary, warnings: string[] = []): ResearchJoinDataset {
  const metricCount = summary.leaderboard_metrics?.length ?? 0
  const matrixRowCount = summary.leaderboard_rows?.length ?? 0
  const useMatrix = metricCount > 1 && matrixRowCount > 0

  const rows = useMatrix ? buildRowsFromMatrix(summary) : buildRowsFromModelResults(summary)
  const grainOptions = buildGrainOptions(rows)

  const datasetWarnings = [
    ...warnings,
    "Artifact fallback keeps joins source-aware but cannot execute arbitrary SQL.",
  ]
  // Flag datasets where some instance links are only benchmark-level signals.
  if (rows.some((row) => row.instance_join_status === "benchmark_available")) {
    datasetWarnings.push(
      "Some instance joins are benchmark-level availability signals, not exact metric-level sample links."
    )
  }

  return {
    source: "artifact",
    generated_at: new Date().toISOString(),
    eval_summary_id: summary.evaluation_id,
    eval_name: summary.evaluation_name,
    warnings: datasetWarnings,
    join_steps: buildJoinSteps(grainOptions),
    available_grains: grainOptions,
    columns: COLUMN_DEFINITIONS,
    facets: buildFacets(rows),
    rows,
  }
}
|
| 636 |
+
|
| 637 |
+
/**
 * Shallow structural guard for payloads claiming to be a join dataset.
 *
 * Only checks that `rows`, `columns`, and `available_grains` are arrays;
 * array contents and all other fields are not validated.
 */
function isResearchJoinDataset(value: unknown): value is ResearchJoinDataset {
  if (typeof value !== "object" || value === null) {
    return false
  }

  const { rows, columns, available_grains: availableGrains } = value as Partial<ResearchJoinDataset>
  return [rows, columns, availableGrains].every((field) => Array.isArray(field))
}
|
| 645 |
+
|
| 646 |
+
/**
 * Try to load a join dataset from the live Query API.
 *
 * The base URL comes from `QUERY_API_BASE_URL`, falling back to
 * `EVAL_QUERY_API_BASE_URL`. Two candidate endpoints are tried in order, and
 * the first response that is OK and looks like a join dataset wins. The
 * dataset may be the payload itself or nested under `research_join_dataset`.
 *
 * This is deliberately best-effort: network errors, non-OK responses, and
 * unparseable or unrecognised payloads all move on to the next candidate.
 *
 * @param evalId - Evaluation id; URL-encoded into the endpoint path.
 * @returns The dataset re-tagged `source: "query_api"`, or `null` when no
 *   base URL is configured or no candidate produced a usable dataset.
 */
async function fetchLiveResearchJoinDataset(evalId: string): Promise<ResearchJoinDataset | null> {
  // Use `||`, not `??`: an empty QUERY_API_BASE_URL (e.g. `QUERY_API_BASE_URL=`
  // in an env file) must not mask a configured EVAL_QUERY_API_BASE_URL. This
  // also matches the "is the API configured" check in getResearchJoinDataset.
  const baseUrl = process.env.QUERY_API_BASE_URL || process.env.EVAL_QUERY_API_BASE_URL
  if (!baseUrl) {
    return null
  }

  const trimmedBase = baseUrl.replace(/\/+$/, "")
  const encodedId = encodeURIComponent(evalId)
  const candidates = [
    `${trimmedBase}/benchmarks/${encodedId}/research-joins`,
    `${trimmedBase}/research/benchmarks/${encodedId}/joins`,
  ]

  for (const url of candidates) {
    try {
      // NOTE(review): there is no timeout or abort signal here, so a hung
      // Query API stalls the caller instead of falling back to artifacts.
      const response = await fetch(url, {
        cache: "no-store",
        headers: { Accept: "application/json" },
      })

      if (!response.ok) {
        continue
      }

      const payload: unknown = await response.json()

      let dataset: ResearchJoinDataset | null = null
      if (isResearchJoinDataset(payload)) {
        dataset = payload
      } else if (payload && typeof payload === "object") {
        const nested = (payload as Record<string, unknown>).research_join_dataset
        if (isResearchJoinDataset(nested)) {
          dataset = nested
        }
      }

      if (dataset) {
        return {
          ...dataset,
          source: "query_api",
          // The guard does not check `warnings`, so default it.
          warnings: dataset.warnings ?? [],
        }
      }
    } catch {
      // Best-effort: try the next candidate endpoint.
      continue
    }
  }

  return null
}
|
| 692 |
+
|
| 693 |
+
/**
 * Resolve the research join dataset for an evaluation.
 *
 * The live Query API is tried first. If it yields nothing, the dataset is
 * rebuilt from the eval summary, with a warning attached when a Query API
 * base URL was configured.
 *
 * @param evalId - Evaluation id to look up.
 * @returns The dataset, or `null` when no eval summary exists for the id.
 */
export async function getResearchJoinDataset(evalId: string): Promise<ResearchJoinDataset | null> {
  const live = await fetchLiveResearchJoinDataset(evalId)
  if (live) {
    return live
  }

  const summary = await getEvalSummaryById(evalId)
  if (!summary) {
    return null
  }

  // Only mention the live endpoint when one was actually configured.
  const queryApiConfigured = Boolean(process.env.QUERY_API_BASE_URL || process.env.EVAL_QUERY_API_BASE_URL)
  const fallbackWarnings = queryApiConfigured
    ? ["Live Query API join endpoint was unavailable, so this dataset was built from frontend artifacts."]
    : []

  return buildArtifactResearchJoinDataset(summary, fallbackWarnings)
}
|
| 711 |
+
|
| 712 |
+
export { buildArtifactResearchJoinDataset }
|
tests/research-joins.test.ts
ADDED
|
@@ -0,0 +1,174 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { describe, expect, it, vi } from "vitest"
|
| 2 |
+
import type { BenchmarkEvalSummary, ModelResultForBenchmark } from "../lib/eval-processing"
|
| 3 |
+
|
| 4 |
+
vi.mock("server-only", () => ({}))
|
| 5 |
+
|
| 6 |
+
const { buildArtifactResearchJoinDataset } = await import("../lib/research-joins")
|
| 7 |
+
|
| 8 |
+
/** Metric config shared by the summary fixture and every result fixture. */
const metricConfig = {
  evaluation_description: "Accuracy on examples",
  lower_is_better: false,
  score_type: "continuous" as const,
  // Unit range, so a raw score and its normalized value coincide.
  min_score: 0,
  max_score: 1,
  unit: "%",
}
|
| 16 |
+
|
| 17 |
+
/**
 * Fixture factory: a fully populated model result with an exact instance
 * link and no `source_name`, shallow-merged with `overrides`.
 */
function makeModelResult(overrides: Partial<ModelResultForBenchmark> = {}): ModelResultForBenchmark {
  const defaults: ModelResultForBenchmark = {
    model_info: {
      id: "openai/example-model",
      name: "Example Model",
      developer: "OpenAI",
    },
    model_route_id: "openai__example-model",
    score: 0.82,
    score_details: {
      score: 0.82,
      sample_size: 100,
      standard_error: 0.01,
      confidence_interval: {
        lower: 0.8,
        upper: 0.84,
        confidence_level: 95,
      },
    },
    evaluation_timestamp: "2026-01-01T00:00:00Z",
    // source_name is omitted so rows land in the missing-source bucket.
    source_metadata: {
      source_type: "evaluation_run",
      source_organization_name: "OpenAI",
      evaluator_relationship: "first_party",
    },
    source_data: {
      dataset_name: "Example Dataset",
      dataset_version: "v1",
      hf_repo: "example/dataset",
      hf_split: "test",
      samples_number: 100,
    },
    result: {
      evaluation_result_id: "result-1",
      evaluation_name: "accuracy",
      display_name: "Accuracy",
      metric_summary_id: "metric-accuracy",
      metric_id: "accuracy",
      metric_key: "accuracy",
      metric_name: "accuracy",
      metric_kind: "score",
      metric_unit: "%",
      evaluation_timestamp: "2026-01-01T00:00:00Z",
      source_record_url: "https://example.test/records/result-1",
      metric_config: metricConfig,
      score_details: { score: 0.82 },
      detailed_evaluation_results_url:
        "https://huggingface.co/datasets/evaleval/card_backend/resolve/main/instances/example.jsonl",
    },
  }

  return { ...defaults, ...overrides }
}
|
| 69 |
+
|
| 70 |
+
/**
 * Fixture factory: a single-model benchmark summary, shallow-merged with
 * `overrides`.
 */
function makeSummary(overrides: Partial<BenchmarkEvalSummary> = {}): BenchmarkEvalSummary {
  const defaults: BenchmarkEvalSummary = {
    evaluation_name: "Example Eval",
    evaluation_id: "example_eval",
    composite_benchmark_key: "example_suite",
    composite_benchmark_name: "Example Suite",
    category: "Reasoning",
    metric_config: metricConfig,
    model_results: [makeModelResult()],
    models_count: 1,
    evaluator_names: [],
    source_types: [],
    third_party_ratio: 0,
    missing_generation_config_count: 0,
    best_model: { name: "Example Model", score: 0.82 },
    worst_model: { name: "Example Model", score: 0.82 },
    avg_score: 0.82,
    avg_score_norm: 0.82,
    benchmark_family_key: "example_family",
    benchmark_leaf_key: "example_leaf",
  }

  return { ...defaults, ...overrides }
}
|
| 93 |
+
|
| 94 |
+
// Covers the artifact-only path: rows built from model_results, no live Query API.
describe("buildArtifactResearchJoinDataset", () => {
  it("preserves source-safe metric rows and uses an explicit missing-source bucket", () => {
    const { source, rows, facets } = buildArtifactResearchJoinDataset(makeSummary())

    expect(source).toBe("artifact")
    expect(rows).toHaveLength(1)

    const [onlyRow] = rows
    expect(onlyRow).toMatchObject({
      eval_summary_id: "example_eval",
      metric_summary_id: "metric-accuracy",
      metric_id: "accuracy",
      model_route_id: "openai__example-model",
      source_name: "__missing_source__",
      source_record_url: "https://example.test/records/result-1",
      has_instance_data: true,
      instance_join_status: "metric_exact",
      sample_size: 100,
      standard_error: 0.01,
      confidence_interval: "0.8 - 0.84 (95%)",
    })

    const sourceFacet = facets.find(({ key }) => key === "source_name")
    expect(sourceFacet?.values[0]).toMatchObject({
      value: "__missing_source__",
      label: "Missing source",
      count: 1,
    })
  })

  it("offers instance-linked joins without duplicating artifact fallback rows", () => {
    const { available_grains: grains, rows } = buildArtifactResearchJoinDataset(makeSummary())
    const instanceGrain = grains.find(({ grain }) => grain === "model_metric_instance")
    const instanceLinkedRows = rows.filter((row) => row.has_instance_data)

    expect(instanceGrain?.row_count).toBe(1)
    expect(instanceLinkedRows).toHaveLength(1)
  })

  it("materializes composite component rows from aggregate components", () => {
    const withComponents = makeModelResult({
      aggregate_components: [
        {
          evaluation_id: "component_a",
          composite_benchmark_key: "suite",
          composite_benchmark_name: "Component A",
          score: 0.7,
          normalized_score: 0.7,
          evaluation_timestamp: "2026-01-02T00:00:00Z",
          source_name: "component-source",
          source_type: "leaderboard",
          source_organization_name: "Example Org",
          evaluator_relationship: "third_party",
        },
        // No source_name here: this component must fall into the missing-source bucket.
        {
          evaluation_id: "component_b",
          composite_benchmark_key: "suite",
          composite_benchmark_name: "Component B",
          score: 0.9,
          normalized_score: 0.9,
          evaluation_timestamp: "2026-01-03T00:00:00Z",
          source_type: "paper",
          source_organization_name: "Example Paper",
          evaluator_relationship: "third_party",
        },
      ],
    })
    const summary = makeSummary({ model_results: [withComponents] })
    const { rows } = buildArtifactResearchJoinDataset(summary)

    const componentRows = rows.filter(({ join_grain }) => join_grain === "composite_component")
    expect(componentRows).toHaveLength(2)

    const [componentA, componentB] = componentRows
    expect(componentA).toMatchObject({
      component_eval_summary_id: "component_a",
      component_name: "Component A",
      source_name: "component-source",
      score: 0.7,
    })
    expect(componentB).toMatchObject({
      component_eval_summary_id: "component_b",
      component_name: "Component B",
      source_name: "__missing_source__",
      score: 0.9,
    })
  })
})
|