// general-eval-card/components/benchmark-evaluation-card.tsx
// Latest commit 3ad47c6 by evijit (HF Staff):
// "Add plain-language captions and mode-aware framing for policy readers"
"use client"
import type { CSSProperties } from "react"
import { useMemo } from "react"
import { useAudienceMode } from "@/components/audience-mode-provider"
import { useRouter } from "next/navigation"
import {
AlertTriangle,
Award,
ChevronDown,
ChevronRight,
ExternalLink,
Eye,
MoreHorizontal,
} from "lucide-react"
import type { CategoryType } from "@/lib/benchmark-schema"
import type { SignalSummaries } from "@/lib/backend-artifacts"
import { getCategoryColor } from "@/lib/benchmark-schema"
import type { BenchmarkCard } from "@/lib/benchmark-schema"
import { lookupBenchmarkCard } from "@/lib/benchmark-metadata-utils"
import { Badge } from "@/components/ui/badge"
import { Button } from "@/components/ui/button"
import { Card, CardContent, CardHeader, CardTitle } from "@/components/ui/card"
import { Collapsible, CollapsibleContent, CollapsibleTrigger } from "@/components/ui/collapsible"
import { DropdownMenu, DropdownMenuContent, DropdownMenuItem, DropdownMenuTrigger } from "@/components/ui/dropdown-menu"
/**
 * Aggregated, per-model summary record rendered by `BenchmarkEvaluationCard`.
 *
 * Only the fields annotated below are read by the code in this file; the
 * remaining fields are carried on the record but are not rendered here.
 */
export type BenchmarkEvaluationCardData = {
/** Identifier handed to the `onDelete` and `onToggleCompare` callbacks. */
id: string
/** Path segment used to build the detail route `/models/<route_id>`. */
route_id: string
/** Display name shown as the card title. */
model_name: string
model_id: string
canonical_model_name: string
/** Shown under the title; an empty value renders as "Unknown developer". */
developer: string
/** Shown as the "reported results" count; also the fallback total for the reproducibility gap text. */
evaluations_count: number
/** Shown as the "benchmark suites" count and used for the coverage summary label. */
benchmarks_count: number
/** A "versions" badge is rendered only when this is greater than 1. */
variant_count: number
categories: CategoryType[]
/** Per-category counts; only entries with a count above 0 are plotted. */
category_stats: Record<CategoryType, number>
/** Passed to `formatDate`, which accepts either a numeric string or a date string. */
latest_timestamp: string
evaluator_count: number
evaluator_names: string[]
source_type_count: number
/** Rendered comma-separated, with underscores replaced by spaces. */
source_types: string[]
evidence_count: number
missing_generation_config_count: number
third_party_eval_count: number
independent_verification_ratio: number
reproducibility_status: "complete" | "partial" | "missing"
eval_libraries: Array<{
name: string
version?: string
fork?: string
}>
/** Used as the coverage summary label when `benchmarks_count` is 0. */
latest_source_name?: string
/** Rendered as a "<n>B parameters" badge when present. */
params_billions?: number | null
/** Fallback list of benchmark names used when `top_scores` is empty. */
benchmark_names?: string[]
/** `min` and `max` are formatted into the score range row; `count` and `average` are not read here. */
score_summary?: {
count: number
min: number
max: number
average: number | null
}
/** `has_reproducibility_gap_count` and `results_total` are read for the gap badge and row. */
reproducibility_summary?: SignalSummaries["reproducibility_summary"]
provenance_summary?: SignalSummaries["provenance_summary"]
comparability_summary?: SignalSummaries["comparability_summary"]
/** The `benchmark` names drive the "Covered benchmarks" chips and the domain lookup. */
top_scores: Array<{
benchmark: string
score: number
metric: string
unit?: string
}>
/** The first entry is opened by the "View Source" menu item. */
source_urls: string[]
detail_urls: string[]
model_url?: string
release_date?: string
input_modalities?: string[]
output_modalities?: string[]
/** Shown in the expandable section in research mode only. */
architecture?: string
params?: string
inference_engine?: string
inference_platform?: string
}
/** Props accepted by `BenchmarkEvaluationCard`. */
interface BenchmarkEvaluationCardProps {
/** The model summary to render. */
data: BenchmarkEvaluationCardData
/** Benchmark metadata used to derive domain chips; when omitted, no domain chips are shown. */
benchmarkCards?: Record<string, BenchmarkCard>
/** When provided, a "Remove" menu item is shown and calls this with `data.id`. */
onDelete?: (id: string) => void
/** Written to the `--enter-delay` CSS custom property as milliseconds; defaults to 0. */
delayMs?: number
/** Switches the compare button to its "Selected" state; defaults to false. */
selectedForCompare?: boolean
/** When provided, a compare button is shown and calls this with `data.id`. */
onToggleCompare?: (id: string) => void
}
/**
 * Formats a timestamp for display, e.g. "Jun 15, 2024" (en-US, local time zone).
 *
 * @param isoString Either a numeric string, interpreted as seconds since the
 *   Unix epoch, or any string the `Date` constructor can parse. A value that
 *   contains "-" is always handed to the `Date` parser as-is.
 * @returns The formatted date, or the input unchanged when it is blank or
 *   cannot be parsed into a valid date.
 */
function formatDate(isoString: string) {
  const trimmed = isoString.trim()
  // Number("") is 0, so a blank value would otherwise be rendered as the Unix epoch.
  if (trimmed === "") return isoString

  const numeric = Number(trimmed)
  const looksLikeEpochSeconds = !Number.isNaN(numeric) && !trimmed.includes("-")
  const parsedDate = looksLikeEpochSeconds ? new Date(numeric * 1000) : new Date(trimmed)

  // toLocaleDateString does not throw for an invalid Date; it returns the
  // literal text "Invalid Date". Check validity explicitly so the raw input
  // is shown instead.
  if (Number.isNaN(parsedDate.getTime())) return isoString

  return parsedDate.toLocaleDateString("en-US", {
    year: "numeric",
    month: "short",
    day: "numeric",
  })
}
/**
 * Formats a parameter count given in billions as a short label such as
 * "7.0B" or "175B".
 *
 * @param value Parameter count in billions.
 * @returns `null` when the value is missing or not a finite number; a whole
 *   number of billions once the value rounds to 100 or more; otherwise one
 *   decimal place.
 */
function formatParamsBillions(value: number | null | undefined) {
  // Reject Infinity as well as NaN, matching the guard in formatScoreValue.
  if (value == null || !Number.isFinite(value)) return null

  const oneDecimal = value.toFixed(1)
  // Decide on the rounded figure so 99.96 becomes "100B" rather than "100.0B".
  if (Number(oneDecimal) >= 100) return `${Math.round(value)}B`
  return `${oneDecimal}B`
}
/**
 * Turns a raw score into display text.
 *
 * Values from 0 to 1 inclusive are shown as a percentage with one decimal;
 * values whose magnitude is 100 or more are shown without decimals; anything
 * else gets two decimals. Missing or non-finite input yields `null`.
 */
function formatScoreValue(value: number | null | undefined) {
  if (typeof value !== "number" || !Number.isFinite(value)) return null

  const isFraction = value >= 0 && value <= 1
  if (isFraction) return `${(value * 100).toFixed(1)}%`

  const decimals = Math.abs(value) >= 100 ? 0 : 2
  return value.toFixed(decimals)
}
/**
 * Picks the short coverage label shown in the card header.
 *
 * Preference order: the benchmark-suite count (when above 0), then the name
 * of the latest source, then the generic text "Coverage summary".
 */
function getCoverageSummaryLabel(data: BenchmarkEvaluationCardData) {
  const { benchmarks_count: suiteCount, latest_source_name: sourceName } = data

  if (suiteCount > 0) {
    const suffix = suiteCount === 1 ? "" : "s"
    return `${suiteCount} benchmark suite${suffix} surfaced`
  }

  return sourceName ? sourceName : "Coverage summary"
}
/**
 * Lists the distinct benchmark names to show on the card, in first-seen order.
 *
 * Names come from `top_scores` when it yields any; otherwise from
 * `benchmark_names`; otherwise the list is empty.
 */
function getTopBenchmarks(data: BenchmarkEvaluationCardData) {
  const fromScores = [...new Set(data.top_scores.map(({ benchmark }) => benchmark))]
  if (fromScores.length > 0) return fromScores

  const declaredNames = data.benchmark_names
  return declaredNames?.length ? [...new Set(declaredNames)] : []
}
/** Bar and legend-dot colours for the category names that have a dedicated colour. */
const CATEGORY_PLOT_COLORS: Record<string, string> = {
  "General": "#2563eb",
  "Reasoning": "#7c3aed",
  "Agentic": "#ea580c",
  "Safety": "#16a34a",
  "Knowledge": "#0f766e",
}

/**
 * Returns the plot colour for a category name.
 *
 * @param category Category name to look up.
 * @returns The mapped hex colour, or the neutral "#64748b" for any name that
 *   has no entry of its own in the colour table.
 */
function getCategoryPlotColor(category: string) {
  // Own-property check: a bare lookup on a plain object would resolve names
  // such as "toString" or "constructor" to inherited functions instead of
  // falling through to the default colour.
  const mapped = Object.prototype.hasOwnProperty.call(CATEGORY_PLOT_COLORS, category)
    ? CATEGORY_PLOT_COLORS[category]
    : undefined
  return mapped ?? "#64748b"
}
/**
 * Renders category coverage as a segmented horizontal bar plus a legend.
 *
 * Each bar segment's width is that category's share of the summed counts.
 * The legend lists the first four entries of `coverage` in the order given.
 * When there are no entries, or the counts do not sum to a positive number,
 * a placeholder message is rendered instead of the bar.
 */
function CategoryCoveragePlot({
  coverage,
}: {
  coverage: Array<{ category: CategoryType; count: number }>
}) {
  const totalCount = coverage.reduce((sum, item) => sum + item.count, 0)

  // A non-positive (or NaN) total would turn every width into "NaN%" or
  // "Infinity%", so treat it the same as having nothing to plot.
  if (coverage.length === 0 || !(totalCount > 0)) {
    return (
      <div className="rounded-xl border border-dashed border-border/60 px-3 py-4 text-sm text-muted-foreground">
        No category coverage recorded.
      </div>
    )
  }

  return (
    <div className="space-y-2">
      <div
        className="flex h-3 w-full items-stretch gap-1 rounded-full bg-muted/70"
        aria-label="Category coverage distribution"
        role="img"
      >
        {coverage.map((item) => (
          <div
            key={item.category}
            className="min-w-2 rounded-full"
            style={{
              width: `${(item.count / totalCount) * 100}%`,
              backgroundColor: getCategoryPlotColor(item.category),
            }}
            title={`${item.category}: ${item.count} benchmark${item.count !== 1 ? "s" : ""}`}
          />
        ))}
      </div>
      <div className="flex flex-wrap gap-1.5">
        {coverage.slice(0, 4).map((item) => (
          <span
            key={item.category}
            className={`inline-flex items-center gap-1 rounded-full border px-2 py-0.5 text-[10px] font-semibold ${getCategoryColor(item.category)}`}
          >
            <span
              className="h-1.5 w-1.5 rounded-full"
              style={{ backgroundColor: getCategoryPlotColor(item.category) }}
            />
            {item.category}
            <span className="opacity-70">{item.count}</span>
          </span>
        ))}
      </div>
    </div>
  )
}
/** Returns `singular` when `count` is exactly 1, otherwise `plural`. */
function pluralize(count: number, singular: string, plural: string) {
  return count === 1 ? singular : plural
}

/**
 * Summary card for a single model's benchmark evaluations.
 *
 * Clicking anywhere on the card navigates to `/models/<route_id>`. Interactive
 * children (compare button, actions menu, collapsible section, "Open card"
 * button) stop click propagation so they do not also trigger that navigation.
 * Several labels change wording depending on whether the audience mode is
 * "research" or anything else.
 *
 * @param data Model summary to render.
 * @param benchmarkCards Optional benchmark metadata used to derive domain chips.
 * @param onDelete When provided, adds a "Remove" menu item that calls it with `data.id`.
 * @param delayMs Value written to the `--enter-delay` CSS custom property, in milliseconds.
 * @param selectedForCompare Whether the compare button renders in its selected state.
 * @param onToggleCompare When provided, adds a compare button that calls it with `data.id`.
 */
export function BenchmarkEvaluationCard({
  data,
  benchmarkCards,
  onDelete,
  delayMs = 0,
  selectedForCompare = false,
  onToggleCompare,
}: BenchmarkEvaluationCardProps) {
  const router = useRouter()
  const { mode } = useAudienceMode()
  const isResearchView = mode === "research"

  const detailHref = `/models/${data.route_id}`
  const openDetails = () => router.push(detailHref)

  // Collect unique domains from this model's benchmarks using metadata cards,
  // ordered by how many benchmarks mention them, then alphabetically.
  const modelDomains = useMemo(() => {
    if (!benchmarkCards) return []
    const domainCounts = new Map<string, number>()
    for (const { benchmark } of data.top_scores) {
      const card = lookupBenchmarkCard(benchmarkCards, benchmark)
      for (const domain of card?.benchmark_details?.domains ?? []) {
        domainCounts.set(domain, (domainCounts.get(domain) ?? 0) + 1)
      }
    }
    return Array.from(domainCounts.entries())
      .sort((a, b) => b[1] - a[1] || a[0].localeCompare(b[0]))
      .map(([domain]) => domain)
  }, [benchmarkCards, data.top_scores])

  // Categories with at least one result, largest first, ties broken by name.
  const categoryCoverage = useMemo(
    () =>
      Object.entries(data.category_stats)
        .filter(([, count]) => count > 0)
        .sort((a, b) => b[1] - a[1] || a[0].localeCompare(b[0]))
        .map(([category, count]) => ({
          category: category as CategoryType,
          count,
        })),
    [data.category_stats]
  )

  const paramsBillions = formatParamsBillions(data.params_billions)
  const coverageSummaryLabel = getCoverageSummaryLabel(data)
  const topBenchmarks = getTopBenchmarks(data)

  // De-duplicate so identical bounds read "50.0%" rather than "50.0% to 50.0%".
  const scoreRange = Array.from(
    new Set(
      [formatScoreValue(data.score_summary?.min), formatScoreValue(data.score_summary?.max)].filter(
        (value): value is string => Boolean(value)
      )
    )
  ).join(" to ")

  const reproducibilityGapCount = data.reproducibility_summary?.has_reproducibility_gap_count ?? 0
  const reproducibilityTotal = data.reproducibility_summary?.results_total ?? data.evaluations_count
  const gapNoun = pluralize(reproducibilityGapCount, "gap", "gaps")

  return (
    <Card
      className="motion-academic-enter motion-academic-surface motion-academic-hover group cursor-pointer overflow-hidden border-border/70 bg-card hover:shadow-xl"
      style={{ "--enter-delay": `${delayMs}ms` } as CSSProperties}
      onClick={openDetails}
    >
      <CardHeader className="space-y-4 border-b border-border/60 pb-4">
        <div className="flex flex-wrap items-center justify-between gap-2 text-[10px] font-semibold uppercase tracking-[0.24em] text-muted-foreground">
          <span>Model Summary</span>
          <div className="flex flex-wrap items-center gap-2 text-[11px] font-medium tracking-[0.08em] text-muted-foreground/90">
            <span className="max-w-[13rem] truncate">{coverageSummaryLabel}</span>
            <span className="text-border">/</span>
            <span>{formatDate(data.latest_timestamp)}</span>
          </div>
        </div>
        <div className="flex items-start justify-between gap-4">
          <div className="min-w-0 flex-1 space-y-2">
            <div>
              <CardTitle className="truncate text-lg font-bold transition-colors group-hover:text-primary sm:text-xl">
                {data.model_name}
              </CardTitle>
              <div className="mt-0.5 text-sm text-muted-foreground">
                {data.developer || "Unknown developer"}
              </div>
            </div>
            <div className="flex flex-wrap gap-2">
              {data.variant_count > 1 && (
                <Badge variant="secondary">{data.variant_count} versions</Badge>
              )}
              {paramsBillions && <Badge variant="secondary">{paramsBillions} parameters</Badge>}
              <Badge variant="outline">
                {data.benchmarks_count} {pluralize(data.benchmarks_count, "benchmark suite", "benchmark suites")}
              </Badge>
              <Badge variant="outline">
                {data.evaluations_count} {pluralize(data.evaluations_count, "reported result", "reported results")}
              </Badge>
              {reproducibilityGapCount > 0 && (
                <Badge
                  className="border-amber-300 bg-amber-50 text-amber-900 hover:bg-amber-50 dark:border-amber-900/60 dark:bg-amber-950/40 dark:text-amber-100"
                  title={
                    isResearchView
                      ? `${reproducibilityGapCount} of ${reproducibilityTotal} reported scores have at least one missing setup field.`
                      : `${reproducibilityGapCount} of ${reproducibilityTotal} reported scores cannot be independently re-run because the setup is not documented.`
                  }
                >
                  <AlertTriangle className="h-3 w-3" />
                  {isResearchView
                    ? `${reproducibilityGapCount} reproducibility ${gapNoun}`
                    : `${reproducibilityGapCount} re-run ${gapNoun}`}
                </Badge>
              )}
            </div>
          </div>
          <div className="flex items-center gap-2">
            {onToggleCompare ? (
              <Button
                variant={selectedForCompare ? "default" : "outline"}
                size="sm"
                className="shrink-0"
                onClick={(event) => {
                  event.stopPropagation()
                  onToggleCompare(data.id)
                }}
              >
                {selectedForCompare ? "Selected" : "Compare"}
              </Button>
            ) : null}
            <DropdownMenu>
              <DropdownMenuTrigger asChild>
                <Button
                  variant="ghost"
                  size="icon"
                  aria-label="More actions"
                  // Stay visible on keyboard focus and while the menu is open,
                  // not only while the card is hovered.
                  className="motion-academic-button opacity-0 transition-opacity group-hover:opacity-100 focus-visible:opacity-100 data-[state=open]:opacity-100"
                  onClick={(event) => event.stopPropagation()}
                >
                  <MoreHorizontal className="h-4 w-4" />
                </Button>
              </DropdownMenuTrigger>
              {/*
                React events bubble through the component tree even when the
                menu is rendered in a portal, so without this every menu click
                would also reach the Card's onClick and navigate.
              */}
              <DropdownMenuContent align="end" onClick={(event) => event.stopPropagation()}>
                <DropdownMenuItem onClick={openDetails}>
                  <Eye className="mr-2 h-4 w-4" />
                  View Details
                </DropdownMenuItem>
                {data.source_urls.length > 0 && (
                  <DropdownMenuItem
                    // noopener/noreferrer: the URL is external data and must not
                    // receive a handle to this window.
                    onClick={() => window.open(data.source_urls[0], "_blank", "noopener,noreferrer")}
                  >
                    <ExternalLink className="mr-2 h-4 w-4" />
                    View Source
                  </DropdownMenuItem>
                )}
                {onDelete && (
                  <DropdownMenuItem onClick={() => onDelete(data.id)} className="text-destructive">
                    {/* NOTE(review): Award is an odd icon for a destructive action; a trash icon would need a new lucide-react import. */}
                    <Award className="mr-2 h-4 w-4" />
                    Remove
                  </DropdownMenuItem>
                )}
              </DropdownMenuContent>
            </DropdownMenu>
          </div>
        </div>
      </CardHeader>
      <CardContent className="space-y-4 pt-4">
        <div className="rounded-2xl border border-border/70 bg-muted/10 px-4 py-3">
          <div className="flex flex-wrap items-center justify-between gap-3">
            <div>
              <div className="text-[11px] font-semibold uppercase tracking-[0.18em] text-muted-foreground">
                Category coverage
              </div>
              <div className="mt-1 text-sm text-muted-foreground">
                {categoryCoverage.length} {categoryCoverage.length === 1 ? "category" : "categories"}
              </div>
            </div>
            <div className="text-right">
              <div className="text-lg font-semibold tabular-nums text-foreground">{data.evaluations_count}</div>
              <div className="text-[11px] uppercase tracking-[0.18em] text-muted-foreground">
                {pluralize(data.evaluations_count, "reported result", "reported results")}
              </div>
            </div>
          </div>
          <div className="mt-3">
            <CategoryCoveragePlot coverage={categoryCoverage} />
          </div>
        </div>
        {modelDomains.length > 0 && (
          <div className="space-y-2">
            <div className="text-[11px] font-semibold uppercase tracking-[0.18em] text-muted-foreground">
              Top domain coverage
            </div>
            <div className="flex flex-wrap gap-1.5">
              {modelDomains.slice(0, 5).map((domain) => (
                <span
                  key={domain}
                  className="inline-flex items-center rounded-full border border-border/50 bg-muted/40 px-2.5 py-0.5 text-[11px] font-medium capitalize text-muted-foreground"
                >
                  {domain}
                </span>
              ))}
              {modelDomains.length > 5 && (
                <span className="inline-flex items-center rounded-full border border-border/50 bg-muted/40 px-2.5 py-0.5 text-[11px] font-medium text-muted-foreground">
                  +{modelDomains.length - 5} more
                </span>
              )}
            </div>
          </div>
        )}
        {topBenchmarks.length > 0 && (
          <div className="space-y-2">
            <div className="text-[11px] font-semibold uppercase tracking-[0.18em] text-muted-foreground">
              Covered benchmarks
            </div>
            <div className="flex flex-wrap gap-1.5">
              {topBenchmarks.slice(0, 6).map((benchmark) => (
                <span
                  key={benchmark}
                  className="inline-flex items-center rounded-full border border-border/50 bg-background px-2.5 py-0.5 text-[11px] font-medium text-foreground/85"
                >
                  {benchmark}
                </span>
              ))}
              {topBenchmarks.length > 6 && (
                <span className="inline-flex items-center rounded-full border border-dashed border-border/50 bg-background px-2.5 py-0.5 text-[11px] font-medium text-muted-foreground">
                  See the full list in details
                </span>
              )}
            </div>
          </div>
        )}
        <Collapsible className="rounded-2xl border border-border/70 bg-background">
          <CollapsibleTrigger asChild>
            <button
              type="button"
              onClick={(event) => event.stopPropagation()}
              className="flex w-full items-center justify-between px-4 py-3 text-left"
            >
              <div>
                <div className="text-[11px] font-semibold uppercase tracking-[0.18em] text-muted-foreground">
                  Dive Deeper
                </div>
                <div className="mt-1 text-sm font-semibold text-foreground">
                  {isResearchView ? "Coverage and score summary" : "Coverage and benchmark context"}
                </div>
              </div>
              <ChevronDown className="h-4 w-4 text-muted-foreground" />
            </button>
          </CollapsibleTrigger>
          <CollapsibleContent onClick={(event) => event.stopPropagation()} className="border-t border-border/60 px-4 py-4">
            <div className="space-y-0 text-sm">
              <KeyValueRow label={isResearchView ? "Coverage" : "Benchmark coverage"} value={coverageSummaryLabel} />
              {topBenchmarks.length > 0 && (
                <KeyValueRow label="Benchmarks" value={topBenchmarks.slice(0, 6).join(", ")} />
              )}
              {scoreRange && (
                <KeyValueRow label={isResearchView ? "Score span" : "Score range"} value={scoreRange} />
              )}
              <KeyValueRow label="Updated" value={formatDate(data.latest_timestamp)} />
              {data.architecture && isResearchView && (
                <KeyValueRow label="Architecture" value={data.architecture} />
              )}
              {data.source_types.length > 0 && (
                <KeyValueRow
                  label={isResearchView ? "Artifact type" : "Source type"}
                  value={data.source_types.map((s) => s.replace(/_/g, " ")).join(", ")}
                />
              )}
              {reproducibilityGapCount > 0 && (
                <KeyValueRow
                  label={isResearchView ? "Re-runnability" : "Re-run readiness"}
                  value={
                    isResearchView
                      ? `${reproducibilityGapCount} of ${reproducibilityTotal} reported scores are not fully documented`
                      : `${reproducibilityGapCount} of ${reproducibilityTotal} scores cannot be re-run with the information available`
                  }
                />
              )}
            </div>
          </CollapsibleContent>
        </Collapsible>
        <div className="flex items-center justify-between rounded-2xl border border-border/70 bg-muted/10 px-4 py-3">
          <div>
            <div className="text-[11px] font-semibold uppercase tracking-[0.18em] text-muted-foreground">
              Full record
            </div>
            <div className="mt-1 text-sm text-muted-foreground">
              Open for the full benchmark list, provenance, and comparison detail.
            </div>
          </div>
          <Button
            variant="ghost"
            size="sm"
            className="gap-1"
            onClick={(event) => {
              event.stopPropagation()
              openDetails()
            }}
          >
            Open card
            <ChevronRight className="h-4 w-4" />
          </Button>
        </div>
      </CardContent>
    </Card>
  )
}
/**
 * A single label/value line in a two-column grid: muted label on the left,
 * right-aligned value on the right. Rows are separated by a bottom border,
 * which is dropped on the last row.
 */
function KeyValueRow({ label, value }: { label: string; value: string }) {
  const rowClassName =
    "grid grid-cols-[7rem_minmax(0,1fr)] items-start gap-x-3 border-b border-border/40 py-2 last:border-b-0 last:pb-0 first:pt-0"
  const valueClassName =
    "min-w-0 max-w-full justify-self-end text-right font-medium leading-tight text-foreground break-words"

  return (
    <div className={rowClassName}>
      <span className="text-muted-foreground">{label}</span>
      <span className={valueClassName}>{value}</span>
    </div>
  )
}