Spaces:
Running
Running
Improve eval score displays and summary fallbacks
Browse files
- app/evals/[id]/page.tsx +4 -4
- components/eval-detail.tsx +20 -29
- lib/model-data.ts +165 -6
app/evals/[id]/page.tsx
CHANGED
|
@@ -245,7 +245,7 @@ function CompositeEvalView({
|
|
| 245 |
</TabsTrigger>
|
| 246 |
<TabsTrigger value="matrix" className="gap-2">
|
| 247 |
<Grid3X3 className="h-4 w-4" />
|
| 248 |
-
|
| 249 |
</TabsTrigger>
|
| 250 |
</TabsList>
|
| 251 |
|
|
@@ -496,9 +496,9 @@ function MatrixLeaderboard({
|
|
| 496 |
}
|
| 497 |
|
| 498 |
function formatScore(score: number): string {
|
| 499 |
-
if (score >=
|
| 500 |
-
if (score >
|
| 501 |
-
return score.toFixed(
|
| 502 |
}
|
| 503 |
|
| 504 |
function handleSort(col: string) {
|
|
|
|
| 245 |
</TabsTrigger>
|
| 246 |
<TabsTrigger value="matrix" className="gap-2">
|
| 247 |
<Grid3X3 className="h-4 w-4" />
|
| 248 |
+
Score breakdown
|
| 249 |
</TabsTrigger>
|
| 250 |
</TabsList>
|
| 251 |
|
|
|
|
| 496 |
}
|
| 497 |
|
| 498 |
/**
 * Formats a score for display, using fewer decimals as the magnitude grows.
 *
 * - |score| >= 100: one decimal place
 * - |score| >= 10:  two decimal places
 * - otherwise:      up to three decimal places, with trailing zeros (and a
 *                   dangling decimal point) removed
 *
 * @param score - The numeric score to format.
 * @returns The formatted score string.
 */
function formatScore(score: number): string {
  const magnitude = Math.abs(score)
  if (magnitude >= 100) return score.toFixed(1)
  if (magnitude >= 10) return score.toFixed(2)

  const trimmed = score.toFixed(3).replace(/0+$/, "").replace(/\.$/, "")
  // A tiny negative value rounds to "-0.000", which trims to "-0"; show it as plain zero.
  return trimmed === "-0" ? "0" : trimmed
}
|
| 503 |
|
| 504 |
function handleSort(col: string) {
|
components/eval-detail.tsx
CHANGED
|
@@ -199,9 +199,17 @@ function formatMetadataValue(value: unknown): string {
|
|
| 199 |
}
|
| 200 |
|
| 201 |
function formatDate(ts: string) {
|
|
|
|
|
|
|
|
|
|
|
|
|
| 202 |
const numeric = Number(ts)
|
| 203 |
const parsedDate = !Number.isNaN(numeric) && !ts.includes("-") ? new Date(numeric * 1000) : new Date(ts)
|
| 204 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 205 |
try {
|
| 206 |
return parsedDate.toLocaleDateString("en-US", {
|
| 207 |
year: "numeric",
|
|
@@ -322,7 +330,7 @@ export function EvalDetail({ summary }: EvalDetailProps) {
|
|
| 322 |
[leaderboardRows, leaderboardPage]
|
| 323 |
)
|
| 324 |
|
| 325 |
-
const
|
| 326 |
const scoreDirectionLabel = summary.metric_config.lower_is_better ? "Lower scores rank higher" : "Higher scores rank higher"
|
| 327 |
const leaderboardTitle = isResearchView ? "Leaderboard" : "Reporting Comparison"
|
| 328 |
const sourceDatasetLabel = summary.source_data?.hf_repo ?? summary.source_data?.dataset_name ?? "Backend summary"
|
|
@@ -331,8 +339,8 @@ export function EvalDetail({ summary }: EvalDetailProps) {
|
|
| 331 |
: "Not linked"
|
| 332 |
const leaderboardDescription = isResearchView
|
| 333 |
? summary.is_aggregated
|
| 334 |
-
? "Models ranked by average
|
| 335 |
-
: "Models ranked by
|
| 336 |
: summary.is_aggregated
|
| 337 |
? "Averaged model results across the contributing composite benchmarks, with drill-down to each component score."
|
| 338 |
: "Model results with benchmark context, source dataset detail, and optional instance-data links."
|
|
@@ -396,9 +404,9 @@ export function EvalDetail({ summary }: EvalDetailProps) {
|
|
| 396 |
</div>
|
| 397 |
<div className="rounded-2xl border border-border/70 bg-muted/20 px-4 py-3">
|
| 398 |
<div className="text-[11px] font-semibold uppercase tracking-[0.18em] text-muted-foreground">
|
| 399 |
-
{isResearchView ? "Avg
|
| 400 |
</div>
|
| 401 |
-
<div className="mt-1 text-2xl font-semibold">{isResearchView ?
|
| 402 |
</div>
|
| 403 |
<div className="rounded-2xl border border-emerald-200/80 bg-emerald-50/80 px-4 py-3 dark:border-emerald-900/40 dark:bg-emerald-950/20">
|
| 404 |
<div className="text-[11px] font-semibold uppercase tracking-[0.18em] text-emerald-700 dark:text-emerald-200">
|
|
@@ -409,7 +417,7 @@ export function EvalDetail({ summary }: EvalDetailProps) {
|
|
| 409 |
</div>
|
| 410 |
{isResearchView && summary.best_model && (
|
| 411 |
<div className="mt-1 text-xs text-emerald-700/80 dark:text-emerald-200/80">
|
| 412 |
-
{
|
| 413 |
</div>
|
| 414 |
)}
|
| 415 |
</div>
|
|
@@ -424,7 +432,7 @@ export function EvalDetail({ summary }: EvalDetailProps) {
|
|
| 424 |
</div>
|
| 425 |
{isResearchView && summary.worst_model && (
|
| 426 |
<div className="mt-1 text-xs text-amber-700/80 dark:text-amber-200/80">
|
| 427 |
-
{
|
| 428 |
</div>
|
| 429 |
)}
|
| 430 |
</div>
|
|
@@ -703,21 +711,13 @@ export function EvalDetail({ summary }: EvalDetailProps) {
|
|
| 703 |
</TableCell>
|
| 704 |
|
| 705 |
<TableCell className="text-right">
|
| 706 |
-
<div className="
|
| 707 |
-
<div className="text-xl font-semibold tabular-nums">{formatPercent(normalizedScore)}</div>
|
| 708 |
-
<div className="text-xs text-muted-foreground">
|
| 709 |
-
Raw {formatRawScore(modelResult.score, summary.metric_config.unit)}
|
| 710 |
-
</div>
|
| 711 |
-
</div>
|
| 712 |
</TableCell>
|
| 713 |
|
| 714 |
{isResearchView ? (
|
| 715 |
<TableCell className="hidden md:table-cell">
|
| 716 |
-
<div className="
|
| 717 |
-
<Progress value={normalizedScore * 100} className="h-2
|
| 718 |
-
<span className="w-12 text-right text-xs tabular-nums text-muted-foreground">
|
| 719 |
-
{formatPercent(normalizedScore)}
|
| 720 |
-
</span>
|
| 721 |
</div>
|
| 722 |
</TableCell>
|
| 723 |
) : (
|
|
@@ -846,15 +846,11 @@ export function EvalDetail({ summary }: EvalDetailProps) {
|
|
| 846 |
subtitle={
|
| 847 |
isResearchView
|
| 848 |
? "Raw metric values and uncertainty details."
|
| 849 |
-
: "
|
| 850 |
}
|
| 851 |
>
|
| 852 |
<MetaRow
|
| 853 |
-
label={modelResult.aggregate_components ? "Average Score" : "
|
| 854 |
-
value={formatPercent(normalizedScore)}
|
| 855 |
-
/>
|
| 856 |
-
<MetaRow
|
| 857 |
-
label={modelResult.aggregate_components ? "Average Raw Value" : "Raw Score"}
|
| 858 |
value={formatRawScore(modelResult.score, summary.metric_config.unit)}
|
| 859 |
/>
|
| 860 |
<MetaRow label="Score Type" value={modelResult.result.metric_config.score_type} />
|
|
@@ -888,7 +884,6 @@ export function EvalDetail({ summary }: EvalDetailProps) {
|
|
| 888 |
<th className="px-3 py-2 text-left text-[11px] font-semibold uppercase tracking-wider text-muted-foreground">Benchmark</th>
|
| 889 |
<th className="px-3 py-2 text-left text-[11px] font-semibold uppercase tracking-wider text-muted-foreground">Source</th>
|
| 890 |
<th className="px-3 py-2 text-right text-[11px] font-semibold uppercase tracking-wider text-muted-foreground">Raw</th>
|
| 891 |
-
<th className="px-3 py-2 text-right text-[11px] font-semibold uppercase tracking-wider text-muted-foreground">Score</th>
|
| 892 |
</tr>
|
| 893 |
</thead>
|
| 894 |
<tbody>
|
|
@@ -897,7 +892,6 @@ export function EvalDetail({ summary }: EvalDetailProps) {
|
|
| 897 |
<td className="px-3 py-2 font-medium">{component.composite_benchmark_name}</td>
|
| 898 |
<td className="px-3 py-2 text-muted-foreground">{component.source_organization_name}</td>
|
| 899 |
<td className="px-3 py-2 text-right tabular-nums text-muted-foreground">{formatRawScore(component.score)}</td>
|
| 900 |
-
<td className="px-3 py-2 text-right font-semibold tabular-nums">{formatPercent(component.normalized_score)}</td>
|
| 901 |
</tr>
|
| 902 |
))}
|
| 903 |
</tbody>
|
|
@@ -917,18 +911,15 @@ export function EvalDetail({ summary }: EvalDetailProps) {
|
|
| 917 |
<tr className="border-b bg-muted/30">
|
| 918 |
<th className="px-3 py-2 text-left text-[11px] font-semibold uppercase tracking-wider text-muted-foreground">Subtask</th>
|
| 919 |
<th className="px-3 py-2 text-right text-[11px] font-semibold uppercase tracking-wider text-muted-foreground">Raw</th>
|
| 920 |
-
<th className="px-3 py-2 text-right text-[11px] font-semibold uppercase tracking-wider text-muted-foreground">Score</th>
|
| 921 |
</tr>
|
| 922 |
</thead>
|
| 923 |
<tbody>
|
| 924 |
{subtasks.map(([subtaskName, value]) => {
|
| 925 |
const numericValue = value as number
|
| 926 |
-
const normalizedSubtaskScore = range > 0 ? (numericValue - minScore) / range : numericValue
|
| 927 |
return (
|
| 928 |
<tr key={subtaskName} className="border-b last:border-0 hover:bg-muted/10">
|
| 929 |
<td className="px-3 py-2 font-medium capitalize">{subtaskName.replace(/_/g, " ")}</td>
|
| 930 |
<td className="px-3 py-2 text-right tabular-nums text-muted-foreground">{formatRawScore(numericValue, summary.metric_config.unit)}</td>
|
| 931 |
-
<td className="px-3 py-2 text-right font-semibold tabular-nums">{formatPercent(normalizedSubtaskScore)}</td>
|
| 932 |
</tr>
|
| 933 |
)
|
| 934 |
})}
|
|
|
|
| 199 |
}
|
| 200 |
|
| 201 |
function formatDate(ts: string) {
|
| 202 |
+
if (!ts || !ts.trim()) {
|
| 203 |
+
return "Unknown"
|
| 204 |
+
}
|
| 205 |
+
|
| 206 |
const numeric = Number(ts)
|
| 207 |
const parsedDate = !Number.isNaN(numeric) && !ts.includes("-") ? new Date(numeric * 1000) : new Date(ts)
|
| 208 |
|
| 209 |
+
if (Number.isNaN(parsedDate.getTime())) {
|
| 210 |
+
return "Unknown"
|
| 211 |
+
}
|
| 212 |
+
|
| 213 |
try {
|
| 214 |
return parsedDate.toLocaleDateString("en-US", {
|
| 215 |
year: "numeric",
|
|
|
|
| 330 |
[leaderboardRows, leaderboardPage]
|
| 331 |
)
|
| 332 |
|
| 333 |
+
const avgScoreLabel = formatRawScore(summary.avg_score, summary.metric_config.unit)
|
| 334 |
const scoreDirectionLabel = summary.metric_config.lower_is_better ? "Lower scores rank higher" : "Higher scores rank higher"
|
| 335 |
const leaderboardTitle = isResearchView ? "Leaderboard" : "Reporting Comparison"
|
| 336 |
const sourceDatasetLabel = summary.source_data?.hf_repo ?? summary.source_data?.dataset_name ?? "Backend summary"
|
|
|
|
| 339 |
: "Not linked"
|
| 340 |
const leaderboardDescription = isResearchView
|
| 341 |
? summary.is_aggregated
|
| 342 |
+
? "Models ranked by average raw score across the contributing composite benchmarks."
|
| 343 |
+
: "Models ranked by raw score for this benchmark."
|
| 344 |
: summary.is_aggregated
|
| 345 |
? "Averaged model results across the contributing composite benchmarks, with drill-down to each component score."
|
| 346 |
: "Model results with benchmark context, source dataset detail, and optional instance-data links."
|
|
|
|
| 404 |
</div>
|
| 405 |
<div className="rounded-2xl border border-border/70 bg-muted/20 px-4 py-3">
|
| 406 |
<div className="text-[11px] font-semibold uppercase tracking-[0.18em] text-muted-foreground">
|
| 407 |
+
{isResearchView ? "Avg score" : "Metrics"}
|
| 408 |
</div>
|
| 409 |
+
<div className="mt-1 text-2xl font-semibold">{isResearchView ? avgScoreLabel : summary.metrics_count ?? 1}</div>
|
| 410 |
</div>
|
| 411 |
<div className="rounded-2xl border border-emerald-200/80 bg-emerald-50/80 px-4 py-3 dark:border-emerald-900/40 dark:bg-emerald-950/20">
|
| 412 |
<div className="text-[11px] font-semibold uppercase tracking-[0.18em] text-emerald-700 dark:text-emerald-200">
|
|
|
|
| 417 |
</div>
|
| 418 |
{isResearchView && summary.best_model && (
|
| 419 |
<div className="mt-1 text-xs text-emerald-700/80 dark:text-emerald-200/80">
|
| 420 |
+
{formatRawScore(summary.best_model.score, summary.metric_config.unit)}
|
| 421 |
</div>
|
| 422 |
)}
|
| 423 |
</div>
|
|
|
|
| 432 |
</div>
|
| 433 |
{isResearchView && summary.worst_model && (
|
| 434 |
<div className="mt-1 text-xs text-amber-700/80 dark:text-amber-200/80">
|
| 435 |
+
{formatRawScore(summary.worst_model.score, summary.metric_config.unit)}
|
| 436 |
</div>
|
| 437 |
)}
|
| 438 |
</div>
|
|
|
|
| 711 |
</TableCell>
|
| 712 |
|
| 713 |
<TableCell className="text-right">
|
| 714 |
+
<div className="text-xl font-semibold tabular-nums">{formatRawScore(modelResult.score, summary.metric_config.unit)}</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 715 |
</TableCell>
|
| 716 |
|
| 717 |
{isResearchView ? (
|
| 718 |
<TableCell className="hidden md:table-cell">
|
| 719 |
+
<div className="min-w-[220px]">
|
| 720 |
+
<Progress value={normalizedScore * 100} className="h-2" />
|
|
|
|
|
|
|
|
|
|
| 721 |
</div>
|
| 722 |
</TableCell>
|
| 723 |
) : (
|
|
|
|
| 846 |
subtitle={
|
| 847 |
isResearchView
|
| 848 |
? "Raw metric values and uncertainty details."
|
| 849 |
+
: "Raw performance plus uncertainty and sample details."
|
| 850 |
}
|
| 851 |
>
|
| 852 |
<MetaRow
|
| 853 |
+
label={modelResult.aggregate_components ? "Average Raw Score" : "Raw Score"}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 854 |
value={formatRawScore(modelResult.score, summary.metric_config.unit)}
|
| 855 |
/>
|
| 856 |
<MetaRow label="Score Type" value={modelResult.result.metric_config.score_type} />
|
|
|
|
| 884 |
<th className="px-3 py-2 text-left text-[11px] font-semibold uppercase tracking-wider text-muted-foreground">Benchmark</th>
|
| 885 |
<th className="px-3 py-2 text-left text-[11px] font-semibold uppercase tracking-wider text-muted-foreground">Source</th>
|
| 886 |
<th className="px-3 py-2 text-right text-[11px] font-semibold uppercase tracking-wider text-muted-foreground">Raw</th>
|
|
|
|
| 887 |
</tr>
|
| 888 |
</thead>
|
| 889 |
<tbody>
|
|
|
|
| 892 |
<td className="px-3 py-2 font-medium">{component.composite_benchmark_name}</td>
|
| 893 |
<td className="px-3 py-2 text-muted-foreground">{component.source_organization_name}</td>
|
| 894 |
<td className="px-3 py-2 text-right tabular-nums text-muted-foreground">{formatRawScore(component.score)}</td>
|
|
|
|
| 895 |
</tr>
|
| 896 |
))}
|
| 897 |
</tbody>
|
|
|
|
| 911 |
<tr className="border-b bg-muted/30">
|
| 912 |
<th className="px-3 py-2 text-left text-[11px] font-semibold uppercase tracking-wider text-muted-foreground">Subtask</th>
|
| 913 |
<th className="px-3 py-2 text-right text-[11px] font-semibold uppercase tracking-wider text-muted-foreground">Raw</th>
|
|
|
|
| 914 |
</tr>
|
| 915 |
</thead>
|
| 916 |
<tbody>
|
| 917 |
{subtasks.map(([subtaskName, value]) => {
|
| 918 |
const numericValue = value as number
|
|
|
|
| 919 |
return (
|
| 920 |
<tr key={subtaskName} className="border-b last:border-0 hover:bg-muted/10">
|
| 921 |
<td className="px-3 py-2 font-medium capitalize">{subtaskName.replace(/_/g, " ")}</td>
|
| 922 |
<td className="px-3 py-2 text-right tabular-nums text-muted-foreground">{formatRawScore(numericValue, summary.metric_config.unit)}</td>
|
|
|
|
| 923 |
</tr>
|
| 924 |
)
|
| 925 |
})}
|
lib/model-data.ts
CHANGED
|
@@ -443,11 +443,171 @@ function hfEvalEntryToListItem(entry: {
|
|
| 443 |
// ---------------------------------------------------------------------------
|
| 444 |
|
| 445 |
function hfEvalDetailToSummary(detail: HFEvalDetail): BenchmarkEvalSummary {
|
| 446 |
-
// New structure: detail.metrics is an array of metric objects, each with model_results
|
| 447 |
-
// Use the first metric as the primary for the summary leaderboard
|
| 448 |
const evalName = detail.benchmark_leaf_name || detail.eval_summary_id || "Unknown"
|
| 449 |
const benchmarkKey = detail.benchmark ?? ""
|
|
|
|
|
|
|
| 450 |
const allMetrics = detail.metrics ?? []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 451 |
const primaryMetric = allMetrics[0]
|
| 452 |
if (!primaryMetric) {
|
| 453 |
return {
|
|
@@ -473,8 +633,7 @@ function hfEvalDetailToSummary(detail: HFEvalDetail): BenchmarkEvalSummary {
|
|
| 473 |
}
|
| 474 |
|
| 475 |
const modelResults: ModelResultForBenchmark[] = (primaryMetric.model_results ?? []).map((mr) => {
|
| 476 |
-
const
|
| 477 |
-
const sourceOrganization = detail.source_data?.hf_repo || sourceName
|
| 478 |
const modelInfo: ModelInfo = {
|
| 479 |
name: mr.model_name ?? "",
|
| 480 |
id: mr.model_id ?? "",
|
|
@@ -483,7 +642,7 @@ function hfEvalDetailToSummary(detail: HFEvalDetail): BenchmarkEvalSummary {
|
|
| 483 |
|
| 484 |
const evaluationResult: EvaluationResult = {
|
| 485 |
evaluation_name: primaryMetric.metric_name ?? "",
|
| 486 |
-
evaluation_timestamp:
|
| 487 |
metric_config: {
|
| 488 |
evaluation_description: primaryMetric.metric_name ?? "",
|
| 489 |
lower_is_better: primaryMetric.lower_is_better ?? false,
|
|
@@ -502,7 +661,7 @@ function hfEvalDetailToSummary(detail: HFEvalDetail): BenchmarkEvalSummary {
|
|
| 502 |
model_route_id: mr.model_route_id,
|
| 503 |
score: mr.score ?? 0,
|
| 504 |
score_details: { score: mr.score ?? 0 },
|
| 505 |
-
evaluation_timestamp:
|
| 506 |
source_metadata: {
|
| 507 |
source_type: "documentation" as const,
|
| 508 |
source_name: sourceName,
|
|
|
|
| 443 |
// ---------------------------------------------------------------------------
|
| 444 |
|
| 445 |
function hfEvalDetailToSummary(detail: HFEvalDetail): BenchmarkEvalSummary {
|
|
|
|
|
|
|
| 446 |
const evalName = detail.benchmark_leaf_name || detail.eval_summary_id || "Unknown"
|
| 447 |
const benchmarkKey = detail.benchmark ?? ""
|
| 448 |
+
const sourceName = detail.source_data?.dataset_name || benchmarkKey
|
| 449 |
+
const sourceOrganization = detail.source_data?.hf_repo || sourceName
|
| 450 |
const allMetrics = detail.metrics ?? []
|
| 451 |
+
|
| 452 |
+
if (allMetrics.length === 0) {
|
| 453 |
+
const subtaskMetrics = (Array.isArray(detail.subtasks) ? detail.subtasks : [])
|
| 454 |
+
.flatMap((subtask) => {
|
| 455 |
+
if (!subtask || typeof subtask !== "object") {
|
| 456 |
+
return []
|
| 457 |
+
}
|
| 458 |
+
|
| 459 |
+
const subtaskRecord = subtask as Record<string, unknown>
|
| 460 |
+
const subtaskLabel =
|
| 461 |
+
(typeof subtaskRecord.display_name === "string" && subtaskRecord.display_name.trim()) ||
|
| 462 |
+
(typeof subtaskRecord.subtask_name === "string" && subtaskRecord.subtask_name.trim()) ||
|
| 463 |
+
(typeof subtaskRecord.subtask_key === "string" && humanizeToken(subtaskRecord.subtask_key)) ||
|
| 464 |
+
"Subtask"
|
| 465 |
+
|
| 466 |
+
const metrics = Array.isArray(subtaskRecord.metrics) ? subtaskRecord.metrics : []
|
| 467 |
+
return metrics.map((metric) => {
|
| 468 |
+
const metricRecord = metric as Record<string, unknown>
|
| 469 |
+
const metricName =
|
| 470 |
+
(typeof metricRecord.metric_name === "string" && metricRecord.metric_name.trim()) ||
|
| 471 |
+
(typeof metricRecord.evaluation_name === "string" && metricRecord.evaluation_name.trim()) ||
|
| 472 |
+
"Score"
|
| 473 |
+
|
| 474 |
+
return {
|
| 475 |
+
label: subtaskLabel,
|
| 476 |
+
metricName,
|
| 477 |
+
lowerIsBetter: metricRecord.lower_is_better === true,
|
| 478 |
+
modelResults: Array.isArray(metricRecord.model_results)
|
| 479 |
+
? (metricRecord.model_results as HFEvalModelResult[])
|
| 480 |
+
: [],
|
| 481 |
+
}
|
| 482 |
+
})
|
| 483 |
+
})
|
| 484 |
+
.filter((metric) => metric.modelResults.length > 0)
|
| 485 |
+
|
| 486 |
+
if (subtaskMetrics.length > 0) {
|
| 487 |
+
const modelMap = new Map<string, {
|
| 488 |
+
model_info: ModelInfo
|
| 489 |
+
model_route_id?: string
|
| 490 |
+
evaluation_timestamp?: string
|
| 491 |
+
scores: Record<string, number>
|
| 492 |
+
}>()
|
| 493 |
+
|
| 494 |
+
for (const metric of subtaskMetrics) {
|
| 495 |
+
for (const mr of metric.modelResults) {
|
| 496 |
+
const modelId = mr.model_id ?? mr.model_route_id ?? mr.model_name ?? "unknown-model"
|
| 497 |
+
const existing = modelMap.get(modelId) ?? {
|
| 498 |
+
model_info: {
|
| 499 |
+
name: mr.model_name ?? "",
|
| 500 |
+
id: mr.model_id ?? "",
|
| 501 |
+
developer: mr.developer ?? "",
|
| 502 |
+
},
|
| 503 |
+
model_route_id: mr.model_route_id,
|
| 504 |
+
evaluation_timestamp: mr.retrieved_timestamp,
|
| 505 |
+
scores: {},
|
| 506 |
+
}
|
| 507 |
+
|
| 508 |
+
if (Number.isFinite(mr.score)) {
|
| 509 |
+
existing.scores[metric.label] = mr.score
|
| 510 |
+
}
|
| 511 |
+
|
| 512 |
+
if (
|
| 513 |
+
mr.retrieved_timestamp &&
|
| 514 |
+
(!existing.evaluation_timestamp ||
|
| 515 |
+
normalizeEvalTimestamp(mr.retrieved_timestamp) > normalizeEvalTimestamp(existing.evaluation_timestamp))
|
| 516 |
+
) {
|
| 517 |
+
existing.evaluation_timestamp = mr.retrieved_timestamp
|
| 518 |
+
}
|
| 519 |
+
|
| 520 |
+
modelMap.set(modelId, existing)
|
| 521 |
+
}
|
| 522 |
+
}
|
| 523 |
+
|
| 524 |
+
const lowerIsBetter = subtaskMetrics.every((metric) => metric.lowerIsBetter)
|
| 525 |
+
const modelResults = Array.from(modelMap.values())
|
| 526 |
+
.flatMap((entry): ModelResultForBenchmark[] => {
|
| 527 |
+
const scoreEntries = Object.entries(entry.scores).filter(([, score]) => Number.isFinite(score))
|
| 528 |
+
if (scoreEntries.length === 0) {
|
| 529 |
+
return []
|
| 530 |
+
}
|
| 531 |
+
|
| 532 |
+
const averageScore = scoreEntries.reduce((sum, [, score]) => sum + score, 0) / scoreEntries.length
|
| 533 |
+
const detailScores = Object.fromEntries(scoreEntries)
|
| 534 |
+
|
| 535 |
+
const evaluationResult: EvaluationResult = {
|
| 536 |
+
evaluation_name: "Overall Score",
|
| 537 |
+
evaluation_timestamp: entry.evaluation_timestamp ?? "",
|
| 538 |
+
metric_config: {
|
| 539 |
+
evaluation_description: `Average score across ${subtaskMetrics.length} reported subtasks.`,
|
| 540 |
+
lower_is_better: lowerIsBetter,
|
| 541 |
+
score_type: "continuous",
|
| 542 |
+
min_score: 0,
|
| 543 |
+
max_score: 1,
|
| 544 |
+
},
|
| 545 |
+
score_details: {
|
| 546 |
+
score: averageScore,
|
| 547 |
+
details: detailScores,
|
| 548 |
+
},
|
| 549 |
+
}
|
| 550 |
+
|
| 551 |
+
return [{
|
| 552 |
+
model_info: entry.model_info,
|
| 553 |
+
model_route_id: entry.model_route_id,
|
| 554 |
+
score: averageScore,
|
| 555 |
+
score_details: {
|
| 556 |
+
score: averageScore,
|
| 557 |
+
details: detailScores,
|
| 558 |
+
},
|
| 559 |
+
evaluation_timestamp: entry.evaluation_timestamp ?? "",
|
| 560 |
+
source_metadata: {
|
| 561 |
+
source_type: "documentation" as const,
|
| 562 |
+
source_name: sourceName,
|
| 563 |
+
source_organization_name: sourceOrganization,
|
| 564 |
+
evaluator_relationship: "other" as const,
|
| 565 |
+
},
|
| 566 |
+
source_data: detail.source_data ?? { dataset_name: benchmarkKey },
|
| 567 |
+
result: evaluationResult,
|
| 568 |
+
}]
|
| 569 |
+
})
|
| 570 |
+
|
| 571 |
+
modelResults.sort((a, b) => (lowerIsBetter ? a.score - b.score : b.score - a.score))
|
| 572 |
+
|
| 573 |
+
const scores = modelResults.map((result) => result.score).filter(Number.isFinite)
|
| 574 |
+
const avgScore = scores.length > 0 ? scores.reduce((sum, value) => sum + value, 0) / scores.length : 0
|
| 575 |
+
|
| 576 |
+
return {
|
| 577 |
+
evaluation_name: evalName,
|
| 578 |
+
evaluation_id: detail.eval_summary_id,
|
| 579 |
+
composite_benchmark_key: benchmarkKey,
|
| 580 |
+
composite_benchmark_name: getBenchmarkDisplayName(benchmarkKey),
|
| 581 |
+
category: inferCategoryFromBenchmark(evalName),
|
| 582 |
+
metric_config: {
|
| 583 |
+
evaluation_description: `Average score across ${subtaskMetrics.length} reported subtasks.`,
|
| 584 |
+
lower_is_better: lowerIsBetter,
|
| 585 |
+
score_type: "continuous",
|
| 586 |
+
min_score: 0,
|
| 587 |
+
max_score: 1,
|
| 588 |
+
},
|
| 589 |
+
model_results: modelResults,
|
| 590 |
+
models_count: modelResults.length,
|
| 591 |
+
evaluator_names: [],
|
| 592 |
+
source_types: [],
|
| 593 |
+
latest_source_name: getBenchmarkDisplayName(benchmarkKey),
|
| 594 |
+
third_party_ratio: 0,
|
| 595 |
+
missing_generation_config_count: 0,
|
| 596 |
+
best_model: modelResults.length > 0
|
| 597 |
+
? { name: modelResults[0].model_info.name, score: modelResults[0].score }
|
| 598 |
+
: null,
|
| 599 |
+
worst_model: modelResults.length > 0
|
| 600 |
+
? { name: modelResults[modelResults.length - 1].model_info.name, score: modelResults[modelResults.length - 1].score }
|
| 601 |
+
: null,
|
| 602 |
+
avg_score: avgScore,
|
| 603 |
+
avg_score_norm: avgScore,
|
| 604 |
+
benchmark_card: detail.benchmark_card ?? undefined,
|
| 605 |
+
metric_names: subtaskMetrics.map((metric) => metric.label),
|
| 606 |
+
metrics_count: subtaskMetrics.length,
|
| 607 |
+
}
|
| 608 |
+
}
|
| 609 |
+
}
|
| 610 |
+
|
| 611 |
const primaryMetric = allMetrics[0]
|
| 612 |
if (!primaryMetric) {
|
| 613 |
return {
|
|
|
|
| 633 |
}
|
| 634 |
|
| 635 |
const modelResults: ModelResultForBenchmark[] = (primaryMetric.model_results ?? []).map((mr) => {
|
| 636 |
+
const evaluationTimestamp = mr.retrieved_timestamp ?? ""
|
|
|
|
| 637 |
const modelInfo: ModelInfo = {
|
| 638 |
name: mr.model_name ?? "",
|
| 639 |
id: mr.model_id ?? "",
|
|
|
|
| 642 |
|
| 643 |
const evaluationResult: EvaluationResult = {
|
| 644 |
evaluation_name: primaryMetric.metric_name ?? "",
|
| 645 |
+
evaluation_timestamp: evaluationTimestamp,
|
| 646 |
metric_config: {
|
| 647 |
evaluation_description: primaryMetric.metric_name ?? "",
|
| 648 |
lower_is_better: primaryMetric.lower_is_better ?? false,
|
|
|
|
| 661 |
model_route_id: mr.model_route_id,
|
| 662 |
score: mr.score ?? 0,
|
| 663 |
score_details: { score: mr.score ?? 0 },
|
| 664 |
+
evaluation_timestamp: evaluationTimestamp,
|
| 665 |
source_metadata: {
|
| 666 |
source_type: "documentation" as const,
|
| 667 |
source_name: sourceName,
|