evijit HF Staff committed on
Commit
bd8cbe8
·
1 Parent(s): 9d14977

Improve eval score displays and summary fallbacks

Browse files
app/evals/[id]/page.tsx CHANGED
@@ -245,7 +245,7 @@ function CompositeEvalView({
245
  </TabsTrigger>
246
  <TabsTrigger value="matrix" className="gap-2">
247
  <Grid3X3 className="h-4 w-4" />
248
- Matrix Leaderboard
249
  </TabsTrigger>
250
  </TabsList>
251
 
@@ -496,9 +496,9 @@ function MatrixLeaderboard({
496
  }
497
 
498
  function formatScore(score: number): string {
499
- if (score >= 1 && score <= 100) return score.toFixed(1)
500
- if (score > 0 && score < 1) return (score * 100).toFixed(1)
501
- return score.toFixed(1)
502
  }
503
 
504
  function handleSort(col: string) {
 
245
  </TabsTrigger>
246
  <TabsTrigger value="matrix" className="gap-2">
247
  <Grid3X3 className="h-4 w-4" />
248
+ Score breakdown
249
  </TabsTrigger>
250
  </TabsList>
251
 
 
496
  }
497
 
498
  function formatScore(score: number): string {
499
+ if (Math.abs(score) >= 100) return score.toFixed(1)
500
+ if (Math.abs(score) >= 10) return score.toFixed(2)
501
+ return score.toFixed(3).replace(/0+$/g, "").replace(/\.$/, "")
502
  }
503
 
504
  function handleSort(col: string) {
components/eval-detail.tsx CHANGED
@@ -199,9 +199,17 @@ function formatMetadataValue(value: unknown): string {
199
  }
200
 
201
  function formatDate(ts: string) {
 
 
 
 
202
  const numeric = Number(ts)
203
  const parsedDate = !Number.isNaN(numeric) && !ts.includes("-") ? new Date(numeric * 1000) : new Date(ts)
204
 
 
 
 
 
205
  try {
206
  return parsedDate.toLocaleDateString("en-US", {
207
  year: "numeric",
@@ -322,7 +330,7 @@ export function EvalDetail({ summary }: EvalDetailProps) {
322
  [leaderboardRows, leaderboardPage]
323
  )
324
 
325
- const avgNorm = formatPercent(summary.avg_score_norm)
326
  const scoreDirectionLabel = summary.metric_config.lower_is_better ? "Lower scores rank higher" : "Higher scores rank higher"
327
  const leaderboardTitle = isResearchView ? "Leaderboard" : "Reporting Comparison"
328
  const sourceDatasetLabel = summary.source_data?.hf_repo ?? summary.source_data?.dataset_name ?? "Backend summary"
@@ -331,8 +339,8 @@ export function EvalDetail({ summary }: EvalDetailProps) {
331
  : "Not linked"
332
  const leaderboardDescription = isResearchView
333
  ? summary.is_aggregated
334
- ? "Models ranked by average normalized score across the contributing composite benchmarks."
335
- : "Models ranked by normalized score for this benchmark."
336
  : summary.is_aggregated
337
  ? "Averaged model results across the contributing composite benchmarks, with drill-down to each component score."
338
  : "Model results with benchmark context, source dataset detail, and optional instance-data links."
@@ -396,9 +404,9 @@ export function EvalDetail({ summary }: EvalDetailProps) {
396
  </div>
397
  <div className="rounded-2xl border border-border/70 bg-muted/20 px-4 py-3">
398
  <div className="text-[11px] font-semibold uppercase tracking-[0.18em] text-muted-foreground">
399
- {isResearchView ? "Avg norm" : "Metrics"}
400
  </div>
401
- <div className="mt-1 text-2xl font-semibold">{isResearchView ? avgNorm : summary.metrics_count ?? 1}</div>
402
  </div>
403
  <div className="rounded-2xl border border-emerald-200/80 bg-emerald-50/80 px-4 py-3 dark:border-emerald-900/40 dark:bg-emerald-950/20">
404
  <div className="text-[11px] font-semibold uppercase tracking-[0.18em] text-emerald-700 dark:text-emerald-200">
@@ -409,7 +417,7 @@ export function EvalDetail({ summary }: EvalDetailProps) {
409
  </div>
410
  {isResearchView && summary.best_model && (
411
  <div className="mt-1 text-xs text-emerald-700/80 dark:text-emerald-200/80">
412
- {formatPercent(normalizeScore(summary.best_model.score))}
413
  </div>
414
  )}
415
  </div>
@@ -424,7 +432,7 @@ export function EvalDetail({ summary }: EvalDetailProps) {
424
  </div>
425
  {isResearchView && summary.worst_model && (
426
  <div className="mt-1 text-xs text-amber-700/80 dark:text-amber-200/80">
427
- {formatPercent(normalizeScore(summary.worst_model.score))}
428
  </div>
429
  )}
430
  </div>
@@ -703,21 +711,13 @@ export function EvalDetail({ summary }: EvalDetailProps) {
703
  </TableCell>
704
 
705
  <TableCell className="text-right">
706
- <div className="space-y-1">
707
- <div className="text-xl font-semibold tabular-nums">{formatPercent(normalizedScore)}</div>
708
- <div className="text-xs text-muted-foreground">
709
- Raw {formatRawScore(modelResult.score, summary.metric_config.unit)}
710
- </div>
711
- </div>
712
  </TableCell>
713
 
714
  {isResearchView ? (
715
  <TableCell className="hidden md:table-cell">
716
- <div className="flex min-w-[220px] items-center gap-3">
717
- <Progress value={normalizedScore * 100} className="h-2 flex-1" />
718
- <span className="w-12 text-right text-xs tabular-nums text-muted-foreground">
719
- {formatPercent(normalizedScore)}
720
- </span>
721
  </div>
722
  </TableCell>
723
  ) : (
@@ -846,15 +846,11 @@ export function EvalDetail({ summary }: EvalDetailProps) {
846
  subtitle={
847
  isResearchView
848
  ? "Raw metric values and uncertainty details."
849
- : "Normalized performance plus uncertainty and sample details."
850
  }
851
  >
852
  <MetaRow
853
- label={modelResult.aggregate_components ? "Average Score" : "Normalized Score"}
854
- value={formatPercent(normalizedScore)}
855
- />
856
- <MetaRow
857
- label={modelResult.aggregate_components ? "Average Raw Value" : "Raw Score"}
858
  value={formatRawScore(modelResult.score, summary.metric_config.unit)}
859
  />
860
  <MetaRow label="Score Type" value={modelResult.result.metric_config.score_type} />
@@ -888,7 +884,6 @@ export function EvalDetail({ summary }: EvalDetailProps) {
888
  <th className="px-3 py-2 text-left text-[11px] font-semibold uppercase tracking-wider text-muted-foreground">Benchmark</th>
889
  <th className="px-3 py-2 text-left text-[11px] font-semibold uppercase tracking-wider text-muted-foreground">Source</th>
890
  <th className="px-3 py-2 text-right text-[11px] font-semibold uppercase tracking-wider text-muted-foreground">Raw</th>
891
- <th className="px-3 py-2 text-right text-[11px] font-semibold uppercase tracking-wider text-muted-foreground">Score</th>
892
  </tr>
893
  </thead>
894
  <tbody>
@@ -897,7 +892,6 @@ export function EvalDetail({ summary }: EvalDetailProps) {
897
  <td className="px-3 py-2 font-medium">{component.composite_benchmark_name}</td>
898
  <td className="px-3 py-2 text-muted-foreground">{component.source_organization_name}</td>
899
  <td className="px-3 py-2 text-right tabular-nums text-muted-foreground">{formatRawScore(component.score)}</td>
900
- <td className="px-3 py-2 text-right font-semibold tabular-nums">{formatPercent(component.normalized_score)}</td>
901
  </tr>
902
  ))}
903
  </tbody>
@@ -917,18 +911,15 @@ export function EvalDetail({ summary }: EvalDetailProps) {
917
  <tr className="border-b bg-muted/30">
918
  <th className="px-3 py-2 text-left text-[11px] font-semibold uppercase tracking-wider text-muted-foreground">Subtask</th>
919
  <th className="px-3 py-2 text-right text-[11px] font-semibold uppercase tracking-wider text-muted-foreground">Raw</th>
920
- <th className="px-3 py-2 text-right text-[11px] font-semibold uppercase tracking-wider text-muted-foreground">Score</th>
921
  </tr>
922
  </thead>
923
  <tbody>
924
  {subtasks.map(([subtaskName, value]) => {
925
  const numericValue = value as number
926
- const normalizedSubtaskScore = range > 0 ? (numericValue - minScore) / range : numericValue
927
  return (
928
  <tr key={subtaskName} className="border-b last:border-0 hover:bg-muted/10">
929
  <td className="px-3 py-2 font-medium capitalize">{subtaskName.replace(/_/g, " ")}</td>
930
  <td className="px-3 py-2 text-right tabular-nums text-muted-foreground">{formatRawScore(numericValue, summary.metric_config.unit)}</td>
931
- <td className="px-3 py-2 text-right font-semibold tabular-nums">{formatPercent(normalizedSubtaskScore)}</td>
932
  </tr>
933
  )
934
  })}
 
199
  }
200
 
201
  function formatDate(ts: string) {
202
+ if (!ts || !ts.trim()) {
203
+ return "Unknown"
204
+ }
205
+
206
  const numeric = Number(ts)
207
  const parsedDate = !Number.isNaN(numeric) && !ts.includes("-") ? new Date(numeric * 1000) : new Date(ts)
208
 
209
+ if (Number.isNaN(parsedDate.getTime())) {
210
+ return "Unknown"
211
+ }
212
+
213
  try {
214
  return parsedDate.toLocaleDateString("en-US", {
215
  year: "numeric",
 
330
  [leaderboardRows, leaderboardPage]
331
  )
332
 
333
+ const avgScoreLabel = formatRawScore(summary.avg_score, summary.metric_config.unit)
334
  const scoreDirectionLabel = summary.metric_config.lower_is_better ? "Lower scores rank higher" : "Higher scores rank higher"
335
  const leaderboardTitle = isResearchView ? "Leaderboard" : "Reporting Comparison"
336
  const sourceDatasetLabel = summary.source_data?.hf_repo ?? summary.source_data?.dataset_name ?? "Backend summary"
 
339
  : "Not linked"
340
  const leaderboardDescription = isResearchView
341
  ? summary.is_aggregated
342
+ ? "Models ranked by average raw score across the contributing composite benchmarks."
343
+ : "Models ranked by raw score for this benchmark."
344
  : summary.is_aggregated
345
  ? "Averaged model results across the contributing composite benchmarks, with drill-down to each component score."
346
  : "Model results with benchmark context, source dataset detail, and optional instance-data links."
 
404
  </div>
405
  <div className="rounded-2xl border border-border/70 bg-muted/20 px-4 py-3">
406
  <div className="text-[11px] font-semibold uppercase tracking-[0.18em] text-muted-foreground">
407
+ {isResearchView ? "Avg score" : "Metrics"}
408
  </div>
409
+ <div className="mt-1 text-2xl font-semibold">{isResearchView ? avgScoreLabel : summary.metrics_count ?? 1}</div>
410
  </div>
411
  <div className="rounded-2xl border border-emerald-200/80 bg-emerald-50/80 px-4 py-3 dark:border-emerald-900/40 dark:bg-emerald-950/20">
412
  <div className="text-[11px] font-semibold uppercase tracking-[0.18em] text-emerald-700 dark:text-emerald-200">
 
417
  </div>
418
  {isResearchView && summary.best_model && (
419
  <div className="mt-1 text-xs text-emerald-700/80 dark:text-emerald-200/80">
420
+ {formatRawScore(summary.best_model.score, summary.metric_config.unit)}
421
  </div>
422
  )}
423
  </div>
 
432
  </div>
433
  {isResearchView && summary.worst_model && (
434
  <div className="mt-1 text-xs text-amber-700/80 dark:text-amber-200/80">
435
+ {formatRawScore(summary.worst_model.score, summary.metric_config.unit)}
436
  </div>
437
  )}
438
  </div>
 
711
  </TableCell>
712
 
713
  <TableCell className="text-right">
714
+ <div className="text-xl font-semibold tabular-nums">{formatRawScore(modelResult.score, summary.metric_config.unit)}</div>
 
 
 
 
 
715
  </TableCell>
716
 
717
  {isResearchView ? (
718
  <TableCell className="hidden md:table-cell">
719
+ <div className="min-w-[220px]">
720
+ <Progress value={normalizedScore * 100} className="h-2" />
 
 
 
721
  </div>
722
  </TableCell>
723
  ) : (
 
846
  subtitle={
847
  isResearchView
848
  ? "Raw metric values and uncertainty details."
849
+ : "Raw performance plus uncertainty and sample details."
850
  }
851
  >
852
  <MetaRow
853
+ label={modelResult.aggregate_components ? "Average Raw Score" : "Raw Score"}
 
 
 
 
854
  value={formatRawScore(modelResult.score, summary.metric_config.unit)}
855
  />
856
  <MetaRow label="Score Type" value={modelResult.result.metric_config.score_type} />
 
884
  <th className="px-3 py-2 text-left text-[11px] font-semibold uppercase tracking-wider text-muted-foreground">Benchmark</th>
885
  <th className="px-3 py-2 text-left text-[11px] font-semibold uppercase tracking-wider text-muted-foreground">Source</th>
886
  <th className="px-3 py-2 text-right text-[11px] font-semibold uppercase tracking-wider text-muted-foreground">Raw</th>
 
887
  </tr>
888
  </thead>
889
  <tbody>
 
892
  <td className="px-3 py-2 font-medium">{component.composite_benchmark_name}</td>
893
  <td className="px-3 py-2 text-muted-foreground">{component.source_organization_name}</td>
894
  <td className="px-3 py-2 text-right tabular-nums text-muted-foreground">{formatRawScore(component.score)}</td>
 
895
  </tr>
896
  ))}
897
  </tbody>
 
911
  <tr className="border-b bg-muted/30">
912
  <th className="px-3 py-2 text-left text-[11px] font-semibold uppercase tracking-wider text-muted-foreground">Subtask</th>
913
  <th className="px-3 py-2 text-right text-[11px] font-semibold uppercase tracking-wider text-muted-foreground">Raw</th>
 
914
  </tr>
915
  </thead>
916
  <tbody>
917
  {subtasks.map(([subtaskName, value]) => {
918
  const numericValue = value as number
 
919
  return (
920
  <tr key={subtaskName} className="border-b last:border-0 hover:bg-muted/10">
921
  <td className="px-3 py-2 font-medium capitalize">{subtaskName.replace(/_/g, " ")}</td>
922
  <td className="px-3 py-2 text-right tabular-nums text-muted-foreground">{formatRawScore(numericValue, summary.metric_config.unit)}</td>
 
923
  </tr>
924
  )
925
  })}
lib/model-data.ts CHANGED
@@ -443,11 +443,171 @@ function hfEvalEntryToListItem(entry: {
443
  // ---------------------------------------------------------------------------
444
 
445
  function hfEvalDetailToSummary(detail: HFEvalDetail): BenchmarkEvalSummary {
446
- // New structure: detail.metrics is an array of metric objects, each with model_results
447
- // Use the first metric as the primary for the summary leaderboard
448
  const evalName = detail.benchmark_leaf_name || detail.eval_summary_id || "Unknown"
449
  const benchmarkKey = detail.benchmark ?? ""
 
 
450
  const allMetrics = detail.metrics ?? []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
451
  const primaryMetric = allMetrics[0]
452
  if (!primaryMetric) {
453
  return {
@@ -473,8 +633,7 @@ function hfEvalDetailToSummary(detail: HFEvalDetail): BenchmarkEvalSummary {
473
  }
474
 
475
  const modelResults: ModelResultForBenchmark[] = (primaryMetric.model_results ?? []).map((mr) => {
476
- const sourceName = detail.source_data?.dataset_name || benchmarkKey
477
- const sourceOrganization = detail.source_data?.hf_repo || sourceName
478
  const modelInfo: ModelInfo = {
479
  name: mr.model_name ?? "",
480
  id: mr.model_id ?? "",
@@ -483,7 +642,7 @@ function hfEvalDetailToSummary(detail: HFEvalDetail): BenchmarkEvalSummary {
483
 
484
  const evaluationResult: EvaluationResult = {
485
  evaluation_name: primaryMetric.metric_name ?? "",
486
- evaluation_timestamp: "",
487
  metric_config: {
488
  evaluation_description: primaryMetric.metric_name ?? "",
489
  lower_is_better: primaryMetric.lower_is_better ?? false,
@@ -502,7 +661,7 @@ function hfEvalDetailToSummary(detail: HFEvalDetail): BenchmarkEvalSummary {
502
  model_route_id: mr.model_route_id,
503
  score: mr.score ?? 0,
504
  score_details: { score: mr.score ?? 0 },
505
- evaluation_timestamp: "",
506
  source_metadata: {
507
  source_type: "documentation" as const,
508
  source_name: sourceName,
 
443
  // ---------------------------------------------------------------------------
444
 
445
  function hfEvalDetailToSummary(detail: HFEvalDetail): BenchmarkEvalSummary {
 
 
446
  const evalName = detail.benchmark_leaf_name || detail.eval_summary_id || "Unknown"
447
  const benchmarkKey = detail.benchmark ?? ""
448
+ const sourceName = detail.source_data?.dataset_name || benchmarkKey
449
+ const sourceOrganization = detail.source_data?.hf_repo || sourceName
450
  const allMetrics = detail.metrics ?? []
451
+
452
+ if (allMetrics.length === 0) {
453
+ const subtaskMetrics = (Array.isArray(detail.subtasks) ? detail.subtasks : [])
454
+ .flatMap((subtask) => {
455
+ if (!subtask || typeof subtask !== "object") {
456
+ return []
457
+ }
458
+
459
+ const subtaskRecord = subtask as Record<string, unknown>
460
+ const subtaskLabel =
461
+ (typeof subtaskRecord.display_name === "string" && subtaskRecord.display_name.trim()) ||
462
+ (typeof subtaskRecord.subtask_name === "string" && subtaskRecord.subtask_name.trim()) ||
463
+ (typeof subtaskRecord.subtask_key === "string" && humanizeToken(subtaskRecord.subtask_key)) ||
464
+ "Subtask"
465
+
466
+ const metrics = Array.isArray(subtaskRecord.metrics) ? subtaskRecord.metrics : []
467
+ return metrics.map((metric) => {
468
+ const metricRecord = metric as Record<string, unknown>
469
+ const metricName =
470
+ (typeof metricRecord.metric_name === "string" && metricRecord.metric_name.trim()) ||
471
+ (typeof metricRecord.evaluation_name === "string" && metricRecord.evaluation_name.trim()) ||
472
+ "Score"
473
+
474
+ return {
475
+ label: subtaskLabel,
476
+ metricName,
477
+ lowerIsBetter: metricRecord.lower_is_better === true,
478
+ modelResults: Array.isArray(metricRecord.model_results)
479
+ ? (metricRecord.model_results as HFEvalModelResult[])
480
+ : [],
481
+ }
482
+ })
483
+ })
484
+ .filter((metric) => metric.modelResults.length > 0)
485
+
486
+ if (subtaskMetrics.length > 0) {
487
+ const modelMap = new Map<string, {
488
+ model_info: ModelInfo
489
+ model_route_id?: string
490
+ evaluation_timestamp?: string
491
+ scores: Record<string, number>
492
+ }>()
493
+
494
+ for (const metric of subtaskMetrics) {
495
+ for (const mr of metric.modelResults) {
496
+ const modelId = mr.model_id ?? mr.model_route_id ?? mr.model_name ?? "unknown-model"
497
+ const existing = modelMap.get(modelId) ?? {
498
+ model_info: {
499
+ name: mr.model_name ?? "",
500
+ id: mr.model_id ?? "",
501
+ developer: mr.developer ?? "",
502
+ },
503
+ model_route_id: mr.model_route_id,
504
+ evaluation_timestamp: mr.retrieved_timestamp,
505
+ scores: {},
506
+ }
507
+
508
+ if (Number.isFinite(mr.score)) {
509
+ existing.scores[metric.label] = mr.score
510
+ }
511
+
512
+ if (
513
+ mr.retrieved_timestamp &&
514
+ (!existing.evaluation_timestamp ||
515
+ normalizeEvalTimestamp(mr.retrieved_timestamp) > normalizeEvalTimestamp(existing.evaluation_timestamp))
516
+ ) {
517
+ existing.evaluation_timestamp = mr.retrieved_timestamp
518
+ }
519
+
520
+ modelMap.set(modelId, existing)
521
+ }
522
+ }
523
+
524
+ const lowerIsBetter = subtaskMetrics.every((metric) => metric.lowerIsBetter)
525
+ const modelResults = Array.from(modelMap.values())
526
+ .flatMap((entry): ModelResultForBenchmark[] => {
527
+ const scoreEntries = Object.entries(entry.scores).filter(([, score]) => Number.isFinite(score))
528
+ if (scoreEntries.length === 0) {
529
+ return []
530
+ }
531
+
532
+ const averageScore = scoreEntries.reduce((sum, [, score]) => sum + score, 0) / scoreEntries.length
533
+ const detailScores = Object.fromEntries(scoreEntries)
534
+
535
+ const evaluationResult: EvaluationResult = {
536
+ evaluation_name: "Overall Score",
537
+ evaluation_timestamp: entry.evaluation_timestamp ?? "",
538
+ metric_config: {
539
+ evaluation_description: `Average score across ${subtaskMetrics.length} reported subtasks.`,
540
+ lower_is_better: lowerIsBetter,
541
+ score_type: "continuous",
542
+ min_score: 0,
543
+ max_score: 1,
544
+ },
545
+ score_details: {
546
+ score: averageScore,
547
+ details: detailScores,
548
+ },
549
+ }
550
+
551
+ return [{
552
+ model_info: entry.model_info,
553
+ model_route_id: entry.model_route_id,
554
+ score: averageScore,
555
+ score_details: {
556
+ score: averageScore,
557
+ details: detailScores,
558
+ },
559
+ evaluation_timestamp: entry.evaluation_timestamp ?? "",
560
+ source_metadata: {
561
+ source_type: "documentation" as const,
562
+ source_name: sourceName,
563
+ source_organization_name: sourceOrganization,
564
+ evaluator_relationship: "other" as const,
565
+ },
566
+ source_data: detail.source_data ?? { dataset_name: benchmarkKey },
567
+ result: evaluationResult,
568
+ }]
569
+ })
570
+
571
+ modelResults.sort((a, b) => (lowerIsBetter ? a.score - b.score : b.score - a.score))
572
+
573
+ const scores = modelResults.map((result) => result.score).filter(Number.isFinite)
574
+ const avgScore = scores.length > 0 ? scores.reduce((sum, value) => sum + value, 0) / scores.length : 0
575
+
576
+ return {
577
+ evaluation_name: evalName,
578
+ evaluation_id: detail.eval_summary_id,
579
+ composite_benchmark_key: benchmarkKey,
580
+ composite_benchmark_name: getBenchmarkDisplayName(benchmarkKey),
581
+ category: inferCategoryFromBenchmark(evalName),
582
+ metric_config: {
583
+ evaluation_description: `Average score across ${subtaskMetrics.length} reported subtasks.`,
584
+ lower_is_better: lowerIsBetter,
585
+ score_type: "continuous",
586
+ min_score: 0,
587
+ max_score: 1,
588
+ },
589
+ model_results: modelResults,
590
+ models_count: modelResults.length,
591
+ evaluator_names: [],
592
+ source_types: [],
593
+ latest_source_name: getBenchmarkDisplayName(benchmarkKey),
594
+ third_party_ratio: 0,
595
+ missing_generation_config_count: 0,
596
+ best_model: modelResults.length > 0
597
+ ? { name: modelResults[0].model_info.name, score: modelResults[0].score }
598
+ : null,
599
+ worst_model: modelResults.length > 0
600
+ ? { name: modelResults[modelResults.length - 1].model_info.name, score: modelResults[modelResults.length - 1].score }
601
+ : null,
602
+ avg_score: avgScore,
603
+ avg_score_norm: avgScore,
604
+ benchmark_card: detail.benchmark_card ?? undefined,
605
+ metric_names: subtaskMetrics.map((metric) => metric.label),
606
+ metrics_count: subtaskMetrics.length,
607
+ }
608
+ }
609
+ }
610
+
611
  const primaryMetric = allMetrics[0]
612
  if (!primaryMetric) {
613
  return {
 
633
  }
634
 
635
  const modelResults: ModelResultForBenchmark[] = (primaryMetric.model_results ?? []).map((mr) => {
636
+ const evaluationTimestamp = mr.retrieved_timestamp ?? ""
 
637
  const modelInfo: ModelInfo = {
638
  name: mr.model_name ?? "",
639
  id: mr.model_id ?? "",
 
642
 
643
  const evaluationResult: EvaluationResult = {
644
  evaluation_name: primaryMetric.metric_name ?? "",
645
+ evaluation_timestamp: evaluationTimestamp,
646
  metric_config: {
647
  evaluation_description: primaryMetric.metric_name ?? "",
648
  lower_is_better: primaryMetric.lower_is_better ?? false,
 
661
  model_route_id: mr.model_route_id,
662
  score: mr.score ?? 0,
663
  score_details: { score: mr.score ?? 0 },
664
+ evaluation_timestamp: evaluationTimestamp,
665
  source_metadata: {
666
  source_type: "documentation" as const,
667
  source_name: sourceName,