Spaces:

evaleval
/

general-eval-card

Running

App Files Files Community

evijit HF Staff committed on Apr 13

Commit

bd8cbe8

1 Parent(s): 9d14977

Improve eval score displays and summary fallbacks

Browse files

Files changed (3) hide show

app/evals/[id]/page.tsx +4 -4
components/eval-detail.tsx +20 -29
lib/model-data.ts +165 -6

app/evals/[id]/page.tsx CHANGED Viewed

@@ -245,7 +245,7 @@ function CompositeEvalView({
           </TabsTrigger>
           <TabsTrigger value="matrix" className="gap-2">
             <Grid3X3 className="h-4 w-4" />
-            Matrix Leaderboard
           </TabsTrigger>
         </TabsList>
@@ -496,9 +496,9 @@ function MatrixLeaderboard({
   }
   function formatScore(score: number): string {
-    if (score >= 1 && score <= 100) return score.toFixed(1)
-    if (score > 0 && score < 1) return (score * 100).toFixed(1)
-    return score.toFixed(1)
   }
   function handleSort(col: string) {

           </TabsTrigger>
           <TabsTrigger value="matrix" className="gap-2">
             <Grid3X3 className="h-4 w-4" />
+            Score breakdown
           </TabsTrigger>
         </TabsList>
   }
   /**
    * Formats a raw score for display with magnitude-dependent precision.
    *
    * - |score| >= 100: one decimal place.
    * - |score| >= 10: two decimal places.
    * - otherwise: up to three decimal places, with trailing zeros (and a
    *   dangling decimal point) stripped.
    *
    * @param score Raw numeric score.
    * @returns The display string for the score.
    */
   function formatScore(score: number): string {
     const magnitude = Math.abs(score)
     if (magnitude >= 100) return score.toFixed(1)
     if (magnitude >= 10) return score.toFixed(2)
     // The `$` anchor already limits this to a single trailing run, so no `g` flag is needed.
     const trimmed = score.toFixed(3).replace(/0+$/, "").replace(/\.$/, "")
     // Tiny negative values round to "-0.000", which trims to "-0"; show a plain zero instead.
     return trimmed === "-0" ? "0" : trimmed
   }
   function handleSort(col: string) {

components/eval-detail.tsx CHANGED Viewed

@@ -199,9 +199,17 @@ function formatMetadataValue(value: unknown): string {
 }
 function formatDate(ts: string) {
   const numeric = Number(ts)
   const parsedDate = !Number.isNaN(numeric) && !ts.includes("-") ? new Date(numeric * 1000) : new Date(ts)
   try {
     return parsedDate.toLocaleDateString("en-US", {
       year: "numeric",
@@ -322,7 +330,7 @@ export function EvalDetail({ summary }: EvalDetailProps) {
     [leaderboardRows, leaderboardPage]
   )
-  const avgNorm = formatPercent(summary.avg_score_norm)
   const scoreDirectionLabel = summary.metric_config.lower_is_better ? "Lower scores rank higher" : "Higher scores rank higher"
   const leaderboardTitle = isResearchView ? "Leaderboard" : "Reporting Comparison"
   const sourceDatasetLabel = summary.source_data?.hf_repo ?? summary.source_data?.dataset_name ?? "Backend summary"
@@ -331,8 +339,8 @@ export function EvalDetail({ summary }: EvalDetailProps) {
     : "Not linked"
   const leaderboardDescription = isResearchView
     ? summary.is_aggregated
-      ? "Models ranked by average normalized score across the contributing composite benchmarks."
-      : "Models ranked by normalized score for this benchmark."
     : summary.is_aggregated
       ? "Averaged model results across the contributing composite benchmarks, with drill-down to each component score."
       : "Model results with benchmark context, source dataset detail, and optional instance-data links."
@@ -396,9 +404,9 @@ export function EvalDetail({ summary }: EvalDetailProps) {
               </div>
               <div className="rounded-2xl border border-border/70 bg-muted/20 px-4 py-3">
                 <div className="text-[11px] font-semibold uppercase tracking-[0.18em] text-muted-foreground">
-                  {isResearchView ? "Avg norm" : "Metrics"}
                 </div>
-                <div className="mt-1 text-2xl font-semibold">{isResearchView ? avgNorm : summary.metrics_count ?? 1}</div>
               </div>
               <div className="rounded-2xl border border-emerald-200/80 bg-emerald-50/80 px-4 py-3 dark:border-emerald-900/40 dark:bg-emerald-950/20">
                 <div className="text-[11px] font-semibold uppercase tracking-[0.18em] text-emerald-700 dark:text-emerald-200">
@@ -409,7 +417,7 @@ export function EvalDetail({ summary }: EvalDetailProps) {
                 </div>
                 {isResearchView && summary.best_model && (
                   <div className="mt-1 text-xs text-emerald-700/80 dark:text-emerald-200/80">
-                    {formatPercent(normalizeScore(summary.best_model.score))}
                   </div>
                 )}
               </div>
@@ -424,7 +432,7 @@ export function EvalDetail({ summary }: EvalDetailProps) {
                 </div>
                 {isResearchView && summary.worst_model && (
                   <div className="mt-1 text-xs text-amber-700/80 dark:text-amber-200/80">
-                    {formatPercent(normalizeScore(summary.worst_model.score))}
                   </div>
                 )}
               </div>
@@ -703,21 +711,13 @@ export function EvalDetail({ summary }: EvalDetailProps) {
                       </TableCell>
                       <TableCell className="text-right">
-                        <div className="space-y-1">
-                          <div className="text-xl font-semibold tabular-nums">{formatPercent(normalizedScore)}</div>
-                          <div className="text-xs text-muted-foreground">
-                            Raw {formatRawScore(modelResult.score, summary.metric_config.unit)}
-                          </div>
-                        </div>
                       </TableCell>
                       {isResearchView ? (
                         <TableCell className="hidden md:table-cell">
-                          <div className="flex min-w-[220px] items-center gap-3">
-                            <Progress value={normalizedScore * 100} className="h-2 flex-1" />
-                            <span className="w-12 text-right text-xs tabular-nums text-muted-foreground">
-                              {formatPercent(normalizedScore)}
-                            </span>
                           </div>
                         </TableCell>
                       ) : (
@@ -846,15 +846,11 @@ export function EvalDetail({ summary }: EvalDetailProps) {
                                 subtitle={
                                   isResearchView
                                     ? "Raw metric values and uncertainty details."
-                                    : "Normalized performance plus uncertainty and sample details."
                                 }
                               >
                                 <MetaRow
-                                  label={modelResult.aggregate_components ? "Average Score" : "Normalized Score"}
-                                  value={formatPercent(normalizedScore)}
-                                />
-                                <MetaRow
-                                  label={modelResult.aggregate_components ? "Average Raw Value" : "Raw Score"}
                                   value={formatRawScore(modelResult.score, summary.metric_config.unit)}
                                 />
                                 <MetaRow label="Score Type" value={modelResult.result.metric_config.score_type} />
@@ -888,7 +884,6 @@ export function EvalDetail({ summary }: EvalDetailProps) {
                                         <th className="px-3 py-2 text-left text-[11px] font-semibold uppercase tracking-wider text-muted-foreground">Benchmark</th>
                                         <th className="px-3 py-2 text-left text-[11px] font-semibold uppercase tracking-wider text-muted-foreground">Source</th>
                                         <th className="px-3 py-2 text-right text-[11px] font-semibold uppercase tracking-wider text-muted-foreground">Raw</th>
-                                        <th className="px-3 py-2 text-right text-[11px] font-semibold uppercase tracking-wider text-muted-foreground">Score</th>
                                       </tr>
                                     </thead>
                                     <tbody>
@@ -897,7 +892,6 @@ export function EvalDetail({ summary }: EvalDetailProps) {
                                           <td className="px-3 py-2 font-medium">{component.composite_benchmark_name}</td>
                                           <td className="px-3 py-2 text-muted-foreground">{component.source_organization_name}</td>
                                           <td className="px-3 py-2 text-right tabular-nums text-muted-foreground">{formatRawScore(component.score)}</td>
-                                          <td className="px-3 py-2 text-right font-semibold tabular-nums">{formatPercent(component.normalized_score)}</td>
                                         </tr>
                                       ))}
                                     </tbody>
@@ -917,18 +911,15 @@ export function EvalDetail({ summary }: EvalDetailProps) {
                                       <tr className="border-b bg-muted/30">
                                         <th className="px-3 py-2 text-left text-[11px] font-semibold uppercase tracking-wider text-muted-foreground">Subtask</th>
                                         <th className="px-3 py-2 text-right text-[11px] font-semibold uppercase tracking-wider text-muted-foreground">Raw</th>
-                                        <th className="px-3 py-2 text-right text-[11px] font-semibold uppercase tracking-wider text-muted-foreground">Score</th>
                                       </tr>
                                     </thead>
                                     <tbody>
                                       {subtasks.map(([subtaskName, value]) => {
                                         const numericValue = value as number
-                                        const normalizedSubtaskScore = range > 0 ? (numericValue - minScore) / range : numericValue
                                         return (
                                           <tr key={subtaskName} className="border-b last:border-0 hover:bg-muted/10">
                                             <td className="px-3 py-2 font-medium capitalize">{subtaskName.replace(/_/g, " ")}</td>
                                             <td className="px-3 py-2 text-right tabular-nums text-muted-foreground">{formatRawScore(numericValue, summary.metric_config.unit)}</td>
-                                            <td className="px-3 py-2 text-right font-semibold tabular-nums">{formatPercent(normalizedSubtaskScore)}</td>
                                           </tr>
                                         )
                                       })}

 }
 function formatDate(ts: string) {
+  if (!ts || !ts.trim()) {
+    return "Unknown"
+  }
   const numeric = Number(ts)
   const parsedDate = !Number.isNaN(numeric) && !ts.includes("-") ? new Date(numeric * 1000) : new Date(ts)
+  if (Number.isNaN(parsedDate.getTime())) {
+    return "Unknown"
+  }
   try {
     return parsedDate.toLocaleDateString("en-US", {
       year: "numeric",
     [leaderboardRows, leaderboardPage]
   )
+  const avgScoreLabel = formatRawScore(summary.avg_score, summary.metric_config.unit)
   const scoreDirectionLabel = summary.metric_config.lower_is_better ? "Lower scores rank higher" : "Higher scores rank higher"
   const leaderboardTitle = isResearchView ? "Leaderboard" : "Reporting Comparison"
   const sourceDatasetLabel = summary.source_data?.hf_repo ?? summary.source_data?.dataset_name ?? "Backend summary"
     : "Not linked"
   const leaderboardDescription = isResearchView
     ? summary.is_aggregated
+      ? "Models ranked by average raw score across the contributing composite benchmarks."
+      : "Models ranked by raw score for this benchmark."
     : summary.is_aggregated
       ? "Averaged model results across the contributing composite benchmarks, with drill-down to each component score."
       : "Model results with benchmark context, source dataset detail, and optional instance-data links."
               </div>
               <div className="rounded-2xl border border-border/70 bg-muted/20 px-4 py-3">
                 <div className="text-[11px] font-semibold uppercase tracking-[0.18em] text-muted-foreground">
+                  {isResearchView ? "Avg score" : "Metrics"}
                 </div>
+                <div className="mt-1 text-2xl font-semibold">{isResearchView ? avgScoreLabel : summary.metrics_count ?? 1}</div>
               </div>
               <div className="rounded-2xl border border-emerald-200/80 bg-emerald-50/80 px-4 py-3 dark:border-emerald-900/40 dark:bg-emerald-950/20">
                 <div className="text-[11px] font-semibold uppercase tracking-[0.18em] text-emerald-700 dark:text-emerald-200">
                 </div>
                 {isResearchView && summary.best_model && (
                   <div className="mt-1 text-xs text-emerald-700/80 dark:text-emerald-200/80">
+                    {formatRawScore(summary.best_model.score, summary.metric_config.unit)}
                   </div>
                 )}
               </div>
                 </div>
                 {isResearchView && summary.worst_model && (
                   <div className="mt-1 text-xs text-amber-700/80 dark:text-amber-200/80">
+                    {formatRawScore(summary.worst_model.score, summary.metric_config.unit)}
                   </div>
                 )}
               </div>
                       </TableCell>
                       <TableCell className="text-right">
+                        <div className="text-xl font-semibold tabular-nums">{formatRawScore(modelResult.score, summary.metric_config.unit)}</div>
                       </TableCell>
                       {isResearchView ? (
                         <TableCell className="hidden md:table-cell">
+                          <div className="min-w-[220px]">
+                            <Progress value={normalizedScore * 100} className="h-2" />
                           </div>
                         </TableCell>
                       ) : (
                                 subtitle={
                                   isResearchView
                                     ? "Raw metric values and uncertainty details."
+                                    : "Raw performance plus uncertainty and sample details."
                                 }
                               >
                                 <MetaRow
+                                  label={modelResult.aggregate_components ? "Average Raw Score" : "Raw Score"}
                                   value={formatRawScore(modelResult.score, summary.metric_config.unit)}
                                 />
                                 <MetaRow label="Score Type" value={modelResult.result.metric_config.score_type} />
                                         <th className="px-3 py-2 text-left text-[11px] font-semibold uppercase tracking-wider text-muted-foreground">Benchmark</th>
                                         <th className="px-3 py-2 text-left text-[11px] font-semibold uppercase tracking-wider text-muted-foreground">Source</th>
                                         <th className="px-3 py-2 text-right text-[11px] font-semibold uppercase tracking-wider text-muted-foreground">Raw</th>
                                       </tr>
                                     </thead>
                                     <tbody>
                                           <td className="px-3 py-2 font-medium">{component.composite_benchmark_name}</td>
                                           <td className="px-3 py-2 text-muted-foreground">{component.source_organization_name}</td>
                                           <td className="px-3 py-2 text-right tabular-nums text-muted-foreground">{formatRawScore(component.score)}</td>
                                         </tr>
                                       ))}
                                     </tbody>
                                       <tr className="border-b bg-muted/30">
                                         <th className="px-3 py-2 text-left text-[11px] font-semibold uppercase tracking-wider text-muted-foreground">Subtask</th>
                                         <th className="px-3 py-2 text-right text-[11px] font-semibold uppercase tracking-wider text-muted-foreground">Raw</th>
                                       </tr>
                                     </thead>
                                     <tbody>
                                       {subtasks.map(([subtaskName, value]) => {
                                         const numericValue = value as number
                                         return (
                                           <tr key={subtaskName} className="border-b last:border-0 hover:bg-muted/10">
                                             <td className="px-3 py-2 font-medium capitalize">{subtaskName.replace(/_/g, " ")}</td>
                                             <td className="px-3 py-2 text-right tabular-nums text-muted-foreground">{formatRawScore(numericValue, summary.metric_config.unit)}</td>
                                           </tr>
                                         )
                                       })}

lib/model-data.ts CHANGED Viewed

@@ -443,11 +443,171 @@ function hfEvalEntryToListItem(entry: {
 // ---------------------------------------------------------------------------
 function hfEvalDetailToSummary(detail: HFEvalDetail): BenchmarkEvalSummary {
-  // New structure: detail.metrics is an array of metric objects, each with model_results
-  // Use the first metric as the primary for the summary leaderboard
   const evalName = detail.benchmark_leaf_name || detail.eval_summary_id || "Unknown"
   const benchmarkKey = detail.benchmark ?? ""
   const allMetrics = detail.metrics ?? []
   const primaryMetric = allMetrics[0]
   if (!primaryMetric) {
     return {
@@ -473,8 +633,7 @@ function hfEvalDetailToSummary(detail: HFEvalDetail): BenchmarkEvalSummary {
   }
   const modelResults: ModelResultForBenchmark[] = (primaryMetric.model_results ?? []).map((mr) => {
-    const sourceName = detail.source_data?.dataset_name || benchmarkKey
-    const sourceOrganization = detail.source_data?.hf_repo || sourceName
     const modelInfo: ModelInfo = {
       name: mr.model_name ?? "",
       id: mr.model_id ?? "",
@@ -483,7 +642,7 @@ function hfEvalDetailToSummary(detail: HFEvalDetail): BenchmarkEvalSummary {
     const evaluationResult: EvaluationResult = {
       evaluation_name: primaryMetric.metric_name ?? "",
-      evaluation_timestamp: "",
       metric_config: {
         evaluation_description: primaryMetric.metric_name ?? "",
         lower_is_better: primaryMetric.lower_is_better ?? false,
@@ -502,7 +661,7 @@ function hfEvalDetailToSummary(detail: HFEvalDetail): BenchmarkEvalSummary {
       model_route_id: mr.model_route_id,
       score: mr.score ?? 0,
       score_details: { score: mr.score ?? 0 },
-      evaluation_timestamp: "",
       source_metadata: {
         source_type: "documentation" as const,
         source_name: sourceName,

 // ---------------------------------------------------------------------------
 function hfEvalDetailToSummary(detail: HFEvalDetail): BenchmarkEvalSummary {
   const evalName = detail.benchmark_leaf_name || detail.eval_summary_id || "Unknown"
   const benchmarkKey = detail.benchmark ?? ""
+  const sourceName = detail.source_data?.dataset_name || benchmarkKey
+  const sourceOrganization = detail.source_data?.hf_repo || sourceName
   const allMetrics = detail.metrics ?? []
+  if (allMetrics.length === 0) {
+    const subtaskMetrics = (Array.isArray(detail.subtasks) ? detail.subtasks : [])
+      .flatMap((subtask) => {
+        if (!subtask || typeof subtask !== "object") {
+          return []
+        }
+        const subtaskRecord = subtask as Record<string, unknown>
+        const subtaskLabel =
+          (typeof subtaskRecord.display_name === "string" && subtaskRecord.display_name.trim()) ||
+          (typeof subtaskRecord.subtask_name === "string" && subtaskRecord.subtask_name.trim()) ||
+          (typeof subtaskRecord.subtask_key === "string" && humanizeToken(subtaskRecord.subtask_key)) ||
+          "Subtask"
+        const metrics = Array.isArray(subtaskRecord.metrics) ? subtaskRecord.metrics : []
+        return metrics.map((metric) => {
+          const metricRecord = metric as Record<string, unknown>
+          const metricName =
+            (typeof metricRecord.metric_name === "string" && metricRecord.metric_name.trim()) ||
+            (typeof metricRecord.evaluation_name === "string" && metricRecord.evaluation_name.trim()) ||
+            "Score"
+          return {
+            label: subtaskLabel,
+            metricName,
+            lowerIsBetter: metricRecord.lower_is_better === true,
+            modelResults: Array.isArray(metricRecord.model_results)
+              ? (metricRecord.model_results as HFEvalModelResult[])
+              : [],
+          }
+        })
+      })
+      .filter((metric) => metric.modelResults.length > 0)
+    if (subtaskMetrics.length > 0) {
+      const modelMap = new Map<string, {
+        model_info: ModelInfo
+        model_route_id?: string
+        evaluation_timestamp?: string
+        scores: Record<string, number>
+      }>()
+      for (const metric of subtaskMetrics) {
+        for (const mr of metric.modelResults) {
+          const modelId = mr.model_id ?? mr.model_route_id ?? mr.model_name ?? "unknown-model"
+          const existing = modelMap.get(modelId) ?? {
+            model_info: {
+              name: mr.model_name ?? "",
+              id: mr.model_id ?? "",
+              developer: mr.developer ?? "",
+            },
+            model_route_id: mr.model_route_id,
+            evaluation_timestamp: mr.retrieved_timestamp,
+            scores: {},
+          }
+          if (Number.isFinite(mr.score)) {
+            existing.scores[metric.label] = mr.score
+          }
+          if (
+            mr.retrieved_timestamp &&
+            (!existing.evaluation_timestamp ||
+              normalizeEvalTimestamp(mr.retrieved_timestamp) > normalizeEvalTimestamp(existing.evaluation_timestamp))
+          ) {
+            existing.evaluation_timestamp = mr.retrieved_timestamp
+          }
+          modelMap.set(modelId, existing)
+        }
+      }
+      const lowerIsBetter = subtaskMetrics.every((metric) => metric.lowerIsBetter)
+      const modelResults = Array.from(modelMap.values())
+        .flatMap((entry): ModelResultForBenchmark[] => {
+          const scoreEntries = Object.entries(entry.scores).filter(([, score]) => Number.isFinite(score))
+          if (scoreEntries.length === 0) {
+            return []
+          }
+          const averageScore = scoreEntries.reduce((sum, [, score]) => sum + score, 0) / scoreEntries.length
+          const detailScores = Object.fromEntries(scoreEntries)
+          const evaluationResult: EvaluationResult = {
+            evaluation_name: "Overall Score",
+            evaluation_timestamp: entry.evaluation_timestamp ?? "",
+            metric_config: {
+              evaluation_description: `Average score across ${subtaskMetrics.length} reported subtasks.`,
+              lower_is_better: lowerIsBetter,
+              score_type: "continuous",
+              min_score: 0,
+              max_score: 1,
+            },
+            score_details: {
+              score: averageScore,
+              details: detailScores,
+            },
+          }
+          return [{
+            model_info: entry.model_info,
+            model_route_id: entry.model_route_id,
+            score: averageScore,
+            score_details: {
+              score: averageScore,
+              details: detailScores,
+            },
+            evaluation_timestamp: entry.evaluation_timestamp ?? "",
+            source_metadata: {
+              source_type: "documentation" as const,
+              source_name: sourceName,
+              source_organization_name: sourceOrganization,
+              evaluator_relationship: "other" as const,
+            },
+            source_data: detail.source_data ?? { dataset_name: benchmarkKey },
+            result: evaluationResult,
+          }]
+        })
+      modelResults.sort((a, b) => (lowerIsBetter ? a.score - b.score : b.score - a.score))
+      const scores = modelResults.map((result) => result.score).filter(Number.isFinite)
+      const avgScore = scores.length > 0 ? scores.reduce((sum, value) => sum + value, 0) / scores.length : 0
+      return {
+        evaluation_name: evalName,
+        evaluation_id: detail.eval_summary_id,
+        composite_benchmark_key: benchmarkKey,
+        composite_benchmark_name: getBenchmarkDisplayName(benchmarkKey),
+        category: inferCategoryFromBenchmark(evalName),
+        metric_config: {
+          evaluation_description: `Average score across ${subtaskMetrics.length} reported subtasks.`,
+          lower_is_better: lowerIsBetter,
+          score_type: "continuous",
+          min_score: 0,
+          max_score: 1,
+        },
+        model_results: modelResults,
+        models_count: modelResults.length,
+        evaluator_names: [],
+        source_types: [],
+        latest_source_name: getBenchmarkDisplayName(benchmarkKey),
+        third_party_ratio: 0,
+        missing_generation_config_count: 0,
+        best_model: modelResults.length > 0
+          ? { name: modelResults[0].model_info.name, score: modelResults[0].score }
+          : null,
+        worst_model: modelResults.length > 0
+          ? { name: modelResults[modelResults.length - 1].model_info.name, score: modelResults[modelResults.length - 1].score }
+          : null,
+        avg_score: avgScore,
+        avg_score_norm: avgScore,
+        benchmark_card: detail.benchmark_card ?? undefined,
+        metric_names: subtaskMetrics.map((metric) => metric.label),
+        metrics_count: subtaskMetrics.length,
+      }
+    }
+  }
   const primaryMetric = allMetrics[0]
   if (!primaryMetric) {
     return {
   }
   const modelResults: ModelResultForBenchmark[] = (primaryMetric.model_results ?? []).map((mr) => {
+    const evaluationTimestamp = mr.retrieved_timestamp ?? ""
     const modelInfo: ModelInfo = {
       name: mr.model_name ?? "",
       id: mr.model_id ?? "",
     const evaluationResult: EvaluationResult = {
       evaluation_name: primaryMetric.metric_name ?? "",
+      evaluation_timestamp: evaluationTimestamp,
       metric_config: {
         evaluation_description: primaryMetric.metric_name ?? "",
         lower_is_better: primaryMetric.lower_is_better ?? false,
       model_route_id: mr.model_route_id,
       score: mr.score ?? 0,
       score_details: { score: mr.score ?? 0 },
+      evaluation_timestamp: evaluationTimestamp,
       source_metadata: {
         source_type: "documentation" as const,
         source_name: sourceName,