Spaces:

evaleval
/

general-eval-card

Running

File size: 58,505 Bytes

import "server-only"

import type {
  BenchmarkCard,
  BenchmarkEvaluation,
  CategoryType,
  EvaluationCardData,
  EvaluationResult,
  ModelInfo,
  SourceData,
} from "@/lib/benchmark-schema"
import type { BackendManifest, BackendManifestStatus, EvalHierarchy } from "@/lib/backend-artifacts"
import { inferCategoryFromBenchmark } from "@/lib/benchmark-schema"
import {
  type BenchmarkEvalListItem,
  type BenchmarkEvalSummary,
  type ModelResultForBenchmark,
  createEvaluationCard,
  createModelFamilySummary,
  groupEvaluationsByBenchmark,
  groupEvaluationsByModelFamily,
  groupEvaluationsByModel,
  toBenchmarkEvalListItem,
} from "@/lib/eval-processing"
import { getCanonicalModelIdentity, getModelFamilyRouteId } from "@/lib/model-family"
import { normalizeDeveloperName } from "@/lib/known-developers"
import { getBenchmarkCard, normalizeBenchmarkKey } from "@/lib/benchmark-metadata"
import {
  type HFEvalDetail,
  type HFEvalListEntry,
  type HFEvalModelResult,
  type HFModelCardEntry,
  type HFModelDetail,
  fetchBackendManifest,
  fetchBackendManifestStatus,
  fetchEvalHierarchy,
  fetchModelCardsList,
  fetchModelCardsListLite,
  fetchEvalList as fetchHFEvalList,
  fetchEvalListLite as fetchHFEvalListLite,
  fetchDevelopersList,
  fetchDeveloperDetail as fetchHFDeveloperDetail,
  fetchModelDetail as fetchHFModelDetail,
  fetchEvalDetail as fetchHFEvalDetail,
  flattenModelEvaluations,
  mapHFCategories,
} from "@/lib/hf-data"

// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------

/**
 * Read-boundary guard for the pipeline contract that every model_result row
 * carries `source_metadata`.
 *
 * UI components dereference source_metadata.evaluator_relationship etc.
 * without optional chaining (~30 sites in components/benchmark-detail.tsx),
 * so a missing value would otherwise surface as a TypeError mid-render.
 * Throwing here keeps the offending row identifiable.
 *
 * @param result  Row to validate; model_id / evaluation_id are only used for the error text.
 * @param context Free-form description of where the row came from.
 * @throws Error when `source_metadata` is falsy.
 */
function assertSourceMetadata(
  result: { source_metadata?: unknown; evaluation_id?: string; model_id?: string },
  context: string
): asserts result is typeof result & { source_metadata: NonNullable<typeof result.source_metadata> } {
  if (result.source_metadata) {
    return
  }

  const modelLabel = result.model_id ?? "?"
  const evalLabel = result.evaluation_id ?? "?"
  throw new Error(
    "Pipeline contract broken: missing source_metadata on model_result " +
      `(${context} model=${modelLabel} eval=${evalLabel})`
  )
}

/**
 * Lowercases `value`, collapses every run of characters outside [a-z0-9]
 * into a single underscore, and drops one leading / trailing underscore.
 */
function slugifyEvalId(value: string) {
  const lowered = value.toLowerCase()
  const underscored = lowered.replace(/[^a-z0-9]+/g, "_")
  return underscored.replace(/^_|_$/g, "")
}

/** Builds the id used for aggregate evals: "aggregate__" followed by the slugified value. */
function getAggregateEvalId(value: string) {
  const slug = slugifyEvalId(value)
  return "aggregate__" + slug
}

/**
 * Converts a timestamp string to epoch milliseconds.
 *
 * A value that contains no "-" and parses as a number is treated as epoch
 * seconds and multiplied by 1000; anything else goes through `Date` parsing
 * (which yields NaN for unparseable input).
 */
function normalizeEvalTimestamp(value: string) {
  const asNumber = Number(value)
  const isPlainNumber = !value.includes("-") && !Number.isNaN(asNumber)
  if (isPlainNumber) {
    return asNumber * 1000
  }
  return new Date(value).getTime()
}

/**
 * Rescales `score` into the summary's [min_score, max_score] range
 * (defaults 0 and 1). When the range is not positive the score is returned
 * unchanged.
 */
function normalizeSummaryScore(summary: BenchmarkEvalSummary, score: number) {
  const config = summary.metric_config
  const upper = config.max_score ?? 1
  const lower = config.min_score ?? 0
  const span = upper - lower
  if (span > 0) {
    return (score - lower) / span
  }
  return score
}

/**
 * Turns a snake/kebab token into a space-separated label, upper-casing the
 * first character of each non-empty part (the rest of each part is kept as-is).
 */
function humanizeToken(token: string) {
  const words: string[] = []
  for (const piece of token.split(/[_-]+/g)) {
    if (!piece) {
      continue
    }
    words.push(piece.charAt(0).toUpperCase() + piece.slice(1))
  }
  return words.join(" ")
}

/**
 * Returns `value` only when it is a string containing both
 * "/datasets/evaleval/card_backend/" and "/instances/"; otherwise undefined.
 */
function getCanonicalInstanceResultsUrl(value: unknown) {
  if (typeof value === "string") {
    const pointsAtCardBackend = value.includes("/datasets/evaleval/card_backend/")
    if (pointsAtCardBackend && value.includes("/instances/")) {
      return value
    }
  }

  return undefined
}

// Canonical display names, keyed by the normalized benchmark key.
// getBenchmarkDisplayName looks entries up via normalizeBenchmarkKeyForLookup,
// so keys must be lowercase with runs of hyphens, dots, or whitespace written
// as a single underscore (e.g. "terminal-bench-2.0" → "terminal_bench_2_0").
// Benchmarks missing from this table fall back to humanizeToken.
const BENCHMARK_NAMES: Record<string, string> = {
  hfopenllm_v2: "HF Open LLM v2",
  helm_lite: "HELM Lite",
  helm_capabilities: "HELM Capabilities",
  helm_classic: "HELM Classic",
  helm_instruct: "HELM Instruct",
  helm_mmlu: "HELM MMLU",
  reward_bench: "RewardBench",
  reward_bench_2: "RewardBench 2",
  bfcl: "BFCL",
  global_mmlu_lite: "Global MMLU Lite",
  swe_bench: "SWE-bench",
  arc_agi: "ARC-AGI",
  tau_bench_2: "TAU-Bench 2",
  ace: "ACE",
  apex_agents: "APEX Agents",
  apex_v1: "APEX v1",
  appworld: "AppWorld",
  browsecompplus: "BrowseComp+",
  livecodebenchpro: "LiveCodeBench Pro",
  sciarena: "SciArena",
  terminal_bench_2_0: "Terminal Bench 2.0",
  la_leaderboard: "LA Leaderboard",
  theory_of_mind: "Theory of Mind",
  fibble_arena: "Fibble Arena",
  fibble1_arena: "Fibble Arena v1",
  fibble2_arena: "Fibble Arena v2",
  fibble3_arena: "Fibble Arena v3",
  fibble4_arena: "Fibble Arena v4",
  fibble5_arena: "Fibble Arena v5",
  wordle_arena: "Wordle Arena",
}

/**
 * Normalizes a benchmark key for table lookup: lowercase, every run of
 * hyphens / dots / whitespace becomes one underscore, and leading or
 * trailing underscores are removed.
 */
function normalizeBenchmarkKeyForLookup(key: string) {
  const lowered = key.toLowerCase()
  const separatorsCollapsed = lowered.replace(/[-.\s]+/g, "_")
  return separatorsCollapsed.replace(/^_+|_+$/g, "")
}

/**
 * Resolves a benchmark key to its human-readable display name.
 *
 * The key is normalized with normalizeBenchmarkKeyForLookup and looked up in
 * BENCHMARK_NAMES; unknown benchmarks fall back to humanizeToken applied to
 * the original (un-normalized) input.
 *
 * @param benchmark Raw benchmark key or name.
 * @returns The curated display name, or a humanized form of `benchmark`.
 */
export function getBenchmarkDisplayName(benchmark: string) {
  const lookupKey = normalizeBenchmarkKeyForLookup(benchmark)
  // Own-property check: a bare index on the object literal would also resolve
  // keys inherited from Object.prototype (e.g. "constructor") and hand a
  // function back to callers that expect a string.
  const curatedName = Object.prototype.hasOwnProperty.call(BENCHMARK_NAMES, lookupKey)
    ? BENCHMARK_NAMES[lookupKey]
    : undefined
  return curatedName ?? humanizeToken(benchmark)
}

/**
 * Produces a filename-safe slug: strips control characters, replaces every
 * character outside [a-zA-Z0-9._-] with "_", trims edge underscores, and
 * falls back to "unknown" when nothing is left. Case is preserved.
 */
function pipelineSlugify(text: string) {
  const withoutControlChars = text.replace(/[\x00-\x1f\x7f]/g, "")
  const safeChars = withoutControlChars.replace(/[^a-zA-Z0-9._-]/g, "_")
  const trimmed = safeChars.replace(/^_+|_+$/g, "")
  return trimmed === "" ? "unknown" : trimmed
}

/** Route id for a developer: the trimmed, lowercased name passed through pipelineSlugify. */
export function getDeveloperRouteId(developer: string) {
  const normalized = developer.trim().toLowerCase()
  return pipelineSlugify(normalized)
}

// ---------------------------------------------------------------------------
// Model detail slug candidates (for HF file lookup)
// ---------------------------------------------------------------------------

/**
 * Lists the slugs to try when looking up a model detail file, in priority
 * order and without duplicates.
 *
 * Per the existing notes on the HF dataset: filenames use "__" between
 * namespace and model ("openai/gpt-4o" → "openai__gpt-4o") and may replace
 * dots with hyphens ("gpt-3.5" → "gpt-3-5"). Each variant is tried both
 * as-is and lowercased.
 */
function getModelDetailSlugCandidates(modelId: string): string[] {
  const trimmedId = modelId.trim()
  const namespaced = trimmedId.replace(/\//g, "__")
  const dotless = namespaced.replace(/\./g, "-")

  // Set insertion order doubles as lookup priority.
  const slugs = new Set<string>()
  for (const variant of [namespaced, dotless, trimmedId]) {
    slugs.add(pipelineSlugify(variant))
    slugs.add(pipelineSlugify(variant.toLowerCase()))
  }

  return Array.from(slugs)
}

/**
 * Lists the slug spellings to try for a developer name or route id, in
 * priority order and without duplicates: underscore slugs (original case and
 * lowercase), their hyphenated forms, a lowercase hyphen slug, and a compact
 * alphanumeric-only slug. The last two are skipped when empty.
 */
export function getDeveloperSlugCandidates(developerOrRouteId: string): string[] {
  const trimmed = developerOrRouteId.trim()
  const lowered = trimmed.toLowerCase()

  const slug = pipelineSlugify(trimmed)
  const loweredSlug = pipelineSlugify(lowered)
  const dashed = lowered
    .replace(/[\x00-\x1f\x7f]/g, "")
    .replace(/[^a-z0-9]+/g, "-")
    .replace(/^-+|-+$/g, "")
  const squashed = lowered.replace(/[^a-z0-9]+/g, "")

  // Only the hand-built slugs can come out empty; the pipelineSlugify ones
  // are always added.
  const optional = [dashed, squashed].filter((candidate) => candidate.length > 0)

  return Array.from(
    new Set([
      slug,
      loweredSlug,
      slug.replace(/_/g, "-"),
      loweredSlug.replace(/_/g, "-"),
      ...optional,
    ])
  )
}

// ---------------------------------------------------------------------------
// Developer name normalization (now lives in @/lib/known-developers)
// ---------------------------------------------------------------------------

export { normalizeDeveloperName }

/**
 * Reads the average score from a model card's score summary, preferring
 * `average` over `avg`. Returns null when neither is a number.
 */
function getModelCardAverageScore(entry: HFModelCardEntry) {
  const summary = entry.score_summary
  for (const candidate of [summary?.average, summary?.avg]) {
    if (typeof candidate === "number") {
      return candidate
    }
  }

  return null
}

/**
 * Picks the most recent `last_updated` among the entry and its variants,
 * comparing via normalizeEvalTimestamp. When no timestamp is set anywhere the
 * entry's own (empty) value is returned.
 */
function getModelCardLatestTimestamp(entry: HFModelCardEntry) {
  const variantTimestamps = entry.variants.map((variant) => variant.last_updated)
  const known = [entry.last_updated, ...variantTimestamps].filter(
    (value): value is string => Boolean(value)
  )

  if (known.length === 0) {
    return entry.last_updated
  }

  // Newest first; `known` is a fresh array so sorting in place is safe.
  known.sort((left, right) => normalizeEvalTimestamp(right) - normalizeEvalTimestamp(left))
  return known[0]
}

/**
 * Builds the `top_scores` list for a model card.
 *
 * Uses the entry's top benchmark scores (finite ones only) when present;
 * otherwise falls back to a single "Average" row, or an empty list when no
 * average is available or the summary count is not positive.
 */
function getModelCardTopScores(entry: HFModelCardEntry): EvaluationCardData["top_scores"] {
  const rankedScores = entry.top_benchmark_scores
  if (Array.isArray(rankedScores) && rankedScores.length > 0) {
    const finiteScores = rankedScores.filter((item) => Number.isFinite(item.score))
    return finiteScores.map((item) => ({
      benchmark: getBenchmarkDisplayName(item.benchmark),
      benchmarkKey: item.benchmarkKey,
      score: item.score,
      metric: item.evaluation_name || item.metric,
    }))
  }

  const fallbackAverage = getModelCardAverageScore(entry)
  if (fallbackAverage == null || entry.score_summary.count <= 0) {
    return []
  }

  const averageRow = {
    benchmark: "Average",
    score: fallbackAverage,
    metric: "Cross-benchmark average",
  }
  return [averageRow]
}

/**
 * Counts, per benchmark, how many of the given models cover it.
 *
 * A model's benchmarks come from `benchmark_names`; when that list is empty
 * the benchmarks of its top scores are used instead. Each benchmark counts at
 * most once per model.
 *
 * @returns Map from benchmark name to number of models.
 */
export function getDeveloperBenchmarkStats(models: HFModelCardEntry[]) {
  const countsByBenchmark = new Map<string, number>()

  models.forEach((model) => {
    const declared = (model.benchmark_names ?? []).filter(Boolean)
    const source =
      declared.length > 0
        ? declared
        : model.top_benchmark_scores?.map((entry) => entry.benchmark).filter(Boolean)

    new Set(source).forEach((benchmark) => {
      const seen = countsByBenchmark.get(benchmark) ?? 0
      countsByBenchmark.set(benchmark, seen + 1)
    })
  })

  return countsByBenchmark
}

/**
 * Parses a parameter count into billions.
 *
 * - Numbers are taken as already being in billions.
 * - Strings may carry a magnitude suffix (trillion/tn/t, billion/bn/b,
 *   million/mn/m, thousand/k), case-insensitive, with optional thousands
 *   separators ("1,000k"). The first `<amount><unit>` token found wins.
 * - Strings without a recognised unit are parsed as a plain number of billions.
 *
 * @param value Raw value from the pipeline (number, string, or anything else).
 * @returns A positive finite number of billions, or null when the value is
 *          missing, non-positive, or unparseable.
 */
function parseParamsBillions(value: unknown): number | null {
  if (typeof value === "number") {
    return Number.isFinite(value) && value > 0 ? value : null
  }

  if (typeof value !== "string") {
    return null
  }

  const compact = value.trim().toLowerCase().replace(/,/g, "")
  if (!compact) {
    return null
  }

  // The amount may be "7", "1.5", or a leading-dot decimal such as ".5".
  // The leading-dot form is only accepted when the dot does not follow a word
  // character or another dot, so "llama.7b" still reads as 7 rather than 0.7.
  // Without it, ".5b" would be read as 5 billion while the unit-less fallback
  // below already reads ".5" as 0.5.
  const unitMatch = compact.match(
    /(\d+(?:\.\d+)?|(?<![\w.])\.\d+)\s*(trillion|tn|t|billion|bn|b|million|mn|m|thousand|k)\b/
  )
  if (unitMatch) {
    const amount = Number.parseFloat(unitMatch[1])
    if (!Number.isFinite(amount) || amount <= 0) {
      return null
    }

    switch (unitMatch[2]) {
      case "trillion":
      case "tn":
      case "t":
        return amount * 1000
      case "billion":
      case "bn":
      case "b":
        return amount
      case "million":
      case "mn":
      case "m":
        return amount / 1000
      case "thousand":
      case "k":
        return amount / 1_000_000
    }
  }

  // No unit token: interpret the leading number as billions.
  const numeric = Number.parseFloat(compact)
  return Number.isFinite(numeric) && numeric > 0 ? numeric : null
}

/**
 * Returns a copy of `summary` with the reproducibility / provenance /
 * comparability summaries from `detail` set on the family itself and on
 * every variant (all variants receive the same detail-level values).
 */
function attachModelSignalSummaries<T extends ReturnType<typeof createModelFamilySummary>>(
  summary: T,
  detail: HFModelDetail
): T {
  const signals = {
    reproducibility_summary: detail.reproducibility_summary,
    provenance_summary: detail.provenance_summary,
    comparability_summary: detail.comparability_summary,
  }

  return {
    ...summary,
    ...signals,
    variants: summary.variants.map((variant) => ({ ...variant, ...signals })),
  }
}

// ---------------------------------------------------------------------------
// HF model-cards.json → EvaluationCardData
// ---------------------------------------------------------------------------

/**
 * Maps one model-cards.json entry onto the EvaluationCardData shape used by
 * the model list UI.
 *
 * Identity fields come from getCanonicalModelIdentity. Per-category counts
 * are not present in the summary, so `total_evaluations` is split evenly
 * across the covered categories (at least 1 each while any remain), with the
 * last category absorbing the remainder.
 */
export function hfModelCardToEvaluationCardData(entry: HFModelCardEntry): EvaluationCardData {
  const { familyId, familyName } = getCanonicalModelIdentity({
    id: entry.model_family_id,
    name: entry.model_family_name,
  })
  const categories = mapHFCategories(entry.categories_covered) as CategoryType[]
  const averageScore = getModelCardAverageScore(entry)
  const topScores = getModelCardTopScores(entry)

  const totalEvaluations = entry.total_evaluations
  const benchmarkNames = entry.benchmark_names ?? []
  const benchmarkTotal = benchmarkNames.length

  // Even split of the evaluation total across categories.
  const categoryStats: Record<string, number> = {}
  const evenShare =
    categories.length > 0 ? Math.max(1, Math.floor(totalEvaluations / categories.length)) : 0
  let unassigned = totalEvaluations
  categories.forEach((category, index) => {
    const isLast = index === categories.length - 1
    const share = isLast ? unassigned : Math.min(evenShare, unassigned)
    categoryStats[category] = share
    unassigned -= share
  })

  return {
    id: familyId,
    route_id: getModelFamilyRouteId(familyId),
    model_name: familyName,
    model_id: familyId,
    canonical_model_name: familyName,
    developer: normalizeDeveloperName(entry.developer),
    evaluations_count: totalEvaluations,
    benchmarks_count: entry.benchmark_family_count || entry.benchmark_count,
    variant_count: entry.variants.length,
    categories,
    category_stats: categoryStats as Record<CategoryType, number>,
    latest_timestamp: getModelCardLatestTimestamp(entry),
    // Not available in the summary: placeholder values chosen so the UI does
    // not show misleading "missing" / "self-reported only" badges.
    evaluator_count: 0,
    evaluator_names: [],
    source_type_count: 1,
    source_types: ["documentation"],
    evidence_count: totalEvaluations,
    missing_generation_config_count: 0,
    third_party_eval_count: 0,
    independent_verification_ratio: 0,
    reproducibility_status: "partial",
    eval_libraries: [],
    latest_source_name: benchmarkTotal
      ? `${benchmarkTotal} benchmark${benchmarkTotal === 1 ? "" : "s"}`
      : undefined,
    params_billions: parseParamsBillions(entry.params_billions),
    reproducibility_summary: entry.reproducibility_summary,
    provenance_summary: entry.provenance_summary,
    comparability_summary: entry.comparability_summary,
    benchmark_names: benchmarkNames.map((name) => getBenchmarkDisplayName(name)),
    score_summary: {
      count: entry.score_summary.count,
      min: entry.score_summary.min,
      max: entry.score_summary.max,
      average: averageScore,
    },
    top_scores: topScores,
    source_urls: [],
    detail_urls: [],
  }
}

// ---------------------------------------------------------------------------
// HF eval-list.json → BenchmarkEvalListItem
// ---------------------------------------------------------------------------

/**
 * Maps one eval-list.json entry onto a BenchmarkEvalListItem.
 *
 * Fields the list payload does not carry (evaluators, source types, averages,
 * worst model) are filled with neutral placeholders. `metric_config` is
 * synthesized from the primary metric with a fixed 0–1 range.
 *
 * @param entry Eval list entry from the HF backend.
 * @returns The list item consumed by the benchmark list UI.
 */
export function hfEvalEntryToListItem(entry: HFEvalListEntry): BenchmarkEvalListItem {
  // Use the pipeline's category directly, mapped to our CategoryType
  const category = mapHFCategories([entry.category])[0] ?? "General" as CategoryType

  // Build a metric_config from the primary metric
  const metrics = entry.metrics ?? []
  const primaryMetric = metrics.find((m) => m.metric_name === entry.primary_metric_name) ?? metrics[0]

  // `benchmark` is treated as possibly missing everywhere in this function;
  // normalize it once so no call site passes a nullish value to
  // getBenchmarkDisplayName (which lowercases its argument and would throw).
  const benchmarkKey = entry.benchmark ?? ""

  const benchmarkDisplayName = getBenchmarkDisplayName(entry.benchmark_parent_name || benchmarkKey)
  const rawDisplayName = entry.evaluation_name || entry.display_name || entry.benchmark_leaf_name || entry.eval_summary_id
  const normalizedDisplayName = rawDisplayName.trim().toLowerCase()
  // Generic, scorer-style titles ("Accuracy on …", "… for scorer …") are
  // replaced by the benchmark's own display name when one is available.
  const prefersBenchmarkName =
    Boolean(benchmarkDisplayName) &&
    (normalizedDisplayName.startsWith("accuracy on ") ||
      normalizedDisplayName.startsWith("score on ") ||
      normalizedDisplayName.includes("for scorer") ||
      normalizedDisplayName.includes("model_graded"))

  return {
    evaluation_name: prefersBenchmarkName ? benchmarkDisplayName : rawDisplayName,
    evaluation_id: entry.eval_summary_id,
    composite_benchmark_key: benchmarkKey,
    composite_benchmark_name: benchmarkDisplayName,
    category,
    metric_config: {
      evaluation_description: entry.primary_metric_name,
      lower_is_better: primaryMetric?.lower_is_better ?? false,
      score_type: "continuous",
      min_score: 0,
      max_score: 1,
    },
    models_count: entry.models_count,
    evaluator_names: [],
    source_types: [],
    latest_source_name: getBenchmarkDisplayName(benchmarkKey),
    third_party_ratio: 0,
    missing_generation_config_count: 0,
    best_model: entry.top_score != null ? { name: "", score: entry.top_score } : null,
    worst_model: null,
    avg_score: 0,
    avg_score_norm: 0,
    benchmark_card: entry.benchmark_card ?? undefined,
    // New fields from the pipeline
    tags: entry.tags,
    metrics_count: entry.metrics_count,
    metric_names: entry.metric_names,
    instance_data: entry.instance_data,
    family_id: entry.benchmark_family_key,
    parent_benchmark_id: entry.benchmark_parent_key,
    source_data: entry.source_data,
    top_score: entry.top_score,
    subtasks_count: entry.subtasks_count ?? 0,
    is_summary_score: entry.is_summary_score ?? false,
    summary_eval_ids: entry.summary_eval_ids ?? [],
    evalcards: entry.evalcards,
    reproducibility_summary: entry.reproducibility_summary,
    provenance_summary: entry.provenance_summary,
    comparability_summary: entry.comparability_summary,
  }
}

// ---------------------------------------------------------------------------
// HF eval detail → BenchmarkEvalSummary
// ---------------------------------------------------------------------------

/**
 * Derives the summary-level metric config from a detail metric.
 *
 * The description is the first non-empty value among the raw config's
 * evaluation_description / metric_name and the metric's own metric_name,
 * display_name, evaluation_name. Unknown score types become "continuous";
 * missing bounds default to 0 and 1.
 */
function toSummaryMetricConfig(
  metric: HFEvalDetail["metrics"][number]
): BenchmarkEvalSummary["metric_config"] {
  const raw = (metric.metric_config ?? {}) as Record<string, unknown>
  const asString = (candidate: unknown) => (typeof candidate === "string" ? candidate : undefined)
  const asNumberOr = (candidate: unknown, fallback: number) =>
    typeof candidate === "number" ? candidate : fallback

  const descriptionCandidates = [
    asString(raw.evaluation_description),
    asString(raw.metric_name),
    metric.metric_name,
    metric.display_name,
    metric.evaluation_name,
  ]
  const description = descriptionCandidates.find((candidate) => Boolean(candidate)) || ""

  const rawScoreType = raw.score_type

  return {
    evaluation_description: description,
    lower_is_better: metric.lower_is_better ?? false,
    score_type:
      rawScoreType === "binary" || rawScoreType === "discrete" ? rawScoreType : "continuous",
    min_score: asNumberOr(raw.min_score, 0),
    max_score: asNumberOr(raw.max_score, 1),
    // An empty-string `unit` is kept; `metric_unit` is only consulted when
    // `unit` is not a string at all.
    unit: asString(raw.unit) ?? asString(raw.metric_unit),
  }
}

/**
 * Summarizes one detail metric for display: resolved names, direction,
 * number of model results, unit, and the best finite score (minimum when
 * lower is better, maximum otherwise; undefined when there are no finite
 * scores).
 */
function toBenchmarkSummaryMetric(metric: HFEvalDetail["metrics"][number]) {
  const { unit } = toSummaryMetricConfig(metric)
  const results = metric.model_results ?? []
  const finiteScores = results.map((result) => result.score).filter(Number.isFinite)

  const metricName = metric.metric_name || metric.evaluation_name || metric.display_name || "Metric"
  const displayName = metric.display_name || metric.metric_name || metric.evaluation_name || metricName

  let topScore: number | undefined
  if (finiteScores.length > 0) {
    topScore = metric.lower_is_better ? Math.min(...finiteScores) : Math.max(...finiteScores)
  }

  return {
    metric_summary_id: metric.metric_summary_id,
    metric_name: metricName,
    display_name: displayName,
    canonical_display_name: metric.canonical_display_name,
    metric_key: metric.metric_key,
    lower_is_better: metric.lower_is_better ?? false,
    models_count: results.length,
    top_score: topScore,
    unit,
  }
}

/**
 * Normalizes `detail.subtasks` into a uniform list.
 *
 * Non-object entries are dropped. Missing names fall back between
 * subtask_name and display_name (then "Subtask"); a missing key falls back to
 * the slugified display name (then "subtask"). `metrics` is the subtask's
 * metrics array, or empty when it is not an array.
 */
function extractDetailSubtasks(detail: HFEvalDetail) {
  const rawSubtasks: unknown[] = Array.isArray(detail.subtasks) ? detail.subtasks : []
  // Non-string fields behave like empty strings in the fallback chains below.
  const textOf = (candidate: unknown): string => (typeof candidate === "string" ? candidate : "")

  return rawSubtasks
    .filter(
      (entry): entry is Record<string, unknown> => Boolean(entry) && typeof entry === "object"
    )
    .map((record) => {
      const key = textOf(record.subtask_key)
      const name = textOf(record.subtask_name)
      const label = textOf(record.display_name)

      return {
        subtask_key: key || (label ? slugifyEvalId(label) : "") || "subtask",
        subtask_name: name || label || "Subtask",
        display_name: label || name || "Subtask",
        canonical_display_name:
          typeof record.canonical_display_name === "string"
            ? record.canonical_display_name
            : undefined,
        metrics: Array.isArray(record.metrics)
          ? (record.metrics as HFEvalDetail["metrics"])
          : [],
      }
    })
}

/**
 * Subtasks in summary form: same identity fields as extractDetailSubtasks,
 * with each raw metric replaced by its toBenchmarkSummaryMetric summary.
 */
function extractBenchmarkSubtasks(detail: HFEvalDetail): NonNullable<BenchmarkEvalSummary["subtasks"]> {
  return extractDetailSubtasks(detail).map(({ metrics, ...identity }) => ({
    ...identity,
    metrics: metrics.map(toBenchmarkSummaryMetric),
  }))
}

/**
 * Flattens an eval detail into a leaderboard matrix: one column per metric
 * (root metrics first, then each subtask's metrics) and one row per model.
 *
 * Rows are keyed by model_id, falling back to model_name; results with
 * neither are skipped. The first result seen for a model seeds the row
 * (model_info, source_metadata, source_data); later results only fill in
 * cell values and annotations, backfill a missing model_route_id, and move
 * evaluation_timestamp forward.
 *
 * @throws Error (via assertSourceMetadata) when a model result with a usable
 *         id lacks source_metadata.
 */
function buildBenchmarkLeaderboardMatrix(detail: HFEvalDetail) {
  const benchmarkKey = detail.benchmark ?? ""
  const sourceData = detail.source_data ?? { dataset_name: benchmarkKey }

  const leaderboardMetrics: NonNullable<BenchmarkEvalSummary["leaderboard_metrics"]> = []
  // Row accumulator; _timestampValue is a private sort key stripped before returning.
  const rowStates = new Map<
    string,
    NonNullable<BenchmarkEvalSummary["leaderboard_rows"]>[number] & { _timestampValue: number }
  >()

  // Adds one column for `metric` and merges its model results into rowStates.
  const registerMetric = (
    metric: HFEvalDetail["metrics"][number],
    scope: "root" | "subtask",
    subtask?: {
      subtask_key: string
      subtask_name: string
    }
  ) => {
    const summaryMetric = toBenchmarkSummaryMetric(metric)
    const metricToken = summaryMetric.metric_summary_id || summaryMetric.metric_key || slugifyEvalId(summaryMetric.display_name)
    // Column key is "scope[:subtask_key]:metricToken"; empty parts are omitted.
    const columnKey = [scope, subtask?.subtask_key, metricToken].filter(Boolean).join(":")

    leaderboardMetrics.push({
      column_key: columnKey,
      metric_summary_id: summaryMetric.metric_summary_id,
      metric_name: summaryMetric.metric_name,
      display_name: summaryMetric.display_name,
      canonical_display_name: summaryMetric.canonical_display_name,
      lower_is_better: summaryMetric.lower_is_better,
      unit: summaryMetric.unit,
      scope,
      subtask_key: subtask?.subtask_key,
      subtask_name: subtask?.subtask_name,
    })

    for (const modelResult of metric.model_results ?? []) {
      const modelId = modelResult.model_id || modelResult.model_name
      if (!modelId) {
        // No usable row key; the result cannot be placed in the matrix.
        continue
      }
      assertSourceMetadata(modelResult, `eval=${detail.eval_summary_id} metric=${metric.metric_summary_id}`)

      const nextTimestamp = normalizeEvalTimestamp(modelResult.retrieved_timestamp ?? "")
      const existing = rowStates.get(modelId)
      if (!existing) {
        // First sighting of this model: seed the row from this result.
        rowStates.set(modelId, {
          model_info: {
            name: modelResult.model_name ?? "",
            id: modelId,
            developer: modelResult.developer ?? "",
          },
          model_route_id: modelResult.model_route_id,
          evaluation_timestamp: modelResult.retrieved_timestamp ?? "",
          source_metadata: modelResult.source_metadata,
          source_data: sourceData,
          values: { [columnKey]: modelResult.score ?? null },
          annotations_by_metric: { [columnKey]: modelResult.evalcards?.annotations ?? null },
          // Placeholder; the real count is computed once all columns are known.
          metrics_present: 0,
          _timestampValue: nextTimestamp,
        })
        continue
      }

      // Known model: fill this column's cell and annotation.
      existing.values[columnKey] = modelResult.score ?? null
      existing.annotations_by_metric = {
        ...(existing.annotations_by_metric ?? {}),
        [columnKey]: modelResult.evalcards?.annotations ?? null,
      }
      if (!existing.model_route_id && modelResult.model_route_id) {
        existing.model_route_id = modelResult.model_route_id
      }
      // Keep the newest timestamp. A NaN timestamp never compares >= and so
      // never replaces the stored one.
      if (nextTimestamp >= existing._timestampValue) {
        existing.evaluation_timestamp = modelResult.retrieved_timestamp ?? existing.evaluation_timestamp
        existing._timestampValue = nextTimestamp
      }
    }
  }

  // Column order: root metrics first, then subtask metrics in subtask order.
  for (const metric of detail.metrics ?? []) {
    registerMetric(metric, "root")
  }

  for (const subtask of extractDetailSubtasks(detail)) {
    for (const metric of subtask.metrics) {
      registerMetric(metric, "subtask", {
        subtask_key: subtask.subtask_key,
        subtask_name: subtask.display_name || subtask.subtask_name,
      })
    }
  }

  // Drop the private sort key and count the columns holding a numeric value.
  const leaderboardRows = Array.from(rowStates.values()).map(({ _timestampValue, ...row }) => ({
    ...row,
    metrics_present: leaderboardMetrics.reduce(
      (count, metric) => count + (typeof row.values[metric.column_key] === "number" ? 1 : 0),
      0
    ),
  }))

  return {
    leaderboard_metrics: leaderboardMetrics,
    leaderboard_rows: leaderboardRows,
  }
}

/**
 * Expands one metric's model results into ModelResultForBenchmark rows.
 *
 * Missing scores are reported as 0 and missing names / ids / timestamps as
 * empty strings. `source_data` falls back to a stub naming the benchmark.
 *
 * @throws Error (via assertSourceMetadata) when a result lacks source_metadata.
 */
function toModelResultsForMetric(
  detail: HFEvalDetail,
  metric: HFEvalDetail["metrics"][number]
): ModelResultForBenchmark[] {
  const datasetFallbackName = detail.benchmark ?? ""
  const sharedMetricConfig = toSummaryMetricConfig(metric)
  const rows: ModelResultForBenchmark[] = []

  for (const row of metric.model_results ?? []) {
    assertSourceMetadata(row, `eval=${detail.eval_summary_id} metric=${metric.metric_summary_id}`)

    const timestamp = row.retrieved_timestamp ?? ""
    const score = row.score ?? 0

    const result: EvaluationResult = {
      evaluation_name: metric.metric_name || metric.evaluation_name || metric.display_name || "",
      display_name: metric.display_name || metric.metric_name || metric.evaluation_name,
      canonical_display_name: metric.canonical_display_name,
      metric_summary_id: metric.metric_summary_id,
      metric_key: metric.metric_key,
      evaluation_timestamp: timestamp,
      metric_config: sharedMetricConfig,
      score_details: { score },
      // Only URLs pointing at the canonical backend instance files are exposed.
      detailed_evaluation_results_url: getCanonicalInstanceResultsUrl(
        row.detailed_evaluation_results
      ),
      evalcards: row.evalcards,
    }

    rows.push({
      model_info: {
        name: row.model_name ?? "",
        id: row.model_id ?? "",
        developer: row.developer ?? "",
      },
      model_route_id: row.model_route_id,
      score,
      score_details: { score },
      evaluation_timestamp: timestamp,
      source_metadata: row.source_metadata,
      source_data: detail.source_data ?? { dataset_name: datasetFallbackName },
      result,
      source_record_url: row.source_record_url,
    })
  }

  return rows
}

/**
 * Converts an HF eval detail payload into a BenchmarkEvalSummary.
 *
 * The headline ("primary") metric is the first root metric, or failing that
 * the first metric of the first subtask that has any. Its model results are
 * ranked best-first and drive best/worst/average. When the detail has no
 * metrics at all, an empty summary with neutral placeholders is returned.
 *
 * @throws Error (via assertSourceMetadata) when a model result lacks source_metadata.
 */
export function hfEvalDetailToSummary(detail: HFEvalDetail): BenchmarkEvalSummary {
  const evaluationName = detail.benchmark_leaf_name || detail.eval_summary_id || "Unknown"
  const compositeKey = detail.benchmark ?? ""
  const topLevelMetrics = detail.metrics ?? []
  const rootMetrics = topLevelMetrics.map(toBenchmarkSummaryMetric)
  const subtasks = extractBenchmarkSubtasks(detail)
  const { leaderboard_metrics: columns, leaderboard_rows: rows } = buildBenchmarkLeaderboardMatrix(detail)
  const headlineMetric =
    topLevelMetrics[0] ?? extractDetailSubtasks(detail).flatMap((subtask) => subtask.metrics)[0]

  const compositeName = getBenchmarkDisplayName(compositeKey)
  const category = inferCategoryFromBenchmark(evaluationName)
  // Subtask columns are labelled "<subtask> / <metric>".
  const columnLabels = columns.map((column) =>
    column.scope === "subtask" && column.subtask_name
      ? `${column.subtask_name} / ${column.metric_name}`
      : column.metric_name
  )

  if (!headlineMetric) {
    return {
      evaluation_name: evaluationName,
      evaluation_id: detail.eval_summary_id,
      canonical_display_name: detail.canonical_display_name,
      composite_benchmark_key: compositeKey,
      composite_benchmark_name: compositeName,
      category,
      metric_config: { evaluation_description: "", lower_is_better: false, score_type: "continuous" },
      model_results: [],
      models_count: 0,
      evaluator_names: [],
      source_types: [],
      latest_source_name: compositeName,
      third_party_ratio: 0,
      missing_generation_config_count: 0,
      best_model: null,
      worst_model: null,
      avg_score: 0,
      avg_score_norm: 0,
      benchmark_card: detail.benchmark_card ?? undefined,
      metrics_count: columns.length,
      metric_names: columnLabels,
      root_metrics: rootMetrics,
      subtasks,
      leaderboard_metrics: columns,
      leaderboard_rows: rows,
      source_data: detail.source_data,
      evalcards: detail.evalcards,
      reproducibility_summary: detail.reproducibility_summary,
      provenance_summary: detail.provenance_summary,
      comparability_summary: detail.comparability_summary,
    }
  }

  const rankedResults = toModelResultsForMetric(detail, headlineMetric)
  const metricConfig = toSummaryMetricConfig(headlineMetric)

  // Best first: ascending when lower is better, descending otherwise.
  const byScore = metricConfig.lower_is_better
    ? (left: ModelResultForBenchmark, right: ModelResultForBenchmark) => left.score - right.score
    : (left: ModelResultForBenchmark, right: ModelResultForBenchmark) => right.score - left.score
  rankedResults.sort(byScore)

  const finiteScores = rankedResults.map((row) => row.score).filter(Number.isFinite)
  let scoreTotal = 0
  for (const value of finiteScores) {
    scoreTotal += value
  }
  const averageScore = finiteScores.length > 0 ? scoreTotal / finiteScores.length : 0

  const best = rankedResults[0]
  const worst = rankedResults[rankedResults.length - 1]

  return {
    evaluation_name: evaluationName,
    evaluation_id: detail.eval_summary_id,
    canonical_display_name: detail.canonical_display_name,
    composite_benchmark_key: compositeKey,
    composite_benchmark_name: compositeName,
    category,
    metric_config: metricConfig,
    model_results: rankedResults,
    models_count: rankedResults.length,
    evaluator_names: [],
    source_types: [],
    latest_source_name: compositeName,
    third_party_ratio: 0,
    missing_generation_config_count: 0,
    best_model: best ? { name: best.model_info.name, score: best.score } : null,
    worst_model: worst ? { name: worst.model_info.name, score: worst.score } : null,
    avg_score: averageScore,
    avg_score_norm: averageScore, // scores are already 0-1 from the pipeline
    benchmark_card: detail.benchmark_card ?? undefined,
    metric_names: columnLabels.filter((label): label is string => Boolean(label)),
    metrics_count: columns.length,
    root_metrics: rootMetrics,
    subtasks,
    leaderboard_metrics: columns,
    leaderboard_rows: rows,
    source_data: detail.source_data,
    evalcards: detail.evalcards,
    reproducibility_summary: detail.reproducibility_summary,
    provenance_summary: detail.provenance_summary,
    comparability_summary: detail.comparability_summary,
  }
}

// ---------------------------------------------------------------------------
// Aggregation (for aggregate eval summaries)
// ---------------------------------------------------------------------------

/**
 * Ensures a summary carries a benchmark card.
 *
 * Summaries that already have one are returned untouched. Otherwise the card
 * is looked up by evaluation name, then composite benchmark name, then
 * composite benchmark key; the first hit is attached to a copy of the
 * summary. With no hit the original summary is returned.
 */
export async function attachBenchmarkCardToSummary(summary: BenchmarkEvalSummary): Promise<BenchmarkEvalSummary> {
  if (summary.benchmark_card) {
    return summary
  }

  const lookupNames = [
    summary.evaluation_name,
    summary.composite_benchmark_name,
    summary.composite_benchmark_key,
  ]

  // Sequential on purpose: earlier names take priority and later lookups are
  // skipped once a card is found.
  for (const lookupName of lookupNames) {
    const found = await getBenchmarkCard(lookupName)
    if (found) {
      return { ...summary, benchmark_card: found }
    }
  }

  return summary
}

/**
 * Collapses several per-eval summaries into a single suite-level summary.
 *
 * Model results are grouped by `model_info.id`. Each model's aggregate score
 * is the mean of `normalizeSummaryScore(summary, score)` over every summary it
 * appears in; its timestamp, source metadata, source data and base `result`
 * come from the component with the greatest `normalizeEvalTimestamp` value.
 * Category, `lower_is_better` and the benchmark card are taken from the first
 * summary.
 *
 * @param summaries - Sub-eval summaries belonging to one suite.
 * @param aggregationKey - Suite key; drives the display name and aggregate id.
 * @returns The aggregated summary, or `null` when `summaries` is empty.
 */
function aggregateBenchmarkSummaries(
  summaries: BenchmarkEvalSummary[],
  aggregationKey: string
): BenchmarkEvalSummary | null {
  if (summaries.length === 0) return null

  const first = summaries[0]
  const card = first.benchmark_card

  // Use each sub-eval's own name (not the suite name) so sub-cards show distinct titles.
  // Keying the Map by evaluation_id keeps one entry per sub-eval (last one wins).
  const aggregateSources = Array.from(
    new Map(
      summaries.map((summary) => [
        summary.evaluation_id,
        {
          evaluation_id: summary.evaluation_id,
          composite_benchmark_key: summary.composite_benchmark_key,
          composite_benchmark_name: summary.evaluation_name,
          models_count: summary.models_count,
          avg_score_norm: summary.avg_score_norm,
        },
      ])
    ).values()
  ).sort((a, b) => a.composite_benchmark_name.localeCompare(b.composite_benchmark_name))

  // The display name for the aggregate should be the suite name, not a sub-eval name
  const suiteDisplayName = getBenchmarkDisplayName(aggregationKey)

  // One bucket per model id, collecting every (summary, result) pair for that model.
  const modelBuckets = new Map<
    string,
    {
      model_info: ModelResultForBenchmark["model_info"]
      components: Array<{ summary: BenchmarkEvalSummary; modelResult: ModelResultForBenchmark }>
    }
  >()

  for (const summary of summaries) {
    for (const modelResult of summary.model_results) {
      const existing = modelBuckets.get(modelResult.model_info.id) ?? {
        model_info: modelResult.model_info,
        components: [],
      }
      existing.components.push({ summary, modelResult })
      modelBuckets.set(modelResult.model_info.id, existing)
    }
  }

  // The aggregate is always expressed on a 0-1 "normalized average" scale.
  const aggregateMetricConfig = {
    ...first.metric_config,
    evaluation_description:
      aggregateSources.length > 1
        ? `Average normalized score across ${aggregateSources.map((s) => s.composite_benchmark_name).join(", ")}`
        : first.metric_config.evaluation_description,
    min_score: 0,
    max_score: 1,
    unit: "normalized average",
  } as const

  const aggregatedModelResults: ModelResultForBenchmark[] = Array.from(modelBuckets.values()).map(
    ({ model_info, components }) => {
      // Every bucket holds at least one component, so this mean is well defined.
      const normalizedScores = components.map(({ summary, modelResult }) =>
        normalizeSummaryScore(summary, modelResult.score)
      )
      const avgNormalizedScore =
        normalizedScores.reduce((sum, s) => sum + s, 0) / normalizedScores.length

      // Most recent component supplies the row-level metadata.
      const latestComponent = [...components].sort(
        (a, b) =>
          normalizeEvalTimestamp(b.modelResult.evaluation_timestamp) -
          normalizeEvalTimestamp(a.modelResult.evaluation_timestamp)
      )[0]

      const aggregateComponents = components
        .map(({ summary, modelResult }) => ({
          evaluation_id: summary.evaluation_id,
          composite_benchmark_key: summary.composite_benchmark_key,
          composite_benchmark_name: summary.composite_benchmark_name,
          score: modelResult.score,
          normalized_score: normalizeSummaryScore(summary, modelResult.score),
          evaluation_timestamp: modelResult.evaluation_timestamp,
          source_name: modelResult.source_metadata.source_name,
          source_type: modelResult.source_metadata.source_type,
          source_organization_name: modelResult.source_metadata.source_organization_name,
          evaluator_relationship: modelResult.source_metadata.evaluator_relationship,
        }))
        .sort((a, b) => a.composite_benchmark_name.localeCompare(b.composite_benchmark_name))

      return {
        model_info,
        score: avgNormalizedScore,
        score_details: {
          score: avgNormalizedScore,
          // A total of 0 (no component reported a sample size) is surfaced as undefined.
          sample_size:
            components.reduce((sum, { modelResult }) => sum + (modelResult.score_details.sample_size ?? 0), 0) ||
            undefined,
        },
        evaluation_timestamp: latestComponent.modelResult.evaluation_timestamp,
        source_metadata: latestComponent.modelResult.source_metadata,
        source_data: latestComponent.modelResult.source_data,
        result: {
          ...latestComponent.modelResult.result,
          evaluation_name: card?.benchmark_details?.name ?? first.evaluation_name,
          metric_config: aggregateMetricConfig,
          score_details: { score: avgNormalizedScore },
        },
        aggregate_components: aggregateComponents,
      }
    }
  )

  // Best model first: ascending when lower is better, descending otherwise.
  const lowerIsBetter = first.metric_config.lower_is_better
  aggregatedModelResults.sort((a, b) => (lowerIsBetter ? a.score - b.score : b.score - a.score))

  // Summaries may all have empty model_results; report 0 rather than NaN (0 / 0).
  const avgScore =
    aggregatedModelResults.length > 0
      ? aggregatedModelResults.reduce((sum, r) => sum + r.score, 0) / aggregatedModelResults.length
      : 0

  const evaluatorNames = Array.from(
    new Set(summaries.flatMap((s) => s.evaluator_names))
  ).sort()

  const sourceTypes = Array.from(
    new Set(summaries.flatMap((s) => s.source_types))
  ).sort()

  // Third-party ratio is computed over the underlying (pre-aggregation) rows.
  const totalUnderlying = summaries.reduce((sum, s) => sum + s.model_results.length, 0)
  const totalThirdParty = summaries.reduce(
    (sum, s) =>
      sum + s.model_results.filter((r) => r.source_metadata.evaluator_relationship === "third_party").length,
    0
  )

  return {
    // Use the suite display name for the aggregate, never a sub-metric name
    evaluation_name: suiteDisplayName,
    evaluation_id: getAggregateEvalId(aggregationKey),
    composite_benchmark_key:
      aggregateSources.length === 1 ? aggregateSources[0].composite_benchmark_key : aggregationKey,
    composite_benchmark_name: suiteDisplayName,
    category: first.category,
    metric_config: aggregateMetricConfig,
    model_results: aggregatedModelResults,
    models_count: aggregatedModelResults.length,
    evaluator_names: evaluatorNames,
    source_types: sourceTypes,
    latest_source_name:
      aggregateSources.length === 1 ? aggregateSources[0].composite_benchmark_name : "Multiple sources",
    third_party_ratio: totalUnderlying > 0 ? totalThirdParty / totalUnderlying : 0,
    missing_generation_config_count: summaries.reduce(
      (sum, s) => sum + s.missing_generation_config_count, 0
    ),
    best_model:
      aggregatedModelResults.length > 0
        ? { name: aggregatedModelResults[0].model_info.name, score: aggregatedModelResults[0].score }
        : null,
    worst_model:
      aggregatedModelResults.length > 0
        ? {
            name: aggregatedModelResults[aggregatedModelResults.length - 1].model_info.name,
            score: aggregatedModelResults[aggregatedModelResults.length - 1].score,
          }
        : null,
    avg_score: avgScore,
    avg_score_norm: avgScore,
    benchmark_card: card,
    is_aggregated: true,
    aggregate_sources: aggregateSources,
  }
}

// Prefix of the synthetic evaluation ids produced for suite-level matrix
// summaries (`matrix__<suiteKey>`); getEvalSummaryById checks for it to route
// such ids to the matrix builder below.
const SYNTHETIC_MATRIX_EVAL_PREFIX = "matrix__"

/**
 * Builds a synthetic "matrix" summary for a suite of single-metric evals:
 * one leaderboard column per eval detail and one leaderboard row per model.
 *
 * Only details with exactly one metric and no subtasks are used. The result
 * carries no flat `model_results`; all scores live in `leaderboard_rows`.
 *
 * @param details - Eval details belonging to the suite.
 * @param suiteKey - Suite key; used for the display name and synthetic id.
 * @returns The synthetic summary, or `null` when fewer than two details,
 *   fewer than two usable details, or fewer than two columns are available.
 */
function buildSingleMetricSuiteMatrixSummary(
  details: HFEvalDetail[],
  suiteKey: string
): BenchmarkEvalSummary | null {
  if (details.length < 2) {
    return null
  }

  const suiteDisplayName = getBenchmarkDisplayName(suiteKey)
  // Keep single-metric, subtask-free details, ordered by leaf name (falling
  // back to the eval summary id) so column order is deterministic.
  const validDetails = [...details]
    .filter((detail) => (detail.metrics?.length ?? 0) === 1 && extractDetailSubtasks(detail).length === 0)
    .sort((left, right) =>
      (left.benchmark_leaf_name || left.eval_summary_id).localeCompare(right.benchmark_leaf_name || right.eval_summary_id)
    )

  if (validDetails.length < 2) {
    return null
  }

  const leaderboardMetrics: NonNullable<BenchmarkEvalSummary["leaderboard_metrics"]> = []
  // Mutable per-model row state keyed by model id. `_timestampValue` is a
  // private bookkeeping field that is stripped before rows are returned.
  const rowStates = new Map<
    string,
    NonNullable<BenchmarkEvalSummary["leaderboard_rows"]>[number] & { _timestampValue: number }
  >()

  // First metric config / benchmark card encountered wins for the whole suite.
  let metricConfig: BenchmarkEvalSummary["metric_config"] | null = null
  let benchmarkCard: BenchmarkCard | undefined
  const metricNames = new Set<string>()

  for (const detail of validDetails) {
    const metric = detail.metrics?.[0]
    if (!metric) {
      continue
    }

    if (!metricConfig) {
      metricConfig = toSummaryMetricConfig(metric)
    }

    if (!benchmarkCard && detail.benchmark_card) {
      benchmarkCard = detail.benchmark_card
    }

    const summaryMetric = toBenchmarkSummaryMetric(metric)
    metricNames.add(summaryMetric.metric_name)
    // Each detail becomes one "subtask" column; the column key combines the
    // subtask key with the first available metric identifier.
    const subtaskKey = detail.benchmark_leaf_key || slugifyEvalId(detail.eval_summary_id)
    const subtaskName = detail.benchmark_leaf_name || detail.canonical_display_name || detail.eval_summary_id || subtaskKey
    const metricToken =
      summaryMetric.metric_summary_id ||
      summaryMetric.metric_key ||
      slugifyEvalId(summaryMetric.display_name)
    const columnKey = ["subtask", subtaskKey, metricToken].join(":")

    leaderboardMetrics.push({
      column_key: columnKey,
      metric_summary_id: summaryMetric.metric_summary_id,
      metric_name: summaryMetric.metric_name,
      display_name: summaryMetric.display_name,
      canonical_display_name: summaryMetric.canonical_display_name,
      lower_is_better: summaryMetric.lower_is_better,
      unit: summaryMetric.unit,
      scope: "subtask",
      subtask_key: subtaskKey,
      subtask_name: subtaskName,
    })

    const benchmarkKey = detail.benchmark ?? suiteKey
    const sourceData = detail.source_data ?? { dataset_name: benchmarkKey }

    for (const modelResult of metric.model_results ?? []) {
      // Rows without any usable model identifier cannot be keyed; skip them.
      const modelId = modelResult.model_id || modelResult.model_name
      if (!modelId) {
        continue
      }
      // NOTE(review): presumably throws when source_metadata is missing, per the
      // "fail loud" pipeline contract described near the helper — confirm there.
      assertSourceMetadata(modelResult, `suite=${suiteKey} metric=${metric.metric_summary_id}`)

      const nextTimestamp = normalizeEvalTimestamp(modelResult.retrieved_timestamp ?? "")
      const existing = rowStates.get(modelId)

      // First sighting of this model: seed its row with this column's value.
      if (!existing) {
        rowStates.set(modelId, {
          model_info: {
            name: modelResult.model_name ?? "",
            id: modelId,
            developer: modelResult.developer ?? "",
          },
          model_route_id: modelResult.model_route_id,
          evaluation_timestamp: modelResult.retrieved_timestamp ?? "",
          source_metadata: modelResult.source_metadata,
          source_data: sourceData,
          values: { [columnKey]: modelResult.score ?? null },
          annotations_by_metric: { [columnKey]: modelResult.evalcards?.annotations ?? null },
          // Placeholder; the real count is computed once all columns are known.
          metrics_present: 0,
          _timestampValue: nextTimestamp,
        })
        continue
      }

      // Subsequent sighting: record this column's value and annotations.
      existing.values[columnKey] = modelResult.score ?? null
      existing.annotations_by_metric = {
        ...(existing.annotations_by_metric ?? {}),
        [columnKey]: modelResult.evalcards?.annotations ?? null,
      }
      if (!existing.model_route_id && modelResult.model_route_id) {
        existing.model_route_id = modelResult.model_route_id
      }
      // Row-level metadata follows the newest result; `>=` means a later-processed
      // result wins ties.
      if (nextTimestamp >= existing._timestampValue) {
        existing.evaluation_timestamp = modelResult.retrieved_timestamp ?? existing.evaluation_timestamp
        existing.source_metadata = modelResult.source_metadata
        existing.source_data = sourceData
        existing._timestampValue = nextTimestamp
      }
    }
  }

  if (leaderboardMetrics.length < 2) {
    return null
  }

  // When every column shares one metric name, use it as the suite description.
  const sharedMetricName = metricNames.size === 1 ? Array.from(metricNames)[0] : undefined
  const suiteMetricConfig = metricConfig
    ? {
        ...metricConfig,
        evaluation_description: sharedMetricName ?? metricConfig.evaluation_description,
      }
    : {
        evaluation_description: sharedMetricName ?? "",
        lower_is_better: false,
        score_type: "continuous" as const,
        min_score: 0,
        max_score: 1,
      }

  // Strip the private timestamp field and count columns holding a numeric value.
  const leaderboardRows = Array.from(rowStates.values()).map(({ _timestampValue, ...row }) => ({
    ...row,
    metrics_present: leaderboardMetrics.reduce(
      (count, metric) => count + (typeof row.values[metric.column_key] === "number" ? 1 : 0),
      0
    ),
  }))

  // Flat ranking fields (model_results, best/worst model, averages) are left
  // empty/zero here; the matrix data is carried by leaderboard_metrics/rows.
  return {
    evaluation_name: suiteDisplayName,
    evaluation_id: `${SYNTHETIC_MATRIX_EVAL_PREFIX}${suiteKey}`,
    canonical_display_name: suiteDisplayName,
    composite_benchmark_key: suiteKey,
    composite_benchmark_name: suiteDisplayName,
    category: inferCategoryFromBenchmark(suiteDisplayName),
    metric_config: suiteMetricConfig,
    model_results: [],
    models_count: leaderboardRows.length,
    evaluator_names: [],
    source_types: [],
    latest_source_name: suiteDisplayName,
    third_party_ratio: 0,
    missing_generation_config_count: 0,
    best_model: null,
    worst_model: null,
    avg_score: 0,
    avg_score_norm: 0,
    benchmark_card: benchmarkCard,
    metrics_count: leaderboardMetrics.length,
    metric_names: leaderboardMetrics.map((metric) => `${metric.subtask_name} / ${metric.metric_name}`),
    source_data: { dataset_name: suiteDisplayName },
    leaderboard_metrics: leaderboardMetrics,
    leaderboard_rows: leaderboardRows,
  }
}

// ---------------------------------------------------------------------------
// Public API
// ---------------------------------------------------------------------------

/**
 * Loads the two datasets the dashboard needs, the model cards and the eval
 * list, concurrently.
 *
 * @returns `{ models, evals }` as resolved by `getModelCards` and `getEvalList`.
 */
export async function getDashboardData() {
  const loaded = await Promise.all([getModelCards(), getEvalList()])
  return { models: loaded[0], evals: loaded[1] }
}

/** Resolves to whatever `fetchBackendManifest` returns. */
export async function getBackendManifestData(): Promise<BackendManifest> {
  const manifest = await fetchBackendManifest()
  return manifest
}

/** Resolves to whatever `fetchBackendManifestStatus` returns. */
export async function getBackendManifestStatusData(): Promise<BackendManifestStatus> {
  const manifestStatus = await fetchBackendManifestStatus()
  return manifestStatus
}

/** Resolves to whatever `fetchEvalHierarchy` returns. */
export async function getEvalHierarchyData(): Promise<EvalHierarchy> {
  const hierarchy = await fetchEvalHierarchy()
  return hierarchy
}

/**
 * Loads every model card and returns them newest-first by `latest_timestamp`.
 *
 * Each timestamp is parsed once. Cards whose timestamp cannot be parsed are
 * placed after all cards with a valid timestamp instead of feeding `NaN` into
 * the comparator, which would leave the resulting order unspecified.
 *
 * @returns Evaluation card data sorted by descending `latest_timestamp`.
 */
export async function getModelCards(): Promise<EvaluationCardData[]> {
  const entries = await fetchModelCardsList()

  return entries
    .map(hfModelCardToEvaluationCardData)
    .map((card) => {
      const parsed = new Date(card.latest_timestamp).getTime()
      // Invalid dates yield NaN; map them below every real timestamp.
      return { card, time: Number.isNaN(parsed) ? Number.NEGATIVE_INFINITY : parsed }
    })
    // Explicit three-way comparison: subtracting two -Infinity values would be NaN.
    .sort((a, b) => (a.time === b.time ? 0 : b.time > a.time ? 1 : -1))
    .map(({ card }) => card)
}

/**
 * Loads the lite model-card list, ordered by benchmark count (descending),
 * then evaluation count (descending), then model name (ascending).
 */
export async function getModelCardsLite(): Promise<EvaluationCardData[]> {
  const cards = (await fetchModelCardsListLite()).map(hfModelCardToEvaluationCardData)

  cards.sort((left, right) => {
    const byBenchmarks = right.benchmarks_count - left.benchmarks_count
    if (byBenchmarks) return byBenchmarks

    const byEvaluations = right.evaluations_count - left.evaluations_count
    if (byEvaluations) return byEvaluations

    return left.model_name.localeCompare(right.model_name)
  })

  return cards
}

/**
 * Loads the full eval list together with the total number of model cards.
 *
 * Entries whose `source_data.hf_repo` starts with `example://` are dropped.
 * Items that arrive without a benchmark card get one attached when
 * `getBenchmarkCard` resolves any of their names. The result is sorted by
 * evaluation name.
 */
export async function getEvalListData(): Promise<{
  evals: BenchmarkEvalListItem[]
  totalModels: number
}> {
  const [{ evals: rawEntries }, modelCards] = await Promise.all([
    fetchHFEvalList(),
    fetchModelCardsList(),
  ])

  const listItems = rawEntries
    .filter((entry) => {
      const repo = entry.source_data?.hf_repo
      return typeof repo !== "string" || !repo.startsWith("example://")
    })
    .map(hfEvalEntryToListItem)

  const itemsWithCards = await Promise.all(
    listItems.map(async (item) => {
      // The pipeline already supplied a card: nothing to look up.
      if (item.benchmark_card) return item

      const names = [item.evaluation_name, item.composite_benchmark_key, item.composite_benchmark_name].filter(Boolean)
      for (const name of names) {
        const card = await getBenchmarkCard(name)
        if (card) return { ...item, benchmark_card: card }
      }

      return item
    })
  )

  itemsWithCards.sort((left, right) => (left.evaluation_name ?? "").localeCompare(right.evaluation_name ?? ""))

  return {
    evals: itemsWithCards,
    totalModels: modelCards.length,
  }
}

/**
 * Lite counterpart of the eval list loader: reads the lite eval and model-card
 * lists, drops `example://` entries, and sorts by evaluation name. No
 * benchmark-card lookup is performed here.
 */
export async function getEvalListLiteData(): Promise<{
  evals: BenchmarkEvalListItem[]
  totalModels: number
}> {
  const [{ evals: rawEntries }, modelCards] = await Promise.all([
    fetchHFEvalListLite(),
    fetchModelCardsListLite(),
  ])

  const listItems = rawEntries
    .filter((entry) => {
      const repo = entry.source_data?.hf_repo
      return typeof repo !== "string" || !repo.startsWith("example://")
    })
    .map(hfEvalEntryToListItem)

  listItems.sort((left, right) => (left.evaluation_name ?? "").localeCompare(right.evaluation_name ?? ""))

  return {
    evals: listItems,
    totalModels: modelCards.length,
  }
}

/** Returns only the `evals` array from `getEvalListData`. */
export async function getEvalList() {
  const listData = await getEvalListData()
  return listData.evals
}

/**
 * Builds the developer directory: one entry per developer route id, enriched
 * with stats from the developer detail files.
 *
 * Index entries that map to the same route id (e.g. "google" vs "Google") are
 * merged. Their model counts are summed, and the spelling of the variant with
 * the largest individual model count is kept for slug lookups (the first one
 * seen wins ties).
 *
 * @returns Developer entries sorted by display name.
 */
export async function getDeveloperList() {
  const developerIndex = await fetchDevelopersList()

  // Deduplicate by route_id (handles case variations like "google" vs "Google").
  // `top_variant_count` remembers the largest single-variant count so later
  // variants are compared against that variant, not against the running total.
  const deduped = new Map<string, { developer: string; model_count: number; top_variant_count: number }>()
  for (const entry of developerIndex) {
    const routeId = getDeveloperRouteId(entry.developer)
    const existing = deduped.get(routeId)

    if (!existing) {
      deduped.set(routeId, {
        developer: entry.developer,
        model_count: entry.model_count,
        top_variant_count: entry.model_count,
      })
      continue
    }

    // Always accumulate the model count across variants.
    existing.model_count += entry.model_count

    // Keep the variant with the most models (likely the canonical name).
    if (entry.model_count > existing.top_variant_count) {
      existing.developer = entry.developer
      existing.top_variant_count = entry.model_count
    }
  }

  // Enrich with detail files for aggregate stats
  const details = await Promise.all(
    Array.from(deduped.values()).map(async (entry) => {
      // Try each slug candidate until one yields a well-formed detail file.
      let detail = null
      for (const slug of getDeveloperSlugCandidates(entry.developer)) {
        detail = await fetchHFDeveloperDetail(slug)
        if (detail?.developer && Array.isArray(detail.models)) {
          break
        }
      }

      const models = detail?.models ?? []
      const benchmarkCounts = getDeveloperBenchmarkStats(models)
      let evaluationCount = 0

      for (const model of models) {
        evaluationCount += model.total_evaluations
      }

      // Top three benchmarks by number of models covering them.
      const popularEvals = Array.from(benchmarkCounts.entries())
        .sort((a, b) => b[1] - a[1])
        .slice(0, 3)
        .map(([benchmark, model_count]) => ({
          benchmark: getBenchmarkDisplayName(benchmark),
          model_count,
        }))

      return {
        developer: normalizeDeveloperName(detail?.developer ?? entry.developer),
        route_id: getDeveloperRouteId(entry.developer),
        // Use accumulated count from developers.json (handles case variants)
        // rather than detail file which may be incomplete due to slug collisions
        model_count: entry.model_count,
        benchmark_count: benchmarkCounts.size,
        evaluation_count: evaluationCount,
        popular_evals: popularEvals,
      }
    })
  )

  return details.sort((a, b) => a.developer.localeCompare(b.developer))
}

/**
 * Converts a developer detail record into the summary shape used by the
 * developer pages: normalized name, route id, counts, the three benchmarks
 * covered by the most models, and the developer's model cards.
 *
 * @param detail - Developer name plus that developer's model-card entries.
 */
export function hfDeveloperDetailToSummary(detail: {
  developer: string
  models: HFModelCardEntry[]
}) {
  const { developer, models } = detail

  const modelCards = models.map(hfModelCardToEvaluationCardData)
  const benchmarkCounts = getDeveloperBenchmarkStats(models)

  let evaluationCount = 0
  for (const model of models) {
    evaluationCount += model.total_evaluations
  }

  // Rank benchmarks by how many models cover them and keep the top three.
  const rankedBenchmarks = [...benchmarkCounts.entries()].sort(
    (left, right) => right[1] - left[1]
  )
  const popularEvals = rankedBenchmarks.slice(0, 3).map(([benchmark, model_count]) => ({
    benchmark: getBenchmarkDisplayName(benchmark),
    model_count,
  }))

  return {
    developer: normalizeDeveloperName(developer),
    route_id: getDeveloperRouteId(developer),
    model_count: models.length,
    benchmark_count: benchmarkCounts.size,
    evaluation_count: evaluationCount,
    popular_evals: popularEvals,
    models: modelCards,
  }
}

/**
 * Resolves a developer summary from a route id.
 *
 * Lookup order:
 * 1. slug candidates derived directly from `routeId`;
 * 2. slug candidates derived from the developer-index entry whose name or
 *    route id matches `routeId`.
 *
 * Both paths build the summary with `hfDeveloperDetailToSummary`, so the
 * normalized developer name and the benchmark statistics are the same no
 * matter which lookup succeeds.
 *
 * @param routeId - Developer route id (or raw developer name).
 * @returns The developer summary, or `null` when no detail file is found.
 */
export async function getDeveloperSummaryById(routeId: string) {
  // Returns the summary for the first slug candidate with a well-formed detail file.
  const summarizeFirstMatch = async (identifier: string) => {
    for (const slug of getDeveloperSlugCandidates(identifier)) {
      const detail = await fetchHFDeveloperDetail(slug)
      if (detail?.developer && Array.isArray(detail.models)) {
        return hfDeveloperDetailToSummary({ developer: detail.developer, models: detail.models })
      }
    }
    return null
  }

  // Try direct slug lookup
  const directSummary = await summarizeFirstMatch(routeId)
  if (directSummary) return directSummary

  // Try looking up through the developer index
  const developerIndex = await fetchDevelopersList()
  const matchedDev = developerIndex.find(
    (e) => e.developer === routeId || getDeveloperRouteId(e.developer) === routeId
  )

  if (!matchedDev) return null

  return summarizeFirstMatch(matchedDev.developer)
}

/**
 * Resolves a model-family summary for a model id.
 *
 * Lookup order: slug candidates derived from `modelId`; then the model card
 * matching `modelId` by family id or route id, first via its `model_route_id`
 * and finally via slug candidates for every raw model id of every variant.
 * A detail file only counts when it flattens to at least one evaluation.
 *
 * @param modelId - Model family id or route id.
 * @returns The enriched model-family summary, or `null` when nothing matches.
 */
export async function getModelSummaryById(modelId: string) {
  // Try fetching from HF model detail files
  for (const slug of getModelDetailSlugCandidates(modelId)) {
    const detail = await fetchHFModelDetail(slug)
    if (!detail) continue

    const evaluations = flattenModelEvaluations(detail)
    if (evaluations.length === 0) continue

    return attachModelSignalSummaries(createModelFamilySummary(evaluations), detail)
  }

  // Fall back to model-cards.json to find the right slug. The pipeline contract
  // guarantees model_route_id === model_family_id.replace(/\//g, "__") on every
  // card (verified in tests/pipeline-contract.test.ts and
  // tests/upstream-drift.test.ts), so an extra
  // `getModelFamilyRouteId(family_id) === modelId` clause would be redundant.
  const allCards = await fetchModelCardsList()
  const matchedCard = allCards.find(
    (card) => card.model_family_id === modelId || card.model_route_id === modelId
  )

  if (!matchedCard) return null

  // Try fetching by the model_route_id (which uses __ separator)
  const routeDetail = await fetchHFModelDetail(matchedCard.model_route_id)
  if (routeDetail) {
    const evaluations = flattenModelEvaluations(routeDetail)
    if (evaluations.length > 0) {
      return attachModelSignalSummaries(createModelFamilySummary(evaluations), routeDetail)
    }
  }

  // Try all raw model IDs from all variants
  for (const variant of matchedCard.variants) {
    for (const rawId of variant.raw_model_ids) {
      for (const slug of getModelDetailSlugCandidates(rawId)) {
        const variantDetail = await fetchHFModelDetail(slug)
        if (!variantDetail) continue

        const evaluations = flattenModelEvaluations(variantDetail)
        if (evaluations.length === 0) continue

        return attachModelSignalSummaries(createModelFamilySummary(evaluations), variantDetail)
      }
    }
  }

  return null
}

/**
 * Resolves an eval summary from an eval id. Three id families are handled:
 *
 * - `aggregate__<key>`: every eval whose benchmark matches `<key>` (raw or
 *   normalized) is loaded and combined via `aggregateBenchmarkSummaries`;
 * - `matrix__<suiteKey>`: non-summary evals whose parent/family/benchmark key
 *   matches the suite are combined via `buildSingleMetricSuiteMatrixSummary`;
 * - anything else: a direct eval-detail lookup.
 *
 * @param evalId - Eval id from the route.
 * @returns The summary (with a benchmark card attached when available), or `null`.
 */
export async function getEvalSummaryById(evalId: string) {
  const aggregatePrefix = "aggregate__"

  // Handle aggregate eval IDs (grouped by composite_benchmark_key)
  if (evalId.startsWith(aggregatePrefix)) {
    const aggregateKey = evalId.slice(aggregatePrefix.length)

    // Find all evals in this benchmark suite by matching the benchmark key
    const { evals } = await fetchHFEvalList()
    const suiteEntries = evals.filter((entry) => {
      if (entry.benchmark === aggregateKey) return true
      const normalized = entry.benchmark.toLowerCase().replace(/[-.\s]+/g, "_").replace(/^_+|_+$/g, "")
      return normalized === aggregateKey
    })

    if (suiteEntries.length === 0) return null

    // Fetch full eval details for each sub-eval
    const loadedSummaries = await Promise.all(
      suiteEntries.map(async (entry) => {
        const detail = await fetchHFEvalDetail(entry.eval_summary_id)
        return detail ? attachBenchmarkCardToSummary(hfEvalDetailToSummary(detail)) : null
      })
    )

    const usableSummaries = loadedSummaries.filter((s): s is BenchmarkEvalSummary => s !== null)
    return aggregateBenchmarkSummaries(usableSummaries, aggregateKey)
  }

  // Handle synthetic suite-matrix IDs
  if (evalId.startsWith(SYNTHETIC_MATRIX_EVAL_PREFIX)) {
    const suiteKey = evalId.slice(SYNTHETIC_MATRIX_EVAL_PREFIX.length)
    const wantedKey = normalizeBenchmarkKeyForLookup(suiteKey)
    const { evals } = await fetchHFEvalListLite()

    // Summary-score entries are excluded; the rest match on the first
    // available of parent key, family key, or benchmark.
    const members = evals.filter(
      (entry) =>
        !entry.is_summary_score &&
        normalizeBenchmarkKeyForLookup(
          entry.benchmark_parent_key || entry.benchmark_family_key || entry.benchmark
        ) === wantedKey
    )

    if (members.length < 2) return null

    const fetched = await Promise.all(
      members.map(async (entry) => fetchHFEvalDetail(entry.eval_summary_id))
    )
    const usableDetails = fetched.filter((detail): detail is HFEvalDetail => detail !== null)

    const matrixSummary = buildSingleMetricSuiteMatrixSummary(usableDetails, suiteKey)
    if (!matrixSummary) return null
    return attachBenchmarkCardToSummary(matrixSummary)
  }

  // Direct eval lookup
  const detail = await fetchHFEvalDetail(evalId)
  if (!detail) return null
  return attachBenchmarkCardToSummary(hfEvalDetailToSummary(detail))
}

/**
 * Compatibility shim kept so existing imports keep resolving.
 *
 * With the HF backend, raw evaluations are no longer loaded all at once, so
 * this function fetches nothing: it logs a deprecation warning and resolves
 * to an empty array.
 *
 * @deprecated Always resolves to `[]`; avoid in new code.
 */
export async function loadAllEvaluationsFromDataDirectory(): Promise<BenchmarkEvaluation[]> {
  const noEvaluations: BenchmarkEvaluation[] = []
  console.warn("[model-data] loadAllEvaluationsFromDataDirectory() is deprecated with HF backend")
  return noEvaluations
}