Spaces:

yananlong
/

general-eval-card

Sleeping

File size: 36,005 Bytes

/**
 * Processing utilities for benchmark-first evaluation data
 */

import type {
  BenchmarkCard,
  BenchmarkEvaluation,
  EvaluationCardData,
  CategoryType,
  ModelInfo,
  ModelVariantSummary,
  SourceMetadata,
  SourceData,
  ScoreDetails,
  MetricConfig,
  EvaluationResult,
} from './benchmark-schema'
import type { EvalcardsAnnotations, RowAnnotations, SignalSummaries } from './backend-artifacts'
import type { ModelEvaluationSummary } from './benchmark-schema'
import type { ModelSummaryCore } from './benchmark-schema'
import { inferCategoryFromBenchmark } from './benchmark-schema'

export type { BenchmarkCard }
import { getCanonicalModelIdentity, getModelFamilyRouteId } from './model-family'

export type { ModelEvaluationSummary }

/**
 * Metric names that say nothing about which benchmark they belong to.
 *
 * `getEvaluationDisplayName` looks up the lowercased, trimmed metric name
 * here; on a hit it prefixes the benchmark name so the label is unambiguous
 * (e.g. "<benchmark> - accuracy"). Entries must therefore be lowercase.
 */
const GENERIC_EVALUATION_NAMES = new Set([
  "score",
  "accuracy",
  "mean win rate",
  "exact match",
  "f1",
  "pass@1",
])

/**
 * Ordered priority rules consulted by `getBenchmarkPriority`.
 *
 * The input is lowercased before testing, so the patterns are written in
 * lowercase. Rules are tried top to bottom and the first match wins; a value
 * that matches no rule gets priority 0. Higher priorities sort first when
 * `createEvaluationCard` picks its top scores.
 */
const BENCHMARK_PRIORITY_RULES: Array<{ pattern: RegExp; priority: number }> = [
  { pattern: /\b(swe-bench|terminal-bench|tau-bench|agent|browsecomp)\b/, priority: 10 },
  { pattern: /\b(gpqa|mmlu-pro|mmlu|bbh|ifeval|math|aime|gsm8k|minerva)\b/, priority: 9 },
  { pattern: /\b(humaneval|livecodebench|mbpp|codecontests|apps)\b/, priority: 8 },
  { pattern: /\b(mmmu|mmmu-pro|seed-bench|vision|vqa|multimodal)\b/, priority: 7 },
  { pattern: /\b(mt-bench|arena-hard|alpacaeval|reward-bench|truthfulqa)\b/, priority: 6 },
  { pattern: /\b(fairness|bias|safety|toxic|harmful|robust|privacy)\b/, priority: 5 },
]

/**
 * Build a lowercase slug from an arbitrary label.
 *
 * Every run of characters outside `a-z0-9` (after lowercasing) acts as a
 * separator; the remaining pieces are joined with single underscores, so the
 * result never starts or ends with an underscore.
 */
function slugify(value: string): string {
  const pieces = value.toLowerCase().split(/[^a-z0-9]+/)
  return pieces.filter((piece) => piece.length > 0).join("_")
}

/**
 * Resolve the benchmark name for an evaluation (optionally for one result).
 *
 * Candidates are tried in this order and the first non-empty one wins:
 *   1. the result's own `source_data.dataset_name` (object form only),
 *   2. `evaluation.benchmark`,
 *   3. the evaluation-level `source_data.dataset_name` (object form only),
 *   4. the result's `evaluation_name`, else the evaluation's `evaluation_id`.
 */
function getBenchmarkName(
  evaluation: BenchmarkEvaluation,
  result?: EvaluationResult
): string {
  // Per-result dataset metadata is the most specific, so it is checked first.
  const perResultSource = result?.source_data
  const perResultDataset =
    perResultSource && !Array.isArray(perResultSource) ? perResultSource.dataset_name : undefined
  if (perResultDataset) {
    return perResultDataset
  }

  if (evaluation.benchmark) {
    return evaluation.benchmark
  }

  // The array form of source_data is a list of URLs and carries no dataset name.
  const fileLevelSource = evaluation.source_data
  const fileLevelDataset = Array.isArray(fileLevelSource) ? undefined : fileLevelSource.dataset_name
  if (fileLevelDataset) {
    return fileLevelDataset
  }

  return result?.evaluation_name ?? evaluation.evaluation_id
}

/**
 * Produce the human-facing label for one evaluation result.
 *
 * The trimmed metric name is used as-is unless it is one of the
 * `GENERIC_EVALUATION_NAMES` (compared case-insensitively) and differs from
 * the benchmark name, in which case the label becomes
 * "<benchmark> - <metric>".
 */
function getEvaluationDisplayName(
  evaluation: BenchmarkEvaluation,
  result: EvaluationResult
): string {
  const benchmarkLabel = getBenchmarkName(evaluation, result)
  const metricLabel = result.evaluation_name.trim()

  const needsBenchmarkPrefix =
    metricLabel !== benchmarkLabel && GENERIC_EVALUATION_NAMES.has(metricLabel.toLowerCase())

  return needsBenchmarkPrefix ? `${benchmarkLabel} - ${metricLabel}` : metricLabel
}

/**
 * Build the slug id under which a result is grouped.
 *
 * The id is `slugify("<benchmark key>__<metric name>")`, where the benchmark
 * key is `evaluation.benchmark` when set and the resolved benchmark name
 * otherwise.
 */
function getEvaluationSummaryId(
  evaluation: BenchmarkEvaluation,
  result: EvaluationResult
): string {
  const benchmarkPart = evaluation.benchmark
    ? evaluation.benchmark
    : getBenchmarkName(evaluation, result)
  const metricPart = result.evaluation_name

  return slugify(`${benchmarkPart}__${metricPart}`)
}

/**
 * Look up the display priority of a benchmark name.
 *
 * The name is lowercased and tested against `BENCHMARK_PRIORITY_RULES` in
 * order; the first matching rule's priority is returned, or 0 when no rule
 * matches.
 */
function getBenchmarkPriority(value: string): number {
  const haystack = value.toLowerCase()
  const firstMatch = BENCHMARK_PRIORITY_RULES.find(({ pattern }) => pattern.test(haystack))

  return firstMatch ? firstMatch.priority : 0
}

// ── Eval-centric (per-benchmark) types ────────────────────────────────────────

/**
 * One model's result for a single benchmark metric; the element type of
 * `BenchmarkEvalSummary.model_results`.
 */
export interface ModelResultForBenchmark {
  /** Model that produced this result. */
  model_info: ModelInfo
  /** Optional route id for the model; `groupEvaluationsByBenchmark` in this file does not set it. */
  model_route_id?: string
  /** Raw score; `groupEvaluationsByBenchmark` copies it from `result.score_details.score`. */
  score: number
  /** Full score details the `score` was taken from. */
  score_details: ScoreDetails
  /** Timestamp string; parsed elsewhere in this file as unix seconds or a date string. */
  evaluation_timestamp: string
  /** Provenance of the evaluation record this result came from. */
  source_metadata: SourceMetadata
  /** Per-result source data when present, otherwise the evaluation-level source data. */
  source_data: BenchmarkEvaluation['source_data']
  /** The untouched upstream result entry. */
  result: EvaluationResult
  /** URL to the underlying record JSON in the upstream HF dataset, when known. */
  source_record_url?: string
  /**
   * NOTE(review): presumably the individual source rows that were combined
   * when this result belongs to an aggregated summary — confirm with the
   * producer, nothing in this file populates it.
   */
  aggregate_components?: Array<{
    evaluation_id: string
    composite_benchmark_key: string
    composite_benchmark_name: string
    score: number
    normalized_score: number
    evaluation_timestamp: string
    source_name?: string
    source_type: SourceMetadata["source_type"]
    source_organization_name: string
    evaluator_relationship: SourceMetadata["evaluator_relationship"]
  }>
}

/**
 * Eval-centric summary: one benchmark metric together with every model result
 * reported for it and statistics derived from those results.
 *
 * `groupEvaluationsByBenchmark` fills the required fields; the optional
 * fields are not set anywhere in this file and are expected to come from
 * other producers (e.g. the backend pipeline).
 */
export interface BenchmarkEvalSummary extends SignalSummaries {
  /** Display name of the evaluation (see `getEvaluationDisplayName`). */
  evaluation_name: string
  /**
   * Slug id of the summary. `groupEvaluationsByBenchmark` builds it by
   * slugifying "<benchmark key>__<metric evaluation_name>".
   */
  evaluation_id: string
  canonical_display_name?: string
  /** Benchmark key: `evaluation.benchmark`, else the resolved benchmark name. */
  composite_benchmark_key: string
  /** `composite_benchmark_key` passed through `getBenchmarkDisplayName`. */
  composite_benchmark_name: string
  category: CategoryType
  /** Metric configuration of the first result seen for this summary. */
  metric_config: MetricConfig
  model_results: ModelResultForBenchmark[]
  /** Set to `model_results.length` by `groupEvaluationsByBenchmark`. */
  models_count: number
  /** Unique evaluator organisation names */
  evaluator_names: string[]
  /** Distinct source types across `model_results`, sorted. */
  source_types: SourceMetadata["source_type"][]
  /** `source_name` of the result with the latest parsable timestamp. */
  latest_source_name?: string
  /** Share of `model_results` whose evaluator relationship is "third_party". */
  third_party_ratio: number
  /** Number of `model_results` whose result has no `generation_config`. */
  missing_generation_config_count: number
  /** Best-scoring entry, honouring `metric_config.lower_is_better`. */
  best_model: { name: string; score: number } | null
  /** Worst-scoring entry, honouring `metric_config.lower_is_better`. */
  worst_model: { name: string; score: number } | null
  /** Arithmetic mean of the raw scores in `model_results`. */
  avg_score: number
  /** avg_score normalised to 0-1 using metric_config.min/max_score */
  avg_score_norm: number
  /** Rich benchmark card from the metadata/ folder, when available */
  benchmark_card?: BenchmarkCard
  is_aggregated?: boolean
  aggregate_sources?: Array<{
    evaluation_id: string
    composite_benchmark_key: string
    composite_benchmark_name: string
    models_count: number
    avg_score_norm: number
  }>
  /** Tags from the pipeline (domains, languages, tasks) */
  tags?: { domains: string[]; languages: string[]; tasks: string[] }
  /** Number of distinct metrics for this benchmark */
  metrics_count?: number
  /** Names of all metrics */
  metric_names?: string[]
  /** Instance-level data availability */
  instance_data?: { available: boolean; url_count: number; sample_urls: string[]; models_with_loaded_instances: number }
  /** Canonical benchmark id (the registry-resolved benchmark). Drives
   *  benchmark-card lookups regardless of slice/composite axis. */
  benchmark_id?: string
  /** Family display name. */
  benchmark_family_name?: string
  /** Composite (leaderboard) slug — e.g. "wasp", "helm-classic". */
  composite_slug?: string
  /** Composite display name — e.g. "WASP", "HELM Classic". */
  composite_display_name?: string
  /** Curated multi-benchmark family slug (e.g. "mmlu"), defaults to
   *  benchmark id for singletons. */
  family_id?: string
  /** Family display, post-cutover canonical name. */
  family_display_name?: string
  /** Parent benchmark id — populated when this row is a slice of a
   *  root benchmark; null for non-slice rows. */
  parent_benchmark_id?: string
  /** True when this row is a within-benchmark slice cut. */
  is_slice?: boolean
  /** Source dataset metadata from the pipeline */
  source_data?: SourceData
  /** Best raw score reported in the eval summary list */
  top_score?: number
  /** Count of nested subtasks reported for the benchmark */
  subtasks_count?: number
  /** Whether this row is a summary/rollup score for a composite */
  is_summary_score?: boolean
  /** Related summary-score sibling ids for this benchmark */
  summary_eval_ids?: string[]
  /** Canonical benchmark-level metrics from root metrics[] */
  root_metrics?: BenchmarkSummaryMetric[]
  /** Canonical benchmark subdivisions from subtasks[] */
  subtasks?: BenchmarkSummarySubtask[]
  /** Matrix columns for multi-metric benchmark leaderboards */
  leaderboard_metrics?: BenchmarkLeaderboardMetric[]
  /** Matrix rows for multi-metric benchmark leaderboards */
  leaderboard_rows?: BenchmarkLeaderboardRow[]
  /** Optional evalcards payload; only its `annotations` member is typed here. */
  evalcards?: { annotations?: EvalcardsAnnotations }
}

/**
 * Summary of one metric of a benchmark. Used for benchmark-level metrics
 * (`BenchmarkEvalSummary.root_metrics`) and for the metrics of a subtask
 * (`BenchmarkSummarySubtask.metrics`).
 */
export interface BenchmarkSummaryMetric {
  /** Identifier of the metric summary; also carried by `BenchmarkLeaderboardMetric`. */
  metric_summary_id: string
  metric_name: string
  display_name: string
  canonical_display_name?: string
  metric_key?: string
  /** NOTE(review): presumably true when smaller scores are better — confirm with the producer. */
  lower_is_better: boolean
  models_count: number
  top_score?: number
  unit?: string
}

/**
 * One subdivision of a benchmark together with the metrics reported for it;
 * the element type of `BenchmarkEvalSummary.subtasks`.
 */
export interface BenchmarkSummarySubtask {
  /** Key of the subtask; `BenchmarkLeaderboardMetric.subtask_key` carries the same kind of value. */
  subtask_key: string
  subtask_name: string
  display_name: string
  canonical_display_name?: string
  /** Metric summaries belonging to this subtask. */
  metrics: BenchmarkSummaryMetric[]
}

/**
 * Column definition of a multi-metric benchmark leaderboard; the element type
 * of `BenchmarkEvalSummary.leaderboard_metrics`.
 */
export interface BenchmarkLeaderboardMetric {
  /** NOTE(review): presumably the key used in `BenchmarkLeaderboardRow.values` — confirm with the producer. */
  column_key: string
  metric_summary_id: string
  metric_name: string
  display_name: string
  canonical_display_name?: string
  lower_is_better: boolean
  unit?: string
  /** Whether the column is a benchmark-level ("root") metric or belongs to a subtask. */
  scope: "root" | "subtask"
  /** Optional subtask key; presumably only set when `scope` is "subtask". */
  subtask_key?: string
  /** Optional subtask name; presumably only set when `scope` is "subtask". */
  subtask_name?: string
}

/**
 * Row of a multi-metric benchmark leaderboard; the element type of
 * `BenchmarkEvalSummary.leaderboard_rows`.
 */
export interface BenchmarkLeaderboardRow {
  model_info: ModelInfo
  model_route_id?: string
  evaluation_timestamp: string
  source_metadata: SourceMetadata
  source_data: BenchmarkEvaluation["source_data"]
  /** Cell values by string key; `null` marks a cell without a value. NOTE(review): keys presumably match `column_key` — confirm. */
  values: Record<string, number | null>
  /** Optional per-cell annotations, keyed by string like `values`. */
  annotations_by_metric?: Record<string, RowAnnotations | null | undefined>
  /** NOTE(review): presumably the number of non-null entries in `values` — confirm with the producer. */
  metrics_present: number
}

/** A `BenchmarkEvalSummary` without its `model_results` array (see `toBenchmarkEvalListItem`). */
export type BenchmarkEvalListItem = Omit<BenchmarkEvalSummary, "model_results">

/**
 * Back-fill `instance_data` on an eval summary when upstream left it empty.
 *
 * The upstream eval-summary rows sometimes arrive with `instance_data` null
 * or unavailable even though the individual model results carry a
 * `detailed_evaluation_results_url` (Wordle Arena was one such case). Deriving
 * the field here, once, keeps every consumer of the summary consistent.
 *
 * Behaviour:
 *  - If `instance_data` already reports `available` with a positive
 *    `url_count`, the summary is returned unchanged (same object).
 *  - Otherwise the distinct non-empty detail URLs in `model_results` are
 *    collected. With none found, the summary is again returned unchanged.
 *  - With at least one URL, a shallow copy is returned whose `instance_data`
 *    holds the distinct URL count, up to 8 sample URLs, and the number of
 *    distinct model ids that had a URL.
 */
export function normalizeEvalSummary<T extends BenchmarkEvalSummary>(summary: T): T {
  const upstream = summary.instance_data
  if (upstream && upstream.available && upstream.url_count > 0) {
    return summary
  }

  const urls = new Set<string>()
  const modelIds = new Set<string>()

  for (const entry of summary.model_results ?? []) {
    const detailUrl = entry?.result?.detailed_evaluation_results_url
    if (typeof detailUrl !== "string" || detailUrl.length === 0) {
      continue
    }

    urls.add(detailUrl)

    // Results without a model id still contribute their URL, just not a model.
    const id = entry.model_info?.id
    if (id) {
      modelIds.add(id)
    }
  }

  // No URLs means there is nothing to derive; keep the upstream value.
  if (urls.size === 0) {
    return summary
  }

  // Only a handful of URLs is kept as examples; the count reflects all of them.
  const derivedInstanceData = {
    available: true,
    url_count: urls.size,
    sample_urls: [...urls].slice(0, 8),
    models_with_loaded_instances: modelIds.size,
  }

  return { ...summary, instance_data: derivedInstanceData }
}

/**
 * Group evaluations by the raw `model_info.id` of the model they belong to.
 *
 * Grouping happens in a `Map` and is converted to a plain object at the end,
 * so ids that collide with `Object.prototype` members (for example
 * "constructor" or "__proto__") are grouped like any other id instead of
 * hitting an inherited property.
 *
 * @param evaluations Evaluations to group; the array is not modified.
 * @returns A record from model id to that model's evaluations, in input order.
 */
export function groupEvaluationsByModel(
  evaluations: BenchmarkEvaluation[]
): Record<string, BenchmarkEvaluation[]> {
  const buckets = new Map<string, BenchmarkEvaluation[]>()

  for (const evaluation of evaluations) {
    const modelId = evaluation.model_info.id
    const bucket = buckets.get(modelId)

    if (bucket) {
      bucket.push(evaluation)
    } else {
      buckets.set(modelId, [evaluation])
    }
  }

  // Object.fromEntries defines own data properties, which keeps keys such as
  // "__proto__" from being interpreted as a prototype assignment.
  return Object.fromEntries(buckets)
}

/**
 * Group evaluations by canonical model family.
 *
 * The family id comes from `getCanonicalModelIdentity(model_info).familyId`.
 * Grouping happens in a `Map` and is converted to a plain object at the end,
 * so a family id that collides with an `Object.prototype` member (for example
 * "constructor") is grouped like any other id.
 *
 * @param evaluations Evaluations to group; the array is not modified.
 * @returns A record from family id to that family's evaluations, in input order.
 */
export function groupEvaluationsByModelFamily(
  evaluations: BenchmarkEvaluation[]
): Record<string, BenchmarkEvaluation[]> {
  const buckets = new Map<string, BenchmarkEvaluation[]>()

  for (const evaluation of evaluations) {
    const familyId = getCanonicalModelIdentity(evaluation.model_info).familyId
    const bucket = buckets.get(familyId)

    if (bucket) {
      bucket.push(evaluation)
    } else {
      buckets.set(familyId, [evaluation])
    }
  }

  // Object.fromEntries defines own data properties, so no key can touch the
  // prototype chain of the returned record.
  return Object.fromEntries(buckets)
}

/**
 * Create a model evaluation summary from a group of evaluations.
 *
 * - `model_info` is taken from the first evaluation.
 * - Each evaluation is filed under every category it belongs to, once per
 *   category. An explicit `category` on the evaluation wins; otherwise the
 *   category is inferred per result from its `evaluation_name`, falling back
 *   to the dataset name when that inference yields 'General'.
 * - `total_evaluations` counts individual results, not evaluation records.
 * - `last_updated` is the latest parsable `retrieved_timestamp` as an ISO
 *   string. Timestamps may be unix seconds or date strings; values that do
 *   not parse to a valid date are ignored, and if none parses the epoch
 *   (1970-01-01T00:00:00.000Z) is used instead of throwing.
 *
 * @param evaluations Non-empty list of evaluations of one model or family.
 * @returns The summary core for these evaluations.
 * @throws Error when `evaluations` is empty.
 */
export function createModelSummary(
  evaluations: BenchmarkEvaluation[]
): ModelSummaryCore {
  if (evaluations.length === 0) {
    throw new Error('No evaluations provided')
  }

  // Largest absolute millisecond value a Date can represent.
  const maxDateMs = 8.64e15

  const modelInfo = evaluations[0].model_info
  const evaluationsByCategory: Record<string, BenchmarkEvaluation[]> = {}
  const categoriesSet = new Set<CategoryType>()
  let latestMs = Number.NEGATIVE_INFINITY
  let totalResults = 0

  for (const eval_ of evaluations) {
    const evalCategories = new Set<CategoryType>()

    if (eval_.category) {
      evalCategories.add(eval_.category)
      categoriesSet.add(eval_.category)
    } else {
      for (const result of eval_.evaluation_results) {
        let category: CategoryType = inferCategoryFromBenchmark(result.evaluation_name)

        // Fallback to dataset name if source_data is an object
        if (category === 'General' && !Array.isArray(eval_.source_data)) {
          category = inferCategoryFromBenchmark(eval_.source_data.dataset_name)
        }

        evalCategories.add(category)
        categoriesSet.add(category)
      }
    }

    // Add the evaluation to each unique category it belongs to (once per category)
    for (const category of evalCategories) {
      if (!evaluationsByCategory[category]) {
        evaluationsByCategory[category] = []
      }
      evaluationsByCategory[category].push(eval_)
    }

    // A purely numeric string is unix seconds; anything else goes through Date.
    const rawTimestamp = String(eval_.retrieved_timestamp)
    const looksNumeric = !Number.isNaN(Number(rawTimestamp)) && !rawTimestamp.includes('-')
    const timestampMs = looksNumeric
      ? Number.parseFloat(rawTimestamp) * 1000
      : new Date(rawTimestamp).getTime()

    // Skip NaN/out-of-range values: they would make toISOString() throw below.
    if (Number.isFinite(timestampMs) && Math.abs(timestampMs) <= maxDateMs && timestampMs > latestMs) {
      latestMs = timestampMs
    }

    totalResults += eval_.evaluation_results.length
  }

  const latestTimestamp = new Date(Number.isFinite(latestMs) ? latestMs : 0).toISOString()

  return {
    model_info: modelInfo,
    evaluations_by_category: evaluationsByCategory as Record<CategoryType, BenchmarkEvaluation[]>,
    total_evaluations: totalResults,
    last_updated: latestTimestamp,
    categories_covered: Array.from(categoriesSet),
  }
}

/**
 * Choose the `model_info` of the most representative evaluation.
 *
 * Evaluations are ranked on a copy of the input by `retrieved_timestamp`,
 * newest first, with ties broken by the larger number of results. The
 * timestamp is read as a date string first, then as unix seconds, and counts
 * as 0 when neither yields a usable number.
 */
function pickRepresentativeModelInfo(evaluations: BenchmarkEvaluation[]): ModelInfo {
  const toMillis = (raw: string): number =>
    new Date(raw).getTime() || Number(raw) * 1000 || 0

  const ranked = evaluations.slice()
  ranked.sort((left, right) => {
    const leftMs = toMillis(left.retrieved_timestamp)
    const rightMs = toMillis(right.retrieved_timestamp)

    return rightMs !== leftMs
      ? rightMs - leftMs
      : right.evaluation_results.length - left.evaluation_results.length
  })

  return ranked[0].model_info
}

/**
 * Identity of the variant bucket an evaluation is grouped into within its
 * model family; produced by `getAggregatedVariantDescriptor`.
 */
type AggregatedVariantDescriptor = {
  /** Grouping key: the canonical variant key, a version date, or 'base' for merged setup aliases. */
  variantKey: string
  variantLabel: string
  variantDisplayName: string
  familyId: string
  familyName: string
  versionDate?: string
  /** Always undefined for merged setup aliases. */
  versionQualifier?: string
  /** True when the model was recognised as a setup alias and folded into a dated or base variant. */
  mergedSetupAlias: boolean
}

/**
 * Detect whether a model entry is a "setup alias" of its base model.
 *
 * Reads `additional_details.mode`. After trimming, lowercasing and turning
 * runs of `_`/`-` into a single space, the mode counts as a setup alias when
 * it equals "prompt", "fc" or "function calling", or starts with "thinking".
 *
 * @returns The trimmed original mode string for a setup alias, otherwise null
 *          (also when the mode is missing or not a string).
 */
function getSetupAliasMode(modelInfo: ModelInfo) {
  const declaredMode = modelInfo.additional_details?.mode
  if (typeof declaredMode !== 'string') {
    return null
  }

  const trimmedMode = declaredMode.trim()
  const canonicalMode = trimmedMode.toLowerCase().replace(/[_-]+/g, ' ')

  const exactAliases = ['prompt', 'fc', 'function calling']
  const isSetupAlias =
    exactAliases.includes(canonicalMode) || canonicalMode.startsWith('thinking')

  return isSetupAlias ? trimmedMode : null
}

/**
 * Work out which variant bucket a model entry belongs to.
 *
 * Entries that are not setup aliases keep their canonical variant identity.
 * Setup aliases (see `getSetupAliasMode`) are merged instead: into a variant
 * keyed by the version date when the canonical identity has one, otherwise
 * into the family's 'base' variant labelled "Current". Merged descriptors
 * never carry a version qualifier.
 */
function getAggregatedVariantDescriptor(modelInfo: ModelInfo): AggregatedVariantDescriptor {
  const identity = getCanonicalModelIdentity(modelInfo)
  const isSetupAlias = Boolean(getSetupAliasMode(modelInfo))

  if (isSetupAlias) {
    const datedKey = identity.versionDate

    // A setup alias collapses onto its dated release when known, else onto the base variant.
    return {
      variantKey: datedKey ? datedKey : 'base',
      variantLabel: datedKey ? datedKey : 'Current',
      variantDisplayName: datedKey
        ? `${identity.familyName} (${datedKey})`
        : identity.familyName,
      familyId: identity.familyId,
      familyName: identity.familyName,
      versionDate: datedKey ? datedKey : undefined,
      versionQualifier: undefined,
      mergedSetupAlias: true,
    }
  }

  const {
    variantKey,
    variantLabel,
    variantDisplayName,
    familyId,
    familyName,
    versionDate,
    versionQualifier,
  } = identity

  return {
    variantKey,
    variantLabel,
    variantDisplayName,
    familyId,
    familyName,
    versionDate,
    versionQualifier,
    mergedSetupAlias: false,
  }
}

/**
 * Return a sorted copy of the variants; the input array is left untouched.
 *
 * Order: newest `version_date` first, then more `total_evaluations` first,
 * then `variant_label` alphabetically. Variants without a `version_date`, or
 * with one that does not parse as a date, sort after all dated variants. An
 * unparsable date is deliberately treated like a missing one: letting NaN
 * reach the comparator would make the ordering inconsistent.
 */
function sortVariants(variants: ModelVariantSummary[]) {
  const toSortableTime = (versionDate?: string): number => {
    if (!versionDate) {
      return Number.NEGATIVE_INFINITY
    }

    const parsed = new Date(versionDate).getTime()
    return Number.isNaN(parsed) ? Number.NEGATIVE_INFINITY : parsed
  }

  return [...variants].sort((a, b) => {
    const aDate = toSortableTime(a.version_date)
    const bDate = toSortableTime(b.version_date)

    if (aDate !== bDate) {
      // Compare instead of subtracting so infinite sentinels never produce NaN.
      return aDate < bDate ? 1 : -1
    }

    if (b.total_evaluations !== a.total_evaluations) {
      return b.total_evaluations - a.total_evaluations
    }

    return a.variant_label.localeCompare(b.variant_label)
  })
}

/**
 * Summarise all evaluations of one model family, including its variants.
 *
 * Evaluations are bucketed by the variant key from
 * `getAggregatedVariantDescriptor` (the descriptor of the first evaluation in
 * a bucket is kept). Every bucket becomes a variant summary; for merged setup
 * aliases the variant's `model_info` id, name and version are replaced by
 * family-level values. Variants are ordered by `sortVariants`.
 *
 * The family-level summary covers all evaluations. Its `model_info` is based
 * on the first sorted variant but carries the family id and name and no
 * `model_version`. The family identity itself comes from the first
 * evaluation's model.
 *
 * @throws Error when `evaluations` is empty.
 */
export function createModelFamilySummary(
  evaluations: BenchmarkEvaluation[]
): ModelEvaluationSummary {
  if (evaluations.length === 0) {
    throw new Error("No evaluations provided")
  }

  // Distinct raw model ids of a group, alphabetically sorted.
  const sortedUniqueModelIds = (items: BenchmarkEvaluation[]) =>
    Array.from(new Set(items.map((item) => item.model_info.id))).sort((a, b) =>
      a.localeCompare(b)
    )

  const familyIdentity = getCanonicalModelIdentity(evaluations[0].model_info)

  const buckets = new Map<string, {
    descriptor: AggregatedVariantDescriptor
    evaluations: BenchmarkEvaluation[]
  }>()

  for (const evaluation of evaluations) {
    const descriptor = getAggregatedVariantDescriptor(evaluation.model_info)
    let bucket = buckets.get(descriptor.variantKey)

    if (!bucket) {
      bucket = { descriptor, evaluations: [] }
      buckets.set(descriptor.variantKey, bucket)
    }

    bucket.evaluations.push(evaluation)
  }

  const unsortedVariants = Array.from(buckets.values(), (bucket) => {
    const { descriptor, evaluations: members } = bucket
    const summary = createModelSummary(members)
    const isBaseVariant = descriptor.variantKey === 'base'
    const variantId = `${descriptor.familyId}::${descriptor.variantKey}`

    // Only merged setup aliases get a synthetic model identity.
    const modelInfo = !descriptor.mergedSetupAlias
      ? summary.model_info
      : {
          ...summary.model_info,
          id: isBaseVariant ? descriptor.familyId : variantId,
          name: descriptor.variantDisplayName,
          model_version: isBaseVariant ? undefined : descriptor.variantLabel,
        }

    return {
      ...summary,
      model_info: modelInfo,
      variant_id: variantId,
      variant_key: descriptor.variantKey,
      variant_label: descriptor.variantLabel,
      variant_display_name: descriptor.variantDisplayName,
      raw_model_ids: sortedUniqueModelIds(members),
      family_id: descriptor.familyId,
      family_name: descriptor.familyName,
      version_date: descriptor.versionDate,
      version_qualifier: descriptor.versionQualifier,
    }
  })

  const variants = sortVariants(unsortedVariants)
  const familySummary = createModelSummary(evaluations)
  const representativeVariant = variants[0] ?? familySummary

  const familyModelInfo = {
    ...representativeVariant.model_info,
    id: familyIdentity.familyId,
    name: familyIdentity.familyName,
    model_version: undefined,
  }

  return {
    ...familySummary,
    model_info: familyModelInfo,
    model_family_id: familyIdentity.familyId,
    model_route_id: getModelFamilyRouteId(familyIdentity.familyId),
    model_family_name: familyIdentity.familyName,
    raw_model_ids: sortedUniqueModelIds(evaluations),
    variants,
  }
}

/**
 * Convert a model-family summary to the flat card display format.
 *
 * What is derived here:
 *  - benchmark names, source/detail URLs, evaluator names, source types and
 *    eval libraries, collected across all evaluations;
 *  - `third_party_eval_count` (evaluation records whose evaluator
 *    relationship is "third_party") and `missing_generation_config_count`
 *    (results without a `generation_config`);
 *  - `latest_source_name`, from the evaluation with the latest parsable
 *    `retrieved_timestamp` (unix seconds or a date string);
 *  - `top_scores`: per display name the highest score is kept, then entries
 *    are ordered by benchmark priority, score and name, and the first five
 *    are returned;
 *  - `category_stats`: number of distinct benchmarks per covered category;
 *  - `reproducibility_status`: "complete" when no result lacks a generation
 *    config, "missing" when all do, "partial" otherwise.
 *
 * An evaluation can be listed under several categories in
 * `evaluations_by_category`. Each evaluation object is processed once here,
 * so the counters above are not inflated by that duplication.
 *
 * @param summary Family summary as built by `createModelFamilySummary`.
 * @returns The card data for the family.
 */
export function createEvaluationCard(
  summary: ModelEvaluationSummary
): EvaluationCardData {
  type ScoreEntry = {
    benchmark: string
    benchmarkKey: string
    score: number
    metric: string
    unit?: string
  }

  const benchmarksSet = new Set<string>()
  const allScores: ScoreEntry[] = []
  const sourceUrls = new Set<string>()
  const detailUrls = new Set<string>()
  const evaluatorNames = new Set<string>()
  const sourceTypes = new Set<SourceMetadata["source_type"]>()
  const evalLibraries = new Map<string, { name: string; version?: string; fork?: string }>()
  let missingGenerationConfigCount = 0
  let thirdPartyEvalCount = 0
  let latestSourceName: string | undefined
  let latestTimestamp = Number.NEGATIVE_INFINITY

  // The same evaluation object may sit in more than one category bucket.
  // Deduplicate by identity so per-evaluation counters stay consistent with
  // summary.total_evaluations.
  const uniqueEvaluations = new Set<BenchmarkEvaluation>()
  for (const evals of Object.values(summary.evaluations_by_category)) {
    for (const eval_ of evals) {
      uniqueEvaluations.add(eval_)
    }
  }

  for (const eval_ of uniqueEvaluations) {
    if (eval_.source_metadata.source_organization_name) {
      evaluatorNames.add(eval_.source_metadata.source_organization_name)
    }

    sourceTypes.add(eval_.source_metadata.source_type)

    if (eval_.source_metadata.evaluator_relationship === "third_party") {
      thirdPartyEvalCount += 1
    }

    // A purely numeric string is unix seconds; anything else goes through Date.
    const numericTimestamp = Number(eval_.retrieved_timestamp)
    const timestamp =
      !Number.isNaN(numericTimestamp) && !eval_.retrieved_timestamp.includes("-")
        ? numericTimestamp * 1000
        : new Date(eval_.retrieved_timestamp).getTime()
    if (Number.isFinite(timestamp) && timestamp >= latestTimestamp) {
      latestTimestamp = timestamp
      latestSourceName = eval_.source_metadata.source_name
    }

    if (eval_.eval_library?.name) {
      const libraryKey = `${eval_.eval_library.name}@${eval_.eval_library.version ?? ""}`
      evalLibraries.set(libraryKey, {
        name: eval_.eval_library.name,
        version: eval_.eval_library.version,
        fork:
          typeof eval_.eval_library.additional_details?.fork === "string"
            ? eval_.eval_library.additional_details.fork
            : undefined,
      })
    }

    if (eval_.source_metadata.source_url) {
      sourceUrls.add(eval_.source_metadata.source_url)
    }

    if (Array.isArray(eval_.source_data)) {
      // The array form of source_data is a list of URLs.
      for (const url of eval_.source_data) {
        sourceUrls.add(url)
      }
    } else if (eval_.evaluation_results.length === 0) {
      // Without results the dataset name is the only benchmark name available.
      // With results, names are taken per result below, because the dataset
      // name might be a suite name.
      benchmarksSet.add(eval_.source_data.dataset_name)
    }

    for (const result of eval_.evaluation_results) {
      const benchmarkKey = getBenchmarkName(eval_, result)
      benchmarksSet.add(benchmarkKey)

      if (!result.generation_config) {
        missingGenerationConfigCount += 1
      }

      if (result.detailed_evaluation_results_url) {
        detailUrls.add(result.detailed_evaluation_results_url)
      }

      allScores.push({
        benchmark: getEvaluationDisplayName(eval_, result),
        benchmarkKey,
        score: result.score_details.score,
        metric: result.metric_config.evaluation_description || result.evaluation_name,
        unit: result.metric_config.unit
      })
    }
  }

  // Deduplicate by display name, keeping the highest score for each
  const scoresByBenchmark = new Map<string, ScoreEntry>()
  for (const scoreData of allScores) {
    const existing = scoresByBenchmark.get(scoreData.benchmark)
    if (!existing || scoreData.score > existing.score) {
      scoresByBenchmark.set(scoreData.benchmark, scoreData)
    }
  }

  // Category stats: count of unique benchmarks per category
  const categoryStats = {} as Record<CategoryType, number>
  for (const category of summary.categories_covered) {
    const evals = summary.evaluations_by_category[category] || []
    const categoryBenchmarks = new Set<string>()

    for (const eval_ of evals) {
      for (const result of eval_.evaluation_results) {
        categoryBenchmarks.add(getBenchmarkName(eval_, result))
      }
    }
    categoryStats[category] = categoryBenchmarks.size
  }

  // Top 5: benchmark priority first, then score, then name for a stable order
  const topScores = Array.from(scoresByBenchmark.values())
    .sort((a, b) => {
      const priorityDiff = getBenchmarkPriority(b.benchmarkKey) - getBenchmarkPriority(a.benchmarkKey)
      if (priorityDiff !== 0) {
        return priorityDiff
      }

      if (b.score !== a.score) {
        return b.score - a.score
      }

      return a.benchmark.localeCompare(b.benchmark)
    })
    .slice(0, 5)
    .map(({ benchmark, score, metric, unit }) => ({
      benchmark,
      score,
      metric,
      unit,
    }))

  // params_billions may arrive as a number or a numeric string.
  const paramsBillionsRaw = summary.model_info.additional_details?.params_billions
  const paramsBillions =
    typeof paramsBillionsRaw === "number"
      ? paramsBillionsRaw
      : typeof paramsBillionsRaw === "string"
        ? Number.parseFloat(paramsBillionsRaw)
        : null
  const reproducibilityStatus =
    missingGenerationConfigCount === 0
      ? "complete"
      : missingGenerationConfigCount === summary.total_evaluations
        ? "missing"
        : "partial"

  return {
    id: summary.model_family_id,
    route_id: summary.model_route_id,
    model_name: summary.model_family_name,
    model_id: summary.model_info.id,
    canonical_model_name: summary.model_family_name,
    developer: summary.model_info.developer ?? "",
    evaluations_count: summary.total_evaluations,
    benchmarks_count: benchmarksSet.size,
    variant_count: summary.variants.length,
    categories: summary.categories_covered,
    category_stats: categoryStats,
    latest_timestamp: summary.last_updated,
    evaluator_count: evaluatorNames.size,
    evaluator_names: Array.from(evaluatorNames).sort((a, b) => a.localeCompare(b)),
    source_type_count: sourceTypes.size,
    source_types: Array.from(sourceTypes).sort((a, b) => a.localeCompare(b)),
    evidence_count: sourceUrls.size + detailUrls.size,
    missing_generation_config_count: missingGenerationConfigCount,
    third_party_eval_count: thirdPartyEvalCount,
    // NOTE(review): the numerator counts evaluation records while
    // total_evaluations counts individual results, so this ratio understates
    // third-party coverage for multi-result records. Left unchanged because
    // consumers may depend on the current numbers — confirm the intended unit.
    independent_verification_ratio:
      summary.total_evaluations > 0 ? thirdPartyEvalCount / summary.total_evaluations : 0,
    reproducibility_status: reproducibilityStatus,
    eval_libraries: Array.from(evalLibraries.values()).sort((a, b) => a.name.localeCompare(b.name)),
    latest_source_name: latestSourceName,
    params_billions: Number.isFinite(paramsBillions ?? NaN) ? paramsBillions : null,
    reproducibility_summary: summary.reproducibility_summary,
    provenance_summary: summary.provenance_summary,
    comparability_summary: summary.comparability_summary,
    top_scores: topScores,
    source_urls: Array.from(sourceUrls),
    detail_urls: Array.from(detailUrls),
    architecture: summary.model_info.architecture,
    params: summary.model_info.parameter_count,
    inference_engine: summary.model_info.inference_engine,
    inference_platform: summary.model_info.inference_platform,
    input_modalities: summary.model_info.modalities?.input,
    output_modalities: summary.model_info.modalities?.output,
    release_date: summary.model_info.release_date,
    model_url: summary.model_info.model_url,
  }
}

/**
 * Compute per-category statistics for a model summary.
 *
 * For every covered category this reports the number of evaluation records
 * (`count`), the number of individual results (`total_results`) and the mean
 * of their scores (`avg_score`, 0 when the category has no results). The
 * returned list is sorted alphabetically by category.
 */
export function getCategoryStats(
  summary: ModelSummaryCore
): {
  categories: { category: CategoryType; count: number; avg_score: number; total_results: number }[]
} {
  const categories = summary.categories_covered.map((category) => {
    const records = summary.evaluations_by_category[category] || []

    // Flatten every result score of every record in this category.
    const scores: number[] = records.flatMap((record) =>
      record.evaluation_results.map((result) => result.score_details.score)
    )
    const scoreSum = scores.reduce((running, score) => running + score, 0)

    return {
      category,
      count: records.length,
      total_results: scores.length,
      avg_score: scores.length > 0 ? scoreSum / scores.length : 0,
    }
  })

  categories.sort((left, right) => left.category.localeCompare(right.category))

  return { categories }
}

/**
 * Load evaluation files and return the ones that look like evaluations.
 *
 * Files are fetched with a small, bounded number of requests in flight
 * rather than strictly one after another; the returned array still follows
 * the order of `filePaths`. Loading is best-effort:
 *  - responses with a non-OK status are skipped,
 *  - payloads that are not objects carrying `schema_version`,
 *    `evaluation_id` and `model_info` are skipped,
 *  - network or JSON errors are logged with `console.warn` and skipped.
 * Only the presence of those three fields is checked; the payload is not
 * otherwise validated against the schema.
 *
 * @param filePaths URLs or paths understood by `fetch`.
 * @returns The loaded evaluations, in `filePaths` order.
 */
export async function loadEvaluations(
  filePaths: string[]
): Promise<BenchmarkEvaluation[]> {
  // Upper bound on simultaneous requests, so large path lists do not flood the host.
  const maxConcurrentRequests = 8

  const loadOne = async (path: string): Promise<BenchmarkEvaluation | null> => {
    try {
      const response = await fetch(path)
      if (!response.ok) {
        return null
      }

      const data: unknown = await response.json()
      if (typeof data !== 'object' || data === null) {
        return null
      }

      const record = data as Record<string, unknown>
      const looksLikeEvaluation =
        Boolean(record.schema_version) && Boolean(record.evaluation_id) && Boolean(record.model_info)

      return looksLikeEvaluation ? (data as BenchmarkEvaluation) : null
    } catch (error) {
      console.warn(`Failed to load evaluation from ${path}:`, error)
      return null
    }
  }

  // Each worker repeatedly claims the next unclaimed index and stores its
  // result in the matching slot, which preserves the input order.
  const slots: Array<BenchmarkEvaluation | null> = filePaths.map(() => null)
  let nextIndex = 0
  const worker = async (): Promise<void> => {
    while (nextIndex < filePaths.length) {
      const index = nextIndex
      nextIndex += 1
      slots[index] = await loadOne(filePaths[index])
    }
  }

  const workerCount = Math.min(maxConcurrentRequests, filePaths.length)
  await Promise.all(Array.from({ length: workerCount }, () => worker()))

  return slots.filter((slot): slot is BenchmarkEvaluation => slot !== null)
}

/**
 * Load evaluation files and turn them into one card per model family.
 *
 * Pipeline: `loadEvaluations` → `groupEvaluationsByModelFamily` →
 * `createModelFamilySummary` → `createEvaluationCard`.
 */
export async function processEvaluationsToCards(
  filePaths: string[]
): Promise<EvaluationCardData[]> {
  const evaluations = await loadEvaluations(filePaths)
  const byFamily = groupEvaluationsByModelFamily(evaluations)

  return Object.values(byFamily).map((familyEvaluations) =>
    createEvaluationCard(createModelFamilySummary(familyEvaluations))
  )
}

/**
 * Render a score as display text.
 *
 * - binary scores become 'Pass' (score above 0.5) or 'Fail';
 * - with a maximum of 1 the score is shown as a percentage with one decimal;
 * - with a maximum of 100 it is shown with one decimal;
 * - anything else is shown with three decimals.
 */
export function formatScore(
  score: number,
  scoreType: 'continuous' | 'discrete' | 'binary',
  maxScore?: number
): string {
  if (scoreType === 'binary') {
    return score > 0.5 ? 'Pass' : 'Fail'
  }

  switch (maxScore) {
    case 1:
      // A 0-1 scale is a ratio, so present it as a percentage.
      return `${(score * 100).toFixed(1)}%`
    case 100:
      return score.toFixed(1)
    default:
      return score.toFixed(3)
  }
}

/**
 * Friendly-name rules for well-known benchmarks, longest key first.
 *
 * Ordering by key length makes the more specific key win ("MMLU-Pro" before
 * "MMLU"). Each pattern matches the key case-insensitively and only when it
 * is not glued to another letter, so "ARC" matches "arc_challenge" but not
 * "Search", and "MATH" matches "MATH-500" but not "MathVista".
 */
const BENCHMARK_DISPLAY_NAME_RULES: ReadonlyArray<{ pattern: RegExp; displayName: string }> =
  Object.entries({
    'MMLU': 'Massive Multitask Language Understanding',
    'MMLU-Pro': 'MMLU Professional',
    'GSM8K': 'Grade School Math 8K',
    'HumanEval': 'Human Eval (Code)',
    'MBPP': 'Mostly Basic Python Problems',
    'HellaSwag': 'HellaSwag (Commonsense)',
    'ARC': 'AI2 Reasoning Challenge',
    'TruthfulQA': 'TruthfulQA',
    'BBH': 'Big-Bench Hard',
    'MATH': 'MATH Dataset',
  })
    .sort(([keyA], [keyB]) => keyB.length - keyA.length)
    .map(([key, displayName]) => {
      const escapedKey = key.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')
      return {
        pattern: new RegExp('(?<![a-z])' + escapedKey + '(?![a-z])', 'i'),
        displayName,
      }
    })

/**
 * Get the friendly display name of a benchmark.
 *
 * @param name Raw benchmark name or key.
 * @returns 'Unknown Benchmark' for a missing or empty name; the friendly name
 *          of the first matching rule in `BENCHMARK_DISPLAY_NAME_RULES`;
 *          otherwise `name` unchanged.
 */
export function getBenchmarkDisplayName(name: string | undefined | null): string {
  if (!name) return 'Unknown Benchmark'

  const rule = BENCHMARK_DISPLAY_NAME_RULES.find(({ pattern }) => pattern.test(name))

  return rule ? rule.displayName : name
}

// ── Eval-centric grouping ─────────────────────────────────────────────────────

/**
 * Group individual benchmark results across all model files.
 *
 * Results are keyed by the id from `getEvaluationSummaryId` (a slug of
 * benchmark key and metric name). Each entry describes one benchmark metric
 * and every model result reported for it. The descriptive fields (name,
 * category, metric config) come from the first result seen for an id.
 *
 * After collection each summary is finalised with:
 *  - `models_count` (number of collected results), `avg_score` and
 *    `avg_score_norm` (normalised with `metric_config.min_score`/`max_score`,
 *    defaulting to 0 and 1; 0 when the range is not positive);
 *  - `source_types`, `third_party_ratio`, `missing_generation_config_count`;
 *  - `latest_source_name`, from the result with the latest parsable
 *    `evaluation_timestamp` (results without a timestamp are ignored here);
 *  - `best_model` / `worst_model`, honouring `metric_config.lower_is_better`.
 *
 * Empty organisation names are not recorded in `evaluator_names`.
 *
 * @param evaluations Evaluations to regroup; they are not modified.
 * @returns A record from evaluation id to its summary.
 */
export function groupEvaluationsByBenchmark(
  evaluations: BenchmarkEvaluation[]
): Record<string, BenchmarkEvalSummary> {
  // A Map avoids any interaction between ids and Object.prototype members.
  const summaries = new Map<string, BenchmarkEvalSummary>()

  for (const eval_ of evaluations) {
    for (const result of eval_.evaluation_results) {
      const evalId = getEvaluationSummaryId(eval_, result)
      let summary = summaries.get(evalId)

      if (!summary) {
        const displayName = getEvaluationDisplayName(eval_, result)
        const compositeBenchmarkKey = eval_.benchmark || getBenchmarkName(eval_, result)

        summary = {
          evaluation_name: displayName,
          evaluation_id: evalId,
          composite_benchmark_key: compositeBenchmarkKey,
          composite_benchmark_name: getBenchmarkDisplayName(compositeBenchmarkKey),
          category: inferCategoryFromBenchmark(displayName),
          metric_config: result.metric_config,
          model_results: [],
          models_count: 0,
          evaluator_names: [],
          source_types: [],
          latest_source_name: undefined,
          third_party_ratio: 0,
          missing_generation_config_count: 0,
          best_model: null,
          worst_model: null,
          avg_score: 0,
          avg_score_norm: 0,
        }
        summaries.set(evalId, summary)
      }

      summary.model_results.push({
        model_info: eval_.model_info,
        score: result.score_details.score,
        score_details: result.score_details,
        evaluation_timestamp: result.evaluation_timestamp,
        source_metadata: eval_.source_metadata,
        source_data: result.source_data ?? eval_.source_data,
        result,
      })

      // Skip blank organisation names, as createEvaluationCard does.
      const orgName = eval_.source_metadata.source_organization_name
      if (orgName && !summary.evaluator_names.includes(orgName)) {
        summary.evaluator_names.push(orgName)
      }
    }
  }

  // Finalise each summary. Every summary holds at least one result, because
  // a summary is only created together with its first result.
  for (const summary of summaries.values()) {
    const results = summary.model_results
    summary.models_count = results.length

    const scoreSum = results.reduce((running, entry) => running + entry.score, 0)
    summary.avg_score = scoreSum / results.length

    summary.source_types = Array.from(
      new Set(results.map((entry) => entry.source_metadata.source_type))
    ).sort((a, b) => a.localeCompare(b))

    const thirdPartyCount = results.filter(
      (entry) => entry.source_metadata.evaluator_relationship === "third_party"
    ).length
    summary.third_party_ratio = thirdPartyCount / results.length

    summary.missing_generation_config_count = results.filter(
      (entry) => !entry.result.generation_config
    ).length

    let latestTimestamp = Number.NEGATIVE_INFINITY
    for (const entry of results) {
      const rawTimestamp = entry.evaluation_timestamp
      if (rawTimestamp == null) {
        // Defensive: upstream results may omit the timestamp.
        continue
      }

      // A purely numeric string is unix seconds; anything else goes through Date.
      const timestampText = String(rawTimestamp)
      const numericTimestamp = Number(timestampText)
      const timestamp =
        !Number.isNaN(numericTimestamp) && !timestampText.includes("-")
          ? numericTimestamp * 1000
          : new Date(timestampText).getTime()
      if (Number.isFinite(timestamp) && timestamp >= latestTimestamp) {
        latestTimestamp = timestamp
        summary.latest_source_name = entry.source_metadata.source_name
      }
    }

    const maxScore = summary.metric_config.max_score ?? 1
    const minScore = summary.metric_config.min_score ?? 0
    const range = maxScore - minScore
    summary.avg_score_norm = range > 0 ? (summary.avg_score - minScore) / range : 0

    const lowerIsBetter = summary.metric_config.lower_is_better
    const ranked = [...results].sort((a, b) =>
      lowerIsBetter ? a.score - b.score : b.score - a.score
    )
    const best = ranked[0]
    const worst = ranked[ranked.length - 1]

    if (best && worst) {
      summary.best_model = { name: best.model_info.name, score: best.score }
      summary.worst_model = { name: worst.model_info.name, score: worst.score }
    }
  }

  return Object.fromEntries(summaries)
}

/**
 * Load evaluation files and return one `BenchmarkEvalSummary` per distinct
 * evaluation id, as a flat array.
 *
 * Pipeline: `loadEvaluations` → `groupEvaluationsByBenchmark` → values.
 */
export async function processEvaluationsToBenchmarkSummaries(
  filePaths: string[]
): Promise<BenchmarkEvalSummary[]> {
  const loaded = await loadEvaluations(filePaths)
  const summariesById = groupEvaluationsByBenchmark(loaded)

  return Object.values(summariesById)
}

/**
 * Strip the (potentially large) `model_results` array from a summary.
 *
 * Returns a shallow copy with every other property; the input is not modified.
 */
export function toBenchmarkEvalListItem(
  summary: BenchmarkEvalSummary
): BenchmarkEvalListItem {
  const { model_results: _dropped, ...withoutModelResults } = summary
  return withoutModelResults
}