general-eval-card / lib /eval-processing.ts
j-chim's picture
Refactor to align on benchmark hierarchy
2ed4959
/**
* Processing utilities for benchmark-first evaluation data
*/
import type {
BenchmarkCard,
BenchmarkEvaluation,
EvaluationCardData,
CategoryType,
ModelInfo,
ModelVariantSummary,
SourceMetadata,
SourceData,
ScoreDetails,
MetricConfig,
EvaluationResult,
} from './benchmark-schema'
import type { EvalcardsAnnotations, RowAnnotations, SignalSummaries } from './backend-artifacts'
import type { ModelEvaluationSummary } from './benchmark-schema'
import type { ModelSummaryCore } from './benchmark-schema'
import { inferCategoryFromBenchmark } from './benchmark-schema'
export type { BenchmarkCard }
import { getCanonicalModelIdentity, getModelFamilyRouteId } from './model-family'
export type { ModelEvaluationSummary }
/**
 * Metric names (lower-cased) that do not identify an evaluation on their own.
 * `getEvaluationDisplayName` looks up the lower-cased metric name here and,
 * on a hit, prefixes the display name with the benchmark name.
 */
const GENERIC_EVALUATION_NAMES = new Set([
"score",
"accuracy",
"mean win rate",
"exact match",
"f1",
"pass@1",
])
/**
 * Ordered pattern → priority table consumed by `getBenchmarkPriority`, which
 * tests the rules in array order against a lower-cased name and returns the
 * priority of the first match (0 when nothing matches). The patterns are
 * therefore written in lower case and carry no flags.
 *
 * NOTE(review): the groupings look like agentic / reasoning+math / coding /
 * multimodal / chat+preference / safety, from highest to lowest priority —
 * confirm the intended ranking with the product owner.
 */
const BENCHMARK_PRIORITY_RULES: Array<{ pattern: RegExp; priority: number }> = [
{ pattern: /\b(swe-bench|terminal-bench|tau-bench|agent|browsecomp)\b/, priority: 10 },
{ pattern: /\b(gpqa|mmlu-pro|mmlu|bbh|ifeval|math|aime|gsm8k|minerva)\b/, priority: 9 },
{ pattern: /\b(humaneval|livecodebench|mbpp|codecontests|apps)\b/, priority: 8 },
{ pattern: /\b(mmmu|mmmu-pro|seed-bench|vision|vqa|multimodal)\b/, priority: 7 },
{ pattern: /\b(mt-bench|arena-hard|alpacaeval|reward-bench|truthfulqa)\b/, priority: 6 },
{ pattern: /\b(fairness|bias|safety|toxic|harmful|robust|privacy)\b/, priority: 5 },
]
/**
 * Convert free text into a lower-case identifier whose words are joined by
 * single underscores.
 *
 * Every run of characters outside `a-z0-9` (after lower-casing) acts as a
 * separator; leading and trailing separators produce no underscore.
 *
 * @param value Text to convert.
 * @returns The slug, or an empty string when `value` has no alphanumerics.
 */
function slugify(value: string): string {
  const words = value
    .toLowerCase()
    .split(/[^a-z0-9]+/)
    .filter((word) => word.length > 0)
  return words.join("_")
}
/**
 * Resolve the benchmark name for an evaluation (and optionally one of its
 * results), trying the most specific source first:
 *
 * 1. `result.source_data.dataset_name` (object-shaped source data only),
 * 2. `evaluation.benchmark`,
 * 3. `evaluation.source_data.dataset_name` (object-shaped source data only),
 * 4. `result.evaluation_name`, falling back to `evaluation.evaluation_id`.
 *
 * @param evaluation Evaluation record the result belongs to.
 * @param result Optional individual result within that record.
 * @returns The first non-empty name found in the order above.
 */
function getBenchmarkName(
  evaluation: BenchmarkEvaluation,
  result?: EvaluationResult
): string {
  const perResultSource = result?.source_data
  if (perResultSource && !Array.isArray(perResultSource)) {
    const perResultDataset = perResultSource.dataset_name
    if (perResultDataset) {
      return perResultDataset
    }
  }

  if (evaluation.benchmark) {
    return evaluation.benchmark
  }

  const recordSource = evaluation.source_data
  if (!Array.isArray(recordSource) && recordSource.dataset_name) {
    return recordSource.dataset_name
  }

  return result?.evaluation_name ?? evaluation.evaluation_id
}
/**
 * Produce the human-readable name of a single result.
 *
 * The trimmed metric name is used as-is unless it is one of the generic
 * names in `GENERIC_EVALUATION_NAMES` (compared case-insensitively) and
 * differs from the benchmark name; in that case the benchmark name is
 * prepended as `"<benchmark> - <metric>"`.
 *
 * @param evaluation Evaluation record the result belongs to.
 * @param result Result whose `evaluation_name` is being displayed.
 * @returns The display name.
 */
function getEvaluationDisplayName(
  evaluation: BenchmarkEvaluation,
  result: EvaluationResult
): string {
  const benchmark = getBenchmarkName(evaluation, result)
  const metric = result.evaluation_name.trim()
  const needsBenchmarkPrefix =
    metric !== benchmark && GENERIC_EVALUATION_NAMES.has(metric.toLowerCase())
  return needsBenchmarkPrefix ? `${benchmark} - ${metric}` : metric
}
/**
 * Build the slug that keys a (benchmark, metric) pair.
 *
 * The benchmark part is `evaluation.benchmark` when set, otherwise the name
 * resolved by `getBenchmarkName`; it is joined to the raw
 * `result.evaluation_name` with a double underscore and then slugified.
 *
 * @param evaluation Evaluation record the result belongs to.
 * @param result Result supplying the metric name.
 * @returns Slug identifying the benchmark/metric combination.
 */
function getEvaluationSummaryId(
  evaluation: BenchmarkEvaluation,
  result: EvaluationResult
): string {
  const benchmarkPart = evaluation.benchmark
    ? evaluation.benchmark
    : getBenchmarkName(evaluation, result)
  const rawId = benchmarkPart + "__" + result.evaluation_name
  return slugify(rawId)
}
/**
 * Look up the display priority of a benchmark name.
 *
 * @param value Benchmark name; matched after lower-casing.
 * @returns Priority of the first rule in `BENCHMARK_PRIORITY_RULES` whose
 *   pattern matches, or 0 when no rule matches.
 */
function getBenchmarkPriority(value: string): number {
  const candidate = value.toLowerCase()
  const firstMatch = BENCHMARK_PRIORITY_RULES.find(({ pattern }) => pattern.test(candidate))
  return firstMatch ? firstMatch.priority : 0
}
// ── Eval-centric (per-benchmark) types ────────────────────────────────────────
/**
 * One model's result for a single benchmark metric, as stored in
 * `BenchmarkEvalSummary.model_results`.
 */
export interface ModelResultForBenchmark {
/** Model that produced this result. */
model_info: ModelInfo
/** NOTE(review): presumably the URL route id of the model's family page — not populated in this file; confirm. */
model_route_id?: string
/** Raw score; `groupEvaluationsByBenchmark` copies it from `result.score_details.score`. */
score: number
score_details: ScoreDetails
/** Timestamp string; elsewhere in this file it is parsed as Unix seconds or as a date string. */
evaluation_timestamp: string
source_metadata: SourceMetadata
/** Per-result source data when present, otherwise the evaluation record's source data. */
source_data: BenchmarkEvaluation['source_data']
/** The complete underlying result record. */
result: EvaluationResult
/** URL to the underlying record JSON in the upstream HF dataset, when known. */
source_record_url?: string
/** NOTE(review): presumably the per-source rows folded into an aggregated result — not populated in this file; confirm. */
aggregate_components?: Array<{
evaluation_id: string
composite_benchmark_key: string
composite_benchmark_name: string
score: number
normalized_score: number
evaluation_timestamp: string
source_name?: string
source_type: SourceMetadata["source_type"]
source_organization_name: string
evaluator_relationship: SourceMetadata["evaluator_relationship"]
}>
}
/**
 * Benchmark-centric summary: one benchmark metric together with every model
 * result recorded for it and the statistics derived from those results.
 */
export interface BenchmarkEvalSummary extends SignalSummaries {
evaluation_name: string
/** URL-safe slug; `groupEvaluationsByBenchmark` derives it from the benchmark key and evaluation name. */
evaluation_id: string
canonical_display_name?: string
composite_benchmark_key: string
composite_benchmark_name: string
category: CategoryType
metric_config: MetricConfig
model_results: ModelResultForBenchmark[]
/** Number of entries in `model_results`. */
models_count: number
/** Unique evaluator organisation names */
evaluator_names: string[]
source_types: SourceMetadata["source_type"][]
/** Source name of the result with the most recent evaluation timestamp. */
latest_source_name?: string
/** Fraction of `model_results` whose evaluator relationship is "third_party". */
third_party_ratio: number
/** Number of `model_results` whose result has no `generation_config`. */
missing_generation_config_count: number
best_model: { name: string; score: number } | null
worst_model: { name: string; score: number } | null
avg_score: number
/** avg_score normalised to 0-1 using metric_config.min/max_score */
avg_score_norm: number
/** Rich benchmark card from the metadata/ folder, when available */
benchmark_card?: BenchmarkCard
is_aggregated?: boolean
aggregate_sources?: Array<{
evaluation_id: string
composite_benchmark_key: string
composite_benchmark_name: string
models_count: number
avg_score_norm: number
}>
/** Tags from the pipeline (domains, languages, tasks) */
tags?: { domains: string[]; languages: string[]; tasks: string[] }
/** Number of distinct metrics for this benchmark */
metrics_count?: number
/** Names of all metrics */
metric_names?: string[]
/** Instance-level data availability */
instance_data?: { available: boolean; url_count: number; sample_urls: string[]; models_with_loaded_instances: number }
/** Canonical benchmark id (the registry-resolved benchmark). Drives
* benchmark-card lookups regardless of slice/composite axis. */
benchmark_id?: string
/** Family display name. */
benchmark_family_name?: string
/** Composite (leaderboard) slug — e.g. "wasp", "helm-classic". */
composite_slug?: string
/** Composite display name — e.g. "WASP", "HELM Classic". */
composite_display_name?: string
/** Curated multi-benchmark family slug (e.g. "mmlu"), defaults to
* benchmark id for singletons. */
family_id?: string
/** Family display, post-cutover canonical name. */
family_display_name?: string
/** Parent benchmark id — populated when this row is a slice of a
* root benchmark; absent for non-slice rows.
* NOTE(review): earlier wording said "null", but the declared type is an
* optional string — confirm what the pipeline actually emits. */
parent_benchmark_id?: string
/** True when this row is a within-benchmark slice cut. */
is_slice?: boolean
/** Source dataset metadata from the pipeline */
source_data?: SourceData
/** Best raw score reported in the eval summary list */
top_score?: number
/** Count of nested subtasks reported for the benchmark */
subtasks_count?: number
/** Whether this row is a summary/rollup score for a composite */
is_summary_score?: boolean
/** Related summary-score sibling ids for this benchmark */
summary_eval_ids?: string[]
/** Canonical benchmark-level metrics from root metrics[] */
root_metrics?: BenchmarkSummaryMetric[]
/** Canonical benchmark subdivisions from subtasks[] */
subtasks?: BenchmarkSummarySubtask[]
/** Matrix columns for multi-metric benchmark leaderboards */
leaderboard_metrics?: BenchmarkLeaderboardMetric[]
/** Matrix rows for multi-metric benchmark leaderboards */
leaderboard_rows?: BenchmarkLeaderboardRow[]
evalcards?: { annotations?: EvalcardsAnnotations }
}
/**
 * Description of one metric reported for a benchmark, used both at the
 * benchmark root (`BenchmarkEvalSummary.root_metrics`) and inside subtasks
 * (`BenchmarkSummarySubtask.metrics`).
 */
export interface BenchmarkSummaryMetric {
metric_summary_id: string
metric_name: string
display_name: string
canonical_display_name?: string
metric_key?: string
/** True when a smaller value of this metric indicates a better result. */
lower_is_better: boolean
models_count: number
top_score?: number
unit?: string
}
/**
 * A subdivision of a benchmark together with the metrics reported for it;
 * referenced from `BenchmarkEvalSummary.subtasks`.
 */
export interface BenchmarkSummarySubtask {
subtask_key: string
subtask_name: string
display_name: string
canonical_display_name?: string
metrics: BenchmarkSummaryMetric[]
}
/**
 * One column of a multi-metric benchmark leaderboard matrix
 * (`BenchmarkEvalSummary.leaderboard_metrics`).
 */
export interface BenchmarkLeaderboardMetric {
/** NOTE(review): presumably the key used in `BenchmarkLeaderboardRow.values` — confirm. */
column_key: string
metric_summary_id: string
metric_name: string
display_name: string
canonical_display_name?: string
lower_is_better: boolean
unit?: string
/** Whether the metric belongs to the benchmark root or to a subtask. */
scope: "root" | "subtask"
subtask_key?: string
subtask_name?: string
}
/**
 * One row of a multi-metric benchmark leaderboard matrix
 * (`BenchmarkEvalSummary.leaderboard_rows`), describing a single model.
 */
export interface BenchmarkLeaderboardRow {
model_info: ModelInfo
model_route_id?: string
evaluation_timestamp: string
source_metadata: SourceMetadata
source_data: BenchmarkEvaluation["source_data"]
/** Metric values for this row; `null` marks a metric without a value. NOTE(review): keys presumably match `BenchmarkLeaderboardMetric.column_key` — confirm. */
values: Record<string, number | null>
annotations_by_metric?: Record<string, RowAnnotations | null | undefined>
/** NOTE(review): presumably the count of non-null entries in `values` — confirm against the producer. */
metrics_present: number
}
/** A `BenchmarkEvalSummary` without its `model_results` array; see `toBenchmarkEvalListItem`. */
export type BenchmarkEvalListItem = Omit<BenchmarkEvalSummary, "model_results">
/**
 * Back-fill derived fields that the upstream pipeline sometimes omits.
 *
 * At present this covers `instance_data` only. Eval-summary rows can arrive
 * with `instance_data` unset even though the per-model
 * `result.detailed_evaluation_results_url` values are populated (Wordle
 * Arena was one such case — 42 models, each with its own JSONL URL). Deriving
 * the field here, once, means every consumer of the summary — detail page,
 * modal previews, cross-referenced model summaries — sees the same data.
 *
 * @param summary Summary to normalise; never mutated.
 * @returns `summary` itself when it already reports available instance data
 *   with a positive `url_count`, or when no result carries a detail URL;
 *   otherwise a shallow copy whose `instance_data` is derived from the
 *   distinct detail URLs found in `model_results`.
 */
export function normalizeEvalSummary<T extends BenchmarkEvalSummary>(summary: T): T {
  const upstream = summary.instance_data
  if (upstream?.available && upstream.url_count > 0) {
    return summary
  }

  const urls = new Set<string>()
  const modelIds = new Set<string>()
  for (const entry of summary.model_results ?? []) {
    const detailUrl = entry?.result?.detailed_evaluation_results_url
    if (typeof detailUrl !== "string" || detailUrl.length === 0) {
      continue
    }
    urls.add(detailUrl)
    const id = entry.model_info?.id
    if (id) {
      modelIds.add(id)
    }
  }

  // No URLs found: keep whatever upstream supplied (usually
  // `available: false` or nothing at all).
  if (urls.size === 0) {
    return summary
  }

  return {
    ...summary,
    instance_data: {
      available: true,
      url_count: urls.size,
      // Only a handful of example URLs are exposed so callers need not carry
      // the complete set, matching the upstream pipeline's contract.
      sample_urls: [...urls].slice(0, 8),
      models_with_loaded_instances: modelIds.size,
    },
  }
}
/**
 * Bucket evaluations by the exact `model_info.id` they were recorded for.
 *
 * @param evaluations Evaluation records to group.
 * @returns Map from model id to that model's evaluations, in input order.
 */
export function groupEvaluationsByModel(
  evaluations: BenchmarkEvaluation[]
): Record<string, BenchmarkEvaluation[]> {
  const byModel: Record<string, BenchmarkEvaluation[]> = {}
  evaluations.forEach((evaluation) => {
    const key = evaluation.model_info.id
    const bucket = byModel[key]
    if (bucket) {
      bucket.push(evaluation)
    } else {
      byModel[key] = [evaluation]
    }
  })
  return byModel
}
/**
 * Bucket evaluations by model family, using the `familyId` that
 * `getCanonicalModelIdentity` reports for each record's `model_info`.
 *
 * @param evaluations Evaluation records to group.
 * @returns Map from family id to that family's evaluations, in input order.
 */
export function groupEvaluationsByModelFamily(
  evaluations: BenchmarkEvaluation[]
): Record<string, BenchmarkEvaluation[]> {
  return evaluations.reduce<Record<string, BenchmarkEvaluation[]>>((byFamily, evaluation) => {
    const { familyId } = getCanonicalModelIdentity(evaluation.model_info)
    const bucket = byFamily[familyId]
    if (bucket) {
      bucket.push(evaluation)
    } else {
      byFamily[familyId] = [evaluation]
    }
    return byFamily
  }, {})
}
/**
 * Create a model evaluation summary from a non-empty list of evaluations.
 *
 * - `model_info` is taken from the first evaluation.
 * - Each evaluation is filed under its explicit `category`, or, when that is
 *   missing, under every category inferred from its results (falling back to
 *   the dataset name when the result name only yields 'General'). An
 *   evaluation appears at most once per category but may appear in several.
 * - `total_evaluations` counts individual results, not evaluation records.
 * - `last_updated` is the newest parseable `retrieved_timestamp`. Values that
 *   cannot be parsed are ignored rather than aborting the whole summary; if
 *   none can be parsed the Unix epoch is reported.
 *
 * @param evaluations Evaluation records belonging to one model.
 * @returns The aggregated summary.
 * @throws Error when `evaluations` is empty.
 */
export function createModelSummary(
  evaluations: BenchmarkEvaluation[]
): ModelSummaryCore {
  if (evaluations.length === 0) {
    throw new Error('No evaluations provided')
  }
  const modelInfo = evaluations[0].model_info
  const evaluationsByCategory: Record<string, BenchmarkEvaluation[]> = {}
  const categoriesSet = new Set<CategoryType>()
  let latestMs = Number.NEGATIVE_INFINITY
  let totalResults = 0

  for (const eval_ of evaluations) {
    // Categories this particular evaluation belongs to.
    const evalCategories = new Set<CategoryType>()
    if (eval_.category) {
      evalCategories.add(eval_.category)
      categoriesSet.add(eval_.category)
    } else {
      for (const result of eval_.evaluation_results) {
        let category: CategoryType = inferCategoryFromBenchmark(result.evaluation_name)
        // Fall back to the dataset name when source_data is an object.
        if (category === 'General' && !Array.isArray(eval_.source_data)) {
          category = inferCategoryFromBenchmark(eval_.source_data.dataset_name)
        }
        evalCategories.add(category)
        categoriesSet.add(category)
      }
    }
    // File the evaluation once under each category it belongs to.
    for (const category of evalCategories) {
      if (!evaluationsByCategory[category]) {
        evaluationsByCategory[category] = []
      }
      evaluationsByCategory[category].push(eval_)
    }

    // Timestamps are either Unix seconds (purely numeric, no '-') or a date
    // string. Coerce to string first so a numeric JSON value cannot crash
    // the `.includes` call.
    const rawTimestamp = String(eval_.retrieved_timestamp ?? '')
    const timestampMs =
      !isNaN(Number(rawTimestamp)) && !rawTimestamp.includes('-')
        ? parseFloat(rawTimestamp) * 1000
        : new Date(rawTimestamp).getTime()
    // Skip NaN/Infinity: a single bad value would otherwise poison the
    // maximum and make toISOString() throw a RangeError.
    if (Number.isFinite(timestampMs) && timestampMs > latestMs) {
      latestMs = timestampMs
    }

    totalResults += eval_.evaluation_results.length
  }

  // No parseable timestamp at all: report the epoch instead of throwing.
  const latestTimestamp = new Date(Number.isFinite(latestMs) ? latestMs : 0).toISOString()

  return {
    model_info: modelInfo,
    evaluations_by_category: evaluationsByCategory as Record<CategoryType, BenchmarkEvaluation[]>,
    total_evaluations: totalResults,
    last_updated: latestTimestamp,
    categories_covered: Array.from(categoriesSet),
  }
}
/**
 * Choose the `model_info` of the most recently retrieved evaluation; among
 * equally recent ones, the evaluation with the most results wins.
 *
 * A timestamp is read as a date string first, then as Unix seconds, and
 * counts as 0 when neither interpretation yields a usable number. The input
 * array is not reordered.
 *
 * @param evaluations Candidate evaluations (expected to be non-empty).
 * @returns `model_info` of the top-ranked evaluation.
 */
function pickRepresentativeModelInfo(evaluations: BenchmarkEvaluation[]): ModelInfo {
  const toMillis = (raw: string): number =>
    new Date(raw).getTime() || Number(raw) * 1000 || 0

  const ranked = evaluations.slice().sort((left, right) => {
    const leftMs = toMillis(left.retrieved_timestamp)
    const rightMs = toMillis(right.retrieved_timestamp)
    return rightMs === leftMs
      ? right.evaluation_results.length - left.evaluation_results.length
      : rightMs - leftMs
  })

  return ranked[0].model_info
}
/**
 * Identity of the variant bucket an evaluation is aggregated into, as
 * produced by `getAggregatedVariantDescriptor`.
 */
type AggregatedVariantDescriptor = {
/** Grouping key; `createModelFamilySummary` merges evaluations sharing it. */
variantKey: string
variantLabel: string
variantDisplayName: string
familyId: string
familyName: string
versionDate?: string
versionQualifier?: string
/** True when a recognised setup mode caused this model to be folded into a date-level or base variant. */
mergedSetupAlias: boolean
}
/**
 * Detect whether a model entry is merely a setup variant (prompting /
 * function-calling / thinking mode) of another model.
 *
 * `additional_details.mode` is compared after trimming, lower-casing and
 * collapsing runs of `_`/`-` into a single space. Recognised values are
 * `prompt`, `fc`, `function calling`, and anything starting with `thinking`.
 *
 * @param modelInfo Model whose `additional_details.mode` is inspected.
 * @returns The trimmed original mode string when recognised, otherwise `null`.
 */
function getSetupAliasMode(modelInfo: ModelInfo) {
  const mode = modelInfo.additional_details?.mode
  if (typeof mode !== 'string') {
    return null
  }

  const trimmed = mode.trim()
  const canonical = trimmed.toLowerCase().replace(/[_-]+/g, ' ')
  const exactAliases = ['prompt', 'fc', 'function calling']
  const recognised =
    canonical.length > 0 &&
    (exactAliases.includes(canonical) || canonical.startsWith('thinking'))

  return recognised ? trimmed : null
}
/**
 * Decide which variant bucket a model belongs to when aggregating a family.
 *
 * - Without a recognised setup mode the canonical identity is used unchanged.
 * - With a setup mode and a version date, the model is merged into a variant
 *   keyed by that date.
 * - With a setup mode and no version date, the model is merged into the
 *   family's `'base'` variant, labelled `'Current'`.
 *
 * @param modelInfo Model to classify.
 * @returns Descriptor of the target variant bucket.
 */
function getAggregatedVariantDescriptor(modelInfo: ModelInfo): AggregatedVariantDescriptor {
  const identity = getCanonicalModelIdentity(modelInfo)
  const isSetupAlias = Boolean(getSetupAliasMode(modelInfo))
  const { familyId, familyName, versionDate } = identity

  if (!isSetupAlias) {
    return {
      variantKey: identity.variantKey,
      variantLabel: identity.variantLabel,
      variantDisplayName: identity.variantDisplayName,
      familyId,
      familyName,
      versionDate,
      versionQualifier: identity.versionQualifier,
      mergedSetupAlias: false,
    }
  }

  if (versionDate) {
    // Setup aliases released on the same date collapse into one dated variant.
    return {
      variantKey: versionDate,
      variantLabel: versionDate,
      variantDisplayName: `${familyName} (${versionDate})`,
      familyId,
      familyName,
      versionDate,
      versionQualifier: undefined,
      mergedSetupAlias: true,
    }
  }

  // Undated setup aliases collapse into the family's base variant.
  return {
    variantKey: 'base',
    variantLabel: 'Current',
    variantDisplayName: familyName,
    familyId,
    familyName,
    versionDate: undefined,
    versionQualifier: undefined,
    mergedSetupAlias: true,
  }
}
/**
 * Return a sorted copy of the variants: newest `version_date` first, then
 * most evaluations first, then `variant_label` in locale order.
 *
 * Variants with a missing or unparseable `version_date` rank after every
 * dated variant and are ordered among themselves by the tie-breakers. (An
 * unparseable date used to yield NaN, which made the comparator return NaN
 * and silently skipped the tie-breakers.)
 *
 * @param variants Variants to order; the input array is not mutated.
 * @returns A new, sorted array.
 */
function sortVariants(variants: ModelVariantSummary[]) {
  const dateRank = (versionDate: string | undefined): number => {
    if (!versionDate) {
      return Number.NEGATIVE_INFINITY
    }
    const ms = new Date(versionDate).getTime()
    return Number.isNaN(ms) ? Number.NEGATIVE_INFINITY : ms
  }

  return [...variants].sort((a, b) => {
    const aDate = dateRank(a.version_date)
    const bDate = dateRank(b.version_date)
    // Compare for equality first: -Infinity minus -Infinity would be NaN.
    if (aDate !== bDate) {
      return bDate - aDate
    }
    if (b.total_evaluations !== a.total_evaluations) {
      return b.total_evaluations - a.total_evaluations
    }
    return a.variant_label.localeCompare(b.variant_label)
  })
}
/**
 * Summarise every evaluation of one model family, with a per-variant
 * breakdown.
 *
 * Evaluations are bucketed by the variant key from
 * `getAggregatedVariantDescriptor`; each bucket is summarised with
 * `createModelSummary`, and the buckets are ordered by `sortVariants`. For
 * buckets created by merging setup aliases, `model_info` is rewritten to a
 * synthetic identity (family id for the base variant, `family::variant`
 * otherwise). The family-level `model_info` is copied from the first sorted
 * variant with id and name replaced by the family's and `model_version`
 * cleared. The family identity itself comes from the first evaluation.
 *
 * @param evaluations Evaluations of a single family.
 * @returns Family summary including its `variants`.
 * @throws Error when `evaluations` is empty.
 */
export function createModelFamilySummary(
  evaluations: BenchmarkEvaluation[]
): ModelEvaluationSummary {
  if (evaluations.length === 0) {
    throw new Error("No evaluations provided")
  }
  const familyIdentity = getCanonicalModelIdentity(evaluations[0].model_info)

  // Distinct raw model ids of a set of evaluations, in locale order.
  const distinctModelIds = (items: BenchmarkEvaluation[]) =>
    Array.from(new Set(items.map((item) => item.model_info.id))).sort((a, b) =>
      a.localeCompare(b)
    )

  const buckets = new Map<string, {
    descriptor: AggregatedVariantDescriptor
    evaluations: BenchmarkEvaluation[]
  }>()
  for (const evaluation of evaluations) {
    const descriptor = getAggregatedVariantDescriptor(evaluation.model_info)
    const bucket = buckets.get(descriptor.variantKey)
    if (!bucket) {
      // The first descriptor seen for a key describes the whole bucket.
      buckets.set(descriptor.variantKey, { descriptor, evaluations: [evaluation] })
    } else {
      bucket.evaluations.push(evaluation)
    }
  }

  const variants = sortVariants(
    [...buckets.values()].map(({ descriptor, evaluations: members }) => {
      const summary = createModelSummary(members)
      const isBase = descriptor.variantKey === 'base'
      const variantId = `${descriptor.familyId}::${descriptor.variantKey}`
      const modelInfo = descriptor.mergedSetupAlias
        ? {
            ...summary.model_info,
            id: isBase ? descriptor.familyId : variantId,
            name: descriptor.variantDisplayName,
            model_version: isBase ? undefined : descriptor.variantLabel,
          }
        : summary.model_info
      return {
        ...summary,
        model_info: modelInfo,
        variant_id: variantId,
        variant_key: descriptor.variantKey,
        variant_label: descriptor.variantLabel,
        variant_display_name: descriptor.variantDisplayName,
        raw_model_ids: distinctModelIds(members),
        family_id: descriptor.familyId,
        family_name: descriptor.familyName,
        version_date: descriptor.versionDate,
        version_qualifier: descriptor.versionQualifier,
      }
    })
  )

  const familySummary = createModelSummary(evaluations)
  const representative = variants[0] ?? familySummary
  const { familyId, familyName } = familyIdentity

  return {
    ...familySummary,
    model_info: {
      ...representative.model_info,
      id: familyId,
      name: familyName,
      model_version: undefined,
    },
    model_family_id: familyId,
    model_route_id: getModelFamilyRouteId(familyId),
    model_family_name: familyName,
    raw_model_ids: distinctModelIds(evaluations),
    variants,
  }
}
/**
 * Convert a model (family) summary into the flat structure rendered on a
 * card.
 *
 * The evaluations reachable through `summary.evaluations_by_category` are
 * walked once each to collect benchmark names, evaluator and source facts,
 * evidence URLs, library versions and individual scores. A record that is
 * filed under several categories is processed only the first time it is
 * encountered (matched by object identity), so per-record and per-result
 * counters are not inflated and `reproducibility_status` compares like with
 * like against `summary.total_evaluations`.
 *
 * `top_scores` holds at most five entries, one per display name (highest
 * score kept), ordered by benchmark priority, then score, then name.
 *
 * NOTE(review): "highest score" ignores `metric_config.lower_is_better`, and
 * `independent_verification_ratio` divides a per-record count by a
 * per-result total — both kept as-is; confirm the intended semantics.
 *
 * @param summary Family-level summary, e.g. from `createModelFamilySummary`.
 * @returns Card data for display.
 */
export function createEvaluationCard(
  summary: ModelEvaluationSummary
): EvaluationCardData {
  const benchmarksSet = new Set<string>()
  const allScores: Array<{
    benchmark: string
    benchmarkKey: string
    score: number
    metric: string
    unit?: string
  }> = []
  const sourceUrls = new Set<string>()
  const detailUrls = new Set<string>()
  const evaluatorNames = new Set<string>()
  const sourceTypes = new Set<SourceMetadata["source_type"]>()
  const evalLibraries = new Map<string, { name: string; version?: string; fork?: string }>()
  let missingGenerationConfigCount = 0
  let thirdPartyEvalCount = 0
  let latestSourceName: string | undefined
  let latestTimestamp = Number.NEGATIVE_INFINITY

  // The same record object can sit in several category buckets; remember
  // which ones were already handled so each contributes exactly once.
  const processedEvaluations = new Set<BenchmarkEvaluation>()

  for (const evals of Object.values(summary.evaluations_by_category)) {
    for (const eval_ of evals) {
      if (processedEvaluations.has(eval_)) {
        continue
      }
      processedEvaluations.add(eval_)

      if (eval_.source_metadata.source_organization_name) {
        evaluatorNames.add(eval_.source_metadata.source_organization_name)
      }
      sourceTypes.add(eval_.source_metadata.source_type)
      if (eval_.source_metadata.evaluator_relationship === "third_party") {
        thirdPartyEvalCount += 1
      }

      // Timestamps are Unix seconds (numeric, no '-') or a date string.
      const numericTimestamp = Number(eval_.retrieved_timestamp)
      const timestamp =
        !Number.isNaN(numericTimestamp) && !eval_.retrieved_timestamp.includes("-")
          ? numericTimestamp * 1000
          : new Date(eval_.retrieved_timestamp).getTime()
      if (Number.isFinite(timestamp) && timestamp >= latestTimestamp) {
        latestTimestamp = timestamp
        latestSourceName = eval_.source_metadata.source_name
      }

      if (eval_.eval_library?.name) {
        // One entry per library name + version.
        const libraryKey = `${eval_.eval_library.name}@${eval_.eval_library.version ?? ""}`
        evalLibraries.set(libraryKey, {
          name: eval_.eval_library.name,
          version: eval_.eval_library.version,
          fork:
            typeof eval_.eval_library.additional_details?.fork === "string"
              ? eval_.eval_library.additional_details.fork
              : undefined,
        })
      }

      // source_data is either string[] (URLs) or a SourceData object.
      if (Array.isArray(eval_.source_data)) {
        // URL list: benchmark names can only come from the results.
        for (const result of eval_.evaluation_results) {
          benchmarksSet.add(getBenchmarkName(eval_, result))
        }
      } else {
        // Prefer per-result names even for object source data, because
        // dataset_name might be a suite name rather than a benchmark.
        if (eval_.evaluation_results && eval_.evaluation_results.length > 0) {
          for (const result of eval_.evaluation_results) {
            benchmarksSet.add(getBenchmarkName(eval_, result))
          }
        } else {
          benchmarksSet.add(eval_.source_data.dataset_name)
        }
      }

      if (eval_.source_metadata.source_url) {
        sourceUrls.add(eval_.source_metadata.source_url)
      }
      // A string-array source_data contributes its URLs as evidence too.
      if (Array.isArray(eval_.source_data)) {
        eval_.source_data.forEach(url => sourceUrls.add(url))
      }

      for (const result of eval_.evaluation_results) {
        if (!result.generation_config) {
          missingGenerationConfigCount += 1
        }
        if (result.detailed_evaluation_results_url) {
          detailUrls.add(result.detailed_evaluation_results_url)
        }
        allScores.push({
          benchmark: getEvaluationDisplayName(eval_, result),
          benchmarkKey: getBenchmarkName(eval_, result),
          score: result.score_details.score,
          metric: result.metric_config.evaluation_description || result.evaluation_name,
          unit: result.metric_config.unit
        })
      }
    }
  }

  // Deduplicate by display name, keeping the highest score for each.
  const scoresByBenchmark = new Map<
    string,
    { benchmark: string; benchmarkKey: string; score: number; metric: string; unit?: string }
  >()
  for (const scoreData of allScores) {
    const existing = scoresByBenchmark.get(scoreData.benchmark)
    if (!existing || scoreData.score > existing.score) {
      scoresByBenchmark.set(scoreData.benchmark, scoreData)
    }
  }

  // Category stats: number of unique benchmarks per category. Here an
  // evaluation legitimately counts in every category it is filed under.
  const categoryStats = {} as Record<CategoryType, number>
  for (const category of summary.categories_covered) {
    const evals = summary.evaluations_by_category[category] || []
    const categoryBenchmarks = new Set<string>()
    for (const eval_ of evals) {
      for (const result of eval_.evaluation_results) {
        categoryBenchmarks.add(getBenchmarkName(eval_, result))
      }
    }
    categoryStats[category] = categoryBenchmarks.size
  }

  // Top five unique benchmarks: priority first, then score, then name.
  const topScores = Array.from(scoresByBenchmark.values())
    .sort((a, b) => {
      const priorityDiff = getBenchmarkPriority(b.benchmarkKey) - getBenchmarkPriority(a.benchmarkKey)
      if (priorityDiff !== 0) {
        return priorityDiff
      }
      if (b.score !== a.score) {
        return b.score - a.score
      }
      return a.benchmark.localeCompare(b.benchmark)
    })
    .slice(0, 5)
    .map(({ benchmark, score, metric, unit }) => ({
      benchmark,
      score,
      metric,
      unit,
    }))

  // params_billions may arrive as a number or a numeric string.
  const paramsBillionsRaw = summary.model_info.additional_details?.params_billions
  const paramsBillions =
    typeof paramsBillionsRaw === "number"
      ? paramsBillionsRaw
      : typeof paramsBillionsRaw === "string"
        ? Number.parseFloat(paramsBillionsRaw)
        : null

  const reproducibilityStatus =
    missingGenerationConfigCount === 0
      ? "complete"
      : missingGenerationConfigCount === summary.total_evaluations
        ? "missing"
        : "partial"

  return {
    id: summary.model_family_id,
    route_id: summary.model_route_id,
    model_name: summary.model_family_name,
    model_id: summary.model_info.id,
    canonical_model_name: summary.model_family_name,
    developer: summary.model_info.developer ?? "",
    evaluations_count: summary.total_evaluations,
    benchmarks_count: benchmarksSet.size,
    variant_count: summary.variants.length,
    categories: summary.categories_covered,
    category_stats: categoryStats,
    latest_timestamp: summary.last_updated,
    evaluator_count: evaluatorNames.size,
    evaluator_names: Array.from(evaluatorNames).sort((a, b) => a.localeCompare(b)),
    source_type_count: sourceTypes.size,
    source_types: Array.from(sourceTypes).sort((a, b) => a.localeCompare(b)),
    evidence_count: sourceUrls.size + detailUrls.size,
    missing_generation_config_count: missingGenerationConfigCount,
    third_party_eval_count: thirdPartyEvalCount,
    independent_verification_ratio:
      summary.total_evaluations > 0 ? thirdPartyEvalCount / summary.total_evaluations : 0,
    reproducibility_status: reproducibilityStatus,
    eval_libraries: Array.from(evalLibraries.values()).sort((a, b) => a.name.localeCompare(b.name)),
    latest_source_name: latestSourceName,
    params_billions: Number.isFinite(paramsBillions ?? NaN) ? paramsBillions : null,
    reproducibility_summary: summary.reproducibility_summary,
    provenance_summary: summary.provenance_summary,
    comparability_summary: summary.comparability_summary,
    top_scores: topScores,
    source_urls: Array.from(sourceUrls),
    detail_urls: Array.from(detailUrls),
    architecture: summary.model_info.architecture,
    params: summary.model_info.parameter_count,
    inference_engine: summary.model_info.inference_engine,
    inference_platform: summary.model_info.inference_platform,
    input_modalities: summary.model_info.modalities?.input,
    output_modalities: summary.model_info.modalities?.output,
    release_date: summary.model_info.release_date,
    model_url: summary.model_info.model_url,
  }
}
/**
 * Compute per-category statistics for a model summary.
 *
 * For every covered category this reports the number of evaluation records
 * (`count`), the number of individual results (`total_results`) and the
 * plain mean of their scores (`avg_score`, 0 when there are no results).
 *
 * @param summary Model summary to analyse.
 * @returns The statistics, ordered by category name.
 */
export function getCategoryStats(
  summary: ModelSummaryCore
): {
  categories: { category: CategoryType; count: number; avg_score: number; total_results: number }[]
} {
  const categories = summary.categories_covered.map((category) => {
    const records = summary.evaluations_by_category[category] || []
    const scores: number[] = records.flatMap((record) =>
      record.evaluation_results.map((result) => result.score_details.score)
    )
    const scoreSum = scores.reduce((running, score) => running + score, 0)
    return {
      category,
      count: records.length, // evaluation records (files)
      total_results: scores.length, // individual benchmark results
      avg_score: scores.length > 0 ? scoreSum / scores.length : 0,
    }
  })

  categories.sort((left, right) => left.category.localeCompare(right.category))
  return { categories }
}
/**
 * Fetch evaluation JSON documents one after another and keep those that
 * look like evaluation records.
 *
 * A document is kept when it has truthy `schema_version`, `evaluation_id`
 * and `model_info` fields. Responses with a non-OK status are skipped
 * silently; fetch or parse failures are logged with `console.warn` and
 * skipped, so the returned promise does not reject because of a single
 * bad path.
 *
 * @param filePaths URLs/paths passed to `fetch`, in the order to load.
 * @returns The accepted evaluations, in input order.
 */
export async function loadEvaluations(
  filePaths: string[]
): Promise<BenchmarkEvaluation[]> {
  const loaded: BenchmarkEvaluation[] = []
  for (const filePath of filePaths) {
    try {
      const response = await fetch(filePath)
      if (response.ok) {
        const payload = await response.json()
        // Lightweight shape check against the expected schema.
        const looksLikeEvaluation =
          payload.schema_version && payload.evaluation_id && payload.model_info
        if (looksLikeEvaluation) {
          loaded.push(payload as BenchmarkEvaluation)
        }
      }
    } catch (error) {
      console.warn(`Failed to load evaluation from ${filePath}:`, error)
    }
  }
  return loaded
}
/**
 * Load evaluation files and turn them into one card per model family.
 *
 * @param filePaths URLs/paths of the evaluation JSON files.
 * @returns One `EvaluationCardData` per family, in grouping order.
 */
export async function processEvaluationsToCards(
  filePaths: string[]
): Promise<EvaluationCardData[]> {
  const byFamily = groupEvaluationsByModelFamily(await loadEvaluations(filePaths))
  return Object.values(byFamily).map((familyEvaluations) =>
    createEvaluationCard(createModelFamilySummary(familyEvaluations))
  )
}
/**
 * Render a score as display text.
 *
 * - `binary` scores become `'Pass'` (strictly above 0.5) or `'Fail'`.
 * - With `maxScore` of 1 the score is shown as a percentage, one decimal.
 * - With `maxScore` of 100 the score is shown with one decimal.
 * - Anything else is shown with three decimals.
 *
 * @param score Raw score.
 * @param scoreType Kind of score; only `'binary'` changes the formatting.
 * @param maxScore Upper bound of the metric's scale, when known.
 * @returns The formatted score.
 */
export function formatScore(
  score: number,
  scoreType: 'continuous' | 'discrete' | 'binary',
  maxScore?: number
): string {
  if (scoreType === 'binary') {
    return score > 0.5 ? 'Pass' : 'Fail'
  }

  switch (maxScore) {
    case 1:
      // Ratio on a 0-1 scale: present as a percentage.
      return `${(score * 100).toFixed(1)}%`
    case 100:
      return `${score.toFixed(1)}`
    default:
      return score.toFixed(3)
  }
}
/**
 * Friendly names for well-known benchmarks, compiled once into matchers.
 *
 * Each key is matched case-insensitively as a whole word: it must not be
 * directly preceded or followed by a letter, so "research" no longer counts
 * as "ARC" and "CMMLU" no longer counts as "MMLU", while "mmlu_5shot",
 * "MATH-500" and "HumanEval+" still match. A hyphen in a key also matches an
 * underscore, whitespace or nothing ("mmlu_pro", "MMLU Pro", "mmlupro").
 * Longer keys are tried first so "MMLU-Pro" wins over "MMLU".
 */
const FRIENDLY_BENCHMARK_NAME_RULES: ReadonlyArray<{ pattern: RegExp; displayName: string }> =
  Object.entries({
    'MMLU': 'Massive Multitask Language Understanding',
    'MMLU-Pro': 'MMLU Professional',
    'GSM8K': 'Grade School Math 8K',
    'HumanEval': 'Human Eval (Code)',
    'MBPP': 'Mostly Basic Python Problems',
    'HellaSwag': 'HellaSwag (Commonsense)',
    'ARC': 'AI2 Reasoning Challenge',
    'TruthfulQA': 'TruthfulQA',
    'BBH': 'Big-Bench Hard',
    'MATH': 'MATH Dataset',
  })
    .sort(([keyA], [keyB]) => keyB.length - keyA.length)
    .map(([key, displayName]) => {
      const body = key
        .split('-')
        .map((part) => part.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'))
        .join('[\\s_-]?')
      return { pattern: new RegExp(`(?<![a-z])${body}(?![a-z])`, 'i'), displayName }
    })

/**
 * Get benchmark display name.
 *
 * @param name Raw benchmark name or key.
 * @returns `'Unknown Benchmark'` for an empty/missing name, the friendly
 *   name of the first (longest-key) known benchmark that occurs in `name`
 *   as a whole word, or `name` unchanged when none does.
 */
export function getBenchmarkDisplayName(name: string | undefined | null): string {
  if (!name) return 'Unknown Benchmark'
  const rule = FRIENDLY_BENCHMARK_NAME_RULES.find(({ pattern }) => pattern.test(name))
  return rule ? rule.displayName : name
}
// ── Eval-centric grouping ─────────────────────────────────────────────────────
/**
 * Group individual benchmark results across all model files into one
 * summary per (benchmark, metric) pair, keyed by the id from
 * `getEvaluationSummaryId`. Each entry describes one benchmark metric and
 * which models ran it.
 *
 * The first result seen for a key supplies the summary's display name,
 * category and `metric_config`. After all results are collected each
 * summary is finalised with counts, the mean score (raw and normalised to
 * the metric's min/max range), source facts, and the best/worst model
 * according to `metric_config.lower_is_better`.
 *
 * Missing or empty evaluator organisation names are not recorded, and a
 * result whose `evaluation_timestamp` is not a string is ignored when
 * determining `latest_source_name` instead of raising.
 *
 * @param evaluations Evaluation records from any number of models.
 * @returns Map from summary id to its finalised summary.
 */
export function groupEvaluationsByBenchmark(
  evaluations: BenchmarkEvaluation[]
): Record<string, BenchmarkEvalSummary> {
  const summaries: Record<string, BenchmarkEvalSummary> = {}

  for (const eval_ of evaluations) {
    for (const result of eval_.evaluation_results) {
      const displayName = getEvaluationDisplayName(eval_, result)
      const evalId = getEvaluationSummaryId(eval_, result)
      const compositeBenchmarkKey = eval_.benchmark || getBenchmarkName(eval_, result)
      const compositeBenchmarkName = getBenchmarkDisplayName(compositeBenchmarkKey)

      if (!summaries[evalId]) {
        const category = inferCategoryFromBenchmark(displayName)
        // Derived statistics start at neutral values and are filled in by
        // the finalisation pass below.
        summaries[evalId] = {
          evaluation_name: displayName,
          evaluation_id: evalId,
          composite_benchmark_key: compositeBenchmarkKey,
          composite_benchmark_name: compositeBenchmarkName,
          category,
          metric_config: result.metric_config,
          model_results: [],
          models_count: 0,
          evaluator_names: [],
          source_types: [],
          latest_source_name: undefined,
          third_party_ratio: 0,
          missing_generation_config_count: 0,
          best_model: null,
          worst_model: null,
          avg_score: 0,
          avg_score_norm: 0,
        }
      }
      const summary = summaries[evalId]

      summary.model_results.push({
        model_info: eval_.model_info,
        score: result.score_details.score,
        score_details: result.score_details,
        evaluation_timestamp: result.evaluation_timestamp,
        source_metadata: eval_.source_metadata,
        // Prefer the result's own source data over the record's.
        source_data: result.source_data ?? eval_.source_data,
        result,
      })

      // Record each evaluator once; skip missing/empty names so the list
      // only ever contains real organisation names.
      const orgName = eval_.source_metadata.source_organization_name
      if (orgName && !summary.evaluator_names.includes(orgName)) {
        summary.evaluator_names.push(orgName)
      }
    }
  }

  // Finalise each summary
  for (const summary of Object.values(summaries)) {
    const results = summary.model_results
    summary.models_count = results.length

    const scores = results.map(m => m.score)
    summary.avg_score = scores.reduce((a, b) => a + b, 0) / scores.length

    summary.source_types = Array.from(
      new Set(results.map((result) => result.source_metadata.source_type))
    ).sort((a, b) => a.localeCompare(b))

    summary.third_party_ratio =
      results.filter((result) => result.source_metadata.evaluator_relationship === "third_party").length /
      results.length

    summary.missing_generation_config_count = results.filter(
      (result) => !result.result.generation_config
    ).length

    // The source of the most recent result wins; on ties the later entry wins.
    let latestTimestamp = Number.NEGATIVE_INFINITY
    for (const result of results) {
      const rawTimestamp = result.evaluation_timestamp
      if (typeof rawTimestamp !== "string") {
        // Unvalidated JSON may omit the field; do not crash on `.includes`.
        continue
      }
      const numericTimestamp = Number(rawTimestamp)
      const timestamp =
        !Number.isNaN(numericTimestamp) && !rawTimestamp.includes("-")
          ? numericTimestamp * 1000
          : new Date(rawTimestamp).getTime()
      if (Number.isFinite(timestamp) && timestamp >= latestTimestamp) {
        latestTimestamp = timestamp
        summary.latest_source_name = result.source_metadata.source_name
      }
    }

    // Normalise the mean into the metric's range (default 0..1).
    const maxScore = summary.metric_config.max_score ?? 1
    const minScore = summary.metric_config.min_score ?? 0
    const range = maxScore - minScore
    summary.avg_score_norm = range > 0 ? (summary.avg_score - minScore) / range : 0

    const lowerIsBetter = summary.metric_config.lower_is_better
    const sorted = [...results].sort((a, b) =>
      lowerIsBetter ? a.score - b.score : b.score - a.score
    )
    if (sorted.length > 0) {
      summary.best_model = { name: sorted[0].model_info.name, score: sorted[0].score }
      summary.worst_model = {
        name: sorted[sorted.length - 1].model_info.name,
        score: sorted[sorted.length - 1].score,
      }
    }
  }

  return summaries
}
/**
 * Load evaluation files and return the benchmark summaries as a flat array,
 * one per key produced by `groupEvaluationsByBenchmark`.
 *
 * @param filePaths URLs/paths of the evaluation JSON files.
 * @returns The summaries, in grouping order.
 */
export async function processEvaluationsToBenchmarkSummaries(
  filePaths: string[]
): Promise<BenchmarkEvalSummary[]> {
  const byBenchmark = groupEvaluationsByBenchmark(await loadEvaluations(filePaths))
  return Object.values(byBenchmark)
}
/**
 * Produce the lightweight list representation of a summary: a shallow copy
 * of every field except `model_results`. The input is left untouched.
 *
 * @param summary Full benchmark summary.
 * @returns The summary without its per-model results.
 */
export function toBenchmarkEvalListItem(
  summary: BenchmarkEvalSummary
): BenchmarkEvalListItem {
  const { model_results: _droppedResults, ...withoutResults } = summary
  const listItem: BenchmarkEvalListItem = withoutResults
  return listItem
}