// general-eval-card / lib / eval-processing.ts
// Author: Avijit Ghosh — commit ddfc163 ("fix data")
/**
* Processing utilities for benchmark-first evaluation data
*/
import type {
BenchmarkEvaluation,
EvaluationCardData,
CategoryType,
} from './benchmark-schema'
import type { ModelEvaluationSummary } from './benchmark-schema'
import { inferCategoryFromBenchmark, EVALUATION_CATEGORIES } from './benchmark-schema'
export type { ModelEvaluationSummary }
/**
* Group multiple evaluations by model
*/
export function groupEvaluationsByModel(
  evaluations: BenchmarkEvaluation[]
): Record<string, BenchmarkEvaluation[]> {
  // Bucket evaluations by model_info.id, preserving input order within
  // each model's group.
  const byModel: Record<string, BenchmarkEvaluation[]> = {}
  for (const evaluation of evaluations) {
    const key = evaluation.model_info.id
    const bucket = byModel[key] ?? (byModel[key] = [])
    bucket.push(evaluation)
  }
  return byModel
}
/**
* Create a model evaluation summary from grouped evaluations
*/
export function createModelSummary(
evaluations: BenchmarkEvaluation[]
): ModelEvaluationSummary {
if (evaluations.length === 0) {
throw new Error('No evaluations provided')
}
const modelInfo = evaluations[0].model_info
const evaluationsByCategory: Record<string, BenchmarkEvaluation[]> = {}
const categoriesSet = new Set<CategoryType>()
// Group by category - track which categories each evaluation belongs to
for (const eval_ of evaluations) {
const evalCategories = new Set<CategoryType>()
for (const result of eval_.evaluation_results) {
// Try to get category from factsheet first
let category: CategoryType | undefined;
if (result.factsheet?.functional_props) {
// The factsheet might contain multiple categories separated by semicolon
// We'll pick the first one that matches our known categories
const props = result.factsheet.functional_props.split(';').map(p => p.trim());
for (const prop of props) {
if (EVALUATION_CATEGORIES.includes(prop as CategoryType)) {
category = prop as CategoryType;
break;
}
}
}
// Infer category from evaluation name if not found in factsheet
if (!category) {
category = inferCategoryFromBenchmark(result.evaluation_name)
}
// Fallback to dataset name if source_data is an object
if (!category && !Array.isArray(eval_.source_data)) {
category = inferCategoryFromBenchmark(eval_.source_data.dataset_name)
}
if (category) {
evalCategories.add(category)
categoriesSet.add(category)
}
}
// Add evaluation to each unique category it belongs to (once per category)
for (const category of evalCategories) {
if (!evaluationsByCategory[category]) {
evaluationsByCategory[category] = []
}
evaluationsByCategory[category].push(eval_)
}
}
// Find latest timestamp
const timestamps = evaluations.map(e => {
const ts = e.retrieved_timestamp
// Check if it's a number (unix timestamp in seconds)
if (!isNaN(Number(ts)) && !ts.includes('-')) {
return parseFloat(ts) * 1000
}
// Assume ISO string or date string
return new Date(ts).getTime()
})
const latestTimestamp = new Date(Math.max(...timestamps)).toISOString()
// Calculate total benchmark results
const totalResults = evaluations.reduce((sum, eval_) => sum + eval_.evaluation_results.length, 0)
return {
model_info: modelInfo,
evaluations_by_category: evaluationsByCategory as Record<CategoryType, BenchmarkEvaluation[]>,
total_evaluations: totalResults,
last_updated: latestTimestamp,
categories_covered: Array.from(categoriesSet),
}
}
/**
* Convert model summary to card display format
*/
export function createEvaluationCard(
summary: ModelEvaluationSummary
): EvaluationCardData {
// Get all unique benchmarks
const benchmarksSet = new Set<string>()
const allScores: Array<{
benchmark: string
score: number
metric: string
unit?: string
}> = []
const sourceUrls = new Set<string>()
const detailUrls = new Set<string>()
// Collect all evaluations
for (const evals of Object.values(summary.evaluations_by_category)) {
for (const eval_ of evals) {
// Handle source_data as either string[] or SourceData object
if (Array.isArray(eval_.source_data)) {
// source_data is string[] (URLs), extract benchmark names from evaluation_results
for (const result of eval_.evaluation_results) {
benchmarksSet.add(result.evaluation_name)
}
} else {
// Even if source_data is an object, we should try to extract individual benchmarks
// from evaluation_results if available, as dataset_name might be a suite name.
if (eval_.evaluation_results && eval_.evaluation_results.length > 0) {
for (const result of eval_.evaluation_results) {
benchmarksSet.add(result.evaluation_name)
}
} else {
benchmarksSet.add(eval_.source_data.dataset_name)
}
}
if (eval_.source_metadata.source_url) {
sourceUrls.add(eval_.source_metadata.source_url)
}
// Add source_data URLs if it's a string array
if (Array.isArray(eval_.source_data)) {
eval_.source_data.forEach(url => sourceUrls.add(url))
}
for (const result of eval_.evaluation_results) {
if (result.detailed_evaluation_results_url) {
detailUrls.add(result.detailed_evaluation_results_url)
}
allScores.push({
benchmark: result.evaluation_name,
score: result.score_details.score,
metric: result.metric_config.evaluation_description || result.evaluation_name,
unit: result.metric_config.unit
})
}
}
}
// Deduplicate by benchmark name, keeping highest score for each
const scoresByBenchmark = new Map<string, { benchmark: string; score: number; metric: string; unit?: string }>()
for (const scoreData of allScores) {
const existing = scoresByBenchmark.get(scoreData.benchmark)
if (!existing || scoreData.score > existing.score) {
scoresByBenchmark.set(scoreData.benchmark, scoreData)
}
}
// Calculate category stats (count of unique benchmarks per category)
const categoryStats: Record<CategoryType, number> = {} as any
for (const category of summary.categories_covered) {
const evals = summary.evaluations_by_category[category] || []
const categoryBenchmarks = new Set<string>()
for (const eval_ of evals) {
if (Array.isArray(eval_.source_data)) {
for (const result of eval_.evaluation_results) {
// Only count if this result actually belongs to this category
const resultCategory = inferCategoryFromBenchmark(result.evaluation_name)
if (resultCategory === category) {
categoryBenchmarks.add(result.evaluation_name)
}
}
} else {
// For single-benchmark files, check if the file's main benchmark belongs to category
// But wait, inferCategoryFromBenchmark might have been used to categorize the whole file
// Let's just count the benchmarks in this file that match the category
for (const result of eval_.evaluation_results) {
// Determine category using the same logic as createModelSummary
let resultCategory: CategoryType | undefined;
if (result.factsheet?.functional_props) {
const props = result.factsheet.functional_props.split(';').map(p => p.trim());
for (const prop of props) {
if (EVALUATION_CATEGORIES.includes(prop as CategoryType)) {
resultCategory = prop as CategoryType;
break;
}
}
}
if (!resultCategory) {
resultCategory = inferCategoryFromBenchmark(result.evaluation_name)
}
if (resultCategory === category) {
categoryBenchmarks.add(result.evaluation_name)
}
}
}
}
categoryStats[category] = categoryBenchmarks.size
}
// Get top 5 unique benchmarks by score
const topScores = Array.from(scoresByBenchmark.values())
.sort((a, b) => b.score - a.score)
.slice(0, 5)
return {
id: summary.model_info.id,
model_name: summary.model_info.name,
model_id: summary.model_info.id,
developer: summary.model_info.developer,
evaluations_count: summary.total_evaluations,
benchmarks_count: benchmarksSet.size,
categories: summary.categories_covered,
category_stats: categoryStats,
latest_timestamp: summary.last_updated,
top_scores: topScores,
source_urls: Array.from(sourceUrls),
detail_urls: Array.from(detailUrls),
architecture: summary.model_info.architecture,
params: summary.model_info.parameter_count,
inference_engine: summary.model_info.inference_engine,
inference_platform: summary.model_info.inference_platform,
input_modalities: summary.model_info.modalities?.input,
output_modalities: summary.model_info.modalities?.output,
release_date: summary.model_info.release_date,
model_url: summary.model_info.model_url,
}
}
/**
* Get category stats for a model
*/
export function getCategoryStats(
summary: ModelEvaluationSummary
): {
categories: { category: CategoryType; count: number; avg_score: number; total_results: number }[]
} {
const categories: { category: CategoryType; count: number; avg_score: number; total_results: number }[] = []
for (const category of summary.categories_covered) {
const evals = summary.evaluations_by_category[category] || []
const allScores: number[] = []
// Collect all scores from all results in this category
for (const eval_ of evals) {
for (const result of eval_.evaluation_results) {
// Verify this result actually belongs to this category
let resultCategory: CategoryType | undefined;
if (result.factsheet?.functional_props) {
const props = result.factsheet.functional_props.split(';').map(p => p.trim());
for (const prop of props) {
if (EVALUATION_CATEGORIES.includes(prop as CategoryType)) {
resultCategory = prop as CategoryType;
break;
}
}
}
if (!resultCategory) {
resultCategory = inferCategoryFromBenchmark(result.evaluation_name)
}
// Only include scores for results that actually belong to this category
if (resultCategory === category) {
allScores.push(result.score_details.score)
}
}
}
const avgScore = allScores.length > 0
? allScores.reduce((a, b) => a + b, 0) / allScores.length
: 0
const stat = {
category,
count: evals.length, // Number of evaluation files
total_results: allScores.length, // Number of actual benchmark results
avg_score: avgScore,
}
categories.push(stat)
}
// Sort categories by name or some other metric if needed
categories.sort((a, b) => a.category.localeCompare(b.category))
return { categories }
}
/**
* Load and process evaluations from file paths
*/
/**
 * Load and parse evaluation JSON files from the given paths.
 *
 * Files are fetched in parallel (previously sequential, one await per
 * iteration). Each path is best-effort: a failed fetch, non-OK response, or
 * payload missing the minimal schema markers is skipped — with a warning on
 * thrown errors — and never rejects the overall promise. Result order
 * follows filePaths order.
 */
export async function loadEvaluations(
  filePaths: string[]
): Promise<BenchmarkEvaluation[]> {
  const results = await Promise.all(
    filePaths.map(async (path): Promise<BenchmarkEvaluation | null> => {
      try {
        const response = await fetch(path)
        if (!response.ok) return null
        const data = await response.json()
        // Minimal shape check against our schema before trusting the payload
        if (data.schema_version && data.evaluation_id && data.model_info) {
          return data as BenchmarkEvaluation
        }
        return null
      } catch (error) {
        console.warn(`Failed to load evaluation from ${path}:`, error)
        return null
      }
    })
  )
  return results.filter((e): e is BenchmarkEvaluation => e !== null)
}
/**
* Process all evaluations into card data
*/
/**
 * Process all evaluation files into card data — one card per model.
 */
export async function processEvaluationsToCards(
  filePaths: string[]
): Promise<EvaluationCardData[]> {
  const evaluations = await loadEvaluations(filePaths)
  const grouped = groupEvaluationsByModel(evaluations)
  // Summarize each model's evaluations, then render the summary as a card
  return Object.values(grouped).map(modelEvals =>
    createEvaluationCard(createModelSummary(modelEvals))
  )
}
/**
* Format score with proper precision
*/
/**
 * Format a score for display.
 *
 * - binary scores render as 'Pass'/'Fail' (threshold 0.5)
 * - scores on a 0..1 scale (maxScore === 1.0) render as a percentage with
 *   one decimal
 * - scores on a 0..100 scale render with one decimal
 * - anything else renders with three decimals
 */
export function formatScore(
  score: number,
  scoreType: 'continuous' | 'discrete' | 'binary',
  maxScore?: number
): string {
  if (scoreType === 'binary') {
    return score > 0.5 ? 'Pass' : 'Fail'
  }
  switch (maxScore) {
    case 1.0:
      // Ratio on a unit scale — show as a percentage
      return `${(score * 100).toFixed(1)}%`
    case 100:
      return score.toFixed(1)
    default:
      return score.toFixed(3)
  }
}
/**
* Get benchmark display name
*/
/**
 * Get a friendly display name for a benchmark.
 *
 * Matches known benchmark identifiers case-insensitively as substrings,
 * checking longer keys first so that specific names (e.g. 'MMLU-Pro') are
 * not shadowed by shorter prefixes (e.g. 'MMLU') — previously insertion
 * order made 'MMLU-Pro' unreachable. Falls back to the raw name, or
 * 'Unknown Benchmark' for empty input.
 */
export function getBenchmarkDisplayName(name: string | undefined | null): string {
  if (!name) return 'Unknown Benchmark'
  // Map common benchmarks to friendly names
  const mapping: Record<string, string> = {
    'MMLU': 'Massive Multitask Language Understanding',
    'MMLU-Pro': 'MMLU Professional',
    'GSM8K': 'Grade School Math 8K',
    'HumanEval': 'Human Eval (Code)',
    'MBPP': 'Mostly Basic Python Problems',
    'HellaSwag': 'HellaSwag (Commonsense)',
    'ARC': 'AI2 Reasoning Challenge',
    'TruthfulQA': 'TruthfulQA',
    'BBH': 'Big-Bench Hard',
    'MATH': 'MATH Dataset',
  }
  const upper = name.toUpperCase()
  // Longest key first — most specific match wins
  const entries = Object.entries(mapping).sort((a, b) => b[0].length - a[0].length)
  for (const [key, value] of entries) {
    if (upper.includes(key.toUpperCase())) {
      return value
    }
  }
  return name
}