/**
 * Processing utilities for benchmark-first evaluation data
 */

import type {
  BenchmarkEvaluation,
  EvaluationCardData,
  CategoryType,
} from './benchmark-schema'
import type { ModelEvaluationSummary } from './benchmark-schema'
import { inferCategoryFromBenchmark, EVALUATION_CATEGORIES } from './benchmark-schema'

export type { ModelEvaluationSummary }

/** A single entry of `BenchmarkEvaluation.evaluation_results`. */
type EvaluationResult = BenchmarkEvaluation['evaluation_results'][number]

/** Score record collected per benchmark result for card display. */
interface ScoreEntry {
  benchmark: string
  score: number
  metric: string
  unit?: string
}

/**
 * Resolve the category of one benchmark result.
 *
 * Resolution order (shared by every function in this module so that grouping
 * and statistics can never disagree):
 *   1. the first `;`-separated entry of `factsheet.functional_props` that is a
 *      member of `EVALUATION_CATEGORIES`;
 *   2. `inferCategoryFromBenchmark(result.evaluation_name)`;
 *   3. `inferCategoryFromBenchmark(source_data.dataset_name)` when
 *      `source_data` is an object rather than an array.
 *
 * @returns the resolved category, or `undefined` when none of the steps match
 */
function resolveResultCategory(
  evaluation: BenchmarkEvaluation,
  result: EvaluationResult
): CategoryType | undefined {
  const functionalProps = result.factsheet?.functional_props
  if (functionalProps) {
    for (const rawProp of functionalProps.split(';')) {
      const prop = rawProp.trim()
      if (EVALUATION_CATEGORIES.includes(prop as CategoryType)) {
        return prop as CategoryType
      }
    }
  }

  const fromName = inferCategoryFromBenchmark(result.evaluation_name)
  if (fromName) {
    return fromName
  }

  if (!Array.isArray(evaluation.source_data)) {
    return inferCategoryFromBenchmark(evaluation.source_data.dataset_name) || undefined
  }
  return undefined
}

/**
 * Parse a `retrieved_timestamp` value into epoch milliseconds.
 *
 * A number, or a numeric string without a `-`, is treated as a unix timestamp
 * in seconds; any other string is handed to `Date` parsing.
 *
 * @returns epoch milliseconds, or `NaN` when the value cannot be parsed
 */
function parseTimestampMs(raw: unknown): number {
  if (typeof raw === 'number') {
    return raw * 1000
  }
  if (typeof raw !== 'string') {
    return NaN
  }
  if (!isNaN(Number(raw)) && !raw.includes('-')) {
    return parseFloat(raw) * 1000
  }
  return new Date(raw).getTime()
}

/**
 * Group multiple evaluations by model
 *
 * @param evaluations evaluations to group
 * @returns a plain object keyed by `model_info.id`; each value lists that
 *          model's evaluations in input order
 */
export function groupEvaluationsByModel(
  evaluations: BenchmarkEvaluation[]
): Record<string, BenchmarkEvaluation[]> {
  // Group in a Map so ids that collide with Object.prototype members
  // (e.g. "constructor") are handled like any other id.
  const byModel = new Map<string, BenchmarkEvaluation[]>()
  for (const evaluation of evaluations) {
    const modelId = evaluation.model_info.id
    const bucket = byModel.get(modelId)
    if (bucket) {
      bucket.push(evaluation)
    } else {
      byModel.set(modelId, [evaluation])
    }
  }
  return Object.fromEntries(byModel)
}

/**
 * Create a model evaluation summary from grouped evaluations
 *
 * `model_info` is taken from the first evaluation. Each evaluation is listed
 * once under every category that at least one of its results resolves to.
 * `total_evaluations` is the total number of benchmark results, not files.
 *
 * @param evaluations evaluations of a single model; must not be empty
 * @throws Error when `evaluations` is empty
 * @throws RangeError when no evaluation carries a parseable timestamp
 */
export function createModelSummary(
  evaluations: BenchmarkEvaluation[]
): ModelEvaluationSummary {
  const first = evaluations[0]
  if (first === undefined) {
    throw new Error('No evaluations provided')
  }

  const evaluationsByCategory: Partial<Record<CategoryType, BenchmarkEvaluation[]>> = {}
  const categoriesCovered = new Set<CategoryType>()

  for (const evaluation of evaluations) {
    // Categories touched by this evaluation, so it is added once per category.
    const evalCategories = new Set<CategoryType>()
    for (const result of evaluation.evaluation_results) {
      const category = resolveResultCategory(evaluation, result)
      if (category) {
        evalCategories.add(category)
        categoriesCovered.add(category)
      }
    }

    for (const category of evalCategories) {
      let bucket = evaluationsByCategory[category]
      if (!bucket) {
        bucket = []
        evaluationsByCategory[category] = bucket
      }
      bucket.push(evaluation)
    }
  }

  // Find the latest timestamp. Unparseable values are skipped instead of
  // poisoning the maximum with NaN (which would make toISOString throw).
  let latestMs = -Infinity
  for (const evaluation of evaluations) {
    const ms = parseTimestampMs(evaluation.retrieved_timestamp)
    if (Number.isNaN(new Date(ms).getTime())) {
      continue
    }
    if (ms > latestMs) {
      latestMs = ms
    }
  }
  if (!Number.isFinite(latestMs)) {
    throw new RangeError(
      `No valid retrieved_timestamp found for model ${first.model_info.id}`
    )
  }

  // Total number of benchmark results across all evaluations.
  const totalResults = evaluations.reduce(
    (sum, evaluation) => sum + evaluation.evaluation_results.length,
    0
  )

  return {
    model_info: first.model_info,
    evaluations_by_category: evaluationsByCategory as Record<CategoryType, BenchmarkEvaluation[]>,
    total_evaluations: totalResults,
    last_updated: new Date(latestMs).toISOString(),
    categories_covered: Array.from(categoriesCovered),
  }
}

/**
 * Convert model summary to card display format
 *
 * Collects unique benchmark names, source/detail URLs and, per benchmark, the
 * highest score seen. `category_stats` holds the number of unique benchmark
 * names per covered category, and `top_scores` the five highest-scoring
 * benchmarks (raw score comparison, no normalisation across metrics).
 */
export function createEvaluationCard(
  summary: ModelEvaluationSummary
): EvaluationCardData {
  const benchmarkNames = new Set<string>()
  const sourceUrls = new Set<string>()
  const detailUrls = new Set<string>()
  // Highest-scoring entry per benchmark name; the first one seen wins ties.
  const bestByBenchmark = new Map<string, ScoreEntry>()

  for (const evals of Object.values(summary.evaluations_by_category)) {
    for (const evaluation of evals) {
      const results = evaluation.evaluation_results

      if (results.length > 0) {
        // Prefer individual result names: dataset_name might be a suite name.
        for (const result of results) {
          benchmarkNames.add(result.evaluation_name)
        }
      } else if (!Array.isArray(evaluation.source_data)) {
        benchmarkNames.add(evaluation.source_data.dataset_name)
      }

      if (evaluation.source_metadata.source_url) {
        sourceUrls.add(evaluation.source_metadata.source_url)
      }
      // When source_data is a string array its entries are treated as URLs.
      if (Array.isArray(evaluation.source_data)) {
        for (const url of evaluation.source_data) {
          sourceUrls.add(url)
        }
      }

      for (const result of results) {
        if (result.detailed_evaluation_results_url) {
          detailUrls.add(result.detailed_evaluation_results_url)
        }

        const entry: ScoreEntry = {
          benchmark: result.evaluation_name,
          score: result.score_details.score,
          metric: result.metric_config.evaluation_description || result.evaluation_name,
          unit: result.metric_config.unit,
        }
        const existing = bestByBenchmark.get(entry.benchmark)
        if (!existing || entry.score > existing.score) {
          bestByBenchmark.set(entry.benchmark, entry)
        }
      }
    }
  }

  // Count unique benchmarks per category, resolving each result's category
  // with the same rules createModelSummary used to build the grouping.
  const categoryStats = {} as Record<CategoryType, number>
  for (const category of summary.categories_covered) {
    const namesInCategory = new Set<string>()
    for (const evaluation of summary.evaluations_by_category[category] ?? []) {
      for (const result of evaluation.evaluation_results) {
        if (resolveResultCategory(evaluation, result) === category) {
          namesInCategory.add(result.evaluation_name)
        }
      }
    }
    categoryStats[category] = namesInCategory.size
  }

  // Top 5 unique benchmarks by score.
  const topScores = Array.from(bestByBenchmark.values())
    .sort((a, b) => b.score - a.score)
    .slice(0, 5)

  const model = summary.model_info
  return {
    id: model.id,
    model_name: model.name,
    model_id: model.id,
    developer: model.developer,
    evaluations_count: summary.total_evaluations,
    benchmarks_count: benchmarkNames.size,
    categories: summary.categories_covered,
    category_stats: categoryStats,
    latest_timestamp: summary.last_updated,
    top_scores: topScores,
    source_urls: Array.from(sourceUrls),
    detail_urls: Array.from(detailUrls),
    architecture: model.architecture,
    params: model.parameter_count,
    inference_engine: model.inference_engine,
    inference_platform: model.inference_platform,
    input_modalities: model.modalities?.input,
    output_modalities: model.modalities?.output,
    release_date: model.release_date,
    model_url: model.model_url,
  }
}

/**
 * Get category stats for a model
 *
 * For every covered category: `count` is the number of evaluations listed
 * under it, `total_results` the number of results that resolve to it, and
 * `avg_score` the arithmetic mean of those results' scores (0 when there are
 * none). The returned list is sorted by category name.
 */
export function getCategoryStats(
  summary: ModelEvaluationSummary
): { categories: { category: CategoryType; count: number; avg_score: number; total_results: number }[] } {
  const categories: { category: CategoryType; count: number; avg_score: number; total_results: number }[] = []

  for (const category of summary.categories_covered) {
    const evals = summary.evaluations_by_category[category] ?? []

    // An evaluation may hold results from several categories; only the
    // results that resolve to this category contribute to its average.
    let scoreSum = 0
    let scoreCount = 0
    for (const evaluation of evals) {
      for (const result of evaluation.evaluation_results) {
        if (resolveResultCategory(evaluation, result) === category) {
          scoreSum += result.score_details.score
          scoreCount += 1
        }
      }
    }

    categories.push({
      category,
      count: evals.length, // Number of evaluation files
      total_results: scoreCount, // Number of actual benchmark results
      avg_score: scoreCount > 0 ? scoreSum / scoreCount : 0,
    })
  }

  categories.sort((a, b) => a.category.localeCompare(b.category))

  return { categories }
}

/**
 * Minimal structural check applied to fetched JSON before it is treated as a
 * `BenchmarkEvaluation`. Only the fields this module dereferences
 * unconditionally are verified; it is not a full schema validation.
 */
function isBenchmarkEvaluation(data: unknown): data is BenchmarkEvaluation {
  if (typeof data !== 'object' || data === null) {
    return false
  }
  const candidate = data as Record<string, unknown>
  return (
    Boolean(candidate['schema_version']) &&
    Boolean(candidate['evaluation_id']) &&
    Boolean(candidate['model_info']) &&
    Array.isArray(candidate['evaluation_results'])
  )
}

/**
 * Fetch and validate a single evaluation file.
 *
 * @returns the evaluation, or `undefined` (after logging a warning) when the
 *          request fails, the response is not OK, or the payload is invalid
 */
async function loadEvaluationFile(path: string): Promise<BenchmarkEvaluation | undefined> {
  try {
    const response = await fetch(path)
    if (!response.ok) {
      console.warn(`Skipping evaluation ${path}: HTTP ${response.status}`)
      return undefined
    }

    const data: unknown = await response.json()
    if (!isBenchmarkEvaluation(data)) {
      console.warn(`Skipping evaluation ${path}: payload does not match the expected schema`)
      return undefined
    }
    return data
  } catch (error) {
    console.warn(`Failed to load evaluation from ${path}:`, error)
    return undefined
  }
}

/**
 * Load and process evaluations from file paths
 *
 * Files are fetched in parallel. Files that cannot be loaded or fail the
 * structural check are skipped with a warning; the remaining evaluations are
 * returned in the order of `filePaths`.
 */
export async function loadEvaluations(
  filePaths: string[]
): Promise<BenchmarkEvaluation[]> {
  const loaded = await Promise.all(filePaths.map((path) => loadEvaluationFile(path)))
  return loaded.filter(
    (evaluation): evaluation is BenchmarkEvaluation => evaluation !== undefined
  )
}

/**
 * Process all evaluations into card data
 *
 * Loads the files, groups them by model and builds one card per model.
 */
export async function processEvaluationsToCards(
  filePaths: string[]
): Promise<EvaluationCardData[]> {
  const evaluations = await loadEvaluations(filePaths)
  const grouped = groupEvaluationsByModel(evaluations)

  return Object.values(grouped).map((modelEvals) =>
    createEvaluationCard(createModelSummary(modelEvals))
  )
}

/**
 * Format score with proper precision
 *
 * - `binary`: `'Pass'` when the score is above 0.5, otherwise `'Fail'`
 * - `maxScore === 1`: rendered as a percentage with one decimal
 * - `maxScore === 100`: one decimal, no suffix
 * - otherwise: three decimals
 */
export function formatScore(
  score: number,
  scoreType: 'continuous' | 'discrete' | 'binary',
  maxScore?: number
): string {
  if (scoreType === 'binary') {
    return score > 0.5 ? 'Pass' : 'Fail'
  }

  switch (maxScore) {
    case 1:
      // It's a percentage/ratio
      return `${(score * 100).toFixed(1)}%`
    case 100:
      return score.toFixed(1)
    default:
      return score.toFixed(3)
  }
}

/** Friendly display names for common benchmarks, keyed by benchmark id. */
const BENCHMARK_DISPLAY_NAMES: Record<string, string> = {
  'MMLU': 'Massive Multitask Language Understanding',
  'MMLU-Pro': 'MMLU Professional',
  'GSM8K': 'Grade School Math 8K',
  'HumanEval': 'Human Eval (Code)',
  'MBPP': 'Mostly Basic Python Problems',
  'HellaSwag': 'HellaSwag (Commonsense)',
  'ARC': 'AI2 Reasoning Challenge',
  'TruthfulQA': 'TruthfulQA',
  'BBH': 'Big-Bench Hard',
  'MATH': 'MATH Dataset',
}

/**
 * Upper-cased lookup keys ordered longest first, so a more specific key
 * ('MMLU-Pro') is tested before a key it contains ('MMLU'). Keys of equal
 * length keep their declaration order.
 */
const BENCHMARK_DISPLAY_MATCHERS = Object.entries(BENCHMARK_DISPLAY_NAMES)
  .map(([key, label]) => ({ needle: key.toUpperCase(), label }))
  .sort((a, b) => b.needle.length - a.needle.length)

/**
 * Get benchmark display name
 *
 * Matching is a case-insensitive substring test, so short keys can also match
 * inside unrelated names (e.g. 'ARC' inside a longer word).
 *
 * @param name raw benchmark name
 * @returns the friendly name of the longest matching known benchmark,
 *          `'Unknown Benchmark'` for an empty/missing name, otherwise `name`
 */
export function getBenchmarkDisplayName(name: string | undefined | null): string {
  if (!name) return 'Unknown Benchmark'

  const upperName = name.toUpperCase()
  for (const { needle, label } of BENCHMARK_DISPLAY_MATCHERS) {
    if (upperName.includes(needle)) {
      return label
    }
  }

  return name
}