// general-eval-card / lib / eval-processing.ts
// Author: Avijit Ghosh — commit ddfc163 ("fix data")
/**
* Processing utilities for benchmark-first evaluation data
*/
import type {
BenchmarkEvaluation,
EvaluationCardData,
CategoryType,
} from './benchmark-schema'
import type { ModelEvaluationSummary } from './benchmark-schema'
import { inferCategoryFromBenchmark, EVALUATION_CATEGORIES } from './benchmark-schema'
export type { ModelEvaluationSummary }
/**
* Group multiple evaluations by model
*/
export function groupEvaluationsByModel(
  evaluations: BenchmarkEvaluation[]
): Record<string, BenchmarkEvaluation[]> {
  // Bucket evaluations by model_info.id, preserving input order within
  // each model's group.
  const byModel: Record<string, BenchmarkEvaluation[]> = {}
  for (const evaluation of evaluations) {
    const key = evaluation.model_info.id
    const bucket = byModel[key] ?? (byModel[key] = [])
    bucket.push(evaluation)
  }
  return byModel
}
/**
* Create a model evaluation summary from grouped evaluations
*/
export function createModelSummary(
evaluations: BenchmarkEvaluation[]
): ModelEvaluationSummary {
if (evaluations.length === 0) {
throw new Error('No evaluations provided')
}
const modelInfo = evaluations[0].model_info
const evaluationsByCategory: Record<string, BenchmarkEvaluation[]> = {}
const categoriesSet = new Set<CategoryType>()
// Group by category - track which categories each evaluation belongs to
for (const eval_ of evaluations) {
const evalCategories = new Set<CategoryType>()
for (const result of eval_.evaluation_results) {
// Try to get category from factsheet first
let category: CategoryType | undefined;
if (result.factsheet?.functional_props) {
// The factsheet might contain multiple categories separated by semicolon
// We'll pick the first one that matches our known categories
const props = result.factsheet.functional_props.split(';').map(p => p.trim());
for (const prop of props) {
if (EVALUATION_CATEGORIES.includes(prop as CategoryType)) {
category = prop as CategoryType;
break;
}
}
}
// Infer category from evaluation name if not found in factsheet
if (!category) {
category = inferCategoryFromBenchmark(result.evaluation_name)
}
// Fallback to dataset name if source_data is an object
if (!category && !Array.isArray(eval_.source_data)) {
category = inferCategoryFromBenchmark(eval_.source_data.dataset_name)
}
if (category) {
evalCategories.add(category)
categoriesSet.add(category)
}
}
// Add evaluation to each unique category it belongs to (once per category)
for (const category of evalCategories) {
if (!evaluationsByCategory[category]) {
evaluationsByCategory[category] = []
}
evaluationsByCategory[category].push(eval_)
}
}
// Find latest timestamp
const timestamps = evaluations.map(e => {
const ts = e.retrieved_timestamp
// Check if it's a number (unix timestamp in seconds)
if (!isNaN(Number(ts)) && !ts.includes('-')) {
return parseFloat(ts) * 1000
}
// Assume ISO string or date string
return new Date(ts).getTime()
})
const latestTimestamp = new Date(Math.max(...timestamps)).toISOString()
// Calculate total benchmark results
const totalResults = evaluations.reduce((sum, eval_) => sum + eval_.evaluation_results.length, 0)
return {
model_info: modelInfo,
evaluations_by_category: evaluationsByCategory as Record<CategoryType, BenchmarkEvaluation[]>,
total_evaluations: totalResults,
last_updated: latestTimestamp,
categories_covered: Array.from(categoriesSet),
}
}
/**
* Convert model summary to card display format
*/
export function createEvaluationCard(
summary: ModelEvaluationSummary
): EvaluationCardData {
// Get all unique benchmarks
const benchmarksSet = new Set<string>()
const allScores: Array<{
benchmark: string
score: number
metric: string
unit?: string
}> = []
const sourceUrls = new Set<string>()
const detailUrls = new Set<string>()
// Collect all evaluations
for (const evals of Object.values(summary.evaluations_by_category)) {
for (const eval_ of evals) {
// Handle source_data as either string[] or SourceData object
if (Array.isArray(eval_.source_data)) {
// source_data is string[] (URLs), extract benchmark names from evaluation_results
for (const result of eval_.evaluation_results) {
benchmarksSet.add(result.evaluation_name)
}
} else {
// Even if source_data is an object, we should try to extract individual benchmarks
// from evaluation_results if available, as dataset_name might be a suite name.
if (eval_.evaluation_results && eval_.evaluation_results.length > 0) {
for (const result of eval_.evaluation_results) {
benchmarksSet.add(result.evaluation_name)
}
} else {
benchmarksSet.add(eval_.source_data.dataset_name)
}
}
if (eval_.source_metadata.source_url) {
sourceUrls.add(eval_.source_metadata.source_url)
}
// Add source_data URLs if it's a string array
if (Array.isArray(eval_.source_data)) {
eval_.source_data.forEach(url => sourceUrls.add(url))
}
for (const result of eval_.evaluation_results) {
if (result.detailed_evaluation_results_url) {
detailUrls.add(result.detailed_evaluation_results_url)
}
allScores.push({
benchmark: result.evaluation_name,
score: result.score_details.score,
metric: result.metric_config.evaluation_description || result.evaluation_name,
unit: result.metric_config.unit
})
}
}
}
// Deduplicate by benchmark name, keeping highest score for each
const scoresByBenchmark = new Map<string, { benchmark: string; score: number; metric: string; unit?: string }>()
for (const scoreData of allScores) {
const existing = scoresByBenchmark.get(scoreData.benchmark)
if (!existing || scoreData.score > existing.score) {
scoresByBenchmark.set(scoreData.benchmark, scoreData)
}
}
// Calculate category stats (count of unique benchmarks per category)
const categoryStats: Record<CategoryType, number> = {} as any
for (const category of summary.categories_covered) {
const evals = summary.evaluations_by_category[category] || []
const categoryBenchmarks = new Set<string>()
for (const eval_ of evals) {
if (Array.isArray(eval_.source_data)) {
for (const result of eval_.evaluation_results) {
// Only count if this result actually belongs to this category
const resultCategory = inferCategoryFromBenchmark(result.evaluation_name)
if (resultCategory === category) {
categoryBenchmarks.add(result.evaluation_name)
}
}
} else {
// For single-benchmark files, check if the file's main benchmark belongs to category
// But wait, inferCategoryFromBenchmark might have been used to categorize the whole file
// Let's just count the benchmarks in this file that match the category
for (const result of eval_.evaluation_results) {
// Determine category using the same logic as createModelSummary
let resultCategory: CategoryType | undefined;
if (result.factsheet?.functional_props) {
const props = result.factsheet.functional_props.split(';').map(p => p.trim());
for (const prop of props) {
if (EVALUATION_CATEGORIES.includes(prop as CategoryType)) {
resultCategory = prop as CategoryType;
break;
}
}
}
if (!resultCategory) {
resultCategory = inferCategoryFromBenchmark(result.evaluation_name)
}
if (resultCategory === category) {
categoryBenchmarks.add(result.evaluation_name)
}
}
}
}
categoryStats[category] = categoryBenchmarks.size
}
// Get top 5 unique benchmarks by score
const topScores = Array.from(scoresByBenchmark.values())
.sort((a, b) => b.score - a.score)
.slice(0, 5)
return {
id: summary.model_info.id,
model_name: summary.model_info.name,
model_id: summary.model_info.id,
developer: summary.model_info.developer,
evaluations_count: summary.total_evaluations,
benchmarks_count: benchmarksSet.size,
categories: summary.categories_covered,
category_stats: categoryStats,
latest_timestamp: summary.last_updated,
top_scores: topScores,
source_urls: Array.from(sourceUrls),
detail_urls: Array.from(detailUrls),
architecture: summary.model_info.architecture,
params: summary.model_info.parameter_count,
inference_engine: summary.model_info.inference_engine,
inference_platform: summary.model_info.inference_platform,
input_modalities: summary.model_info.modalities?.input,
output_modalities: summary.model_info.modalities?.output,
release_date: summary.model_info.release_date,
model_url: summary.model_info.model_url,
}
}
/**
* Get category stats for a model
*/
export function getCategoryStats(
summary: ModelEvaluationSummary
): {
categories: { category: CategoryType; count: number; avg_score: number; total_results: number }[]
} {
const categories: { category: CategoryType; count: number; avg_score: number; total_results: number }[] = []
for (const category of summary.categories_covered) {
const evals = summary.evaluations_by_category[category] || []
const allScores: number[] = []
// Collect all scores from all results in this category
for (const eval_ of evals) {
for (const result of eval_.evaluation_results) {
// Verify this result actually belongs to this category
let resultCategory: CategoryType | undefined;
if (result.factsheet?.functional_props) {
const props = result.factsheet.functional_props.split(';').map(p => p.trim());
for (const prop of props) {
if (EVALUATION_CATEGORIES.includes(prop as CategoryType)) {
resultCategory = prop as CategoryType;
break;
}
}
}
if (!resultCategory) {
resultCategory = inferCategoryFromBenchmark(result.evaluation_name)
}
// Only include scores for results that actually belong to this category
if (resultCategory === category) {
allScores.push(result.score_details.score)
}
}
}
const avgScore = allScores.length > 0
? allScores.reduce((a, b) => a + b, 0) / allScores.length
: 0
const stat = {
category,
count: evals.length, // Number of evaluation files
total_results: allScores.length, // Number of actual benchmark results
avg_score: avgScore,
}
categories.push(stat)
}
// Sort categories by name or some other metric if needed
categories.sort((a, b) => a.category.localeCompare(b.category))
return { categories }
}
/**
* Load and process evaluations from file paths
*/
/**
 * Load and parse evaluation JSON files from the given paths.
 *
 * Files are fetched in parallel (previously sequential, one await per
 * iteration). Each path is best-effort: a failed fetch, non-OK response, or
 * payload missing the minimal schema markers is skipped — with a warning on
 * thrown errors — and never rejects the overall promise. Result order
 * follows filePaths order.
 */
export async function loadEvaluations(
  filePaths: string[]
): Promise<BenchmarkEvaluation[]> {
  const results = await Promise.all(
    filePaths.map(async (path): Promise<BenchmarkEvaluation | null> => {
      try {
        const response = await fetch(path)
        if (!response.ok) return null
        const data = await response.json()
        // Minimal shape check against our schema before trusting the payload
        if (data.schema_version && data.evaluation_id && data.model_info) {
          return data as BenchmarkEvaluation
        }
        return null
      } catch (error) {
        console.warn(`Failed to load evaluation from ${path}:`, error)
        return null
      }
    })
  )
  return results.filter((e): e is BenchmarkEvaluation => e !== null)
}
/**
* Process all evaluations into card data
*/
/**
 * Process all evaluation files into card data — one card per model.
 */
export async function processEvaluationsToCards(
  filePaths: string[]
): Promise<EvaluationCardData[]> {
  const evaluations = await loadEvaluations(filePaths)
  const grouped = groupEvaluationsByModel(evaluations)
  // Summarize each model's evaluations, then render the summary as a card
  return Object.values(grouped).map(modelEvals =>
    createEvaluationCard(createModelSummary(modelEvals))
  )
}
/**
* Format score with proper precision
*/
/**
 * Format a score for display.
 *
 * - binary scores render as 'Pass'/'Fail' (threshold 0.5)
 * - scores on a 0..1 scale (maxScore === 1.0) render as a percentage with
 *   one decimal
 * - scores on a 0..100 scale render with one decimal
 * - anything else renders with three decimals
 */
export function formatScore(
  score: number,
  scoreType: 'continuous' | 'discrete' | 'binary',
  maxScore?: number
): string {
  if (scoreType === 'binary') {
    return score > 0.5 ? 'Pass' : 'Fail'
  }
  switch (maxScore) {
    case 1.0:
      // Ratio on a unit scale — show as a percentage
      return `${(score * 100).toFixed(1)}%`
    case 100:
      return score.toFixed(1)
    default:
      return score.toFixed(3)
  }
}
/**
* Get benchmark display name
*/
/**
 * Get a friendly display name for a benchmark.
 *
 * Matches known benchmark identifiers case-insensitively as substrings,
 * checking longer keys first so that specific names (e.g. 'MMLU-Pro') are
 * not shadowed by shorter prefixes (e.g. 'MMLU') — previously insertion
 * order made 'MMLU-Pro' unreachable. Falls back to the raw name, or
 * 'Unknown Benchmark' for empty input.
 */
export function getBenchmarkDisplayName(name: string | undefined | null): string {
  if (!name) return 'Unknown Benchmark'
  // Map common benchmarks to friendly names
  const mapping: Record<string, string> = {
    'MMLU': 'Massive Multitask Language Understanding',
    'MMLU-Pro': 'MMLU Professional',
    'GSM8K': 'Grade School Math 8K',
    'HumanEval': 'Human Eval (Code)',
    'MBPP': 'Mostly Basic Python Problems',
    'HellaSwag': 'HellaSwag (Commonsense)',
    'ARC': 'AI2 Reasoning Challenge',
    'TruthfulQA': 'TruthfulQA',
    'BBH': 'Big-Bench Hard',
    'MATH': 'MATH Dataset',
  }
  const upper = name.toUpperCase()
  // Longest key first — most specific match wins
  const entries = Object.entries(mapping).sort((a, b) => b[0].length - a[0].length)
  for (const [key, value] of entries) {
    if (upper.includes(key.toUpperCase())) {
      return value
    }
  }
  return name
}