/**
* Processing utilities for benchmark-first evaluation data
*/
import type {
  BenchmarkEvaluation,
  EvaluationCardData,
  CategoryType,
  ModelEvaluationSummary,
} from './benchmark-schema'
import { inferCategoryFromBenchmark, EVALUATION_CATEGORIES } from './benchmark-schema'

export type { ModelEvaluationSummary }
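
/**
 * Resolve the category for a single benchmark result.
 *
 * Shared by the summary, card, and stats builders below so they all apply
 * the same rule: prefer an explicit category from the factsheet's
 * functional_props (which may list several categories separated by
 * semicolons), then fall back to inferring one from the benchmark name.
 */
function resolveResultCategory(
  result: BenchmarkEvaluation['evaluation_results'][number]
): CategoryType | undefined {
  if (result.factsheet?.functional_props) {
    const props = result.factsheet.functional_props.split(';').map(p => p.trim())
    for (const prop of props) {
      if (EVALUATION_CATEGORIES.includes(prop as CategoryType)) {
        return prop as CategoryType
      }
    }
  }
  return inferCategoryFromBenchmark(result.evaluation_name)
}
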
/**
* Group multiple evaluations by model
*/
export function groupEvaluationsByModel(
evaluations: BenchmarkEvaluation[]
): Record<string, BenchmarkEvaluation[]> {
const grouped: Record<string, BenchmarkEvaluation[]> = {}
for (const eval_ of evaluations) {
const modelId = eval_.model_info.id
if (!grouped[modelId]) {
grouped[modelId] = []
}
grouped[modelId].push(eval_)
}
return grouped
}
/**
* Create a model evaluation summary from grouped evaluations
*/
export function createModelSummary(
evaluations: BenchmarkEvaluation[]
): ModelEvaluationSummary {
if (evaluations.length === 0) {
throw new Error('No evaluations provided')
}
const modelInfo = evaluations[0].model_info
const evaluationsByCategory: Record<string, BenchmarkEvaluation[]> = {}
const categoriesSet = new Set<CategoryType>()
// Group by category - track which categories each evaluation belongs to
for (const eval_ of evaluations) {
const evalCategories = new Set<CategoryType>()
for (const result of eval_.evaluation_results) {
      // Resolve via the factsheet first, then the benchmark name
      // (see resolveResultCategory above)
      let category = resolveResultCategory(result)
// Fallback to dataset name if source_data is an object
if (!category && !Array.isArray(eval_.source_data)) {
category = inferCategoryFromBenchmark(eval_.source_data.dataset_name)
}
if (category) {
evalCategories.add(category)
categoriesSet.add(category)
}
}
// Add evaluation to each unique category it belongs to (once per category)
for (const category of evalCategories) {
if (!evaluationsByCategory[category]) {
evaluationsByCategory[category] = []
}
evaluationsByCategory[category].push(eval_)
}
}
  // Find the latest timestamp, skipping any values that fail to parse
  const timestamps = evaluations
    .map(e => {
      const ts = e.retrieved_timestamp
      // Numeric strings without a dash are unix timestamps in seconds
      if (!isNaN(Number(ts)) && !ts.includes('-')) {
        return parseFloat(ts) * 1000
      }
      // Otherwise assume an ISO or other parseable date string
      return new Date(ts).getTime()
    })
    .filter(t => Number.isFinite(t))
  const latestTimestamp = timestamps.length > 0
    ? new Date(Math.max(...timestamps)).toISOString()
    : evaluations[0].retrieved_timestamp
// Calculate total benchmark results
const totalResults = evaluations.reduce((sum, eval_) => sum + eval_.evaluation_results.length, 0)
return {
model_info: modelInfo,
evaluations_by_category: evaluationsByCategory as Record<CategoryType, BenchmarkEvaluation[]>,
total_evaluations: totalResults,
last_updated: latestTimestamp,
categories_covered: Array.from(categoriesSet),
}
}
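
// For illustration, a summary built from several evaluation files for one
// model might look like this (values are made up; the category names depend
// on EVALUATION_CATEGORIES):
//
//   {
//     model_info: { id: 'org/model-x', name: 'Model X', ... },
//     evaluations_by_category: { reasoning: [...], coding: [...] },
//     total_evaluations: 12,   // benchmark results across all files
//     last_updated: '2024-05-01T12:00:00.000Z',
//     categories_covered: ['reasoning', 'coding'],
//   }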
/**
* Convert model summary to card display format
*/
export function createEvaluationCard(
summary: ModelEvaluationSummary
): EvaluationCardData {
// Get all unique benchmarks
const benchmarksSet = new Set<string>()
const allScores: Array<{
benchmark: string
score: number
metric: string
unit?: string
}> = []
const sourceUrls = new Set<string>()
const detailUrls = new Set<string>()
// Collect all evaluations
for (const evals of Object.values(summary.evaluations_by_category)) {
for (const eval_ of evals) {
// Handle source_data as either string[] or SourceData object
if (Array.isArray(eval_.source_data)) {
// source_data is string[] (URLs), extract benchmark names from evaluation_results
for (const result of eval_.evaluation_results) {
benchmarksSet.add(result.evaluation_name)
}
} else {
// Even if source_data is an object, we should try to extract individual benchmarks
// from evaluation_results if available, as dataset_name might be a suite name.
if (eval_.evaluation_results && eval_.evaluation_results.length > 0) {
for (const result of eval_.evaluation_results) {
benchmarksSet.add(result.evaluation_name)
}
} else {
benchmarksSet.add(eval_.source_data.dataset_name)
}
}
if (eval_.source_metadata.source_url) {
sourceUrls.add(eval_.source_metadata.source_url)
}
// Add source_data URLs if it's a string array
if (Array.isArray(eval_.source_data)) {
eval_.source_data.forEach(url => sourceUrls.add(url))
}
for (const result of eval_.evaluation_results) {
if (result.detailed_evaluation_results_url) {
detailUrls.add(result.detailed_evaluation_results_url)
}
allScores.push({
benchmark: result.evaluation_name,
score: result.score_details.score,
metric: result.metric_config.evaluation_description || result.evaluation_name,
unit: result.metric_config.unit
})
}
}
}
// Deduplicate by benchmark name, keeping highest score for each
const scoresByBenchmark = new Map<string, { benchmark: string; score: number; metric: string; unit?: string }>()
for (const scoreData of allScores) {
const existing = scoresByBenchmark.get(scoreData.benchmark)
if (!existing || scoreData.score > existing.score) {
scoresByBenchmark.set(scoreData.benchmark, scoreData)
}
}
// Calculate category stats (count of unique benchmarks per category)
  const categoryStats = {} as Record<CategoryType, number>
for (const category of summary.categories_covered) {
const evals = summary.evaluations_by_category[category] || []
const categoryBenchmarks = new Set<string>()
  for (const eval_ of evals) {
      // Count only the results whose resolved category matches this
      // category, using the same resolution logic as createModelSummary
      for (const result of eval_.evaluation_results) {
        if (resolveResultCategory(result) === category) {
          categoryBenchmarks.add(result.evaluation_name)
        }
      }
    }
categoryStats[category] = categoryBenchmarks.size
}
// Get top 5 unique benchmarks by score
const topScores = Array.from(scoresByBenchmark.values())
.sort((a, b) => b.score - a.score)
.slice(0, 5)
return {
id: summary.model_info.id,
model_name: summary.model_info.name,
model_id: summary.model_info.id,
developer: summary.model_info.developer,
evaluations_count: summary.total_evaluations,
benchmarks_count: benchmarksSet.size,
categories: summary.categories_covered,
category_stats: categoryStats,
latest_timestamp: summary.last_updated,
top_scores: topScores,
source_urls: Array.from(sourceUrls),
detail_urls: Array.from(detailUrls),
architecture: summary.model_info.architecture,
params: summary.model_info.parameter_count,
inference_engine: summary.model_info.inference_engine,
inference_platform: summary.model_info.inference_platform,
input_modalities: summary.model_info.modalities?.input,
output_modalities: summary.model_info.modalities?.output,
release_date: summary.model_info.release_date,
model_url: summary.model_info.model_url,
}
}
/**
* Get category stats for a model
*/
export function getCategoryStats(
summary: ModelEvaluationSummary
): {
categories: { category: CategoryType; count: number; avg_score: number; total_results: number }[]
} {
const categories: { category: CategoryType; count: number; avg_score: number; total_results: number }[] = []
for (const category of summary.categories_covered) {
const evals = summary.evaluations_by_category[category] || []
const allScores: number[] = []
// Collect all scores from all results in this category
for (const eval_ of evals) {
for (const result of eval_.evaluation_results) {
        // Only include scores for results that resolve to this category
        if (resolveResultCategory(result) === category) {
          allScores.push(result.score_details.score)
        }
}
}
const avgScore = allScores.length > 0
? allScores.reduce((a, b) => a + b, 0) / allScores.length
: 0
const stat = {
category,
count: evals.length, // Number of evaluation files
total_results: allScores.length, // Number of actual benchmark results
avg_score: avgScore,
}
categories.push(stat)
}
  // Sort categories alphabetically for a stable display order
  categories.sort((a, b) => a.category.localeCompare(b.category))
return { categories }
}
/**
* Load and process evaluations from file paths
*/
export async function loadEvaluations(
filePaths: string[]
): Promise<BenchmarkEvaluation[]> {
  // Fetch all files concurrently; failures and schema mismatches are skipped
  const results = await Promise.all(
    filePaths.map(async path => {
      try {
        const response = await fetch(path)
        if (!response.ok) return null
        const data = await response.json()
        // Basic structural check that the payload matches our schema
        if (data.schema_version && data.evaluation_id && data.model_info) {
          return data as BenchmarkEvaluation
        }
        return null
      } catch (error) {
        console.warn(`Failed to load evaluation from ${path}:`, error)
        return null
      }
    })
  )
  return results.filter((e): e is BenchmarkEvaluation => e !== null)
}
/**
* Process all evaluations into card data
*/
export async function processEvaluationsToCards(
filePaths: string[]
): Promise<EvaluationCardData[]> {
const evaluations = await loadEvaluations(filePaths)
const grouped = groupEvaluationsByModel(evaluations)
const cards: EvaluationCardData[] = []
  for (const modelEvals of Object.values(grouped)) {
const summary = createModelSummary(modelEvals)
const card = createEvaluationCard(summary)
cards.push(card)
}
return cards
}
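
// Typical end-to-end usage (the paths below are illustrative, not real assets):
//
//   const cards = await processEvaluationsToCards([
//     '/data/evaluations/model-a.json',
//     '/data/evaluations/model-b.json',
//   ])
//   // One EvaluationCardData per model, ready for rendering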
/**
* Format score with proper precision
*/
export function formatScore(
score: number,
scoreType: 'continuous' | 'discrete' | 'binary',
maxScore?: number
): string {
if (scoreType === 'binary') {
return score > 0.5 ? 'Pass' : 'Fail'
}
  if (maxScore === 1.0) {
    // Scores out of 1.0 are ratios; display as percentages
    return `${(score * 100).toFixed(1)}%`
  }
  if (maxScore === 100) {
    return score.toFixed(1)
  }
// Default formatting
return score.toFixed(3)
}
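
// Examples:
//   formatScore(0.873, 'continuous', 1.0)  -> '87.3%'
//   formatScore(87.3, 'continuous', 100)   -> '87.3'
//   formatScore(1, 'binary')               -> 'Pass'
//   formatScore(0.8734, 'continuous')      -> '0.873'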
/**
* Get benchmark display name
*/
export function getBenchmarkDisplayName(name: string | undefined | null): string {
if (!name) return 'Unknown Benchmark'
// Map common benchmarks to friendly names
const mapping: Record<string, string> = {
'MMLU': 'Massive Multitask Language Understanding',
'MMLU-Pro': 'MMLU Professional',
'GSM8K': 'Grade School Math 8K',
'HumanEval': 'Human Eval (Code)',
'MBPP': 'Mostly Basic Python Problems',
'HellaSwag': 'HellaSwag (Commonsense)',
'ARC': 'AI2 Reasoning Challenge',
'TruthfulQA': 'TruthfulQA',
'BBH': 'Big-Bench Hard',
'MATH': 'MATH Dataset',
}
  // Check longer (more specific) keys first so that, e.g., 'MMLU-Pro'
  // is not shadowed by the shorter 'MMLU' match
  const entries = Object.entries(mapping).sort((a, b) => b[0].length - a[0].length)
  for (const [key, value] of entries) {
    if (name.toUpperCase().includes(key.toUpperCase())) {
      return value
    }
  }
return name
}
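
// Examples:
//   getBenchmarkDisplayName('gsm8k_cot')  -> 'Grade School Math 8K'
//   getBenchmarkDisplayName('MMLU-Pro')   -> 'MMLU Professional'
//   getBenchmarkDisplayName(undefined)    -> 'Unknown Benchmark'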