File size: 13,774 Bytes
6978d97
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2554366
 
 
 
 
 
 
 
 
 
 
6978d97
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2554366
 
 
 
 
 
 
 
 
6978d97
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2554366
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6978d97
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ddfc163
6978d97
ddfc163
6978d97
 
 
 
 
ddfc163
6978d97
 
ddfc163
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6978d97
 
 
 
 
 
 
 
 
ddfc163
 
6978d97
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
/**
 * Processing utilities for benchmark-first evaluation data
 */

import type {
  BenchmarkEvaluation,
  EvaluationCardData,
  CategoryType,
} from './benchmark-schema'
import type { ModelEvaluationSummary } from './benchmark-schema'
import { inferCategoryFromBenchmark, EVALUATION_CATEGORIES } from './benchmark-schema'

export type { ModelEvaluationSummary }

/**
 * Group evaluations by the id of the model they belong to.
 *
 * Buckets are accumulated in a `Map` rather than an object literal so that a
 * model id which happens to match an `Object.prototype` member (for example
 * `constructor` or `__proto__`) is treated like any other id instead of
 * colliding with the inherited property.
 *
 * @param evaluations - Evaluations to partition by `model_info.id`.
 * @returns A plain object mapping each model id to its evaluations, in the
 *   order they appeared in `evaluations`.
 */
export function groupEvaluationsByModel(
  evaluations: BenchmarkEvaluation[]
): Record<string, BenchmarkEvaluation[]> {
  const buckets = new Map<string, BenchmarkEvaluation[]>()

  for (const evaluation of evaluations) {
    // Object keys are always strings, so coerce up front to keep the Map keys
    // aligned with the keys of the object that is returned.
    const modelId = String(evaluation.model_info.id)
    const bucket = buckets.get(modelId)
    if (bucket) {
      bucket.push(evaluation)
    } else {
      buckets.set(modelId, [evaluation])
    }
  }

  // Object.fromEntries defines own data properties, so even "__proto__"
  // becomes a regular key instead of replacing the prototype.
  return Object.fromEntries(buckets)
}

/**
 * Create a model evaluation summary from the evaluations of one model.
 *
 * Model metadata is taken from the first evaluation. Every evaluation is
 * listed once under each category that at least one of its results resolves
 * to. A result's category is resolved in this order:
 *   1. the first entry of the semicolon-separated `factsheet.functional_props`
 *      that is a known evaluation category,
 *   2. the category inferred from the result's `evaluation_name`,
 *   3. the category inferred from `source_data.dataset_name`, when
 *      `source_data` is an object rather than an array.
 *
 * @param evaluations - Evaluations belonging to a single model; must be non-empty.
 * @returns The summary. `total_evaluations` is the number of individual
 *   results across all evaluations, and `last_updated` is the most recent
 *   parseable `retrieved_timestamp` as an ISO-8601 string.
 * @throws Error if `evaluations` is empty.
 * @throws RangeError if no evaluation carries a parseable `retrieved_timestamp`.
 */
export function createModelSummary(
  evaluations: BenchmarkEvaluation[]
): ModelEvaluationSummary {
  if (evaluations.length === 0) {
    throw new Error('No evaluations provided')
  }

  type EvaluationResult = BenchmarkEvaluation['evaluation_results'][number]

  // Resolve the category of one result: factsheet, then benchmark name, then dataset name.
  const resolveCategory = (
    result: EvaluationResult,
    evaluation: BenchmarkEvaluation
  ): CategoryType | undefined => {
    let category: CategoryType | undefined

    if (result.factsheet?.functional_props) {
      // The factsheet may list several properties; use the first known category.
      category = result.factsheet.functional_props
        .split(';')
        .map((prop: string) => prop.trim())
        .find((prop: string) => EVALUATION_CATEGORIES.includes(prop as CategoryType)) as
        | CategoryType
        | undefined
    }

    if (!category) {
      category = inferCategoryFromBenchmark(result.evaluation_name)
    }

    if (!category && !Array.isArray(evaluation.source_data)) {
      category = inferCategoryFromBenchmark(evaluation.source_data.dataset_name)
    }

    return category
  }

  // Convert a raw timestamp to epoch milliseconds, or NaN when unparseable.
  const toEpochMillis = (raw: unknown): number => {
    // Coerce first: the payload is unvalidated JSON and may hold a number.
    const text = String(raw).trim()
    // A bare number (no '-' date separators) is a Unix timestamp in seconds.
    if (text !== '' && !text.includes('-') && !Number.isNaN(Number(text))) {
      return Number.parseFloat(text) * 1000
    }
    // Otherwise assume an ISO or other date string.
    return new Date(text).getTime()
  }

  const evaluationsByCategory: Record<string, BenchmarkEvaluation[]> = {}
  const categoriesCovered = new Set<CategoryType>()
  let latestMillis = Number.NEGATIVE_INFINITY
  let totalResults = 0

  for (const evaluation of evaluations) {
    // Categories touched by this evaluation; a Set so it is listed once per category.
    const ownCategories = new Set<CategoryType>()

    for (const result of evaluation.evaluation_results) {
      const category = resolveCategory(result, evaluation)
      if (category) {
        ownCategories.add(category)
        categoriesCovered.add(category)
      }
    }

    for (const category of ownCategories) {
      if (!evaluationsByCategory[category]) {
        evaluationsByCategory[category] = []
      }
      evaluationsByCategory[category].push(evaluation)
    }

    totalResults += evaluation.evaluation_results.length

    // Ignore unparseable timestamps so one bad record cannot poison the maximum.
    const millis = toEpochMillis(evaluation.retrieved_timestamp)
    if (Number.isFinite(millis) && millis > latestMillis) {
      latestMillis = millis
    }
  }

  if (!Number.isFinite(latestMillis)) {
    throw new RangeError('No evaluation has a parseable retrieved_timestamp')
  }

  return {
    model_info: evaluations[0].model_info,
    evaluations_by_category: evaluationsByCategory as Record<CategoryType, BenchmarkEvaluation[]>,
    total_evaluations: totalResults,
    last_updated: new Date(latestMillis).toISOString(),
    categories_covered: Array.from(categoriesCovered),
  }
}

/**
 * Convert a model summary to the card display format.
 *
 * Collects, across every evaluation in the summary:
 *   - the set of benchmark names (result `evaluation_name`s, or the dataset
 *     name when an object-sourced evaluation has no results),
 *   - source URLs (`source_metadata.source_url` plus array-style `source_data`),
 *   - detail URLs of individual results,
 *   - the best (highest) score per benchmark, of which the top five are kept.
 *
 * `category_stats` counts the unique benchmarks per covered category. A
 * result's category is resolved as: known category from
 * `factsheet.functional_props`, then inferred from `evaluation_name`, then
 * inferred from `source_data.dataset_name` (object-style sources only). This
 * is the resolution order `createModelSummary` uses to bucket evaluations;
 * the two must stay aligned or a covered category reports zero benchmarks.
 *
 * @param summary - Per-model summary to render.
 * @returns The card data for the model.
 */
export function createEvaluationCard(
  summary: ModelEvaluationSummary
): EvaluationCardData {
  type EvaluationResult = BenchmarkEvaluation['evaluation_results'][number]
  type ScoreEntry = { benchmark: string; score: number; metric: string; unit?: string }

  // Resolve the category of one result: factsheet, then benchmark name, then dataset name.
  const resolveCategory = (
    result: EvaluationResult,
    evaluation: BenchmarkEvaluation
  ): CategoryType | undefined => {
    let category: CategoryType | undefined

    if (result.factsheet?.functional_props) {
      category = result.factsheet.functional_props
        .split(';')
        .map((prop: string) => prop.trim())
        .find((prop: string) => EVALUATION_CATEGORIES.includes(prop as CategoryType)) as
        | CategoryType
        | undefined
    }

    if (!category) {
      category = inferCategoryFromBenchmark(result.evaluation_name)
    }

    if (!category && !Array.isArray(evaluation.source_data)) {
      category = inferCategoryFromBenchmark(evaluation.source_data.dataset_name)
    }

    return category
  }

  const benchmarkNames = new Set<string>()
  const sourceUrls = new Set<string>()
  const detailUrls = new Set<string>()
  // Best score seen per benchmark; the first occurrence wins ties.
  const bestScores = new Map<string, ScoreEntry>()

  for (const evals of Object.values(summary.evaluations_by_category)) {
    for (const evaluation of evals) {
      const results = evaluation.evaluation_results ?? []

      // With no individual results, an object-style source still names a benchmark.
      if (results.length === 0 && !Array.isArray(evaluation.source_data)) {
        benchmarkNames.add(evaluation.source_data.dataset_name)
      }

      if (evaluation.source_metadata.source_url) {
        sourceUrls.add(evaluation.source_metadata.source_url)
      }

      // Array-style source_data is a list of URLs.
      if (Array.isArray(evaluation.source_data)) {
        evaluation.source_data.forEach((url: string) => sourceUrls.add(url))
      }

      for (const result of results) {
        benchmarkNames.add(result.evaluation_name)

        if (result.detailed_evaluation_results_url) {
          detailUrls.add(result.detailed_evaluation_results_url)
        }

        const entry: ScoreEntry = {
          benchmark: result.evaluation_name,
          score: result.score_details.score,
          metric: result.metric_config.evaluation_description || result.evaluation_name,
          unit: result.metric_config.unit,
        }
        const best = bestScores.get(entry.benchmark)
        if (!best || entry.score > best.score) {
          bestScores.set(entry.benchmark, entry)
        }
      }
    }
  }

  // Unique benchmarks per category. Filled key by key, hence the assertion on the
  // initially empty object.
  const categoryStats = {} as Record<CategoryType, number>

  for (const category of summary.categories_covered) {
    const categoryBenchmarks = new Set<string>()

    for (const evaluation of summary.evaluations_by_category[category] || []) {
      for (const result of evaluation.evaluation_results) {
        // An evaluation may span several categories; count only matching results.
        if (resolveCategory(result, evaluation) === category) {
          categoryBenchmarks.add(result.evaluation_name)
        }
      }
    }

    categoryStats[category] = categoryBenchmarks.size
  }

  // Top 5 unique benchmarks by score, highest first.
  const topScores = Array.from(bestScores.values())
    .sort((a, b) => b.score - a.score)
    .slice(0, 5)

  return {
    id: summary.model_info.id,
    model_name: summary.model_info.name,
    model_id: summary.model_info.id,
    developer: summary.model_info.developer,
    evaluations_count: summary.total_evaluations,
    benchmarks_count: benchmarkNames.size,
    categories: summary.categories_covered,
    category_stats: categoryStats,
    latest_timestamp: summary.last_updated,
    top_scores: topScores,
    source_urls: Array.from(sourceUrls),
    detail_urls: Array.from(detailUrls),
    architecture: summary.model_info.architecture,
    params: summary.model_info.parameter_count,
    inference_engine: summary.model_info.inference_engine,
    inference_platform: summary.model_info.inference_platform,
    input_modalities: summary.model_info.modalities?.input,
    output_modalities: summary.model_info.modalities?.output,
    release_date: summary.model_info.release_date,
    model_url: summary.model_info.model_url,
  }
}

/**
 * Get per-category statistics for a model.
 *
 * For each covered category, reports the number of evaluations listed under
 * it (`count`), the number of individual results that resolve to it
 * (`total_results`), and the arithmetic mean of those results' scores
 * (`avg_score`, 0 when there are none).
 *
 * Because one evaluation can be listed under several categories, each result
 * is re-resolved and only counted for its own category. Resolution order:
 * known category from `factsheet.functional_props`, then inferred from
 * `evaluation_name`, then inferred from `source_data.dataset_name` for
 * object-style sources. The last step mirrors how evaluations are bucketed
 * when the summary is built; without it, results categorized through their
 * dataset name would be dropped from the statistics.
 *
 * @param summary - Per-model summary to analyse.
 * @returns The statistics, sorted by category name.
 */
export function getCategoryStats(
  summary: ModelEvaluationSummary
): {
  categories: { category: CategoryType; count: number; avg_score: number; total_results: number }[]
} {
  type EvaluationResult = BenchmarkEvaluation['evaluation_results'][number]

  // Resolve the category of one result: factsheet, then benchmark name, then dataset name.
  const resolveCategory = (
    result: EvaluationResult,
    evaluation: BenchmarkEvaluation
  ): CategoryType | undefined => {
    let category: CategoryType | undefined

    if (result.factsheet?.functional_props) {
      category = result.factsheet.functional_props
        .split(';')
        .map((prop: string) => prop.trim())
        .find((prop: string) => EVALUATION_CATEGORIES.includes(prop as CategoryType)) as
        | CategoryType
        | undefined
    }

    if (!category) {
      category = inferCategoryFromBenchmark(result.evaluation_name)
    }

    if (!category && !Array.isArray(evaluation.source_data)) {
      category = inferCategoryFromBenchmark(evaluation.source_data.dataset_name)
    }

    return category
  }

  const categories: { category: CategoryType; count: number; avg_score: number; total_results: number }[] = []

  for (const category of summary.categories_covered) {
    const evals = summary.evaluations_by_category[category] || []
    const scores: number[] = []

    for (const evaluation of evals) {
      for (const result of evaluation.evaluation_results) {
        // Only include results that actually belong to this category.
        if (resolveCategory(result, evaluation) === category) {
          scores.push(result.score_details.score)
        }
      }
    }

    const avgScore = scores.length > 0
      ? scores.reduce((sum, score) => sum + score, 0) / scores.length
      : 0

    categories.push({
      category,
      count: evals.length, // Number of evaluation files
      total_results: scores.length, // Number of actual benchmark results
      avg_score: avgScore,
    })
  }

  categories.sort((a, b) => a.category.localeCompare(b.category))

  return { categories }
}

/**
 * Load evaluations from the given file paths.
 *
 * Each path is fetched in turn and parsed as JSON. A payload is accepted when
 * it is an object with truthy `schema_version`, `evaluation_id` and
 * `model_info` fields; no deeper validation is performed. Loading is
 * best-effort: a path that cannot be fetched, returns a non-OK status, fails
 * to parse, or does not look like an evaluation is skipped with a console
 * warning, and the remaining paths are still processed.
 *
 * @param filePaths - URLs or paths understood by `fetch`.
 * @returns The accepted evaluations, in the order of `filePaths`.
 */
export async function loadEvaluations(
  filePaths: string[]
): Promise<BenchmarkEvaluation[]> {
  // Minimal shape check; `unknown` forces narrowing before any property access.
  const looksLikeEvaluation = (value: unknown): value is BenchmarkEvaluation => {
    if (typeof value !== 'object' || value === null) {
      return false
    }
    const candidate = value as Record<string, unknown>
    return Boolean(candidate.schema_version && candidate.evaluation_id && candidate.model_info)
  }

  const evaluations: BenchmarkEvaluation[] = []

  for (const path of filePaths) {
    try {
      const response = await fetch(path)
      if (!response.ok) {
        // Report HTTP failures like network failures instead of dropping them silently.
        console.warn(`Failed to load evaluation from ${path}: HTTP ${response.status}`)
        continue
      }

      const data: unknown = await response.json()

      if (looksLikeEvaluation(data)) {
        evaluations.push(data)
      } else {
        console.warn(`Skipping ${path}: content does not match the evaluation schema`)
      }
    } catch (error) {
      console.warn(`Failed to load evaluation from ${path}:`, error)
    }
  }

  return evaluations
}

/**
 * Load the given evaluation files and turn them into display cards.
 *
 * The loaded evaluations are grouped by model; each group is summarised and
 * the summary converted to a card, yielding one card per model.
 *
 * @param filePaths - Locations of the evaluation files to load.
 * @returns One card per distinct model found in the loaded evaluations.
 */
export async function processEvaluationsToCards(
  filePaths: string[]
): Promise<EvaluationCardData[]> {
  const byModel = groupEvaluationsByModel(await loadEvaluations(filePaths))

  return Object.values(byModel).map(modelEvals =>
    createEvaluationCard(createModelSummary(modelEvals))
  )
}

/**
 * Render a score as display text.
 *
 * - Binary scores become `Pass` (strictly above 0.5) or `Fail`.
 * - With a maximum of 1 the score is shown as a percentage with one decimal.
 * - With a maximum of 100 the score is shown with one decimal.
 * - Anything else is shown with three decimals.
 *
 * @param score - The raw score value.
 * @param scoreType - Kind of score; only `binary` changes the formatting.
 * @param maxScore - Upper bound of the score scale, if known.
 * @returns The formatted score.
 */
export function formatScore(
  score: number,
  scoreType: 'continuous' | 'discrete' | 'binary',
  maxScore?: number
): string {
  if (scoreType === 'binary') {
    if (score > 0.5) {
      return 'Pass'
    }
    return 'Fail'
  }

  switch (maxScore) {
    case 1:
      // Ratio scale: present as a percentage.
      return (score * 100).toFixed(1) + '%'
    case 100:
      return score.toFixed(1)
    default:
      return score.toFixed(3)
  }
}

/**
 * Get a human-friendly display name for a benchmark.
 *
 * The name is matched case-insensitively against a table of well-known
 * benchmarks; a table key matches when it occurs anywhere in the name. Runs of
 * non-alphanumeric characters are treated as equivalent separators, so
 * `mmlu_pro` and `MMLU Pro` both match the `MMLU-Pro` entry. When several keys
 * match, the longest one wins (ties go to the earlier table entry), so a
 * specific entry such as `MMLU-Pro` is not shadowed by the shorter `MMLU`.
 *
 * @param name - Raw benchmark name; may be missing.
 * @returns The friendly name, the original name when no entry matches, or
 *   `'Unknown Benchmark'` when `name` is empty, `null` or `undefined`.
 */
export function getBenchmarkDisplayName(name: string | undefined | null): string {
  if (!name) return 'Unknown Benchmark'

  // Map common benchmarks to friendly names
  const mapping: Record<string, string> = {
    'MMLU': 'Massive Multitask Language Understanding',
    'MMLU-Pro': 'MMLU Professional',
    'GSM8K': 'Grade School Math 8K',
    'HumanEval': 'Human Eval (Code)',
    'MBPP': 'Mostly Basic Python Problems',
    'HellaSwag': 'HellaSwag (Commonsense)',
    'ARC': 'AI2 Reasoning Challenge',
    'TruthfulQA': 'TruthfulQA',
    'BBH': 'Big-Bench Hard',
    'MATH': 'MATH Dataset',
  }

  // Upper-case and collapse every separator run to '-' so '_', ' ' and '-' compare equal.
  const normalize = (text: string): string =>
    text.toUpperCase().replace(/[^A-Z0-9]+/g, '-')

  const normalizedName = normalize(name)
  let bestKeyLength = 0
  let bestLabel: string | undefined

  for (const [key, label] of Object.entries(mapping)) {
    // Strict '>' keeps the earliest entry among equally long matches.
    if (key.length > bestKeyLength && normalizedName.includes(normalize(key))) {
      bestKeyLength = key.length
      bestLabel = label
    }
  }

  return bestLabel ?? name
}