File size: 9,624 Bytes
6978d97
 
 
 
 
 
 
c1f2130
6978d97
 
3a12290
415ac43
 
c1f2130
 
 
 
 
 
 
415ac43
 
 
c1f2130
 
ae1dc39
6978d97
 
3a12290
6978d97
ae1dc39
6978d97
 
 
 
3a12290
 
 
 
 
 
6978d97
 
3a12290
6978d97
 
3a12290
 
6978d97
 
3a12290
6978d97
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3a12290
6978d97
 
 
 
 
 
 
 
 
 
415ac43
 
 
 
6978d97
3a12290
6978d97
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ae1dc39
6978d97
 
 
 
 
 
 
ae1dc39
6978d97
 
 
 
 
 
 
 
 
 
 
 
 
 
c1f2130
6978d97
 
c1f2130
 
 
6978d97
c1f2130
6978d97
 
 
 
04b4cff
 
 
c1f2130
04b4cff
c1f2130
 
 
 
 
 
04b4cff
 
c1f2130
04b4cff
 
 
 
 
 
6978d97
c1f2130
 
6978d97
 
 
04b4cff
c1f2130
 
 
6978d97
 
c1f2130
 
 
 
6978d97
c1f2130
 
 
 
6978d97
c1f2130
 
 
6978d97
c1f2130
 
6978d97
 
 
 
 
3a12290
6978d97
 
 
 
 
 
 
3a12290
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6978d97
 
 
 
 
3a12290
6978d97
 
3a12290
6978d97
 
 
3a12290
6978d97
 
 
3a12290
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c1f2130
 
 
 
 
 
 
6978d97
 
 
 
c1f2130
6978d97
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
04b4cff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
/**
 * Benchmark-first evaluation schema types
 * Based on the evalevalai.com schema structure
 */

/**
 * A single benchmark evaluation record for one model.
 * Field names follow the evalevalai.com schema (see file header); most
 * fields are optional because source records vary in completeness.
 */
export interface BenchmarkEvaluation {
  schema_version: string
  eval_summary_id?: string
  evaluation_id: string
  // Timestamp string recording when this record was retrieved — TODO confirm format/timezone.
  retrieved_timestamp: string
  benchmark?: string
  display_name?: string
  canonical_display_name?: string
  category?: CategoryType
  // Benchmark hierarchy key/name pairs (family / parent / leaf / component).
  benchmark_family_key?: string
  benchmark_family_name?: string
  benchmark_parent_key?: string
  benchmark_parent_name?: string
  benchmark_leaf_key?: string
  benchmark_leaf_name?: string
  benchmark_component_key?: string | null
  benchmark_component_name?: string | null
  // Presumably marks aggregate/summary scores rather than a single run — TODO confirm.
  is_summary_score?: boolean
  slice_key?: string
  slice_name?: string

  // Dataset provenance: either a plain list of identifiers/URLs or a structured SourceData.
  source_data: string[] | SourceData
  source_metadata: SourceMetadata
  eval_library?: EvalLibrary
  model_info: ModelInfo
  generation_config?: GenerationConfig
  evaluation_results: EvaluationResult[]
  detailed_evaluation_results_per_samples?: SampleResult[]
}

/**
 * The evaluation harness/library that produced a result (name plus optional version).
 */
export interface EvalLibrary {
  name: string
  version?: string
  // Free-form extra details; shape varies by library.
  additional_details?: Record<string, any>
}

/**
 * Structured description of the dataset an evaluation was run against.
 * Note: `BenchmarkEvaluation.source_data` may instead be a plain string[].
 */
export interface SourceData {
  dataset_name: string
  source_type?: string
  // `hf_*` fields: dataset coordinates, presumably Hugging Face repo/split — TODO confirm.
  hf_repo?: string
  hf_split?: string
  samples_number?: number
  url?: string[]
  dataset_url?: string
  dataset_version?: string
  // Open index signature: upstream records may carry additional dataset fields.
  [key: string]: any
}

/**
 * Provenance of an evaluation record: who produced/published it and where it came from.
 */
export interface SourceMetadata {
  source_name?: string
  source_type: 'evaluation_run' | 'documentation' | 'paper' | 'leaderboard'
  source_organization_name: string
  source_organization_url?: string
  // Relationship of the evaluator to the model developer.
  evaluator_relationship: 'first_party' | 'third_party' | 'collaborative' | 'other'
  source_url?: string
  publication_date?: string
}

/**
 * Identity and descriptive metadata for the evaluated model.
 */
export interface ModelInfo {
  name: string
  id: string
  developer?: string
  inference_platform?: string
  inference_engine?: string
  model_version?: string
  architecture?: string
  // Free-form string; see also additional_details.params_billions for a numeric form.
  parameter_count?: string
  release_date?: string
  model_url?: string
  additional_details?: {
    precision?: string
    // NOTE(review): duplicates the top-level `architecture` field; sources may set either.
    architecture?: string
    params_billions?: number | string
    [key: string]: any
  }
  // Supported input/output modalities — value vocabulary not defined here; TODO confirm.
  modalities?: {
    input: string[]
    output: string[]
  }
}

/**
 * One metric/score entry within a BenchmarkEvaluation.
 */
export interface EvaluationResult {
  evaluation_name: string
  display_name?: string
  canonical_display_name?: string
  metric_summary_id?: string
  metric_key?: string
  evaluation_timestamp: string
  // Per-result dataset info; relationship to the parent record's source_data — TODO confirm precedence.
  source_data?: string[] | SourceData
  metric_config: MetricConfig
  score_details: ScoreDetails
  // Link to full per-sample results when they are published separately.
  detailed_evaluation_results_url?: string
  generation_config?: GenerationConfig
}

/**
 * How a metric is defined and how its scores should be read.
 */
export interface MetricConfig {
  evaluation_description: string
  // When true, smaller scores are better (e.g. error-rate style metrics).
  lower_is_better: boolean
  score_type: 'continuous' | 'discrete' | 'binary'
  min_score?: number
  max_score?: number
  unit?: string
}

/**
 * The reported score plus optional statistical context.
 */
export interface ScoreDetails {
  score: number
  details?: Record<string, any>
  confidence_interval?: {
    lower: number
    upper: number
    // Scale (fraction vs. percent) is not defined here — TODO confirm against producers.
    confidence_level: number
  }
  sample_size?: number
  standard_error?: number
}

/**
 * Generation settings used when producing model outputs for an evaluation.
 */
export interface GenerationConfig {
  generation_args?: {
    temperature?: number
    top_p?: number
    top_k?: number
    max_tokens?: number
    reasoning?: boolean
    // Open index signature: engines accept arbitrary extra sampling arguments.
    [key: string]: any
  }
  additional_details?: string | Record<string, any>
  prompt_template?: string
}

/**
 * A single per-sample record from detailed evaluation results.
 */
export interface SampleResult {
  sample_id: string
  input: string
  ground_truth?: string
  response: string
  // Answer options for multiple-choice style samples.
  choices?: string[]
  is_correct?: boolean
  metadata?: Record<string, any>
}

/**
 * Evaluation categories — aligned with the pipeline's category labels.
 * Declared `as const` so CategoryType below is the union of these literals.
 */
export const EVALUATION_CATEGORIES = [
  'General',
  'Reasoning',
  'Agentic',
  'Safety',
  'Knowledge',
] as const

// Union of the literal category names: 'General' | 'Reasoning' | 'Agentic' | 'Safety' | 'Knowledge'.
export type CategoryType = typeof EVALUATION_CATEGORIES[number]

/**
 * Returns Tailwind badge classes for a given category.
 * Unknown categories fall back to the neutral muted badge style.
 */
export function getCategoryColor(category: CategoryType | string): string {
  // Per-category badge palette; keys mirror EVALUATION_CATEGORIES.
  const palette: Record<string, string> = {
    General: 'bg-sky-100 text-sky-800 border-sky-200 dark:bg-sky-950/40 dark:text-sky-200',
    Reasoning: 'bg-violet-100 text-violet-800 border-violet-200 dark:bg-violet-950/40 dark:text-violet-200',
    Agentic: 'bg-amber-100 text-amber-800 border-amber-200 dark:bg-amber-950/40 dark:text-amber-200',
    Safety: 'bg-rose-100 text-rose-800 border-rose-200 dark:bg-rose-950/40 dark:text-rose-200',
    Knowledge: 'bg-emerald-100 text-emerald-800 border-emerald-200 dark:bg-emerald-950/40 dark:text-emerald-200',
  }
  return palette[category] ?? 'bg-muted text-muted-foreground border-border'
}

/**
 * Helper to determine category from benchmark name.
 * The pipeline now provides categories directly, so this is only used as a fallback.
 * Keyword groups are checked as case-insensitive substrings, in priority order:
 * Safety, then Agentic, then Reasoning, then Knowledge; anything else is General.
 */
export function inferCategoryFromBenchmark(benchmarkName: string): CategoryType {
  const lowered = benchmarkName.toLowerCase()

  // Ordered (category, keywords) table — order matters: first match wins.
  const keywordGroups: Array<[CategoryType, string[]]> = [
    ['Safety', [
      'safety', 'harmful', 'toxic', 'truthful', 'unsafe',
      'civilcomments', 'civil_comments', 'jailbreak', 'red-team', 'adversarial',
    ]],
    ['Agentic', [
      'agent', 'swe-bench', 'swe_bench', 'terminal-bench',
      'tau-bench', 'tau_bench', 'appworld', 'browsecomp',
    ]],
    ['Reasoning', [
      'reasoning', 'bbh', 'math', 'gsm', 'gpqa', 'musr',
      'code', 'humaneval', 'livecodebench',
    ]],
    ['Knowledge', [
      'mmlu', 'knowledge', 'trivia', 'medqa', 'legalbench', 'theory_of_mind',
    ]],
  ]

  for (const [category, keywords] of keywordGroups) {
    if (keywords.some((keyword) => lowered.includes(keyword))) {
      return category
    }
  }

  return 'General'
}

/**
 * Aggregate evaluations by model.
 * Shared aggregation fields used by both variant- and family-level summaries.
 */
export interface ModelSummaryCore {
  model_info: ModelInfo
  // Evaluations bucketed by their resolved CategoryType.
  evaluations_by_category: Record<CategoryType, BenchmarkEvaluation[]>
  total_evaluations: number
  last_updated: string
  // Presumably the categories with at least one evaluation — TODO confirm derivation.
  categories_covered: CategoryType[]
}

/**
 * Summary for a single model variant within a family.
 */
export interface ModelVariantSummary extends ModelSummaryCore {
  variant_id: string
  variant_key: string
  variant_label: string
  variant_display_name: string
  // Raw model identifiers that were collapsed into this variant.
  raw_model_ids: string[]
  family_id: string
  family_name: string
  version_date?: string
  version_qualifier?: string
}

/**
 * Family-level summary spanning all variants of a model.
 */
export interface ModelEvaluationSummary extends ModelSummaryCore {
  model_family_id: string
  model_route_id: string
  model_family_name: string
  raw_model_ids: string[]
  variants: ModelVariantSummary[]
}

/**
 * Display-friendly format for the UI — a pre-aggregated "card" per model.
 */
export interface EvaluationCardData {
  id: string
  route_id: string
  model_name: string
  model_id: string
  canonical_model_name: string
  developer: string
  evaluations_count: number
  benchmarks_count: number
  variant_count: number
  categories: CategoryType[]
  // Per-category numbers — presumably evaluation counts; TODO confirm.
  category_stats: Record<CategoryType, number>
  latest_timestamp: string
  evaluator_count: number
  evaluator_names: string[]
  source_type_count: number
  source_types: Array<SourceMetadata["source_type"]>
  evidence_count: number
  missing_generation_config_count: number
  third_party_eval_count: number
  // NOTE(review): looks like a 0..1 ratio of independently verified evals — confirm against producer.
  independent_verification_ratio: number
  reproducibility_status: "complete" | "partial" | "missing"
  eval_libraries: Array<{
    name: string
    version?: string
    fork?: string
  }>
  latest_source_name?: string
  params_billions?: number | null
  benchmark_names?: string[]
  // Aggregate over reported scores; average may be null (presumably when not computable).
  score_summary?: {
    count: number
    min: number
    max: number
    average: number | null
  }

  // Quick stats
  top_scores: Array<{
    benchmark: string
    benchmarkKey?: string
    score: number
    metric: string
  }>

  // Links
  source_urls: string[]
  detail_urls: string[]

  // Model Metadata (from auxiliary sources or model_metadata.json)
  model_url?: string
  release_date?: string
  input_modalities?: string[]
  output_modalities?: string[]
  architecture?: string
  params?: string
  inference_engine?: string
  inference_platform?: string
}

// ── Benchmark Card types (from metadata/benchmark_card_*.json) ────────────────

/** Core descriptive fields of a benchmark card. */
export interface BenchmarkCardDetails {
  name: string
  overview: string
  data_type: string
  domains: string[]
  languages: string[]
  // string[] | string — presumably upstream JSON varies between list and prose; TODO confirm.
  similar_benchmarks: string[] | string
  resources: string[]
}

/** Goal, audience, tasks, and scope limits of the benchmark. */
export interface BenchmarkCardPurpose {
  goal: string
  audience: string[] | string
  tasks: string[]
  limitations: string
  out_of_scope_uses: string[] | string
}

/** Dataset facts recorded on the card. */
export interface BenchmarkCardData {
  source: string
  size: string
  format: string
  annotation: string
}

/** How the benchmark is scored, interpreted, and validated. */
export interface BenchmarkCardMethodology {
  methods: string[]
  metrics: string[]
  calculation: string
  interpretation: string
  baseline_results: string
  validation: string
}

/** Ethical and legal considerations recorded on the card. */
export interface BenchmarkCardEthical {
  privacy_and_anonymity: string
  data_licensing: string
  consent_procedures: string
  compliance_with_regulations: string
}

/** A single identified risk entry. */
export interface BenchmarkCardRisk {
  category: string
  description: string[]
  url: string
}

/**
 * A full benchmark card. `card_info.llm` suggests cards are LLM-generated — TODO confirm pipeline.
 */
export interface BenchmarkCard {
  benchmark_details: BenchmarkCardDetails
  purpose_and_intended_users: BenchmarkCardPurpose
  data: BenchmarkCardData
  methodology: BenchmarkCardMethodology
  ethical_and_legal_considerations: BenchmarkCardEthical
  possible_risks: BenchmarkCardRisk[]
  // Map of field name → note; exact semantics of "flagged" — TODO confirm.
  flagged_fields: Record<string, string>
  missing_fields: string[]
  card_info: {
    created_at: string
    // Identifier of the model recorded as having produced the card.
    llm: string
  }
}