Spaces:
Running
Running
File size: 9,624 Bytes
6978d97 c1f2130 6978d97 3a12290 415ac43 c1f2130 415ac43 c1f2130 ae1dc39 6978d97 3a12290 6978d97 ae1dc39 6978d97 3a12290 6978d97 3a12290 6978d97 3a12290 6978d97 3a12290 6978d97 3a12290 6978d97 415ac43 6978d97 3a12290 6978d97 ae1dc39 6978d97 ae1dc39 6978d97 c1f2130 6978d97 c1f2130 6978d97 c1f2130 6978d97 04b4cff c1f2130 04b4cff c1f2130 04b4cff c1f2130 04b4cff 6978d97 c1f2130 6978d97 04b4cff c1f2130 6978d97 c1f2130 6978d97 c1f2130 6978d97 c1f2130 6978d97 c1f2130 6978d97 3a12290 6978d97 3a12290 6978d97 3a12290 6978d97 3a12290 6978d97 3a12290 6978d97 3a12290 c1f2130 6978d97 c1f2130 6978d97 04b4cff | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 | /**
* Benchmark-first evaluation schema types
* Based on the evalevalai.com schema structure
*/
/**
 * One benchmark evaluation record for a model.
 * Optional fields may be absent depending on the pipeline version that
 * produced the record.
 */
export interface BenchmarkEvaluation {
  // Version of the schema this record conforms to.
  schema_version: string
  eval_summary_id?: string
  // Unique identifier for this evaluation record.
  evaluation_id: string
  // When the record was retrieved (string; presumably ISO 8601 — confirm upstream).
  retrieved_timestamp: string
  benchmark?: string
  display_name?: string
  canonical_display_name?: string
  // Category label; inferCategoryFromBenchmark() exists as a fallback when absent.
  category?: CategoryType
  // Benchmark hierarchy keys/names: family > parent > leaf > component.
  benchmark_family_key?: string
  benchmark_family_name?: string
  benchmark_parent_key?: string
  benchmark_parent_name?: string
  benchmark_leaf_key?: string
  benchmark_leaf_name?: string
  benchmark_component_key?: string | null
  benchmark_component_name?: string | null
  // Presumably true when the score aggregates sub-benchmarks — confirm upstream.
  is_summary_score?: boolean
  slice_key?: string
  slice_name?: string
  // Either a list of source identifiers or a structured dataset description.
  source_data: string[] | SourceData
  source_metadata: SourceMetadata
  eval_library?: EvalLibrary
  model_info: ModelInfo
  generation_config?: GenerationConfig
  evaluation_results: EvaluationResult[]
  // Optional per-sample breakdown of results.
  detailed_evaluation_results_per_samples?: SampleResult[]
}
/** Evaluation harness/library used to produce the results. */
export interface EvalLibrary {
  name: string
  version?: string
  // Free-form extra information about the library or run.
  additional_details?: Record<string, any>
}
/**
 * Structured description of the dataset an evaluation ran against.
 * The index signature admits pipeline-specific extra keys.
 */
export interface SourceData {
  dataset_name: string
  source_type?: string
  // Hugging Face dataset coordinates, when applicable.
  hf_repo?: string
  hf_split?: string
  samples_number?: number
  url?: string[]
  dataset_url?: string
  dataset_version?: string
  // Catch-all for additional pipeline-provided fields.
  [key: string]: any
}
/** Provenance of an evaluation: who ran/published it and where. */
export interface SourceMetadata {
  source_name?: string
  // Kind of source the record was extracted from.
  source_type: 'evaluation_run' | 'documentation' | 'paper' | 'leaderboard'
  source_organization_name: string
  source_organization_url?: string
  // Relationship between the evaluator and the model developer.
  evaluator_relationship: 'first_party' | 'third_party' | 'collaborative' | 'other'
  source_url?: string
  publication_date?: string
}
/** Identity and metadata of the evaluated model. */
export interface ModelInfo {
  // Human-readable model name.
  name: string
  // Machine identifier for the model.
  id: string
  developer?: string
  inference_platform?: string
  inference_engine?: string
  model_version?: string
  architecture?: string
  // Parameter count kept as a string (formatting decided upstream).
  parameter_count?: string
  release_date?: string
  model_url?: string
  // Loosely-typed extras; note this can duplicate fields above (e.g. architecture).
  additional_details?: {
    precision?: string
    architecture?: string
    params_billions?: number | string
    [key: string]: any
  }
  // Input/output modalities; values supplied by the pipeline.
  modalities?: {
    input: string[]
    output: string[]
  }
}
/** A single metric result within a BenchmarkEvaluation. */
export interface EvaluationResult {
  evaluation_name: string
  display_name?: string
  canonical_display_name?: string
  metric_summary_id?: string
  metric_key?: string
  evaluation_timestamp: string
  // Source data specific to this result (may differ from the record-level field).
  source_data?: string[] | SourceData
  metric_config: MetricConfig
  score_details: ScoreDetails
  // Link to a full per-sample results artifact, when available.
  detailed_evaluation_results_url?: string
  generation_config?: GenerationConfig
}
/** How a metric is defined and how its score should be read. */
export interface MetricConfig {
  evaluation_description: string
  // When true, smaller scores indicate better performance.
  lower_is_better: boolean
  score_type: 'continuous' | 'discrete' | 'binary'
  // Optional score bounds and unit for display/normalization.
  min_score?: number
  max_score?: number
  unit?: string
}
/** The score itself plus optional statistical context. */
export interface ScoreDetails {
  score: number
  details?: Record<string, any>
  confidence_interval?: {
    lower: number
    upper: number
    // NOTE(review): scale (fraction vs. percent) not shown here — confirm upstream.
    confidence_level: number
  }
  sample_size?: number
  standard_error?: number
}
/** Generation/decoding settings used when producing model outputs. */
export interface GenerationConfig {
  generation_args?: {
    temperature?: number
    top_p?: number
    top_k?: number
    max_tokens?: number
    // Presumably whether a reasoning/"thinking" mode was enabled — confirm upstream.
    reasoning?: boolean
    [key: string]: any
  }
  additional_details?: string | Record<string, any>
  prompt_template?: string
}
/** Per-sample record from a detailed evaluation dump. */
export interface SampleResult {
  sample_id: string
  // Prompt/input given to the model.
  input: string
  ground_truth?: string
  // The model's response.
  response: string
  // Answer options for multiple-choice style tasks.
  choices?: string[]
  is_correct?: boolean
  metadata?: Record<string, any>
}
/**
 * Evaluation categories — aligned with the pipeline's category labels.
 */
export const EVALUATION_CATEGORIES = [
  'General',
  'Reasoning',
  'Agentic',
  'Safety',
  'Knowledge',
] as const
// Union of the literal labels above: 'General' | 'Reasoning' | 'Agentic' | 'Safety' | 'Knowledge'
export type CategoryType = typeof EVALUATION_CATEGORIES[number]
/**
 * Returns Tailwind badge classes for a given category.
 * Unknown categories fall back to neutral "muted" styling, so callers may
 * safely pass arbitrary strings.
 */
export function getCategoryColor(category: CategoryType | string): string {
  // Lookup table keyed by category label; a Map avoids inherited-key surprises
  // (e.g. 'toString') that a plain object literal would have.
  const badgeClasses = new Map<string, string>([
    ['General', 'bg-sky-100 text-sky-800 border-sky-200 dark:bg-sky-950/40 dark:text-sky-200'],
    ['Reasoning', 'bg-violet-100 text-violet-800 border-violet-200 dark:bg-violet-950/40 dark:text-violet-200'],
    ['Agentic', 'bg-amber-100 text-amber-800 border-amber-200 dark:bg-amber-950/40 dark:text-amber-200'],
    ['Safety', 'bg-rose-100 text-rose-800 border-rose-200 dark:bg-rose-950/40 dark:text-rose-200'],
    ['Knowledge', 'bg-emerald-100 text-emerald-800 border-emerald-200 dark:bg-emerald-950/40 dark:text-emerald-200'],
  ])
  return badgeClasses.get(category) ?? 'bg-muted text-muted-foreground border-border'
}
/**
 * Helper to determine category from benchmark name.
 * The pipeline now provides categories directly, so this is only used as a fallback.
 */
export function inferCategoryFromBenchmark(benchmarkName: string): CategoryType {
  const name = benchmarkName.toLowerCase()
  // Keyword lists are checked in priority order: Safety, Agentic, Reasoning,
  // Knowledge — the first list with a substring match wins.
  const safetyKeywords = [
    'safety', 'harmful', 'toxic', 'truthful', 'unsafe',
    'civilcomments', 'civil_comments', 'jailbreak', 'red-team', 'adversarial',
  ]
  const agenticKeywords = [
    'agent', 'swe-bench', 'swe_bench', 'terminal-bench',
    'tau-bench', 'tau_bench', 'appworld', 'browsecomp',
  ]
  const reasoningKeywords = [
    'reasoning', 'bbh', 'math', 'gsm', 'gpqa',
    'musr', 'code', 'humaneval', 'livecodebench',
  ]
  const knowledgeKeywords = [
    'mmlu', 'knowledge', 'trivia', 'medqa', 'legalbench', 'theory_of_mind',
  ]
  const matchesAny = (keywords: string[]): boolean =>
    keywords.some((keyword) => name.includes(keyword))

  if (matchesAny(safetyKeywords)) return 'Safety'
  if (matchesAny(agenticKeywords)) return 'Agentic'
  if (matchesAny(reasoningKeywords)) return 'Reasoning'
  if (matchesAny(knowledgeKeywords)) return 'Knowledge'
  return 'General'
}
/**
 * Aggregate evaluations by model: core fields shared by family-level and
 * variant-level summaries.
 */
export interface ModelSummaryCore {
  model_info: ModelInfo
  // Evaluations grouped under each category label.
  evaluations_by_category: Record<CategoryType, BenchmarkEvaluation[]>
  total_evaluations: number
  last_updated: string
  categories_covered: CategoryType[]
}
/** Summary for a specific variant within a model family. */
export interface ModelVariantSummary extends ModelSummaryCore {
  variant_id: string
  variant_key: string
  variant_label: string
  variant_display_name: string
  // Raw model identifiers grouped into this variant.
  raw_model_ids: string[]
  family_id: string
  family_name: string
  version_date?: string
  version_qualifier?: string
}
/** Family-level rollup of a model's evaluations across all variants. */
export interface ModelEvaluationSummary extends ModelSummaryCore {
  model_family_id: string
  // Identifier used for routing/URLs.
  model_route_id: string
  model_family_name: string
  raw_model_ids: string[]
  variants: ModelVariantSummary[]
}
/**
 * Display-friendly format for the UI: a flattened, precomputed view of a
 * model's evaluations for card rendering.
 */
export interface EvaluationCardData {
  id: string
  // Identifier used in routes/URLs.
  route_id: string
  model_name: string
  model_id: string
  canonical_model_name: string
  developer: string
  evaluations_count: number
  benchmarks_count: number
  variant_count: number
  categories: CategoryType[]
  // Per-category counts.
  category_stats: Record<CategoryType, number>
  latest_timestamp: string
  evaluator_count: number
  evaluator_names: string[]
  source_type_count: number
  source_types: Array<SourceMetadata["source_type"]>
  evidence_count: number
  // Count of evaluations lacking a generation config (reproducibility signal).
  missing_generation_config_count: number
  third_party_eval_count: number
  // Presumably the share of evaluations run by third parties — confirm upstream.
  independent_verification_ratio: number
  reproducibility_status: "complete" | "partial" | "missing"
  eval_libraries: Array<{
    name: string
    version?: string
    fork?: string
  }>
  latest_source_name?: string
  params_billions?: number | null
  benchmark_names?: string[]
  // Aggregate score statistics; average may be null (presumably when count is 0).
  score_summary?: {
    count: number
    min: number
    max: number
    average: number | null
  }
  // Quick stats
  top_scores: Array<{
    benchmark: string
    benchmarkKey?: string
    score: number
    metric: string
  }>
  // Links
  source_urls: string[]
  detail_urls: string[]
  // Model Metadata (from auxiliary sources or model_metadata.json)
  model_url?: string
  release_date?: string
  input_modalities?: string[]
  output_modalities?: string[]
  architecture?: string
  params?: string
  inference_engine?: string
  inference_platform?: string
}
// —— Benchmark Card types (from metadata/benchmark_card_*.json) ——————————————
/** "Benchmark details" section of a benchmark card. */
export interface BenchmarkCardDetails {
  name: string
  overview: string
  data_type: string
  domains: string[]
  languages: string[]
  // The producer emits either a list or a single free-text string.
  similar_benchmarks: string[] | string
  resources: string[]
}
/** "Purpose and intended users" section of a benchmark card. */
export interface BenchmarkCardPurpose {
  goal: string
  // Either a list or a single free-text string, depending on the producer.
  audience: string[] | string
  tasks: string[]
  limitations: string
  out_of_scope_uses: string[] | string
}
/** "Data" section of a benchmark card: dataset provenance and shape. */
export interface BenchmarkCardData {
  source: string
  size: string
  format: string
  annotation: string
}
/** "Methodology" section: how the benchmark scores and validates results. */
export interface BenchmarkCardMethodology {
  methods: string[]
  metrics: string[]
  calculation: string
  interpretation: string
  baseline_results: string
  validation: string
}
/** Ethical and legal considerations section of a benchmark card. */
export interface BenchmarkCardEthical {
  privacy_and_anonymity: string
  data_licensing: string
  consent_procedures: string
  compliance_with_regulations: string
}
/** One risk entry listed on a benchmark card. */
export interface BenchmarkCardRisk {
  category: string
  description: string[]
  url: string
}
/** Full benchmark card (from metadata/benchmark_card_*.json). */
export interface BenchmarkCard {
  benchmark_details: BenchmarkCardDetails
  purpose_and_intended_users: BenchmarkCardPurpose
  data: BenchmarkCardData
  methodology: BenchmarkCardMethodology
  ethical_and_legal_considerations: BenchmarkCardEthical
  possible_risks: BenchmarkCardRisk[]
  // Fields flagged for review — key format not shown here; confirm upstream.
  flagged_fields: Record<string, string>
  missing_fields: string[]
  // Generation provenance for the card itself.
  card_info: {
    created_at: string
    // LLM that generated the card content.
    llm: string
  }
}
|