Spaces:
Running
Running
/**
 * Benchmark-first evaluation schema types.
 * Based on the evalevalai.com schema structure.
 */
export interface BenchmarkEvaluation {
  /** Version of the schema this record conforms to. */
  schema_version: string
  /** Unique identifier for this evaluation record. */
  evaluation_id: string
  /** When this record was retrieved — presumably an ISO 8601 string; confirm with producers. */
  retrieved_timestamp: string
  /** Either a flat list of source identifiers/URLs or a structured {@link SourceData} object. */
  source_data: string[] | SourceData
  /** Provenance of the evaluation (who ran it, where it was published). */
  source_metadata: SourceMetadata
  /** The model that was evaluated. */
  model_info: ModelInfo
  /** One entry per benchmark/metric run. */
  evaluation_results: EvaluationResult[]
  /** Optional per-sample breakdown of the results. */
  detailed_evaluation_results_per_samples?: SampleResult[]
}
/** Structured description of the dataset an evaluation was run against. */
export interface SourceData {
  /** Human-readable dataset name. */
  dataset_name: string
  /** Hugging Face repository id, when the dataset is hosted on the Hub. */
  hf_repo?: string
  /** Hugging Face split used (e.g. train/validation/test) — NOTE(review): exact values not visible here; confirm. */
  hf_split?: string
  /** Number of samples in the evaluated dataset/split. */
  samples_number: number
  /** Canonical URL for the dataset. */
  dataset_url?: string
  /** Dataset version or revision identifier. */
  dataset_version?: string
}
/** Provenance of an evaluation: where it came from and who produced it. */
export interface SourceMetadata {
  /** Display name of the source (e.g. a leaderboard or paper title). */
  source_name?: string
  /** Kind of artifact this evaluation was extracted from. */
  source_type: 'evaluation_run' | 'documentation' | 'paper' | 'leaderboard'
  /** Organization that produced/published the evaluation. */
  source_organization_name: string
  /** Homepage of that organization. */
  source_organization_url?: string
  /** Relationship of the evaluator to the model developer (first/third party, etc.). */
  evaluator_relationship: 'first_party' | 'third_party' | 'collaborative' | 'other'
  /** URL of the concrete source artifact. */
  source_url?: string
  /** Publication date — presumably an ISO 8601 date string; confirm. */
  publication_date?: string
}
/** Identity and descriptive metadata of an evaluated model. */
export interface ModelInfo {
  /** Human-readable model name. */
  name: string
  /** Machine identifier for the model (e.g. a repo/slug id). */
  id: string
  /** Organization that developed the model. */
  developer?: string
  /** Platform the inference was served on (e.g. a hosted API) — TODO confirm exact semantics. */
  inference_platform?: string
  /** Engine/runtime used for inference. */
  inference_engine?: string
  /** Model version or revision string. */
  model_version?: string
  /** Architecture description (free-form string). */
  architecture?: string
  /** Parameter count, kept as a string (may include units like "7B"). */
  parameter_count?: string
  /** Release date — presumably ISO 8601; confirm. */
  release_date?: string
  /** Canonical URL for the model. */
  model_url?: string
  /** Open-ended extra metadata; known keys listed, others allowed via the index signature. */
  additional_details?: {
    precision?: string
    // NOTE(review): duplicates the top-level `architecture` field — callers may populate either.
    architecture?: string
    params_billions?: number
    [key: string]: any
  }
  /** Supported input/output modalities (e.g. "text", "image") — exact vocabulary not visible here. */
  modalities?: {
    input: string[]
    output: string[]
  }
}
/** A single benchmark run: metric configuration, score, and optional factsheet. */
export interface EvaluationResult {
  /** Name of the benchmark/evaluation (used by inferCategoryFromBenchmark for categorization). */
  evaluation_name: string
  /** When the evaluation was executed — presumably ISO 8601; confirm. */
  evaluation_timestamp: string
  /** How the score should be interpreted (direction, range, units). */
  metric_config: MetricConfig
  /** The score itself plus optional statistics. */
  score_details: ScoreDetails
  /** Link to the full per-sample results, when available. */
  detailed_evaluation_results_url?: string
  /** Generation/sampling settings used to produce model outputs. */
  generation_config?: GenerationConfig
  /**
   * Free-form benchmark factsheet. All fields are optional descriptive strings
   * unless typed otherwise; exact semantics come from the upstream schema.
   */
  factsheet?: {
    purpose?: string
    principles_tested?: string
    functional_props?: string
    input_modality?: string
    output_modality?: string
    input_source?: string
    output_source?: string
    size?: string
    splits?: string
    design?: string
    judge?: string
    protocol?: string
    model_access?: string
    /** Whether the benchmark has a held-out split. */
    has_heldout?: boolean
    heldout_details?: string
    alignment_validation?: string
    /** Whether the benchmark is considered valid — NOTE(review): criteria not visible here. */
    is_valid?: boolean
    baseline_models?: string
    robustness_measures?: string
    known_limitations?: string
    benchmarks_list?: string
  }
}
/** Describes how to interpret a benchmark score. */
export interface MetricConfig {
  /** Human-readable description of what the metric measures. */
  evaluation_description: string
  /** True when a smaller score is better (e.g. error rate, latency). */
  lower_is_better: boolean
  /** Value domain of the score. */
  score_type: 'continuous' | 'discrete' | 'binary'
  /** Lower bound of the score range, when bounded. */
  min_score?: number
  /** Upper bound of the score range, when bounded. */
  max_score?: number
  /** Unit of the score (e.g. "%", "seconds") — exact vocabulary not visible here. */
  unit?: string
}
/** A score plus optional supporting statistics. */
export interface ScoreDetails {
  /** The primary score value; interpret via the accompanying MetricConfig. */
  score: number
  /** Arbitrary metric-specific breakdown (sub-scores, counts, …). */
  details?: Record<string, any>
  /** Confidence interval around the score. */
  confidence_interval?: {
    lower: number
    upper: number
    /** e.g. 0.95 for a 95% interval — TODO confirm it is a fraction rather than a percentage. */
    confidence_level: number
  }
  /** Number of samples the score was computed over. */
  sample_size?: number
  /** Standard error of the score estimate. */
  standard_error?: number
}
/** Sampling/generation settings used when producing model outputs. */
export interface GenerationConfig {
  /** Known sampling knobs; additional engine-specific args allowed via the index signature. */
  generation_args: {
    temperature?: number
    top_p?: number
    top_k?: number
    max_tokens?: number
    /** Whether an explicit reasoning mode was enabled — TODO confirm semantics with producers. */
    reasoning?: boolean
    [key: string]: any
  }
  /** Free-form notes about the generation setup. */
  additional_details?: string
  /** Prompt template used, when recorded. */
  prompt_template?: string
}
/** Per-sample record from a detailed evaluation run. */
export interface SampleResult {
  /** Identifier of the sample within its dataset. */
  sample_id: string
  /** The prompt/input given to the model. */
  input: string
  /** Reference answer, when the task has one. */
  ground_truth?: string
  /** The model's response. */
  response: string
  /** Answer choices, for multiple-choice tasks. */
  choices?: string[]
  /** Whether the response was judged correct. */
  is_correct?: boolean
  /** Arbitrary extra per-sample data. */
  metadata?: Record<string, any>
}
/**
 * Evaluation categories for classification.
 *
 * Declared `as const` so `CategoryType` is the union of these exact
 * string literals rather than `string`.
 */
export const EVALUATION_CATEGORIES = [
  'Core Performance',
  'Core Quality Dimensions',
  'Robustness',
  'Calibration',
  'Adversarial',
  'Memorization',
  'Fairness',
  'Safety',
  'Leakage/Contamination',
  'Privacy',
  'Interpretability',
  'Efficiency',
  'Retrainability',
  'Meta-Learning',
] as const

/** Union of the category literals above. */
export type CategoryType = typeof EVALUATION_CATEGORIES[number]
| /** | |
| * Helper to determine category from benchmark name | |
| */ | |
| export function inferCategoryFromBenchmark(benchmarkName: string): CategoryType { | |
| const name = benchmarkName.toLowerCase() | |
| // Category mappings | |
| if (name.includes('advglue') || name.includes('jailbreak') || name.includes('attack') || name.includes('adversarial') || name.includes('red-team')) { | |
| return 'Adversarial' | |
| } | |
| if (name.includes('fairness') || name.includes('bias') || name.includes('stereo') || name.includes('bbq') || name.includes('celeb') || name.includes('winobias')) { | |
| return 'Fairness' | |
| } | |
| if (name.includes('safety') || name.includes('harmful') || name.includes('toxic') || name.includes('truthful') || name.includes('unsafe')) { | |
| return 'Safety' | |
| } | |
| if (name.includes('leakage') || name.includes('contamination')) { | |
| return 'Leakage/Contamination' | |
| } | |
| if (name.includes('privacy') || name.includes('pii') || name.includes('gdpr') || name.includes('private')) { | |
| return 'Privacy' | |
| } | |
| if (name.includes('robust')) { | |
| return 'Robustness' | |
| } | |
| if (name.includes('calibration') || name.includes('confidence')) { | |
| return 'Calibration' | |
| } | |
| if (name.includes('memoriz') || name.includes('copyright')) { | |
| return 'Memorization' | |
| } | |
| if (name.includes('interpret') || name.includes('explain')) { | |
| return 'Interpretability' | |
| } | |
| if (name.includes('efficien') || name.includes('latency') || name.includes('throughput') || name.includes('speed')) { | |
| return 'Efficiency' | |
| } | |
| if (name.includes('retrain') || name.includes('forgetting')) { | |
| return 'Retrainability' | |
| } | |
| if (name.includes('meta') || name.includes('few-shot') || name.includes('learning')) { | |
| return 'Meta-Learning' | |
| } | |
| if (name.includes('mt-bench') || name.includes('quality') || name.includes('human') || name.includes('fact') || name.includes('hallucination')) { | |
| return 'Core Quality Dimensions' | |
| } | |
| // Default to Core Performance for standard benchmarks | |
| if (name.includes('mmlu') || name.includes('arc') || name.includes('hellaswag') || name.includes('winogrande') || name.includes('gpqa') || | |
| name.includes('gsm') || name.includes('math') || name.includes('minerva') || name.includes('mgsm') || | |
| name.includes('humaneval') || name.includes('mbpp') || name.includes('code') || name.includes('apps') || | |
| name.includes('vision') || name.includes('vqa') || name.includes('image') || name.includes('coco') || | |
| name.includes('multimodal') || name.includes('mmmu') || name.includes('seed-bench') || | |
| name.includes('bbh') || name.includes('reasoning') || name.includes('musr') || | |
| name.includes('xsum') || name.includes('summariz') || name.includes('dialog') || name.includes('translation') || name.includes('ifeval') || | |
| name.includes('creative') || name.includes('social') || name.includes('agent')) { | |
| return 'Core Performance' | |
| } | |
| return 'Core Performance' | |
| } | |
/**
 * Aggregate evaluations by model.
 */
export interface ModelEvaluationSummary {
  /** The model these evaluations belong to. */
  model_info: ModelInfo
  /** Evaluations grouped under each category key. */
  evaluations_by_category: Record<CategoryType, BenchmarkEvaluation[]>
  /** Total number of evaluations across all categories. */
  total_evaluations: number
  /** Most recent update timestamp — presumably ISO 8601; confirm. */
  last_updated: string
  /** Categories that have at least one evaluation. */
  categories_covered: CategoryType[]
}
/**
 * Display-friendly format for the UI.
 */
export interface EvaluationCardData {
  /** Stable identifier for the card. */
  id: string
  /** Display name of the model. */
  model_name: string
  /** Machine identifier of the model. */
  model_id: string
  /** Organization that developed the model. */
  developer: string
  /** Number of evaluation records aggregated into this card. */
  evaluations_count: number
  /** Number of distinct benchmarks covered. */
  benchmarks_count: number
  /** Categories represented on this card. */
  categories: CategoryType[]
  /** Per-category counts — NOTE(review): presumably evaluation counts per category; confirm with producer. */
  category_stats: Record<CategoryType, number>
  /** Timestamp of the most recent evaluation — presumably ISO 8601; confirm. */
  latest_timestamp: string
  // Quick stats
  top_scores: Array<{
    benchmark: string
    score: number
    metric: string
  }>
  // Links
  source_urls: string[]
  detail_urls: string[]
  // Model Metadata (from auxiliary sources or model_metadata.json)
  model_url?: string
  release_date?: string
  input_modalities?: string[]
  output_modalities?: string[]
  architecture?: string
  params?: string
  inference_engine?: string
  inference_platform?: string
}