Spaces:
Running
Running
Avijit Ghosh
Refactor: Update benchmarks with realistic data, fix UI stats, and improve About page
// Node built-ins for reading and rewriting the benchmark JSON files on disk.
const fs = require('fs');
const path = require('path');

// Directory containing the per-model benchmark JSON documents this script
// augments (resolved relative to this script: ../public/benchmarks).
const BENCHMARKS_DIR = path.join(__dirname, '..', 'public', 'benchmarks');
// Synthetic benchmark definitions to inject into every benchmark document,
// grouped by the evaluation category the UI filters on. Each benchmark
// carries a human-readable description plus the metric name (later used to
// pick the reported unit).
const MISSING_CATEGORIES = [
  {
    category: 'Calibration',
    benchmarks: [
      { name: 'Confidence Calibration Suite', description: 'Measures how well the model\'s predicted probabilities align with empirical frequencies.', metric: 'Expected Calibration Error (ECE)' },
    ],
  },
  {
    category: 'Adversarial',
    benchmarks: [
      { name: 'AdvGLUE', description: 'Adversarial GLUE benchmark for robustness against adversarial attacks.', metric: 'Robust Accuracy' },
      { name: 'JailbreakBench', description: 'Evaluates resistance to jailbreak attempts.', metric: 'Attack Success Rate' },
    ],
  },
  {
    category: 'Memorization',
    benchmarks: [
      { name: 'The Pile Extraction', description: 'Measures the rate of verbatim memorization from training data.', metric: 'Extraction Rate' },
      { name: 'Copyright Probe', description: 'Tests for reproduction of copyrighted material.', metric: 'Reproduction Score' },
    ],
  },
  {
    category: 'Fairness',
    benchmarks: [
      { name: 'BBQ (Bias Benchmark for QA)', description: 'Assesses bias in Question Answering models across multiple social dimensions.', metric: 'Bias Score' },
      { name: 'CrowS-Pairs', description: 'Crowdsourced Stereotype Pairs benchmark.', metric: 'Stereotype Score' },
    ],
  },
  {
    category: 'Leakage/Contamination',
    benchmarks: [
      { name: 'Contamination Detector', description: 'Detects if test set data was present in the training set.', metric: 'Contamination Score' },
    ],
  },
  {
    category: 'Privacy',
    benchmarks: [
      { name: 'PII Detection', description: 'Evaluates the model\'s tendency to leak Personally Identifiable Information.', metric: 'Leakage Rate' },
    ],
  },
  {
    category: 'Interpretability',
    benchmarks: [
      { name: 'Explainability Harness', description: 'Measures the quality and faithfulness of model explanations.', metric: 'Faithfulness Score' },
    ],
  },
  {
    category: 'Efficiency',
    benchmarks: [
      { name: 'LLMPerf', description: 'Benchmarks inference latency and throughput.', metric: 'Tokens/sec' },
    ],
  },
  {
    category: 'Retrainability',
    benchmarks: [
      { name: 'CL-Benchmark', description: 'Continual Learning benchmark to measure forgetting.', metric: 'Forgetting Rate' },
    ],
  },
  {
    category: 'Meta-Learning',
    benchmarks: [
      { name: 'Meta-Dataset', description: 'Evaluates few-shot learning capabilities across diverse domains.', metric: 'Few-shot Accuracy' },
    ],
  },
];
/**
 * Draws a pseudo-random score uniformly from [min, max).
 *
 * The defaults reproduce the original hard-coded range [0.4, 0.95);
 * callers may narrow or widen the range for specific metrics. Uses
 * Math.random(), so the output is not cryptographically secure — fine
 * for generating fake demo data.
 *
 * @param {number} [min=0.4] - Inclusive lower bound.
 * @param {number} [max=0.95] - Exclusive upper bound (must be >= min).
 * @returns {number} A score in [min, max).
 */
function generateScore(min = 0.4, max = 0.95) {
  return min + Math.random() * (max - min);
}
// Metric names matching this pattern measure failures (errors, attacks,
// leakage, bias, ...), so a LOWER value is better. The previous version
// hard-coded lower_is_better: false for every metric, which mislabels
// e.g. "Attack Success Rate" and "Expected Calibration Error (ECE)".
const LOWER_IS_BETTER_RE =
  /error|attack success|extraction|leakage|forgetting|bias|stereotype|contamination|reproduction/i;

/**
 * Appends a synthetic evaluation entry for each benchmark in
 * MISSING_CATEGORIES that is not already present in the document.
 * Mutates content.evaluation_results in place.
 *
 * @param {{evaluation_results: Array<object>}} content - Parsed benchmark JSON.
 */
function addMissingBenchmarks(content) {
  const existingNames = new Set(content.evaluation_results.map((r) => r.evaluation_name));
  for (const cat of MISSING_CATEGORIES) {
    for (const bench of cat.benchmarks) {
      if (existingNames.has(bench.name)) continue;
      const score = generateScore();
      content.evaluation_results.push({
        evaluation_name: bench.name,
        metric_config: {
          evaluation_description: bench.description,
          lower_is_better: LOWER_IS_BETTER_RE.test(bench.metric),
          score_type: "continuous",
          min_score: 0,
          max_score: 1,
          unit: bench.metric.toLowerCase().includes('rate') ? 'rate' : 'score'
        },
        score_details: {
          score: score,
          details: {
            subtask_a: score,
            subtask_b: score
          }
        },
        factsheet: {
          purpose: "Research; Development",
          principles_tested: cat.category,
          // functional_props is what the UI groups/filters on, so it must
          // carry the category name for the entry to show up.
          functional_props: cat.category,
          input_modality: "Text",
          output_modality: "Text",
          input_source: "Synthetic",
          output_source: "Automatic",
          size: "Medium",
          splits: "Test",
          design: "Fixed",
          judge: "Automatic",
          protocol: "Standard",
          model_access: "Outputs",
          has_heldout: false,
          alignment_validation: "None",
          baseline_models: "None",
          robustness_measures: "None",
          known_limitations: "Synthetic data",
          benchmarks_list: bench.name
        }
      });
      // Record the insertion so a duplicate name later in
      // MISSING_CATEGORIES cannot be inserted twice.
      existingNames.add(bench.name);
    }
  }
}

/**
 * Walks every .json document in BENCHMARKS_DIR, appends the missing
 * synthetic benchmarks, and rewrites the file pretty-printed (2-space
 * indent). Files that are unreadable, contain invalid JSON, or lack an
 * evaluation_results array are skipped with a warning instead of
 * aborting the whole run with some files already rewritten.
 */
function main() {
  const files = fs.readdirSync(BENCHMARKS_DIR);
  for (const file of files) {
    if (!file.endsWith('.json')) continue;
    const filePath = path.join(BENCHMARKS_DIR, file);
    let content;
    try {
      content = JSON.parse(fs.readFileSync(filePath, 'utf8'));
    } catch (err) {
      console.warn(`Skipping ${file}: unreadable or invalid JSON (${err.message})`);
      continue;
    }
    if (!Array.isArray(content.evaluation_results)) {
      console.warn(`Skipping ${file}: missing evaluation_results array`);
      continue;
    }
    console.log(`Adding fake benchmarks to ${file}...`);
    addMissingBenchmarks(content);
    fs.writeFileSync(filePath, JSON.stringify(content, null, 2));
  }
  console.log("Done adding fake benchmarks.");
}

main();