// general-eval-card / scripts / generate-fake-benchmarks.js
// Author: Avijit Ghosh
// Refactor: Update benchmarks with realistic data, fix UI stats, and improve About page
// Commit: 2554366
const fs = require('fs');
const path = require('path');
// Directory holding the per-model benchmark JSON files this script rewrites,
// resolved relative to this script's location (scripts/ -> public/benchmarks).
const BENCHMARKS_DIR = path.join(__dirname, '..', 'public', 'benchmarks');
// Evaluation categories (and their benchmark descriptors) to inject into every
// benchmark JSON file. Each entry pairs a `category` label — written into the
// generated factsheet's `principles_tested` / `functional_props` fields so the
// UI can group results — with one or more benchmarks, each carrying a display
// `name`, a human-readable `description`, and a `metric` label (only used to
// pick the 'rate' vs 'score' unit in main()).
// NOTE: these results are synthetic placeholders; scores come from
// generateScore(), not from running the named benchmarks.
const MISSING_CATEGORIES = [
  {
    category: 'Calibration',
    benchmarks: [
      {
        name: 'Confidence Calibration Suite',
        description: 'Measures how well the model\'s predicted probabilities align with empirical frequencies.',
        metric: 'Expected Calibration Error (ECE)'
      }
    ]
  },
  {
    category: 'Adversarial',
    benchmarks: [
      {
        name: 'AdvGLUE',
        description: 'Adversarial GLUE benchmark for robustness against adversarial attacks.',
        metric: 'Robust Accuracy'
      },
      {
        name: 'JailbreakBench',
        description: 'Evaluates resistance to jailbreak attempts.',
        metric: 'Attack Success Rate'
      }
    ]
  },
  {
    category: 'Memorization',
    benchmarks: [
      {
        name: 'The Pile Extraction',
        description: 'Measures the rate of verbatim memorization from training data.',
        metric: 'Extraction Rate'
      },
      {
        name: 'Copyright Probe',
        description: 'Tests for reproduction of copyrighted material.',
        metric: 'Reproduction Score'
      }
    ]
  },
  {
    category: 'Fairness',
    benchmarks: [
      {
        name: 'BBQ (Bias Benchmark for QA)',
        description: 'Assesses bias in Question Answering models across multiple social dimensions.',
        metric: 'Bias Score'
      },
      {
        name: 'CrowS-Pairs',
        description: 'Crowdsourced Stereotype Pairs benchmark.',
        metric: 'Stereotype Score'
      }
    ]
  },
  {
    category: 'Leakage/Contamination',
    benchmarks: [
      {
        name: 'Contamination Detector',
        description: 'Detects if test set data was present in the training set.',
        metric: 'Contamination Score'
      }
    ]
  },
  {
    category: 'Privacy',
    benchmarks: [
      {
        name: 'PII Detection',
        description: 'Evaluates the model\'s tendency to leak Personally Identifiable Information.',
        metric: 'Leakage Rate'
      }
    ]
  },
  {
    category: 'Interpretability',
    benchmarks: [
      {
        name: 'Explainability Harness',
        description: 'Measures the quality and faithfulness of model explanations.',
        metric: 'Faithfulness Score'
      }
    ]
  },
  {
    category: 'Efficiency',
    benchmarks: [
      {
        name: 'LLMPerf',
        description: 'Benchmarks inference latency and throughput.',
        metric: 'Tokens/sec'
      }
    ]
  },
  {
    category: 'Retrainability',
    benchmarks: [
      {
        name: 'CL-Benchmark',
        description: 'Continual Learning benchmark to measure forgetting.',
        metric: 'Forgetting Rate'
      }
    ]
  },
  {
    category: 'Meta-Learning',
    benchmarks: [
      {
        name: 'Meta-Dataset',
        description: 'Evaluates few-shot learning capabilities across diverse domains.',
        metric: 'Few-shot Accuracy'
      }
    ]
  }
];
/**
 * Produce a pseudo-random fake score uniformly distributed in [0.4, 0.95).
 * Not cryptographically secure — fine for placeholder benchmark data.
 * @returns {number} score in the half-open interval [0.4, 0.95)
 */
function generateScore() {
  const floor = 0.4;
  const span = 0.55;
  return floor + span * Math.random();
}
/**
 * Injects synthetic benchmark results for every category in MISSING_CATEGORIES
 * into each *.json file under BENCHMARKS_DIR.
 *
 * Per file: parse it, skip benchmarks whose evaluation_name is already present,
 * append a fabricated result (random score via generateScore()) whose factsheet
 * carries the category in `functional_props` for the UI, then rewrite the file
 * pretty-printed (2-space indent).
 *
 * Fix: a file without an `evaluation_results` array previously crashed with a
 * TypeError on `.map`; such files now get an empty array created for them.
 */
function main() {
  const files = fs.readdirSync(BENCHMARKS_DIR);
  for (const file of files) {
    if (!file.endsWith('.json')) continue;
    const filePath = path.join(BENCHMARKS_DIR, file);
    const content = JSON.parse(fs.readFileSync(filePath, 'utf8'));
    console.log(`Adding fake benchmarks to ${file}...`);
    // Guard: tolerate files that lack the results array instead of throwing.
    if (!Array.isArray(content.evaluation_results)) {
      content.evaluation_results = [];
    }
    const existingNames = new Set(content.evaluation_results.map((r) => r.evaluation_name));
    for (const cat of MISSING_CATEGORIES) {
      for (const bench of cat.benchmarks) {
        if (existingNames.has(bench.name)) continue;
        // Record the name so a benchmark repeated across categories is added once.
        existingNames.add(bench.name);
        const score = generateScore();
        content.evaluation_results.push({
          evaluation_name: bench.name,
          metric_config: {
            evaluation_description: bench.description,
            lower_is_better: false,
            score_type: "continuous",
            min_score: 0,
            max_score: 1,
            // Metrics named "...Rate" get unit 'rate'; everything else 'score'.
            unit: bench.metric.toLowerCase().includes('rate') ? 'rate' : 'score'
          },
          score_details: {
            score: score,
            details: {
              // Both subtasks reuse the overall score — placeholder detail only.
              subtask_a: score,
              subtask_b: score
            }
          },
          factsheet: {
            purpose: "Research; Development",
            principles_tested: cat.category,
            functional_props: cat.category, // This is key for the UI to pick it up
            input_modality: "Text",
            output_modality: "Text",
            input_source: "Synthetic",
            output_source: "Automatic",
            size: "Medium",
            splits: "Test",
            design: "Fixed",
            judge: "Automatic",
            protocol: "Standard",
            model_access: "Outputs",
            has_heldout: false,
            alignment_validation: "None",
            baseline_models: "None",
            robustness_measures: "None",
            known_limitations: "Synthetic data",
            benchmarks_list: bench.name
          }
        });
      }
    }
    fs.writeFileSync(filePath, JSON.stringify(content, null, 2));
  }
  console.log("Done adding fake benchmarks.");
}

main();