Spaces:
Running
Running
Avijit Ghosh
Refactor: Update benchmarks with realistic data, fix UI stats, and improve About page
// Node built-ins for reading and rewriting the benchmark JSON files on disk.
const fs = require('fs');
const path = require('path');

// Directory containing the per-model benchmark JSON documents this script
// augments (resolved relative to this script: ../public/benchmarks).
const BENCHMARKS_DIR = path.join(__dirname, '..', 'public', 'benchmarks');
// Synthetic benchmark definitions to inject into every benchmark document,
// grouped by the evaluation category the UI filters on. Each benchmark
// carries a human-readable description plus the metric name (later used to
// pick the reported unit).
const MISSING_CATEGORIES = [
  {
    category: 'Calibration',
    benchmarks: [
      { name: 'Confidence Calibration Suite', description: 'Measures how well the model\'s predicted probabilities align with empirical frequencies.', metric: 'Expected Calibration Error (ECE)' },
    ],
  },
  {
    category: 'Adversarial',
    benchmarks: [
      { name: 'AdvGLUE', description: 'Adversarial GLUE benchmark for robustness against adversarial attacks.', metric: 'Robust Accuracy' },
      { name: 'JailbreakBench', description: 'Evaluates resistance to jailbreak attempts.', metric: 'Attack Success Rate' },
    ],
  },
  {
    category: 'Memorization',
    benchmarks: [
      { name: 'The Pile Extraction', description: 'Measures the rate of verbatim memorization from training data.', metric: 'Extraction Rate' },
      { name: 'Copyright Probe', description: 'Tests for reproduction of copyrighted material.', metric: 'Reproduction Score' },
    ],
  },
  {
    category: 'Fairness',
    benchmarks: [
      { name: 'BBQ (Bias Benchmark for QA)', description: 'Assesses bias in Question Answering models across multiple social dimensions.', metric: 'Bias Score' },
      { name: 'CrowS-Pairs', description: 'Crowdsourced Stereotype Pairs benchmark.', metric: 'Stereotype Score' },
    ],
  },
  {
    category: 'Leakage/Contamination',
    benchmarks: [
      { name: 'Contamination Detector', description: 'Detects if test set data was present in the training set.', metric: 'Contamination Score' },
    ],
  },
  {
    category: 'Privacy',
    benchmarks: [
      { name: 'PII Detection', description: 'Evaluates the model\'s tendency to leak Personally Identifiable Information.', metric: 'Leakage Rate' },
    ],
  },
  {
    category: 'Interpretability',
    benchmarks: [
      { name: 'Explainability Harness', description: 'Measures the quality and faithfulness of model explanations.', metric: 'Faithfulness Score' },
    ],
  },
  {
    category: 'Efficiency',
    benchmarks: [
      { name: 'LLMPerf', description: 'Benchmarks inference latency and throughput.', metric: 'Tokens/sec' },
    ],
  },
  {
    category: 'Retrainability',
    benchmarks: [
      { name: 'CL-Benchmark', description: 'Continual Learning benchmark to measure forgetting.', metric: 'Forgetting Rate' },
    ],
  },
  {
    category: 'Meta-Learning',
    benchmarks: [
      { name: 'Meta-Dataset', description: 'Evaluates few-shot learning capabilities across diverse domains.', metric: 'Few-shot Accuracy' },
    ],
  },
];
/**
 * Draws a pseudo-random score uniformly from [min, max).
 *
 * The defaults reproduce the original hard-coded range [0.4, 0.95);
 * callers may narrow or widen the range for specific metrics. Uses
 * Math.random(), so the output is not cryptographically secure — fine
 * for generating fake demo data.
 *
 * @param {number} [min=0.4] - Inclusive lower bound.
 * @param {number} [max=0.95] - Exclusive upper bound (must be >= min).
 * @returns {number} A score in [min, max).
 */
function generateScore(min = 0.4, max = 0.95) {
  return min + Math.random() * (max - min);
}
// Metric names matching this pattern measure failures (errors, attacks,
// leakage, bias, ...), so a LOWER value is better. The previous version
// hard-coded lower_is_better: false for every metric, which mislabels
// e.g. "Attack Success Rate" and "Expected Calibration Error (ECE)".
const LOWER_IS_BETTER_RE =
  /error|attack success|extraction|leakage|forgetting|bias|stereotype|contamination|reproduction/i;

/**
 * Appends a synthetic evaluation entry for each benchmark in
 * MISSING_CATEGORIES that is not already present in the document.
 * Mutates content.evaluation_results in place.
 *
 * @param {{evaluation_results: Array<object>}} content - Parsed benchmark JSON.
 */
function addMissingBenchmarks(content) {
  const existingNames = new Set(content.evaluation_results.map((r) => r.evaluation_name));
  for (const cat of MISSING_CATEGORIES) {
    for (const bench of cat.benchmarks) {
      if (existingNames.has(bench.name)) continue;
      const score = generateScore();
      content.evaluation_results.push({
        evaluation_name: bench.name,
        metric_config: {
          evaluation_description: bench.description,
          lower_is_better: LOWER_IS_BETTER_RE.test(bench.metric),
          score_type: "continuous",
          min_score: 0,
          max_score: 1,
          unit: bench.metric.toLowerCase().includes('rate') ? 'rate' : 'score'
        },
        score_details: {
          score: score,
          details: {
            subtask_a: score,
            subtask_b: score
          }
        },
        factsheet: {
          purpose: "Research; Development",
          principles_tested: cat.category,
          // functional_props is what the UI groups/filters on, so it must
          // carry the category name for the entry to show up.
          functional_props: cat.category,
          input_modality: "Text",
          output_modality: "Text",
          input_source: "Synthetic",
          output_source: "Automatic",
          size: "Medium",
          splits: "Test",
          design: "Fixed",
          judge: "Automatic",
          protocol: "Standard",
          model_access: "Outputs",
          has_heldout: false,
          alignment_validation: "None",
          baseline_models: "None",
          robustness_measures: "None",
          known_limitations: "Synthetic data",
          benchmarks_list: bench.name
        }
      });
      // Record the insertion so a duplicate name later in
      // MISSING_CATEGORIES cannot be inserted twice.
      existingNames.add(bench.name);
    }
  }
}

/**
 * Walks every .json document in BENCHMARKS_DIR, appends the missing
 * synthetic benchmarks, and rewrites the file pretty-printed (2-space
 * indent). Files that are unreadable, contain invalid JSON, or lack an
 * evaluation_results array are skipped with a warning instead of
 * aborting the whole run with some files already rewritten.
 */
function main() {
  const files = fs.readdirSync(BENCHMARKS_DIR);
  for (const file of files) {
    if (!file.endsWith('.json')) continue;
    const filePath = path.join(BENCHMARKS_DIR, file);
    let content;
    try {
      content = JSON.parse(fs.readFileSync(filePath, 'utf8'));
    } catch (err) {
      console.warn(`Skipping ${file}: unreadable or invalid JSON (${err.message})`);
      continue;
    }
    if (!Array.isArray(content.evaluation_results)) {
      console.warn(`Skipping ${file}: missing evaluation_results array`);
      continue;
    }
    console.log(`Adding fake benchmarks to ${file}...`);
    addMissingBenchmarks(content);
    fs.writeFileSync(filePath, JSON.stringify(content, null, 2));
  }
  console.log("Done adding fake benchmarks.");
}

main();