general-eval-card / scripts /refine-fake-data.js
Avijit Ghosh
Refactor: Update benchmarks with realistic data, fix UI stats, and improve About page
2554366
const fs = require('fs');
const path = require('path');
// Directory of benchmark JSON files (../public/benchmarks relative to this
// script) that main() reads and rewrites in place.
const BENCHMARKS_DIR = path.join(__dirname, '..', 'public', 'benchmarks');
// Per-file model configuration, keyed by benchmark file name.
// `tier` selects the entries used in TIER_BASE_SCORES / TIER_AVAILABILITY;
// `name` is the model's display name.
const MODEL_TIERS = {
  // Tier 1
  'openai-gpt-4o.json': {
    tier: 1,
    name: 'GPT-4o',
  },
  'anthropic-claude-3-5-sonnet.json': {
    tier: 1,
    name: 'Claude 3.5 Sonnet',
  },

  // Tier 2
  'meta-llama-3-70b.json': {
    tier: 2,
    name: 'Llama 3 70B',
  },
  'mistral-mistral-large.json': {
    tier: 2,
    name: 'Mistral Large',
  },
  'alibaba-qwen-2-72b.json': {
    tier: 2,
    name: 'Qwen 2 72B',
  },

  // Tier 3
  'google-gemma-2-27b.json': {
    tier: 3,
    name: 'Gemma 2 27B',
  },
};
// Centre of the generated score range for each tier, on a 0.0 - 1.0 scale.
// generateTieredScore() adds random jitter around these values.
const TIER_BASE_SCORES = {
  1: 0.88,
  2: 0.78,
  3: 0.65,
};
// Per-tier probability that a "niche" (fake-category) benchmark is kept
// for a model; results that lose the draw are removed from the file.
const TIER_AVAILABILITY = {
  1: 0.95, // top tier keeps nearly every benchmark
  2: 0.70,
  3: 0.40,
};
// Category labels treated as synthetic ("fake") data. A result whose
// factsheet.functional_props contains one of these labels is subject to
// availability filtering and score regeneration.
const FAKE_CATEGORIES = [
  'Calibration',
  'Adversarial',
  'Memorization',
  'Fairness',
  'Leakage/Contamination',
  'Privacy',
  'Interpretability',
  'Efficiency',
  'Retrainability',
  'Meta-Learning',
];
/**
 * Pick a random instant between two dates.
 *
 * @param {Date} start - Beginning of the range.
 * @param {Date} end - End of the range.
 * @returns {Date} A new Date at `start + r * (end - start)`, where `r` is
 *   the value returned by Math.random().
 */
function getRandomDate(start, end) {
  const startMs = start.getTime();
  const spanMs = end.getTime() - startMs;
  const offsetMs = Math.random() * spanMs;
  return new Date(startMs + offsetMs);
}
/**
 * Generate a randomised score for a model tier.
 *
 * The tier's base score from TIER_BASE_SCORES is shifted by a random
 * offset in [-0.10, +0.10) and the result is clamped to [0.1, 0.99].
 *
 * @param {number} tier - Key into TIER_BASE_SCORES.
 * @returns {number} The clamped score. A tier with no base score yields NaN.
 */
function generateTieredScore(tier) {
  const LOWEST = 0.1;
  const HIGHEST = 0.99;

  const jitter = Math.random() * 0.2 - 0.1;
  const unclamped = TIER_BASE_SCORES[tier] + jitter;

  if (unclamped > HIGHEST) {
    return HIGHEST;
  }
  if (unclamped < LOWEST) {
    return LOWEST;
  }
  return unclamped;
}
// Width of the window, ending at a file's retrieval date, from which each
// result's evaluation timestamp is drawn (30 days in milliseconds).
const EVAL_WINDOW_MS = 30 * 24 * 60 * 60 * 1000;

/**
 * Report whether a result belongs to one of the FAKE_CATEGORIES.
 *
 * @param {object} result - One entry of `evaluation_results`.
 * @returns {boolean} True when `factsheet.functional_props` contains any
 *   fake-category label; false when the field is absent or empty.
 */
function isFakeCategoryResult(result) {
  const category = result.factsheet?.functional_props || '';
  return FAKE_CATEGORIES.some((label) => category.includes(label));
}

/**
 * Refresh one result in place: assign a new evaluation timestamp and, for
 * fake-category results, a new tier-based score.
 *
 * @param {object} result - Result to update (mutated).
 * @param {number} tier - Model tier used to generate the score.
 * @param {Date} fileDate - Retrieval date of the containing file.
 */
function refineResult(result, tier, fileDate) {
  // Each benchmark was "evaluated" some time in the 30 days before retrieval.
  const windowStart = new Date(fileDate.getTime() - EVAL_WINDOW_MS);
  result.evaluation_timestamp = getRandomDate(windowStart, fileDate).toISOString();

  if (!isFakeCategoryResult(result)) {
    return;
  }

  const scoreDetails = result.score_details;
  // A result without a score_details object has nothing to rewrite; leave it
  // alone instead of throwing and aborting the whole run.
  if (scoreDetails === null || typeof scoreDetails !== 'object') {
    return;
  }

  const newScore = generateTieredScore(tier);
  scoreDetails.score = newScore;
  if (scoreDetails.details) {
    // Every detail entry is overwritten with the same headline score.
    for (const key of Object.keys(scoreDetails.details)) {
      scoreDetails.details[key] = newScore;
    }
  }
}

/**
 * Rewrite every `.json` file in BENCHMARKS_DIR with refreshed fake data.
 *
 * For each file: sets `retrieved_timestamp` to a random date between
 * 2024-08-01 and 2024-12-15, randomly drops fake-category results according
 * to the model tier's availability, gives every remaining result a new
 * `evaluation_timestamp`, regenerates the scores of fake-category results,
 * and writes the file back with 2-space indentation.
 *
 * Files whose JSON has no `evaluation_results` array are skipped with a
 * warning and left untouched.
 *
 * @returns {void}
 * @throws {Error} If the directory or a file cannot be read or written, or
 *   a `.json` file does not contain valid JSON.
 */
function main() {
  const startDate = new Date('2024-08-01');
  const endDate = new Date('2024-12-15');
  const jsonFiles = fs.readdirSync(BENCHMARKS_DIR).filter((name) => name.endsWith('.json'));

  for (const file of jsonFiles) {
    const filePath = path.join(BENCHMARKS_DIR, file);
    const content = JSON.parse(fs.readFileSync(filePath, 'utf8'));

    // Not every JSON file in the directory is necessarily a benchmark file;
    // skip anything that lacks the expected results array.
    if (!Array.isArray(content?.evaluation_results)) {
      console.warn(`Skipping ${file}: no evaluation_results array.`);
      continue;
    }

    const modelConfig = MODEL_TIERS[file] || { tier: 2 }; // Default to tier 2
    const { tier } = modelConfig;
    console.log(`Refining ${file} (Tier ${tier})...`);

    // Update top-level timestamp
    const fileDate = getRandomDate(startDate, endDate);
    content.retrieved_timestamp = fileDate.toISOString();

    // Fake-category benchmarks are only "available" with the tier's
    // probability; everything else is always kept.
    const keptResults = content.evaluation_results.filter(
      (result) => !isFakeCategoryResult(result) || !(Math.random() > TIER_AVAILABILITY[tier])
    );
    for (const result of keptResults) {
      refineResult(result, tier, fileDate);
    }
    content.evaluation_results = keptResults;

    fs.writeFileSync(filePath, JSON.stringify(content, null, 2));
  }
  console.log("Done refining data.");
}
// Entry point: run the refinement as soon as the script is loaded.
main();