#!/usr/bin/env node
/**
 * Migration script to convert old checkbox-based evaluations to the new
 * benchmark-first format.
 *
 * Usage: node scripts/migrate-to-benchmarks.mjs
 */
import fs from 'fs/promises'
import path from 'path'
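
// For reference, a minimal sketch of the old-format record this script
// expects to read from ./public/evaluations. The field names are inferred
// from the accessors in migrateEvaluation() below; real files may carry
// additional fields not shown here, and the "example-*" values are
// placeholders, not real data.
//
// {
//   "id": "example-model",
//   "systemName": "Example Model",
//   "provider": "Example Org",
//   "evaluator": "Example Org",
//   "url": "https://example.org",
//   "version": "1.0",
//   "categoryEvaluations": {
//     "problem-solving": {
//       "benchmarkSources": {
//         "q1": [
//           { "id": "s1", "benchmarkName": "MMLU", "score": "87.4%",
//             "sourceType": "external", "url": "https://example.org/run",
//             "metrics": "accuracy" }
//         ]
//       }
//     }
//   }
// }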

// Mapping of old categories to new categories
const CATEGORY_MAPPING = {
  'language-communication': 'language-communication',
  'problem-solving': 'reasoning',
  'creativity-innovation': 'creativity',
  'learning-memory': 'knowledge',
  'perception-vision': 'vision',
  'social-intelligence': 'social-intelligence',
  'harmful-content': 'toxicity',
  'bias-fairness': 'bias-fairness',
  'information-integrity': 'truthfulness',
  'security-robustness': 'robustness',
}

// Map old benchmark names to standardized ones; names not listed here pass
// through unchanged
const BENCHMARK_MAPPING = {
  'MMLU, HellaSwag, ARC-Challenge, WinoGrande': 'MMLU',
  'TruthfulQA': 'TruthfulQA',
  'BBH': 'BBH',
}

async function migrateEvaluation(oldData) {
  const evaluations = []
  const now = String(Math.floor(Date.now() / 1000)) // Unix seconds, no fractional part
  // Process each category
  for (const [categoryId, categoryData] of Object.entries(oldData.categoryEvaluations || {})) {
    // Translate the old category id; fall back to it when no mapping exists
    const newCategoryId = CATEGORY_MAPPING[categoryId] || categoryId
    const benchmarkSources = categoryData.benchmarkSources || {}
    // Process each benchmark question
    for (const [questionId, sources] of Object.entries(benchmarkSources)) {
      for (const source of sources) {
        if (!source.benchmarkName) continue
        // Normalize the benchmark name; fall back to the raw name when unmapped
        const benchmarkName = BENCHMARK_MAPPING[source.benchmarkName] || source.benchmarkName
        const evaluation = {
          schema_version: '0.1',
          evaluation_id: `${oldData.id}_${newCategoryId}_${questionId}_${source.id}`,
          retrieved_timestamp: now,
          source_data: {
            dataset_name: benchmarkName,
            samples_number: 0, // Unknown from old format
            dataset_version: source.version || 'unknown',
          },
          source_metadata: {
            source_name: oldData.evaluator || 'Unknown',
            source_type: source.sourceType === 'external' ? 'evaluation_run' : 'model_card',
            source_organization_name: oldData.provider || 'Unknown',
            source_organization_url: oldData.url || '',
            evaluator_relationship: source.sourceType === 'external' ? 'third_party' : 'first_party',
            source_url: source.url || '',
          },
          model_info: {
            name: oldData.systemName,
            id: oldData.id,
            developer: oldData.provider,
            model_version: oldData.version || oldData.modelTag,
            modalities: {
              input: oldData.inputModalities || ['text'],
              output: oldData.outputModalities || ['text'],
            },
          },
          evaluation_results: [
            {
              evaluation_name: `${benchmarkName} - ${source.metrics || 'accuracy'}`,
              evaluation_timestamp: now,
              metric_config: {
                evaluation_description: source.metrics || 'Accuracy',
                lower_is_better: false,
                score_type: 'continuous',
                min_score: 0.0,
                max_score: 1.0,
              },
              score_details: {
                score: parseScore(source.score),
                details: {},
              },
              detailed_evaluation_results_url: source.url,
              generation_config: {
                generation_args: {},
                additional_details: source.taskVariants || '',
              },
            },
          ],
        }
        evaluations.push(evaluation)
      }
    }
  }
  return evaluations
}
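
// For instance, given the sketch input above, the single "s1" source row
// yields one record with evaluation_id "example-model_reasoning_q1_s1"
// ("problem-solving" mapped to "reasoning" via CATEGORY_MAPPING) and a
// score_details.score of 0.874 once parseScore() rescales "87.4%".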

function parseScore(scoreString) {
  if (!scoreString) return 0
  // Extract the first number from a string like "87.4% MMLU"
  const match = scoreString.match(/(\d+\.?\d*)/)
  if (!match) return 0
  const value = parseFloat(match[1])
  // Values in (1, 100] look like percentages; rescale to the 0-1 range
  if (value > 1 && value <= 100) {
    return value / 100
  }
  return value
}
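
// A few illustrative conversions under the heuristic above:
//   parseScore('87.4% MMLU') -> 0.874  (percentage, rescaled to 0-1)
//   parseScore('0.62')       -> 0.62   (already in 0-1, kept as-is)
//   parseScore('')           -> 0      (missing scores default to zero)
// Note the heuristic cannot tell a raw score of, say, 42 from 42%; such
// cases need manual review after migration.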

async function main() {
  const oldEvalsDir = './public/evaluations'
  const newBenchmarksDir = './public/benchmarks'
  try {
    // Create benchmarks directory if it doesn't exist
    await fs.mkdir(newBenchmarksDir, { recursive: true })
    // Read all old evaluation files
    const files = await fs.readdir(oldEvalsDir)
    const jsonFiles = files.filter(f => f.endsWith('.json'))
    console.log(`Found ${jsonFiles.length} evaluation files to migrate`)
    for (const file of jsonFiles) {
      try {
        const filePath = path.join(oldEvalsDir, file)
        const content = await fs.readFile(filePath, 'utf-8')
        const oldData = JSON.parse(content)
        console.log(`\nMigrating ${file}...`)
        console.log(` System: ${oldData.systemName}`)
        const evaluations = await migrateEvaluation(oldData)
        console.log(` Generated ${evaluations.length} benchmark evaluations`)
        // Each evaluation could be written as its own file, or all combined;
        // for now, write only the first one as a sample
        if (evaluations.length > 0) {
          const outputFile = path.join(newBenchmarksDir, file)
          await fs.writeFile(
            outputFile,
            JSON.stringify(evaluations[0], null, 2)
          )
          console.log(` Wrote to ${outputFile}`)
        }
      } catch (error) {
        console.error(` Error migrating ${file}:`, error.message)
      }
    }
    console.log('\nMigration complete!')
    console.log('\nNote: This is a basic migration. You should:')
    console.log('1. Review generated files for accuracy')
    console.log('2. Add missing metadata (sample counts, confidence intervals)')
    console.log('3. Verify score conversions')
    console.log('4. Add detailed sample results if available')
  } catch (error) {
    console.error('Migration failed:', error)
    process.exit(1)
  }
}

main()