leaderboard / preprocess_evals.py
tareknaser's picture
feat: include tag in experiment ID
c830a77 unverified
#!/usr/bin/env python3
import json
from pathlib import Path
import sys
sys.path.insert(0, str(Path(__file__).parent))
from src.leaderboard.eval_processor import process_eval_file
def preprocess_eval_file(eval_path: Path, output_dir: Path, experiment_name: str) -> dict:
    """Convert one eval log into a JSON experiment file and return its summary.

    The log's ``tag`` (from ``log.eval.metadata``) and model id (``log.eval.model``)
    are read from the log header, the samples are produced by
    ``process_eval_file``, and everything is written to
    ``<output_dir>/<experiment name, lower-cased, spaces as underscores>_<tag>.json``.

    Args:
        eval_path: Path to the eval log to process.
        output_dir: Existing directory that receives the generated JSON file.
        experiment_name: Display name of the experiment; also used to build
            the output file name.

    Returns:
        A dict with the keys ``name``, ``data_file`` (file name only, no
        directory), ``tag`` and ``model_id``.
    """
    # Local import kept from the original implementation.
    from inspect_ai.log import read_eval_log

    # Only `eval.metadata` and `eval.model` are needed here, so skip loading
    # the samples; `process_eval_file` reads the log itself below.
    log = read_eval_log(eval_path, header_only=True)
    metadata = log.eval.metadata
    tag = metadata.get('tag', 'unknown') if metadata else 'unknown'
    model_id = log.eval.model

    # Presumably a pandas DataFrame (it is only used via `.to_dict(orient='records')`).
    df = process_eval_file(eval_path, experiment_name)

    experiment_data = {
        "name": experiment_name,
        "model_id": model_id,
        "tag": tag,
        "eval_filename": eval_path.name,
        "samples": df.to_dict(orient='records'),
    }

    # A path separator inside the name or tag would redirect the write into a
    # subdirectory that does not exist and make `output_file.name` disagree
    # with the path actually written, so flatten separators into underscores.
    file_stem = f"{experiment_name.replace(' ', '_').lower()}_{tag}"
    for separator in ("/", "\\"):
        file_stem = file_stem.replace(separator, "_")
    output_file = output_dir / f"{file_stem}.json"

    with open(output_file, 'w', encoding="utf-8") as f:
        json.dump(experiment_data, f, indent=2)

    return {
        "name": experiment_name,
        "data_file": output_file.name,
        "tag": tag,
        "model_id": model_id,
    }
def main():
    """Preprocess every experiment listed in ``data/leaderboard_config.json``.

    All paths are resolved relative to the directory containing this file.
    Each configured experiment whose eval file exists is passed to
    ``preprocess_eval_file``; the collected summaries are written to
    ``data/processed_config.json`` under the ``"experiments"`` key.

    Exits with status 1 when the config file is missing or lists no
    experiments. Experiments whose eval file does not exist are skipped with
    a warning on stderr.
    """
    repo_root = Path(__file__).parent
    output_dir = repo_root / "data" / "experiments"
    config_path = repo_root / "data" / "leaderboard_config.json"
    output_dir.mkdir(parents=True, exist_ok=True)

    if not config_path.exists():
        print(f"Error: {config_path} does not exist")
        sys.exit(1)

    with open(config_path, encoding="utf-8") as f:
        config = json.load(f)

    if not config.get("experiments"):
        print("Error: No experiments in config")
        sys.exit(1)

    experiments = []
    for exp_config in config["experiments"]:
        eval_path = repo_root / exp_config["eval_path"]
        experiment_name = exp_config["name"]
        if not eval_path.exists():
            # Still skip the entry, but say so: a silent skip hides typos in
            # the config and makes the final count look unexplained.
            print(
                f"Warning: skipping experiment '{experiment_name}': "
                f"{eval_path} does not exist",
                file=sys.stderr,
            )
            continue
        exp_info = preprocess_eval_file(eval_path, output_dir, experiment_name)
        experiments.append(exp_info)

    processed_config_path = repo_root / "data" / "processed_config.json"
    new_config = {"experiments": experiments}
    with open(processed_config_path, 'w', encoding="utf-8") as f:
        json.dump(new_config, f, indent=2)

    print(f"Preprocessed {len(experiments)} experiments")


if __name__ == "__main__":
    main()