DataSense_E2B / eval_hackathon_benchmarks.json
sanjaymalladi's picture
DataSense E2B hackathon demo - Gradio agent, story, eval assets
52674b8 verified
Raw
History Blame Contribute Delete
1.6 kB
{
"description": "Hackathon eval — first complete parallel run (Modal T4). Straggler reruns overwrote volume; these are the authoritative numbers.",
"benchmarks": ["databench_test_15", "dsbench_analysis_10", "mentor_hard"],
"n_problems_per_model": 30,
"demo_adapter": "sanjaymalladi/DataSense-Modal-E2B-SFT",
"model_summaries": [
{
"suite": "Base",
"per_benchmark": {
"databench_test_15": 0.6,
"dsbench_analysis_10": 0.0,
"mentor_hard": 0.2
},
"macro_avg_accuracy": 0.26666666666666666,
"micro_avg_accuracy": 0.3333333333333333,
"total_correct": 10,
"total_problems": 30
},
{
"suite": "SFT v1",
"per_benchmark": {
"databench_test_15": 0.8666666666666667,
"dsbench_analysis_10": 0.0,
"mentor_hard": 0.6
},
"macro_avg_accuracy": 0.48888888888888893,
"micro_avg_accuracy": 0.5333333333333333,
"total_correct": 16,
"total_problems": 30,
"exec_ok_mentor_hard": 1.0
},
{
"suite": "EVTE-STaR Micro-1",
"per_benchmark": {
"databench_test_15": 0.8,
"dsbench_analysis_10": 0.0,
"mentor_hard": 1.0
},
"macro_avg_accuracy": 0.6,
"micro_avg_accuracy": 0.5666666666666667,
"total_correct": 17,
"total_problems": 30,
"exec_ok_mentor_hard": 1.0,
"dsbench_value_adjusted": {
"note": "Q15 exact dollar match maps to letter A; official scorer shows 0%",
"adjusted_correct": 1,
"adjusted_macro_avg_accuracy": 0.6333333333333333
}
}
]
}