{ "description": "Hackathon eval — first complete parallel run (Modal T4). Straggler reruns overwrote volume; these are the authoritative numbers.", "benchmarks": ["databench_test_15", "dsbench_analysis_10", "mentor_hard"], "n_problems_per_model": 30, "demo_adapter": "sanjaymalladi/DataSense-Modal-E2B-SFT", "model_summaries": [ { "suite": "Base", "per_benchmark": { "databench_test_15": 0.6, "dsbench_analysis_10": 0.0, "mentor_hard": 0.2 }, "macro_avg_accuracy": 0.26666666666666666, "micro_avg_accuracy": 0.3333333333333333, "total_correct": 10, "total_problems": 30 }, { "suite": "SFT v1", "per_benchmark": { "databench_test_15": 0.8666666666666667, "dsbench_analysis_10": 0.0, "mentor_hard": 0.6 }, "macro_avg_accuracy": 0.48888888888888893, "micro_avg_accuracy": 0.5333333333333333, "total_correct": 16, "total_problems": 30, "exec_ok_mentor_hard": 1.0 }, { "suite": "EVTE-STaR Micro-1", "per_benchmark": { "databench_test_15": 0.8, "dsbench_analysis_10": 0.0, "mentor_hard": 1.0 }, "macro_avg_accuracy": 0.6, "micro_avg_accuracy": 0.5666666666666667, "total_correct": 17, "total_problems": 30, "exec_ok_mentor_hard": 1.0, "dsbench_value_adjusted": { "note": "Q15 exact dollar match maps to letter A; official scorer shows 0%", "adjusted_correct": 1, "adjusted_macro_avg_accuracy": 0.6333333333333333 } } ] }