Spaces:
Sleeping
Sleeping
from __future__ import annotations

import json
from dataclasses import asdict
from pathlib import Path

from services.solve.solver import MathSolver
from services.verify.verifier import verify_solution
def run_benchmark(
    testset_path: Path,
    index_path: Path,
    canonical_path: Path,
    output_path: Path,
) -> dict:
    """Run the solver over a JSONL test set, verify each answer, and write a report.

    Reads one JSON object per line from *testset_path* (each must carry
    ``question_id`` and ``question_text``) and from *canonical_path* (each
    must carry ``question_id``), solves every test question with a
    ``MathSolver`` built from *index_path*, and verifies each solved answer
    against its canonical row. Blank lines in either JSONL file are skipped
    instead of crashing ``json.loads``.

    Writes ``{"summary": ..., "results": ...}`` as indented JSON to
    *output_path* (parent directories are created as needed) and returns the
    summary dict with keys ``final_answer_correctness_rate``,
    ``mp_coverage_score``, ``format_compliance_score``, and ``n``.

    Raises:
        KeyError: if a test question_id has no row in the canonical file.
    """

    def _load_jsonl(path: Path) -> list[dict]:
        # Skip blank lines so trailing newlines / spacer lines don't break parsing.
        return [
            json.loads(line)
            for line in path.read_text(encoding="utf-8").splitlines()
            if line.strip()
        ]

    tests = _load_jsonl(testset_path)
    canonical = {row["question_id"]: row for row in _load_jsonl(canonical_path)}

    solver = MathSolver(str(index_path))
    results = []
    for t in tests:
        solved = solver.solve(t["question_text"])
        # Missing canonical rows raise KeyError deliberately: a silent skip
        # would inflate the reported rates.
        report = verify_solution(t, canonical[t["question_id"]], solved.answer)
        results.append(
            {
                "question_id": t["question_id"],
                "verification": asdict(report),
            }
        )

    n = len(results)
    if n == 0:
        # Empty test set: report zeros instead of dividing by zero.
        summary = {
            "final_answer_correctness_rate": 0.0,
            "mp_coverage_score": 0.0,
            "format_compliance_score": 0.0,
            "n": 0,
        }
    else:
        final_hits = sum(1 for r in results if r["verification"]["final_answer_correct"])
        format_hits = sum(1 for r in results if r["verification"]["format_compliant"])
        mp_total = sum(r["verification"]["mp_coverage"] for r in results)
        summary = {
            "final_answer_correctness_rate": final_hits / n,
            "mp_coverage_score": mp_total / n,
            "format_compliance_score": format_hits / n,
            "n": n,
        }

    output_path.parent.mkdir(parents=True, exist_ok=True)
    payload = {"summary": summary, "results": results}
    output_path.write_text(json.dumps(payload, indent=2), encoding="utf-8")
    return summary