# eval/benchmark.py
# Provenance: "Prepare Hugging Face Spaces deploy" (commit 554d9f2, author: Jovanseetk)
from __future__ import annotations
import json
from dataclasses import asdict
from pathlib import Path
from services.solve.solver import MathSolver
from services.verify.verifier import verify_solution
def run_benchmark(
    testset_path: Path,
    index_path: Path,
    canonical_path: Path,
    output_path: Path,
) -> dict:
    """Run the math-solver benchmark over a JSONL test set and write a report.

    Each line of *testset_path* and *canonical_path* is a JSON object; test
    rows are joined to canonical rows on ``question_id``. Every test question
    is solved with :class:`MathSolver` (built from *index_path*), verified
    against its canonical row, and aggregated into summary metrics.

    Args:
        testset_path: JSONL file of test questions (``question_id``,
            ``question_text``, ...).
        index_path: Path handed to ``MathSolver`` as its retrieval index.
        canonical_path: JSONL file of canonical solutions keyed by
            ``question_id``.
        output_path: Where the JSON report (summary + per-question results)
            is written; parent directories are created as needed.

    Returns:
        The summary dict (final-answer correctness rate, MP coverage score,
        format compliance score, and ``n``).

    Raises:
        ValueError: If the test set contains no questions (previously this
            surfaced as an opaque ``ZeroDivisionError``).
        KeyError: If a test question has no matching canonical row.
    """

    def _load_jsonl(path: Path) -> list[dict]:
        # Skip blank lines so a trailing/embedded empty line doesn't crash
        # json.loads; read as UTF-8 explicitly rather than the platform default.
        return [
            json.loads(line)
            for line in path.read_text(encoding="utf-8").splitlines()
            if line.strip()
        ]

    tests = _load_jsonl(testset_path)
    if not tests:
        raise ValueError(f"no test questions found in {testset_path}")

    canonical = {row["question_id"]: row for row in _load_jsonl(canonical_path)}

    solver = MathSolver(str(index_path))
    results = []
    for t in tests:
        solved = solver.solve(t["question_text"])
        # Raises KeyError if the test set references an unknown question_id.
        report = verify_solution(t, canonical[t["question_id"]], solved.answer)
        results.append(
            {
                "question_id": t["question_id"],
                "verification": asdict(report),
            }
        )

    n = len(results)
    summary = {
        "final_answer_correctness_rate": sum(
            1 for r in results if r["verification"]["final_answer_correct"]
        ) / n,
        "mp_coverage_score": sum(
            r["verification"]["mp_coverage"] for r in results
        ) / n,
        "format_compliance_score": sum(
            1 for r in results if r["verification"]["format_compliant"]
        ) / n,
        "n": n,
    }

    output_path.parent.mkdir(parents=True, exist_ok=True)
    payload = {"summary": summary, "results": results}
    output_path.write_text(
        json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8"
    )
    return summary