"""Batch evaluation script for the Math Mentor pipeline.""" from __future__ import annotations import json import os import sys import time from datetime import datetime from pathlib import Path # Add project root to path sys.path.insert(0, str(Path(__file__).parent.parent)) from ui.callbacks import run_pipeline, new_thread_id def load_test_problems() -> list[dict]: path = Path(__file__).parent / "test_problems.json" with open(path, "r") as f: return json.loads(f.read()) def evaluate_single(problem: dict) -> dict: """Run a single problem through the pipeline and collect results.""" thread_id = new_thread_id() question = problem["question"] final_state = {} try: for update in run_pipeline( input_text=question, input_image=None, input_audio=None, input_mode="Text", thread_id=thread_id, chat_history=[], ): node = update["node"] output = update["output"] if node != "error": for k, v in output.items(): final_state[k] = v else: return { **problem, "actual_answer": "", "actual_topic": "", "confidence": 0, "error": output.get("error", "Unknown"), "correct": False, } except Exception as e: return { **problem, "actual_answer": "", "actual_topic": "", "confidence": 0, "error": str(e), "correct": False, } solution = final_state.get("solution", "") topic = final_state.get("problem_topic", "") confidence = final_state.get("final_confidence", 0) verification = final_state.get("verification_result", {}) return { **problem, "actual_answer": solution, "actual_topic": topic, "confidence": confidence, "verified_correct": verification.get("is_correct", False), "verification_confidence": verification.get("confidence", 0), "error": None, } def run_evaluation(): problems = load_test_problems() print(f"Running evaluation on {len(problems)} problems...\n") results = [] topic_stats: dict[str, dict] = {} total_start = time.time() for i, problem in enumerate(problems): print(f"[{i+1}/{len(problems)}] {problem['question'][:60]}...") t0 = time.time() result = evaluate_single(problem) result["time_seconds"] = round(time.time() - t0, 1) results.append(result) topic = problem["topic"] if topic not in topic_stats: topic_stats[topic] = {"total": 0, "verified": 0, "errors": 0} topic_stats[topic]["total"] += 1 if result.get("error"): topic_stats[topic]["errors"] += 1 elif result.get("verified_correct"): topic_stats[topic]["verified"] += 1 status = "✓" if result.get("verified_correct") else ("✗ ERROR" if result.get("error") else "✗") print(f" → {status} | confidence: {result.get('confidence', 0):.2f} | {result['time_seconds']}s") # Summary print("\n" + "=" * 60) print("EVALUATION SUMMARY") print("=" * 60) total = len(results) verified = sum(1 for r in results if r.get("verified_correct")) errors = sum(1 for r in results if r.get("error")) avg_confidence = sum(r.get("confidence", 0) for r in results) / total if total else 0 print(f"\nOverall: {verified}/{total} verified correct ({verified/total*100:.1f}%)") print(f"Errors: {errors}/{total}") print(f"Average confidence: {avg_confidence:.2f}") print("\nPer-topic breakdown:") for topic, stats in sorted(topic_stats.items()): acc = stats["verified"] / stats["total"] * 100 if stats["total"] else 0 print(f" {topic}: {stats['verified']}/{stats['total']} ({acc:.0f}%) | errors: {stats['errors']}") # Save results output_dir = Path(__file__).parent / "results" output_dir.mkdir(exist_ok=True) timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") output_path = output_dir / f"eval_{timestamp}.json" report = { "timestamp": datetime.now().isoformat(), "total_problems": total, "verified_correct": verified, "accuracy": verified / total if total else 0, "errors": errors, "avg_confidence": avg_confidence, "per_topic": topic_stats, "results": results, } with open(output_path, "w") as f: json.dump(report, f, indent=2, default=str) total_time = round(time.time() - total_start, 1) report["total_time_seconds"] = total_time print(f"Total time: {total_time}s ({total_time/total:.1f}s avg per problem)") print(f"\nFull results saved to: {output_path}") # Also generate a markdown report md_path = output_dir / f"eval_{timestamp}.md" md_lines = [ "# Evaluation Report", f"**Date:** {datetime.now().strftime('%Y-%m-%d %H:%M')}", f"**Total problems:** {total}", f"**Verified correct:** {verified}/{total} ({verified/total*100:.1f}%)", f"**Errors:** {errors}/{total}", f"**Avg confidence:** {avg_confidence:.2f}", f"**Total time:** {total_time}s", "", "## Per-topic Breakdown", "| Topic | Correct | Total | Accuracy | Errors |", "|-------|---------|-------|----------|--------|", ] for topic, stats in sorted(topic_stats.items()): acc = stats["verified"] / stats["total"] * 100 if stats["total"] else 0 md_lines.append(f"| {topic} | {stats['verified']} | {stats['total']} | {acc:.0f}% | {stats['errors']} |") md_lines.extend(["", "## Individual Results", "| # | Question | Correct | Confidence | Time |", "|---|----------|---------|------------|------|"]) for r in results: q = r["question"][:50] ok = "Yes" if r.get("verified_correct") else ("ERR" if r.get("error") else "No") md_lines.append(f"| {r['id']} | {q} | {ok} | {r.get('confidence', 0):.2f} | {r.get('time_seconds', 0)}s |") with open(md_path, "w") as f: f.write("\n".join(md_lines)) print(f"Markdown report: {md_path}") if __name__ == "__main__": run_evaluation()