Spaces:
Sleeping
Sleeping
| """Batch evaluation script for the Math Mentor pipeline.""" | |
| from __future__ import annotations | |
| import json | |
| import os | |
| import sys | |
| import time | |
| from datetime import datetime | |
| from pathlib import Path | |
| # Add project root to path | |
| sys.path.insert(0, str(Path(__file__).parent.parent)) | |
| from ui.callbacks import run_pipeline, new_thread_id | |
def load_test_problems() -> list[dict]:
    """Load the evaluation problems stored next to this script.

    Reads ``test_problems.json`` from the directory that contains this
    file. The file is decoded explicitly as UTF-8 so that non-ASCII
    characters in the problem text load the same way on every platform,
    instead of depending on the locale's default encoding.

    Returns:
        The parsed JSON document. It is expected to be a list of problem
        dicts; the rest of this script reads the "question", "topic" and
        "id" keys of each entry.

    Raises:
        FileNotFoundError: If ``test_problems.json`` does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    path = Path(__file__).parent / "test_problems.json"
    with open(path, "r", encoding="utf-8") as f:
        # Parse straight from the file object rather than via f.read().
        return json.load(f)
def _failed_result(problem: dict, message: str) -> dict:
    """Build the result record for a problem the pipeline failed on."""
    return {
        **problem,
        "actual_answer": "",
        "actual_topic": "",
        "confidence": 0,
        # Same verification keys as a successful record, so every result
        # has a uniform shape for consumers of the saved report.
        "verified_correct": False,
        "verification_confidence": 0,
        "error": message,
        "correct": False,
    }


def evaluate_single(problem: dict) -> dict:
    """Run a single problem through the pipeline and collect results.

    The problem's "question" is streamed through ``run_pipeline`` in text
    mode on a fresh thread id. The "output" dict of every non-error update
    is merged into one accumulated state (later updates overwrite earlier
    keys), and the answer, topic, confidence and verification fields are
    read from that state once the stream ends.

    Args:
        problem: Problem record; must contain a "question" key. All of its
            keys are copied into the returned record.

    Returns:
        A dict with the problem's own keys plus "actual_answer",
        "actual_topic", "confidence", "verified_correct",
        "verification_confidence" and "error". "error" is ``None`` on
        success and a non-empty string on failure; failure records also
        carry "correct": False.

    Raises:
        KeyError: If ``problem`` has no "question" key.
    """
    thread_id = new_thread_id()
    question = problem["question"]
    final_state: dict = {}
    try:
        for update in run_pipeline(
            input_text=question,
            input_image=None,
            input_audio=None,
            input_mode="Text",
            thread_id=thread_id,
            chat_history=[],
        ):
            output = update["output"]
            if update["node"] == "error":
                # Fall back to "Unknown" for a missing *or empty* message:
                # callers detect failures by the truthiness of "error".
                return _failed_result(problem, output.get("error") or "Unknown")
            final_state.update(output)
    except Exception as e:
        # Some exceptions stringify to "", which would make the failure
        # look like a success to truthiness checks; use repr() then.
        return _failed_result(problem, str(e) or repr(e))

    # A key that is present but set to None must behave like a missing key.
    verification = final_state.get("verification_result") or {}
    return {
        **problem,
        "actual_answer": final_state.get("solution", ""),
        "actual_topic": final_state.get("problem_topic", ""),
        "confidence": final_state.get("final_confidence") or 0,
        "verified_correct": verification.get("is_correct", False),
        "verification_confidence": verification.get("confidence", 0),
        "error": None,
    }
def _percent(part: float, whole: float) -> float:
    """Return ``part / whole`` as a percentage, or 0.0 when ``whole`` is 0."""
    return part / whole * 100 if whole else 0.0


def _markdown_cell(text: object) -> str:
    """Render ``text`` so it cannot break out of a Markdown table cell."""
    flat = str(text).replace("\r", " ").replace("\n", " ")
    return flat.replace("|", "\\|")


def run_evaluation() -> None:
    """Evaluate every test problem and write JSON and Markdown reports.

    Each problem from ``load_test_problems`` is passed to
    ``evaluate_single`` and timed. Progress, an overall summary and a
    per-topic breakdown are printed, and two files are written into a
    ``results`` directory next to this script: ``eval_<timestamp>.json``
    (full report including every individual result) and
    ``eval_<timestamp>.md`` (summary tables).

    Raises:
        KeyError: If a problem lacks a "question" or "topic" key.
    """
    problems = load_test_problems()
    print(f"Running evaluation on {len(problems)} problems...\n")
    results = []
    topic_stats: dict[str, dict] = {}
    total_start = time.time()
    for i, problem in enumerate(problems, start=1):
        print(f"[{i}/{len(problems)}] {problem['question'][:60]}...")
        t0 = time.time()
        result = evaluate_single(problem)
        result["time_seconds"] = round(time.time() - t0, 1)
        results.append(result)

        stats = topic_stats.setdefault(
            problem["topic"], {"total": 0, "verified": 0, "errors": 0}
        )
        stats["total"] += 1
        if result.get("error"):
            stats["errors"] += 1
        elif result.get("verified_correct"):
            stats["verified"] += 1

        status = "✓" if result.get("verified_correct") else ("✗ ERROR" if result.get("error") else "✗")
        # "or 0" guards against a confidence that is present but None.
        print(f" → {status} | confidence: {result.get('confidence') or 0:.2f} | {result['time_seconds']}s")

    # Summary
    print("\n" + "=" * 60)
    print("EVALUATION SUMMARY")
    print("=" * 60)
    total = len(results)
    verified = sum(1 for r in results if r.get("verified_correct"))
    errors = sum(1 for r in results if r.get("error"))
    avg_confidence = sum(r.get("confidence") or 0 for r in results) / total if total else 0
    # _percent keeps an empty problem set from raising ZeroDivisionError.
    accuracy_pct = _percent(verified, total)
    print(f"\nOverall: {verified}/{total} verified correct ({accuracy_pct:.1f}%)")
    print(f"Errors: {errors}/{total}")
    print(f"Average confidence: {avg_confidence:.2f}")
    print("\nPer-topic breakdown:")
    for topic, stats in sorted(topic_stats.items()):
        acc = _percent(stats["verified"], stats["total"])
        print(f" {topic}: {stats['verified']}/{stats['total']} ({acc:.0f}%) | errors: {stats['errors']}")

    # Save results
    output_dir = Path(__file__).parent / "results"
    output_dir.mkdir(exist_ok=True)
    # One clock reading, so file names and report dates always agree.
    now = datetime.now()
    timestamp = now.strftime("%Y%m%d_%H%M%S")
    output_path = output_dir / f"eval_{timestamp}.json"
    # Measured before the report is built so that it is part of the saved
    # JSON, not only of the in-memory dict.
    total_time = round(time.time() - total_start, 1)
    report = {
        "timestamp": now.isoformat(),
        "total_problems": total,
        "verified_correct": verified,
        "accuracy": verified / total if total else 0,
        "errors": errors,
        "avg_confidence": avg_confidence,
        "per_topic": topic_stats,
        "results": results,
        "total_time_seconds": total_time,
    }
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(report, f, indent=2, default=str)
    avg_time = total_time / total if total else 0.0
    print(f"Total time: {total_time}s ({avg_time:.1f}s avg per problem)")
    print(f"\nFull results saved to: {output_path}")

    # Also generate a markdown report
    md_path = output_dir / f"eval_{timestamp}.md"
    md_lines = [
        "# Evaluation Report",
        f"**Date:** {now.strftime('%Y-%m-%d %H:%M')}",
        f"**Total problems:** {total}",
        f"**Verified correct:** {verified}/{total} ({accuracy_pct:.1f}%)",
        f"**Errors:** {errors}/{total}",
        f"**Avg confidence:** {avg_confidence:.2f}",
        f"**Total time:** {total_time}s",
        "",
        "## Per-topic Breakdown",
        "| Topic | Correct | Total | Accuracy | Errors |",
        "|-------|---------|-------|----------|--------|",
    ]
    for topic, stats in sorted(topic_stats.items()):
        acc = _percent(stats["verified"], stats["total"])
        md_lines.append(f"| {_markdown_cell(topic)} | {stats['verified']} | {stats['total']} | {acc:.0f}% | {stats['errors']} |")
    md_lines.extend(["", "## Individual Results", "| # | Question | Correct | Confidence | Time |", "|---|----------|---------|------------|------|"])
    for idx, r in enumerate(results, start=1):
        # Truncate first, then escape: "|" is common in math text (|x|)
        # and would otherwise split the row into extra columns.
        q = _markdown_cell(r["question"][:50])
        ok = "Yes" if r.get("verified_correct") else ("ERR" if r.get("error") else "No")
        # Fall back to the 1-based position when a problem has no "id".
        label = _markdown_cell(r.get("id", idx))
        md_lines.append(f"| {label} | {q} | {ok} | {r.get('confidence') or 0:.2f} | {r.get('time_seconds', 0)}s |")
    with open(md_path, "w", encoding="utf-8") as f:
        f.write("\n".join(md_lines))
    print(f"Markdown report: {md_path}")
# Script entry point: run the full batch evaluation only when this file is executed directly, not when it is imported.
| if __name__ == "__main__": | |
| run_evaluation() | |