# Source: Amit-kr26 — "Initial commit: Multimodal Math Mentor" (commit 3c25c17)
"""Batch evaluation script for the Math Mentor pipeline."""
from __future__ import annotations
import json
import os
import sys
import time
from datetime import datetime
from pathlib import Path
# Add project root to path
sys.path.insert(0, str(Path(__file__).parent.parent))
from ui.callbacks import run_pipeline, new_thread_id
def load_test_problems() -> list[dict]:
    """Load the test-problem fixtures stored next to this script.

    Returns the parsed contents of ``test_problems.json``: a list of problem
    dicts (downstream code reads the "id", "question" and "topic" keys).
    """
    path = Path(__file__).parent / "test_problems.json"
    # json.load streams from the file object directly instead of reading the
    # whole file into a string first; explicit encoding avoids locale surprises.
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)
def evaluate_single(problem: dict) -> dict:
    """Run a single problem through the pipeline and collect results.

    Returns the input problem dict merged with the pipeline's answer, topic,
    confidence and verification fields. On any failure the same record shape
    is returned with "error" set and falsy result fields.
    """

    def _failure(message: str) -> dict:
        # Single shared shape for both pipeline-reported and unexpected errors.
        # Uses "verified_correct" (the original wrote "correct" here, a key no
        # consumer reads) so error records carry the same key the summary code
        # checks via result.get("verified_correct").
        return {
            **problem,
            "actual_answer": "",
            "actual_topic": "",
            "confidence": 0,
            "verification_confidence": 0,
            "error": message,
            "verified_correct": False,
        }

    thread_id = new_thread_id()
    question = problem["question"]
    final_state: dict = {}
    try:
        for update in run_pipeline(
            input_text=question,
            input_image=None,
            input_audio=None,
            input_mode="Text",
            thread_id=thread_id,
            chat_history=[],
        ):
            if update["node"] == "error":
                return _failure(update["output"].get("error", "Unknown"))
            # Merge each node's partial output into the accumulated state.
            final_state.update(update["output"])
    except Exception as e:
        # Boundary handler: any pipeline crash becomes an error record rather
        # than aborting the whole batch run.
        return _failure(str(e))
    verification = final_state.get("verification_result", {})
    return {
        **problem,
        "actual_answer": final_state.get("solution", ""),
        "actual_topic": final_state.get("problem_topic", ""),
        "confidence": final_state.get("final_confidence", 0),
        "verified_correct": verification.get("is_correct", False),
        "verification_confidence": verification.get("confidence", 0),
        "error": None,
    }
def _print_summary(results: list[dict], topic_stats: dict[str, dict]) -> tuple[int, int, int, float]:
    """Print the overall and per-topic summary; return (total, verified, errors, avg_confidence)."""
    print("\n" + "=" * 60)
    print("EVALUATION SUMMARY")
    print("=" * 60)
    total = len(results)
    verified = sum(1 for r in results if r.get("verified_correct"))
    errors = sum(1 for r in results if r.get("error"))
    # All ratios are guarded so an empty problem set prints zeros instead of
    # raising ZeroDivisionError.
    avg_confidence = sum(r.get("confidence", 0) for r in results) / total if total else 0
    pct = verified / total * 100 if total else 0.0
    print(f"\nOverall: {verified}/{total} verified correct ({pct:.1f}%)")
    print(f"Errors: {errors}/{total}")
    print(f"Average confidence: {avg_confidence:.2f}")
    print("\nPer-topic breakdown:")
    for topic, stats in sorted(topic_stats.items()):
        acc = stats["verified"] / stats["total"] * 100 if stats["total"] else 0
        print(f" {topic}: {stats['verified']}/{stats['total']} ({acc:.0f}%) | errors: {stats['errors']}")
    return total, verified, errors, avg_confidence


def _write_markdown_report(
    md_path: Path,
    results: list[dict],
    topic_stats: dict[str, dict],
    total: int,
    verified: int,
    errors: int,
    avg_confidence: float,
    total_time: float,
) -> None:
    """Render the evaluation results as a markdown table report at *md_path*."""
    pct = verified / total * 100 if total else 0.0
    md_lines = [
        "# Evaluation Report",
        f"**Date:** {datetime.now().strftime('%Y-%m-%d %H:%M')}",
        f"**Total problems:** {total}",
        f"**Verified correct:** {verified}/{total} ({pct:.1f}%)",
        f"**Errors:** {errors}/{total}",
        f"**Avg confidence:** {avg_confidence:.2f}",
        f"**Total time:** {total_time}s",
        "",
        "## Per-topic Breakdown",
        "| Topic | Correct | Total | Accuracy | Errors |",
        "|-------|---------|-------|----------|--------|",
    ]
    for topic, stats in sorted(topic_stats.items()):
        acc = stats["verified"] / stats["total"] * 100 if stats["total"] else 0
        md_lines.append(
            f"| {topic} | {stats['verified']} | {stats['total']} | {acc:.0f}% | {stats['errors']} |"
        )
    md_lines.extend([
        "",
        "## Individual Results",
        "| # | Question | Correct | Confidence | Time |",
        "|---|----------|---------|------------|------|",
    ])
    for r in results:
        q = r["question"][:50]
        ok = "Yes" if r.get("verified_correct") else ("ERR" if r.get("error") else "No")
        md_lines.append(
            f"| {r['id']} | {q} | {ok} | {r.get('confidence', 0):.2f} | {r.get('time_seconds', 0)}s |"
        )
    # utf-8 keeps the report portable regardless of the platform's locale.
    md_path.write_text("\n".join(md_lines), encoding="utf-8")


def run_evaluation():
    """Run every test problem through the pipeline, print progress and a summary,
    and save both a JSON and a markdown report under ``results/``."""
    problems = load_test_problems()
    print(f"Running evaluation on {len(problems)} problems...\n")
    results: list[dict] = []
    topic_stats: dict[str, dict] = {}
    total_start = time.time()
    for i, problem in enumerate(problems):
        print(f"[{i+1}/{len(problems)}] {problem['question'][:60]}...")
        t0 = time.time()
        result = evaluate_single(problem)
        result["time_seconds"] = round(time.time() - t0, 1)
        results.append(result)
        stats = topic_stats.setdefault(problem["topic"], {"total": 0, "verified": 0, "errors": 0})
        stats["total"] += 1
        if result.get("error"):
            stats["errors"] += 1
        elif result.get("verified_correct"):
            stats["verified"] += 1
        status = "✓" if result.get("verified_correct") else ("✗ ERROR" if result.get("error") else "✗")
        print(f" → {status} | confidence: {result.get('confidence', 0):.2f} | {result['time_seconds']}s")
    # Compute the total run time BEFORE building the report: previously the
    # "total_time_seconds" key was assigned after json.dump, so it was printed
    # but never persisted in the JSON file.
    total_time = round(time.time() - total_start, 1)
    total, verified, errors, avg_confidence = _print_summary(results, topic_stats)
    # Save results
    output_dir = Path(__file__).parent / "results"
    output_dir.mkdir(exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_path = output_dir / f"eval_{timestamp}.json"
    report = {
        "timestamp": datetime.now().isoformat(),
        "total_problems": total,
        "verified_correct": verified,
        "accuracy": verified / total if total else 0,
        "errors": errors,
        "avg_confidence": avg_confidence,
        "total_time_seconds": total_time,
        "per_topic": topic_stats,
        "results": results,
    }
    with open(output_path, "w") as f:
        # default=str stringifies any non-JSON-serializable values left in the
        # pipeline state (deliberate: best-effort persistence over a crash).
        json.dump(report, f, indent=2, default=str)
    avg_time = total_time / total if total else 0.0  # guard: empty problem set
    print(f"Total time: {total_time}s ({avg_time:.1f}s avg per problem)")
    print(f"\nFull results saved to: {output_path}")
    # Also generate a markdown report
    md_path = output_dir / f"eval_{timestamp}.md"
    _write_markdown_report(md_path, results, topic_stats, total, verified, errors, avg_confidence, total_time)
    print(f"Markdown report: {md_path}")
# Script entry point: run the full evaluation when executed directly.
if __name__ == "__main__":
    run_evaluation()