Spaces:

Amit-kr26
/

Multimodal_Math_Mentor

Sleeping

App Files Files Community

Multimodal_Math_Mentor / eval /run_eval.py

Amit-kr26

Initial commit: Multimodal Math Mentor

3c25c17 26 days ago

raw

history blame contribute delete

6.37 kB

	"""Batch evaluation script for the Math Mentor pipeline."""

	from __future__ import annotations

	import json
	import os
	import sys
	import time
	from datetime import datetime
	from pathlib import Path

	# Add project root to path
	sys.path.insert(0, str(Path(__file__).parent.parent))

	from ui.callbacks import run_pipeline, new_thread_id


	def load_test_problems() -> list[dict]:
	    """Load the evaluation problems stored next to this script.

	    Reads ``test_problems.json`` from the directory that contains this file.
	    The file is decoded as UTF-8 explicitly so that problems containing
	    mathematical symbols load the same way regardless of the platform's
	    default locale encoding.

	    Returns:
	        The parsed JSON document (annotated as a list of problem dicts).

	    Raises:
	        FileNotFoundError: If ``test_problems.json`` does not exist.
	        json.JSONDecodeError: If the file does not contain valid JSON.
	    """
	    path = Path(__file__).parent / "test_problems.json"
	    with open(path, "r", encoding="utf-8") as f:
	        return json.load(f)


	def _failed_result(problem: dict, error: object) -> dict:
	    """Build the result record for a problem the pipeline could not finish."""
	    return {
	        **problem,
	        "actual_answer": "",
	        "actual_topic": "",
	        "confidence": 0,
	        # Same verification keys as a successful record, so consumers can
	        # read every result uniformly.
	        "verified_correct": False,
	        "verification_confidence": 0,
	        # Failures are detected by the truthiness of "error", so a falsy
	        # value (None, "") must never be stored here.
	        "error": error or "Unknown",
	        "correct": False,
	    }


	def evaluate_single(problem: dict) -> dict:
	    """Run a single problem through the pipeline and collect results.

	    The problem's ``"question"`` is sent through ``run_pipeline`` in text
	    mode on a fresh thread id. The ``"output"`` mapping of every non-error
	    update is merged into one final state; later updates overwrite earlier
	    keys.

	    Args:
	        problem: Test problem record; must contain a ``"question"`` key.

	    Returns:
	        A copy of ``problem`` extended with ``actual_answer``,
	        ``actual_topic``, ``confidence``, ``verified_correct``,
	        ``verification_confidence`` and ``error``. ``error`` is ``None`` on
	        success. When the pipeline yields an ``"error"`` node or raises,
	        ``error`` is a non-empty value, the other fields hold empty
	        defaults, and the record additionally carries ``"correct": False``.

	    Raises:
	        KeyError: If ``problem`` has no ``"question"`` key.
	    """
	    thread_id = new_thread_id()
	    question = problem["question"]

	    final_state: dict = {}
	    try:
	        for update in run_pipeline(
	            input_text=question,
	            input_image=None,
	            input_audio=None,
	            input_mode="Text",
	            thread_id=thread_id,
	            chat_history=[],
	        ):
	            output = update["output"]
	            if update["node"] == "error":
	                return _failed_result(problem, output.get("error"))
	            final_state.update(output)
	    except Exception as e:
	        # Some exceptions stringify to "", which would look like "no error"
	        # to callers; fall back to the exception class name.
	        return _failed_result(problem, str(e) or type(e).__name__)

	    # The key may be present but set to None, so ``.get(key, {})`` alone
	    # is not enough to guarantee a mapping.
	    verification = final_state.get("verification_result") or {}

	    return {
	        **problem,
	        "actual_answer": final_state.get("solution", ""),
	        "actual_topic": final_state.get("problem_topic", ""),
	        # Normalise a None confidence to 0 so it can be averaged/formatted.
	        "confidence": final_state.get("final_confidence") or 0,
	        "verified_correct": verification.get("is_correct", False),
	        "verification_confidence": verification.get("confidence", 0),
	        "error": None,
	    }


	def _percent(part: float, whole: float) -> float:
	    """Return ``part / whole`` as a percentage, or 0.0 when ``whole`` is zero."""
	    return part / whole * 100 if whole else 0.0


	def _confidence_value(result: dict) -> float:
	    """Return the result's confidence as a float, or 0.0 if absent or non-numeric."""
	    try:
	        return float(result.get("confidence") or 0)
	    except (TypeError, ValueError):
	        return 0.0


	def _md_cell(value: object) -> str:
	    """Make ``value`` safe to place inside a single Markdown table cell."""
	    # A raw "|" would start a new column and a line break would end the row.
	    return str(value).replace("|", "\\|").replace("\r", " ").replace("\n", " ")


	def _build_markdown_report(report: dict, date_text: str) -> str:
	    """Render the summary, per-topic table and per-problem table as Markdown."""
	    total = report["total_problems"]
	    verified = report["verified_correct"]
	    lines = [
	        "# Evaluation Report",
	        f"Date: {date_text}",
	        f"Total problems: {total}",
	        f"Verified correct: {verified}/{total} ({_percent(verified, total):.1f}%)",
	        f"Errors: {report['errors']}/{total}",
	        f"Avg confidence: {report['avg_confidence']:.2f}",
	        f"Total time: {report['total_time_seconds']}s",
	        "",
	        "## Per-topic Breakdown",
	        "| Topic | Correct | Total | Accuracy | Errors |",
	        "|-------|---------|-------|----------|--------|",
	    ]
	    for topic, stats in sorted(report["per_topic"].items()):
	        acc = _percent(stats["verified"], stats["total"])
	        lines.append(
	            f"| {_md_cell(topic)} | {stats['verified']} | {stats['total']} "
	            f"| {acc:.0f}% | {stats['errors']} |"
	        )

	    lines.extend([
	        "",
	        "## Individual Results",
	        "| # | Question | Correct | Confidence | Time |",
	        "|---|----------|---------|------------|------|",
	    ])
	    for position, r in enumerate(report["results"], start=1):
	        q = _md_cell(r["question"][:50])
	        if r.get("verified_correct"):
	            ok = "Yes"
	        elif r.get("error"):
	            ok = "ERR"
	        else:
	            ok = "No"
	        lines.append(
	            f"| {r.get('id', position)} | {q} | {ok} "
	            f"| {_confidence_value(r):.2f} | {r.get('time_seconds', 0)}s |"
	        )
	    return "\n".join(lines)


	def run_evaluation():
	    """Evaluate every test problem, print a summary and write report files.

	    Each problem from ``load_test_problems()`` is passed to
	    ``evaluate_single`` and timed. Per-topic and overall counts of verified
	    and errored problems are printed, then written to
	    ``results/eval_<timestamp>.json`` and ``results/eval_<timestamp>.md``
	    next to this script (the ``results`` directory is created if missing).

	    An empty problem set is reported with zero percentages instead of
	    raising ``ZeroDivisionError``.

	    Raises:
	        KeyError: If a problem lacks the ``"question"`` or ``"topic"`` key.
	    """
	    problems = load_test_problems()
	    print(f"Running evaluation on {len(problems)} problems...\n")

	    results = []
	    topic_stats: dict[str, dict] = {}

	    total_start = time.time()

	    for i, problem in enumerate(problems, start=1):
	        print(f"[{i}/{len(problems)}] {problem['question'][:60]}...")
	        t0 = time.time()
	        result = evaluate_single(problem)
	        result["time_seconds"] = round(time.time() - t0, 1)
	        results.append(result)

	        stats = topic_stats.setdefault(
	            problem["topic"], {"total": 0, "verified": 0, "errors": 0}
	        )
	        stats["total"] += 1
	        if result.get("error"):
	            stats["errors"] += 1
	        elif result.get("verified_correct"):
	            stats["verified"] += 1

	        if result.get("verified_correct"):
	            status = "✓"
	        elif result.get("error"):
	            status = "✗ ERROR"
	        else:
	            status = "✗"
	        print(
	            f" → {status} | confidence: {_confidence_value(result):.2f}"
	            f" | {result['time_seconds']}s"
	        )

	    # Summary
	    print("\n" + "=" * 60)
	    print("EVALUATION SUMMARY")
	    print("=" * 60)

	    total = len(results)
	    verified = sum(1 for r in results if r.get("verified_correct"))
	    errors = sum(1 for r in results if r.get("error"))
	    avg_confidence = sum(_confidence_value(r) for r in results) / total if total else 0

	    print(f"\nOverall: {verified}/{total} verified correct ({_percent(verified, total):.1f}%)")
	    print(f"Errors: {errors}/{total}")
	    print(f"Average confidence: {avg_confidence:.2f}")

	    print("\nPer-topic breakdown:")
	    for topic, stats in sorted(topic_stats.items()):
	        acc = _percent(stats["verified"], stats["total"])
	        print(f" {topic}: {stats['verified']}/{stats['total']} ({acc:.0f}%) | errors: {stats['errors']}")

	    total_time = round(time.time() - total_start, 1)
	    avg_time = total_time / total if total else 0
	    print(f"Total time: {total_time}s ({avg_time:.1f}s avg per problem)")

	    # Save results. One clock reading is shared by both file names and both
	    # reports so they cannot disagree across a second/minute boundary.
	    now = datetime.now()
	    output_dir = Path(__file__).parent / "results"
	    output_dir.mkdir(exist_ok=True)
	    timestamp = now.strftime("%Y%m%d_%H%M%S")
	    output_path = output_dir / f"eval_{timestamp}.json"

	    report = {
	        "timestamp": now.isoformat(),
	        "total_problems": total,
	        "verified_correct": verified,
	        "accuracy": verified / total if total else 0,
	        "errors": errors,
	        "avg_confidence": avg_confidence,
	        # Must be part of the dict before json.dump, or it is never saved.
	        "total_time_seconds": total_time,
	        "per_topic": topic_stats,
	        "results": results,
	    }

	    with open(output_path, "w", encoding="utf-8") as f:
	        json.dump(report, f, indent=2, default=str)
	    print(f"\nFull results saved to: {output_path}")

	    # Also generate a markdown report
	    md_path = output_dir / f"eval_{timestamp}.md"
	    md_text = _build_markdown_report(report, now.strftime("%Y-%m-%d %H:%M"))
	    # Question text may contain non-ASCII math symbols, so do not rely on
	    # the locale's default encoding.
	    with open(md_path, "w", encoding="utf-8") as f:
	        f.write(md_text)
	    print(f"Markdown report: {md_path}")


	# Run the batch evaluation only when this file is executed as a script,
	# not when it is imported as a module.
	if __name__ == "__main__":
	    run_evaluation()