"""Batch evaluation script for the Math Mentor pipeline."""

from __future__ import annotations

import json
import sys
import time
from datetime import datetime
from pathlib import Path

# Add project root to path
sys.path.insert(0, str(Path(__file__).parent.parent))

from ui.callbacks import run_pipeline, new_thread_id


def load_test_problems() -> list[dict]:
    """Load the evaluation problems from test_problems.json next to this script."""
    path = Path(__file__).parent / "test_problems.json"
    with open(path, "r") as f:
        return json.load(f)
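
# test_problems.json is expected to hold a list of problem dicts. A minimal
# sketch of the shape this script relies on (only "id", "question" and "topic"
# are read directly; any other keys, e.g. an expected answer, are carried
# through unchanged into the per-problem results):
#
#   [
#     {"id": 1, "question": "Solve 2x + 3 = 11 for x.", "topic": "algebra"},
#     ...
#   ]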


def evaluate_single(problem: dict) -> dict:
    """Run a single problem through the pipeline and collect results."""
    thread_id = new_thread_id()
    question = problem["question"]

    final_state = {}
    try:
        for update in run_pipeline(
            input_text=question,
            input_image=None,
            input_audio=None,
            input_mode="Text",
            thread_id=thread_id,
            chat_history=[],
        ):
            node = update["node"]
            output = update["output"]
            if node != "error":
                for k, v in output.items():
                    final_state[k] = v
            else:
                return {
                    **problem,
                    "actual_answer": "",
                    "actual_topic": "",
                    "confidence": 0,
                    "error": output.get("error", "Unknown"),
                    "correct": False,
                }
    except Exception as e:
        return {
            **problem,
            "actual_answer": "",
            "actual_topic": "",
            "confidence": 0,
            "error": str(e),
            "correct": False,
        }

    solution = final_state.get("solution", "")
    topic = final_state.get("problem_topic", "")
    confidence = final_state.get("final_confidence", 0)
    verification = final_state.get("verification_result", {})

    return {
        **problem,
        "actual_answer": solution,
        "actual_topic": topic,
        "confidence": confidence,
        "verified_correct": verification.get("is_correct", False),
        "verification_confidence": verification.get("confidence", 0),
        "error": None,
    }


def run_evaluation():
    problems = load_test_problems()
    if not problems:
        print("No test problems found; nothing to evaluate.")
        return

    print(f"Running evaluation on {len(problems)} problems...\n")

    results = []
    topic_stats: dict[str, dict] = {}

    total_start = time.time()

    for i, problem in enumerate(problems):
        print(f"[{i+1}/{len(problems)}] {problem['question'][:60]}...")
        t0 = time.time()
        result = evaluate_single(problem)
        result["time_seconds"] = round(time.time() - t0, 1)
        results.append(result)

        topic = problem["topic"]
        if topic not in topic_stats:
            topic_stats[topic] = {"total": 0, "verified": 0, "errors": 0}
        topic_stats[topic]["total"] += 1
        if result.get("error"):
            topic_stats[topic]["errors"] += 1
        elif result.get("verified_correct"):
            topic_stats[topic]["verified"] += 1

        status = "✓" if result.get("verified_correct") else ("✗ ERROR" if result.get("error") else "✗")
        print(f"  → {status} | confidence: {result.get('confidence', 0):.2f} | {result['time_seconds']}s")

    # Summary
    print("\n" + "=" * 60)
    print("EVALUATION SUMMARY")
    print("=" * 60)

    total = len(results)
    verified = sum(1 for r in results if r.get("verified_correct"))
    errors = sum(1 for r in results if r.get("error"))
    avg_confidence = sum(r.get("confidence", 0) for r in results) / total if total else 0

    print(f"\nOverall: {verified}/{total} verified correct ({verified/total*100:.1f}%)")
    print(f"Errors: {errors}/{total}")
    print(f"Average confidence: {avg_confidence:.2f}")

    print("\nPer-topic breakdown:")
    for topic, stats in sorted(topic_stats.items()):
        acc = stats["verified"] / stats["total"] * 100 if stats["total"] else 0
        print(f"  {topic}: {stats['verified']}/{stats['total']} ({acc:.0f}%) | errors: {stats['errors']}")

    # Save results
    output_dir = Path(__file__).parent / "results"
    output_dir.mkdir(exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_path = output_dir / f"eval_{timestamp}.json"

    # Compute the total runtime before building the report so it ends up in the saved JSON.
    total_time = round(time.time() - total_start, 1)

    report = {
        "timestamp": datetime.now().isoformat(),
        "total_problems": total,
        "verified_correct": verified,
        "accuracy": verified / total if total else 0,
        "errors": errors,
        "avg_confidence": avg_confidence,
        "total_time_seconds": total_time,
        "per_topic": topic_stats,
        "results": results,
    }

    with open(output_path, "w") as f:
        json.dump(report, f, indent=2, default=str)

    print(f"Total time: {total_time}s ({total_time/total:.1f}s avg per problem)")
    print(f"\nFull results saved to: {output_path}")

    # Also generate a markdown report
    md_path = output_dir / f"eval_{timestamp}.md"
    md_lines = [
        "# Evaluation Report",
        "",
        f"- **Date:** {datetime.now().strftime('%Y-%m-%d %H:%M')}",
        f"- **Total problems:** {total}",
        f"- **Verified correct:** {verified}/{total} ({verified/total*100:.1f}%)",
        f"- **Errors:** {errors}/{total}",
        f"- **Avg confidence:** {avg_confidence:.2f}",
        f"- **Total time:** {total_time}s",
        "",
        "## Per-topic Breakdown",
        "",
        "| Topic | Correct | Total | Accuracy | Errors |",
        "|-------|---------|-------|----------|--------|",
    ]
    for topic, stats in sorted(topic_stats.items()):
        acc = stats["verified"] / stats["total"] * 100 if stats["total"] else 0
        md_lines.append(f"| {topic} | {stats['verified']} | {stats['total']} | {acc:.0f}% | {stats['errors']} |")

    md_lines.extend([
        "",
        "## Individual Results",
        "",
        "| # | Question | Correct | Confidence | Time |",
        "|---|----------|---------|------------|------|",
    ])
    for r in results:
        q = r["question"][:50]
        ok = "Yes" if r.get("verified_correct") else ("ERR" if r.get("error") else "No")
        md_lines.append(f"| {r['id']} | {q} | {ok} | {r.get('confidence', 0):.2f} | {r.get('time_seconds', 0)}s |")

    with open(md_path, "w") as f:
        f.write("\n".join(md_lines))
    print(f"Markdown report: {md_path}")


if __name__ == "__main__":
    run_evaluation()