"""

Benchmark Runner - loads test prompts, runs/loads responses, scores them,

and produces detailed evaluation reports.



Supports:

- Loading prompts from JSON files in evaluation/prompts/

- Pre-generated response files (JSON mapping prompt -> response)

- Scoring via ReasoningMetrics

- Per-category and overall reports

- Baseline vs trained model comparison

- CLI interface

"""

from __future__ import annotations

import argparse
import json
import os
import sys
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional

# Allow running from project root or from evaluation/
_THIS_DIR = Path(__file__).resolve().parent
_PROJECT_ROOT = _THIS_DIR.parent
if str(_PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(_PROJECT_ROOT))

from evaluation.reasoning_metrics import ReasoningMetrics
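
# ReasoningMetrics.score_reasoning(response) is assumed to return a flat dict of
# numeric scores; only numeric values are averaged and reported below.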


# ---------------------------------------------------------------------------
# Benchmark Runner
# ---------------------------------------------------------------------------

class BenchmarkRunner:
    """Load prompts, score responses, produce reports."""

    def __init__(
        self,
        prompts_dir: Optional[str] = None,
        metrics: Optional[ReasoningMetrics] = None,
    ):
        self.prompts_dir = Path(prompts_dir) if prompts_dir else _THIS_DIR / "prompts"
        self.metrics = metrics or ReasoningMetrics()
        self._prompts: Dict[str, List[str]] = {}
        self._counterexamples: List[Dict[str, str]] = []

    # -- loading -----------------------------------------------------------

    def load_prompts(self, filename: str = "reasoning_tests.json") -> Dict[str, List[str]]:
        """Load categorised prompts from a JSON file.



        Expected format: {"category": ["prompt1", "prompt2", ...], ...}

        """
        path = self.prompts_dir / filename
        if not path.exists():
            raise FileNotFoundError(f"Prompt file not found: {path}")
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        self._prompts = data
        return data

    def load_counterexamples(self, filename: str = "counterexample_tests.json") -> List[Dict[str, str]]:
        """Load counterexample test prompts."""
        path = self.prompts_dir / filename
        if not path.exists():
            raise FileNotFoundError(f"Counterexample file not found: {path}")
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        self._counterexamples = data
        return data

    def load_responses(self, filepath: str) -> Dict[str, str]:
        """Load pre-generated responses from a JSON file.



        Expected format: {"prompt_text": "response_text", ...}

        """
        with open(filepath, "r", encoding="utf-8") as f:
            return json.load(f)

    # -- scoring -----------------------------------------------------------

    def score_responses(
        self,
        responses: Dict[str, str],
    ) -> Dict[str, Any]:
        """Score all responses and organise results by category.



        Args:

            responses: mapping of prompt text -> response text



        Returns:

            Dict with per-prompt scores, per-category averages, and overall.

        """
        if not self._prompts:
            self.load_prompts()

        results: Dict[str, Any] = {
            "timestamp": datetime.utcnow().isoformat(),
            "total_prompts": 0,
            "scored_prompts": 0,
            "missing_responses": 0,
            "categories": {},
            "all_scores": [],
        }

        for category, prompts in self._prompts.items():
            cat_scores: List[Dict[str, Any]] = []
            for prompt in prompts:
                results["total_prompts"] += 1
                response = responses.get(prompt)
                if response is None:
                    results["missing_responses"] += 1
                    continue
                scores = self.metrics.score_reasoning(response)
                results["scored_prompts"] += 1
                entry = {"prompt": prompt, "scores": scores}
                cat_scores.append(entry)
                results["all_scores"].append(entry)

            # Category averages
            if cat_scores:
                avg = self._average_scores([e["scores"] for e in cat_scores])
            else:
                avg = {}
            results["categories"][category] = {
                "prompts_scored": len(cat_scores),
                "average_scores": avg,
                "details": cat_scores,
            }

        # Overall averages
        if results["all_scores"]:
            results["overall"] = self._average_scores(
                [e["scores"] for e in results["all_scores"]]
            )
        else:
            results["overall"] = {}

        return results

    def score_counterexamples(
        self,
        responses: Dict[str, str],
    ) -> Dict[str, Any]:
        """Score counterexample responses (should identify wrong reasoning)."""
        if not self._counterexamples:
            self.load_counterexamples()

        results = []
        refutations = 0
        total = 0

        refutation_markers = [
            "not true", "incorrect", "misconception", "actually",
            "contrary", "doesn't", "does not", "false", "myth",
            "wrong", "mistake", "no,", "in fact", "however",
            "this is a common", "oversimplification", "nuanced",
            "not necessarily", "depends on", "more complex",
        ]

        for item in self._counterexamples:
            prompt = item["prompt"]
            expected = item.get("expected", "refutation")
            response = responses.get(prompt, "")
            total += 1

            if not response:
                results.append({
                    "prompt": prompt,
                    "expected": expected,
                    "responded": False,
                    "contains_refutation": False,
                })
                continue

            resp_lower = response.lower()
            found_refutation = any(m in resp_lower for m in refutation_markers)
            if found_refutation and expected == "refutation":
                refutations += 1

            scores = self.metrics.score_reasoning(response)
            results.append({
                "prompt": prompt,
                "expected": expected,
                "responded": True,
                "contains_refutation": found_refutation,
                "scores": scores,
            })

        return {
            "total": total,
            "refutation_rate": round(refutations / max(total, 1), 4),
            "details": results,
        }

    # -- comparison --------------------------------------------------------

    def compare_models(
        self,
        baseline_responses: Dict[str, str],
        trained_responses: Dict[str, str],
    ) -> Dict[str, Any]:
        """Compare baseline vs trained model responses."""
        baseline_results = self.score_responses(baseline_responses)
        trained_results = self.score_responses(trained_responses)

        comparison: Dict[str, Any] = {
            "timestamp": datetime.utcnow().isoformat(),
            "baseline_overall": baseline_results.get("overall", {}),
            "trained_overall": trained_results.get("overall", {}),
            "category_comparison": {},
            "improvements": {},
            "regressions": {},
        }

        # Per-category delta
        for cat in baseline_results["categories"]:
            b_avg = baseline_results["categories"][cat]["average_scores"]
            t_avg = trained_results["categories"].get(cat, {}).get("average_scores", {})
            delta = {}
            for k in b_avg:
                if k in t_avg and isinstance(b_avg[k], (int, float)):
                    delta[k] = round(t_avg[k] - b_avg[k], 4)
            comparison["category_comparison"][cat] = {
                "baseline": b_avg,
                "trained": t_avg,
                "delta": delta,
            }

        # Overall delta
        b_ov = comparison["baseline_overall"]
        t_ov = comparison["trained_overall"]
        for k in b_ov:
            if k in t_ov and isinstance(b_ov[k], (int, float)):
                d = round(t_ov[k] - b_ov[k], 4)
                if d > 0.01:
                    comparison["improvements"][k] = d
                elif d < -0.01:
                    comparison["regressions"][k] = d

        return comparison

    # -- report ------------------------------------------------------------

    def format_report(self, results: Dict[str, Any]) -> str:
        """Format evaluation results as a readable text report."""
        lines: List[str] = []
        lines.append("=" * 70)
        lines.append("  CODETTE BENCHMARK EVALUATION REPORT")
        lines.append("=" * 70)
        lines.append(f"  Timestamp:  {results.get('timestamp', 'N/A')}")
        lines.append(f"  Prompts:    {results.get('scored_prompts', 0)} scored / "
                      f"{results.get('total_prompts', 0)} total")
        if results.get("missing_responses"):
            lines.append(f"  Missing:    {results['missing_responses']} responses not found")
        lines.append("")

        # Overall
        overall = results.get("overall", {})
        if overall:
            lines.append("-" * 70)
            lines.append("  OVERALL SCORES")
            lines.append("-" * 70)
            for k, v in sorted(overall.items()):
                if isinstance(v, float):
                    bar = self._bar(v)
                    lines.append(f"    {k:<22s} {v:.4f}  {bar}")
            lines.append("")

        # Per-category
        for cat, data in results.get("categories", {}).items():
            avg = data.get("average_scores", {})
            if not avg:
                continue
            lines.append("-" * 70)
            lines.append(f"  CATEGORY: {cat.upper()}")
            lines.append(f"  Prompts scored: {data.get('prompts_scored', 0)}")
            lines.append("-" * 70)
            for k, v in sorted(avg.items()):
                if isinstance(v, float):
                    bar = self._bar(v)
                    lines.append(f"    {k:<22s} {v:.4f}  {bar}")
            lines.append("")

        lines.append("=" * 70)
        return "\n".join(lines)

    def format_comparison_report(self, comparison: Dict[str, Any]) -> str:
        """Format a comparison report between baseline and trained model."""
        lines: List[str] = []
        lines.append("=" * 70)
        lines.append("  MODEL COMPARISON REPORT")
        lines.append("=" * 70)
        lines.append(f"  Timestamp: {comparison.get('timestamp', 'N/A')}")
        lines.append("")

        # Overall
        lines.append("-" * 70)
        lines.append("  OVERALL SCORES  (baseline -> trained [delta])")
        lines.append("-" * 70)
        b = comparison.get("baseline_overall", {})
        t = comparison.get("trained_overall", {})
        for k in sorted(set(list(b.keys()) + list(t.keys()))):
            bv = b.get(k, 0)
            tv = t.get(k, 0)
            if not isinstance(bv, (int, float)) or not isinstance(tv, (int, float)):
                continue
            d = tv - bv
            sign = "+" if d >= 0 else ""
            lines.append(f"    {k:<22s} {bv:.4f} -> {tv:.4f}  [{sign}{d:.4f}]")

        # Improvements / regressions
        imp = comparison.get("improvements", {})
        reg = comparison.get("regressions", {})
        if imp:
            lines.append("")
            lines.append("  IMPROVEMENTS:")
            for k, v in sorted(imp.items(), key=lambda x: -x[1]):
                lines.append(f"    + {k}: +{v:.4f}")
        if reg:
            lines.append("")
            lines.append("  REGRESSIONS:")
            for k, v in sorted(reg.items(), key=lambda x: x[1]):
                lines.append(f"    - {k}: {v:.4f}")

        # Per-category
        lines.append("")
        for cat, data in comparison.get("category_comparison", {}).items():
            delta = data.get("delta", {})
            if not delta:
                continue
            overall_d = delta.get("overall", 0)
            sign = "+" if overall_d >= 0 else ""
            lines.append(f"  {cat:<18s}  overall delta: {sign}{overall_d:.4f}")

        lines.append("")
        lines.append("=" * 70)
        return "\n".join(lines)

    # -- helpers -----------------------------------------------------------

    @staticmethod
    def _average_scores(score_list: List[Dict[str, float]]) -> Dict[str, float]:
        """Average numeric values across a list of score dicts."""
        if not score_list:
            return {}
        totals: Dict[str, float] = {}
        counts: Dict[str, int] = {}
        for s in score_list:
            for k, v in s.items():
                if isinstance(v, (int, float)):
                    totals[k] = totals.get(k, 0.0) + v
                    counts[k] = counts.get(k, 0) + 1
        return {k: round(totals[k] / counts[k], 4) for k in sorted(totals)}

    @staticmethod
    def _bar(value: float, width: int = 20) -> str:
        """ASCII progress bar (value clamped to [0, 1])."""
        filled = int(max(0.0, min(1.0, value)) * width)
        return "[" + "#" * filled + "." * (width - filled) + "]"

    # -- save / load results -----------------------------------------------

    def save_results(self, results: Dict[str, Any], filepath: str) -> None:
        """Save evaluation results to JSON."""
        # Ensure the output directory exists; json.dump's default=str handles
        # non-serialisable values (e.g. datetime objects).
        os.makedirs(os.path.dirname(filepath) or ".", exist_ok=True)
        with open(filepath, "w", encoding="utf-8") as f:
            json.dump(results, f, indent=2, default=str)

    @staticmethod
    def load_results(filepath: str) -> Dict[str, Any]:
        """Load evaluation results from JSON."""
        with open(filepath, "r", encoding="utf-8") as f:
            return json.load(f)
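

# Minimal programmatic usage sketch (the response-file path below is a placeholder):
#
#   runner = BenchmarkRunner()
#   runner.load_prompts()                # evaluation/prompts/reasoning_tests.json by default
#   responses = runner.load_responses("results/model_responses.json")
#   report = runner.score_responses(responses)
#   print(runner.format_report(report))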


# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
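#
# Example invocation (the module filename and file paths here are illustrative
# assumptions, not fixed by this project):
#
#   python benchmark_runner.py \
#       --responses results/model_responses.json \
#       --baseline results/baseline_responses.json \
#       --counterexamples \
#       --output results/evaluation_report.json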

def main() -> None:
    parser = argparse.ArgumentParser(
        description="Codette Benchmark Runner - evaluate model reasoning quality"
    )
    parser.add_argument(
        "--responses", "-r",
        required=True,
        help="Path to JSON file with pre-generated responses (prompt -> response)",
    )
    parser.add_argument(
        "--prompts-dir", "-p",
        default=None,
        help="Directory containing prompt JSON files (default: evaluation/prompts/)",
    )
    parser.add_argument(
        "--baseline", "-b",
        default=None,
        help="Path to baseline responses JSON for comparison",
    )
    parser.add_argument(
        "--output", "-o",
        default=None,
        help="Save results to this JSON file",
    )
    parser.add_argument(
        "--counterexamples", "-c",
        action="store_true",
        help="Also run counterexample tests",
    )
    parser.add_argument(
        "--prompts-file",
        default="reasoning_tests.json",
        help="Prompt file name inside prompts dir (default: reasoning_tests.json)",
    )

    args = parser.parse_args()

    runner = BenchmarkRunner(prompts_dir=args.prompts_dir)
    runner.load_prompts(args.prompts_file)

    print(f"Loading responses from: {args.responses}")
    responses = runner.load_responses(args.responses)
    print(f"  Loaded {len(responses)} responses")

    # Score
    print("\nScoring responses...")
    results = runner.score_responses(responses)
    print(runner.format_report(results))

    # Counterexamples
    if args.counterexamples:
        print("\nRunning counterexample tests...")
        runner.load_counterexamples()
        ce_results = runner.score_counterexamples(responses)
        print(f"  Refutation detection rate: {ce_results['refutation_rate']:.2%}")
        results["counterexamples"] = ce_results

    # Comparison
    if args.baseline:
        print(f"\nLoading baseline from: {args.baseline}")
        baseline = runner.load_responses(args.baseline)
        comparison = runner.compare_models(baseline, responses)
        print(runner.format_comparison_report(comparison))
        results["comparison"] = comparison

    # Save
    if args.output:
        runner.save_results(results, args.output)
        print(f"\nResults saved to: {args.output}")


if __name__ == "__main__":
    main()