# Codette-Reasoning / evaluation / benchmark_runner.py
# Raiff1982's picture
# Upload 120 files
# ed1b365 verified
"""
Benchmark Runner - loads test prompts, runs/loads responses, scores them,
and produces detailed evaluation reports.
Supports:
- Loading prompts from JSON files in evaluation/prompts/
- Pre-generated response files (JSON mapping prompt -> response)
- Scoring via ReasoningMetrics
- Per-category and overall reports
- Baseline vs trained model comparison
- CLI interface
"""
from __future__ import annotations

import argparse
import json
import os
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional
# Allow running from project root or from evaluation/: put the parent of this
# file's directory on sys.path (once) so that the absolute import
# ``evaluation.reasoning_metrics`` below resolves in both cases.
_THIS_DIR = Path(__file__).resolve().parent
_PROJECT_ROOT = _THIS_DIR.parent
if str(_PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(_PROJECT_ROOT))
from evaluation.reasoning_metrics import ReasoningMetrics
# ---------------------------------------------------------------------------
# Benchmark Runner
# ---------------------------------------------------------------------------
class BenchmarkRunner:
    """Load prompts, score responses, produce reports.

    Prompts (and, optionally, counterexample prompts) are read from JSON files
    under ``prompts_dir``.  Responses are supplied as a mapping of prompt text
    to response text and scored with ``metrics.score_reasoning``; the resulting
    score dicts are averaged per category and overall, and can be rendered as
    plain-text reports or saved as JSON.
    """

    # Substrings looked for (case-insensitively) in a counterexample response.
    # Finding any one of them is treated as "the response contains a
    # refutation".  This is plain substring matching, so short markers such as
    # "no," can also match inside longer words.
    _REFUTATION_MARKERS = (
        "not true", "incorrect", "misconception", "actually",
        "contrary", "doesn't", "does not", "false", "myth",
        "wrong", "mistake", "no,", "in fact", "however",
        "this is a common", "oversimplification", "nuanced",
        "not necessarily", "depends on", "more complex",
    )

    # Overall deltas whose magnitude does not exceed this value are listed as
    # neither an improvement nor a regression by compare_models().
    _DELTA_THRESHOLD = 0.01

    def __init__(
        self,
        prompts_dir: Optional[str] = None,
        metrics: Optional["ReasoningMetrics"] = None,
    ):
        """Create a runner.

        Args:
            prompts_dir: directory holding the prompt JSON files; defaults to
                the ``prompts`` directory next to this module.
            metrics: scorer exposing ``score_reasoning(response)``; a default
                ``ReasoningMetrics()`` is created when omitted.
        """
        self.prompts_dir = Path(prompts_dir) if prompts_dir else _THIS_DIR / "prompts"
        # Compare against None explicitly so a caller-supplied scorer is never
        # discarded just because it happens to evaluate as falsy.
        self.metrics = metrics if metrics is not None else ReasoningMetrics()
        self._prompts: Dict[str, List[str]] = {}
        self._counterexamples: List[Dict[str, str]] = []

    # -- loading -----------------------------------------------------------

    @staticmethod
    def _read_json(path: Any) -> Any:
        """Parse and return the JSON document stored at ``path`` (UTF-8)."""
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)

    def load_prompts(self, filename: str = "reasoning_tests.json") -> Dict[str, List[str]]:
        """Load categorised prompts from a JSON file.

        Expected format: {"category": ["prompt1", "prompt2", ...], ...}

        Args:
            filename: file name inside ``self.prompts_dir``.

        Returns:
            The parsed data, which is also cached on the runner.

        Raises:
            FileNotFoundError: if the file does not exist.
        """
        path = self.prompts_dir / filename
        if not path.exists():
            raise FileNotFoundError(f"Prompt file not found: {path}")
        data = self._read_json(path)
        self._prompts = data
        return data

    def load_counterexamples(self, filename: str = "counterexample_tests.json") -> List[Dict[str, str]]:
        """Load counterexample test prompts.

        Args:
            filename: file name inside ``self.prompts_dir``.

        Returns:
            The parsed data, which is also cached on the runner.

        Raises:
            FileNotFoundError: if the file does not exist.
        """
        path = self.prompts_dir / filename
        if not path.exists():
            raise FileNotFoundError(f"Counterexample file not found: {path}")
        data = self._read_json(path)
        self._counterexamples = data
        return data

    def load_responses(self, filepath: str) -> Dict[str, str]:
        """Load pre-generated responses from a JSON file.

        Expected format: {"prompt_text": "response_text", ...}
        """
        return self._read_json(filepath)

    # -- scoring -----------------------------------------------------------

    @staticmethod
    def _utc_timestamp() -> str:
        """Return the current UTC time as a naive ISO-8601 string.

        ``datetime.utcnow()`` is deprecated since Python 3.12.  The tzinfo is
        stripped from the aware value so the string keeps the exact format the
        deprecated call produced (no ``+00:00`` suffix).
        """
        return datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

    def score_responses(
        self,
        responses: Dict[str, str],
    ) -> Dict[str, Any]:
        """Score all responses and organise results by category.

        Prompts are loaded from the default prompt file first if none have
        been loaded yet.  A prompt with no entry in ``responses`` is counted
        as missing and not scored.

        Args:
            responses: mapping of prompt text -> response text

        Returns:
            Dict with per-prompt scores, per-category averages, and overall.
        """
        if not self._prompts:
            self.load_prompts()

        results: Dict[str, Any] = {
            "timestamp": self._utc_timestamp(),
            "total_prompts": 0,
            "scored_prompts": 0,
            "missing_responses": 0,
            "categories": {},
            "all_scores": [],
        }

        for category, prompts in self._prompts.items():
            cat_scores: List[Dict[str, Any]] = []
            for prompt in prompts:
                results["total_prompts"] += 1
                response = responses.get(prompt)
                if response is None:
                    results["missing_responses"] += 1
                    continue
                scores = self.metrics.score_reasoning(response)
                results["scored_prompts"] += 1
                entry = {"prompt": prompt, "scores": scores}
                cat_scores.append(entry)
                results["all_scores"].append(entry)

            # Category averages (_average_scores returns {} for no entries).
            results["categories"][category] = {
                "prompts_scored": len(cat_scores),
                "average_scores": self._average_scores(
                    [e["scores"] for e in cat_scores]
                ),
                "details": cat_scores,
            }

        # Overall averages
        results["overall"] = self._average_scores(
            [e["scores"] for e in results["all_scores"]]
        )
        return results

    def score_counterexamples(
        self,
        responses: Dict[str, str],
    ) -> Dict[str, Any]:
        """Score counterexample responses (should identify wrong reasoning).

        Counterexamples are loaded from the default file first if none have
        been loaded yet.  A response counts towards the refutation rate when
        it contains any refutation marker and the item's ``expected`` value
        (default ``"refutation"``) is ``"refutation"``.  The rate is taken
        over all counterexample items, including unanswered ones.

        Args:
            responses: mapping of prompt text -> response text

        Returns:
            Dict with ``total``, ``refutation_rate`` and per-item ``details``.
        """
        if not self._counterexamples:
            self.load_counterexamples()

        details: List[Dict[str, Any]] = []
        refutations = 0
        total = 0

        for item in self._counterexamples:
            prompt = item["prompt"]
            expected = item.get("expected", "refutation")
            response = responses.get(prompt, "")
            total += 1

            if not response:
                # Missing or empty response: recorded, never scored.
                details.append({
                    "prompt": prompt,
                    "expected": expected,
                    "responded": False,
                    "contains_refutation": False,
                })
                continue

            resp_lower = response.lower()
            found_refutation = any(m in resp_lower for m in self._REFUTATION_MARKERS)
            if found_refutation and expected == "refutation":
                refutations += 1

            details.append({
                "prompt": prompt,
                "expected": expected,
                "responded": True,
                "contains_refutation": found_refutation,
                "scores": self.metrics.score_reasoning(response),
            })

        return {
            "total": total,
            # max(..., 1) avoids dividing by zero when there are no items.
            "refutation_rate": round(refutations / max(total, 1), 4),
            "details": details,
        }

    # -- comparison --------------------------------------------------------

    def compare_models(
        self,
        baseline_responses: Dict[str, str],
        trained_responses: Dict[str, str],
    ) -> Dict[str, Any]:
        """Compare baseline vs trained model responses.

        Both response sets are scored with ``score_responses``.  Deltas are
        ``trained - baseline``, computed for every metric present on both
        sides.  Overall deltas above/below the +/-0.01 band are listed under
        ``improvements`` / ``regressions``.

        Args:
            baseline_responses: prompt -> response mapping for the baseline.
            trained_responses: prompt -> response mapping for the trained model.

        Returns:
            Dict with overall scores for both models, per-category
            baseline/trained/delta entries, improvements and regressions.
        """
        baseline_results = self.score_responses(baseline_responses)
        trained_results = self.score_responses(trained_responses)

        comparison: Dict[str, Any] = {
            "timestamp": self._utc_timestamp(),
            "baseline_overall": baseline_results.get("overall", {}),
            "trained_overall": trained_results.get("overall", {}),
            "category_comparison": {},
            "improvements": {},
            "regressions": {},
        }

        # Per-category delta
        for cat, cat_data in baseline_results["categories"].items():
            b_avg = cat_data["average_scores"]
            t_avg = trained_results["categories"].get(cat, {}).get("average_scores", {})
            delta = {
                k: round(t_avg[k] - b_val, 4)
                for k, b_val in b_avg.items()
                if k in t_avg and isinstance(b_val, (int, float))
            }
            comparison["category_comparison"][cat] = {
                "baseline": b_avg,
                "trained": t_avg,
                "delta": delta,
            }

        # Overall delta
        b_ov = comparison["baseline_overall"]
        t_ov = comparison["trained_overall"]
        for k, b_val in b_ov.items():
            if k in t_ov and isinstance(b_val, (int, float)):
                d = round(t_ov[k] - b_val, 4)
                if d > self._DELTA_THRESHOLD:
                    comparison["improvements"][k] = d
                elif d < -self._DELTA_THRESHOLD:
                    comparison["regressions"][k] = d

        return comparison

    # -- report ------------------------------------------------------------

    def _score_lines(self, scores: Dict[str, Any]) -> List[str]:
        """Render one ``name value bar`` line per float score, sorted by name."""
        return [
            f" {k:<22s} {v:.4f} {self._bar(v)}"
            for k, v in sorted(scores.items())
            if isinstance(v, float)
        ]

    def format_report(self, results: Dict[str, Any]) -> str:
        """Format evaluation results as a readable text report.

        Args:
            results: a dict shaped like the return value of ``score_responses``.

        Returns:
            The report as a single newline-joined string.
        """
        rule = "=" * 70
        thin_rule = "-" * 70

        lines: List[str] = [
            rule,
            " CODETTE BENCHMARK EVALUATION REPORT",
            rule,
            f" Timestamp: {results.get('timestamp', 'N/A')}",
            f" Prompts: {results.get('scored_prompts', 0)} scored / "
            f"{results.get('total_prompts', 0)} total",
        ]
        if results.get("missing_responses"):
            lines.append(f" Missing: {results['missing_responses']} responses not found")
        lines.append("")

        # Overall
        overall = results.get("overall", {})
        if overall:
            lines.append(thin_rule)
            lines.append(" OVERALL SCORES")
            lines.append(thin_rule)
            lines.extend(self._score_lines(overall))
            lines.append("")

        # Per-category (categories without any averaged score are omitted)
        for cat, data in results.get("categories", {}).items():
            avg = data.get("average_scores", {})
            if not avg:
                continue
            lines.append(thin_rule)
            lines.append(f" CATEGORY: {cat.upper()}")
            lines.append(f" Prompts scored: {data.get('prompts_scored', 0)}")
            lines.append(thin_rule)
            lines.extend(self._score_lines(avg))
            lines.append("")

        lines.append(rule)
        return "\n".join(lines)

    def format_comparison_report(self, comparison: Dict[str, Any]) -> str:
        """Format a comparison report between baseline and trained model.

        Args:
            comparison: a dict shaped like the return value of
                ``compare_models``.

        Returns:
            The report as a single newline-joined string.
        """
        rule = "=" * 70
        thin_rule = "-" * 70

        lines: List[str] = [
            rule,
            " MODEL COMPARISON REPORT",
            rule,
            f" Timestamp: {comparison.get('timestamp', 'N/A')}",
            "",
            # Overall
            thin_rule,
            " OVERALL SCORES (baseline -> trained [delta])",
            thin_rule,
        ]

        b = comparison.get("baseline_overall", {})
        t = comparison.get("trained_overall", {})
        for k in sorted(set(b) | set(t)):
            # A metric missing on one side is shown as 0 for that side.
            bv = b.get(k, 0)
            tv = t.get(k, 0)
            # Both values must be numeric: the subtraction and the ``.4f``
            # formatting below would raise on anything else.
            if not isinstance(bv, (int, float)) or not isinstance(tv, (int, float)):
                continue
            d = tv - bv
            sign = "+" if d >= 0 else ""
            lines.append(f" {k:<22s} {bv:.4f} -> {tv:.4f} [{sign}{d:.4f}]")

        # Improvements / regressions
        imp = comparison.get("improvements", {})
        reg = comparison.get("regressions", {})
        if imp:
            lines.append("")
            lines.append(" IMPROVEMENTS:")
            # Largest gain first.
            for k, v in sorted(imp.items(), key=lambda x: -x[1]):
                lines.append(f" + {k}: +{v:.4f}")
        if reg:
            lines.append("")
            lines.append(" REGRESSIONS:")
            # Largest drop (most negative) first.
            for k, v in sorted(reg.items(), key=lambda x: x[1]):
                lines.append(f" - {k}: {v:.4f}")

        # Per-category: only the delta of the metric named "overall" is shown
        # (0 when that metric is absent from the category's delta).
        lines.append("")
        for cat, data in comparison.get("category_comparison", {}).items():
            delta = data.get("delta", {})
            if not delta:
                continue
            overall_d = delta.get("overall", 0)
            sign = "+" if overall_d >= 0 else ""
            lines.append(f" {cat:<18s} overall delta: {sign}{overall_d:.4f}")
        lines.append("")

        lines.append(rule)
        return "\n".join(lines)

    # -- helpers -----------------------------------------------------------

    @staticmethod
    def _average_scores(score_list: List[Dict[str, float]]) -> Dict[str, float]:
        """Average numeric values across a list of score dicts.

        Non-numeric values are ignored.  Each key is averaged over the dicts
        in which it holds a numeric value, rounded to 4 decimals; keys are
        returned in sorted order.  An empty input yields ``{}``.
        """
        if not score_list:
            return {}
        totals: Dict[str, float] = {}
        counts: Dict[str, int] = {}
        for s in score_list:
            for k, v in s.items():
                if isinstance(v, (int, float)):
                    totals[k] = totals.get(k, 0.0) + v
                    counts[k] = counts.get(k, 0) + 1
        return {k: round(totals[k] / counts[k], 4) for k in sorted(totals)}

    @staticmethod
    def _bar(value: float, width: int = 20) -> str:
        """ASCII progress bar.

        ``value`` is clamped to [0, 1] (NaN is drawn as 0) so the bar is
        always exactly ``width`` characters wide between the brackets.
        """
        if value != value:  # NaN is the only value not equal to itself
            value = 0.0
        clamped = min(max(value, 0.0), 1.0)
        filled = int(clamped * width)
        return "[" + "#" * filled + "." * (width - filled) + "]"

    # -- save / load results -----------------------------------------------

    def save_results(self, results: Dict[str, Any], filepath: str) -> None:
        """Save evaluation results to JSON.

        The parent directory of ``filepath`` is created if necessary.
        """
        # dirname is "" for a bare file name; fall back to the current dir.
        os.makedirs(os.path.dirname(filepath) or ".", exist_ok=True)
        with open(filepath, "w", encoding="utf-8") as f:
            # default=str: convert non-serialisable values to their str() form.
            json.dump(results, f, indent=2, default=str)

    @staticmethod
    def load_results(filepath: str) -> Dict[str, Any]:
        """Load evaluation results from JSON."""
        with open(filepath, "r", encoding="utf-8") as f:
            return json.load(f)
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
def _run_benchmark(args: argparse.Namespace) -> None:
    """Run the scoring workflow selected by the parsed command-line ``args``.

    Loads prompts and responses, prints the evaluation report, optionally
    runs the counterexample tests and a baseline comparison, and optionally
    saves the combined results as JSON.
    """
    runner = BenchmarkRunner(prompts_dir=args.prompts_dir)
    runner.load_prompts(args.prompts_file)

    print(f"Loading responses from: {args.responses}")
    responses = runner.load_responses(args.responses)
    print(f" Loaded {len(responses)} responses")

    # Score
    print("\nScoring responses...")
    results = runner.score_responses(responses)
    print(runner.format_report(results))

    # Counterexamples
    if args.counterexamples:
        print("\nRunning counterexample tests...")
        runner.load_counterexamples()
        ce_results = runner.score_counterexamples(responses)
        print(f" Refutation detection rate: {ce_results['refutation_rate']:.2%}")
        results["counterexamples"] = ce_results

    # Comparison (the responses given via --responses are the "trained" side)
    if args.baseline:
        print(f"\nLoading baseline from: {args.baseline}")
        baseline = runner.load_responses(args.baseline)
        comparison = runner.compare_models(baseline, responses)
        print(runner.format_comparison_report(comparison))
        results["comparison"] = comparison

    # Save
    if args.output:
        runner.save_results(results, args.output)
        print(f"\nResults saved to: {args.output}")


def main(argv: Optional[List[str]] = None) -> None:
    """Command-line entry point.

    Args:
        argv: argument list to parse instead of ``sys.argv[1:]``; leave as
            ``None`` for normal command-line use.

    Raises:
        SystemExit: on ``--help`` or invalid arguments (raised by argparse),
            or with an error message (exit status 1) when an input/output
            file cannot be read, written or parsed as JSON.
    """
    parser = argparse.ArgumentParser(
        description="Codette Benchmark Runner - evaluate model reasoning quality"
    )
    parser.add_argument(
        "--responses", "-r",
        required=True,
        help="Path to JSON file with pre-generated responses (prompt -> response)",
    )
    parser.add_argument(
        "--prompts-dir", "-p",
        default=None,
        help="Directory containing prompt JSON files (default: evaluation/prompts/)",
    )
    parser.add_argument(
        "--baseline", "-b",
        default=None,
        help="Path to baseline responses JSON for comparison",
    )
    parser.add_argument(
        "--output", "-o",
        default=None,
        help="Save results to this JSON file",
    )
    parser.add_argument(
        "--counterexamples", "-c",
        action="store_true",
        help="Also run counterexample tests",
    )
    parser.add_argument(
        "--prompts-file",
        default="reasoning_tests.json",
        help="Prompt file name inside prompts dir (default: reasoning_tests.json)",
    )
    args = parser.parse_args(argv)

    try:
        _run_benchmark(args)
    except (OSError, json.JSONDecodeError) as exc:
        # Missing/unreadable files and malformed JSON are user errors: report
        # them as a one-line message instead of a traceback.  SystemExit with
        # a string prints it to stderr and exits with status 1.
        raise SystemExit(f"error: {exc}") from exc


if __name__ == "__main__":
    main()