| """ |
| eval/calibration.py |
| -------------------- |
| Phase 11 — Calibration Curve Generator |
| |
| Reads eval/results/recon_linear.csv and produces: |
| docs/calibration_curve.png |
| |
| The calibration curve answers: "When the critic assigns a verdict, |
| does that verdict actually predict position accuracy?" |
| |
| A well-calibrated critic shows: |
| PASS > FORCED_PASS > INSUFFICIENT > STALE (in position accuracy) |
| |
| This proves the critic's verdict tiers are statistically meaningful, |
| not arbitrary heuristic labels. |
| |
| Run from repo root: |
| python eval/calibration.py |
| """ |
|
|
| import sys |
| import os |
| import csv |
| import json |
| from collections import defaultdict |
|
|
# Allow imports relative to the repository root when this file is executed
# directly as a script from inside eval/.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

# --- Filesystem layout -------------------------------------------------------
# eval/ is the directory holding this file; docs/ is its sibling and is
# created on import so the outputs always have somewhere to go.
EVAL_DIR = os.path.dirname(os.path.abspath(__file__))
DOCS_DIR = os.path.join(os.path.dirname(EVAL_DIR), "docs")
os.makedirs(DOCS_DIR, exist_ok=True)

RECON_LINEAR_CSV = os.path.join(EVAL_DIR, "results", "recon_linear.csv")
OUTPUT_PNG = os.path.join(DOCS_DIR, "calibration_curve.png")

# --- Verdict presentation ----------------------------------------------------
# One row per critic verdict: (name, bar colour, x-axis tick label).
# The row order is the left-to-right order of the bars in the chart.
_VERDICT_STYLES = (
    ("PASS", "#22c55e", "PASS"),
    ("FORCED_PASS", "#f59e0b", "FORCED\nPASS"),
    ("STALE", "#f97316", "STALE"),
    ("INSUFFICIENT", "#ef4444", "INSUF."),
    ("CONTRADICTED", "#a855f7", "CONTRA."),
    ("N/A", "#6b7280", "N/A"),
)

VERDICT_ORDER = [verdict for verdict, _, _ in _VERDICT_STYLES]
VERDICT_COLORS = {verdict: colour for verdict, colour, _ in _VERDICT_STYLES}
VERDICT_LABELS = {verdict: tick for verdict, _, tick in _VERDICT_STYLES}

# --- Scoring -----------------------------------------------------------------
# Numeric value assigned to each position_accuracy label in the CSV.
ACCURACY_SCORE = dict(MATCH=1.0, PARTIAL=0.5, MISMATCH=0.0)
|
|
|
|
def load_recon_linear(path: "str | None" = None) -> list[dict]:
    """Load the recon_linear evaluation results as a list of row dicts.

    Args:
        path: CSV file to read. Defaults to ``RECON_LINEAR_CSV`` when omitted.

    Returns:
        One dict per data row, keyed by the CSV header names.

    Raises:
        FileNotFoundError: If the CSV does not exist; the message points at
            the script that generates it.
    """
    csv_path = RECON_LINEAR_CSV if path is None else path
    try:
        # newline="" lets the csv module do its own line-ending handling, so
        # newlines embedded in quoted fields are preserved verbatim.
        with open(csv_path, encoding="utf-8", newline="") as f:
            return list(csv.DictReader(f))
    except FileNotFoundError as err:
        raise FileNotFoundError(
            f"recon_linear.csv not found at {csv_path}\n"
            "Run eval/run_eval.py first."
        ) from err
|
|
|
|
def compute_calibration(
    rows: list[dict],
    score_map: "dict[str, float] | None" = None,
) -> dict:
    """Aggregate position accuracy per critic verdict.

    Each row is binned by its ``critic_verdict`` cell (missing or blank
    becomes ``"N/A"``) and scored through its ``position_accuracy`` cell.
    Rows whose accuracy label is not in the score map are skipped.
    Surrounding whitespace in either cell is ignored.

    Args:
        rows: Row dicts, e.g. as returned by ``csv.DictReader``.
        score_map: Mapping of accuracy label to numeric score. Defaults to
            the module-level ``ACCURACY_SCORE``.

    Returns:
        ``{verdict: {"mean": float, "count": int, "ci95": float}}`` in
        first-seen verdict order. ``ci95`` is the half-width of a
        normal-approximation interval, ``1.96 * sqrt(variance / n)``, using
        the population variance (divide by ``n``). It is not a Wilson
        interval, and ``mean ± ci95`` is not clamped to ``[0, 1]``.
        All floats are rounded to 4 decimals.
    """
    import math

    table = ACCURACY_SCORE if score_map is None else score_map

    bins = defaultdict(list)
    for row in rows:
        # DictReader yields None for cells missing from short rows, hence
        # the `or ""` before stripping.
        verdict = (row.get("critic_verdict") or "").strip() or "N/A"
        accuracy = (row.get("position_accuracy") or "").strip()
        if accuracy not in table:
            continue
        bins[verdict].append(table[accuracy])

    result = {}
    for verdict, scores in bins.items():
        # A bin only exists once a score was appended, so n >= 1 here.
        n = len(scores)
        mean = sum(scores) / n
        variance = sum((s - mean) ** 2 for s in scores) / n
        ci95 = 1.96 * math.sqrt(variance / n)

        result[verdict] = {
            "mean": round(mean, 4),
            "count": n,
            "ci95": round(ci95, 4),
        }

    return result
|
|
|
|
def plot_calibration(calibration: dict, output_path: str) -> None:
    """Draw the per-verdict accuracy bar chart and save it to disk.

    Args:
        calibration: Mapping of verdict name to a dict with ``"mean"``,
            ``"count"`` and ``"ci95"`` entries (the shape returned by
            ``compute_calibration``). Only verdicts listed in
            ``VERDICT_ORDER`` are drawn, in that order.
        output_path: Destination image file.

    If none of the known verdicts is present, a notice is printed and no
    file is written.
    """
    # Decide what to draw first so an empty input returns without pulling in
    # the plotting stack.
    present = [v for v in VERDICT_ORDER if v in calibration]
    if not present:
        print("⚠ No verdict data found — nothing to plot.")
        return

    import matplotlib
    matplotlib.use("Agg")  # file-only backend; must be set before pyplot is imported
    import matplotlib.pyplot as plt
    import numpy as np

    means = [calibration[v]["mean"] for v in present]
    ci95s = [calibration[v]["ci95"] for v in present]
    counts = [calibration[v]["count"] for v in present]
    colors = [VERDICT_COLORS.get(v, "#6b7280") for v in present]
    labels = [VERDICT_LABELS.get(v, v) for v in present]
    total_questions = sum(counts)

    x = np.arange(len(present))

    fig, ax = plt.subplots(figsize=(9, 5.5))
    fig.patch.set_facecolor("#0f172a")
    ax.set_facecolor("#1e293b")

    # Bars: mean accuracy per verdict.
    bars = ax.bar(
        x, means,
        width=0.55,
        color=colors,
        alpha=0.88,
        zorder=3,
        edgecolor="#0f172a",
        linewidth=1.2,
    )

    # Error bars: +/- ci95 around each mean.
    ax.errorbar(
        x, means,
        yerr=ci95s,
        fmt="none",
        ecolor="#e2e8f0",
        elinewidth=1.5,
        capsize=5,
        capthick=1.5,
        zorder=4,
    )

    # Annotations: percentage above the error bar, sample size at the base.
    for bar, mean, count, ci in zip(bars, means, counts, ci95s):
        centre = bar.get_x() + bar.get_width() / 2
        ax.text(
            centre,
            mean + ci + 0.025,
            f"{mean:.0%}",
            ha="center", va="bottom",
            fontsize=10, fontweight="bold",
            color="#f1f5f9",
        )
        ax.text(
            centre,
            0.01,
            f"n={count}",
            ha="center", va="bottom",
            fontsize=8,
            color="#94a3b8",
        )

    # Reference line at the score assigned to PARTIAL.
    ax.axhline(0.5, color="#64748b", linestyle="--", linewidth=1.0,
               alpha=0.6, zorder=2, label="PARTIAL baseline (0.50)")

    # Grid and frame styling.
    ax.yaxis.grid(True, color="#334155", linewidth=0.6, alpha=0.7, zorder=1)
    ax.set_axisbelow(True)
    ax.spines[["top", "right", "left", "bottom"]].set_visible(False)
    ax.tick_params(colors="#94a3b8", length=0)

    # Axes ticks; the y-limit leaves headroom above 100% for the labels.
    ax.set_xticks(x)
    ax.set_xticklabels(labels, fontsize=10, color="#e2e8f0", fontweight="500")
    ax.set_ylim(0, 1.12)
    ax.set_yticks([0, 0.25, 0.5, 0.75, 1.0])
    ax.set_yticklabels(["0%", "25%", "50%", "75%", "100%"],
                       fontsize=9, color="#94a3b8")

    # Axis labels, title and subtitle.
    ax.set_xlabel("Critic Verdict", fontsize=11, color="#94a3b8",
                  labelpad=10, fontweight="500")
    ax.set_ylabel("Position Accuracy\n(MATCH=1.0 · PARTIAL=0.5 · MISMATCH=0)",
                  fontsize=9, color="#94a3b8", labelpad=10)

    ax.set_title(
        "RECON Critic Calibration Curve",
        fontsize=14, fontweight="bold", color="#f1f5f9", pad=16,
    )
    ax.text(
        0.5, 1.04,
        # Question count comes from the plotted bins rather than a literal,
        # so the subtitle stays correct when the dataset changes.
        f"Position accuracy by critic verdict — recon_linear · {total_questions} questions",
        transform=ax.transAxes,
        ha="center", fontsize=9, color="#64748b",
    )

    ax.legend(
        loc="upper right", fontsize=8,
        facecolor="#1e293b", edgecolor="#334155",
        labelcolor="#94a3b8",
    )

    fig.tight_layout(pad=1.5)
    fig.savefig(output_path, dpi=160, bbox_inches="tight",
                facecolor=fig.get_facecolor())
    # Close this specific figure rather than whatever happens to be current.
    plt.close(fig)

    print(f"✅ Calibration curve saved → {output_path}")
|
|
|
|
def main():
    """Run the calibration pipeline end to end.

    Loads the recon_linear CSV, computes per-verdict statistics, prints them
    as a table, writes the chart to ``OUTPUT_PNG`` and the raw statistics to a
    ``.json`` file next to it, then prints a PASS-vs-STALE interpretation.
    """
    print("=" * 55)
    print("RECON — Calibration Curve Generator")
    print("=" * 55)

    rows = load_recon_linear()
    print(f"Loaded {len(rows)} rows from recon_linear.csv")

    calibration = compute_calibration(rows)

    print("\nCalibration data:")
    print(f" {'Verdict':<16} {'Mean Acc':>9} {'Count':>7} {'95% CI':>8}")
    print(f" {'-'*16} {'-'*9} {'-'*7} {'-'*8}")
    for v in VERDICT_ORDER:
        if v in calibration:
            d = calibration[v]
            print(f" {v:<16} {d['mean']:>8.1%} {d['count']:>7} ±{d['ci95']:.3f}")

    print()
    plot_calibration(calibration, OUTPUT_PNG)

    # Swap only the file extension; str.replace would also rewrite any
    # ".png" that happened to appear in a directory name.
    json_path = os.path.splitext(OUTPUT_PNG)[0] + ".json"
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(calibration, f, indent=2)
    print(f"✅ Calibration data saved → {json_path}")

    print("\nInterpretation:")
    if "PASS" in calibration and "STALE" in calibration:
        pass_acc = calibration["PASS"]["mean"]
        stale_acc = calibration["STALE"]["mean"]
        if pass_acc > stale_acc:
            print(f" ✓ PASS ({pass_acc:.0%}) > STALE ({stale_acc:.0%})")
            print(" Critic is calibrated — higher-confidence verdicts")
            print(" correlate with higher position accuracy.")
        else:
            print(f" ✗ PASS ({pass_acc:.0%}) ≤ STALE ({stale_acc:.0%})")
            print(" Calibration is weak — discuss in limitations.")
    else:
        # Without both tiers there is nothing to compare; say so instead of
        # leaving the heading empty.
        print(" PASS and/or STALE verdicts missing — cannot assess calibration.")


if __name__ == "__main__":
    main()