recon / eval /calibration.py
MukulRay's picture
Phase 13: HF Spaces deploy ready - verdict logging, clean requirements
6f237d6
"""
eval/calibration.py
--------------------
Phase 11 — Calibration Curve Generator
Reads eval/results/recon_linear.csv and produces:
docs/calibration_curve.png
The calibration curve answers: "When the critic assigns a verdict,
does that verdict actually predict position accuracy?"
A well-calibrated critic shows:
PASS > FORCED_PASS > INSUFFICIENT > STALE (in position accuracy)
This proves the critic's verdict tiers are statistically meaningful,
not arbitrary heuristic labels.
Run from repo root:
python eval/calibration.py
"""
import sys
import os
import csv
import json
from collections import defaultdict
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
# ── Output paths ──────────────────────────────────────────────────────────────
# Every path is anchored on this script's own directory (eval/), so the same
# files are resolved regardless of the current working directory.
EVAL_DIR = os.path.dirname(os.path.abspath(__file__))
_REPO_ROOT = os.path.dirname(EVAL_DIR)
_RESULTS_DIR = os.path.join(EVAL_DIR, "results")

# docs/ is a sibling of eval/; it is created at import time if missing.
DOCS_DIR = os.path.join(_REPO_ROOT, "docs")
os.makedirs(DOCS_DIR, exist_ok=True)

# Input CSV and output figure locations.
RECON_LINEAR_CSV = os.path.join(_RESULTS_DIR, "recon_linear.csv")
OUTPUT_PNG = os.path.join(DOCS_DIR, "calibration_curve.png")
# ── Verdict display order and colours ────────────────────────────────────────
# Single source of truth: one row per verdict, listed in display order.
# Columns: verdict key, bar colour (hex), x-axis tick label.
_VERDICT_TABLE = (
    ("PASS", "#22c55e", "PASS"),              # green
    ("FORCED_PASS", "#f59e0b", "FORCED\nPASS"),  # amber
    ("STALE", "#f97316", "STALE"),            # orange
    ("INSUFFICIENT", "#ef4444", "INSUF."),    # red
    ("CONTRADICTED", "#a855f7", "CONTRA."),   # purple
    ("N/A", "#6b7280", "N/A"),                # grey
)

# The three public lookups are derived from the table so they cannot drift.
VERDICT_ORDER = [verdict for verdict, _colour, _label in _VERDICT_TABLE]
VERDICT_COLORS = {verdict: colour for verdict, colour, _label in _VERDICT_TABLE}
VERDICT_LABELS = {verdict: label for verdict, _colour, label in _VERDICT_TABLE}

# Accuracy score: MATCH=1.0, PARTIAL=0.5, MISMATCH=0.0, others=excluded
ACCURACY_SCORE = dict(MATCH=1.0, PARTIAL=0.5, MISMATCH=0.0)
def load_recon_linear(csv_path: "str | None" = None) -> list[dict]:
    """
    Read an evaluation results CSV into a list of row dicts.

    Args:
        csv_path: File to read. When omitted, ``RECON_LINEAR_CSV`` is used.

    Returns:
        One dict per data row, keyed by the CSV header names.

    Raises:
        FileNotFoundError: If the file does not exist.
    """
    path = RECON_LINEAR_CSV if csv_path is None else csv_path
    if not os.path.exists(path):
        raise FileNotFoundError(
            f"recon_linear.csv not found at {path}\n"
            "Run eval/run_eval.py first."
        )
    # newline="" hands line-ending handling to the csv module, so quoted
    # fields that contain line breaks are read back unchanged.
    with open(path, encoding="utf-8", newline="") as f:
        return list(csv.DictReader(f))
def compute_calibration(
    rows: list[dict],
    score_map: "dict[str, float] | None" = None,
) -> dict:
    """
    Summarise position accuracy per critic verdict.

    Rows are grouped by their ``critic_verdict`` value (missing or empty
    values go to the "N/A" bin). Each row's ``position_accuracy`` value is
    turned into a number via the score map; rows whose value is not a key of
    that map (e.g. SKIPPED / ERROR) are ignored. Surrounding whitespace in
    either field is stripped before matching.

    Args:
        rows: One dict per evaluated question.
        score_map: Mapping from accuracy label to numeric score. When
            omitted, ``ACCURACY_SCORE`` is used.

    Returns:
        ``{verdict: {"mean": float, "count": int, "ci95": float}}``.
        ``ci95`` is the half-width of a normal-approximation 95% interval
        (1.96 x the standard error of the mean), with the standard error
        derived from the sample variance (n - 1 denominator). A bin holding
        a single score reports ``ci95 == 0.0`` because its spread cannot be
        estimated. ``mean`` and ``ci95`` are rounded to 4 decimals.
    """
    import math

    scoring = ACCURACY_SCORE if score_map is None else score_map

    bins = defaultdict(list)
    for row in rows:
        # `or ""` also covers None, which csv.DictReader uses for short rows.
        verdict = str(row.get("critic_verdict") or "").strip() or "N/A"
        outcome = str(row.get("position_accuracy") or "").strip()
        if outcome not in scoring:
            continue  # skip SKIPPED / ERROR rows
        bins[verdict].append(scoring[outcome])

    result = {}
    for verdict, scores in bins.items():
        # n >= 1 always: a bin only exists once a score has been appended.
        n = len(scores)
        mean = sum(scores) / n
        if n > 1:
            # Bessel-corrected variance; dividing by n would understate the
            # uncertainty, noticeably so for the small bins.
            variance = sum((s - mean) ** 2 for s in scores) / (n - 1)
            se = math.sqrt(variance / n)
        else:
            se = 0.0
        result[verdict] = {
            "mean": round(mean, 4),
            "count": n,
            "ci95": round(1.96 * se, 4),
        }
    return result
def plot_calibration(calibration: dict, output_path: str) -> None:
    """
    Draw the per-verdict accuracy bar chart and save it to ``output_path``.

    Args:
        calibration: Mapping ``{verdict: {"mean", "count", "ci95"}}``, the
            shape returned by ``compute_calibration``.
        output_path: Destination image file; its parent directory is created
            if it does not exist.

    Verdicts listed in ``VERDICT_ORDER`` are drawn first, in that order. Any
    other verdict present in ``calibration`` follows in sorted order, using
    the grey fallback colour and its raw name as tick label. When
    ``calibration`` is empty a warning is printed and no file is written.
    """
    # Known verdicts first, then anything unexpected so no bin is dropped.
    known = [v for v in VERDICT_ORDER if v in calibration]
    extra = sorted((v for v in calibration if v not in VERDICT_ORDER), key=str)
    present = known + extra
    if not present:
        print("⚠ No verdict data found — nothing to plot.")
        return

    import matplotlib
    matplotlib.use("Agg")  # non-interactive backend — works on Windows + Colab
    import matplotlib.pyplot as plt
    import numpy as np

    means = [calibration[v]["mean"] for v in present]
    ci95s = [calibration[v]["ci95"] for v in present]
    counts = [calibration[v]["count"] for v in present]
    colors = [VERDICT_COLORS.get(v, "#6b7280") for v in present]
    labels = [VERDICT_LABELS.get(v, v) for v in present]

    # Accuracy is bounded to [0, 1], so whiskers are clipped to that range
    # instead of suggesting values below 0% or above 100%.
    err_low = [max(0.0, min(ci, m)) for m, ci in zip(means, ci95s)]
    err_high = [max(0.0, min(ci, 1.0 - m)) for m, ci in zip(means, ci95s)]

    x = np.arange(len(present))
    fig, ax = plt.subplots(figsize=(9, 5.5))
    fig.patch.set_facecolor("#0f172a")
    ax.set_facecolor("#1e293b")

    # Bars
    bars = ax.bar(
        x, means,
        width=0.55,
        color=colors,
        alpha=0.88,
        zorder=3,
        edgecolor="#0f172a",
        linewidth=1.2,
    )

    # Error bars (95% CI); yerr rows are (lower, upper) extents.
    ax.errorbar(
        x, means,
        yerr=[err_low, err_high],
        fmt="none",
        ecolor="#e2e8f0",
        elinewidth=1.5,
        capsize=5,
        capthick=1.5,
        zorder=4,
    )

    # Value labels on bars: percentage above the upper whisker, n at the base.
    for bar, mean, count, upper in zip(bars, means, counts, err_high):
        centre = bar.get_x() + bar.get_width() / 2
        ax.text(
            centre,
            mean + upper + 0.025,
            f"{mean:.0%}",
            ha="center", va="bottom",
            fontsize=10, fontweight="bold",
            color="#f1f5f9",
        )
        ax.text(
            centre,
            0.01,
            f"n={count}",
            ha="center", va="bottom",
            fontsize=8,
            color="#94a3b8",
        )

    # Reference line at chance (0.5 = all PARTIAL)
    ax.axhline(0.5, color="#64748b", linestyle="--", linewidth=1.0,
               alpha=0.6, zorder=2, label="PARTIAL baseline (0.50)")

    # Gridlines
    ax.yaxis.grid(True, color="#334155", linewidth=0.6, alpha=0.7, zorder=1)
    ax.set_axisbelow(True)
    ax.spines[["top", "right", "left", "bottom"]].set_visible(False)
    ax.tick_params(colors="#94a3b8", length=0)

    # Axes
    ax.set_xticks(x)
    ax.set_xticklabels(labels, fontsize=10, color="#e2e8f0", fontweight="500")
    ax.set_ylim(0, 1.12)
    ax.set_yticks([0, 0.25, 0.5, 0.75, 1.0])
    ax.set_yticklabels(["0%", "25%", "50%", "75%", "100%"],
                       fontsize=9, color="#94a3b8")

    # Labels and title
    ax.set_xlabel("Critic Verdict", fontsize=11, color="#94a3b8",
                  labelpad=10, fontweight="500")
    ax.set_ylabel("Position Accuracy\n(MATCH=1.0 · PARTIAL=0.5 · MISMATCH=0)",
                  fontsize=9, color="#94a3b8", labelpad=10)
    ax.set_title(
        "RECON Critic Calibration Curve",
        fontsize=14, fontweight="bold", color="#f1f5f9", pad=16,
    )
    # The question count comes from the plotted bins, not a fixed number.
    total = sum(counts)
    ax.text(
        0.5, 1.04,
        f"Position accuracy by critic verdict — recon_linear · {total} questions",
        transform=ax.transAxes,
        ha="center", fontsize=9, color="#64748b",
    )

    # Legend
    ax.legend(
        loc="upper right", fontsize=8,
        facecolor="#1e293b", edgecolor="#334155",
        labelcolor="#94a3b8",
    )

    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    fig.tight_layout(pad=1.5)
    fig.savefig(output_path, dpi=160, bbox_inches="tight",
                facecolor=fig.get_facecolor())
    plt.close(fig)  # close this figure specifically, not whichever is current
    print(f"✅ Calibration curve saved → {output_path}")
def main():
    """
    Run the calibration report end to end.

    Loads the evaluation CSV, prints the per-verdict table, writes the
    calibration figure to ``OUTPUT_PNG`` and the same data as JSON next to
    it, then prints a PASS-vs-STALE interpretation.
    """
    print("=" * 55)
    print("RECON — Calibration Curve Generator")
    print("=" * 55)
    rows = load_recon_linear()
    print(f"Loaded {len(rows)} rows from recon_linear.csv")
    calibration = compute_calibration(rows)

    print("\nCalibration data:")
    print(f" {'Verdict':<16} {'Mean Acc':>9} {'Count':>7} {'95% CI':>8}")
    print(f" {'-'*16} {'-'*9} {'-'*7} {'-'*8}")
    # Known verdicts in display order, then any unexpected ones so that no
    # bin is silently left out of the table.
    ordered = [v for v in VERDICT_ORDER if v in calibration]
    ordered += sorted((v for v in calibration if v not in VERDICT_ORDER), key=str)
    for v in ordered:
        d = calibration[v]
        print(f" {v:<16} {d['mean']:>8.1%} {d['count']:>7} ±{d['ci95']:.3f}")
    print()

    plot_calibration(calibration, OUTPUT_PNG)

    # Also save calibration data as JSON for reference. splitext swaps only
    # the final extension; str.replace would rewrite every ".png" in the path.
    json_path = os.path.splitext(OUTPUT_PNG)[0] + ".json"
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(calibration, f, indent=2)
    print(f"✅ Calibration data saved → {json_path}")

    print("\nInterpretation:")
    if "PASS" in calibration and "STALE" in calibration:
        pass_acc = calibration["PASS"]["mean"]
        stale_acc = calibration["STALE"]["mean"]
        if pass_acc > stale_acc:
            print(f" ✓ PASS ({pass_acc:.0%}) > STALE ({stale_acc:.0%})")
            print(" Critic is calibrated — higher-confidence verdicts")
            print(" correlate with higher position accuracy.")
        else:
            print(f" ⚠ PASS ({pass_acc:.0%}) ≤ STALE ({stale_acc:.0%})")
            print(" Calibration is weak — discuss in limitations.")
    else:
        print(" PASS and/or STALE bin missing — cannot compare.")


if __name__ == "__main__":
    main()