Spaces:

MukulRay
/

recon

Sleeping

File size: 8,629 Bytes

6f237d6

"""
eval/calibration.py
--------------------
Phase 11 — Calibration Curve Generator

Reads eval/results/recon_linear.csv and produces:
    docs/calibration_curve.png
    docs/calibration_curve.json   (the per-verdict numbers behind the plot)

The calibration curve answers: "When the critic assigns a verdict,
does that verdict actually predict position accuracy?"

A well-calibrated critic shows:
    PASS > FORCED_PASS > INSUFFICIENT > STALE (in position accuracy)

This proves the critic's verdict tiers are statistically meaningful,
not arbitrary heuristic labels.

Run from repo root:
    python eval/calibration.py
"""

import sys
import os
import csv
import json
from collections import defaultdict

sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

# ── Filesystem layout ─────────────────────────────────────────────────────────
# Every location is derived from this file's own absolute path, so the script
# does not depend on the current working directory.
EVAL_DIR = os.path.split(os.path.abspath(__file__))[0]           # directory holding this file
DOCS_DIR = os.path.join(os.path.split(EVAL_DIR)[0], "docs")      # sibling "docs" directory

# Input CSV lives under <EVAL_DIR>/results; the figure is written into DOCS_DIR.
RECON_LINEAR_CSV = os.path.join(EVAL_DIR, "results", "recon_linear.csv")
OUTPUT_PNG       = os.path.join(DOCS_DIR, "calibration_curve.png")

# Import-time side effect: make sure the output directory exists.
os.makedirs(DOCS_DIR, exist_ok=True)

# ── Verdict presentation ─────────────────────────────────────────────────────
# One row per critic verdict: (verdict, bar colour, x-axis tick label).
# Row order doubles as the display order.
_VERDICT_TABLE = (
    ("PASS",         "#22c55e", "PASS"),          # green
    ("FORCED_PASS",  "#f59e0b", "FORCED\nPASS"),  # amber
    ("STALE",        "#f97316", "STALE"),         # orange
    ("INSUFFICIENT", "#ef4444", "INSUF."),        # red
    ("CONTRADICTED", "#a855f7", "CONTRA."),       # purple
    ("N/A",          "#6b7280", "N/A"),           # grey
)

VERDICT_ORDER  = [name for name, _colour, _label in _VERDICT_TABLE]
VERDICT_COLORS = {name: colour for name, colour, _label in _VERDICT_TABLE}
VERDICT_LABELS = {name: label for name, _colour, label in _VERDICT_TABLE}

# Numeric value of each position-accuracy outcome; any outcome not listed
# here is excluded from the statistics.
ACCURACY_SCORE = dict(MATCH=1.0, PARTIAL=0.5, MISMATCH=0.0)


def load_recon_linear(path: "str | None" = None) -> list[dict]:
    """
    Load the recon_linear evaluation results as a list of row dicts.

    Args:
        path: CSV file to read. When omitted, RECON_LINEAR_CSV is used.

    Returns:
        One dict per data row, keyed by the CSV header names
        (the rows yielded by csv.DictReader).

    Raises:
        FileNotFoundError: if the CSV does not exist. The message names the
            path that was tried and points at eval/run_eval.py.
    """
    csv_path = RECON_LINEAR_CSV if path is None else path
    try:
        # newline="" hands newline handling to the csv module, so line breaks
        # embedded in quoted fields are preserved instead of being translated.
        with open(csv_path, encoding="utf-8", newline="") as f:
            return list(csv.DictReader(f))
    except FileNotFoundError as err:
        # Attempting the open (rather than checking os.path.exists first)
        # avoids a check-then-use race while keeping the friendly message.
        raise FileNotFoundError(
            f"recon_linear.csv not found at {csv_path}\n"
            "Run eval/run_eval.py first."
        ) from err


def compute_calibration(rows: list[dict]) -> dict:
    """
    Summarise position accuracy per critic verdict.

    Rows are grouped by their "critic_verdict" value (missing or blank
    verdicts go into the "N/A" bin). Rows whose "position_accuracy" is not a
    key of ACCURACY_SCORE (e.g. SKIPPED / ERROR) are ignored.

    Args:
        rows: row dicts, e.g. from csv.DictReader, carrying the
            "critic_verdict" and "position_accuracy" fields.

    Returns:
        Mapping verdict -> {"mean", "count", "ci95"} where
          - "mean"  is the mean accuracy score of the bin (rounded to 4 dp),
          - "count" is the number of scored rows in the bin,
          - "ci95"  is the half-width of a normal-approximation 95%
            confidence interval for the mean, 1.96 * s / sqrt(n), with s the
            sample standard deviation (rounded to 4 dp). It is 0.0 for a
            single-row bin, where no spread can be estimated.
        Bins with no scored rows do not appear in the result.
    """
    import math

    bins = defaultdict(list)

    for row in rows:
        # `or ""` covers cells that csv.DictReader filled with None.
        accuracy = (row.get("position_accuracy") or "").strip()
        if accuracy not in ACCURACY_SCORE:
            continue  # skip SKIPPED / ERROR rows

        verdict = (row.get("critic_verdict") or "").strip() or "N/A"
        bins[verdict].append(ACCURACY_SCORE[accuracy])

    result = {}
    for verdict, scores in bins.items():
        n    = len(scores)          # always >= 1: bins only exist once appended to
        mean = sum(scores) / n

        if n > 1:
            # Sample variance (Bessel's correction): dividing by n would
            # understate the spread, most noticeably for small bins.
            variance = sum((s - mean) ** 2 for s in scores) / (n - 1)
            ci95     = 1.96 * math.sqrt(variance / n)
        else:
            ci95 = 0.0

        result[verdict] = {
            "mean":  round(mean, 4),
            "count": n,
            "ci95":  round(ci95, 4),
        }

    return result


def plot_calibration(calibration: dict, output_path: str,
                     n_questions: "int | None" = None) -> None:
    """
    Render the per-verdict accuracy bar chart and save it to disk.

    Args:
        calibration: mapping verdict -> {"mean", "count", "ci95"}, in the
            shape produced by compute_calibration().
        output_path: destination file handed to Figure.savefig.
        n_questions: number of questions quoted in the subtitle. When
            omitted, the sum of the plotted bins' counts is used, so the
            caption always matches the data actually drawn.

    Only verdicts listed in VERDICT_ORDER are drawn, in that order. If none
    of them is present in `calibration`, a warning is printed and no file
    is written.
    """
    # Filter to verdicts that actually appear in the data. Done before the
    # matplotlib import so the "nothing to plot" path needs no plotting stack.
    present = [v for v in VERDICT_ORDER if v in calibration]

    if not present:
        print("⚠ No verdict data found — nothing to plot.")
        return

    import matplotlib
    matplotlib.use("Agg")   # non-interactive backend — works on Windows + Colab
    import matplotlib.pyplot as plt
    import numpy as np

    means  = [calibration[v]["mean"]  for v in present]
    ci95s  = [calibration[v]["ci95"]  for v in present]
    counts = [calibration[v]["count"] for v in present]
    colors = [VERDICT_COLORS.get(v, "#6b7280") for v in present]
    labels = [VERDICT_LABELS.get(v, v) for v in present]

    # Question count for the subtitle: caller-supplied, else what is plotted.
    total = sum(counts) if n_questions is None else n_questions

    x = np.arange(len(present))

    fig, ax = plt.subplots(figsize=(9, 5.5))
    fig.patch.set_facecolor("#0f172a")
    ax.set_facecolor("#1e293b")

    # Bars
    bars = ax.bar(
        x, means,
        width=0.55,
        color=colors,
        alpha=0.88,
        zorder=3,
        edgecolor="#0f172a",
        linewidth=1.2,
    )

    # Error bars (95% CI half-width from the calibration data)
    ax.errorbar(
        x, means,
        yerr=ci95s,
        fmt="none",
        ecolor="#e2e8f0",
        elinewidth=1.5,
        capsize=5,
        capthick=1.5,
        zorder=4,
    )

    # Value labels: percentage just above each error bar, sample size at the
    # foot of each bar.
    for bar, mean, count, ci in zip(bars, means, counts, ci95s):
        ax.text(
            bar.get_x() + bar.get_width() / 2,
            mean + ci + 0.025,
            f"{mean:.0%}",
            ha="center", va="bottom",
            fontsize=10, fontweight="bold",
            color="#f1f5f9",
        )
        ax.text(
            bar.get_x() + bar.get_width() / 2,
            0.01,
            f"n={count}",
            ha="center", va="bottom",
            fontsize=8,
            color="#94a3b8",
        )

    # Reference line at 0.5 (the score of an all-PARTIAL bin)
    ax.axhline(0.5, color="#64748b", linestyle="--", linewidth=1.0,
               alpha=0.6, zorder=2, label="PARTIAL baseline (0.50)")

    # Gridlines
    ax.yaxis.grid(True, color="#334155", linewidth=0.6, alpha=0.7, zorder=1)
    ax.set_axisbelow(True)
    ax.spines[["top", "right", "left", "bottom"]].set_visible(False)
    ax.tick_params(colors="#94a3b8", length=0)

    # Axes
    ax.set_xticks(x)
    ax.set_xticklabels(labels, fontsize=10, color="#e2e8f0", fontweight="500")
    ax.set_ylim(0, 1.12)   # headroom above 100% for the value labels
    ax.set_yticks([0, 0.25, 0.5, 0.75, 1.0])
    ax.set_yticklabels(["0%", "25%", "50%", "75%", "100%"],
                       fontsize=9, color="#94a3b8")

    # Labels and title
    ax.set_xlabel("Critic Verdict", fontsize=11, color="#94a3b8",
                  labelpad=10, fontweight="500")
    ax.set_ylabel("Position Accuracy\n(MATCH=1.0 · PARTIAL=0.5 · MISMATCH=0)",
                  fontsize=9, color="#94a3b8", labelpad=10)

    ax.set_title(
        "RECON Critic Calibration Curve",
        fontsize=14, fontweight="bold", color="#f1f5f9", pad=16,
    )
    ax.text(
        0.5, 1.04,
        f"Position accuracy by critic verdict — recon_linear · {total} questions",
        transform=ax.transAxes,
        ha="center", fontsize=9, color="#64748b",
    )

    # Legend
    ax.legend(
        loc="upper right", fontsize=8,
        facecolor="#1e293b", edgecolor="#334155",
        labelcolor="#94a3b8",
    )

    # Operate on this figure explicitly rather than on pyplot's "current
    # figure", so other open figures cannot be saved or closed by mistake.
    fig.tight_layout(pad=1.5)
    fig.savefig(output_path, dpi=160, bbox_inches="tight",
                facecolor=fig.get_facecolor())
    plt.close(fig)

    print(f"✅ Calibration curve saved → {output_path}")


def main():
    """
    Run the calibration report end to end.

    Loads the recon_linear CSV, computes per-verdict statistics, prints them
    as a table, saves the plot to OUTPUT_PNG and the raw numbers to a JSON
    file with the same base name, then prints a PASS-vs-STALE interpretation
    when both verdicts are present.
    """
    print("=" * 55)
    print("RECON — Calibration Curve Generator")
    print("=" * 55)

    rows = load_recon_linear()
    print(f"Loaded {len(rows)} rows from recon_linear.csv")

    calibration = compute_calibration(rows)

    print("\nCalibration data:")
    print(f"  {'Verdict':<16} {'Mean Acc':>9} {'Count':>7} {'95% CI':>8}")
    print(f"  {'-'*16} {'-'*9} {'-'*7} {'-'*8}")
    for v in VERDICT_ORDER:
        if v in calibration:
            d = calibration[v]
            print(f"  {v:<16} {d['mean']:>8.1%} {d['count']:>7} ±{d['ci95']:.3f}")

    print()
    plot_calibration(calibration, OUTPUT_PNG)

    # Also save calibration data as JSON for reference.
    # splitext swaps only the file extension; str.replace would rewrite every
    # ".png" occurring anywhere in the path (e.g. in a directory name).
    json_path = os.path.splitext(OUTPUT_PNG)[0] + ".json"
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(calibration, f, indent=2)
    print(f"✅ Calibration data saved → {json_path}")

    print("\nInterpretation:")
    # Only the two ends of the scale are compared; nothing is printed here
    # unless both PASS and STALE bins exist.
    if "PASS" in calibration and "STALE" in calibration:
        pass_acc  = calibration["PASS"]["mean"]
        stale_acc = calibration["STALE"]["mean"]
        if pass_acc > stale_acc:
            print(f"  ✓ PASS ({pass_acc:.0%}) > STALE ({stale_acc:.0%})")
            print("    Critic is calibrated — higher-confidence verdicts")
            print("    correlate with higher position accuracy.")
        else:
            print(f"  ⚠ PASS ({pass_acc:.0%}) ≤ STALE ({stale_acc:.0%})")
            print("    Calibration is weak — discuss in limitations.")


# Script entry point: run the report only when executed directly
# (e.g. `python eval/calibration.py`), not when imported as a module.
if __name__ == "__main__":
    main()