Spaces:

MukulRay
/

recon

Sleeping

App Files Files Community

recon / eval /calibration.py

MukulRay

Phase 13: HF Spaces deploy ready - verdict logging, clean requirements

6f237d6 about 2 months ago

raw

history blame contribute delete

8.63 kB

	"""
	eval/calibration.py
	--------------------
	Phase 11 — Calibration Curve Generator

	Reads eval/results/recon_linear.csv and produces:
	docs/calibration_curve.png

	The calibration curve answers: "When the critic assigns a verdict,
	does that verdict actually predict position accuracy?"

	A well-calibrated critic shows:
	PASS > FORCED_PASS > INSUFFICIENT > STALE (in position accuracy)

	This proves the critic's verdict tiers are statistically meaningful,
	not arbitrary heuristic labels.

	Run from repo root:
	python eval/calibration.py
	"""

	import sys
	import os
	import csv
	import json
	from collections import defaultdict

	sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

	# ── Output paths ──────────────────────────────────────────────────────────────
	EVAL_DIR = os.path.dirname(os.path.abspath(__file__))
	DOCS_DIR = os.path.join(os.path.dirname(EVAL_DIR), "docs")
	os.makedirs(DOCS_DIR, exist_ok=True)

	RECON_LINEAR_CSV = os.path.join(EVAL_DIR, "results", "recon_linear.csv")
	OUTPUT_PNG = os.path.join(DOCS_DIR, "calibration_curve.png")

	# ── Verdict display order and colours ────────────────────────────────────────
	VERDICT_ORDER = ["PASS", "FORCED_PASS", "STALE", "INSUFFICIENT", "CONTRADICTED", "N/A"]

	VERDICT_COLORS = {
	"PASS": "#22c55e", # green
	"FORCED_PASS": "#f59e0b", # amber
	"STALE": "#f97316", # orange
	"INSUFFICIENT": "#ef4444", # red
	"CONTRADICTED": "#a855f7", # purple
	"N/A": "#6b7280", # grey
	}

	VERDICT_LABELS = {
	"PASS": "PASS",
	"FORCED_PASS": "FORCED\nPASS",
	"STALE": "STALE",
	"INSUFFICIENT": "INSUF.",
	"CONTRADICTED": "CONTRA.",
	"N/A": "N/A",
	}

	# Accuracy score: MATCH=1.0, PARTIAL=0.5, MISMATCH=0.0, others=excluded
	ACCURACY_SCORE = {"MATCH": 1.0, "PARTIAL": 0.5, "MISMATCH": 0.0}


	def load_recon_linear() -> list[dict]:
	if not os.path.exists(RECON_LINEAR_CSV):
	raise FileNotFoundError(
	f"recon_linear.csv not found at {RECON_LINEAR_CSV}\n"
	"Run eval/run_eval.py first."
	)
	with open(RECON_LINEAR_CSV, encoding="utf-8") as f:
	return list(csv.DictReader(f))


	def compute_calibration(rows: list[dict]) -> dict:
	"""
	For each critic verdict bin, compute:
	- mean position accuracy (MATCH=1, PARTIAL=0.5, MISMATCH=0)
	- count of questions in that bin
	- 95% confidence interval (Wilson score interval approximation)
	"""
	import math

	bins = defaultdict(list)

	for row in rows:
	verdict = row.get("critic_verdict", "N/A") or "N/A"
	accuracy = row.get("position_accuracy", "")

	if accuracy not in ACCURACY_SCORE:
	continue # skip SKIPPED / ERROR rows

	bins[verdict].append(ACCURACY_SCORE[accuracy])

	result = {}
	for verdict, scores in bins.items():
	n = len(scores)
	mean = sum(scores) / n if n else 0.0

	# Standard error for mean of bounded [0,1] scores
	variance = sum((s - mean) ** 2 for s in scores) / n if n > 1 else 0.0
	se = math.sqrt(variance / n) if n > 0 else 0.0
	ci95 = 1.96 * se

	result[verdict] = {
	"mean": round(mean, 4),
	"count": n,
	"ci95": round(ci95, 4),
	}

	return result


	def plot_calibration(calibration: dict, output_path: str) -> None:
	import matplotlib
	matplotlib.use("Agg") # non-interactive backend — works on Windows + Colab
	import matplotlib.pyplot as plt
	import matplotlib.patches as mpatches
	import numpy as np

	# Filter to verdicts that actually appear in the data
	present = [v for v in VERDICT_ORDER if v in calibration]

	if not present:
	print("⚠ No verdict data found — nothing to plot.")
	return

	means = [calibration[v]["mean"] for v in present]
	ci95s = [calibration[v]["ci95"] for v in present]
	counts = [calibration[v]["count"] for v in present]
	colors = [VERDICT_COLORS.get(v, "#6b7280") for v in present]
	labels = [VERDICT_LABELS.get(v, v) for v in present]

	x = np.arange(len(present))

	fig, ax = plt.subplots(figsize=(9, 5.5))
	fig.patch.set_facecolor("#0f172a")
	ax.set_facecolor("#1e293b")

	# Bars
	bars = ax.bar(
	x, means,
	width=0.55,
	color=colors,
	alpha=0.88,
	zorder=3,
	edgecolor="#0f172a",
	linewidth=1.2,
	)

	# Error bars (95% CI)
	ax.errorbar(
	x, means,
	yerr=ci95s,
	fmt="none",
	ecolor="#e2e8f0",
	elinewidth=1.5,
	capsize=5,
	capthick=1.5,
	zorder=4,
	)

	# Value labels on bars
	for bar, mean, count, ci in zip(bars, means, counts, ci95s):
	ax.text(
	bar.get_x() + bar.get_width() / 2,
	mean + ci + 0.025,
	f"{mean:.0%}",
	ha="center", va="bottom",
	fontsize=10, fontweight="bold",
	color="#f1f5f9",
	)
	ax.text(
	bar.get_x() + bar.get_width() / 2,
	0.01,
	f"n={count}",
	ha="center", va="bottom",
	fontsize=8,
	color="#94a3b8",
	)

	# Reference line at chance (0.5 = all PARTIAL)
	ax.axhline(0.5, color="#64748b", linestyle="--", linewidth=1.0,
	alpha=0.6, zorder=2, label="PARTIAL baseline (0.50)")

	# Gridlines
	ax.yaxis.grid(True, color="#334155", linewidth=0.6, alpha=0.7, zorder=1)
	ax.set_axisbelow(True)
	ax.spines[["top", "right", "left", "bottom"]].set_visible(False)
	ax.tick_params(colors="#94a3b8", length=0)

	# Axes
	ax.set_xticks(x)
	ax.set_xticklabels(labels, fontsize=10, color="#e2e8f0", fontweight="500")
	ax.set_ylim(0, 1.12)
	ax.set_yticks([0, 0.25, 0.5, 0.75, 1.0])
	ax.set_yticklabels(["0%", "25%", "50%", "75%", "100%"],
	fontsize=9, color="#94a3b8")

	# Labels and title
	ax.set_xlabel("Critic Verdict", fontsize=11, color="#94a3b8",
	labelpad=10, fontweight="500")
	ax.set_ylabel("Position Accuracy\n(MATCH=1.0 · PARTIAL=0.5 · MISMATCH=0)",
	fontsize=9, color="#94a3b8", labelpad=10)

	ax.set_title(
	"RECON Critic Calibration Curve",
	fontsize=14, fontweight="bold", color="#f1f5f9", pad=16,
	)
	ax.text(
	0.5, 1.04,
	"Position accuracy by critic verdict — recon_linear · 130 questions",
	transform=ax.transAxes,
	ha="center", fontsize=9, color="#64748b",
	)

	# Legend
	ax.legend(
	loc="upper right", fontsize=8,
	facecolor="#1e293b", edgecolor="#334155",
	labelcolor="#94a3b8",
	)

	plt.tight_layout(pad=1.5)
	plt.savefig(output_path, dpi=160, bbox_inches="tight",
	facecolor=fig.get_facecolor())
	plt.close()

	print(f"✅ Calibration curve saved → {output_path}")


	def main():
	print("=" * 55)
	print("RECON — Calibration Curve Generator")
	print("=" * 55)

	rows = load_recon_linear()
	print(f"Loaded {len(rows)} rows from recon_linear.csv")

	calibration = compute_calibration(rows)

	print("\nCalibration data:")
	print(f" {'Verdict':<16} {'Mean Acc':>9} {'Count':>7} {'95% CI':>8}")
	print(f" {'-'16} {'-'9} {'-'7} {'-'8}")
	for v in VERDICT_ORDER:
	if v in calibration:
	d = calibration[v]
	print(f" {v:<16} {d['mean']:>8.1%} {d['count']:>7} ±{d['ci95']:.3f}")

	print()
	plot_calibration(calibration, OUTPUT_PNG)

	# Also save calibration data as JSON for reference
	json_path = OUTPUT_PNG.replace(".png", ".json")
	with open(json_path, "w") as f:
	json.dump(calibration, f, indent=2)
	print(f"✅ Calibration data saved → {json_path}")

	print("\nInterpretation:")
	if "PASS" in calibration and "STALE" in calibration:
	pass_acc = calibration["PASS"]["mean"]
	stale_acc = calibration["STALE"]["mean"]
	if pass_acc > stale_acc:
	print(f" ✓ PASS ({pass_acc:.0%}) > STALE ({stale_acc:.0%})")
	print(" Critic is calibrated — higher-confidence verdicts")
	print(" correlate with higher position accuracy.")
	else:
	print(f" ⚠ PASS ({pass_acc:.0%}) ≤ STALE ({stale_acc:.0%})")
	print(" Calibration is weak — discuss in limitations.")


	if __name__ == "__main__":
	main()