#!/usr/bin/env python3
"""
calibration_analysis.py — HalluMaze Confidence Calibration Analysis

Computes per-model calibration metrics from existing trial data:
  - ECE (Expected Calibration Error): binned CE across trials
  - Brier Score proxy: mean((ce_i)^2) per model
  - Mean CE, median CE, and coverage statistics

Data source: ce field (per-trial Calibration Error) already computed by
hallumaze.py. CE = mean |confidence/100 - correctness| per step within each trial.

When confidence_log data is available (future runs), computes step-level ECE
with 10 bins. For current data, uses trial-level CE values.

Usage:
    python3 scripts/calibration_analysis.py
    # Output: experiment_results/calibration.json
"""
from __future__ import annotations

import json
from collections import defaultdict
from pathlib import Path
from statistics import mean, median, stdev

BASE = Path(__file__).parent.parent / "experiment_results"

# ── Data Sources (consistent with build_final_analysis.py) ──────────────────
SOURCES = {
    "checkpoint_rerun": {
        "file": BASE / "checkpoint_rerun.json",
        "model_key": "model",
    },
    "or_phaseB_scout_gemini": {
        "file": BASE / "or_phaseB.json",
        "model_key": "or_model_id",
        "filter_models": [
            "meta-llama/llama-4-scout",
            "google/gemini-2.0-flash-lite-001",
        ],
    },
    "or_haiku": {"file": BASE / "or_haiku.json", "model_key": "or_model_id"},
    "or_gptmini": {"file": BASE / "or_gptmini.json", "model_key": "or_model_id"},
    "or_maverick": {"file": BASE / "or_maverick.json", "model_key": "or_model_id"},
    "or_qwen": {"file": BASE / "or_qwen.json", "model_key": "or_model_id"},
}

MODEL_DISPLAY = {
    "glm-4.7": "GLM-4.7",
    "MiniMax-M2.5": "MiniMax-M2.5",
    "meta-llama/llama-4-scout": "Llama-4-Scout",
    "meta-llama/llama-4-maverick": "Llama-4-Maverick",
    "google/gemini-2.0-flash-lite-001": "Gemini-2.0-Flash-Lite",
    "openai/gpt-4o-mini": "GPT-4o-mini",
    "anthropic/claude-3-haiku": "Claude-3-Haiku",
    "qwen/qwen-2.5-72b-instruct": "Qwen-2.5-72B",
}


def load_all_records() -> dict[str, list[dict]]:
    """Load all trial records grouped by display model name."""
    by_model: dict[str, list[dict]] = defaultdict(list)
    for src_name, src in SOURCES.items():
        fpath = src["file"]
        if not fpath.exists():
            print(f"  [SKIP] {fpath.name} not found")
            continue
        with open(fpath) as f:
            records = json.load(f)
        model_key = src.get("model_key", "model")
        filter_models = src.get("filter_models")
        for rec in records:
            raw_model = rec.get(model_key) or rec.get("model", "unknown")
            if filter_models and raw_model not in filter_models:
                continue
            if rec.get("error"):
                continue
            display = MODEL_DISPLAY.get(raw_model, raw_model)
            by_model[display].append(rec)
    return dict(by_model)

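
# Note on the step-level metrics computed below (illustrative sketch; nothing
# here is executed): with N confidence-rated steps grouped into B equal-width
# bins,
#     ECE   = sum_b (n_b / N) * |acc_b - conf_b|
#     Brier = mean((conf_i - outcome_i)^2)
# where conf_b and acc_b are the mean confidence and mean outcome inside bin b.
# Example: two occupied bins with (n_b/N, acc_b, conf_b) = (0.4, 1.00, 0.925)
# and (0.6, 0.50, 0.52) give ECE = 0.4*0.075 + 0.6*0.02 = 0.042.
# Caveat: in this script every step of a trial reuses the trial's HRR as its
# "outcome", so the step-level numbers are a coarse proxy rather than true
# per-step correctness.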
def compute_ece_from_confidence_logs(trials: list[dict], n_bins: int = 10) -> dict | None:
    """Compute step-level ECE if confidence_log data is available."""
    all_confs = []
    all_outcomes = []
    for rec in trials:
        conf_log = rec.get("confidence_log", [])
        if not conf_log:
            continue
        hrr = rec.get("hrr", 0.0)
        for entry in conf_log:
            if entry is None:
                continue
            conf = entry.get("conf")
            if conf is None:
                continue
            all_confs.append(conf / 100.0)
            all_outcomes.append(hrr)
    if len(all_confs) < 5:
        return None

    # 10-bin ECE
    bins = [[] for _ in range(n_bins)]
    outcome_bins = [[] for _ in range(n_bins)]
    for c, o in zip(all_confs, all_outcomes):
        idx = min(int(c * n_bins), n_bins - 1)
        bins[idx].append(c)
        outcome_bins[idx].append(o)

    ece = 0.0
    n_total = len(all_confs)
    for b_confs, b_outs in zip(bins, outcome_bins):
        if not b_confs:
            continue
        avg_conf = mean(b_confs)
        avg_acc = mean(b_outs)
        ece += (len(b_confs) / n_total) * abs(avg_acc - avg_conf)

    brier = mean((c - o) ** 2 for c, o in zip(all_confs, all_outcomes))

    return {
        "ece": round(ece, 4),
        "brier": round(brier, 4),
        "n_steps": len(all_confs),
        "mean_confidence": round(mean(all_confs), 4),
    }

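
# Note on the trial-level metrics computed below: current trial records carry
# only a single ce value per trial, so "ece_trial_level" is a proxy obtained by
# binning trials on their CE value and comparing each bin's mean CE to the bin
# center, and "brier_proxy" is mean(CE^2). Neither matches the classical
# step-level definitions; treat them as summaries of the same underlying CE
# field that are comparable across models.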
def compute_calibration_from_ce(trials: list[dict]) -> dict:
    """Compute model-level calibration statistics from pre-computed CE values."""
    ce_values = [rec["ce"] for rec in trials if rec.get("ce") is not None]
    hrr_values = [rec.get("hrr", 0.0) for rec in trials]
    sr_values = [rec.get("sr", 0.0) for rec in trials]

    n_total = len(trials)
    n_with_ce = len(ce_values)
    result = {
        "n_total": n_total,
        "n_with_confidence": n_with_ce,
        "coverage": round(n_with_ce / n_total, 4) if n_total > 0 else 0.0,
    }

    if n_with_ce == 0:
        result.update({
            "mean_ce": None,
            "median_ce": None,
            "std_ce": None,
            "ece_trial_level": None,
            "brier_proxy": None,
            "mean_hrr": round(mean(hrr_values), 4) if hrr_values else None,
            "mean_sr": round(mean(sr_values), 4) if sr_values else None,
        })
        return result

    # Trial-level ECE: bin trials by their CE value, compute weighted average
    n_bins = 10
    bins = [[] for _ in range(n_bins)]
    for ce in ce_values:
        idx = min(int(ce * n_bins), n_bins - 1)
        bins[idx].append(ce)

    ece = 0.0
    for i, b in enumerate(bins):
        if not b:
            continue
        bin_center = (i + 0.5) / n_bins
        avg_ce = mean(b)
        ece += (len(b) / n_with_ce) * abs(avg_ce - bin_center)

    # Brier proxy: mean(ce^2)
    brier_proxy = mean(ce ** 2 for ce in ce_values)

    result.update({
        "mean_ce": round(mean(ce_values), 4),
        "median_ce": round(median(ce_values), 4),
        "std_ce": round(stdev(ce_values), 4) if n_with_ce > 1 else 0.0,
        "ece_trial_level": round(ece, 4),
        "brier_proxy": round(brier_proxy, 4),
        "mean_hrr": round(mean(hrr_values), 4),
        "mean_sr": round(mean(sr_values), 4),
    })
    return result


def analyze_calibration(by_model: dict[str, list[dict]]) -> dict:
    """Run calibration analysis on all models."""
    results = {}
    for model, trials in sorted(by_model.items()):
        # Try step-level ECE first (from confidence_log)
        step_level = compute_ece_from_confidence_logs(trials)
        # Always compute trial-level CE stats
        trial_level = compute_calibration_from_ce(trials)
        if step_level:
            trial_level["ece_step_level"] = step_level["ece"]
            trial_level["brier_step_level"] = step_level["brier"]
            trial_level["n_confidence_steps"] = step_level["n_steps"]
            trial_level["mean_confidence"] = step_level["mean_confidence"]
        results[model] = trial_level
    return results


def print_summary(results: dict) -> None:
    """Print a readable summary table."""
    header = f"{'Model':<25} {'n':>3} {'cov':>5} {'mean_CE':>8} {'med_CE':>8} {'ECE':>8} {'Brier':>8} {'HRR':>6} {'SR':>6}"
    print("\n" + "=" * len(header))
    print("HalluMaze Confidence Calibration Analysis")
    print("=" * len(header))
    print(header)
    print("-" * len(header))
    # Sort ascending by mean CE; models without CE data (None) sort last.
    for model, data in sorted(
        results.items(),
        key=lambda x: x[1]["mean_ce"] if x[1]["mean_ce"] is not None else 999,
    ):
        cov_pct = f"{data['coverage'] * 100:.0f}%"
        mean_ce = f"{data['mean_ce']:.4f}" if data["mean_ce"] is not None else "N/A"
        med_ce = f"{data['median_ce']:.4f}" if data["median_ce"] is not None else "N/A"
        ece = f"{data['ece_trial_level']:.4f}" if data["ece_trial_level"] is not None else "N/A"
        brier = f"{data['brier_proxy']:.4f}" if data["brier_proxy"] is not None else "N/A"
        hrr = f"{data['mean_hrr']:.3f}" if data.get("mean_hrr") is not None else "N/A"
        sr = f"{data['mean_sr']:.3f}" if data.get("mean_sr") is not None else "N/A"
        print(f"{model:<25} {data['n_total']:>3} {cov_pct:>5} {mean_ce:>8} {med_ce:>8} {ece:>8} {brier:>8} {hrr:>6} {sr:>6}")
    print("=" * len(header))
    print("\nLegend:")
    print("  cov:     % of trials with confidence data")
    print("  mean_CE: mean per-trial Calibration Error (lower = better calibrated)")
    print("  ECE:     Expected Calibration Error (trial-level binned)")
    print("  Brier:   Brier Score proxy = mean(CE^2)")
    print("  HRR:     Hallucination Recovery Rate")
    print("  SR:      Solve Rate")


def main() -> None:
    print("Loading trial data...")
    by_model = load_all_records()
    total = sum(len(v) for v in by_model.values())
    print(f"Loaded {total} valid trials across {len(by_model)} models")

    results = analyze_calibration(by_model)
    print_summary(results)

    outpath = BASE / "calibration.json"
    with open(outpath, "w") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    print(f"\nSaved to {outpath}")


if __name__ == "__main__":
    main()
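
# Output sketch (keys only; values depend on the run): each model entry in
# experiment_results/calibration.json carries n_total, n_with_confidence,
# coverage, mean_ce, median_ce, std_ce, ece_trial_level, brier_proxy,
# mean_hrr and mean_sr, plus ece_step_level, brier_step_level,
# n_confidence_steps and mean_confidence when confidence_log data is present.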