#!/usr/bin/env python3
"""
calibration_analysis.py - HalluMaze Confidence Calibration Analysis

Computes per-model calibration metrics from existing trial data:
  - ECE (Expected Calibration Error): binned CE across trials
  - Brier Score proxy: mean((ce_i)^2) per model
  - Mean CE, median CE, and coverage statistics

Data source: ce field (per-trial Calibration Error) already computed by hallumaze.py.
CE = mean |confidence/100 - correctness| per step within each trial.

When confidence_log data is available (future runs), computes step-level ECE with
10 bins. For current data, uses trial-level CE values.

Usage:
    python3 scripts/calibration_analysis.py
    # Output: experiment_results/calibration.json
"""

from __future__ import annotations

import json
from pathlib import Path
from collections import defaultdict
from statistics import mean, median, stdev

BASE = Path(__file__).parent.parent / "experiment_results"

# ── Data Sources (consistent with build_final_analysis.py) ──────
SOURCES = {
    "checkpoint_rerun": {
        "file": BASE / "checkpoint_rerun.json",
        "model_key": "model",
    },
    "or_phaseB_scout_gemini": {
        "file": BASE / "or_phaseB.json",
        "model_key": "or_model_id",
        "filter_models": ["meta-llama/llama-4-scout", "google/gemini-2.0-flash-lite-001"],
    },
    "or_haiku": {"file": BASE / "or_haiku.json", "model_key": "or_model_id"},
    "or_gptmini": {"file": BASE / "or_gptmini.json", "model_key": "or_model_id"},
    "or_maverick": {"file": BASE / "or_maverick.json", "model_key": "or_model_id"},
    "or_qwen": {"file": BASE / "or_qwen.json", "model_key": "or_model_id"},
}

MODEL_DISPLAY = {
    "glm-4.7": "GLM-4.7",
    "MiniMax-M2.5": "MiniMax-M2.5",
    "meta-llama/llama-4-scout": "Llama-4-Scout",
    "meta-llama/llama-4-maverick": "Llama-4-Maverick",
    "google/gemini-2.0-flash-lite-001": "Gemini-2.0-Flash-Lite",
    "openai/gpt-4o-mini": "GPT-4o-mini",
    "anthropic/claude-3-haiku": "Claude-3-Haiku",
    "qwen/qwen-2.5-72b-instruct": "Qwen-2.5-72B",
}
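

# Shape of a per-trial record as consumed below -- a sketch inferred from the
# field accesses in this script, not the full schema written by hallumaze.py:
#   "model" / "or_model_id": raw model identifier
#   "error":                 trials with a non-empty error are skipped
#   "ce":                    per-trial Calibration Error (may be absent/None)
#   "hrr", "sr":             Hallucination Recovery Rate, Solve Rate
#   "confidence_log":        optional list of {"conf": <0-100>} step entries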
def load_all_records() -> dict[str, list[dict]]:
    """Load all trial records grouped by display model name."""
    by_model: dict[str, list[dict]] = defaultdict(list)
    for src_name, src in SOURCES.items():
        fpath = src["file"]
        if not fpath.exists():
            print(f" [SKIP] {fpath.name} not found")
            continue
        with open(fpath) as f:
            records = json.load(f)
        model_key = src.get("model_key", "model")
        filter_models = src.get("filter_models")
        for rec in records:
            raw_model = rec.get(model_key) or rec.get("model", "unknown")
            if filter_models and raw_model not in filter_models:
                continue
            if rec.get("error"):
                continue
            display = MODEL_DISPLAY.get(raw_model, raw_model)
            by_model[display].append(rec)
    return dict(by_model)


def compute_ece_from_confidence_logs(trials: list[dict], n_bins: int = 10) -> dict | None:
    """Compute step-level ECE if confidence_log data is available."""
    all_confs = []
    all_outcomes = []
    for rec in trials:
        conf_log = rec.get("confidence_log", [])
        if not conf_log:
            continue
        hrr = rec.get("hrr", 0.0)
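        # No per-step correctness is recorded here, so the trial-level HRR is reused
        # as the outcome for every logged step of this trial (a coarse proxy).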
        for entry in conf_log:
            if entry is None:
                continue
            conf = entry.get("conf")
            if conf is None:
                continue
            all_confs.append(conf / 100.0)
            all_outcomes.append(hrr)

    if len(all_confs) < 5:
        return None

    # 10-bin ECE
    bins = [[] for _ in range(n_bins)]
    outcome_bins = [[] for _ in range(n_bins)]
    for c, o in zip(all_confs, all_outcomes):
        # Clamp so a confidence of exactly 1.0 lands in the top bin
        idx = min(int(c * n_bins), n_bins - 1)
        bins[idx].append(c)
        outcome_bins[idx].append(o)

    ece = 0.0
    n_total = len(all_confs)
    for b_confs, b_outs in zip(bins, outcome_bins):
        if not b_confs:
            continue
        avg_conf = mean(b_confs)
        avg_acc = mean(b_outs)
        ece += (len(b_confs) / n_total) * abs(avg_acc - avg_conf)

    brier = mean((c - o) ** 2 for c, o in zip(all_confs, all_outcomes))
    return {
        "ece": round(ece, 4),
        "brier": round(brier, 4),
        "n_steps": len(all_confs),
        "mean_confidence": round(mean(all_confs), 4),
    }


def compute_calibration_from_ce(trials: list[dict]) -> dict:
    """Compute model-level calibration statistics from pre-computed CE values."""
    ce_values = [rec["ce"] for rec in trials if rec.get("ce") is not None]
    hrr_values = [rec.get("hrr", 0.0) for rec in trials]
    sr_values = [rec.get("sr", 0.0) for rec in trials]
    n_total = len(trials)
    n_with_ce = len(ce_values)

    result = {
        "n_total": n_total,
        "n_with_confidence": n_with_ce,
        "coverage": round(n_with_ce / n_total, 4) if n_total > 0 else 0.0,
    }
    if n_with_ce == 0:
        result.update({
            "mean_ce": None,
            "median_ce": None,
            "std_ce": None,
            "ece_trial_level": None,
            "brier_proxy": None,
            "mean_hrr": round(mean(hrr_values), 4) if hrr_values else None,
            "mean_sr": round(mean(sr_values), 4) if sr_values else None,
        })
        return result

    # Trial-level ECE: bin trials by their CE value, compute weighted average
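    # Illustrative example with made-up CE values [0.05, 0.12, 0.48] and 10 bins:
    # they fall in bins 0, 1, and 4 (centers 0.05, 0.15, 0.45), so
    # ECE = (1/3)|0.05-0.05| + (1/3)|0.12-0.15| + (1/3)|0.48-0.45| = 0.02.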
    n_bins = 10
    bins = [[] for _ in range(n_bins)]
    for ce in ce_values:
        idx = min(int(ce * n_bins), n_bins - 1)
        bins[idx].append(ce)

    ece = 0.0
    for i, b in enumerate(bins):
        if not b:
            continue
        bin_center = (i + 0.5) / n_bins
        avg_ce = mean(b)
        ece += (len(b) / n_with_ce) * abs(avg_ce - bin_center)

    # Brier proxy: mean(ce^2)
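    # CE is already the mean |confidence - correctness| gap for a trial, so squaring
    # it gives a Brier-style penalty that weights badly miscalibrated trials more.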
    brier_proxy = mean(ce ** 2 for ce in ce_values)

    result.update({
        "mean_ce": round(mean(ce_values), 4),
        "median_ce": round(median(ce_values), 4),
        "std_ce": round(stdev(ce_values), 4) if n_with_ce > 1 else 0.0,
        "ece_trial_level": round(ece, 4),
        "brier_proxy": round(brier_proxy, 4),
        "mean_hrr": round(mean(hrr_values), 4),
        "mean_sr": round(mean(sr_values), 4),
    })
    return result


def analyze_calibration(by_model: dict[str, list[dict]]) -> dict:
    """Run calibration analysis on all models."""
    results = {}
    for model, trials in sorted(by_model.items()):
        # Try step-level ECE first (from confidence_log)
        step_level = compute_ece_from_confidence_logs(trials)
        # Always compute trial-level CE stats
        trial_level = compute_calibration_from_ce(trials)
        if step_level:
            trial_level["ece_step_level"] = step_level["ece"]
            trial_level["brier_step_level"] = step_level["brier"]
            trial_level["n_confidence_steps"] = step_level["n_steps"]
            trial_level["mean_confidence"] = step_level["mean_confidence"]
        results[model] = trial_level
    return results
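

# Each per-model dict returned by analyze_calibration() carries the keys built in
# compute_calibration_from_ce (n_total, n_with_confidence, coverage, mean_ce,
# median_ce, std_ce, ece_trial_level, brier_proxy, mean_hrr, mean_sr), plus the
# *_step_level fields when a confidence_log was present; print_summary below
# renders a subset of these as a table.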


def print_summary(results: dict) -> None:
    """Print a readable summary table."""
    header = f"{'Model':<25} {'n':>3} {'cov':>5} {'mean_CE':>8} {'med_CE':>8} {'ECE':>8} {'Brier':>8} {'HRR':>6} {'SR':>6}"
    print("\n" + "=" * len(header))
    print("HalluMaze Confidence Calibration Analysis")
    print("=" * len(header))
    print(header)
    print("-" * len(header))
    # Sort by mean CE; an explicit None check pushes models without confidence data
    # to the end while letting a legitimate mean_ce of 0.0 sort first.
    for model, data in sorted(
        results.items(),
        key=lambda x: x[1]["mean_ce"] if x[1].get("mean_ce") is not None else float("inf"),
    ):
        cov_pct = f"{data['coverage']*100:.0f}%"
        mean_ce = f"{data['mean_ce']:.4f}" if data['mean_ce'] is not None else "N/A"
        med_ce = f"{data['median_ce']:.4f}" if data['median_ce'] is not None else "N/A"
        ece = f"{data['ece_trial_level']:.4f}" if data['ece_trial_level'] is not None else "N/A"
        brier = f"{data['brier_proxy']:.4f}" if data['brier_proxy'] is not None else "N/A"
        hrr = f"{data['mean_hrr']:.3f}" if data.get('mean_hrr') is not None else "N/A"
        sr = f"{data['mean_sr']:.3f}" if data.get('mean_sr') is not None else "N/A"
        print(f"{model:<25} {data['n_total']:>3} {cov_pct:>5} {mean_ce:>8} {med_ce:>8} {ece:>8} {brier:>8} {hrr:>6} {sr:>6}")
    print("=" * len(header))
    print("\nLegend:")
    print(" cov: % of trials with confidence data")
    print(" mean_CE: mean per-trial Calibration Error (lower = better calibrated)")
    print(" ECE: Expected Calibration Error (trial-level binned)")
    print(" Brier: Brier Score proxy = mean(CE^2)")
    print(" HRR: Hallucination Recovery Rate")
    print(" SR: Solve Rate")


def main() -> None:
    print("Loading trial data...")
    by_model = load_all_records()
    total = sum(len(v) for v in by_model.values())
    print(f"Loaded {total} valid trials across {len(by_model)} models")

    results = analyze_calibration(by_model)
    print_summary(results)

    outpath = BASE / "calibration.json"
    with open(outpath, "w") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    print(f"\nSaved to {outpath}")


if __name__ == "__main__":
    main()