Spaces:

Be2Jay
/

hallumaze

Running

File size: 9,111 Bytes

d77ae53

#!/usr/bin/env python3
"""
calibration_analysis.py — HalluMaze Confidence Calibration Analysis

Computes per-model calibration metrics from existing trial data:
  - ECE (Expected Calibration Error): binned CE across trials
  - Brier Score proxy: mean((ce_i)^2) per model
  - Mean CE, median CE, and coverage statistics

Data source: ce field (per-trial Calibration Error) already computed by hallumaze.py.
CE = mean |confidence/100 - correctness| per step within each trial.

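When confidence_log data is available (future runs), additionally computes
step-level ECE with 10 bins, using each trial's hrr value as the outcome for all
of its logged steps. Trial-level CE statistics are always computed and are the
only calibration metrics for current data.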

Usage:
    python3 scripts/calibration_analysis.py
    # Output: experiment_results/calibration.json
"""
from __future__ import annotations
import json, math
from pathlib import Path
from collections import defaultdict
from statistics import mean, median, stdev

# Directory holding the trial JSON inputs and the calibration.json output.
BASE = Path(__file__).parent.parent / "experiment_results"


def _source(filename: str, model_key: str, filter_models=None) -> dict:
    """Build one SOURCES entry for *filename* located inside BASE.

    The ``filter_models`` key is only present when a filter list is given.
    """
    entry = {"file": BASE / filename, "model_key": model_key}
    if filter_models is not None:
        entry["filter_models"] = filter_models
    return entry


# ── Data Sources (consistent with build_final_analysis.py) ──────
# Each entry names the JSON file to read and the record field that holds the
# raw model id; an optional "filter_models" list restricts which models are kept.
SOURCES = {
    "checkpoint_rerun": _source("checkpoint_rerun.json", "model"),
    "or_phaseB_scout_gemini": _source(
        "or_phaseB.json",
        "or_model_id",
        filter_models=[
            "meta-llama/llama-4-scout",
            "google/gemini-2.0-flash-lite-001",
        ],
    ),
    "or_haiku": _source("or_haiku.json", "or_model_id"),
    "or_gptmini": _source("or_gptmini.json", "or_model_id"),
    "or_maverick": _source("or_maverick.json", "or_model_id"),
    "or_qwen": _source("or_qwen.json", "or_model_id"),
}

# Raw model id (as stored in the trial records) -> name shown in reports.
MODEL_DISPLAY = {
    "glm-4.7": "GLM-4.7",
    "MiniMax-M2.5": "MiniMax-M2.5",
    "meta-llama/llama-4-scout": "Llama-4-Scout",
    "meta-llama/llama-4-maverick": "Llama-4-Maverick",
    "google/gemini-2.0-flash-lite-001": "Gemini-2.0-Flash-Lite",
    "openai/gpt-4o-mini": "GPT-4o-mini",
    "anthropic/claude-3-haiku": "Claude-3-Haiku",
    "qwen/qwen-2.5-72b-instruct": "Qwen-2.5-72B",
}


def load_all_records() -> dict[str, list[dict]]:
    """Load all trial records grouped by display model name.

    Every entry of ``SOURCES`` is read in definition order. A source whose
    file does not exist is reported on stdout and skipped. Within a file, a
    record is dropped when its ``error`` field is truthy or, if the source
    declares ``filter_models``, when its raw model id is not in that list.

    Returns:
        Mapping of display model name -> list of trial record dicts. The
        display name comes from ``MODEL_DISPLAY``; unknown raw ids are used
        as-is.
    """
    by_model: dict[str, list[dict]] = defaultdict(list)
    for src in SOURCES.values():
        fpath = src["file"]
        if not fpath.exists():
            print(f"  [SKIP] {fpath.name} not found")
            continue
        # JSON text is UTF-8; never fall back to the platform's locale encoding.
        with open(fpath, encoding="utf-8") as f:
            records = json.load(f)
        model_key = src.get("model_key", "model")
        filter_models = src.get("filter_models")
        for rec in records:
            # Prefer the source-specific id field, then "model", then a placeholder.
            raw_model = rec.get(model_key) or rec.get("model", "unknown")
            if filter_models and raw_model not in filter_models:
                continue
            if rec.get("error"):
                continue
            display = MODEL_DISPLAY.get(raw_model, raw_model)
            by_model[display].append(rec)
    # Return a plain dict so later lookups of unknown models raise KeyError
    # instead of silently creating empty entries.
    return dict(by_model)


def compute_ece_from_confidence_logs(trials: list[dict], n_bins: int = 10) -> dict | None:
    """Compute step-level ECE if confidence_log data is available."""
    all_confs = []
    all_outcomes = []
    for rec in trials:
        conf_log = rec.get("confidence_log", [])
        if not conf_log:
            continue
        hrr = rec.get("hrr", 0.0)
        for entry in conf_log:
            if entry is None:
                continue
            conf = entry.get("conf")
            if conf is None:
                continue
            all_confs.append(conf / 100.0)
            all_outcomes.append(hrr)

    if len(all_confs) < 5:
        return None

    # 10-bin ECE
    bins = [[] for _ in range(n_bins)]
    outcome_bins = [[] for _ in range(n_bins)]
    for c, o in zip(all_confs, all_outcomes):
        idx = min(int(c * n_bins), n_bins - 1)
        bins[idx].append(c)
        outcome_bins[idx].append(o)

    ece = 0.0
    n_total = len(all_confs)
    for b_confs, b_outs in zip(bins, outcome_bins):
        if not b_confs:
            continue
        avg_conf = mean(b_confs)
        avg_acc = mean(b_outs)
        ece += (len(b_confs) / n_total) * abs(avg_acc - avg_conf)

    brier = mean((c - o) ** 2 for c, o in zip(all_confs, all_outcomes))
    return {
        "ece": round(ece, 4),
        "brier": round(brier, 4),
        "n_steps": len(all_confs),
        "mean_confidence": round(mean(all_confs), 4),
    }


def compute_calibration_from_ce(trials: list[dict]) -> dict:
    """Summarise one model's trials using their pre-computed ``ce`` values.

    Trials whose ``ce`` is missing or None count towards ``n_total`` but not
    towards the CE statistics. ``hrr`` and ``sr`` are averaged over all
    trials, with a missing key counted as 0.0.

    ``ece_trial_level`` is obtained by sorting the CE values into 10
    equal-width bins and taking the count-weighted mean of
    ``|mean CE in bin - bin center|``; ``brier_proxy`` is ``mean(ce ** 2)``.

    Returns:
        Dict with ``n_total``, ``n_with_confidence``, ``coverage``,
        ``mean_ce``, ``median_ce``, ``std_ce``, ``ece_trial_level``,
        ``brier_proxy``, ``mean_hrr`` and ``mean_sr``. The CE-derived fields
        are None when no trial carries a CE value.
    """
    ce_values = []
    hrr_values = []
    sr_values = []
    for rec in trials:
        if rec.get("ce") is not None:
            ce_values.append(rec["ce"])
        hrr_values.append(rec.get("hrr", 0.0))
        sr_values.append(rec.get("sr", 0.0))

    n_total = len(trials)
    n_with_ce = len(ce_values)

    summary = {
        "n_total": n_total,
        "n_with_confidence": n_with_ce,
        "coverage": round(n_with_ce / n_total, 4) if n_total > 0 else 0.0,
    }

    if not ce_values:
        # No confidence data at all: only the outcome averages can be reported.
        for key in ("mean_ce", "median_ce", "std_ce", "ece_trial_level", "brier_proxy"):
            summary[key] = None
        summary["mean_hrr"] = round(mean(hrr_values), 4) if hrr_values else None
        summary["mean_sr"] = round(mean(sr_values), 4) if sr_values else None
        return summary

    # Sort CE values into equal-width buckets; values at or above 1.0 share
    # the last bucket.
    n_bins = 10
    last_slot = n_bins - 1
    buckets = [[] for _ in range(n_bins)]
    for value in ce_values:
        slot = int(value * n_bins)
        buckets[slot if slot < last_slot else last_slot].append(value)

    ece = 0.0
    for slot, members in enumerate(buckets):
        if members:
            center = (slot + 0.5) / n_bins
            ece += (len(members) / n_with_ce) * abs(mean(members) - center)

    squared = [value ** 2 for value in ce_values]

    summary["mean_ce"] = round(mean(ce_values), 4)
    summary["median_ce"] = round(median(ce_values), 4)
    # A sample standard deviation needs at least two points.
    summary["std_ce"] = round(stdev(ce_values), 4) if n_with_ce > 1 else 0.0
    summary["ece_trial_level"] = round(ece, 4)
    summary["brier_proxy"] = round(mean(squared), 4)
    summary["mean_hrr"] = round(mean(hrr_values), 4)
    summary["mean_sr"] = round(mean(sr_values), 4)
    return summary


def analyze_calibration(by_model: dict[str, list[dict]]) -> dict:
    """Build the per-model calibration report.

    Models are processed in sorted name order. Each model always gets the
    trial-level statistics from ``compute_calibration_from_ce``; when
    ``compute_ece_from_confidence_logs`` yields a result, its step-level
    figures are merged in under ``ece_step_level``, ``brier_step_level``,
    ``n_confidence_steps`` and ``mean_confidence``.

    Returns:
        Mapping of model name -> metrics dict.
    """
    report = {}
    for model in sorted(by_model):
        trials = by_model[model]
        # Step-level metrics exist only for runs that logged confidences.
        step_stats = compute_ece_from_confidence_logs(trials)
        # Trial-level CE statistics are computed unconditionally.
        summary = compute_calibration_from_ce(trials)

        if step_stats:
            summary.update(
                ece_step_level=step_stats["ece"],
                brier_step_level=step_stats["brier"],
                n_confidence_steps=step_stats["n_steps"],
                mean_confidence=step_stats["mean_confidence"],
            )

        report[model] = summary
    return report


def print_summary(results: dict) -> None:
    """Print a readable summary table, lowest mean CE first.

    Models without a ``mean_ce`` value are listed last. Metrics that are
    None or missing are shown as ``N/A``.

    Args:
        results: Mapping of model name -> metrics dict. Each dict must
            provide ``n_total`` and ``coverage``; the remaining columns are
            read from ``mean_ce``, ``median_ce``, ``ece_trial_level``,
            ``brier_proxy``, ``mean_hrr`` and ``mean_sr``.
    """
    def _fmt(value, spec: str) -> str:
        # Render a metric with the given format spec, or N/A when unavailable.
        return "N/A" if value is None else format(value, spec)

    def _by_mean_ce(item):
        # Explicit None check: a perfect mean CE of 0.0 is falsy and must
        # still sort first instead of being treated like "no data".
        mean_ce = item[1].get("mean_ce")
        return math.inf if mean_ce is None else mean_ce

    header = f"{'Model':<25} {'n':>3} {'cov':>5} {'mean_CE':>8} {'med_CE':>8} {'ECE':>8} {'Brier':>8} {'HRR':>6} {'SR':>6}"
    rule = "=" * len(header)
    print("\n" + rule)
    print("HalluMaze Confidence Calibration Analysis")
    print(rule)
    print(header)
    print("-" * len(header))
    for model, data in sorted(results.items(), key=_by_mean_ce):
        cov_pct = f"{data['coverage']*100:.0f}%"
        mean_ce = _fmt(data.get("mean_ce"), ".4f")
        med_ce = _fmt(data.get("median_ce"), ".4f")
        ece = _fmt(data.get("ece_trial_level"), ".4f")
        brier = _fmt(data.get("brier_proxy"), ".4f")
        hrr = _fmt(data.get("mean_hrr"), ".3f")
        sr = _fmt(data.get("mean_sr"), ".3f")
        print(f"{model:<25} {data['n_total']:>3} {cov_pct:>5} {mean_ce:>8} {med_ce:>8} {ece:>8} {brier:>8} {hrr:>6} {sr:>6}")
    print(rule)
    print("\nLegend:")
    print("  cov: % of trials with confidence data")
    print("  mean_CE: mean per-trial Calibration Error (lower = better calibrated)")
    print("  ECE: Expected Calibration Error (trial-level binned)")
    print("  Brier: Brier Score proxy = mean(CE^2)")
    print("  HRR: Hallucination Recovery Rate")
    print("  SR: Solve Rate")


def main() -> None:
    """Run the full analysis: load trials, compute metrics, print and save.

    The per-model results are printed as a table and written as JSON to
    ``BASE / "calibration.json"``.
    """
    print("Loading trial data...")
    by_model = load_all_records()
    total = sum(len(v) for v in by_model.values())
    print(f"Loaded {total} valid trials across {len(by_model)} models")

    results = analyze_calibration(by_model)
    print_summary(results)

    outpath = BASE / "calibration.json"
    # ensure_ascii=False writes non-ASCII characters verbatim, so the file
    # encoding must be pinned to UTF-8 rather than left to the locale default.
    with open(outpath, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    print(f"\nSaved to {outpath}")


if __name__ == "__main__":
    main()