# hallumaze/scripts/build_final_analysis.py
# v1.4 (commit 6af8c8c, verified) -- uploaded by Be2Jay
#!/usr/bin/env python3
"""
build_final_analysis.py β€” HalluMaze μ΅œμ’… 뢄석 νŒŒμ΄ν”„λΌμΈ
λͺ¨λ“  μ‹€ν—˜ κ²°κ³Όλ₯Ό 병합 β†’ Bootstrap CI β†’ Wilcoxon/Bonferroni β†’ JSON 좜λ ₯
Usage:
python3 scripts/build_final_analysis.py
python3 scripts/build_final_analysis.py --partial # 아직 μ™„λ£Œ μ•ˆ 된 것도 포함
"""
from __future__ import annotations
import json, math, random, argparse
from pathlib import Path
from collections import defaultdict
# Directory that holds every experiment-result JSON file.
BASE = Path(__file__).parent.parent / "experiment_results"

# -- Data source definitions -------------------------------------------------
# Each entry: "file" = JSON path, "model_key" = record field holding the model
# id, "format" = top-level JSON shape, optional "filter_models" = allow-list.
SOURCES = {
    # Local runs (MiniMax + GLM)
    "checkpoint_rerun": {
        "file": BASE / "checkpoint_rerun.json",
        "model_key": "model",
        "format": "list",
    },
    # OpenRouter phase B (Llama/Gemini/GPT/Haiku)
    "or_phaseB_scout_gemini": {
        "file": BASE / "or_phaseB.json",
        "model_key": "or_model_id",
        "format": "list",
        "filter_models": ["meta-llama/llama-4-scout", "google/gemini-2.0-flash-lite-001"],
    },
    # Completed reruns
    "or_haiku": {"file": BASE / "or_haiku.json", "model_key": "or_model_id", "format": "list"},
    "or_gptmini": {"file": BASE / "or_gptmini.json", "model_key": "or_model_id", "format": "list"},
    "or_maverick": {"file": BASE / "or_maverick.json", "model_key": "or_model_id", "format": "list"},
    "or_qwen": {"file": BASE / "or_qwen.json", "model_key": "or_model_id", "format": "list"},
    # Phase C (SOTA frontier models)
    "or_phaseC": {"file": BASE / "or_phaseC.json", "model_key": "or_model_id", "format": "list"},
}

# -- Raw model id -> normalized display name ---------------------------------
MODEL_DISPLAY: dict[str, str] = {
    "glm-4.7": "GLM-4.7",
    "MiniMax-M2.5": "MiniMax-M2.5",
    "meta-llama/llama-4-scout": "Llama-4-Scout",
    "meta-llama/llama-4-maverick": "Llama-4-Maverick",
    "google/gemini-2.0-flash-lite-001": "Gemini-2.0-Flash-Lite",
    "openai/gpt-4o-mini": "GPT-4o-mini",
    "anthropic/claude-3-haiku": "Claude-3-Haiku",
    "qwen/qwen-2.5-72b-instruct": "Qwen-2.5-72B",
    "openai/gpt-4o": "GPT-4o",
    "anthropic/claude-3.7-sonnet": "Claude-3.7-Sonnet",
}

# Deterministic planner baselines (zero variance).
# NOTE(review): the fixed metric values (mei=0.9, sr/hrr/brs=1.0) look like
# precomputed constants from the baseline planners -- confirm against the
# experiment runner before changing.
BASELINES = {
    "random_walk": {"mei": 0.9, "sr": 1.0, "hrr": 1.0, "brs": 1.0},
    "astar": {"mei": 0.9, "sr": 1.0, "hrr": 1.0, "brs": 1.0},
    "bfs": {"mei": 0.9, "sr": 1.0, "hrr": 1.0, "brs": 1.0},
}
def load_all_records(partial: bool = False) -> dict[str, list[dict]]:
    """Load every valid trial record and group it by display model name.

    A record is dropped when it carries an "error" field, when it has neither
    an "sr" nor an "mei" value, or when the source's "filter_models" allow-list
    excludes its model.  Duplicates on (model, size, seed) keep the first
    occurrence in SOURCES order.  Missing or unreadable source files are
    skipped with a console note.

    Parameters:
        partial: accepted for CLI symmetry but currently unused.
            # NOTE(review): confirm whether --partial should gate anything here.

    Returns:
        Mapping of display model name -> list of raw record dicts.
    """
    by_model: dict[str, list[dict]] = defaultdict(list)
    seen: set[tuple] = set()  # (model, size, seed) dedup keys
    for cfg in SOURCES.values():
        path = cfg["file"]
        if not path.exists():
            print(f" [skip] {path.name} not found")
            continue
        try:
            # Explicit UTF-8: default locale encoding can mis-decode
            # non-ASCII result files on some platforms.
            data = json.loads(path.read_text(encoding="utf-8"))
        except Exception as e:
            print(f" [skip] {path.name}: {e}")
            continue
        if not isinstance(data, list):
            # Some dumps wrap the trial list in a container object.
            data = data.get("raw_trials", data.get("results", []))
            if not isinstance(data, list):
                continue
        filter_m = set(cfg.get("filter_models", []))
        mk = cfg.get("model_key", "model")
        for r in data:
            if r.get("error"):
                continue
            if r.get("sr") is None and r.get("mei") is None:
                continue  # no usable metric at all
            raw_model = r.get(mk, r.get("model", "?"))
            if filter_m and raw_model not in filter_m:
                continue
            display = MODEL_DISPLAY.get(raw_model, raw_model)
            key = (display, r.get("size", 5), r.get("seed", 0))
            if key in seen:
                continue
            seen.add(key)
            by_model[display].append(r)
    return dict(by_model)
def bootstrap_ci(values: list[float], n_boot: int = 2000, ci: float = 0.95) -> tuple[float, float, float]:
    """Percentile-bootstrap confidence interval for the mean of *values*.

    Resamples with replacement ``n_boot`` times using a fixed seed, so the
    result is fully reproducible.  An empty input yields (0.0, 0.0, 0.0).

    Returns:
        (sample_mean, ci_lower, ci_upper)
    """
    if not values:
        return 0.0, 0.0, 0.0
    rng = random.Random(42)  # fixed seed -> deterministic resamples
    count = len(values)
    resampled_means = sorted(
        sum(values[rng.randint(0, count - 1)] for _ in range(count)) / count
        for _ in range(n_boot)
    )
    lower = resampled_means[int(n_boot * (1 - ci) / 2)]
    upper = resampled_means[int(n_boot * (1 - (1 - ci) / 2)) - 1]
    return sum(values) / count, lower, upper
def _norm_cdf(x: float) -> float:
"""Standard normal CDF via error function"""
return 0.5 * (1 + math.erf(x / math.sqrt(2)))
def one_sample_wilcoxon(values: list[float], mu0: float = 0.9) -> float:
    """One-sample Wilcoxon signed-rank test against the constant *mu0*.

    Suited to comparing LLM MEI scores against a deterministic baseline
    with zero variance.  Uses the normal approximation (reasonable for
    n >= 10) with mid-ranks for tied absolute differences; observations
    equal to mu0 are discarded.

    Returns:
        Two-sided p-value (1.0 when no non-zero differences remain).
    """
    deltas = [v - mu0 for v in values if v != mu0]
    m = len(deltas)
    if m == 0:
        return 1.0
    # Indices sorted by |delta|; tied groups share their average rank.
    order = sorted(range(m), key=lambda idx: abs(deltas[idx]))
    rank_of = [0.0] * m
    start = 0
    while start < m:
        tie_val = abs(deltas[order[start]])
        stop = start
        while stop < m and abs(deltas[order[stop]]) == tie_val:
            stop += 1
        shared_rank = (start + stop + 1) / 2  # mean of 1-based ranks start+1..stop
        for pos in range(start, stop):
            rank_of[order[pos]] = shared_rank
        start = stop
    w_plus = sum(r for r, d in zip(rank_of, deltas) if d > 0)
    mean_w = m * (m + 1) / 4
    sd_w = math.sqrt(m * (m + 1) * (2 * m + 1) / 24)
    if sd_w == 0:
        return 1.0
    z = (w_plus - mean_w) / sd_w
    return 2 * (1 - _norm_cdf(abs(z)))
def glass_delta(constant_mu: float, values: list[float]) -> float:
    """Glass's delta: |mu_constant - mean(values)| / sd(values).

    The right effect-size measure when one group is a zero-variance
    constant (a deterministic baseline); pooled Cohen's d would inflate
    the effect by sqrt(2) here.  Returns 0.0 for fewer than two values
    or a (near-)zero sample standard deviation.
    """
    n = len(values)
    if n < 2:
        return 0.0
    center = sum(values) / n
    spread = math.sqrt(sum((v - center) ** 2 for v in values) / (n - 1))
    if spread <= 1e-9:  # degenerate sample: effect size undefined
        return 0.0
    return abs(constant_mu - center) / spread
def build_analysis(partial: bool = False):
    """Build the final HalluMaze analysis JSON.

    Loads every valid trial, computes per-model bootstrap CIs, appends the
    deterministic baselines, runs one-sample Wilcoxon tests (Bonferroni
    corrected) against the random-walk baseline, and writes the payload to
    experiment_results/analysis_final2.json.

    Parameters:
        partial: forwarded to load_all_records() (include incomplete runs).

    Returns:
        The analysis payload dict, or None when no valid records exist.
    """
    print("=== HalluMaze Final Analysis Builder ===")
    records = load_all_records(partial)
    if not records:
        print("ERROR: No valid records found.")
        return None
    # -- Summary stats per model (mean + 95% bootstrap CI) ---------------
    summary = {}
    for model, recs in sorted(records.items()):
        # "hallumaze_score" is a legacy alias for "mei" in older dumps.
        mei_vals = [r.get("mei", r.get("hallumaze_score", 0)) for r in recs]
        sr_vals = [r.get("sr", 0) for r in recs]
        hrr_vals = [r.get("hrr", 0) for r in recs]
        brs_vals = [r.get("brs", 0) for r in recs]
        hc_vals = [r.get("hallucination_count", 0) for r in recs]
        mei_m, mei_lo, mei_hi = bootstrap_ci(mei_vals)
        sr_m, sr_lo, sr_hi = bootstrap_ci(sr_vals)
        hrr_m, hrr_lo, hrr_hi = bootstrap_ci(hrr_vals)
        brs_m, brs_lo, brs_hi = bootstrap_ci(brs_vals)
        summary[model] = {
            "n": len(recs),
            "mei": {"mean": round(mei_m, 4), "ci_lo": round(mei_lo, 4), "ci_hi": round(mei_hi, 4)},
            "sr": {"mean": round(sr_m, 4), "ci_lo": round(sr_lo, 4), "ci_hi": round(sr_hi, 4)},
            "hrr": {"mean": round(hrr_m, 4), "ci_lo": round(hrr_lo, 4), "ci_hi": round(hrr_hi, 4)},
            "brs": {"mean": round(brs_m, 4), "ci_lo": round(brs_lo, 4), "ci_hi": round(brs_hi, 4)},
            "hc_mean": round(sum(hc_vals) / len(hc_vals), 2) if hc_vals else 0,
        }
        print(f" {model:30s} n={len(recs):3d} MEI={mei_m:.3f} [{mei_lo:.3f},{mei_hi:.3f}] SR={sr_m:.3f} HRR={hrr_m:.3f}")
    # -- Deterministic baselines: zero variance, so CI collapses to mean --
    # Derived from BASELINES instead of repeating literals per baseline.
    for name, metrics in BASELINES.items():
        summary[name] = {
            "n": 60,
            "is_baseline": True,
            **{
                metric: {"mean": metrics[metric], "ci_lo": metrics[metric], "ci_hi": metrics[metric]}
                for metric in ("mei", "sr", "hrr", "brs")
            },
            "hc_mean": 0,
        }
    # -- Pairwise tests vs the random-walk baseline -----------------------
    k = len(records)  # number of comparisons for the Bonferroni correction
    alpha_bonf = 0.05 / k if k else 0.05
    mu0 = BASELINES["random_walk"]["mei"]
    pairwise = {}
    for model, recs in records.items():
        mei_vals = [r.get("mei", r.get("hallumaze_score", 0)) for r in recs]
        # One-sample Wilcoxon signed-rank test vs constant baseline mu0.
        p_raw = one_sample_wilcoxon(mei_vals, mu0=mu0)
        p_bonf = min(p_raw * k, 1.0)
        # Glass's delta: appropriate because the baseline has zero variance.
        d = glass_delta(mu0, mei_vals)
        pairwise[model] = {
            "n": len(recs),
            "p_raw": round(p_raw, 6),
            "p_bonferroni": round(p_bonf, 6),
            # Key kept as "cohens_d" for backward compatibility with existing
            # consumers; the value is Glass's delta.
            "cohens_d": round(d, 3),
            "significant_bonf": p_bonf < 0.05,
        }
    # -- Rank LLMs (baselines excluded) by mean MEI, descending ----------
    llm_models = {
        m: v for m, v in summary.items()
        if not v.get("is_baseline") and m not in ("astar", "bfs", "random_walk")
    }
    sorted_models = sorted(llm_models, key=lambda m: summary[m]["mei"]["mean"], reverse=True)
    output = {
        "metadata": {
            "k_bonferroni": k,
            "alpha_bonferroni": round(alpha_bonf, 4),
            "n_boot": 2000,
            "ci_level": 0.95,
            "total_valid_trials": sum(v["n"] for v in llm_models.values()),
            "models_by_mei": sorted_models,
        },
        "summary": summary,
        "pairwise_tests": pairwise,
    }
    out_path = BASE / "analysis_final2.json"
    # Explicit UTF-8: ensure_ascii=False can emit non-ASCII characters, which
    # would crash write_text under a non-UTF-8 locale default.
    out_path.write_text(json.dumps(output, indent=2, ensure_ascii=False), encoding="utf-8")
    print(f"\nβœ“ Saved: {out_path}")
    print(f" Models: {', '.join(sorted_models)}")
    print(f" Total valid trials: {output['metadata']['total_valid_trials']}")
    return output
if __name__ == "__main__":
    # CLI entry point: only flag is --partial (include incomplete runs).
    parser = argparse.ArgumentParser()
    parser.add_argument("--partial", action="store_true", help="Include incomplete runs")
    cli_args = parser.parse_args()
    build_analysis(partial=cli_args.partial)