hallumaze / scripts /failure_mode_analysis.py
Be2Jay's picture
Upload folder using huggingface_hub
d77ae53 verified
#!/usr/bin/env python3
"""
failure_mode_analysis.py — HalluMaze failure mode taxonomy

Automatically classifies each trial into one of four types:
  TYPE_S: Success — sr == 1.0
  TYPE_A: Mirage_Undetected — hallucination_count > 0 AND backtrack_count == 0
  TYPE_B: Mirage_Detected_Failed — hallucination_count > 0 AND backtrack_count > 0 AND hrr < 0.5
  TYPE_C: Loop_Trapped — loop_count >= 2 AND sr == 0
Trials that match none of the above are counted as TYPE_OTHER.

Usage:
  python scripts/failure_mode_analysis.py
  # Output: experiment_results/failure_modes.json
"""
from __future__ import annotations
import json
from pathlib import Path
from collections import defaultdict
# Directory holding both the input result files and the generated report:
# "experiment_results", located two levels above this script file.
BASE = Path(__file__).parent.parent.joinpath("experiment_results")

# ── Data Sources (same as build_final_analysis.py) ──────────────
# Each entry describes one JSON result file:
#   "file"          - path of the JSON file to read
#   "model_key"     - record field that holds the model identifier
#   "filter_models" - optional allow-list; records for other models are skipped
SOURCES = {
    "checkpoint_rerun": {
        "file": BASE / "checkpoint_rerun.json",
        "model_key": "model",
    },
    "or_phaseB_scout_gemini": {
        "file": BASE / "or_phaseB.json",
        "model_key": "or_model_id",
        "filter_models": [
            "meta-llama/llama-4-scout",
            "google/gemini-2.0-flash-lite-001",
        ],
    },
    "or_haiku": {
        "file": BASE / "or_haiku.json",
        "model_key": "or_model_id",
    },
    "or_gptmini": {
        "file": BASE / "or_gptmini.json",
        "model_key": "or_model_id",
    },
    "or_maverick": {
        "file": BASE / "or_maverick.json",
        "model_key": "or_model_id",
    },
    "or_qwen": {
        "file": BASE / "or_qwen.json",
        "model_key": "or_model_id",
    },
}
# Maps the raw model identifier found in a trial record to the name used in
# reports. Identifiers missing from this table are displayed unchanged.
MODEL_DISPLAY = {
    # Bare model names
    "glm-4.7": "GLM-4.7",
    "MiniMax-M2.5": "MiniMax-M2.5",
    # Provider-prefixed model ids
    "meta-llama/llama-4-scout": "Llama-4-Scout",
    "meta-llama/llama-4-maverick": "Llama-4-Maverick",
    "google/gemini-2.0-flash-lite-001": "Gemini-2.0-Flash-Lite",
    "openai/gpt-4o-mini": "GPT-4o-mini",
    "anthropic/claude-3-haiku": "Claude-3-Haiku",
    "qwen/qwen-2.5-72b-instruct": "Qwen-2.5-72B",
}
def load_all_records() -> dict[str, list[dict]]:
    """Load all trial records grouped by display model name.

    Every entry of ``SOURCES`` is read in turn. A source whose file does not
    exist is reported on stdout and skipped. Within a file, a record is
    dropped when the source defines ``filter_models`` and the record's model
    is not listed, or when the record has a truthy ``"error"`` field.

    Returns:
        A plain dict mapping the display name (looked up in ``MODEL_DISPLAY``,
        falling back to the raw identifier) to the list of kept records.

    Raises:
        json.JSONDecodeError: if an existing source file is not valid JSON.
    """
    by_model: dict[str, list[dict]] = defaultdict(list)
    for src in SOURCES.values():
        fpath = src["file"]
        if not fpath.exists():
            print(f" [SKIP] {fpath.name} not found")
            continue
        # Read as UTF-8 explicitly instead of relying on the platform's
        # default text encoding, which differs between systems.
        with open(fpath, encoding="utf-8") as f:
            records = json.load(f)
        model_key = src.get("model_key", "model")
        filter_models = src.get("filter_models")
        for rec in records:
            # Fall back to "unknown" even when the "model" field exists but is
            # null/empty, so the grouping key is always a string.
            raw_model = rec.get(model_key) or rec.get("model") or "unknown"
            if filter_models and raw_model not in filter_models:
                continue
            # Skip error trials
            if rec.get("error"):
                continue
            display = MODEL_DISPLAY.get(raw_model, raw_model)
            by_model[display].append(rec)
    return dict(by_model)
def classify_trial(rec: dict) -> str:
    """Classify a single trial as success or one of the failure modes.

    Rules are checked in priority order and the first match wins:

    1. ``TYPE_S``     - ``sr == 1.0``
    2. ``TYPE_A``     - ``hallucination_count > 0`` and ``backtrack_count == 0``
    3. ``TYPE_B``     - ``hallucination_count > 0``, ``backtrack_count > 0``
                        and ``hrr < 0.5``
    4. ``TYPE_C``     - ``loop_count >= 2``
    5. ``TYPE_OTHER`` - any remaining non-successful trial

    Args:
        rec: Trial record. Metric fields that are missing or ``None`` (JSON
            ``null``) are treated as zero.

    Returns:
        One of ``"TYPE_S"``, ``"TYPE_A"``, ``"TYPE_B"``, ``"TYPE_C"`` or
        ``"TYPE_OTHER"``.
    """
    # `.get(key, default)` only covers a missing key; `or` also covers a key
    # that is present with a null value, which would otherwise raise
    # TypeError in the numeric comparisons below.
    sr = rec.get("sr") or 0
    hallucination_count = rec.get("hallucination_count") or 0
    backtrack_count = rec.get("backtrack_count") or 0
    loop_count = rec.get("loop_count") or 0
    hrr = rec.get("hrr") or 0.0

    # Priority order: Success first, then specific failure modes
    if sr == 1.0:
        return "TYPE_S"
    if hallucination_count > 0 and backtrack_count == 0:
        return "TYPE_A"
    if hallucination_count > 0 and backtrack_count > 0 and hrr < 0.5:
        return "TYPE_B"
    if loop_count >= 2:
        return "TYPE_C"
    # Fallback: failure that doesn't match specific patterns
    return "TYPE_OTHER"
def analyze_failure_modes(by_model: dict[str, list[dict]]) -> dict:
    """Tally the classification of every trial, model by model.

    Args:
        by_model: Mapping of model name to that model's trial records.

    Returns:
        A dict keyed by model name (inserted in sorted order). Each value
        holds ``"n"`` (trial count), ``"counts"`` and ``"percentages"`` per
        category (percentages rounded to one decimal, all 0.0 when the model
        has no trials), and ``"labels"`` with a readable name per category.
    """
    categories = ("TYPE_S", "TYPE_A", "TYPE_B", "TYPE_C", "TYPE_OTHER")
    summary = {}
    for model_name in sorted(by_model):
        model_trials = by_model[model_name]

        tally = dict.fromkeys(categories, 0)
        for trial in model_trials:
            tally[classify_trial(trial)] += 1

        total = len(model_trials)
        if total > 0:
            shares = {cat: round(hits / total * 100, 1) for cat, hits in tally.items()}
        else:
            # No trials: avoid dividing by zero and report flat zeros.
            shares = {cat: 0.0 for cat in tally}

        summary[model_name] = {
            "n": total,
            "counts": tally,
            "percentages": shares,
            "labels": {
                "TYPE_S": "Success",
                "TYPE_A": "Mirage_Undetected",
                "TYPE_B": "Mirage_Detected_Failed",
                "TYPE_C": "Loop_Trapped",
                "TYPE_OTHER": "Other_Failure",
            },
        }
    return summary
def print_summary(results: dict) -> None:
    """Write the failure-mode table and its legend to stdout.

    Args:
        results: Mapping of model name to a dict providing ``"n"`` and a
            ``"percentages"`` dict with one entry per category. Rows are
            printed from the highest to the lowest ``TYPE_S`` percentage.
    """
    header = f"{'Model':<25} {'n':>3} {'Success':>8} {'Undetect':>8} {'Det+Fail':>8} {'Loop':>8} {'Other':>8}"
    width = len(header)
    heavy_rule = "=" * width

    print("\n" + heavy_rule)
    print("HalluMaze Failure Mode Taxonomy")
    print(heavy_rule)
    print(header)
    print("-" * width)

    def success_share(item):
        # item is a (model, data) pair; rank by the model's success percentage.
        return item[1]["percentages"]["TYPE_S"]

    column_order = ("TYPE_S", "TYPE_A", "TYPE_B", "TYPE_C", "TYPE_OTHER")
    for model, data in sorted(results.items(), key=success_share, reverse=True):
        pct = data["percentages"]
        cells = " ".join(f"{pct[col]:>7.1f}%" for col in column_order)
        print(f"{model:<25} {data['n']:>3} {cells}")
    print(heavy_rule)

    print("\nLegend:")
    legend = (
        " TYPE_S: Success (sr=1.0)",
        " TYPE_A: Mirage_Undetected (hallucination but no backtrack)",
        " TYPE_B: Mirage_Detected_Failed (backtracked but hrr < 0.5)",
        " TYPE_C: Loop_Trapped (loop_count >= 2, failed)",
        " TYPE_OTHER: Other failure mode",
    )
    for entry in legend:
        print(entry)
def main() -> None:
    """Load the trials, classify them, print the table and save the report.

    The per-model results are written as JSON to ``failure_modes.json``
    inside ``BASE``.
    """
    print("Loading trial data...")
    by_model = load_all_records()
    total = sum(len(v) for v in by_model.values())
    print(f"Loaded {total} valid trials across {len(by_model)} models")
    results = analyze_failure_modes(by_model)
    print_summary(results)
    outpath = BASE / "failure_modes.json"
    # ensure_ascii=False emits non-ASCII characters verbatim, so the file
    # encoding is pinned to UTF-8 rather than left to the platform default.
    with open(outpath, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    print(f"\nSaved to {outpath}")


if __name__ == "__main__":
    main()