#!/usr/bin/env python3
"""
failure_mode_analysis.py: HalluMaze failure mode classifier.

Automatically classifies each trial into one of four types (failures that
match none of the specific patterns fall back to TYPE_OTHER):

    TYPE_S: Success                -> sr == 1.0
    TYPE_A: Mirage_Undetected      -> hallucination_count > 0 AND backtrack_count == 0
    TYPE_B: Mirage_Detected_Failed -> hallucination_count > 0 AND backtrack_count > 0 AND hrr < 0.5
    TYPE_C: Loop_Trapped           -> loop_count >= 2 AND sr == 0

Usage:
python scripts/failure_mode_analysis.py
# Output: experiment_results/failure_modes.json
"""
from __future__ import annotations
import json
from pathlib import Path
from collections import defaultdict
BASE = Path(__file__).parent.parent / "experiment_results"
# ── Data Sources (same as build_final_analysis.py) ──────────────
SOURCES = {
"checkpoint_rerun": {
"file": BASE / "checkpoint_rerun.json",
"model_key": "model",
},
"or_phaseB_scout_gemini": {
"file": BASE / "or_phaseB.json",
"model_key": "or_model_id",
"filter_models": ["meta-llama/llama-4-scout", "google/gemini-2.0-flash-lite-001"],
},
"or_haiku": {"file": BASE / "or_haiku.json", "model_key": "or_model_id"},
"or_gptmini": {"file": BASE / "or_gptmini.json", "model_key": "or_model_id"},
"or_maverick": {"file": BASE / "or_maverick.json", "model_key": "or_model_id"},
"or_qwen": {"file": BASE / "or_qwen.json", "model_key": "or_model_id"},
}
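# Note: "filter_models", when present, restricts a shared result dump to the
# listed model ids; sources without it contribute every record in the file
# (see the filtering logic in load_all_records() below).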
MODEL_DISPLAY = {
"glm-4.7": "GLM-4.7",
"MiniMax-M2.5": "MiniMax-M2.5",
"meta-llama/llama-4-scout": "Llama-4-Scout",
"meta-llama/llama-4-maverick": "Llama-4-Maverick",
"google/gemini-2.0-flash-lite-001": "Gemini-2.0-Flash-Lite",
"openai/gpt-4o-mini": "GPT-4o-mini",
"anthropic/claude-3-haiku": "Claude-3-Haiku",
"qwen/qwen-2.5-72b-instruct": "Qwen-2.5-72B",
}
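# Raw model ids not listed above fall back to the raw id string when trials
# are grouped in load_all_records() below.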
def load_all_records() -> dict[str, list[dict]]:
"""Load all trial records grouped by display model name."""
by_model: dict[str, list[dict]] = defaultdict(list)
for src_name, src in SOURCES.items():
fpath = src["file"]
if not fpath.exists():
print(f" [SKIP] {fpath.name} not found")
continue
with open(fpath) as f:
records = json.load(f)
model_key = src.get("model_key", "model")
filter_models = src.get("filter_models")
for rec in records:
raw_model = rec.get(model_key) or rec.get("model", "unknown")
if filter_models and raw_model not in filter_models:
continue
# Skip error trials
if rec.get("error"):
continue
display = MODEL_DISPLAY.get(raw_model, raw_model)
by_model[display].append(rec)
return dict(by_model)
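
# A sketch of the per-trial record shape this script assumes, inferred from
# the keys read in load_all_records() and classify_trial(). The values here
# are illustrative only; exact field semantics live in the upstream harness.
_EXAMPLE_RECORD = {
    "model": "glm-4.7",        # OpenRouter dumps carry the id in "or_model_id"
    "sr": 0.0,                 # trial success: 1.0 = solved, otherwise failed
    "hallucination_count": 2,  # moves attempted through hallucinated ("mirage") walls
    "backtrack_count": 1,      # backtracking actions taken during the trial
    "loop_count": 0,           # repeated-state loops detected in the trace
    "hrr": 0.4,                # recovery ratio compared against the 0.5 cutoff
}
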
def classify_trial(rec: dict) -> str:
"""Classify a single trial into one of 4 failure modes."""
sr = rec.get("sr", 0)
hallucination_count = rec.get("hallucination_count", 0)
backtrack_count = rec.get("backtrack_count", 0)
loop_count = rec.get("loop_count", 0)
hrr = rec.get("hrr", 0.0)
# Priority order: Success first, then specific failure modes
if sr == 1.0:
return "TYPE_S"
if hallucination_count > 0 and backtrack_count == 0:
return "TYPE_A"
if hallucination_count > 0 and backtrack_count > 0 and hrr < 0.5:
return "TYPE_B"
if loop_count >= 2:
return "TYPE_C"
# Fallback: failure that doesn't match specific patterns
return "TYPE_OTHER"
def analyze_failure_modes(by_model: dict[str, list[dict]]) -> dict:
"""Run failure mode classification on all models."""
results = {}
for model, trials in sorted(by_model.items()):
counts = {"TYPE_S": 0, "TYPE_A": 0, "TYPE_B": 0, "TYPE_C": 0, "TYPE_OTHER": 0}
for rec in trials:
ftype = classify_trial(rec)
counts[ftype] += 1
n = len(trials)
pcts = {k: round(v / n * 100, 1) if n > 0 else 0.0 for k, v in counts.items()}
results[model] = {
"n": n,
"counts": counts,
"percentages": pcts,
"labels": {
"TYPE_S": "Success",
"TYPE_A": "Mirage_Undetected",
"TYPE_B": "Mirage_Detected_Failed",
"TYPE_C": "Loop_Trapped",
"TYPE_OTHER": "Other_Failure",
},
}
return results
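
# Illustrative slice of the resulting JSON for one model (numbers made up;
# percentages are counts / n * 100, as computed above):
#
#   "GPT-4o-mini": {
#     "n": 50,
#     "counts": {"TYPE_S": 20, "TYPE_A": 15, "TYPE_B": 5, "TYPE_C": 7, "TYPE_OTHER": 3},
#     "percentages": {"TYPE_S": 40.0, "TYPE_A": 30.0, "TYPE_B": 10.0, "TYPE_C": 14.0, "TYPE_OTHER": 6.0},
#     "labels": {...}
#   }
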
def print_summary(results: dict) -> None:
"""Print a readable summary table."""
header = f"{'Model':<25} {'n':>3} {'Success':>8} {'Undetect':>8} {'Det+Fail':>8} {'Loop':>8} {'Other':>8}"
print("\n" + "=" * len(header))
print("HalluMaze Failure Mode Taxonomy")
print("=" * len(header))
print(header)
print("-" * len(header))
    rows = sorted(results.items(), key=lambda x: x[1]["percentages"]["TYPE_S"], reverse=True)
    for model, data in rows:
        p = data["percentages"]
        print(
            f"{model:<25} {data['n']:>3} "
            f"{p['TYPE_S']:>7.1f}% {p['TYPE_A']:>7.1f}% {p['TYPE_B']:>7.1f}% "
            f"{p['TYPE_C']:>7.1f}% {p['TYPE_OTHER']:>7.1f}%"
        )
print("=" * len(header))
print("\nLegend:")
print(" TYPE_S: Success (sr=1.0)")
print(" TYPE_A: Mirage_Undetected (hallucination but no backtrack)")
print(" TYPE_B: Mirage_Detected_Failed (backtracked but hrr < 0.5)")
print(" TYPE_C: Loop_Trapped (loop_count >= 2, failed)")
print(" TYPE_OTHER: Other failure mode")
def main() -> None:
print("Loading trial data...")
by_model = load_all_records()
total = sum(len(v) for v in by_model.values())
print(f"Loaded {total} valid trials across {len(by_model)} models")
results = analyze_failure_modes(by_model)
print_summary(results)
outpath = BASE / "failure_modes.json"
with open(outpath, "w") as f:
json.dump(results, f, indent=2, ensure_ascii=False)
print(f"\nSaved to {outpath}")
if __name__ == "__main__":
main()