#!/usr/bin/env python3
"""
failure_mode_analysis.py - HalluMaze failure mode classifier

Automatically classifies each trial into one of four types
(plus a TYPE_OTHER fallback for failures that match no pattern):

    TYPE_S: Success                - sr == 1.0
    TYPE_A: Mirage_Undetected      - hallucination_count > 0 AND backtrack_count == 0
    TYPE_B: Mirage_Detected_Failed - hallucination_count > 0 AND backtrack_count > 0 AND hrr < 0.5
    TYPE_C: Loop_Trapped           - loop_count >= 2 AND sr == 0

Usage:
    python scripts/failure_mode_analysis.py
    # Output: experiment_results/failure_modes.json
"""
from __future__ import annotations

import json
from pathlib import Path
from collections import defaultdict

BASE = Path(__file__).parent.parent / "experiment_results"

# ── Data Sources (same as build_final_analysis.py) ──────────────────
SOURCES = {
    "checkpoint_rerun": {
        "file": BASE / "checkpoint_rerun.json",
        "model_key": "model",
    },
    "or_phaseB_scout_gemini": {
        "file": BASE / "or_phaseB.json",
        "model_key": "or_model_id",
        "filter_models": ["meta-llama/llama-4-scout", "google/gemini-2.0-flash-lite-001"],
    },
    "or_haiku": {"file": BASE / "or_haiku.json", "model_key": "or_model_id"},
    "or_gptmini": {"file": BASE / "or_gptmini.json", "model_key": "or_model_id"},
    "or_maverick": {"file": BASE / "or_maverick.json", "model_key": "or_model_id"},
    "or_qwen": {"file": BASE / "or_qwen.json", "model_key": "or_model_id"},
}

MODEL_DISPLAY = {
    "glm-4.7": "GLM-4.7",
    "MiniMax-M2.5": "MiniMax-M2.5",
    "meta-llama/llama-4-scout": "Llama-4-Scout",
    "meta-llama/llama-4-maverick": "Llama-4-Maverick",
    "google/gemini-2.0-flash-lite-001": "Gemini-2.0-Flash-Lite",
    "openai/gpt-4o-mini": "GPT-4o-mini",
    "anthropic/claude-3-haiku": "Claude-3-Haiku",
    "qwen/qwen-2.5-72b-instruct": "Qwen-2.5-72B",
}
def load_all_records() -> dict[str, list[dict]]:
    """Load all trial records grouped by display model name."""
    by_model: dict[str, list[dict]] = defaultdict(list)
    for src_name, src in SOURCES.items():
        fpath = src["file"]
        if not fpath.exists():
            print(f"  [SKIP] {fpath.name} not found")
            continue
        with open(fpath) as f:
            records = json.load(f)
        model_key = src.get("model_key", "model")
        filter_models = src.get("filter_models")
        for rec in records:
            raw_model = rec.get(model_key) or rec.get("model", "unknown")
            if filter_models and raw_model not in filter_models:
                continue
            # Skip trials that errored out
            if rec.get("error"):
                continue
            display = MODEL_DISPLAY.get(raw_model, raw_model)
            by_model[display].append(rec)
    return dict(by_model)
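# For illustration, load_all_records() returns a mapping shaped like
# (display names come from MODEL_DISPLAY, records as sketched at the top):
#   {"GLM-4.7": [rec, rec, ...], "Claude-3-Haiku": [rec, ...], ...}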
def classify_trial(rec: dict) -> str:
    """Classify a single trial: success, one of three failure modes, or other."""
    sr = rec.get("sr", 0)
    hallucination_count = rec.get("hallucination_count", 0)
    backtrack_count = rec.get("backtrack_count", 0)
    loop_count = rec.get("loop_count", 0)
    hrr = rec.get("hrr", 0.0)
    # Priority order: success first, then the specific failure modes
    if sr == 1.0:
        return "TYPE_S"
    if hallucination_count > 0 and backtrack_count == 0:
        return "TYPE_A"
    if hallucination_count > 0 and backtrack_count > 0 and hrr < 0.5:
        return "TYPE_B"
    if loop_count >= 2:
        return "TYPE_C"
    # Fallback: a failure that matches none of the specific patterns
    return "TYPE_OTHER"
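# Illustrative classifications on hypothetical records (only the fields read
# above matter; anything missing falls back to the defaults in classify_trial):
#   classify_trial({"sr": 1.0})                               -> "TYPE_S"
#   classify_trial({"sr": 0, "hallucination_count": 2,
#                   "backtrack_count": 0})                    -> "TYPE_A"
#   classify_trial({"sr": 0, "hallucination_count": 1,
#                   "backtrack_count": 3, "hrr": 0.2})        -> "TYPE_B"
#   classify_trial({"sr": 0, "loop_count": 4})                -> "TYPE_C"
#   classify_trial({"sr": 0})                                 -> "TYPE_OTHER"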
def analyze_failure_modes(by_model: dict[str, list[dict]]) -> dict:
    """Run failure mode classification on all models."""
    results = {}
    for model, trials in sorted(by_model.items()):
        counts = {"TYPE_S": 0, "TYPE_A": 0, "TYPE_B": 0, "TYPE_C": 0, "TYPE_OTHER": 0}
        for rec in trials:
            ftype = classify_trial(rec)
            counts[ftype] += 1
        n = len(trials)
        pcts = {k: round(v / n * 100, 1) if n > 0 else 0.0 for k, v in counts.items()}
        results[model] = {
            "n": n,
            "counts": counts,
            "percentages": pcts,
            "labels": {
                "TYPE_S": "Success",
                "TYPE_A": "Mirage_Undetected",
                "TYPE_B": "Mirage_Detected_Failed",
                "TYPE_C": "Loop_Trapped",
                "TYPE_OTHER": "Other_Failure",
            },
        }
    return results
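# Illustrative shape of one entry in the result (and in failure_modes.json);
# the numbers here are made up:
#   "GLM-4.7": {
#       "n": 40,
#       "counts": {"TYPE_S": 22, "TYPE_A": 8, "TYPE_B": 4, "TYPE_C": 4, "TYPE_OTHER": 2},
#       "percentages": {"TYPE_S": 55.0, "TYPE_A": 20.0, "TYPE_B": 10.0, "TYPE_C": 10.0, "TYPE_OTHER": 5.0},
#       "labels": {...},
#   }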
def print_summary(results: dict) -> None:
    """Print a readable summary table."""
    header = f"{'Model':<25} {'n':>3} {'Success':>8} {'Undetect':>8} {'Det+Fail':>8} {'Loop':>8} {'Other':>8}"
    print("\n" + "=" * len(header))
    print("HalluMaze Failure Mode Taxonomy")
    print("=" * len(header))
    print(header)
    print("-" * len(header))
    for model, data in sorted(results.items(), key=lambda x: x[1]["percentages"]["TYPE_S"], reverse=True):
        p = data["percentages"]
        print(f"{model:<25} {data['n']:>3} {p['TYPE_S']:>7.1f}% {p['TYPE_A']:>7.1f}% {p['TYPE_B']:>7.1f}% {p['TYPE_C']:>7.1f}% {p['TYPE_OTHER']:>7.1f}%")
    print("=" * len(header))
    print("\nLegend:")
    print("  TYPE_S: Success (sr=1.0)")
    print("  TYPE_A: Mirage_Undetected (hallucination but no backtrack)")
    print("  TYPE_B: Mirage_Detected_Failed (backtracked but hrr < 0.5)")
    print("  TYPE_C: Loop_Trapped (loop_count >= 2, failed)")
    print("  TYPE_OTHER: Other failure mode")
def main() -> None:
    print("Loading trial data...")
    by_model = load_all_records()
    total = sum(len(v) for v in by_model.values())
    print(f"Loaded {total} valid trials across {len(by_model)} models")
    results = analyze_failure_modes(by_model)
    print_summary(results)
    outpath = BASE / "failure_modes.json"
    with open(outpath, "w") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    print(f"\nSaved to {outpath}")


if __name__ == "__main__":
    main()