#!/usr/bin/env python3
"""
failure_mode_analysis.py: HalluMaze failure mode classifier.

Automatically classifies each trial into one of four types (failures that
match none of the specific patterns fall back to TYPE_OTHER):

    TYPE_S: Success                -> sr == 1.0
    TYPE_A: Mirage_Undetected      -> hallucination_count > 0 AND backtrack_count == 0
    TYPE_B: Mirage_Detected_Failed -> hallucination_count > 0 AND backtrack_count > 0 AND hrr < 0.5
    TYPE_C: Loop_Trapped           -> loop_count >= 2 AND sr == 0

Usage:
python scripts/failure_mode_analysis.py
# Output: experiment_results/failure_modes.json
"""
from __future__ import annotations
import json
from pathlib import Path
from collections import defaultdict
BASE = Path(__file__).parent.parent / "experiment_results"
# ── Data Sources (same as build_final_analysis.py) ──────────────
SOURCES = {
"checkpoint_rerun": {
"file": BASE / "checkpoint_rerun.json",
"model_key": "model",
},
"or_phaseB_scout_gemini": {
"file": BASE / "or_phaseB.json",
"model_key": "or_model_id",
"filter_models": ["meta-llama/llama-4-scout", "google/gemini-2.0-flash-lite-001"],
},
"or_haiku": {"file": BASE / "or_haiku.json", "model_key": "or_model_id"},
"or_gptmini": {"file": BASE / "or_gptmini.json", "model_key": "or_model_id"},
"or_maverick": {"file": BASE / "or_maverick.json", "model_key": "or_model_id"},
"or_qwen": {"file": BASE / "or_qwen.json", "model_key": "or_model_id"},
}
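# Note: "filter_models", when present, restricts a shared result dump to the
# listed model ids; sources without it contribute every record in the file
# (see the filtering logic in load_all_records() below).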
MODEL_DISPLAY = {
"glm-4.7": "GLM-4.7",
"MiniMax-M2.5": "MiniMax-M2.5",
"meta-llama/llama-4-scout": "Llama-4-Scout",
"meta-llama/llama-4-maverick": "Llama-4-Maverick",
"google/gemini-2.0-flash-lite-001": "Gemini-2.0-Flash-Lite",
"openai/gpt-4o-mini": "GPT-4o-mini",
"anthropic/claude-3-haiku": "Claude-3-Haiku",
"qwen/qwen-2.5-72b-instruct": "Qwen-2.5-72B",
}
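# Raw model ids not listed above fall back to the raw id string when trials
# are grouped in load_all_records() below.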
def load_all_records() -> dict[str, list[dict]]:
"""Load all trial records grouped by display model name."""
by_model: dict[str, list[dict]] = defaultdict(list)
for src_name, src in SOURCES.items():
fpath = src["file"]
if not fpath.exists():
print(f" [SKIP] {fpath.name} not found")
continue
with open(fpath) as f:
records = json.load(f)
model_key = src.get("model_key", "model")
filter_models = src.get("filter_models")
for rec in records:
raw_model = rec.get(model_key) or rec.get("model", "unknown")
if filter_models and raw_model not in filter_models:
continue
# Skip error trials
if rec.get("error"):
continue
display = MODEL_DISPLAY.get(raw_model, raw_model)
by_model[display].append(rec)
return dict(by_model)
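
# A sketch of the per-trial record shape this script assumes, inferred from
# the keys read in load_all_records() and classify_trial(). The values here
# are illustrative only; exact field semantics live in the upstream harness.
_EXAMPLE_RECORD = {
    "model": "glm-4.7",        # OpenRouter dumps carry the id in "or_model_id"
    "sr": 0.0,                 # trial success: 1.0 = solved, otherwise failed
    "hallucination_count": 2,  # moves attempted through hallucinated ("mirage") walls
    "backtrack_count": 1,      # backtracking actions taken during the trial
    "loop_count": 0,           # repeated-state loops detected in the trace
    "hrr": 0.4,                # recovery ratio compared against the 0.5 cutoff
}
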
def classify_trial(rec: dict) -> str:
"""Classify a single trial into one of 4 failure modes."""
sr = rec.get("sr", 0)
hallucination_count = rec.get("hallucination_count", 0)
backtrack_count = rec.get("backtrack_count", 0)
loop_count = rec.get("loop_count", 0)
hrr = rec.get("hrr", 0.0)
# Priority order: Success first, then specific failure modes
if sr == 1.0:
return "TYPE_S"
if hallucination_count > 0 and backtrack_count == 0:
return "TYPE_A"
if hallucination_count > 0 and backtrack_count > 0 and hrr < 0.5:
return "TYPE_B"
if loop_count >= 2:
return "TYPE_C"
# Fallback: failure that doesn't match specific patterns
return "TYPE_OTHER"
def analyze_failure_modes(by_model: dict[str, list[dict]]) -> dict:
"""Run failure mode classification on all models."""
results = {}
for model, trials in sorted(by_model.items()):
counts = {"TYPE_S": 0, "TYPE_A": 0, "TYPE_B": 0, "TYPE_C": 0, "TYPE_OTHER": 0}
for rec in trials:
ftype = classify_trial(rec)
counts[ftype] += 1
n = len(trials)
pcts = {k: round(v / n * 100, 1) if n > 0 else 0.0 for k, v in counts.items()}
results[model] = {
"n": n,
"counts": counts,
"percentages": pcts,
"labels": {
"TYPE_S": "Success",
"TYPE_A": "Mirage_Undetected",
"TYPE_B": "Mirage_Detected_Failed",
"TYPE_C": "Loop_Trapped",
"TYPE_OTHER": "Other_Failure",
},
}
return results
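
# Illustrative slice of the resulting JSON for one model (numbers made up;
# percentages are counts / n * 100, as computed above):
#
#   "GPT-4o-mini": {
#     "n": 50,
#     "counts": {"TYPE_S": 20, "TYPE_A": 15, "TYPE_B": 5, "TYPE_C": 7, "TYPE_OTHER": 3},
#     "percentages": {"TYPE_S": 40.0, "TYPE_A": 30.0, "TYPE_B": 10.0, "TYPE_C": 14.0, "TYPE_OTHER": 6.0},
#     "labels": {...}
#   }
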
def print_summary(results: dict) -> None:
"""Print a readable summary table."""
header = f"{'Model':<25} {'n':>3} {'Success':>8} {'Undetect':>8} {'Det+Fail':>8} {'Loop':>8} {'Other':>8}"
print("\n" + "=" * len(header))
print("HalluMaze Failure Mode Taxonomy")
print("=" * len(header))
print(header)
print("-" * len(header))
    rows = sorted(results.items(), key=lambda x: x[1]["percentages"]["TYPE_S"], reverse=True)
    for model, data in rows:
        p = data["percentages"]
        print(
            f"{model:<25} {data['n']:>3} "
            f"{p['TYPE_S']:>7.1f}% {p['TYPE_A']:>7.1f}% {p['TYPE_B']:>7.1f}% "
            f"{p['TYPE_C']:>7.1f}% {p['TYPE_OTHER']:>7.1f}%"
        )
print("=" * len(header))
print("\nLegend:")
print(" TYPE_S: Success (sr=1.0)")
print(" TYPE_A: Mirage_Undetected (hallucination but no backtrack)")
print(" TYPE_B: Mirage_Detected_Failed (backtracked but hrr < 0.5)")
print(" TYPE_C: Loop_Trapped (loop_count >= 2, failed)")
print(" TYPE_OTHER: Other failure mode")
def main() -> None:
print("Loading trial data...")
by_model = load_all_records()
total = sum(len(v) for v in by_model.values())
print(f"Loaded {total} valid trials across {len(by_model)} models")
results = analyze_failure_modes(by_model)
print_summary(results)
outpath = BASE / "failure_modes.json"
with open(outpath, "w") as f:
json.dump(results, f, indent=2, ensure_ascii=False)
print(f"\nSaved to {outpath}")
if __name__ == "__main__":
main()