File size: 5,511 Bytes
3487f22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
"""
Extract failed and underperforming scenarios from eval_metrics.json.

A scenario is considered failed if any scorer falls below --threshold (default 1.0).
Prints a structured report and writes it to a .txt file (default: failures.txt).

Usage:
    python agent-evaluation/extract_failures.py
    python agent-evaluation/extract_failures.py --threshold 0.75
    python agent-evaluation/extract_failures.py --metrics eval_metrics.json --output failures.txt
"""

import argparse
import json
from pathlib import Path

# Default input/output locations are resolved relative to this script's own
# directory (not the current working directory).
DEFAULT_METRICS_PATH = Path(__file__).resolve().parent / "eval_metrics.json"
DEFAULT_OUTPUT_PATH = Path(__file__).resolve().parent / "failures.txt"
# Scores strictly below this value are reported as failures.
DEFAULT_THRESHOLD = 1.0


def load_metrics(path: str) -> dict:
    """Load the evaluation metrics JSON document from *path*.

    Args:
        path: Location of the metrics file (a string or path-like object).

    Returns:
        The decoded JSON content.

    Raises:
        FileNotFoundError: If nothing exists at *path*.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    p = Path(path)
    if not p.exists():
        raise FileNotFoundError(f"Metrics file not found: {path}")
    # JSON is UTF-8 by specification; do not depend on the locale's default
    # encoding, which differs between platforms.
    with open(p, encoding="utf-8") as f:
        return json.load(f)


def _fmt_types(label: str, types: list[str], width: int = 12) -> str:
    return f"           {label:<{width}}: {', '.join(types) if types else '(none)'}"


def extract_failures(metrics: dict, threshold: float) -> dict:
    """Collect every case that errored or has a scorer below *threshold*.

    Args:
        metrics: Parsed metrics document. Reads ``per_case`` (a list of case
            dicts with optional ``error``, ``scores``, ``scorer_metadata``,
            ``category`` and ``input_preview`` keys) and ``total_cases``.
        threshold: A scorer passes only when its score is ``>= threshold``.

    Returns:
        A dict with ``threshold``, ``total_cases``, ``failed_case_count``,
        ``failures_by_scorer`` (scorer name -> number of failing cases) and
        ``failed_cases`` (one entry per failing case, in input order).
        Errored cases appear in ``failed_cases`` under the pseudo-scorer
        ``"ERROR"`` but are not counted in ``failures_by_scorer``.
    """
    # ``or`` also covers an explicit JSON ``null`` for the key.
    per_case = metrics.get("per_case") or []
    failures_by_scorer: dict[str, list[dict]] = {}
    failed_cases: list[dict] = []

    for i, case in enumerate(per_case):
        if case.get("error"):
            failed_cases.append({
                "case_index": i,
                "category": case.get("category", "unknown"),
                "input_preview": case.get("input_preview", ""),
                "failed_scorers": {"ERROR": case["error"]},
            })
            continue

        scores = case.get("scores") or {}
        failed_scorers = {}
        for scorer, score in scores.items():
            try:
                # ``>=`` (rather than ``not <``) makes NaN count as a failure.
                passed = score >= threshold
            except TypeError:
                # A null / non-numeric score cannot be shown to pass, so it
                # is reported with its raw value instead of crashing.
                passed = False
            if not passed:
                failed_scorers[scorer] = score

        if failed_scorers:
            entry = {
                "case_index": i,
                "category": case.get("category", "unknown"),
                "input_preview": case.get("input_preview", ""),
                "failed_scorers": failed_scorers,
                "all_scores": scores,
                "scorer_metadata": case.get("scorer_metadata", {}),
            }
            failed_cases.append(entry)
            for scorer in failed_scorers:
                failures_by_scorer.setdefault(scorer, []).append(entry)

    # Trust the recorded total when present; otherwise count the cases seen so
    # the report does not claim zero cases while listing failures.
    total_cases = metrics.get("total_cases")
    if total_cases is None:
        total_cases = len(per_case)

    return {
        "threshold": threshold,
        "total_cases": total_cases,
        "failed_case_count": len(failed_cases),
        "failures_by_scorer": {
            scorer: len(cases) for scorer, cases in failures_by_scorer.items()
        },
        "failed_cases": failed_cases,
    }


def print_report(result: dict, file=None) -> None:
    """Print a human-readable failure report.

    Args:
        result: Mapping with the keys ``threshold``, ``total_cases``,
            ``failed_case_count``, ``failures_by_scorer`` (scorer name ->
            failure count) and ``failed_cases`` (list of per-case dicts with
            ``case_index``, ``category``, ``input_preview``,
            ``failed_scorers`` and optionally ``scorer_metadata``).
        file: Text stream handed to ``print``; ``None`` writes to stdout.
    """
    def out(text=""):
        print(text, file=file)

    threshold = result["threshold"]
    total = result["total_cases"]
    failed = result["failed_case_count"]
    by_scorer = result["failures_by_scorer"]
    failed_cases = result["failed_cases"]

    out()
    out("=" * 72)
    out(f"FAILURE REPORT  (threshold < {threshold})")
    out("=" * 72)
    out(f"Total cases  : {total}")
    if total:
        out(f"Failed cases : {failed}  ({failed / total:.1%})")
    else:
        # No percentage without a total, but still show the real count.
        out(f"Failed cases : {failed}")

    # Errored cases are listed in failed_cases without a per-scorer count, so
    # both collections must be empty before claiming there were no failures.
    if not by_scorer and not failed_cases:
        out("\nNo failures found at this threshold.")
        out("=" * 72)
        return

    if by_scorer:
        out()
        out("-" * 72)
        out("FAILURES PER SCORER")
        out("-" * 72)
        # Most frequently failing scorers first.
        for scorer, count in sorted(by_scorer.items(), key=lambda x: -x[1]):
            out(f"  {scorer:<40} {count} case(s)")

    out()
    out("-" * 72)
    out("FAILED CASES DETAIL")
    out("-" * 72)
    for case in failed_cases:
        preview = case.get("input_preview") or ""
        out(f"\n  [{case['case_index']}] category : {case['category']}")
        out(f"       preview  : {preview[:80]}...")
        out("       failures :")
        for scorer, score in case["failed_scorers"].items():
            # JSON integers (e.g. 0) are scores too; bool is excluded because
            # it is an int subclass but not a numeric score.
            if isinstance(score, (int, float)) and not isinstance(score, bool):
                out(f"         {scorer:<38} score = {score:.4f}")
            else:
                out(f"         {scorer:<38} {score}")
            if scorer == "ExpectedClauseType":
                meta = (case.get("scorer_metadata") or {}).get("ExpectedClauseType")
                if meta:
                    expected = meta.get("expected_types") or []
                    matched = meta.get("matched_types") or []
                    found = meta.get("found_types") or []
                    missing = sorted(set(expected) - set(matched))
                    out(_fmt_types("expected", expected))
                    out(_fmt_types("found   ", found))
                    out(_fmt_types("missing ", missing))

    out()
    out("=" * 72)


def main() -> None:
    """Command-line entry point.

    Parses ``--metrics``, ``--threshold`` and ``--output``, loads the metrics
    file, extracts the failures, prints the report to stdout and writes the
    same report to the output file.
    """
    parser = argparse.ArgumentParser(
        description="Extract failed scenarios from eval_metrics.json",
        # Keep the module docstring's line breaks when it is shown as epilog.
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    parser.add_argument(
        "--metrics", type=str, default=str(DEFAULT_METRICS_PATH),
        help=f"Path to eval_metrics.json (default: {DEFAULT_METRICS_PATH})",
    )
    parser.add_argument(
        "--threshold", type=float, default=DEFAULT_THRESHOLD,
        help=f"Score threshold — cases below this are reported as failures (default: {DEFAULT_THRESHOLD})",
    )
    parser.add_argument(
        "--output", type=str, default=str(DEFAULT_OUTPUT_PATH),
        help=f"Path to write the failures report as plain text (default: {DEFAULT_OUTPUT_PATH})",
    )
    args = parser.parse_args()

    metrics = load_metrics(args.metrics)
    result = extract_failures(metrics, args.threshold)

    print_report(result)

    # Explicit UTF-8: input previews may contain non-ASCII text, which a
    # locale-default encoding such as cp1252 could fail to encode.
    with open(args.output, "w", encoding="utf-8") as f:
        print_report(result, file=f)
    print(f"Failures written to {args.output}\n")


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()