| """ |
| Extract failed and underperforming scenarios from eval_metrics.json. |
| |
| A scenario is considered failed if any scorer falls below --threshold (default 1.0). |
| Prints a structured report and writes it to a .txt file (default: failures.txt). |
| |
| Usage: |
| python agent-evaluation/extract_failures.py |
| python agent-evaluation/extract_failures.py --threshold 0.75 |
| python agent-evaluation/extract_failures.py --metrics eval_metrics.json --output failures.txt |
| """ |
|
|
| import argparse |
| import json |
| from pathlib import Path |
|
|
# Directory containing this script; the default input and output files live beside it.
_SCRIPT_DIR = Path(__file__).resolve().parent

# Default location of the metrics JSON that is read.
DEFAULT_METRICS_PATH = _SCRIPT_DIR / "eval_metrics.json"
# Default location of the plain-text report that is written.
DEFAULT_OUTPUT_PATH = _SCRIPT_DIR / "failures.txt"
# Scores strictly below this value are reported as failures.
DEFAULT_THRESHOLD = 1.0
|
|
|
|
def load_metrics(path: str) -> dict:
    """Load the metrics JSON document at ``path``.

    Args:
        path: Location of the metrics file (a string or any path-like
            object accepted by ``pathlib.Path``).

    Returns:
        The decoded JSON content.

    Raises:
        FileNotFoundError: If nothing exists at ``path``.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    p = Path(path)
    if not p.exists():
        raise FileNotFoundError(f"Metrics file not found: {path}")
    # Read as UTF-8 explicitly: the platform default encoding (e.g. cp1252
    # on Windows) can fail or mis-decode non-ASCII text in the metrics.
    with open(p, encoding="utf-8") as f:
        return json.load(f)
|
|
|
|
def _fmt_types(label: str, types: list[str], width: int = 12) -> str:
    """Format one ``label: a, b, c`` report line.

    ``label`` is left-aligned and padded to ``width`` characters; an empty
    ``types`` list is rendered as ``(none)``.
    """
    if types:
        rendered = ", ".join(types)
    else:
        rendered = "(none)"
    return f" {label:<{width}}: {rendered}"
|
|
|
|
def extract_failures(metrics: dict, threshold: float) -> dict:
    """Collect every case that errored or has a score below ``threshold``.

    Args:
        metrics: Parsed metrics document. The keys read are ``per_case``
            (a list of case dicts) and ``total_cases``. From each case the
            keys ``error``, ``scores``, ``category``, ``input_preview`` and
            ``scorer_metadata`` are read; all of them are optional.
        threshold: A scorer fails a case when its score is strictly below
            this value.

    Returns:
        A dict with the keys:
          * ``threshold``: the threshold that was applied.
          * ``total_cases``: ``metrics["total_cases"]`` when present,
            otherwise the number of entries in ``per_case``.
          * ``failed_case_count``: number of entries in ``failed_cases``.
          * ``failures_by_scorer``: scorer name -> number of cases it failed
            (errored cases are not counted here).
          * ``failed_cases``: one entry per failed case, in input order.
    """
    # ``or`` guards against an explicit JSON null as well as a missing key.
    per_case = metrics.get("per_case") or []
    failure_counts: dict[str, int] = {}
    failed_cases: list[dict] = []

    for i, case in enumerate(per_case):
        # A case that raised is reported as failed under the pseudo-scorer
        # "ERROR"; its scores (if any) are not inspected.
        if case.get("error"):
            failed_cases.append({
                "case_index": i,
                "category": case.get("category", "unknown"),
                "input_preview": case.get("input_preview", ""),
                "failed_scorers": {"ERROR": case["error"]},
            })
            continue

        scores = case.get("scores") or {}
        # A None score carries no value to compare against the threshold,
        # so it is skipped instead of raising TypeError on ``None < float``.
        failed_scorers = {
            scorer: score
            for scorer, score in scores.items()
            if score is not None and score < threshold
        }
        if not failed_scorers:
            continue

        failed_cases.append({
            "case_index": i,
            "category": case.get("category", "unknown"),
            "input_preview": case.get("input_preview", ""),
            "failed_scorers": failed_scorers,
            "all_scores": scores,
            "scorer_metadata": case.get("scorer_metadata") or {},
        })
        for scorer in failed_scorers:
            failure_counts[scorer] = failure_counts.get(scorer, 0) + 1

    # Honour an explicit total; otherwise derive it from the cases seen so
    # the report never shows failures against a total of 0.
    total_cases = metrics.get("total_cases")
    if total_cases is None:
        total_cases = len(per_case)

    return {
        "threshold": threshold,
        "total_cases": total_cases,
        "failed_case_count": len(failed_cases),
        "failures_by_scorer": failure_counts,
        "failed_cases": failed_cases,
    }
|
|
|
|
def print_report(result: dict, file=None) -> None:
    """Write a human-readable failure report.

    Args:
        result: A dict with the keys ``threshold``, ``total_cases``,
            ``failed_case_count``, ``failures_by_scorer`` and
            ``failed_cases`` (the shape returned by ``extract_failures``).
        file: Text stream passed to ``print``; ``None`` means standard
            output.
    """
    def out(text=""):
        print(text, file=file)

    threshold = result["threshold"]
    total = result["total_cases"]
    failed = result["failed_case_count"]
    failed_cases = result["failed_cases"]
    by_scorer = result["failures_by_scorer"]

    out()
    out("=" * 72)
    out(f"FAILURE REPORT (threshold < {threshold})")
    out("=" * 72)
    out(f"Total cases : {total}")
    if total:
        out(f"Failed cases : {failed} ({failed / total:.1%})")
    else:
        # No percentage is possible without a total, but the real failure
        # count must still be shown.
        out(f"Failed cases : {failed}")

    # Decide on the failed cases themselves: errored cases are failures but
    # never appear in the per-scorer counts.
    if not failed_cases:
        out("\nNo failures found at this threshold.")
        out("=" * 72)
        return

    if by_scorer:
        out()
        out("-" * 72)
        out("FAILURES PER SCORER")
        out("-" * 72)
        # Most frequently failing scorer first.
        for scorer, count in sorted(by_scorer.items(), key=lambda x: -x[1]):
            out(f" {scorer:<40} {count} case(s)")

    out()
    out("-" * 72)
    out("FAILED CASES DETAIL")
    out("-" * 72)
    for case in failed_cases:
        preview = str(case.get("input_preview") or "")
        # Only mark the preview as cut off when it really was.
        ellipsis = "..." if len(preview) > 80 else ""
        out(f"\n [{case['case_index']}] category : {case['category']}")
        out(f" preview : {preview[:80]}{ellipsis}")
        out(" failures :")
        for scorer, score in case["failed_scorers"].items():
            # JSON integers (0, 1) are numeric scores too; bool is excluded
            # because it is an int subclass but not a score.
            is_number = isinstance(score, (int, float)) and not isinstance(score, bool)
            if is_number:
                out(f" {scorer:<38} score = {score:.4f}")
            else:
                out(f" {scorer:<38} {score}")
            if scorer == "ExpectedClauseType":
                meta = (case.get("scorer_metadata") or {}).get("ExpectedClauseType")
                if meta:
                    expected = meta.get("expected_types") or []
                    found = meta.get("found_types") or []
                    matched = meta.get("matched_types") or []
                    missing = sorted(set(expected) - set(matched))
                    out(_fmt_types("expected", expected))
                    out(_fmt_types("found ", found))
                    out(_fmt_types("missing ", missing))

    out()
    out("=" * 72)
|
|
|
|
def main() -> None:
    """Command-line entry point.

    Parses ``--metrics``, ``--threshold`` and ``--output``, loads the metrics
    file, extracts the failures, prints the report to standard output and
    writes the same report to the output path.
    """
    parser = argparse.ArgumentParser(
        description="Extract failed scenarios from eval_metrics.json",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    # Defaults are shown through argparse's own ``%(default)s`` placeholder:
    # help text is %-formatted by argparse, so interpolating a path that
    # contains "%" directly into the string would break ``--help``.
    parser.add_argument(
        "--metrics", type=str, default=str(DEFAULT_METRICS_PATH),
        help="Path to eval_metrics.json (default: %(default)s)",
    )
    parser.add_argument(
        "--threshold", type=float, default=DEFAULT_THRESHOLD,
        help="Score threshold — cases below this are reported as failures (default: %(default)s)",
    )
    parser.add_argument(
        "--output", type=str, default=str(DEFAULT_OUTPUT_PATH),
        help="Path to write the failures report as plain text (default: %(default)s)",
    )
    args = parser.parse_args()

    metrics = load_metrics(args.metrics)
    result = extract_failures(metrics, args.threshold)

    print_report(result)

    # Write UTF-8 explicitly: input previews may contain characters that the
    # platform default encoding cannot represent.
    with open(args.output, "w", encoding="utf-8") as f:
        print_report(result, file=f)
    print(f"Failures written to {args.output}\n")
|
|
|
|
| if __name__ == "__main__": |
| main() |
|
|