File size: 4,381 Bytes
b67668b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
"""
Run the agent on every *.txt file in test_inputs/ and print a per-case report plus a suite summary.

Usage:
    export OPENAI_API_KEY="sk-..."
    python -m src.eval.run_agent_suite
"""

from pathlib import Path
import json
import os
from statistics import mean

from src.agent.graph import run_agent

# Directory scanned for "*.txt" input files; a relative path, so it is resolved
# against the current working directory.
TEST_DIR = Path("test_inputs")
# Model name forwarded to run_agent(model=...).
MODEL = "gpt-4.1-mini"
# Forwarded to run_agent(max_attempts=...) for every input file.
MAX_ATTEMPTS = 4


def _case_metrics(final):
    """Extract the per-case numbers from the mapping returned by ``run_agent``.

    Args:
        final: Mapping returned by the agent. Only the ``"result"`` and
            ``"log"`` keys are read here.

    Returns:
        A ``(passed, attempts_used, employees_n, rejected_n)`` tuple where
        ``passed`` is True when ``final["result"]`` is not None.
    """
    result = final.get("result")
    # `or []` also covers a key that is present but explicitly None.
    log = final.get("log") or []

    # Attempts used is the highest attempt number seen in the log; 0 if the log is empty.
    attempts_used = max((entry.get("attempt", 0) for entry in log), default=0)

    employees_n = 0
    rejected_n = 0
    if result:
        employees_n = len(result.get("employees") or [])
        rejected_n = len(result.get("rejected") or [])

    return result is not None, attempts_used, employees_n, rejected_n


def _handled_correctly(expected, passed, employees_n, rejected_n):
    """Heuristically decide whether a case was handled as expected.

    A case marked ``"reject"`` counts as correct only when no employees were
    produced and at least one record was rejected. Any other case counts as
    correct as soon as a result was produced.
    """
    if expected == "reject":
        return employees_n == 0 and rejected_n > 0
    return passed


def main():
    """Run the agent over every ``*.txt`` file in ``TEST_DIR`` and print a summary.

    For each input a one-line PASS/FAIL report is printed, followed by extra
    detail for failures. Inputs that cannot be read, or for which the agent
    raises, are reported as errors and the suite moves on to the next file.

    Raises:
        SystemExit: if ``OPENAI_API_KEY`` is unset or blank, or if no input
            files are found.
    """
    api_key = os.environ.get("OPENAI_API_KEY", "").strip()
    if not api_key:
        raise SystemExit(
            "OPENAI_API_KEY is not set. Export your key before running the suite:\n\n"
            "export OPENAI_API_KEY=\"sk-...\""
        )

    inputs = sorted(TEST_DIR.glob("*.txt"))
    if not inputs:
        raise SystemExit("No test_inputs/*.txt files found. Create the sample inputs first.")

    total = len(inputs)
    passed_count = 0
    correct_handling = 0
    errored_count = 0
    attempts_hist = []
    attempts_hist_for_passed = []

    # OPTIONAL: define expected behavior per file (True=expected PASS/valid handling)
    # For correctness metric we will consider "handled correctly" as either:
    # - produced employees & they match expectations (not available here), or
    # - produced 0 employees and non-empty rejected (for cases that should be rejected)
    # You can expand expected_outcomes if you want to mark specific cases as expected_fail, etc.
    expected_outcomes = {
        # "case09_no_ids.txt": "reject",  # example: expected to reject (no user_id)
        # "case15_extreme_noise.txt": "reject",
    }

    for p in inputs:
        print(f"\n=== Running: {p.name} ===")

        # Explicit encoding keeps results independent of the machine's locale;
        # an unreadable file is reported instead of aborting the whole suite.
        try:
            raw = p.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError) as e:
            print(f"[ERROR] could not read {p.name}: {e}")
            errored_count += 1
            continue

        # Top-level boundary: any agent failure is reported and the suite continues.
        try:
            final = run_agent(raw, api_key=api_key, model=MODEL, max_attempts=MAX_ATTEMPTS)
        except Exception as e:
            print(f"[ERROR] agent crashed for {p.name}: {e}")
            errored_count += 1
            continue

        passed, attempts_used, employees_n, rejected_n = _case_metrics(final)

        # Print a concise line
        print(f"{p.name}: {'PASS' if passed else 'FAIL'} | attempts={attempts_used} | employees={employees_n} | rejected={rejected_n}")

        # Print extra info for failures or suspicious cases
        if not passed:
            print("-> Agent failed to produce schema-valid JSON within retry limit.")
            print("Last JSON attempt:")
            print(final.get("last_json_text", ""))
        elif employees_n == 0 and rejected_n > 0:
            print("-> No valid employees extracted; records were rejected (no hallucination).")
        # You can uncomment to always show the JSON
        # print(json.dumps(final.get("result"), indent=2))

        if _handled_correctly(expected_outcomes.get(p.name), passed, employees_n, rejected_n):
            correct_handling += 1

        if passed:
            passed_count += 1
            attempts_hist_for_passed.append(attempts_used)
        attempts_hist.append(attempts_used)

    # Summary (total is never 0 here: an empty input list exits above)
    print("\n=== SUITE SUMMARY ===")
    print(f"Total cases: {total}")
    print(f"Schema-valid produced (pass): {passed_count}/{total} = {passed_count/total:.2%}")
    print(f"Correct-handling (heuristic expected): {correct_handling}/{total} = {correct_handling/total:.2%}")
    if errored_count:
        # Errored cases stay in the denominators above, so call them out explicitly.
        print(f"Errored cases (unreadable input or agent crash): {errored_count}/{total}")
    if attempts_hist:
        print(f"Avg attempts (all cases): {mean(attempts_hist):.2f}")
    if attempts_hist_for_passed:
        print(f"Avg attempts (passed cases): {mean(attempts_hist_for_passed):.2f}")


# Run the suite only when this file is executed as a script or via `python -m`,
# not when it is imported.
if __name__ == "__main__":
    main()