"""
Run the agent on every *.txt file in test_inputs/ and print a clear summary.

Usage:
    export OPENAI_API_KEY="sk-..."
    python -m src.eval.run_agent_suite
"""
|
|
| from pathlib import Path |
| import json |
| import os |
| from statistics import mean |
|
|
| from src.agent.graph import run_agent |
|
|
# Directory scanned for ``*.txt`` inputs; relative, so it resolves against the
# current working directory the suite is launched from.
TEST_DIR = Path("test_inputs")
# Model name forwarded to run_agent() for every test case.
MODEL = "gpt-4.1-mini"
# Retry budget forwarded to run_agent() as ``max_attempts``.
MAX_ATTEMPTS = 4
|
|
|
|
def _attempts_used(log):
    """Return the highest ``attempt`` value found in the agent log, or 0."""
    # ``log`` or an individual ``attempt`` may be None; treat both as "no data".
    return max(((entry.get("attempt") or 0) for entry in (log or [])), default=0)


def _record_counts(result):
    """Return ``(employees, rejected)`` list lengths from a result mapping.

    A falsy ``result`` and missing or ``None`` keys all count as zero.
    """
    if not result:
        return 0, 0
    employees = result.get("employees") or []
    rejected = result.get("rejected") or []
    return len(employees), len(rejected)


def main():
    """Run the agent over every ``*.txt`` file in TEST_DIR and print a summary.

    For each input the agent's returned mapping is read for the keys
    ``result``, ``log`` and ``last_json_text``. A case "passes" when
    ``result`` is not None. Inputs that cannot be read, and runs where the
    agent raises, are reported and counted in the total but contribute to no
    other statistic.

    Raises:
        SystemExit: if OPENAI_API_KEY is unset/blank, or no input files exist.
    """
    api_key = os.environ.get("OPENAI_API_KEY", "").strip()
    if not api_key:
        raise SystemExit(
            "OPENAI_API_KEY is not set. Export your key before running the suite:\n\n"
            "export OPENAI_API_KEY=\"sk-...\""
        )

    inputs = sorted(TEST_DIR.glob("*.txt"))
    if not inputs:
        raise SystemExit("No test_inputs/*.txt files found. Create the sample inputs first.")

    total = 0
    passed_count = 0
    correct_handling = 0
    attempts_hist = []
    attempts_hist_for_passed = []

    # Optional per-file expectations keyed by file name. The value "reject"
    # means the agent should extract no employees and reject at least one
    # record; any other (or missing) value means a schema-valid result suffices.
    expected_outcomes = {}

    for p in inputs:
        total += 1
        print(f"\n=== Running: {p.name} ===")

        # Read with an explicit encoding so results do not depend on the
        # platform default, and keep one bad file from aborting the suite.
        try:
            raw = p.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError) as e:
            print(f"[ERROR] could not read {p.name}: {e}")
            continue

        # Top-level boundary: any agent failure is reported, then we move on.
        try:
            final = run_agent(raw, api_key=api_key, model=MODEL, max_attempts=MAX_ATTEMPTS)
        except Exception as e:
            print(f"[ERROR] agent crashed for {p.name}: {e}")
            continue

        result = final.get("result")
        attempts_used = _attempts_used(final.get("log"))
        employees_n, rejected_n = _record_counts(result)
        passed = result is not None
        all_rejected = employees_n == 0 and rejected_n > 0

        print(
            f"{p.name}: {'PASS' if passed else 'FAIL'} | attempts={attempts_used}"
            f" | employees={employees_n} | rejected={rejected_n}"
        )

        if not passed:
            print("-> Agent failed to produce schema-valid JSON within retry limit.")
            print("Last JSON attempt:")
            print(final.get("last_json_text", ""))
        elif all_rejected:
            print("-> No valid employees extracted; records were rejected (no hallucination).")

        if expected_outcomes.get(p.name) == "reject":
            handled_correctly = all_rejected
        else:
            handled_correctly = passed

        if handled_correctly:
            correct_handling += 1

        if passed:
            passed_count += 1
            attempts_hist_for_passed.append(attempts_used)
        attempts_hist.append(attempts_used)

    # ``total`` is at least 1 here because an empty input list exits above.
    print("\n=== SUITE SUMMARY ===")
    print(f"Total cases: {total}")
    print(f"Schema-valid produced (pass): {passed_count}/{total} = {passed_count/total:.2%}")
    print(f"Correct-handling (heuristic expected): {correct_handling}/{total} = {correct_handling/total:.2%}")
    if attempts_hist:
        print(f"Avg attempts (all cases): {mean(attempts_hist):.2f}")
    if attempts_hist_for_passed:
        print(f"Avg attempts (passed cases): {mean(attempts_hist_for_passed):.2f}")
|
|
|
|
# Run the suite only when executed as a script (e.g. ``python -m ...``),
# not when this module is imported.
if __name__ == "__main__":
    main()
|
|