# Siddhesh Patil
# Initial commit - Self-Correcting Data Validation Agent
# b67668b
"""
Run the agent on all files in test_inputs/ and print a clear summary.
Usage:
export OPENAI_API_KEY="sk-..."
python -m src.eval.run_agent_suite
"""
from pathlib import Path
import json
import os
from statistics import mean
from src.agent.graph import run_agent
# Directory that main() scans for "*.txt" input files.
TEST_DIR = Path("test_inputs")
# Model name passed to run_agent() as `model` for every case.
MODEL = "gpt-4.1-mini"
# Value passed to run_agent() as `max_attempts` for every case.
MAX_ATTEMPTS = 4
def _summarize_run(final):
    """Extract per-case metrics from the mapping returned by ``run_agent``.

    Only the ``"result"`` and ``"log"`` keys of ``final`` are read.

    Returns:
        A ``(passed, attempts_used, employees_n, rejected_n)`` tuple.
        ``passed`` is True when ``final["result"]`` is not None;
        ``attempts_used`` is the largest ``"attempt"`` value found in the log
        entries (0 when the log is empty or missing).
    """
    result = final.get("result")
    log = final.get("log") or []
    attempts_used = max((entry.get("attempt", 0) for entry in log), default=0)

    employees_n = 0
    rejected_n = 0
    if result:
        # `or []` tolerates keys that are present but explicitly set to None.
        employees_n = len(result.get("employees") or [])
        rejected_n = len(result.get("rejected") or [])

    return result is not None, attempts_used, employees_n, rejected_n


def main():
    """Run the agent on every ``TEST_DIR/*.txt`` input and print a summary.

    For each input a one-line PASS/FAIL report is printed, followed by a suite
    summary (pass rate, heuristic "correct handling" rate, average attempts).
    Inputs that cannot be read, or for which the agent raises, are reported,
    counted as not passed, and excluded from the attempt averages.

    Raises:
        SystemExit: If ``OPENAI_API_KEY`` is unset or blank, or if no
            ``*.txt`` inputs are found.
    """
    api_key = os.environ.get("OPENAI_API_KEY", "").strip()
    if not api_key:
        raise SystemExit(
            "OPENAI_API_KEY is not set. Export your key before running the suite:\n\n"
            "export OPENAI_API_KEY=\"sk-...\""
        )

    inputs = sorted(TEST_DIR.glob("*.txt"))
    if not inputs:
        raise SystemExit("No test_inputs/*.txt files found. Create the sample inputs first.")

    total = len(inputs)
    passed_count = 0
    correct_handling = 0
    crashed = 0
    attempts_hist = []
    attempts_hist_for_passed = []

    # OPTIONAL: expected behavior per file name. The only value acted on below
    # is "reject": the case is handled correctly when the agent extracts no
    # employees and rejects at least one record. Any other file falls back to
    # "a schema-valid result was produced".
    expected_outcomes = {
        # "case09_no_ids.txt": "reject",  # example: expected to reject (no user_id)
        # "case15_extreme_noise.txt": "reject",
    }

    for p in inputs:
        print(f"\n=== Running: {p.name} ===")

        # Decode explicitly as UTF-8 so results do not depend on the platform
        # locale, and keep one unreadable file from aborting the whole suite.
        try:
            raw = p.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError) as e:
            print(f"[ERROR] could not read {p.name}: {e}")
            crashed += 1
            continue

        # Top-level boundary: any agent failure is reported and the suite moves on.
        try:
            final = run_agent(raw, api_key=api_key, model=MODEL, max_attempts=MAX_ATTEMPTS)
        except Exception as e:
            print(f"[ERROR] agent crashed for {p.name}: {e}")
            crashed += 1
            continue

        passed, attempts_used, employees_n, rejected_n = _summarize_run(final)

        print(
            f"{p.name}: {'PASS' if passed else 'FAIL'} | attempts={attempts_used} "
            f"| employees={employees_n} | rejected={rejected_n}"
        )

        if not passed:
            print("-> Agent failed to produce schema-valid JSON within retry limit.")
            print("Last JSON attempt:")
            print(final.get("last_json_text", ""))
        elif employees_n == 0 and rejected_n > 0:
            print("-> No valid employees extracted; records were rejected (no hallucination).")

        # Heuristic "correct handling" check (see expected_outcomes above).
        if expected_outcomes.get(p.name) == "reject":
            handled_correctly = employees_n == 0 and rejected_n > 0
        else:
            handled_correctly = passed

        if handled_correctly:
            correct_handling += 1
        if passed:
            passed_count += 1
            attempts_hist_for_passed.append(attempts_used)
        attempts_hist.append(attempts_used)

    # Summary
    print("\n=== SUITE SUMMARY ===")
    print(f"Total cases: {total}")
    print(f"Schema-valid produced (pass): {passed_count}/{total} = {passed_count/total:.2%}")
    print(f"Correct-handling (heuristic expected): {correct_handling}/{total} = {correct_handling/total:.2%}")
    if crashed:
        # Crashed/unreadable cases are part of `total` but have no attempt data.
        print(f"Crashed or unreadable cases: {crashed}/{total}")
    if attempts_hist:
        print(f"Avg attempts (all cases): {mean(attempts_hist):.2f}")
    if attempts_hist_for_passed:
        print(f"Avg attempts (passed cases): {mean(attempts_hist_for_passed):.2f}")
# Run the suite only when this file is executed directly (e.g. via `python -m`),
# not when it is imported.
if __name__ == "__main__":
    main()