# BDR-AI's picture
# Initial commit for claims-gpt-eval-suite
# 8b2fc89 verified
#!/usr/bin/env python3
"""Evaluation suite runner"""
import json
import requests
from pathlib import Path
def run_eval_suite(api_url="http://localhost:8000",
                   golden_path="golden_sets/gcc_claims_small.jsonl"):
    """Run the evaluation suite against a golden claim set.

    Posts each JSONL record to the service's ``/claims/triage`` endpoint,
    tallies pass/fail counts, and computes the p95 request latency.

    Args:
        api_url: Base URL of the triage service under test.
        golden_path: Path to the JSONL golden-set file (one claim per line).

    Returns:
        dict with ``passed``, ``failed`` and ``p95_latency`` keys.

    Raises:
        AssertionError: if any request failed or p95 latency >= 2.5s.
        FileNotFoundError: if the golden-set file does not exist.
    """
    golden_file = Path(golden_path)
    results = {"passed": 0, "failed": 0, "p95_latency": 0.0}
    latencies = []
    with open(golden_file, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:  # tolerate blank lines in the JSONL file
                continue
            claim = json.loads(line)
            try:
                response = requests.post(
                    f"{api_url}/claims/triage",
                    json=claim,
                    timeout=3.0,
                )
            except requests.RequestException:
                # A timeout or connection error counts as a failed case
                # rather than aborting the whole suite run.
                results["failed"] += 1
                continue
            latencies.append(response.elapsed.total_seconds())
            if response.status_code == 200:
                results["passed"] += 1
            else:
                results["failed"] += 1
    if latencies:
        latencies.sort()
        # Nearest-rank p95; clamp the index so a short list (or a list
        # where int(n * 0.95) == n is impossible but n == 1 is common)
        # never indexes past the end.
        idx = min(int(len(latencies) * 0.95), len(latencies) - 1)
        results["p95_latency"] = latencies[idx]
    print(f"Passed: {results['passed']}")
    print(f"Failed: {results['failed']}")
    print(f"P95 Latency: {results['p95_latency']:.2f}s")
    # Raise explicitly instead of relying on bare `assert` statements,
    # which are stripped under `python -O`; AssertionError is kept so
    # existing callers catching it still work.
    if results["failed"] != 0:
        raise AssertionError("Some tests failed")
    if results["p95_latency"] >= 2.5:
        raise AssertionError("P95 latency exceeded")
    return results
# CLI entry point: run the suite with default endpoint and golden set.
if __name__ == "__main__":
    run_eval_suite()