#!/usr/bin/env python3
"""Evaluation suite runner"""
import json
import requests
from pathlib import Path
def run_eval_suite(api_url="http://localhost:8000",
                   golden_path="golden_sets/gcc_claims_small.jsonl"):
    """Run the evaluation suite against a golden set of claims.

    Each non-blank JSONL line in *golden_path* is POSTed to
    ``{api_url}/claims/triage``; an HTTP 200 counts as a pass, anything
    else as a fail.

    Args:
        api_url: Base URL of the triage service under test.
        golden_path: Path to the golden-set JSONL file (one claim per line).

    Returns:
        dict with ``passed``, ``failed`` and ``p95_latency`` (seconds).

    Raises:
        ValueError: if the golden set contains no claims.
        AssertionError: if any request failed or p95 latency >= 2.5 s.
        requests.RequestException: on connection errors or per-request
            timeout (3 s).
    """
    golden_path = Path(golden_path)
    results = {"passed": 0, "failed": 0, "p95_latency": 0}
    latencies = []
    with open(golden_path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            # Tolerate blank lines / trailing newline in the JSONL file,
            # which would otherwise crash json.loads.
            if not line:
                continue
            claim = json.loads(line)
            response = requests.post(
                f"{api_url}/claims/triage",
                json=claim,
                timeout=3.0,
            )
            latencies.append(response.elapsed.total_seconds())
            if response.status_code == 200:
                results["passed"] += 1
            else:
                results["failed"] += 1
    if not latencies:
        raise ValueError(f"Golden set is empty: {golden_path}")
    latencies.sort()
    # Nearest-rank p95; clamp the index so a short golden set can never
    # index past the end of the list.
    idx = min(int(len(latencies) * 0.95), len(latencies) - 1)
    results["p95_latency"] = latencies[idx]
    print(f"Passed: {results['passed']}")
    print(f"Failed: {results['failed']}")
    print(f"P95 Latency: {results['p95_latency']:.2f}s")
    # Raise explicitly instead of `assert` so the checks survive `python -O`;
    # AssertionError is kept so existing callers' except clauses still match.
    if results["failed"] != 0:
        raise AssertionError("Some tests failed")
    if results["p95_latency"] >= 2.5:
        raise AssertionError("P95 latency exceeded")
    return results
| if __name__ == "__main__": | |
| run_eval_suite() | |