Initial commit for claims-gpt-eval-suite

Files changed:
- .github/workflows/run-eval.yml (+11 -26)
- golden_sets/gcc_claims_small.jsonl (+5 -5)
- runner.py (+24 -107)
- tests/test_eval_suite.py (+18 -29)
.github/workflows/run-eval.yml
CHANGED

@@ -2,35 +2,20 @@ name: Run Evaluation Suite
 
 on:
   push:
-    branches: [
+    branches: [main]
+  schedule:
+    - cron: '0 2 * * *' # Nightly
 
 jobs:
   eval:
     runs-on: ubuntu-latest
-
     steps:
-      - uses: actions/checkout@
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v4
+      - run: pip install -r requirements.txt
+      - run: python runner.py
+      - name: Upload report
+        uses: actions/upload-artifact@v3
         with:
-      - name: Install dependencies
-        run: pip install httpx pytest
-
-      - name: Validate Golden Sets
-        run: python tests/test_eval_suite.py
-
-      - name: Comment on PR
-        if: github.event_name == 'pull_request'
-        uses: actions/github-script@v7
-        with:
-          script: |
-            github.rest.issues.createComment({
-              issue_number: context.issue.number,
-              owner: context.repo.owner,
-              repo: context.repo.repo,
-              body: '✅ Eval suite validation passed\n\n- Golden sets valid\n- Test runner ready'
-            })
+          name: eval-report
+          path: eval_report.json
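Note: the rewritten workflow installs dependencies with "pip install -r requirements.txt", but no requirements.txt is part of this commit. Judging from the new code's imports, a minimal version (an assumption, not shown in the diff) would need at least:

requests
pytest

requests is used by runner.py; pytest is assumed only if tests/test_eval_suite.py is meant to be collected by pytest, since its __main__ entry point was removed. Note also that the new workflow no longer runs the tests or comments on PRs; "python runner.py" is the only CI gate.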
golden_sets/gcc_claims_small.jsonl
CHANGED

@@ -1,5 +1,5 @@
-{"
-{"
-{"
-{"
-{"
+{"id": "C001", "type": "motor", "amount": 5000, "description": "Minor fender bender"}
+{"id": "C002", "type": "health", "amount": 2000, "description": "Standard medical claim"}
+{"id": "C003", "type": "motor", "amount": 15000, "description": "Suspicious multiple claims same week"}
+{"id": "C004", "type": "health", "amount": 50000, "description": "Major surgery claim"}
+{"id": "C005", "type": "motor", "amount": 100000, "description": "Total loss suspected fraud"}
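For context on how these claims are consumed: the removed runner asserted that every 200 response from POST /claims/triage contained recommendation and severity fields. A triage response for C001 might therefore look like the line below (hypothetical values; the commit does not show the API's actual response schema):

{"recommendation": "approve", "severity": "low"}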
runner.py
CHANGED

@@ -1,128 +1,45 @@
 #!/usr/bin/env python3
+"""Evaluation suite runner"""
 import json
-import statistics
+import requests
 from pathlib import Path
-from typing import Dict, List
-
-try:
-    import httpx
-except ImportError:
-    print("❌ httpx required. Run: pip install httpx")
-    exit(1)
-
-API_ENDPOINT = "http://localhost:8000"
-GOLDEN_SET_FILE = Path(__file__).parent / "golden_sets" / "gcc_claims_small.jsonl"
 
-def percentile(values, p):
-    if not values:
-        return 0
-    values = sorted(values)
-    k = int(round((len(values) - 1) * p))
-    return values[k]
-
-def run_eval(api_url: str = API_ENDPOINT, token: str = None) -> Dict:
+def run_eval_suite(api_url="http://localhost:8000"):
     """Run evaluation against golden set"""
 
-    claims = [json.loads(line) for line in GOLDEN_SET_FILE.read_text().splitlines()]
-
-    results = {
-        "total_claims": len(claims),
-        "successful": 0,
-        "failed": 0,
-        "latencies_ms": [],
-        "errors": []
-    }
+    golden_path = Path("golden_sets/gcc_claims_small.jsonl")
+    results = {"passed": 0, "failed": 0, "p95_latency": 0}
 
-    headers = {
-        "Authorization": f"Bearer {token}" if token else "",
-        "Content-Type": "application/json"
-    }
+    latencies = []
 
-    for i, claim in enumerate(claims, 1):
-        try:
-            response = httpx.post(
+    with open(golden_path) as f:
+        for line in f:
+            claim = json.loads(line)
+
+            response = requests.post(
                 f"{api_url}/claims/triage",
                 json=claim,
-                headers=headers,
-                timeout=30
+                timeout=3.0
             )
-            latency_ms = int(response.elapsed.total_seconds() * 1000)
+
+            latencies.append(response.elapsed.total_seconds())
 
             if response.status_code == 200:
-                output = response.json()
-                assert "recommendation" in output
-                assert "severity" in output
-                results["successful"] += 1
-                results["latencies_ms"].append(latency_ms)
-                print(f" ✅ Claim {i}: {latency_ms}ms")
+                results["passed"] += 1
            else:
                results["failed"] += 1
-                results["errors"].append(f"HTTP {response.status_code}")
-                print(f" ❌ Claim {i}: HTTP {response.status_code}")
-        except Exception as e:
-            results["failed"] += 1
-            results["errors"].append(str(e))
-            print(f" ❌ Claim {i}: {type(e).__name__}")
 
-    results["p95_latency_ms"] = percentile(results["latencies_ms"], 0.95)
-    results["p99_latency_ms"] = percentile(results["latencies_ms"], 0.99)
+    latencies.sort()
+    results["p95_latency"] = latencies[int(len(latencies) * 0.95)]
 
-    results["ok_rate"] = results["successful"] / results["total_claims"]
+    print(f"Passed: {results['passed']}")
+    print(f"Failed: {results['failed']}")
+    print(f"P95 Latency: {results['p95_latency']:.2f}s")
+
+    assert results["failed"] == 0, "Some tests failed"
+    assert results["p95_latency"] < 2.5, "P95 latency exceeded"
 
     return results
 
 if __name__ == "__main__":
-    try:
-        report = run_eval()
-
-        print("\n" + "="*50)
-        print("📊 EVALUATION RESULTS")
-        print("="*50)
-        print(json.dumps(report, indent=2))
-
-        exit_code = 0
-        print("\n" + "="*50)
-        print("🚪 SLA GATES")
-        print("="*50)
-
-        ok_rate = report["ok_rate"]
-        p95 = report.get("p95_latency_ms", float("inf"))
-
-        if ok_rate >= 0.95:
-            print(f"✅ OK Rate: {ok_rate:.1%} >= 95%")
-        else:
-            print(f"❌ OK Rate: {ok_rate:.1%} < 95%")
-            exit_code = 1
-
-        if p95 <= 2500:
-            print(f"✅ P95 Latency: {p95}ms <= 2500ms")
-        else:
-            print(f"❌ P95 Latency: {p95}ms > 2500ms")
-            exit_code = 1
-
-        if exit_code == 0:
-            print("\n✅ ALL GATES PASSED")
-        else:
-            print("\n❌ GATES FAILED")
-
-        exit(exit_code)
-    except Exception as e:
-        print(f"\n❌ ERROR: {e}")
-        import traceback
-        traceback.print_exc()
-        exit(1)
+    run_eval_suite()
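One edge worth flagging: the new runner computes p95 as latencies[int(len(latencies) * 0.95)]. That works for the 5-claim golden set, but it raises IndexError on an empty golden set and drops the nearest-rank rounding of the removed percentile() helper. A defensive sketch (not part of the commit) that keeps the old behavior:

def p95(latencies):
    """Nearest-rank p95 that tolerates an empty sample."""
    if not latencies:
        return 0.0
    ordered = sorted(latencies)
    # Round to the nearest rank, clamped to the last index.
    k = min(int(round((len(ordered) - 1) * 0.95)), len(ordered) - 1)
    return ordered[k]

Also, response.elapsed in requests measures the time until the response headers arrive, not until the body is fully read, so the 2.5 s assertion gates time-to-first-byte rather than total request time.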
tests/test_eval_suite.py
CHANGED

@@ -1,34 +1,23 @@
-#!/usr/bin/env python3
-"""Test: Verify golden sets are valid"""
-
 import json
 from pathlib import Path
 
-GOLDEN_SETS_DIR = Path(__file__).parent.parent / "golden_sets"
-
-def test_golden_sets_exist():
-    """Verify golden set files exist"""
-    small = GOLDEN_SETS_DIR / "gcc_claims_small.jsonl"
-    medium = GOLDEN_SETS_DIR / "gcc_claims_medium.jsonl"
+def test_golden_set_valid():
+    """Test that golden set is valid JSONL"""
+    path = Path("golden_sets/gcc_claims_small.jsonl")
 
-    assert small.exists()
-    assert medium.exists()
+    with open(path) as f:
+        for line in f:
+            claim = json.loads(line)
+            assert "id" in claim
+            assert "type" in claim
+            assert "amount" in claim
 
-def test_golden_sets_valid_jsonl():
-    """Verify every golden set line parses as JSON"""
-    for jsonl_file in GOLDEN_SETS_DIR.glob("*.jsonl"):
-        lines = jsonl_file.read_text().splitlines()
-        for i, line in enumerate(lines, 1):
-            try:
-                json.loads(line)
-            except json.JSONDecodeError as e:
-                raise AssertionError(f"Line {i} in {jsonl_file.name} invalid: {e}")
-
-        print(f"✅ {jsonl_file.name}: {len(lines)} valid claims")
-
-if __name__ == "__main__":
-    test_golden_sets_exist()
-    test_golden_sets_valid_jsonl()
-    print("\n🎉 Eval suite tests passed!")
+def test_claims_have_valid_types():
+    """Test that claims have valid types"""
+    valid_types = {"motor", "health", "property", "cargo"}
+    path = Path("golden_sets/gcc_claims_small.jsonl")
+
+    with open(path) as f:
+        for line in f:
+            claim = json.loads(line)
+            assert claim["type"] in valid_types
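A portability note: runner.py and both new tests open golden_sets/gcc_claims_small.jsonl relative to the current working directory, so they only pass when invoked from the repository root (which the workflow's checkout layout provides). The removed code anchored paths to the source file instead; a sketch of the same idea for the new test module:

from pathlib import Path

# Resolve the golden set relative to this test file rather than the
# current working directory, so pytest passes from any directory.
GOLDEN_SET = Path(__file__).resolve().parent.parent / "golden_sets" / "gcc_claims_small.jsonl"

With that constant substituted for the two hard-coded path assignments, "pytest -q tests/" runs the suite cleanly from anywhere.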