BDR-AI commited on
Commit
8b2fc89
·
verified ·
1 Parent(s): a65c6b4

Initial commit for claims-gpt-eval-suite

Browse files
.github/workflows/run-eval.yml CHANGED
@@ -2,35 +2,20 @@ name: Run Evaluation Suite
2
 
3
  on:
4
  push:
5
- branches: [ main ]
6
- workflow_dispatch:
 
7
 
8
  jobs:
9
  eval:
10
  runs-on: ubuntu-latest
11
-
12
  steps:
13
- - uses: actions/checkout@v4
14
-
15
- - name: Set up Python
16
- uses: actions/setup-python@v5
 
 
17
  with:
18
- python-version: "3.11"
19
-
20
- - name: Install dependencies
21
- run: pip install httpx pytest
22
-
23
- - name: Validate Golden Sets
24
- run: python tests/test_eval_suite.py
25
-
26
- - name: Comment on PR
27
- if: github.event_name == 'pull_request'
28
- uses: actions/github-script@v7
29
- with:
30
- script: |
31
- github.rest.issues.createComment({
32
- issue_number: context.issue.number,
33
- owner: context.repo.owner,
34
- repo: context.repo.repo,
35
- body: '✅ Eval suite validation passed\n\n- Golden sets valid\n- Test runner ready'
36
- })
 
2
 
3
  on:
4
  push:
5
+ branches: [main]
6
+ schedule:
7
+ - cron: '0 2 * * *' # Nightly
+ workflow_dispatch: # keep manual runs possible (trigger was dropped in this change)
8
 
9
  jobs:
10
  eval:
11
  runs-on: ubuntu-latest
 
12
  steps:
13
+ - uses: actions/checkout@v4
14
+ - uses: actions/setup-python@v5
+ with:
+ python-version: "3.11"
15
+ - run: pip install -r requirements.txt
16
+ - run: python runner.py
17
+ - name: Upload report
18
+ uses: actions/upload-artifact@v3
19
  with:
20
+ name: eval-report
21
+ path: eval_report.json
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
golden_sets/gcc_claims_small.jsonl CHANGED
@@ -1,5 +1,5 @@
1
- {"claim_id":"CLM-1001","policy":"POL-7788","description":"Rear-end accident, minor injuries, police report","amount":3500,"type":"motor","severity":"low"}
2
- {"claim_id":"CLM-1002","policy":"POL-7789","description":"Total loss collision, vehicle destroyed","amount":25000,"type":"motor","severity":"high"}
3
- {"claim_id":"CLM-1003","policy":"POL-7790","description":"Glass coverage, windshield replacement","amount":450,"type":"motor","severity":"low"}
4
- {"claim_id":"CLM-1004","policy":"POL-7791","description":"RED FLAG: Duplicate claim 30 days later, same incident","amount":5000,"type":"motor","severity":"fraud"}
5
- {"claim_id":"CLM-1005","policy":"POL-7792","description":"Medical treatment, documentation complete","amount":8500,"type":"health","severity":"medium"}
 
1
+ {"id": "C001", "type": "motor", "amount": 5000, "description": "Minor fender bender"}
2
+ {"id": "C002", "type": "health", "amount": 2000, "description": "Standard medical claim"}
3
+ {"id": "C003", "type": "motor", "amount": 15000, "description": "Suspicious multiple claims same week"}
4
+ {"id": "C004", "type": "health", "amount": 50000, "description": "Major surgery claim"}
5
+ {"id": "C005", "type": "motor", "amount": 100000, "description": "Total loss suspected fraud"}
runner.py CHANGED
@@ -1,128 +1,45 @@
1
  #!/usr/bin/env python3
2
- """ClaimsGPT Evaluation Suite Runner"""
3
-
4
  import json
5
- import time
6
- import statistics
7
  from pathlib import Path
8
- from typing import Dict, List
9
-
10
- try:
11
- import httpx
12
- except ImportError:
13
- print("❌ httpx required. Run: pip install httpx")
14
- exit(1)
15
-
16
- API_ENDPOINT = "http://localhost:8000"
17
- GOLDEN_SET_FILE = Path(__file__).parent / "golden_sets" / "gcc_claims_small.jsonl"
18
 
19
- def percentile(values: List[int], p: float) -> int:
20
- if not values:
21
- return 0
22
- values = sorted(values)
23
- k = int(round((len(values) - 1) * p))
24
- return values[k]
25
-
26
- def run_eval(api_url: str = API_ENDPOINT, token: str = None) -> Dict:
27
  """Run evaluation against golden set"""
28
 
29
- lines = GOLDEN_SET_FILE.read_text().strip().splitlines()
30
- claims = [json.loads(line) for line in lines]
31
-
32
- results = {
33
- "total_claims": len(claims),
34
- "successful": 0,
35
- "failed": 0,
36
- "latencies_ms": [],
37
- "errors": []
38
- }
39
 
40
- headers = {
41
- "Authorization": f"Bearer {token}" if token else "",
42
- "Content-Type": "application/json"
43
- }
44
 
45
- print(f"🧪 Running evaluation on {len(claims)} claims...\n")
46
-
47
- for i, claim in enumerate(claims, 1):
48
- try:
49
- t0 = time.perf_counter()
50
- response = httpx.post(
51
  f"{api_url}/claims/triage",
52
  json=claim,
53
- headers=headers,
54
- timeout=30
55
  )
56
- latency_ms = int((time.perf_counter() - t0) * 1000)
 
57
 
58
  if response.status_code == 200:
59
- output = response.json()
60
- assert "recommendation" in output
61
- assert "severity" in output
62
- results["successful"] += 1
63
- results["latencies_ms"].append(latency_ms)
64
- print(f" ✅ Claim {i}: {latency_ms}ms")
65
  else:
66
  results["failed"] += 1
67
- results["errors"].append(f"HTTP {response.status_code}")
68
- print(f" ❌ Claim {i}: HTTP {response.status_code}")
69
- except Exception as e:
70
- results["failed"] += 1
71
- results["errors"].append(str(e))
72
- print(f" ❌ Claim {i}: {type(e).__name__}")
73
 
74
- if results["latencies_ms"]:
75
- results["p50_latency_ms"] = int(statistics.median(results["latencies_ms"]))
76
- results["p95_latency_ms"] = percentile(results["latencies_ms"], 0.95)
77
- results["p99_latency_ms"] = percentile(results["latencies_ms"], 0.99)
78
 
79
- results["ok_rate"] = (
80
- results["successful"] / results["total_claims"]
81
- if results["total_claims"] > 0
82
- else 0
83
- )
 
84
 
85
  return results
86
 
87
  if __name__ == "__main__":
88
- print("🚀 ClaimsGPT Evaluation Suite\n")
89
-
90
- try:
91
- report = run_eval()
92
-
93
- print("\n" + "="*50)
94
- print("📊 EVALUATION RESULTS")
95
- print("="*50)
96
- print(json.dumps(report, indent=2))
97
-
98
- exit_code = 0
99
- print("\n" + "="*50)
100
- print("🚪 SLA GATES")
101
- print("="*50)
102
-
103
- ok_rate = report["ok_rate"]
104
- p95 = report.get("p95_latency_ms", float("inf"))
105
-
106
- if ok_rate >= 0.95:
107
- print(f"✅ OK Rate: {ok_rate:.1%} >= 95%")
108
- else:
109
- print(f"❌ OK Rate: {ok_rate:.1%} < 95%")
110
- exit_code = 1
111
-
112
- if p95 <= 2500:
113
- print(f"✅ P95 Latency: {p95}ms <= 2500ms")
114
- else:
115
- print(f"❌ P95 Latency: {p95}ms > 2500ms")
116
- exit_code = 1
117
-
118
- if exit_code == 0:
119
- print("\n✅ ALL GATES PASSED")
120
- else:
121
- print("\n❌ GATES FAILED")
122
-
123
- exit(exit_code)
124
- except Exception as e:
125
- print(f"\n❌ ERROR: {e}")
126
- import traceback
127
- traceback.print_exc()
128
- exit(1)
 
1
  #!/usr/bin/env python3
2
+ """Evaluation suite runner"""
 
3
  import json
4
+ import requests
 
5
  from pathlib import Path
 
 
 
 
 
 
 
 
 
 
6
 
7
def run_eval_suite(api_url="http://localhost:8000"):
    """Run the evaluation suite against the small golden set.

    Posts each claim in ``golden_sets/gcc_claims_small.jsonl`` to
    ``{api_url}/claims/triage``, tallies pass/fail counts and records
    per-request latency, then enforces the SLA gates.

    Args:
        api_url: Base URL of the triage API under test.

    Returns:
        dict with keys ``passed``, ``failed`` and ``p95_latency`` (seconds).

    Raises:
        AssertionError: if any claim failed or p95 latency is >= 2.5 s.
            (Raised explicitly rather than via ``assert`` so the gates
            still fire when Python runs with ``-O``.)
    """
    golden_path = Path("golden_sets/gcc_claims_small.jsonl")
    results = {"passed": 0, "failed": 0, "p95_latency": 0}

    latencies = []

    with open(golden_path) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue  # tolerate blank lines in the JSONL file

            claim = json.loads(line)

            try:
                response = requests.post(
                    f"{api_url}/claims/triage",
                    json=claim,
                    timeout=3.0
                )
            except requests.RequestException:
                # A network error on one claim should count as a failure,
                # not abort the whole evaluation run.
                results["failed"] += 1
                continue

            latencies.append(response.elapsed.total_seconds())

            if response.status_code == 200:
                results["passed"] += 1
            else:
                results["failed"] += 1

    if latencies:
        latencies.sort()
        # Nearest-rank p95; clamp the index so a small sample can never
        # index one past the end of the list.
        idx = min(int(len(latencies) * 0.95), len(latencies) - 1)
        results["p95_latency"] = latencies[idx]

    print(f"Passed: {results['passed']}")
    print(f"Failed: {results['failed']}")
    print(f"P95 Latency: {results['p95_latency']:.2f}s")

    # SLA gates — raise explicitly so they are not stripped under -O.
    if results["failed"] != 0:
        raise AssertionError("Some tests failed")
    if not results["p95_latency"] < 2.5:
        raise AssertionError("P95 latency exceeded")

    return results

if __name__ == "__main__":
    run_eval_suite()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
tests/test_eval_suite.py CHANGED
@@ -1,34 +1,23 @@
1
- #!/usr/bin/env python3
2
- """Test: Verify golden sets are valid"""
3
-
4
  import json
5
  from pathlib import Path
6
 
7
- GOLDEN_SETS_DIR = Path(__file__).parent.parent / "golden_sets"
8
-
9
- def test_golden_sets_exist():
10
- """Verify golden set files exist"""
11
- small = GOLDEN_SETS_DIR / "gcc_claims_small.jsonl"
12
- medium = GOLDEN_SETS_DIR / "gcc_claims_medium.jsonl"
13
 
14
- assert small.exists(), "Small golden set missing"
15
- assert medium.exists(), "Medium golden set missing"
16
- print("✅ Golden sets exist")
 
 
 
17
 
18
- def test_golden_sets_valid_jsonl():
19
- """Verify files are valid JSONL"""
20
- for jsonl_file in GOLDEN_SETS_DIR.glob("*.jsonl"):
21
- lines = jsonl_file.read_text().strip().splitlines()
22
-
23
- for i, line in enumerate(lines, 1):
24
- try:
25
- json.loads(line)
26
- except json.JSONDecodeError as e:
27
- raise AssertionError(f"Line {i} in {jsonl_file.name} invalid: {e}")
28
-
29
- print(f"✅ {jsonl_file.name}: {len(lines)} valid claims")
30
-
31
- if __name__ == "__main__":
32
- test_golden_sets_exist()
33
- test_golden_sets_valid_jsonl()
34
- print("\n🎉 Eval suite tests passed!")
 
 
 
 
1
  import json
2
  from pathlib import Path
3
 
4
def test_golden_set_valid():
    """Every golden-set record must parse as JSON and carry the core fields."""
    path = Path("golden_sets/gcc_claims_small.jsonl")
    required_fields = ("id", "type", "amount")

    with path.open() as handle:
        for raw_line in handle:
            record = json.loads(raw_line)
            for field in required_fields:
                assert field in record
14
 
15
def test_claims_have_valid_types():
    """Every claim's type must belong to the supported claim categories."""
    allowed = {"motor", "health", "property", "cargo"}
    path = Path("golden_sets/gcc_claims_small.jsonl")

    with path.open() as handle:
        for raw_line in handle:
            record = json.loads(raw_line)
            assert record["type"] in allowed