frabbani committed on
Commit
8daa8bf
·
1 Parent(s): dc3f8a9

Fix fact extraction - pass raw data for simple tools.......

Browse files
.DS_Store CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
 
evaluation/__init__.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
Evaluation Framework for Pre-Visit Summary Agent

This package provides tools to evaluate the accuracy of the
pre-visit summary agent's data retrieval and reporting.

Modules:
    test_generator: Generates test cases from Synthea database
    expected_values: Computes ground truth values from database
    evaluator: Compares agent facts vs expected values
    metrics: Aggregates results and computes summary statistics
    facts_schema: Defines structured output format for agent
    run_evaluation: Main entry point for running evaluations

Usage:
    # Run direct evaluation (validates framework)
    python -m evaluation.run_evaluation --mode direct --patients 10

    # Run simulated evaluation (tests error detection)
    python -m evaluation.run_evaluation --mode simulated --error-rate 0.15
"""

from .test_generator import generate_all_test_cases, get_test_summary
from .expected_values import compute_expected_values
from .evaluator import evaluate_case, CaseEvaluation
from .metrics import aggregate_metrics, format_report, EvaluationMetrics

# Names re-exported as the package's public API.
__all__ = [
    "generate_all_test_cases",
    "get_test_summary",
    "compute_expected_values",
    "evaluate_case",
    "CaseEvaluation",
    "aggregate_metrics",
    "format_report",
    "EvaluationMetrics"
]
evaluation/create_test_db.py ADDED
@@ -0,0 +1,291 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Create a minimal test database for evaluation framework validation.
4
+
5
+ This creates a small SQLite database with sample patient data
6
+ that can be used to test the evaluation framework.
7
+ """
8
+
9
+ import sqlite3
10
+ import os
11
+ from datetime import datetime, timedelta
12
+ import random
13
+
14
+ DB_PATH = "data/fhir.db"
15
+
16
+
17
def create_test_database(db_path=None, seed=None):
    """Create a small SQLite test database with sample patient data.

    Builds the schema (patients, conditions, medications, observations,
    allergies, immunizations, procedures, encounters) and fills it with three
    synthetic patients plus related clinical records.

    Args:
        db_path: Destination SQLite file. Defaults to the module-level
            DB_PATH ("data/fhir.db").
        seed: Optional seed for the random number generator so that the
            generated vitals/labs are reproducible across runs. None
            (default) leaves the generator unseeded, as before.
    """
    if db_path is None:
        db_path = DB_PATH
    if seed is not None:
        random.seed(seed)

    # os.makedirs("") raises, so only create a parent directory when the
    # path actually has one (dirname is "" for a bare filename).
    parent_dir = os.path.dirname(db_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()

    # Create tables
    cursor.executescript("""
        -- Patients table
        CREATE TABLE IF NOT EXISTS patients (
            id TEXT PRIMARY KEY,
            given_name TEXT,
            family_name TEXT,
            birth_date TEXT,
            gender TEXT,
            marital_status TEXT
        );

        -- Conditions table
        CREATE TABLE IF NOT EXISTS conditions (
            id TEXT PRIMARY KEY,
            patient_id TEXT,
            code TEXT,
            display TEXT,
            clinical_status TEXT,
            onset_date TEXT,
            abatement_date TEXT
        );

        -- Medications table
        CREATE TABLE IF NOT EXISTS medications (
            id TEXT PRIMARY KEY,
            patient_id TEXT,
            code TEXT,
            display TEXT,
            status TEXT,
            start_date TEXT
        );

        -- Observations table
        CREATE TABLE IF NOT EXISTS observations (
            id TEXT PRIMARY KEY,
            patient_id TEXT,
            code TEXT,
            display TEXT,
            value_quantity REAL,
            unit TEXT,
            effective_date TEXT,
            category TEXT
        );

        -- Allergies table
        CREATE TABLE IF NOT EXISTS allergies (
            id TEXT PRIMARY KEY,
            patient_id TEXT,
            substance TEXT,
            reaction_display TEXT,
            criticality TEXT,
            category TEXT
        );

        -- Immunizations table
        CREATE TABLE IF NOT EXISTS immunizations (
            id TEXT PRIMARY KEY,
            patient_id TEXT,
            vaccine_code TEXT,
            vaccine_display TEXT,
            status TEXT,
            occurrence_date TEXT
        );

        -- Procedures table
        CREATE TABLE IF NOT EXISTS procedures (
            id TEXT PRIMARY KEY,
            patient_id TEXT,
            code TEXT,
            display TEXT,
            status TEXT,
            performed_date TEXT
        );

        -- Encounters table
        CREATE TABLE IF NOT EXISTS encounters (
            id TEXT PRIMARY KEY,
            patient_id TEXT,
            status TEXT,
            class_code TEXT,
            class_display TEXT,
            type_code TEXT,
            type_display TEXT,
            reason_code TEXT,
            reason_display TEXT,
            period_start TEXT,
            period_end TEXT
        );
    """)

    # Create test patients
    patients = [
        ("patient-001", "John", "Smith", "1965-03-15", "male"),
        ("patient-002", "Mary", "Johnson", "1978-07-22", "female"),
        ("patient-003", "Robert", "Williams", "1952-11-08", "male"),
    ]

    for pid, given, family, dob, gender in patients:
        cursor.execute("""
            INSERT OR REPLACE INTO patients (id, given_name, family_name, birth_date, gender)
            VALUES (?, ?, ?, ?, ?)
        """, (pid, given, family, dob, gender))

    # Create conditions
    conditions = [
        ("patient-001", "44054006", "Type 2 Diabetes Mellitus", "active", "2015-06-10"),
        ("patient-001", "38341003", "Essential Hypertension", "active", "2018-02-15"),
        ("patient-002", "195967001", "Asthma", "active", "2010-04-20"),
        ("patient-002", "73211009", "Type 2 Diabetes Mellitus", "active", "2020-01-10"),
        ("patient-003", "38341003", "Essential Hypertension", "active", "2005-08-12"),
        ("patient-003", "13644009", "Hypercholesterolemia", "active", "2010-03-25"),
    ]

    for i, (pid, code, display, status, onset) in enumerate(conditions):
        cursor.execute("""
            INSERT OR REPLACE INTO conditions (id, patient_id, code, display, clinical_status, onset_date)
            VALUES (?, ?, ?, ?, ?, ?)
        """, (f"cond-{i+1:03d}", pid, code, display, status, onset))

    # Create medications
    medications = [
        ("patient-001", "860975", "Metformin 500 MG Oral Tablet", "active", "2015-06-15"),
        ("patient-001", "314076", "Lisinopril 10 MG Oral Tablet", "active", "2018-02-20"),
        ("patient-002", "895994", "Albuterol 90 MCG Inhaler", "active", "2010-05-01"),
        ("patient-002", "860975", "Metformin 500 MG Oral Tablet", "active", "2020-01-15"),
        ("patient-003", "314076", "Lisinopril 20 MG Oral Tablet", "active", "2005-08-20"),
        ("patient-003", "316672", "Atorvastatin 20 MG Oral Tablet", "active", "2010-04-01"),
    ]

    for i, (pid, code, display, status, start) in enumerate(medications):
        cursor.execute("""
            INSERT OR REPLACE INTO medications (id, patient_id, code, display, status, start_date)
            VALUES (?, ?, ?, ?, ?, ?)
        """, (f"med-{i+1:03d}", pid, code, display, status, start))

    # Create observations (vitals): per patient, 6 visit dates over the last
    # 30 days with systolic/diastolic/heart-rate readings, plus 4 quarterly
    # A1c labs -> 22 observations per patient, 66 total.
    base_date = datetime.now()

    for pid in ["patient-001", "patient-002", "patient-003"]:
        obs_id = 1

        # Blood pressure readings over last 30 days
        for days_ago in range(0, 30, 5):
            date = (base_date - timedelta(days=days_ago)).strftime("%Y-%m-%d")
            systolic = random.randint(120, 145)
            diastolic = random.randint(75, 95)

            cursor.execute("""
                INSERT OR REPLACE INTO observations
                (id, patient_id, code, display, value_quantity, unit, effective_date, category)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?)
            """, (f"obs-{pid}-{obs_id}", pid, "8480-6", "Systolic Blood Pressure",
                  systolic, "mmHg", date, "vital-signs"))
            obs_id += 1

            cursor.execute("""
                INSERT OR REPLACE INTO observations
                (id, patient_id, code, display, value_quantity, unit, effective_date, category)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?)
            """, (f"obs-{pid}-{obs_id}", pid, "8462-4", "Diastolic Blood Pressure",
                  diastolic, "mmHg", date, "vital-signs"))
            obs_id += 1

            # Heart rate
            hr = random.randint(65, 85)
            cursor.execute("""
                INSERT OR REPLACE INTO observations
                (id, patient_id, code, display, value_quantity, unit, effective_date, category)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?)
            """, (f"obs-{pid}-{obs_id}", pid, "8867-4", "Heart Rate",
                  hr, "/min", date, "vital-signs"))
            obs_id += 1

        # A1c readings (quarterly)
        for months_ago in [0, 3, 6, 9]:
            date = (base_date - timedelta(days=months_ago*30)).strftime("%Y-%m-%d")
            a1c = round(random.uniform(6.0, 8.5), 1)

            cursor.execute("""
                INSERT OR REPLACE INTO observations
                (id, patient_id, code, display, value_quantity, unit, effective_date, category)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?)
            """, (f"obs-{pid}-{obs_id}", pid, "4548-4", "Hemoglobin A1c",
                  a1c, "%", date, "laboratory"))
            obs_id += 1

    # Create allergies
    allergies = [
        ("patient-001", "Penicillin", "Hives", "high", "medication"),
        ("patient-002", "Peanuts", "Anaphylaxis", "high", "food"),
        ("patient-002", "Latex", "Rash", "low", "environment"),
        ("patient-003", "Sulfa drugs", "Rash", "moderate", "medication"),
    ]

    for i, (pid, substance, reaction, criticality, category) in enumerate(allergies):
        cursor.execute("""
            INSERT OR REPLACE INTO allergies
            (id, patient_id, substance, reaction_display, criticality, category)
            VALUES (?, ?, ?, ?, ?, ?)
        """, (f"allergy-{i+1:03d}", pid, substance, reaction, criticality, category))

    # Create immunizations
    immunizations = [
        ("patient-001", "140", "Influenza Vaccine", "completed", "2024-10-15"),
        ("patient-001", "207", "COVID-19 Vaccine", "completed", "2024-01-20"),
        ("patient-002", "140", "Influenza Vaccine", "completed", "2024-11-01"),
        ("patient-002", "113", "Tdap Vaccine", "completed", "2022-05-10"),
        ("patient-003", "140", "Influenza Vaccine", "completed", "2024-09-20"),
        ("patient-003", "33", "Pneumococcal Vaccine", "completed", "2023-03-15"),
    ]

    for i, (pid, code, display, status, date) in enumerate(immunizations):
        cursor.execute("""
            INSERT OR REPLACE INTO immunizations
            (id, patient_id, vaccine_code, vaccine_display, status, occurrence_date)
            VALUES (?, ?, ?, ?, ?, ?)
        """, (f"imm-{i+1:03d}", pid, code, display, status, date))

    # Create procedures
    procedures = [
        ("patient-001", "73761001", "Colonoscopy", "completed", "2023-06-15"),
        ("patient-002", "80146002", "Appendectomy", "completed", "2015-08-20"),
        ("patient-003", "232717009", "Coronary Angioplasty", "completed", "2020-02-10"),
        ("patient-003", "73761001", "Colonoscopy", "completed", "2022-04-05"),
    ]

    for i, (pid, code, display, status, date) in enumerate(procedures):
        cursor.execute("""
            INSERT OR REPLACE INTO procedures
            (id, patient_id, code, display, status, performed_date)
            VALUES (?, ?, ?, ?, ?, ?)
        """, (f"proc-{i+1:03d}", pid, code, display, status, date))

    # Create encounters (5 per patient, roughly every 2 months)
    for pid in ["patient-001", "patient-002", "patient-003"]:
        for i in range(5):
            days_ago = i * 60  # Every ~2 months
            start = (base_date - timedelta(days=days_ago)).strftime("%Y-%m-%d")
            end = start

            cursor.execute("""
                INSERT OR REPLACE INTO encounters
                (id, patient_id, status, class_code, class_display, type_code, type_display,
                 reason_code, reason_display, period_start, period_end)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """, (f"enc-{pid}-{i+1}", pid, "finished", "AMB", "ambulatory",
                  "185349003", "Office Visit", "185349003", "Routine checkup",
                  start, end))

    conn.commit()
    conn.close()

    print(f"Test database created at {db_path}")
    print("Contains:")
    print("  - 3 patients")
    print("  - 6 conditions")
    print("  - 6 medications")
    # Was advertised as "~90" but the loops above insert exactly 66.
    print("  - 66 observations (vitals + labs)")
    print("  - 4 allergies")
    print("  - 6 immunizations")
    print("  - 4 procedures")
    print("  - 15 encounters")
+
289
+
290
+ if __name__ == "__main__":
291
+ create_test_database()
evaluation/evaluator.py ADDED
@@ -0,0 +1,438 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Evaluator
4
+
5
+ Compares agent-reported facts against expected values from database.
6
+ Computes accuracy metrics for each comparison.
7
+ """
8
+
9
+ from typing import Dict, List, Any, Tuple, Optional
10
+ from dataclasses import dataclass, field
11
+ import math
12
+
13
+
14
# Tolerances for numerical comparisons (absolute differences).
# compare_numeric() treats a diff within the tolerance as a match, and a diff
# beyond 3x the tolerance as a "hallucination" rather than a "mismatch".
TOLERANCES = {
    "vital_value": 0.5,   # BP, heart rate, etc.
    "lab_value": 0.5,     # Lab results
    "average": 0.5,       # Computed averages
    "percentage": 1.0,    # Percentage values
    "count": 0,           # Counts must be exact
}
22
+
23
+
24
@dataclass
class ComparisonResult:
    """Result of comparing a single value.

    One record per compared field; `match` is True when expected and actual
    agreed (possibly within a numeric tolerance).
    """
    field_name: str  # name of the field that was compared
    expected: Any    # ground-truth value (None when the agent invented a value)
    actual: Any      # agent-reported value (None when the agent omitted it)
    match: bool      # True when the values agreed
    # "hallucination": reported but not in DB; "omission": in DB but not
    # reported; "mismatch": both present but different; "tolerance": reserved.
    error_type: Optional[str] = None  # "hallucination", "omission", "mismatch", "tolerance"
    error_detail: Optional[str] = None  # human-readable explanation of the failure
33
+
34
+
35
@dataclass
class CaseEvaluation:
    """Evaluation result for a single test case.

    The summary counters are tallied by the evaluate_* functions as they
    append to `comparisons`.
    """
    case_id: str     # identifier of the evaluated test case
    query_type: str  # e.g. "vital_trend", "medication_list"
    success: bool    # overall pass/fail (typically accuracy() >= 0.8)
    comparisons: List[ComparisonResult] = field(default_factory=list)  # per-field results

    # Summary stats
    total_fields: int = 0    # number of fields compared
    correct_fields: int = 0  # number of fields that matched
    hallucinations: int = 0  # values the agent reported that are not in the DB
    omissions: int = 0       # expected values the agent failed to report
    mismatches: int = 0      # values present on both sides but disagreeing

    def accuracy(self) -> float:
        """Fraction of compared fields that matched; 0.0 if nothing was compared."""
        if self.total_fields == 0:
            return 0.0
        return self.correct_fields / self.total_fields
54
+
55
+
56
def values_match(expected: Any, actual: Any, tolerance: float = 0) -> bool:
    """Return True when *expected* and *actual* agree.

    Numbers agree within *tolerance*; strings agree case-insensitively after
    trimming; lists agree as order-independent, case-insensitive sets of
    their string forms. A pair of Nones matches, a single None never does;
    everything else falls back to plain equality.
    """
    if expected is None or actual is None:
        # True only when both are None.
        return expected is actual

    pair = (expected, actual)

    if all(isinstance(v, (int, float)) for v in pair):
        return abs(expected - actual) <= tolerance

    if all(isinstance(v, str) for v in pair):
        return expected.lower().strip() == actual.lower().strip()

    if all(isinstance(v, list) for v in pair):
        def normalized(items):
            return {str(v).lower() for v in items}
        return normalized(expected) == normalized(actual)

    return expected == actual
77
+
78
+
79
def compare_numeric(field_name: str, expected: float, actual: float,
                    tolerance: float) -> ComparisonResult:
    """Compare two numeric values within *tolerance*.

    A missing actual value is recorded as an omission. A difference beyond
    three tolerances is classified as a hallucination rather than a plain
    mismatch.
    """
    if actual is None:
        return ComparisonResult(
            field_name=field_name,
            expected=expected,
            actual=actual,
            match=False,
            error_type="omission",
            error_detail=f"Expected {expected}, got nothing",
        )

    diff = abs(expected - actual)

    if diff <= tolerance:
        return ComparisonResult(
            field_name=field_name,
            expected=expected,
            actual=actual,
            match=True,
        )

    severity = "mismatch" if diff <= tolerance * 3 else "hallucination"
    return ComparisonResult(
        field_name=field_name,
        expected=expected,
        actual=actual,
        match=False,
        error_type=severity,
        error_detail=f"Expected {expected}, got {actual} (diff: {diff:.1f})",
    )
109
+
110
+
111
def compare_list_items(field_name: str, expected_items: List[str],
                       actual_items: List[str]) -> Tuple[List[ComparisonResult], int, int]:
    """
    Compare two lists of item names (e.g., medication names) as sets.

    Matching is case-insensitive on trimmed names. Returns the per-item
    comparison records plus the hallucination and omission counts.
    """
    want = {s.lower().strip() for s in expected_items}
    got = {s.lower().strip() for s in actual_items}

    results = []

    # Items both expected and reported.
    for name in want & got:
        results.append(ComparisonResult(
            field_name=f"{field_name}_item",
            expected=name,
            actual=name,
            match=True
        ))

    # Expected but never reported.
    missing = want - got
    for name in missing:
        results.append(ComparisonResult(
            field_name=f"{field_name}_item",
            expected=name,
            actual=None,
            match=False,
            error_type="omission",
            error_detail=f"Missing: {name}"
        ))

    # Reported but absent from the database.
    invented = got - want
    for name in invented:
        results.append(ComparisonResult(
            field_name=f"{field_name}_item",
            expected=None,
            actual=name,
            match=False,
            error_type="hallucination",
            error_detail=f"Not in database: {name}"
        ))

    return results, len(invented), len(missing)
163
+
164
+
165
def evaluate_vital_trend(expected: Dict, actual_facts: Dict) -> CaseEvaluation:
    """Evaluate a vital-trend response against the expected values.

    For every vital label, checks min/max/avg/latest/count (counts exact,
    values within the vital tolerance) and the date-range fields, then marks
    the case successful at >= 80% field accuracy.
    """
    evaluation = CaseEvaluation(
        case_id="",
        query_type="vital_trend",
        success=True
    )

    if "metrics" not in expected:
        evaluation.success = False
        return evaluation

    def record(comp):
        # Append a numeric comparison and bump the matching counter.
        evaluation.comparisons.append(comp)
        evaluation.total_fields += 1
        if comp.match:
            evaluation.correct_fields += 1
        elif comp.error_type == "hallucination":
            evaluation.hallucinations += 1
        elif comp.error_type == "omission":
            evaluation.omissions += 1
        else:
            evaluation.mismatches += 1

    reported = actual_facts.get("metrics", {})

    for label, expected_metrics in expected["metrics"].items():
        actual_metrics = reported.get(label, {})

        # Summary statistics for this vital.
        for metric_name in ("min", "max", "avg", "latest", "count"):
            if metric_name not in expected_metrics:
                continue
            tol = TOLERANCES["count"] if metric_name == "count" else TOLERANCES["vital_value"]
            record(compare_numeric(
                f"{label}_{metric_name}",
                expected_metrics[metric_name],
                actual_metrics.get(metric_name),
                tol,
            ))

        # Date range endpoints (compared as strings).
        for date_field in ("earliest_date", "latest_date"):
            if date_field not in expected_metrics:
                continue
            exp_date = expected_metrics[date_field]
            act_date = actual_metrics.get(date_field)
            ok = values_match(exp_date, act_date)
            evaluation.comparisons.append(ComparisonResult(
                field_name=f"{label}_{date_field}",
                expected=exp_date,
                actual=act_date,
                match=ok,
                error_type=None if ok else "mismatch"
            ))
            evaluation.total_fields += 1
            if ok:
                evaluation.correct_fields += 1
            else:
                evaluation.mismatches += 1

    evaluation.success = evaluation.accuracy() >= 0.8  # 80% threshold
    return evaluation
228
+
229
+
230
def evaluate_list_query(expected: Dict, actual_facts: Dict,
                        items_key: str, names_key: str) -> CaseEvaluation:
    """
    Evaluate list-based queries (medications, conditions, allergies, etc.).

    Compares the reported count (exact) and the reported item names
    (set-based, case-insensitive) against the expected values.
    """
    evaluation = CaseEvaluation(
        case_id="",
        query_type=expected["query_type"],
        success=True
    )

    # The item count must match exactly.
    count_cmp = compare_numeric(
        "count",
        expected.get("count", 0),
        actual_facts.get("count", 0),
        TOLERANCES["count"],
    )
    evaluation.comparisons.append(count_cmp)
    evaluation.total_fields += 1
    evaluation.correct_fields += int(count_cmp.match)

    # Item names: matches, omissions, and hallucinations.
    per_item, invented, missing = compare_list_items(
        items_key,
        expected.get(names_key, []),
        actual_facts.get(names_key, []),
    )
    evaluation.comparisons.extend(per_item)
    evaluation.total_fields += len(per_item)
    evaluation.correct_fields += sum(c.match for c in per_item)
    evaluation.hallucinations += invented
    evaluation.omissions += missing

    evaluation.success = evaluation.accuracy() >= 0.8
    return evaluation
267
+
268
+
269
def evaluate_medication_list(expected: Dict, actual_facts: Dict) -> CaseEvaluation:
    """Evaluate a medication-list response."""
    return evaluate_list_query(expected, actual_facts,
                               items_key="medications", names_key="medication_names")
272
+
273
+
274
def evaluate_condition_list(expected: Dict, actual_facts: Dict) -> CaseEvaluation:
    """Evaluate a condition-list response."""
    return evaluate_list_query(expected, actual_facts,
                               items_key="conditions", names_key="condition_names")
277
+
278
+
279
def evaluate_allergy_list(expected: Dict, actual_facts: Dict) -> CaseEvaluation:
    """Evaluate an allergy-list response."""
    return evaluate_list_query(expected, actual_facts,
                               items_key="allergies", names_key="substances")
282
+
283
+
284
def evaluate_immunization_list(expected: Dict, actual_facts: Dict) -> CaseEvaluation:
    """Evaluate an immunization-list response."""
    return evaluate_list_query(expected, actual_facts,
                               items_key="immunizations", names_key="vaccine_names")
287
+
288
+
289
def evaluate_procedure_list(expected: Dict, actual_facts: Dict) -> CaseEvaluation:
    """Evaluate a procedure-list response."""
    return evaluate_list_query(expected, actual_facts,
                               items_key="procedures", names_key="procedure_names")
292
+
293
+
294
def evaluate_encounter_list(expected: Dict, actual_facts: Dict) -> CaseEvaluation:
    """Evaluate an encounter-list response.

    Only the count is checked; the expected count is capped at the query's
    result limit (default 5) before comparison.
    """
    evaluation = CaseEvaluation(
        case_id="",
        query_type="encounter_list",
        success=True
    )

    limit = expected.get("limit", 5)
    capped_expected = min(expected.get("count", 0), limit)

    count_cmp = compare_numeric(
        "count",
        capped_expected,
        actual_facts.get("count", 0),
        TOLERANCES["count"],
    )
    evaluation.comparisons.append(count_cmp)
    evaluation.total_fields += 1
    if count_cmp.match:
        evaluation.correct_fields += 1

    # Success hinges entirely on the single count comparison.
    evaluation.success = count_cmp.match
    return evaluation
316
+
317
+
318
def evaluate_lab_trend(expected: Dict, actual_facts: Dict) -> CaseEvaluation:
    """Evaluate a lab-trend response against expected summary statistics."""
    evaluation = CaseEvaluation(
        case_id="",
        query_type="lab_trend",
        success=True
    )

    if "metrics" not in expected:
        evaluation.success = False
        return evaluation

    exp_metrics = expected["metrics"]
    act_metrics = actual_facts.get("metrics", {})

    for metric_name in ("min", "max", "avg", "latest", "count"):
        if metric_name not in exp_metrics:
            continue

        # Counts must be exact; lab values allow a small tolerance.
        tol = TOLERANCES["count"] if metric_name == "count" else TOLERANCES["lab_value"]
        comparison = compare_numeric(
            metric_name,
            exp_metrics[metric_name],
            act_metrics.get(metric_name),
            tol,
        )
        evaluation.comparisons.append(comparison)
        evaluation.total_fields += 1

        if comparison.match:
            evaluation.correct_fields += 1
        elif comparison.error_type == "hallucination":
            evaluation.hallucinations += 1
        elif comparison.error_type == "omission":
            evaluation.omissions += 1
        else:
            evaluation.mismatches += 1

    evaluation.success = evaluation.accuracy() >= 0.8
    return evaluation
355
+
356
+
357
def evaluate_case(test_case: Dict, expected: Dict, actual_facts: Dict) -> CaseEvaluation:
    """
    Evaluate a single test case.

    Args:
        test_case: The test case definition (needs "case_id" and "query_type")
        expected: Expected values computed from database
        actual_facts: Facts reported by the agent

    Returns:
        CaseEvaluation with detailed comparison results; an unknown query
        type yields a failed evaluation with no comparisons.
    """
    dispatch = {
        "vital_trend": evaluate_vital_trend,
        "medication_list": evaluate_medication_list,
        "condition_list": evaluate_condition_list,
        "allergy_list": evaluate_allergy_list,
        "immunization_list": evaluate_immunization_list,
        "procedure_list": evaluate_procedure_list,
        "encounter_list": evaluate_encounter_list,
        "lab_trend": evaluate_lab_trend,
    }

    query_type = test_case["query_type"]
    handler = dispatch.get(query_type)

    if handler is None:
        return CaseEvaluation(
            case_id=test_case["case_id"],
            query_type=query_type,
            success=False
        )

    evaluation = handler(expected, actual_facts)
    evaluation.case_id = test_case["case_id"]
    return evaluation
394
+
395
+
396
if __name__ == "__main__":
    # Smoke-test the evaluator with a hand-built vital-trend case: the actual
    # facts match the expected values except for a slightly different average
    # (135.0 vs 134.8), which stays within the vital tolerance.
    expected = {
        "query_type": "vital_trend",
        "metrics": {
            "systolic": {
                "min": 128.0,
                "max": 142.0,
                "avg": 134.8,
                "count": 5,
                "earliest_date": "2026-01-22",
                "latest_date": "2026-01-27"
            }
        }
    }

    actual = {
        "metrics": {
            "systolic": {
                "min": 128.0,
                "max": 142.0,
                "avg": 135.0,  # Slightly off
                "count": 5,
                "earliest_date": "2026-01-22",
                "latest_date": "2026-01-27"
            }
        }
    }

    test_case = {"case_id": "test_1", "query_type": "vital_trend"}

    result = evaluate_case(test_case, expected, actual)

    # Print a human-readable summary followed by each field comparison.
    print(f"Case: {result.case_id}")
    print(f"Success: {result.success}")
    print(f"Accuracy: {result.accuracy():.1%}")
    print(f"Fields: {result.correct_fields}/{result.total_fields}")
    print(f"Hallucinations: {result.hallucinations}")
    print(f"Omissions: {result.omissions}")
    print("\nComparisons:")
    for c in result.comparisons:
        status = "✓" if c.match else "✗"
        print(f"  {status} {c.field_name}: expected={c.expected}, actual={c.actual}")
evaluation/expected_values.py ADDED
@@ -0,0 +1,386 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Expected Values Calculator
4
+
5
+ Computes ground truth values directly from the database for each test case type.
6
+ These are the values we expect the LLM to report.
7
+ """
8
+
9
+ import sqlite3
10
+ from datetime import datetime, timedelta
11
+ from typing import Dict, List, Any, Optional
12
+ import os
13
+ import statistics
14
+
15
+ DB_PATH = os.getenv("DB_PATH", "data/fhir.db")
16
+
17
+
18
def get_db():
    """Open a connection to the evaluation database with name-addressable rows."""
    connection = sqlite3.connect(DB_PATH)
    connection.row_factory = sqlite3.Row  # rows support access by column name
    return connection
23
+
24
+
25
def compute_vital_trend_expected(patient_id: str, vital_type: str, codes: List[str],
                                 labels: List[str], days: int = 30) -> Dict[str, Any]:
    """
    Compute ground-truth summary stats for a vital-trend query.

    For each (code, label) pair, summarizes the patient's observations
    recorded within the last *days* days: min/max/avg/latest, count, date
    range, and the raw value/date series. Labels with no values are omitted.
    """
    conn = get_db()
    try:
        cutoff = (datetime.now() - timedelta(days=days)).strftime("%Y-%m-%d")

        expected = {
            "query_type": "vital_trend",
            "vital_type": vital_type,
            "days": days,
            "metrics": {}
        }

        for code, label in zip(codes, labels):
            rows = conn.execute("""
                SELECT value_quantity, effective_date
                FROM observations
                WHERE patient_id = ? AND code = ? AND effective_date >= ?
                ORDER BY effective_date ASC
            """, (patient_id, code, cutoff)).fetchall()

            values = [row["value_quantity"] for row in rows
                      if row["value_quantity"] is not None]
            dates = [row["effective_date"][:10] for row in rows]

            if not values:
                continue

            expected["metrics"][label] = {
                "min": round(min(values), 1),
                "max": round(max(values), 1),
                "avg": round(statistics.mean(values), 1),
                "count": len(values),
                "latest": round(values[-1], 1),
                "earliest_date": dates[0] if dates else None,
                "latest_date": dates[-1] if dates else None,
                "all_values": [round(v, 1) for v in values],
                "all_dates": dates
            }

        return expected
    finally:
        conn.close()
71
+
72
+
73
def compute_medication_expected(patient_id: str, status: Optional[str] = None) -> Dict[str, Any]:
    """
    Compute expected values for medication queries.

    Args:
        patient_id: Patient whose medications to fetch.
        status: Optional status filter (e.g. "active"); None returns all.

    Returns:
        Dict with the query type, applied filter, count, full medication
        records, and the display names.
    """
    conn = get_db()
    try:
        # Build a single parameterized query instead of duplicating the SQL
        # across the filtered/unfiltered branches.
        query = """
            SELECT code, display, status, start_date
            FROM medications
            WHERE patient_id = ?
        """
        params = [patient_id]
        if status:
            query += " AND status = ?"
            params.append(status)
        query += " ORDER BY start_date DESC"

        cursor = conn.execute(query, params)

        medications = [{
            "code": row["code"],
            "display": row["display"],
            "status": row["status"],
            "start_date": row["start_date"][:10] if row["start_date"] else None
        } for row in cursor.fetchall()]

        return {
            "query_type": "medication_list",
            "status_filter": status,
            "count": len(medications),
            "medications": medications,
            "medication_names": [m["display"] for m in medications]
        }
    finally:
        conn.close()
114
+
115
+
116
def compute_condition_expected(patient_id: str) -> Dict[str, Any]:
    """
    Compute ground-truth values for condition-list queries.
    """
    conn = get_db()
    try:
        rows = conn.execute("""
            SELECT code, display, clinical_status, onset_date
            FROM conditions
            WHERE patient_id = ?
            ORDER BY onset_date DESC
        """, (patient_id,)).fetchall()

        conditions = [{
            "code": row["code"],
            "display": row["display"],
            "clinical_status": row["clinical_status"],
            "onset_date": row["onset_date"][:10] if row["onset_date"] else None
        } for row in rows]

        return {
            "query_type": "condition_list",
            "count": len(conditions),
            "conditions": conditions,
            "condition_names": [c["display"] for c in conditions]
        }
    finally:
        conn.close()
146
+
147
+
148
def compute_allergy_expected(patient_id: str) -> Dict[str, Any]:
    """Compute ground-truth values for allergy queries.

    Returns:
        Dict with query_type, count, full allergy records, and the bare
        substance names for quick comparisons.
    """
    conn = get_db()
    try:
        rows = conn.execute(
            """
            SELECT substance, reaction_display, criticality, category
            FROM allergies
            WHERE patient_id = ?
            """,
            (patient_id,),
        ).fetchall()

        allergies = [
            {
                "substance": row["substance"],
                "reaction": row["reaction_display"],
                "criticality": row["criticality"],
                "category": row["category"],
            }
            for row in rows
        ]

        return {
            "query_type": "allergy_list",
            "count": len(allergies),
            "allergies": allergies,
            "substances": [a["substance"] for a in allergies],
        }
    finally:
        conn.close()
177
+
178
+
179
def compute_immunization_expected(patient_id: str) -> Dict[str, Any]:
    """Compute ground-truth values for immunization queries.

    Returns:
        Dict with query_type, count, full immunization records, and the
        vaccine display names alone.
    """
    conn = get_db()
    try:
        rows = conn.execute(
            """
            SELECT vaccine_code, vaccine_display, status, occurrence_date
            FROM immunizations
            WHERE patient_id = ?
            ORDER BY occurrence_date DESC
            """,
            (patient_id,),
        ).fetchall()

        immunizations = [
            {
                "vaccine_code": row["vaccine_code"],
                "vaccine_display": row["vaccine_display"],
                "status": row["status"],
                "occurrence_date": row["occurrence_date"][:10] if row["occurrence_date"] else None,
            }
            for row in rows
        ]

        return {
            "query_type": "immunization_list",
            "count": len(immunizations),
            "immunizations": immunizations,
            "vaccine_names": [i["vaccine_display"] for i in immunizations],
        }
    finally:
        conn.close()
209
+
210
+
211
def compute_procedure_expected(patient_id: str) -> Dict[str, Any]:
    """Compute ground-truth values for procedure queries.

    Returns:
        Dict with query_type, count, full procedure records, and the
        display names alone.
    """
    conn = get_db()
    try:
        rows = conn.execute(
            """
            SELECT code, display, status, performed_date
            FROM procedures
            WHERE patient_id = ?
            ORDER BY performed_date DESC
            """,
            (patient_id,),
        ).fetchall()

        procedures = [
            {
                "code": row["code"],
                "display": row["display"],
                "status": row["status"],
                "performed_date": row["performed_date"][:10] if row["performed_date"] else None,
            }
            for row in rows
        ]

        return {
            "query_type": "procedure_list",
            "count": len(procedures),
            "procedures": procedures,
            "procedure_names": [p["display"] for p in procedures],
        }
    finally:
        conn.close()
241
+
242
+
243
def compute_encounter_expected(patient_id: str, limit: int = 5) -> Dict[str, Any]:
    """Compute ground-truth values for recent-encounter queries.

    Args:
        patient_id: Patient whose encounters are listed.
        limit: Maximum number of most-recent encounters to return.
    """
    conn = get_db()
    try:
        rows = conn.execute(
            """
            SELECT type_display, reason_display, period_start, period_end, class_display
            FROM encounters
            WHERE patient_id = ?
            ORDER BY period_start DESC
            LIMIT ?
            """,
            (patient_id, limit),
        ).fetchall()

        encounters = [
            {
                "type": row["type_display"],
                "reason": row["reason_display"],
                "class": row["class_display"],
                "start_date": row["period_start"][:10] if row["period_start"] else None,
                "end_date": row["period_end"][:10] if row["period_end"] else None,
            }
            for row in rows
        ]

        return {
            "query_type": "encounter_list",
            "count": len(encounters),
            "limit": limit,
            "encounters": encounters,
        }
    finally:
        conn.close()
275
+
276
+
277
def compute_lab_trend_expected(patient_id: str, lab_type: str, code: str,
                               periods: int = 4) -> Dict[str, Any]:
    """Compute ground-truth values for lab trend queries.

    Pulls the most recent `periods` observations for the given lab code
    (newest first) and summarizes them.
    """
    conn = get_db()
    try:
        rows = conn.execute(
            """
            SELECT value_quantity, effective_date, unit
            FROM observations
            WHERE patient_id = ? AND code = ?
            ORDER BY effective_date DESC
            LIMIT ?
            """,
            (patient_id, code, periods),
        ).fetchall()

        values = [r["value_quantity"] for r in rows if r["value_quantity"] is not None]
        dates = [r["effective_date"][:10] for r in rows]

        result = {
            "query_type": "lab_trend",
            "lab_type": lab_type,
            "code": code,
            "unit": rows[0]["unit"] if rows else None,
            "count": len(values),
        }

        if values:
            # Rows are sorted newest-first, so index 0 is the latest reading.
            result["metrics"] = {
                "min": round(min(values), 1),
                "max": round(max(values), 1),
                "avg": round(statistics.mean(values), 1),
                "latest": round(values[0], 1),
                "latest_date": dates[0] if dates else None,
                "all_values": [round(v, 1) for v in values],
                "all_dates": dates,
            }

        return result
    finally:
        conn.close()
319
+
320
+
321
def compute_expected_values(test_case: Dict) -> Dict[str, Any]:
    """Compute expected (ground-truth) values for any test case type.

    Routes the case to the matching compute_* function based on its
    "query_type"; unknown types yield an {"error": ...} dict.
    """
    qtype = test_case["query_type"]
    pid = test_case["patient_id"]
    params = test_case.get("parameters", {})

    if qtype == "vital_trend":
        return compute_vital_trend_expected(
            pid,
            params["vital_type"],
            params["codes"],
            params["labels"],
            params.get("days", 30),
        )
    if qtype == "medication_list":
        return compute_medication_expected(pid, params.get("status"))
    if qtype == "condition_list":
        return compute_condition_expected(pid)
    if qtype == "allergy_list":
        return compute_allergy_expected(pid)
    if qtype == "immunization_list":
        return compute_immunization_expected(pid)
    if qtype == "procedure_list":
        return compute_procedure_expected(pid)
    if qtype == "encounter_list":
        return compute_encounter_expected(pid, params.get("limit", 5))
    if qtype == "lab_trend":
        return compute_lab_trend_expected(
            pid,
            params["lab_type"],
            params["code"],
            params.get("periods", 4),
        )

    return {"error": f"Unknown query type: {qtype}"}
367
+
368
+
369
if __name__ == "__main__":
    # Smoke test: generate a handful of cases and print their ground-truth
    # values. Run as `python -m evaluation.expected_values`.
    import json

    try:
        # Works when executed as a module of the `evaluation` package (-m).
        from .test_generator import generate_all_test_cases
    except ImportError:
        # Fallback for running the file directly from inside evaluation/.
        from test_generator import generate_all_test_cases

    print("Generating test cases...")
    cases = generate_all_test_cases(num_patients=1)

    print(f"\nComputing expected values for {len(cases)} test cases...")

    for case in cases[:3]:  # Show first 3
        print(f"\n{'='*60}")
        print(f"Case: {case['case_id']}")
        print(f"Query: {case['query']}")

        expected = compute_expected_values(case)
        print(f"Expected values:")
        print(json.dumps(expected, indent=2, default=str))
evaluation/facts_schema.py ADDED
@@ -0,0 +1,232 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Facts Schema
4
+
5
+ Defines the structured facts format that the agent should return
6
+ alongside its text responses. These facts are used for evaluation.
7
+ """
8
+
9
import json
import statistics
from dataclasses import dataclass, asdict
from typing import Dict, List, Any, Optional
12
+
13
+
14
@dataclass
class VitalTrendFacts:
    """Per-series summary metrics for a vital-sign trend query."""
    vital_type: str  # normalized title, e.g. "blood_pressure"
    days: int  # look-back window in days
    metrics: Dict[str, Dict[str, Any]]  # {series label: {min, max, avg, count, dates...}}

    def to_dict(self) -> Dict:
        """Serialize recursively to a plain dict."""
        payload = asdict(self)
        return payload
23
+
24
+
25
@dataclass
class MedicationFacts:
    """Summary facts for a medication-list query."""
    status_filter: Optional[str]  # filter applied to the query, None = all
    count: int  # number of medications returned
    medication_names: List[str]  # display names, query order

    def to_dict(self) -> Dict:
        """Serialize to a plain dict."""
        payload = asdict(self)
        return payload
34
+
35
+
36
@dataclass
class ConditionFacts:
    """Summary facts for a condition-list query."""
    count: int  # number of conditions returned
    condition_names: List[str]  # display names, query order

    def to_dict(self) -> Dict:
        """Serialize to a plain dict."""
        payload = asdict(self)
        return payload
44
+
45
+
46
@dataclass
class AllergyFacts:
    """Summary facts for an allergy-list query."""
    count: int  # number of allergy records returned
    substances: List[str]  # allergen substance names

    def to_dict(self) -> Dict:
        """Serialize to a plain dict."""
        payload = asdict(self)
        return payload
54
+
55
+
56
@dataclass
class ImmunizationFacts:
    """Summary facts for an immunization-list query."""
    count: int  # number of immunization records returned
    vaccine_names: List[str]  # vaccine display names

    def to_dict(self) -> Dict:
        """Serialize to a plain dict."""
        payload = asdict(self)
        return payload
64
+
65
+
66
@dataclass
class ProcedureFacts:
    """Summary facts for a procedure-list query."""
    count: int  # number of procedure records returned
    procedure_names: List[str]  # procedure display names

    def to_dict(self) -> Dict:
        """Serialize to a plain dict."""
        payload = asdict(self)
        return payload
74
+
75
+
76
@dataclass
class EncounterFacts:
    """Summary facts for an encounter-list query."""
    count: int  # number of encounters returned
    limit: int  # LIMIT applied to the underlying query

    def to_dict(self) -> Dict:
        """Serialize to a plain dict."""
        payload = asdict(self)
        return payload
84
+
85
+
86
@dataclass
class LabTrendFacts:
    """Summary facts for a lab trend query."""
    lab_type: str  # e.g. "a1c"
    code: str  # lab code (may be empty if the tool payload lacks it)
    unit: Optional[str]  # measurement unit, if known
    count: int  # number of non-null values summarized
    metrics: Dict[str, Any]  # {min, max, avg, latest, dates...}

    def to_dict(self) -> Dict:
        """Serialize recursively to a plain dict."""
        payload = asdict(self)
        return payload
97
+
98
+
99
def extract_vital_facts_from_tool_result(tool_result: Dict,
                                         days: int = 30) -> Optional["VitalTrendFacts"]:
    """Extract structured facts from a vital-sign chart tool result.

    The chart tool already returns structured JSON; this reshapes it into
    per-series summary metrics for the evaluator.

    Args:
        tool_result: Raw chart payload with "chart_type", "title", "datasets".
        days: Look-back window to record in the facts. The payload does not
            carry the window, so callers should pass the window they
            requested (previously hard-coded to 30; default kept for
            backward compatibility).

    Returns:
        VitalTrendFacts, or None for error payloads / unsupported chart types.
    """
    if "error" in tool_result:
        return None

    if tool_result.get("chart_type", "") not in ("line", "line_dual"):
        return None

    metrics: Dict[str, Dict[str, Any]] = {}
    for dataset in tool_result.get("datasets", []):
        label = dataset.get("label", "unknown").lower().replace(" ", "_")
        data_points = dataset.get("data", [])

        values = [p["value"] for p in data_points if p.get("value") is not None]
        dates = [p["date"] for p in data_points if p.get("date")]

        # Series with no usable values contribute no metrics.
        if not values:
            continue

        metrics[label] = {
            "min": round(min(values), 1),
            "max": round(max(values), 1),
            # statistics import hoisted to module level (was re-imported
            # on every loop iteration).
            "avg": round(statistics.mean(values), 1),
            "count": len(values),
            # Assumes data points are chronological — TODO confirm upstream ordering.
            "latest": round(values[-1], 1),
            "earliest_date": dates[0] if dates else None,
            "latest_date": dates[-1] if dates else None,
        }

    return VitalTrendFacts(
        vital_type=tool_result.get("title", "").lower().replace(" ", "_"),
        days=days,
        metrics=metrics,
    )
140
+
141
+
142
def extract_lab_facts_from_tool_result(tool_result: Dict) -> Optional["LabTrendFacts"]:
    """Extract structured facts from a lab chart tool result.

    Only the first dataset is summarized.

    Returns:
        LabTrendFacts, or None for error payloads or empty chart data.
    """
    if "error" in tool_result:
        return None

    datasets = tool_result.get("datasets", [])
    if not datasets:
        return None

    # Get first dataset
    dataset = datasets[0]
    data_points = dataset.get("data", [])
    if not data_points:
        return None

    values = [p["value"] for p in data_points if p.get("value") is not None]
    dates = [p["date"] for p in data_points if p.get("date")]

    metrics = {}
    if values:
        # statistics import hoisted to module level (was imported inline).
        metrics = {
            "min": round(min(values), 1),
            "max": round(max(values), 1),
            "avg": round(statistics.mean(values), 1),
            # Assumes data points are chronological — TODO confirm upstream ordering.
            "latest": round(values[-1], 1),
            "latest_date": dates[-1] if dates else None,
        }

    return LabTrendFacts(
        lab_type=dataset.get("label", "unknown").lower(),
        code="",  # Not in tool result
        unit=tool_result.get("unit"),
        count=len(values),
        metrics=metrics,
    )
179
+
180
+
181
def extract_medication_facts(medications: List[Dict], status_filter: Optional[str] = None) -> MedicationFacts:
    """Build MedicationFacts from a raw medication record list."""
    return MedicationFacts(
        status_filter=status_filter,
        count=len(medications),
        medication_names=[entry.get("display", "") for entry in medications],
    )
189
+
190
+
191
def extract_condition_facts(conditions: List[Dict]) -> ConditionFacts:
    """Build ConditionFacts from a raw condition record list."""
    return ConditionFacts(
        count=len(conditions),
        condition_names=[entry.get("display", "") for entry in conditions],
    )
198
+
199
+
200
def extract_allergy_facts(allergies: List[Dict]) -> AllergyFacts:
    """Build AllergyFacts from a raw allergy record list."""
    return AllergyFacts(
        count=len(allergies),
        substances=[entry.get("substance", "") for entry in allergies],
    )
207
+
208
+
209
def extract_immunization_facts(immunizations: List[Dict]) -> ImmunizationFacts:
    """Build ImmunizationFacts from a raw immunization record list."""
    return ImmunizationFacts(
        count=len(immunizations),
        vaccine_names=[entry.get("vaccine_display", "") for entry in immunizations],
    )
216
+
217
+
218
def extract_procedure_facts(procedures: List[Dict]) -> ProcedureFacts:
    """Build ProcedureFacts from a raw procedure record list."""
    return ProcedureFacts(
        count=len(procedures),
        procedure_names=[entry.get("display", "") for entry in procedures],
    )
225
+
226
+
227
def extract_encounter_facts(encounters: List[Dict], limit: int = 5) -> EncounterFacts:
    """Build EncounterFacts from a raw encounter record list."""
    return EncounterFacts(count=len(encounters), limit=limit)
evaluation/metrics.py ADDED
@@ -0,0 +1,265 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Metrics Calculator
4
+
5
+ Aggregates evaluation results across multiple test cases
6
+ and computes summary statistics.
7
+ """
8
+
9
+ from typing import Dict, List, Any
10
+ from dataclasses import dataclass, field
11
+ from datetime import datetime
12
+ import json
13
+
14
+ from .evaluator import CaseEvaluation
15
+
16
+
17
@dataclass
class EvaluationMetrics:
    """Aggregated evaluation metrics across all test cases."""

    # Case-level tallies
    total_cases: int = 0
    successful_cases: int = 0
    failed_cases: int = 0

    # Field-level tallies
    total_fields: int = 0
    correct_fields: int = 0
    total_hallucinations: int = 0
    total_omissions: int = 0
    total_mismatches: int = 0

    # Per-query-type breakdown: {query_type: {total, successful, ...}}
    by_query_type: Dict[str, Dict[str, Any]] = field(default_factory=dict)

    # One summary dict per evaluated case
    case_results: List[Dict] = field(default_factory=list)

    def success_rate(self) -> float:
        """Fraction of cases that passed (0.0 when nothing was run)."""
        return self.successful_cases / self.total_cases if self.total_cases else 0.0

    def field_accuracy(self) -> float:
        """Fraction of checked fields that matched expectations."""
        return self.correct_fields / self.total_fields if self.total_fields else 0.0

    def hallucination_rate(self) -> float:
        """Hallucinated fields as a fraction of all checked fields."""
        return self.total_hallucinations / self.total_fields if self.total_fields else 0.0

    def omission_rate(self) -> float:
        """Omitted fields as a fraction of all checked fields."""
        return self.total_omissions / self.total_fields if self.total_fields else 0.0

    def to_dict(self) -> Dict:
        """Serialize to the report JSON layout."""
        return {
            "summary": {
                "total_cases": self.total_cases,
                "successful_cases": self.successful_cases,
                "failed_cases": self.failed_cases,
                "success_rate": f"{self.success_rate():.1%}",
                "field_accuracy": f"{self.field_accuracy():.1%}",
                "hallucination_rate": f"{self.hallucination_rate():.1%}",
                "omission_rate": f"{self.omission_rate():.1%}",
            },
            "field_level": {
                "total_fields": self.total_fields,
                "correct_fields": self.correct_fields,
                "hallucinations": self.total_hallucinations,
                "omissions": self.total_omissions,
                "mismatches": self.total_mismatches,
            },
            "by_query_type": self.by_query_type,
            "case_results": self.case_results,
        }
80
+
81
+
82
def aggregate_metrics(evaluations: List[CaseEvaluation]) -> EvaluationMetrics:
    """Fold a list of per-case evaluations into one EvaluationMetrics."""
    agg = EvaluationMetrics()

    for ev in evaluations:
        agg.total_cases += 1
        if ev.success:
            agg.successful_cases += 1
        else:
            agg.failed_cases += 1

        # Field-level tallies
        agg.total_fields += ev.total_fields
        agg.correct_fields += ev.correct_fields
        agg.total_hallucinations += ev.hallucinations
        agg.total_omissions += ev.omissions
        agg.total_mismatches += ev.mismatches

        # Per-query-type tallies (bucket created lazily on first sighting)
        bucket = agg.by_query_type.setdefault(ev.query_type, {
            "total": 0,
            "successful": 0,
            "failed": 0,
            "total_fields": 0,
            "correct_fields": 0,
            "hallucinations": 0,
            "omissions": 0,
        })
        bucket["total"] += 1
        bucket["successful" if ev.success else "failed"] += 1
        bucket["total_fields"] += ev.total_fields
        bucket["correct_fields"] += ev.correct_fields
        bucket["hallucinations"] += ev.hallucinations
        bucket["omissions"] += ev.omissions

        # Per-case summary row for the report
        agg.case_results.append({
            "case_id": ev.case_id,
            "query_type": ev.query_type,
            "success": ev.success,
            "accuracy": ev.accuracy(),
            "fields": f"{ev.correct_fields}/{ev.total_fields}",
            "hallucinations": ev.hallucinations,
            "omissions": ev.omissions,
        })

    # Derived per-type percentage strings
    for bucket in agg.by_query_type.values():
        if bucket["total"] > 0:
            bucket["success_rate"] = f"{bucket['successful'] / bucket['total']:.1%}"
        if bucket["total_fields"] > 0:
            bucket["field_accuracy"] = f"{bucket['correct_fields'] / bucket['total_fields']:.1%}"

    return agg
145
+
146
+
147
def format_report(metrics: EvaluationMetrics) -> str:
    """Render metrics as a human-readable plain-text report."""
    rule = "=" * 60
    sub = "-" * 40

    lines = [
        rule,
        "PRE-VISIT SUMMARY EVALUATION REPORT",
        f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        rule,
        "",
        "OVERALL RESULTS",
        sub,
        f"Total Test Cases: {metrics.total_cases}",
        f"Successful: {metrics.successful_cases}",
        f"Failed: {metrics.failed_cases}",
        f"Success Rate: {metrics.success_rate():.1%}",
        "",
        f"Total Fields Checked: {metrics.total_fields}",
        f"Correct Fields: {metrics.correct_fields}",
        f"Field Accuracy: {metrics.field_accuracy():.1%}",
        "",
        f"Hallucinations: {metrics.total_hallucinations} ({metrics.hallucination_rate():.1%})",
        f"Omissions: {metrics.total_omissions} ({metrics.omission_rate():.1%})",
        f"Mismatches: {metrics.total_mismatches}",
        "",
        "BY QUERY TYPE",
        sub,
        f"{'Query Type':<25} {'Success':<12} {'Accuracy':<12} {'Hall.':<8}",
        "-" * 60,
    ]

    # Most frequent query types first
    ranked = sorted(metrics.by_query_type.items(),
                    key=lambda item: item[1]["total"], reverse=True)
    for qtype, stats in ranked:
        success_rate = stats.get("success_rate", "N/A")
        field_acc = stats.get("field_accuracy", "N/A")
        lines.append(f"{qtype:<25} {success_rate:<12} {field_acc:<12} {stats['hallucinations']:<8}")

    # Detail section for failures (capped at 10 cases)
    failed = [c for c in metrics.case_results if not c["success"]]
    if failed:
        lines += ["", "FAILED CASES", sub]
        for case in failed[:10]:
            lines.append(f" {case['case_id']}")
            lines.append(f" Type: {case['query_type']}, Accuracy: {case['accuracy']:.1%}")
            lines.append(f" Hallucinations: {case['hallucinations']}, Omissions: {case['omissions']}")

    lines += ["", rule]
    return "\n".join(lines)
204
+
205
+
206
+ def save_report(metrics: EvaluationMetrics, output_dir: str = "."):
207
+ """Save evaluation report to files."""
208
+ import os
209
+
210
+ os.makedirs(output_dir, exist_ok=True)
211
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
212
+
213
+ # Save text report
214
+ text_path = os.path.join(output_dir, f"eval_report_{timestamp}.txt")
215
+ with open(text_path, "w") as f:
216
+ f.write(format_report(metrics))
217
+
218
+ # Save JSON report
219
+ json_path = os.path.join(output_dir, f"eval_report_{timestamp}.json")
220
+ with open(json_path, "w") as f:
221
+ json.dump(metrics.to_dict(), f, indent=2, default=str)
222
+
223
+ return text_path, json_path
224
+
225
+
226
if __name__ == "__main__":
    # Demo: aggregate a few hand-built evaluations and print the report.
    # NOTE: CaseEvaluation is already imported at module level via the
    # relative `from .evaluator import ...`, which is the import that works
    # when this module runs as `python -m evaluation.metrics`. The previous
    # absolute `from evaluator import CaseEvaluation, ComparisonResult`
    # failed under -m execution, and ComparisonResult was never used.

    evaluations = [
        CaseEvaluation(
            case_id="patient1_vital_bp",
            query_type="vital_trend",
            success=True,
            total_fields=10,
            correct_fields=9,
            hallucinations=0,
            omissions=1,
            mismatches=0
        ),
        CaseEvaluation(
            case_id="patient1_meds",
            query_type="medication_list",
            success=True,
            total_fields=5,
            correct_fields=5,
            hallucinations=0,
            omissions=0,
            mismatches=0
        ),
        CaseEvaluation(
            case_id="patient1_conditions",
            query_type="condition_list",
            success=False,
            total_fields=8,
            correct_fields=5,
            hallucinations=2,
            omissions=1,
            mismatches=0
        ),
    ]

    metrics = aggregate_metrics(evaluations)
    print(format_report(metrics))
evaluation/reports/eval_report_20260127_174121.json ADDED
@@ -0,0 +1,380 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "summary": {
3
+ "total_cases": 30,
4
+ "successful_cases": 30,
5
+ "failed_cases": 0,
6
+ "success_rate": "100.0%",
7
+ "field_accuracy": "100.0%",
8
+ "hallucination_rate": "0.0%",
9
+ "omission_rate": "0.0%"
10
+ },
11
+ "field_level": {
12
+ "total_fields": 128,
13
+ "correct_fields": 128,
14
+ "hallucinations": 0,
15
+ "omissions": 0,
16
+ "mismatches": 0
17
+ },
18
+ "by_query_type": {
19
+ "vital_trend": {
20
+ "total": 6,
21
+ "successful": 6,
22
+ "failed": 0,
23
+ "total_fields": 63,
24
+ "correct_fields": 63,
25
+ "hallucinations": 0,
26
+ "omissions": 0,
27
+ "success_rate": "100.0%",
28
+ "field_accuracy": "100.0%"
29
+ },
30
+ "medication_list": {
31
+ "total": 6,
32
+ "successful": 6,
33
+ "failed": 0,
34
+ "total_fields": 18,
35
+ "correct_fields": 18,
36
+ "hallucinations": 0,
37
+ "omissions": 0,
38
+ "success_rate": "100.0%",
39
+ "field_accuracy": "100.0%"
40
+ },
41
+ "condition_list": {
42
+ "total": 3,
43
+ "successful": 3,
44
+ "failed": 0,
45
+ "total_fields": 9,
46
+ "correct_fields": 9,
47
+ "hallucinations": 0,
48
+ "omissions": 0,
49
+ "success_rate": "100.0%",
50
+ "field_accuracy": "100.0%"
51
+ },
52
+ "allergy_list": {
53
+ "total": 3,
54
+ "successful": 3,
55
+ "failed": 0,
56
+ "total_fields": 7,
57
+ "correct_fields": 7,
58
+ "hallucinations": 0,
59
+ "omissions": 0,
60
+ "success_rate": "100.0%",
61
+ "field_accuracy": "100.0%"
62
+ },
63
+ "immunization_list": {
64
+ "total": 3,
65
+ "successful": 3,
66
+ "failed": 0,
67
+ "total_fields": 9,
68
+ "correct_fields": 9,
69
+ "hallucinations": 0,
70
+ "omissions": 0,
71
+ "success_rate": "100.0%",
72
+ "field_accuracy": "100.0%"
73
+ },
74
+ "procedure_list": {
75
+ "total": 3,
76
+ "successful": 3,
77
+ "failed": 0,
78
+ "total_fields": 7,
79
+ "correct_fields": 7,
80
+ "hallucinations": 0,
81
+ "omissions": 0,
82
+ "success_rate": "100.0%",
83
+ "field_accuracy": "100.0%"
84
+ },
85
+ "encounter_list": {
86
+ "total": 3,
87
+ "successful": 3,
88
+ "failed": 0,
89
+ "total_fields": 3,
90
+ "correct_fields": 3,
91
+ "hallucinations": 0,
92
+ "omissions": 0,
93
+ "success_rate": "100.0%",
94
+ "field_accuracy": "100.0%"
95
+ },
96
+ "lab_trend": {
97
+ "total": 3,
98
+ "successful": 3,
99
+ "failed": 0,
100
+ "total_fields": 12,
101
+ "correct_fields": 12,
102
+ "hallucinations": 0,
103
+ "omissions": 0,
104
+ "success_rate": "100.0%",
105
+ "field_accuracy": "100.0%"
106
+ }
107
+ },
108
+ "case_results": [
109
+ {
110
+ "case_id": "patient-001_vital_blood_pressure",
111
+ "query_type": "vital_trend",
112
+ "success": true,
113
+ "accuracy": 1.0,
114
+ "fields": "14/14",
115
+ "hallucinations": 0,
116
+ "omissions": 0
117
+ },
118
+ {
119
+ "case_id": "patient-001_vital_heart_rate",
120
+ "query_type": "vital_trend",
121
+ "success": true,
122
+ "accuracy": 1.0,
123
+ "fields": "7/7",
124
+ "hallucinations": 0,
125
+ "omissions": 0
126
+ },
127
+ {
128
+ "case_id": "patient-001_meds_all",
129
+ "query_type": "medication_list",
130
+ "success": true,
131
+ "accuracy": 1.0,
132
+ "fields": "3/3",
133
+ "hallucinations": 0,
134
+ "omissions": 0
135
+ },
136
+ {
137
+ "case_id": "patient-001_meds_active",
138
+ "query_type": "medication_list",
139
+ "success": true,
140
+ "accuracy": 1.0,
141
+ "fields": "3/3",
142
+ "hallucinations": 0,
143
+ "omissions": 0
144
+ },
145
+ {
146
+ "case_id": "patient-001_conditions",
147
+ "query_type": "condition_list",
148
+ "success": true,
149
+ "accuracy": 1.0,
150
+ "fields": "3/3",
151
+ "hallucinations": 0,
152
+ "omissions": 0
153
+ },
154
+ {
155
+ "case_id": "patient-001_allergies",
156
+ "query_type": "allergy_list",
157
+ "success": true,
158
+ "accuracy": 1.0,
159
+ "fields": "2/2",
160
+ "hallucinations": 0,
161
+ "omissions": 0
162
+ },
163
+ {
164
+ "case_id": "patient-001_immunizations",
165
+ "query_type": "immunization_list",
166
+ "success": true,
167
+ "accuracy": 1.0,
168
+ "fields": "3/3",
169
+ "hallucinations": 0,
170
+ "omissions": 0
171
+ },
172
+ {
173
+ "case_id": "patient-001_procedures",
174
+ "query_type": "procedure_list",
175
+ "success": true,
176
+ "accuracy": 1.0,
177
+ "fields": "2/2",
178
+ "hallucinations": 0,
179
+ "omissions": 0
180
+ },
181
+ {
182
+ "case_id": "patient-001_encounters",
183
+ "query_type": "encounter_list",
184
+ "success": true,
185
+ "accuracy": 1.0,
186
+ "fields": "1/1",
187
+ "hallucinations": 0,
188
+ "omissions": 0
189
+ },
190
+ {
191
+ "case_id": "patient-001_lab_a1c",
192
+ "query_type": "lab_trend",
193
+ "success": true,
194
+ "accuracy": 1.0,
195
+ "fields": "4/4",
196
+ "hallucinations": 0,
197
+ "omissions": 0
198
+ },
199
+ {
200
+ "case_id": "patient-002_vital_blood_pressure",
201
+ "query_type": "vital_trend",
202
+ "success": true,
203
+ "accuracy": 1.0,
204
+ "fields": "14/14",
205
+ "hallucinations": 0,
206
+ "omissions": 0
207
+ },
208
+ {
209
+ "case_id": "patient-002_vital_heart_rate",
210
+ "query_type": "vital_trend",
211
+ "success": true,
212
+ "accuracy": 1.0,
213
+ "fields": "7/7",
214
+ "hallucinations": 0,
215
+ "omissions": 0
216
+ },
217
+ {
218
+ "case_id": "patient-002_meds_all",
219
+ "query_type": "medication_list",
220
+ "success": true,
221
+ "accuracy": 1.0,
222
+ "fields": "3/3",
223
+ "hallucinations": 0,
224
+ "omissions": 0
225
+ },
226
+ {
227
+ "case_id": "patient-002_meds_active",
228
+ "query_type": "medication_list",
229
+ "success": true,
230
+ "accuracy": 1.0,
231
+ "fields": "3/3",
232
+ "hallucinations": 0,
233
+ "omissions": 0
234
+ },
235
+ {
236
+ "case_id": "patient-002_conditions",
237
+ "query_type": "condition_list",
238
+ "success": true,
239
+ "accuracy": 1.0,
240
+ "fields": "3/3",
241
+ "hallucinations": 0,
242
+ "omissions": 0
243
+ },
244
+ {
245
+ "case_id": "patient-002_allergies",
246
+ "query_type": "allergy_list",
247
+ "success": true,
248
+ "accuracy": 1.0,
249
+ "fields": "3/3",
250
+ "hallucinations": 0,
251
+ "omissions": 0
252
+ },
253
+ {
254
+ "case_id": "patient-002_immunizations",
255
+ "query_type": "immunization_list",
256
+ "success": true,
257
+ "accuracy": 1.0,
258
+ "fields": "3/3",
259
+ "hallucinations": 0,
260
+ "omissions": 0
261
+ },
262
+ {
263
+ "case_id": "patient-002_procedures",
264
+ "query_type": "procedure_list",
265
+ "success": true,
266
+ "accuracy": 1.0,
267
+ "fields": "2/2",
268
+ "hallucinations": 0,
269
+ "omissions": 0
270
+ },
271
+ {
272
+ "case_id": "patient-002_encounters",
273
+ "query_type": "encounter_list",
274
+ "success": true,
275
+ "accuracy": 1.0,
276
+ "fields": "1/1",
277
+ "hallucinations": 0,
278
+ "omissions": 0
279
+ },
280
+ {
281
+ "case_id": "patient-002_lab_a1c",
282
+ "query_type": "lab_trend",
283
+ "success": true,
284
+ "accuracy": 1.0,
285
+ "fields": "4/4",
286
+ "hallucinations": 0,
287
+ "omissions": 0
288
+ },
289
+ {
290
+ "case_id": "patient-003_vital_blood_pressure",
291
+ "query_type": "vital_trend",
292
+ "success": true,
293
+ "accuracy": 1.0,
294
+ "fields": "14/14",
295
+ "hallucinations": 0,
296
+ "omissions": 0
297
+ },
298
+ {
299
+ "case_id": "patient-003_vital_heart_rate",
300
+ "query_type": "vital_trend",
301
+ "success": true,
302
+ "accuracy": 1.0,
303
+ "fields": "7/7",
304
+ "hallucinations": 0,
305
+ "omissions": 0
306
+ },
307
+ {
308
+ "case_id": "patient-003_meds_all",
309
+ "query_type": "medication_list",
310
+ "success": true,
311
+ "accuracy": 1.0,
312
+ "fields": "3/3",
313
+ "hallucinations": 0,
314
+ "omissions": 0
315
+ },
316
+ {
317
+ "case_id": "patient-003_meds_active",
318
+ "query_type": "medication_list",
319
+ "success": true,
320
+ "accuracy": 1.0,
321
+ "fields": "3/3",
322
+ "hallucinations": 0,
323
+ "omissions": 0
324
+ },
325
+ {
326
+ "case_id": "patient-003_conditions",
327
+ "query_type": "condition_list",
328
+ "success": true,
329
+ "accuracy": 1.0,
330
+ "fields": "3/3",
331
+ "hallucinations": 0,
332
+ "omissions": 0
333
+ },
334
+ {
335
+ "case_id": "patient-003_allergies",
336
+ "query_type": "allergy_list",
337
+ "success": true,
338
+ "accuracy": 1.0,
339
+ "fields": "2/2",
340
+ "hallucinations": 0,
341
+ "omissions": 0
342
+ },
343
+ {
344
+ "case_id": "patient-003_immunizations",
345
+ "query_type": "immunization_list",
346
+ "success": true,
347
+ "accuracy": 1.0,
348
+ "fields": "3/3",
349
+ "hallucinations": 0,
350
+ "omissions": 0
351
+ },
352
+ {
353
+ "case_id": "patient-003_procedures",
354
+ "query_type": "procedure_list",
355
+ "success": true,
356
+ "accuracy": 1.0,
357
+ "fields": "3/3",
358
+ "hallucinations": 0,
359
+ "omissions": 0
360
+ },
361
+ {
362
+ "case_id": "patient-003_encounters",
363
+ "query_type": "encounter_list",
364
+ "success": true,
365
+ "accuracy": 1.0,
366
+ "fields": "1/1",
367
+ "hallucinations": 0,
368
+ "omissions": 0
369
+ },
370
+ {
371
+ "case_id": "patient-003_lab_a1c",
372
+ "query_type": "lab_trend",
373
+ "success": true,
374
+ "accuracy": 1.0,
375
+ "fields": "4/4",
376
+ "hallucinations": 0,
377
+ "omissions": 0
378
+ }
379
+ ]
380
+ }
evaluation/reports/eval_report_20260127_174121.txt ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ============================================================
2
+ PRE-VISIT SUMMARY EVALUATION REPORT
3
+ Generated: 2026-01-27 17:41:21
4
+ ============================================================
5
+
6
+ OVERALL RESULTS
7
+ ----------------------------------------
8
+ Total Test Cases: 30
9
+ Successful: 30
10
+ Failed: 0
11
+ Success Rate: 100.0%
12
+
13
+ Total Fields Checked: 128
14
+ Correct Fields: 128
15
+ Field Accuracy: 100.0%
16
+
17
+ Hallucinations: 0 (0.0%)
18
+ Omissions: 0 (0.0%)
19
+ Mismatches: 0
20
+
21
+ BY QUERY TYPE
22
+ ----------------------------------------
23
+ Query Type Success Accuracy Hall.
24
+ ------------------------------------------------------------
25
+ vital_trend 100.0% 100.0% 0
26
+ medication_list 100.0% 100.0% 0
27
+ condition_list 100.0% 100.0% 0
28
+ allergy_list 100.0% 100.0% 0
29
+ immunization_list 100.0% 100.0% 0
30
+ procedure_list 100.0% 100.0% 0
31
+ encounter_list 100.0% 100.0% 0
32
+ lab_trend 100.0% 100.0% 0
33
+
34
+ ============================================================
evaluation/reports/eval_report_20260127_174147.json ADDED
@@ -0,0 +1,380 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "summary": {
3
+ "total_cases": 30,
4
+ "successful_cases": 20,
5
+ "failed_cases": 10,
6
+ "success_rate": "66.7%",
7
+ "field_accuracy": "81.1%",
8
+ "hallucination_rate": "5.3%",
9
+ "omission_rate": "5.3%"
10
+ },
11
+ "field_level": {
12
+ "total_fields": 132,
13
+ "correct_fields": 107,
14
+ "hallucinations": 7,
15
+ "omissions": 7,
16
+ "mismatches": 3
17
+ },
18
+ "by_query_type": {
19
+ "vital_trend": {
20
+ "total": 6,
21
+ "successful": 5,
22
+ "failed": 1,
23
+ "total_fields": 63,
24
+ "correct_fields": 56,
25
+ "hallucinations": 3,
26
+ "omissions": 1,
27
+ "success_rate": "83.3%",
28
+ "field_accuracy": "88.9%"
29
+ },
30
+ "medication_list": {
31
+ "total": 6,
32
+ "successful": 4,
33
+ "failed": 2,
34
+ "total_fields": 19,
35
+ "correct_fields": 15,
36
+ "hallucinations": 1,
37
+ "omissions": 1,
38
+ "success_rate": "66.7%",
39
+ "field_accuracy": "78.9%"
40
+ },
41
+ "condition_list": {
42
+ "total": 3,
43
+ "successful": 1,
44
+ "failed": 2,
45
+ "total_fields": 9,
46
+ "correct_fields": 5,
47
+ "hallucinations": 0,
48
+ "omissions": 2,
49
+ "success_rate": "33.3%",
50
+ "field_accuracy": "55.6%"
51
+ },
52
+ "allergy_list": {
53
+ "total": 3,
54
+ "successful": 1,
55
+ "failed": 2,
56
+ "total_fields": 8,
57
+ "correct_fields": 4,
58
+ "hallucinations": 1,
59
+ "omissions": 2,
60
+ "success_rate": "33.3%",
61
+ "field_accuracy": "50.0%"
62
+ },
63
+ "immunization_list": {
64
+ "total": 3,
65
+ "successful": 1,
66
+ "failed": 2,
67
+ "total_fields": 10,
68
+ "correct_fields": 6,
69
+ "hallucinations": 1,
70
+ "omissions": 1,
71
+ "success_rate": "33.3%",
72
+ "field_accuracy": "60.0%"
73
+ },
74
+ "procedure_list": {
75
+ "total": 3,
76
+ "successful": 2,
77
+ "failed": 1,
78
+ "total_fields": 8,
79
+ "correct_fields": 6,
80
+ "hallucinations": 1,
81
+ "omissions": 0,
82
+ "success_rate": "66.7%",
83
+ "field_accuracy": "75.0%"
84
+ },
85
+ "encounter_list": {
86
+ "total": 3,
87
+ "successful": 3,
88
+ "failed": 0,
89
+ "total_fields": 3,
90
+ "correct_fields": 3,
91
+ "hallucinations": 0,
92
+ "omissions": 0,
93
+ "success_rate": "100.0%",
94
+ "field_accuracy": "100.0%"
95
+ },
96
+ "lab_trend": {
97
+ "total": 3,
98
+ "successful": 3,
99
+ "failed": 0,
100
+ "total_fields": 12,
101
+ "correct_fields": 12,
102
+ "hallucinations": 0,
103
+ "omissions": 0,
104
+ "success_rate": "100.0%",
105
+ "field_accuracy": "100.0%"
106
+ }
107
+ },
108
+ "case_results": [
109
+ {
110
+ "case_id": "patient-001_vital_blood_pressure",
111
+ "query_type": "vital_trend",
112
+ "success": true,
113
+ "accuracy": 0.9285714285714286,
114
+ "fields": "13/14",
115
+ "hallucinations": 1,
116
+ "omissions": 0
117
+ },
118
+ {
119
+ "case_id": "patient-001_vital_heart_rate",
120
+ "query_type": "vital_trend",
121
+ "success": true,
122
+ "accuracy": 0.8571428571428571,
123
+ "fields": "6/7",
124
+ "hallucinations": 0,
125
+ "omissions": 1
126
+ },
127
+ {
128
+ "case_id": "patient-001_meds_all",
129
+ "query_type": "medication_list",
130
+ "success": true,
131
+ "accuracy": 1.0,
132
+ "fields": "3/3",
133
+ "hallucinations": 0,
134
+ "omissions": 0
135
+ },
136
+ {
137
+ "case_id": "patient-001_meds_active",
138
+ "query_type": "medication_list",
139
+ "success": false,
140
+ "accuracy": 0.5,
141
+ "fields": "2/4",
142
+ "hallucinations": 1,
143
+ "omissions": 0
144
+ },
145
+ {
146
+ "case_id": "patient-001_conditions",
147
+ "query_type": "condition_list",
148
+ "success": true,
149
+ "accuracy": 1.0,
150
+ "fields": "3/3",
151
+ "hallucinations": 0,
152
+ "omissions": 0
153
+ },
154
+ {
155
+ "case_id": "patient-001_allergies",
156
+ "query_type": "allergy_list",
157
+ "success": false,
158
+ "accuracy": 0.3333333333333333,
159
+ "fields": "1/3",
160
+ "hallucinations": 1,
161
+ "omissions": 1
162
+ },
163
+ {
164
+ "case_id": "patient-001_immunizations",
165
+ "query_type": "immunization_list",
166
+ "success": false,
167
+ "accuracy": 0.5,
168
+ "fields": "2/4",
169
+ "hallucinations": 1,
170
+ "omissions": 0
171
+ },
172
+ {
173
+ "case_id": "patient-001_procedures",
174
+ "query_type": "procedure_list",
175
+ "success": true,
176
+ "accuracy": 1.0,
177
+ "fields": "2/2",
178
+ "hallucinations": 0,
179
+ "omissions": 0
180
+ },
181
+ {
182
+ "case_id": "patient-001_encounters",
183
+ "query_type": "encounter_list",
184
+ "success": true,
185
+ "accuracy": 1.0,
186
+ "fields": "1/1",
187
+ "hallucinations": 0,
188
+ "omissions": 0
189
+ },
190
+ {
191
+ "case_id": "patient-001_lab_a1c",
192
+ "query_type": "lab_trend",
193
+ "success": true,
194
+ "accuracy": 1.0,
195
+ "fields": "4/4",
196
+ "hallucinations": 0,
197
+ "omissions": 0
198
+ },
199
+ {
200
+ "case_id": "patient-002_vital_blood_pressure",
201
+ "query_type": "vital_trend",
202
+ "success": false,
203
+ "accuracy": 0.7857142857142857,
204
+ "fields": "11/14",
205
+ "hallucinations": 1,
206
+ "omissions": 0
207
+ },
208
+ {
209
+ "case_id": "patient-002_vital_heart_rate",
210
+ "query_type": "vital_trend",
211
+ "success": true,
212
+ "accuracy": 0.8571428571428571,
213
+ "fields": "6/7",
214
+ "hallucinations": 1,
215
+ "omissions": 0
216
+ },
217
+ {
218
+ "case_id": "patient-002_meds_all",
219
+ "query_type": "medication_list",
220
+ "success": true,
221
+ "accuracy": 1.0,
222
+ "fields": "3/3",
223
+ "hallucinations": 0,
224
+ "omissions": 0
225
+ },
226
+ {
227
+ "case_id": "patient-002_meds_active",
228
+ "query_type": "medication_list",
229
+ "success": true,
230
+ "accuracy": 1.0,
231
+ "fields": "3/3",
232
+ "hallucinations": 0,
233
+ "omissions": 0
234
+ },
235
+ {
236
+ "case_id": "patient-002_conditions",
237
+ "query_type": "condition_list",
238
+ "success": false,
239
+ "accuracy": 0.3333333333333333,
240
+ "fields": "1/3",
241
+ "hallucinations": 0,
242
+ "omissions": 1
243
+ },
244
+ {
245
+ "case_id": "patient-002_allergies",
246
+ "query_type": "allergy_list",
247
+ "success": false,
248
+ "accuracy": 0.3333333333333333,
249
+ "fields": "1/3",
250
+ "hallucinations": 0,
251
+ "omissions": 1
252
+ },
253
+ {
254
+ "case_id": "patient-002_immunizations",
255
+ "query_type": "immunization_list",
256
+ "success": false,
257
+ "accuracy": 0.3333333333333333,
258
+ "fields": "1/3",
259
+ "hallucinations": 0,
260
+ "omissions": 1
261
+ },
262
+ {
263
+ "case_id": "patient-002_procedures",
264
+ "query_type": "procedure_list",
265
+ "success": true,
266
+ "accuracy": 1.0,
267
+ "fields": "2/2",
268
+ "hallucinations": 0,
269
+ "omissions": 0
270
+ },
271
+ {
272
+ "case_id": "patient-002_encounters",
273
+ "query_type": "encounter_list",
274
+ "success": true,
275
+ "accuracy": 1.0,
276
+ "fields": "1/1",
277
+ "hallucinations": 0,
278
+ "omissions": 0
279
+ },
280
+ {
281
+ "case_id": "patient-002_lab_a1c",
282
+ "query_type": "lab_trend",
283
+ "success": true,
284
+ "accuracy": 1.0,
285
+ "fields": "4/4",
286
+ "hallucinations": 0,
287
+ "omissions": 0
288
+ },
289
+ {
290
+ "case_id": "patient-003_vital_blood_pressure",
291
+ "query_type": "vital_trend",
292
+ "success": true,
293
+ "accuracy": 1.0,
294
+ "fields": "14/14",
295
+ "hallucinations": 0,
296
+ "omissions": 0
297
+ },
298
+ {
299
+ "case_id": "patient-003_vital_heart_rate",
300
+ "query_type": "vital_trend",
301
+ "success": true,
302
+ "accuracy": 0.8571428571428571,
303
+ "fields": "6/7",
304
+ "hallucinations": 0,
305
+ "omissions": 0
306
+ },
307
+ {
308
+ "case_id": "patient-003_meds_all",
309
+ "query_type": "medication_list",
310
+ "success": false,
311
+ "accuracy": 0.3333333333333333,
312
+ "fields": "1/3",
313
+ "hallucinations": 0,
314
+ "omissions": 1
315
+ },
316
+ {
317
+ "case_id": "patient-003_meds_active",
318
+ "query_type": "medication_list",
319
+ "success": true,
320
+ "accuracy": 1.0,
321
+ "fields": "3/3",
322
+ "hallucinations": 0,
323
+ "omissions": 0
324
+ },
325
+ {
326
+ "case_id": "patient-003_conditions",
327
+ "query_type": "condition_list",
328
+ "success": false,
329
+ "accuracy": 0.3333333333333333,
330
+ "fields": "1/3",
331
+ "hallucinations": 0,
332
+ "omissions": 1
333
+ },
334
+ {
335
+ "case_id": "patient-003_allergies",
336
+ "query_type": "allergy_list",
337
+ "success": true,
338
+ "accuracy": 1.0,
339
+ "fields": "2/2",
340
+ "hallucinations": 0,
341
+ "omissions": 0
342
+ },
343
+ {
344
+ "case_id": "patient-003_immunizations",
345
+ "query_type": "immunization_list",
346
+ "success": true,
347
+ "accuracy": 1.0,
348
+ "fields": "3/3",
349
+ "hallucinations": 0,
350
+ "omissions": 0
351
+ },
352
+ {
353
+ "case_id": "patient-003_procedures",
354
+ "query_type": "procedure_list",
355
+ "success": false,
356
+ "accuracy": 0.5,
357
+ "fields": "2/4",
358
+ "hallucinations": 1,
359
+ "omissions": 0
360
+ },
361
+ {
362
+ "case_id": "patient-003_encounters",
363
+ "query_type": "encounter_list",
364
+ "success": true,
365
+ "accuracy": 1.0,
366
+ "fields": "1/1",
367
+ "hallucinations": 0,
368
+ "omissions": 0
369
+ },
370
+ {
371
+ "case_id": "patient-003_lab_a1c",
372
+ "query_type": "lab_trend",
373
+ "success": true,
374
+ "accuracy": 1.0,
375
+ "fields": "4/4",
376
+ "hallucinations": 0,
377
+ "omissions": 0
378
+ }
379
+ ]
380
+ }
evaluation/reports/eval_report_20260127_174147.txt ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ============================================================
2
+ PRE-VISIT SUMMARY EVALUATION REPORT
3
+ Generated: 2026-01-27 17:41:47
4
+ ============================================================
5
+
6
+ OVERALL RESULTS
7
+ ----------------------------------------
8
+ Total Test Cases: 30
9
+ Successful: 20
10
+ Failed: 10
11
+ Success Rate: 66.7%
12
+
13
+ Total Fields Checked: 132
14
+ Correct Fields: 107
15
+ Field Accuracy: 81.1%
16
+
17
+ Hallucinations: 7 (5.3%)
18
+ Omissions: 7 (5.3%)
19
+ Mismatches: 3
20
+
21
+ BY QUERY TYPE
22
+ ----------------------------------------
23
+ Query Type Success Accuracy Hall.
24
+ ------------------------------------------------------------
25
+ vital_trend 83.3% 88.9% 3
26
+ medication_list 66.7% 78.9% 1
27
+ condition_list 33.3% 55.6% 0
28
+ allergy_list 33.3% 50.0% 1
29
+ immunization_list 33.3% 60.0% 1
30
+ procedure_list 66.7% 75.0% 1
31
+ encounter_list 100.0% 100.0% 0
32
+ lab_trend 100.0% 100.0% 0
33
+
34
+ FAILED CASES
35
+ ----------------------------------------
36
+ patient-001_meds_active
37
+ Type: medication_list, Accuracy: 50.0%
38
+ Hallucinations: 1, Omissions: 0
39
+ patient-001_allergies
40
+ Type: allergy_list, Accuracy: 33.3%
41
+ Hallucinations: 1, Omissions: 1
42
+ patient-001_immunizations
43
+ Type: immunization_list, Accuracy: 50.0%
44
+ Hallucinations: 1, Omissions: 0
45
+ patient-002_vital_blood_pressure
46
+ Type: vital_trend, Accuracy: 78.6%
47
+ Hallucinations: 1, Omissions: 0
48
+ patient-002_conditions
49
+ Type: condition_list, Accuracy: 33.3%
50
+ Hallucinations: 0, Omissions: 1
51
+ patient-002_allergies
52
+ Type: allergy_list, Accuracy: 33.3%
53
+ Hallucinations: 0, Omissions: 1
54
+ patient-002_immunizations
55
+ Type: immunization_list, Accuracy: 33.3%
56
+ Hallucinations: 0, Omissions: 1
57
+ patient-003_meds_all
58
+ Type: medication_list, Accuracy: 33.3%
59
+ Hallucinations: 0, Omissions: 1
60
+ patient-003_conditions
61
+ Type: condition_list, Accuracy: 33.3%
62
+ Hallucinations: 0, Omissions: 1
63
+ patient-003_procedures
64
+ Type: procedure_list, Accuracy: 50.0%
65
+ Hallucinations: 1, Omissions: 0
66
+
67
+ ============================================================
evaluation/run_evaluation.py ADDED
@@ -0,0 +1,246 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Evaluation Runner
4
+
5
+ Main entry point for running the pre-visit summary evaluation.
6
+
7
+ This can be run in two modes:
8
+ 1. Direct mode: Directly compute expected vs actual from database (no LLM needed)
9
+ 2. Agent mode: Run actual agent queries and extract facts from responses
10
+
11
+ For initial testing, we use direct mode to validate the evaluation framework.
12
+ """
13
+
14
+ import os
15
+ import sys
16
+ import json
17
+ import argparse
18
+ from datetime import datetime
19
+ from typing import Dict, List, Any
20
+
21
+ # Add parent directory to path for imports
22
+ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
23
+
24
+ from evaluation.test_generator import generate_all_test_cases, get_test_summary
25
+ from evaluation.expected_values import compute_expected_values
26
+ from evaluation.evaluator import evaluate_case, CaseEvaluation
27
+ from evaluation.metrics import aggregate_metrics, format_report, save_report
28
+
29
+
30
def run_direct_evaluation(num_patients: int = 10, output_dir: str = None) -> Dict:
    """Run the evaluation in direct (framework-validation) mode.

    Workflow:
      1. Generate test cases from the database.
      2. Compute expected (ground-truth) values for each case.
      3. Feed the expected values back in as the "actual" facts,
         simulating a perfect agent.
      4. Aggregate metrics and optionally save reports.

    Because actual == expected, a correctly working framework must
    report a 100% score; anything less indicates an evaluation bug.

    Args:
        num_patients: Number of patients to generate cases for.
        output_dir: Optional directory for saving text/JSON reports.

    Returns:
        The aggregated metrics as a plain dict.
    """
    banner = "=" * 60
    print(banner)
    print("PRE-VISIT SUMMARY EVALUATION - DIRECT MODE")
    print(banner)
    print(f"\nGenerating test cases for {num_patients} patients...")

    cases = generate_all_test_cases(num_patients=num_patients)
    summary = get_test_summary(cases)

    print(f"Generated {summary['total_cases']} test cases")
    print("\nBy query type:")
    for qtype, count in sorted(summary["by_type"].items()):
        print(f" {qtype}: {count}")

    print("\nRunning evaluation...")

    evaluations = []
    for idx, case in enumerate(cases):
        # Ground truth straight from the database.
        expected = compute_expected_values(case)

        # A "perfect agent" reports exactly the expected values.
        evaluations.append(evaluate_case(case, expected, expected.copy()))

        # Lightweight progress indicator every 20 cases.
        if (idx + 1) % 20 == 0:
            print(f" Processed {idx + 1}/{len(cases)} cases...")

    metrics = aggregate_metrics(evaluations)

    print("\n" + format_report(metrics))

    if output_dir:
        text_path, json_path = save_report(metrics, output_dir)
        print(f"\nReports saved to:")
        print(f" {text_path}")
        print(f" {json_path}")

    return metrics.to_dict()
89
+
90
+
91
def run_simulated_evaluation(num_patients: int = 10, error_rate: float = 0.1,
                             output_dir: str = None) -> Dict:
    """
    Run evaluation with simulated errors.

    This mode introduces controlled errors (via introduce_errors) into the
    expected values to verify that the evaluation framework actually
    detects numeric perturbations, omissions, and hallucinations.

    Args:
        num_patients: Number of patients to test
        error_rate: Fraction of values to corrupt (0.0 - 1.0)
        output_dir: Directory to save reports (None to skip saving)

    Returns:
        Aggregated metrics as a plain dict.
    """
    # NOTE: no `import random` here — all randomness is encapsulated in
    # introduce_errors(); the previous local import was dead code.
    print("=" * 60)
    print("PRE-VISIT SUMMARY EVALUATION - SIMULATED ERROR MODE")
    print(f"Error rate: {error_rate:.0%}")
    print("=" * 60)

    print(f"\nGenerating test cases for {num_patients} patients...")

    test_cases = generate_all_test_cases(num_patients=num_patients)
    summary = get_test_summary(test_cases)

    print(f"Generated {summary['total_cases']} test cases")

    print("\nRunning evaluation with simulated errors...")

    evaluations = []
    for i, test_case in enumerate(test_cases):
        expected = compute_expected_values(test_case)

        # Corrupt a controlled fraction of the expected values.
        actual_facts = introduce_errors(expected, error_rate)

        evaluation = evaluate_case(test_case, expected, actual_facts)
        evaluations.append(evaluation)

        # Lightweight progress indicator every 20 cases.
        if (i + 1) % 20 == 0:
            print(f" Processed {i + 1}/{len(test_cases)} cases...")

    metrics = aggregate_metrics(evaluations)

    print("\n" + format_report(metrics))

    if output_dir:
        text_path, json_path = save_report(metrics, output_dir)
        print(f"\nReports saved to:")
        print(f" {text_path}")
        print(f" {json_path}")

    return metrics.to_dict()
144
+
145
+
146
def introduce_errors(expected: Dict, error_rate: float) -> Dict:
    """Return a corrupted deep copy of *expected* for framework testing.

    Error types injected (each with probability ``error_rate``):
      - numeric perturbation of metric values (shifted by up to +/-5)
      - omission of items from list-style fields
      - hallucination (one fake item appended to a list-style field)

    The input dict is never mutated; a deep copy is corrupted instead.
    """
    import random
    import copy

    corrupted = copy.deepcopy(expected)

    # Vital/lab trend cases carry a nested "metrics" dict of numbers.
    if "metrics" in corrupted:
        for label_metrics in corrupted["metrics"].values():
            if not isinstance(label_metrics, dict):
                continue
            for key, value in list(label_metrics.items()):
                if random.random() >= error_rate:
                    continue
                if isinstance(value, (int, float)) and key != "count":
                    # Shift the number by a small random amount.
                    label_metrics[key] = round(value + random.uniform(-5, 5), 1)
                elif key == "count" and random.random() < 0.5:
                    # Occasionally drop the count entirely.
                    label_metrics[key] = None

    # List-style cases: drop items and sometimes append a fake one.
    list_fields = ("medication_names", "condition_names", "substances",
                   "vaccine_names", "procedure_names")
    for field in list_fields:
        if field not in corrupted:
            continue

        kept = []
        for item in corrupted[field]:
            if random.random() < error_rate:
                # Omission: silently skip this item.
                continue
            kept.append(item)

        if random.random() < error_rate:
            # Hallucination: invent an item that was never in the data.
            kept.append(f"FAKE_ITEM_{random.randint(1000, 9999)}")

        corrupted[field] = kept
        corrupted["count"] = len(kept)

    return corrupted
194
+
195
+
196
def main():
    """Parse CLI arguments and dispatch to the selected evaluation mode."""
    parser = argparse.ArgumentParser(description="Run pre-visit summary evaluation")

    parser.add_argument(
        "--mode",
        choices=["direct", "simulated"],
        default="direct",
        help="Evaluation mode: 'direct' for perfect agent, 'simulated' for errors",
    )
    parser.add_argument(
        "--patients",
        type=int,
        default=10,
        help="Number of patients to test (default: 10)",
    )
    parser.add_argument(
        "--error-rate",
        type=float,
        default=0.1,
        help="Error rate for simulated mode (default: 0.1)",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default="evaluation/reports",
        help="Directory to save reports (default: evaluation/reports)",
    )

    args = parser.parse_args()

    # Make sure reports can be written before any work starts.
    os.makedirs(args.output_dir, exist_ok=True)

    if args.mode == "direct":
        run_direct_evaluation(
            num_patients=args.patients,
            output_dir=args.output_dir,
        )
    else:
        run_simulated_evaluation(
            num_patients=args.patients,
            error_rate=args.error_rate,
            output_dir=args.output_dir,
        )
243
+
244
+
245
# Allow running as a script, e.g. `python -m evaluation.run_evaluation`.
if __name__ == "__main__":
    main()
evaluation/test_generator.py ADDED
@@ -0,0 +1,357 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Test Case Generator for Pre-Visit Summary Evaluation
4
+
5
+ Generates test cases from Synthea patient data with known ground truth.
6
+ """
7
+
8
+ import sqlite3
9
+ import random
10
+ from datetime import datetime, timedelta
11
+ from typing import List, Dict, Any
12
+ import os
13
+
14
# Path to the SQLite FHIR database; override with the DB_PATH env var.
DB_PATH = os.getenv("DB_PATH", "data/fhir.db")
15
+
16
+
17
def get_db():
    """Open a SQLite connection with dict-like (sqlite3.Row) row access."""
    connection = sqlite3.connect(DB_PATH)
    connection.row_factory = sqlite3.Row
    return connection
22
+
23
+
24
def get_test_patients(limit: int = 10) -> List[Dict]:
    """Select patients with enough data to serve as test subjects.

    Only patients with more than 10 observations qualify; the richest
    patients (by observation count) are returned first.
    """
    conn = get_db()
    try:
        rows = conn.execute("""
            SELECT p.id, p.given_name, p.family_name, p.birth_date, p.gender,
                (SELECT COUNT(*) FROM conditions WHERE patient_id = p.id) as condition_count,
                (SELECT COUNT(*) FROM medications WHERE patient_id = p.id) as med_count,
                (SELECT COUNT(*) FROM observations WHERE patient_id = p.id) as obs_count,
                (SELECT COUNT(*) FROM allergies WHERE patient_id = p.id) as allergy_count,
                (SELECT COUNT(*) FROM immunizations WHERE patient_id = p.id) as imm_count,
                (SELECT COUNT(*) FROM procedures WHERE patient_id = p.id) as proc_count,
                (SELECT COUNT(*) FROM encounters WHERE patient_id = p.id) as enc_count
            FROM patients p
            WHERE (SELECT COUNT(*) FROM observations WHERE patient_id = p.id) > 10
            ORDER BY obs_count DESC
            LIMIT ?
        """, (limit,)).fetchall()

        # Reshape each DB row into the dict format the generators expect.
        return [
            {
                "patient_id": row["id"],
                "name": f"{row['given_name']} {row['family_name']}",
                "birth_date": row["birth_date"],
                "gender": row["gender"],
                "data_counts": {
                    "conditions": row["condition_count"],
                    "medications": row["med_count"],
                    "observations": row["obs_count"],
                    "allergies": row["allergy_count"],
                    "immunizations": row["imm_count"],
                    "procedures": row["proc_count"],
                    "encounters": row["enc_count"]
                }
            }
            for row in rows
        ]
    finally:
        conn.close()
64
+
65
+
66
def generate_vital_trend_cases(patient_id: str, days: int = 30) -> List[Dict]:
    """Build vital-sign trend test cases (BP, heart rate, ...) for one patient.

    A vital is only included when the patient has at least 3 recorded
    observations for it, so the trend query is meaningful.
    """
    # (name, LOINC codes, series labels) for each supported vital sign.
    vital_types = [
        ("blood_pressure", ["8480-6", "8462-4"], ["systolic", "diastolic"]),
        ("heart_rate", ["8867-4"], ["heart_rate"]),
        ("weight", ["29463-7"], ["weight"]),
        ("temperature", ["8310-5"], ["temperature"]),
        ("oxygen_saturation", ["2708-6"], ["oxygen_saturation"]),
    ]

    cases: List[Dict] = []
    conn = get_db()
    try:
        for vital_name, codes, labels in vital_types:
            # Count this patient's observations for the vital's codes.
            placeholders = ",".join("?" * len(codes))
            row = conn.execute(f"""
                SELECT COUNT(*) as cnt FROM observations
                WHERE patient_id = ? AND code IN ({placeholders})
            """, [patient_id] + codes).fetchone()

            if row["cnt"] < 3:  # too few readings for a meaningful trend
                continue

            cases.append({
                "case_id": f"{patient_id}_vital_{vital_name}",
                "patient_id": patient_id,
                "query_type": "vital_trend",
                "query": f"Show me my {vital_name.replace('_', ' ')} chart",
                "parameters": {
                    "vital_type": vital_name,
                    "days": days,
                    "codes": codes,
                    "labels": labels
                }
            })
    finally:
        conn.close()

    return cases
106
+
107
+
108
def generate_medication_cases(patient_id: str) -> List[Dict]:
    """Build medication-list test cases for one patient.

    Emits an "all medications" case when any medication rows exist, plus
    an "active only" case when at least one medication is active.
    """
    cases: List[Dict] = []

    conn = get_db()
    try:
        # One query yields both the total and the active-medication count.
        row = conn.execute("""
            SELECT COUNT(*) as total,
                SUM(CASE WHEN status = 'active' THEN 1 ELSE 0 END) as active
            FROM medications WHERE patient_id = ?
        """, (patient_id,)).fetchone()

        if row["total"] > 0:
            cases.append({
                "case_id": f"{patient_id}_meds_all",
                "patient_id": patient_id,
                "query_type": "medication_list",
                "query": "What medications am I taking?",
                "parameters": {"status": None}
            })

            if row["active"] > 0:
                cases.append({
                    "case_id": f"{patient_id}_meds_active",
                    "patient_id": patient_id,
                    "query_type": "medication_list",
                    "query": "What are my current active medications?",
                    "parameters": {"status": "active"}
                })
    finally:
        conn.close()

    return cases
145
+
146
+
147
def generate_condition_cases(patient_id: str) -> List[Dict]:
    """Build a condition-list test case if the patient has any conditions."""
    conn = get_db()
    try:
        row = conn.execute("""
            SELECT COUNT(*) as cnt FROM conditions WHERE patient_id = ?
        """, (patient_id,)).fetchone()
        has_data = row["cnt"] > 0
    finally:
        conn.close()

    if not has_data:
        return []

    return [{
        "case_id": f"{patient_id}_conditions",
        "patient_id": patient_id,
        "query_type": "condition_list",
        "query": "What are my medical conditions?",
        "parameters": {}
    }]
169
+
170
+
171
def generate_allergy_cases(patient_id: str) -> List[Dict]:
    """Build an allergy-list test case if the patient has any allergies."""
    conn = get_db()
    try:
        row = conn.execute("""
            SELECT COUNT(*) as cnt FROM allergies WHERE patient_id = ?
        """, (patient_id,)).fetchone()
        has_data = row["cnt"] > 0
    finally:
        conn.close()

    if not has_data:
        return []

    return [{
        "case_id": f"{patient_id}_allergies",
        "patient_id": patient_id,
        "query_type": "allergy_list",
        "query": "What are my allergies?",
        "parameters": {}
    }]
193
+
194
+
195
def generate_immunization_cases(patient_id: str) -> List[Dict]:
    """Build an immunization-list test case if the patient has any records."""
    conn = get_db()
    try:
        row = conn.execute("""
            SELECT COUNT(*) as cnt FROM immunizations WHERE patient_id = ?
        """, (patient_id,)).fetchone()
        has_data = row["cnt"] > 0
    finally:
        conn.close()

    if not has_data:
        return []

    return [{
        "case_id": f"{patient_id}_immunizations",
        "patient_id": patient_id,
        "query_type": "immunization_list",
        "query": "What immunizations have I had?",
        "parameters": {}
    }]
217
+
218
+
219
def generate_procedure_cases(patient_id: str) -> List[Dict]:
    """Build a procedure-history test case if the patient has any procedures."""
    conn = get_db()
    try:
        row = conn.execute("""
            SELECT COUNT(*) as cnt FROM procedures WHERE patient_id = ?
        """, (patient_id,)).fetchone()
        has_data = row["cnt"] > 0
    finally:
        conn.close()

    if not has_data:
        return []

    return [{
        "case_id": f"{patient_id}_procedures",
        "patient_id": patient_id,
        "query_type": "procedure_list",
        "query": "What procedures or surgeries have I had?",
        "parameters": {}
    }]
241
+
242
+
243
def generate_encounter_cases(patient_id: str) -> List[Dict]:
    """Build an encounter-history test case if the patient has any visits."""
    conn = get_db()
    try:
        row = conn.execute("""
            SELECT COUNT(*) as cnt FROM encounters WHERE patient_id = ?
        """, (patient_id,)).fetchone()
        has_data = row["cnt"] > 0
    finally:
        conn.close()

    if not has_data:
        return []

    return [{
        "case_id": f"{patient_id}_encounters",
        "patient_id": patient_id,
        "query_type": "encounter_list",
        "query": "Show me my recent visits",
        "parameters": {"limit": 5}
    }]
265
+
266
+
267
def generate_lab_cases(patient_id: str) -> List[Dict]:
    """Build lab-trend test cases for labs with at least two results.

    A single result is not a trend, so labs with fewer than two
    observations are skipped.
    """
    # (short name, LOINC code, display name) for each supported lab.
    lab_types = [
        ("a1c", "4548-4", "HbA1c"),
        ("cholesterol", "2093-3", "Total Cholesterol"),
        ("glucose", "2345-7", "Glucose"),
    ]

    cases: List[Dict] = []
    conn = get_db()
    try:
        for lab_name, code, display in lab_types:
            row = conn.execute("""
                SELECT COUNT(*) as cnt FROM observations
                WHERE patient_id = ? AND code = ?
            """, (patient_id, code)).fetchone()

            if row["cnt"] < 2:
                continue

            cases.append({
                "case_id": f"{patient_id}_lab_{lab_name}",
                "patient_id": patient_id,
                "query_type": "lab_trend",
                "query": f"Show me my {display} history",
                "parameters": {
                    "lab_type": lab_name,
                    "code": code
                }
            })
    finally:
        conn.close()

    return cases
300
+
301
+
302
def generate_all_test_cases(num_patients: int = 10) -> List[Dict]:
    """Build the full test suite across all query types for each patient."""
    # One generator per query type, invoked in a fixed order so case
    # ordering is stable run-to-run.
    generators = (
        generate_vital_trend_cases,
        generate_medication_cases,
        generate_condition_cases,
        generate_allergy_cases,
        generate_immunization_cases,
        generate_procedure_cases,
        generate_encounter_cases,
        generate_lab_cases,
    )

    cases: List[Dict] = []
    for patient in get_test_patients(num_patients):
        pid = patient["patient_id"]
        for generator in generators:
            cases.extend(generator(pid))

    return cases
321
+
322
+
323
def get_test_summary(test_cases: List[Dict]) -> Dict:
    """Summarize a generated test suite by size, query type, and patient."""
    by_type: Dict[str, int] = {}
    by_patient: Dict[str, int] = {}

    for case in test_cases:
        # Tally each case under both its query type and its patient.
        qtype = case["query_type"]
        pid = case["patient_id"]
        by_type[qtype] = by_type.get(qtype, 0) + 1
        by_patient[pid] = by_patient.get(pid, 0) + 1

    return {
        "total_cases": len(test_cases),
        "by_type": by_type,
        "by_patient": by_patient,
    }
341
+
342
+
343
if __name__ == "__main__":
    # Smoke test: generate cases for a few patients and print a summary.
    print("Generating test cases...")
    cases = generate_all_test_cases(num_patients=5)
    summary = get_test_summary(cases)

    print(f"\nTotal test cases: {summary['total_cases']}")
    print("\nBy query type:")
    for qtype, count in sorted(summary["by_type"].items()):
        print(f" {qtype}: {count}")

    print("\nSample test case:")
    if cases:
        # Local import: json is only needed for this demo output.
        import json
        print(json.dumps(cases[0], indent=2))