Spaces:
Sleeping
Sleeping
frabbani commited on
Commit ·
8daa8bf
1
Parent(s): dc3f8a9
Fix fact extraction - pass raw data for simple tools.......
Browse files- .DS_Store +0 -0
- evaluation/__init__.py +37 -0
- evaluation/create_test_db.py +291 -0
- evaluation/evaluator.py +438 -0
- evaluation/expected_values.py +386 -0
- evaluation/facts_schema.py +232 -0
- evaluation/metrics.py +265 -0
- evaluation/reports/eval_report_20260127_174121.json +380 -0
- evaluation/reports/eval_report_20260127_174121.txt +34 -0
- evaluation/reports/eval_report_20260127_174147.json +380 -0
- evaluation/reports/eval_report_20260127_174147.txt +67 -0
- evaluation/run_evaluation.py +246 -0
- evaluation/test_generator.py +357 -0
.DS_Store
CHANGED
|
Binary files a/.DS_Store and b/.DS_Store differ
|
|
|
evaluation/__init__.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""
Evaluation Framework for Pre-Visit Summary Agent

This package provides tools to evaluate the accuracy of the
pre-visit summary agent's data retrieval and reporting.

Modules:
    test_generator: Generates test cases from Synthea database
    expected_values: Computes ground truth values from database
    evaluator: Compares agent facts vs expected values
    metrics: Aggregates results and computes summary statistics
    facts_schema: Defines structured output format for agent
    run_evaluation: Main entry point for running evaluations

Usage:
    # Run direct evaluation (validates framework)
    python -m evaluation.run_evaluation --mode direct --patients 10

    # Run simulated evaluation (tests error detection)
    python -m evaluation.run_evaluation --mode simulated --error-rate 0.15
"""

# Re-export the main entry points so callers can do
# `from evaluation import evaluate_case` instead of reaching into submodules.
from .test_generator import generate_all_test_cases, get_test_summary
from .expected_values import compute_expected_values
from .evaluator import evaluate_case, CaseEvaluation
from .metrics import aggregate_metrics, format_report, EvaluationMetrics

# Public API of the package.
__all__ = [
    "generate_all_test_cases",
    "get_test_summary",
    "compute_expected_values",
    "evaluate_case",
    "CaseEvaluation",
    "aggregate_metrics",
    "format_report",
    "EvaluationMetrics"
]
|
evaluation/create_test_db.py
ADDED
|
@@ -0,0 +1,291 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#!/usr/bin/env python3
"""
Create a minimal test database for evaluation framework validation.

This creates a small SQLite database with sample patient data
that can be used to test the evaluation framework.
"""

import sqlite3
import os
from datetime import datetime, timedelta
import random

# Default location of the generated SQLite database. Note this is a
# path relative to the current working directory — run from the project root.
DB_PATH = "data/fhir.db"
def create_test_database(db_path=None, seed=None):
    """Create a small SQLite test database with sample clinical data.

    Builds the schema (patients, conditions, medications, observations,
    allergies, immunizations, procedures, encounters) and populates it with
    3 fixed patients plus randomly generated vitals/labs.

    Args:
        db_path: Destination database file. Defaults to the module-level
            DB_PATH ("data/fhir.db"). New parameter; default preserves the
            original behavior.
        seed: Optional seed for the random generator so vitals/labs are
            reproducible. Default (None) keeps the original
            non-deterministic behavior.
    """
    if db_path is None:
        db_path = DB_PATH
    if seed is not None:
        random.seed(seed)

    # dirname() is "" for a bare filename; fall back to the current dir
    # so makedirs does not raise on an empty path.
    os.makedirs(os.path.dirname(db_path) or ".", exist_ok=True)

    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()

        # Create tables
        cursor.executescript("""
            -- Patients table
            CREATE TABLE IF NOT EXISTS patients (
                id TEXT PRIMARY KEY,
                given_name TEXT,
                family_name TEXT,
                birth_date TEXT,
                gender TEXT,
                marital_status TEXT
            );

            -- Conditions table
            CREATE TABLE IF NOT EXISTS conditions (
                id TEXT PRIMARY KEY,
                patient_id TEXT,
                code TEXT,
                display TEXT,
                clinical_status TEXT,
                onset_date TEXT,
                abatement_date TEXT
            );

            -- Medications table
            CREATE TABLE IF NOT EXISTS medications (
                id TEXT PRIMARY KEY,
                patient_id TEXT,
                code TEXT,
                display TEXT,
                status TEXT,
                start_date TEXT
            );

            -- Observations table
            CREATE TABLE IF NOT EXISTS observations (
                id TEXT PRIMARY KEY,
                patient_id TEXT,
                code TEXT,
                display TEXT,
                value_quantity REAL,
                unit TEXT,
                effective_date TEXT,
                category TEXT
            );

            -- Allergies table
            CREATE TABLE IF NOT EXISTS allergies (
                id TEXT PRIMARY KEY,
                patient_id TEXT,
                substance TEXT,
                reaction_display TEXT,
                criticality TEXT,
                category TEXT
            );

            -- Immunizations table
            CREATE TABLE IF NOT EXISTS immunizations (
                id TEXT PRIMARY KEY,
                patient_id TEXT,
                vaccine_code TEXT,
                vaccine_display TEXT,
                status TEXT,
                occurrence_date TEXT
            );

            -- Procedures table
            CREATE TABLE IF NOT EXISTS procedures (
                id TEXT PRIMARY KEY,
                patient_id TEXT,
                code TEXT,
                display TEXT,
                status TEXT,
                performed_date TEXT
            );

            -- Encounters table
            CREATE TABLE IF NOT EXISTS encounters (
                id TEXT PRIMARY KEY,
                patient_id TEXT,
                status TEXT,
                class_code TEXT,
                class_display TEXT,
                type_code TEXT,
                type_display TEXT,
                reason_code TEXT,
                reason_display TEXT,
                period_start TEXT,
                period_end TEXT
            );
        """)

        # Create test patients
        patients = [
            ("patient-001", "John", "Smith", "1965-03-15", "male"),
            ("patient-002", "Mary", "Johnson", "1978-07-22", "female"),
            ("patient-003", "Robert", "Williams", "1952-11-08", "male"),
        ]

        for pid, given, family, dob, gender in patients:
            cursor.execute("""
                INSERT OR REPLACE INTO patients (id, given_name, family_name, birth_date, gender)
                VALUES (?, ?, ?, ?, ?)
            """, (pid, given, family, dob, gender))

        # Create conditions
        conditions = [
            ("patient-001", "44054006", "Type 2 Diabetes Mellitus", "active", "2015-06-10"),
            ("patient-001", "38341003", "Essential Hypertension", "active", "2018-02-15"),
            ("patient-002", "195967001", "Asthma", "active", "2010-04-20"),
            ("patient-002", "73211009", "Type 2 Diabetes Mellitus", "active", "2020-01-10"),
            ("patient-003", "38341003", "Essential Hypertension", "active", "2005-08-12"),
            ("patient-003", "13644009", "Hypercholesterolemia", "active", "2010-03-25"),
        ]

        for i, (pid, code, display, status, onset) in enumerate(conditions):
            cursor.execute("""
                INSERT OR REPLACE INTO conditions (id, patient_id, code, display, clinical_status, onset_date)
                VALUES (?, ?, ?, ?, ?, ?)
            """, (f"cond-{i+1:03d}", pid, code, display, status, onset))

        # Create medications
        medications = [
            ("patient-001", "860975", "Metformin 500 MG Oral Tablet", "active", "2015-06-15"),
            ("patient-001", "314076", "Lisinopril 10 MG Oral Tablet", "active", "2018-02-20"),
            ("patient-002", "895994", "Albuterol 90 MCG Inhaler", "active", "2010-05-01"),
            ("patient-002", "860975", "Metformin 500 MG Oral Tablet", "active", "2020-01-15"),
            ("patient-003", "314076", "Lisinopril 20 MG Oral Tablet", "active", "2005-08-20"),
            ("patient-003", "316672", "Atorvastatin 20 MG Oral Tablet", "active", "2010-04-01"),
        ]

        for i, (pid, code, display, status, start) in enumerate(medications):
            cursor.execute("""
                INSERT OR REPLACE INTO medications (id, patient_id, code, display, status, start_date)
                VALUES (?, ?, ?, ?, ?, ?)
            """, (f"med-{i+1:03d}", pid, code, display, status, start))

        # Create observations (vitals)
        base_date = datetime.now()

        for pid in ["patient-001", "patient-002", "patient-003"]:
            obs_id = 1

            # Blood pressure readings over last 30 days
            for days_ago in range(0, 30, 5):
                date = (base_date - timedelta(days=days_ago)).strftime("%Y-%m-%d")
                systolic = random.randint(120, 145)
                diastolic = random.randint(75, 95)

                cursor.execute("""
                    INSERT OR REPLACE INTO observations
                    (id, patient_id, code, display, value_quantity, unit, effective_date, category)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?)
                """, (f"obs-{pid}-{obs_id}", pid, "8480-6", "Systolic Blood Pressure",
                      systolic, "mmHg", date, "vital-signs"))
                obs_id += 1

                cursor.execute("""
                    INSERT OR REPLACE INTO observations
                    (id, patient_id, code, display, value_quantity, unit, effective_date, category)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?)
                """, (f"obs-{pid}-{obs_id}", pid, "8462-4", "Diastolic Blood Pressure",
                      diastolic, "mmHg", date, "vital-signs"))
                obs_id += 1

                # Heart rate
                hr = random.randint(65, 85)
                cursor.execute("""
                    INSERT OR REPLACE INTO observations
                    (id, patient_id, code, display, value_quantity, unit, effective_date, category)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?)
                """, (f"obs-{pid}-{obs_id}", pid, "8867-4", "Heart Rate",
                      hr, "/min", date, "vital-signs"))
                obs_id += 1

            # A1c readings (quarterly)
            for months_ago in [0, 3, 6, 9]:
                date = (base_date - timedelta(days=months_ago*30)).strftime("%Y-%m-%d")
                a1c = round(random.uniform(6.0, 8.5), 1)

                cursor.execute("""
                    INSERT OR REPLACE INTO observations
                    (id, patient_id, code, display, value_quantity, unit, effective_date, category)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?)
                """, (f"obs-{pid}-{obs_id}", pid, "4548-4", "Hemoglobin A1c",
                      a1c, "%", date, "laboratory"))
                obs_id += 1

        # Create allergies
        allergies = [
            ("patient-001", "Penicillin", "Hives", "high", "medication"),
            ("patient-002", "Peanuts", "Anaphylaxis", "high", "food"),
            ("patient-002", "Latex", "Rash", "low", "environment"),
            ("patient-003", "Sulfa drugs", "Rash", "moderate", "medication"),
        ]

        for i, (pid, substance, reaction, criticality, category) in enumerate(allergies):
            cursor.execute("""
                INSERT OR REPLACE INTO allergies
                (id, patient_id, substance, reaction_display, criticality, category)
                VALUES (?, ?, ?, ?, ?, ?)
            """, (f"allergy-{i+1:03d}", pid, substance, reaction, criticality, category))

        # Create immunizations
        immunizations = [
            ("patient-001", "140", "Influenza Vaccine", "completed", "2024-10-15"),
            ("patient-001", "207", "COVID-19 Vaccine", "completed", "2024-01-20"),
            ("patient-002", "140", "Influenza Vaccine", "completed", "2024-11-01"),
            ("patient-002", "113", "Tdap Vaccine", "completed", "2022-05-10"),
            ("patient-003", "140", "Influenza Vaccine", "completed", "2024-09-20"),
            ("patient-003", "33", "Pneumococcal Vaccine", "completed", "2023-03-15"),
        ]

        for i, (pid, code, display, status, date) in enumerate(immunizations):
            cursor.execute("""
                INSERT OR REPLACE INTO immunizations
                (id, patient_id, vaccine_code, vaccine_display, status, occurrence_date)
                VALUES (?, ?, ?, ?, ?, ?)
            """, (f"imm-{i+1:03d}", pid, code, display, status, date))

        # Create procedures
        procedures = [
            ("patient-001", "73761001", "Colonoscopy", "completed", "2023-06-15"),
            ("patient-002", "80146002", "Appendectomy", "completed", "2015-08-20"),
            ("patient-003", "232717009", "Coronary Angioplasty", "completed", "2020-02-10"),
            ("patient-003", "73761001", "Colonoscopy", "completed", "2022-04-05"),
        ]

        for i, (pid, code, display, status, date) in enumerate(procedures):
            cursor.execute("""
                INSERT OR REPLACE INTO procedures
                (id, patient_id, code, display, status, performed_date)
                VALUES (?, ?, ?, ?, ?, ?)
            """, (f"proc-{i+1:03d}", pid, code, display, status, date))

        # Create encounters
        for pid in ["patient-001", "patient-002", "patient-003"]:
            for i in range(5):
                days_ago = i * 60  # Every ~2 months
                start = (base_date - timedelta(days=days_ago)).strftime("%Y-%m-%d")
                end = start

                cursor.execute("""
                    INSERT OR REPLACE INTO encounters
                    (id, patient_id, status, class_code, class_display, type_code, type_display,
                     reason_code, reason_display, period_start, period_end)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                """, (f"enc-{pid}-{i+1}", pid, "finished", "AMB", "ambulatory",
                      "185349003", "Office Visit", "185349003", "Routine checkup",
                      start, end))

        conn.commit()
    finally:
        # Close even when an insert fails, so the file handle is not leaked.
        conn.close()

    print(f"Test database created at {db_path}")
    print("Contains:")
    print("  - 3 patients")
    print("  - 6 conditions")
    print("  - 6 medications")
    print("  - ~90 observations (vitals + labs)")
    print("  - 4 allergies")
    print("  - 6 immunizations")
    print("  - 4 procedures")
    print("  - 15 encounters")
# Allow running as a script: `python -m evaluation.create_test_db`
# or `python evaluation/create_test_db.py` from the project root.
if __name__ == "__main__":
    create_test_database()
evaluation/evaluator.py
ADDED
|
@@ -0,0 +1,438 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#!/usr/bin/env python3
"""
Evaluator

Compares agent-reported facts against expected values from database.
Computes accuracy metrics for each comparison.
"""

from typing import Dict, List, Any, Tuple, Optional
from dataclasses import dataclass, field
import math  # NOTE(review): appears unused in this portion of the file — confirm before removing


# Tolerances for numerical comparisons, keyed by the kind of value compared.
TOLERANCES = {
    "vital_value": 0.5,  # BP, heart rate, etc.
    "lab_value": 0.5,    # Lab results
    "average": 0.5,      # Computed averages
    "percentage": 1.0,   # Percentage values
    "count": 0,          # Counts must be exact
}
@dataclass
class ComparisonResult:
    """Result of comparing a single value."""
    # Name of the field that was compared (e.g. "systolic_avg", "count").
    field_name: str
    # Ground-truth value computed from the database.
    expected: Any
    # Value reported by the agent (None when the agent omitted it).
    actual: Any
    # True when the values agree (within tolerance for numeric fields).
    match: bool
    error_type: Optional[str] = None  # "hallucination", "omission", "mismatch", "tolerance"
    error_detail: Optional[str] = None  # Human-readable explanation of the failure.
@dataclass
class CaseEvaluation:
    """Evaluation result for a single test case."""
    # Identifier of the test case (filled in by the caller).
    case_id: str
    # Query category, e.g. "vital_trend", "medication_list".
    query_type: str
    # Overall pass/fail for the case (typically accuracy >= threshold).
    success: bool
    # Per-field comparison details.
    comparisons: List[ComparisonResult] = field(default_factory=list)

    # Summary stats
    total_fields: int = 0
    correct_fields: int = 0
    hallucinations: int = 0
    omissions: int = 0
    mismatches: int = 0

    def accuracy(self) -> float:
        """Return correct_fields / total_fields, or 0.0 when nothing was compared."""
        if self.total_fields == 0:
            return 0.0
        return self.correct_fields / self.total_fields
def values_match(expected: Any, actual: Any, tolerance: float = 0) -> bool:
    """Return True when *expected* and *actual* are considered equal.

    Numbers match within *tolerance*; strings match case-insensitively
    after trimming; lists match as case-insensitive sets (order and
    duplicates ignored); everything else falls back to ``==``. Two Nones
    match, one None never does.
    """
    # None handling: both missing is a match, exactly one missing is not.
    if expected is None or actual is None:
        return expected is None and actual is None

    pair = (expected, actual)

    # Numeric comparison with tolerance
    if all(isinstance(v, (int, float)) for v in pair):
        return abs(expected - actual) <= tolerance

    # String comparison (case-insensitive, trimmed)
    if all(isinstance(v, str) for v in pair):
        return expected.strip().lower() == actual.strip().lower()

    # List comparison (order-independent for certain types)
    if all(isinstance(v, list) for v in pair):
        return {str(v).lower() for v in expected} == {str(v).lower() for v in actual}

    # Default exact comparison
    return expected == actual
def compare_numeric(field_name: str, expected: float, actual: float,
                    tolerance: float) -> ComparisonResult:
    """Compare two numeric values and classify any disagreement.

    A missing actual value is an "omission"; a deviation up to 3x the
    tolerance is a "mismatch"; anything larger is treated as a
    "hallucination".
    """
    if actual is None:
        # The agent reported nothing for a value we expected.
        return ComparisonResult(
            field_name=field_name,
            expected=expected,
            actual=actual,
            match=False,
            error_type="omission",
            error_detail=f"Expected {expected}, got nothing",
        )

    diff = abs(expected - actual)
    if diff <= tolerance:
        return ComparisonResult(
            field_name=field_name,
            expected=expected,
            actual=actual,
            match=True,
        )

    # Beyond tolerance: near misses are mismatches, far-off values look invented.
    kind = "mismatch" if diff <= tolerance * 3 else "hallucination"
    return ComparisonResult(
        field_name=field_name,
        expected=expected,
        actual=actual,
        match=False,
        error_type=kind,
        error_detail=f"Expected {expected}, got {actual} (diff: {diff:.1f})",
    )
def compare_list_items(field_name: str, expected_items: List[str],
                       actual_items: List[str]) -> Tuple[List[ComparisonResult], int, int]:
    """
    Compare two lists of items (e.g., medication names).
    Returns comparisons, hallucination count, omission count.

    Items are normalized (lowercased, trimmed) and compared as sets, so
    order and duplicates do not matter.
    """
    exp = {s.lower().strip() for s in expected_items}
    act = {s.lower().strip() for s in actual_items}

    omitted = exp - act          # in expected but not reported
    invented = act - exp         # reported but not in the database
    item_field = f"{field_name}_item"

    results: List[ComparisonResult] = []

    # Items present on both sides.
    for item in exp & act:
        results.append(ComparisonResult(
            field_name=item_field,
            expected=item,
            actual=item,
            match=True,
        ))

    # Items the agent failed to report.
    for item in omitted:
        results.append(ComparisonResult(
            field_name=item_field,
            expected=item,
            actual=None,
            match=False,
            error_type="omission",
            error_detail=f"Missing: {item}",
        ))

    # Items the agent invented.
    for item in invented:
        results.append(ComparisonResult(
            field_name=item_field,
            expected=None,
            actual=item,
            match=False,
            error_type="hallucination",
            error_detail=f"Not in database: {item}",
        ))

    return results, len(invented), len(omitted)
def evaluate_vital_trend(expected: Dict, actual_facts: Dict) -> CaseEvaluation:
    """Evaluate a vital-trend response against expected values.

    For each metric label (e.g. "systolic") compares min/max/avg/latest/count
    within tolerance, plus the earliest/latest dates, and tallies errors by
    type. Success requires >= 80% field accuracy.

    Args:
        expected: Ground truth with a "metrics" mapping of label -> metric dict.
        actual_facts: Agent-reported facts with the same shape.
    """
    evaluation = CaseEvaluation(
        case_id="",
        query_type="vital_trend",
        success=True
    )

    if "metrics" not in expected:
        evaluation.success = False
        return evaluation

    def _tally(comparison: ComparisonResult) -> None:
        # Record one comparison and update the per-error-type counters.
        # (Factored out: this bookkeeping was previously duplicated per field.)
        evaluation.comparisons.append(comparison)
        evaluation.total_fields += 1
        if comparison.match:
            evaluation.correct_fields += 1
        elif comparison.error_type == "hallucination":
            evaluation.hallucinations += 1
        elif comparison.error_type == "omission":
            evaluation.omissions += 1
        else:
            evaluation.mismatches += 1

    for label, expected_metrics in expected["metrics"].items():
        actual_metrics = actual_facts.get("metrics", {}).get(label, {})

        # Compare each numeric metric within its tolerance.
        for metric_name in ["min", "max", "avg", "latest", "count"]:
            if metric_name not in expected_metrics:
                continue
            tolerance = TOLERANCES["count"] if metric_name == "count" else TOLERANCES["vital_value"]
            _tally(compare_numeric(
                f"{label}_{metric_name}",
                expected_metrics[metric_name],
                actual_metrics.get(metric_name),
                tolerance
            ))

        # Compare date range (exact string match via values_match).
        for date_field in ["earliest_date", "latest_date"]:
            if date_field not in expected_metrics:
                continue
            exp_date = expected_metrics[date_field]
            act_date = actual_metrics.get(date_field)

            match = values_match(exp_date, act_date)
            _tally(ComparisonResult(
                field_name=f"{label}_{date_field}",
                expected=exp_date,
                actual=act_date,
                match=match,
                error_type=None if match else "mismatch"
            ))

    evaluation.success = evaluation.accuracy() >= 0.8  # 80% threshold
    return evaluation
+
|
| 230 |
+
def evaluate_list_query(expected: Dict, actual_facts: Dict,
|
| 231 |
+
items_key: str, names_key: str) -> CaseEvaluation:
|
| 232 |
+
"""
|
| 233 |
+
Evaluate list-based queries (medications, conditions, allergies, etc.)
|
| 234 |
+
"""
|
| 235 |
+
evaluation = CaseEvaluation(
|
| 236 |
+
case_id="",
|
| 237 |
+
query_type=expected["query_type"],
|
| 238 |
+
success=True
|
| 239 |
+
)
|
| 240 |
+
|
| 241 |
+
# Compare count
|
| 242 |
+
exp_count = expected.get("count", 0)
|
| 243 |
+
act_count = actual_facts.get("count", 0)
|
| 244 |
+
|
| 245 |
+
count_comparison = compare_numeric("count", exp_count, act_count, TOLERANCES["count"])
|
| 246 |
+
evaluation.comparisons.append(count_comparison)
|
| 247 |
+
evaluation.total_fields += 1
|
| 248 |
+
if count_comparison.match:
|
| 249 |
+
evaluation.correct_fields += 1
|
| 250 |
+
|
| 251 |
+
# Compare item names
|
| 252 |
+
exp_names = expected.get(names_key, [])
|
| 253 |
+
act_names = actual_facts.get(names_key, [])
|
| 254 |
+
|
| 255 |
+
item_comparisons, hallucinations, omissions = compare_list_items(
|
| 256 |
+
items_key, exp_names, act_names
|
| 257 |
+
)
|
| 258 |
+
|
| 259 |
+
evaluation.comparisons.extend(item_comparisons)
|
| 260 |
+
evaluation.total_fields += len(item_comparisons)
|
| 261 |
+
evaluation.correct_fields += sum(1 for c in item_comparisons if c.match)
|
| 262 |
+
evaluation.hallucinations += hallucinations
|
| 263 |
+
evaluation.omissions += omissions
|
| 264 |
+
|
| 265 |
+
evaluation.success = evaluation.accuracy() >= 0.8
|
| 266 |
+
return evaluation
|
| 267 |
+
|
| 268 |
def evaluate_medication_list(expected: Dict, actual_facts: Dict) -> CaseEvaluation:
    """Evaluate medication list response.

    Thin wrapper around evaluate_list_query comparing "count" and the
    "medication_names" lists.
    """
    return evaluate_list_query(expected, actual_facts, "medications", "medication_names")
| 274 |
+
def evaluate_condition_list(expected: Dict, actual_facts: Dict) -> CaseEvaluation:
|
| 275 |
+
"""Evaluate condition list response."""
|
| 276 |
+
return evaluate_list_query(expected, actual_facts, "conditions", "condition_names")
|
| 277 |
+
|
| 278 |
+
|
def evaluate_allergy_list(expected: Dict, actual_facts: Dict) -> CaseEvaluation:
    """Evaluate allergy list response.

    Thin wrapper around evaluate_list_query comparing "count" and the
    "substances" lists.
    """
    return evaluate_list_query(expected, actual_facts, "allergies", "substances")
def evaluate_immunization_list(expected: Dict, actual_facts: Dict) -> CaseEvaluation:
    """Evaluate immunization list response.

    Thin wrapper around evaluate_list_query comparing "count" and the
    "vaccine_names" lists.
    """
    return evaluate_list_query(expected, actual_facts, "immunizations", "vaccine_names")
def evaluate_procedure_list(expected: Dict, actual_facts: Dict) -> CaseEvaluation:
    """Evaluate procedure list response.

    Thin wrapper around evaluate_list_query comparing "count" and the
    "procedure_names" lists.
    """
    return evaluate_list_query(expected, actual_facts, "procedures", "procedure_names")
def evaluate_encounter_list(expected: Dict, actual_facts: Dict) -> CaseEvaluation:
    """Evaluate encounter list response.

    Encounter queries are capped, so the reported count is compared against
    min(expected count, limit) exactly. Success is determined solely by the
    count comparison.
    """
    evaluation = CaseEvaluation(
        case_id="",
        query_type="encounter_list",
        success=True
    )

    exp_count = expected.get("count", 0)
    act_count = actual_facts.get("count", 0)

    # For encounters, we check if count is within the limit
    limit = expected.get("limit", 5)

    count_comparison = compare_numeric("count", min(exp_count, limit), act_count, TOLERANCES["count"])
    evaluation.comparisons.append(count_comparison)
    evaluation.total_fields += 1
    if count_comparison.match:
        evaluation.correct_fields += 1
    # Bug fix: errors on the count comparison were previously not tallied,
    # leaving the hallucination/omission/mismatch counters at zero.
    elif count_comparison.error_type == "hallucination":
        evaluation.hallucinations += 1
    elif count_comparison.error_type == "omission":
        evaluation.omissions += 1
    else:
        evaluation.mismatches += 1

    evaluation.success = count_comparison.match
    return evaluation
| 318 |
+
def evaluate_lab_trend(expected: Dict, actual_facts: Dict) -> CaseEvaluation:
    """Score a lab-trend case by comparing summary statistics field by field."""
    result = CaseEvaluation(
        case_id="",
        query_type="lab_trend",
        success=True,
    )

    # No expected metrics means the ground truth found no observations;
    # the case cannot be scored and is marked failed.
    if "metrics" not in expected:
        result.success = False
        return result

    exp_metrics = expected["metrics"]
    act_metrics = actual_facts.get("metrics", {})

    for name in ["min", "max", "avg", "latest", "count"]:
        if name not in exp_metrics:
            continue

        # Counts use the integer tolerance; measured values use the
        # lab-value tolerance.
        tol = TOLERANCES["count" if name == "count" else "lab_value"]
        cmp = compare_numeric(name, exp_metrics[name], act_metrics.get(name), tol)

        result.comparisons.append(cmp)
        result.total_fields += 1
        if cmp.match:
            result.correct_fields += 1
        elif cmp.error_type == "hallucination":
            result.hallucinations += 1
        elif cmp.error_type == "omission":
            result.omissions += 1
        else:
            result.mismatches += 1

    # The case passes when at least 80% of the compared fields agree.
    result.success = result.accuracy() >= 0.8
    return result
def evaluate_case(test_case: Dict, expected: Dict, actual_facts: Dict) -> CaseEvaluation:
    """
    Evaluate a single test case.

    Args:
        test_case: The test case definition (needs "case_id" and "query_type").
        expected: Ground-truth values computed from the database.
        actual_facts: Facts reported by the agent.

    Returns:
        CaseEvaluation with detailed comparison results. An unrecognized
        query type yields a failed evaluation with no comparisons.
    """
    dispatch = {
        "vital_trend": evaluate_vital_trend,
        "medication_list": evaluate_medication_list,
        "condition_list": evaluate_condition_list,
        "allergy_list": evaluate_allergy_list,
        "immunization_list": evaluate_immunization_list,
        "procedure_list": evaluate_procedure_list,
        "encounter_list": evaluate_encounter_list,
        "lab_trend": evaluate_lab_trend,
    }

    query_type = test_case["query_type"]
    handler = dispatch.get(query_type)
    if handler is None:
        # Unknown query type: nothing to compare, report the case as failed.
        return CaseEvaluation(
            case_id=test_case["case_id"],
            query_type=query_type,
            success=False,
        )

    result = handler(expected, actual_facts)
    result.case_id = test_case["case_id"]
    return result
if __name__ == "__main__":
    # Smoke test: a hand-built vital-trend case where the reported "avg"
    # is slightly off from the expected value, exercising tolerance handling.
    expected = {
        "query_type": "vital_trend",
        "metrics": {
            "systolic": {
                "min": 128.0,
                "max": 142.0,
                "avg": 134.8,
                "count": 5,
                "earliest_date": "2026-01-22",
                "latest_date": "2026-01-27"
            }
        }
    }

    actual = {
        "metrics": {
            "systolic": {
                "min": 128.0,
                "max": 142.0,
                "avg": 135.0,  # Slightly off
                "count": 5,
                "earliest_date": "2026-01-22",
                "latest_date": "2026-01-27"
            }
        }
    }

    test_case = {"case_id": "test_1", "query_type": "vital_trend"}

    result = evaluate_case(test_case, expected, actual)

    # Print a per-field breakdown so the tolerance behavior is visible.
    print(f"Case: {result.case_id}")
    print(f"Success: {result.success}")
    print(f"Accuracy: {result.accuracy():.1%}")
    print(f"Fields: {result.correct_fields}/{result.total_fields}")
    print(f"Hallucinations: {result.hallucinations}")
    print(f"Omissions: {result.omissions}")
    print("\nComparisons:")
    for c in result.comparisons:
        status = "✓" if c.match else "✗"
        print(f"  {status} {c.field_name}: expected={c.expected}, actual={c.actual}")
evaluation/expected_values.py
ADDED
|
@@ -0,0 +1,386 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Expected Values Calculator
|
| 4 |
+
|
| 5 |
+
Computes ground truth values directly from the database for each test case type.
|
| 6 |
+
These are the values we expect the LLM to report.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import sqlite3
|
| 10 |
+
from datetime import datetime, timedelta
|
| 11 |
+
from typing import Dict, List, Any, Optional
|
| 12 |
+
import os
|
| 13 |
+
import statistics
|
| 14 |
+
|
| 15 |
+
DB_PATH = os.getenv("DB_PATH", "data/fhir.db")
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def get_db():
    """Open a SQLite connection to DB_PATH with dict-style row access."""
    connection = sqlite3.connect(DB_PATH)
    # Row factory lets callers read columns by name, e.g. row["code"].
    connection.row_factory = sqlite3.Row
    return connection
def compute_vital_trend_expected(patient_id: str, vital_type: str, codes: List[str],
                                 labels: List[str], days: int = 30) -> Dict[str, Any]:
    """
    Compute ground-truth statistics for a vital-sign trend query.

    For each (code, label) pair, summarizes all observations within the
    last `days` days: min, max, avg, count, latest value, and date range.
    """
    conn = get_db()
    try:
        cutoff = (datetime.now() - timedelta(days=days)).strftime("%Y-%m-%d")
        expected: Dict[str, Any] = {
            "query_type": "vital_trend",
            "vital_type": vital_type,
            "days": days,
            "metrics": {},
        }

        for code, label in zip(codes, labels):
            rows = conn.execute("""
                SELECT value_quantity, effective_date
                FROM observations
                WHERE patient_id = ? AND code = ? AND effective_date >= ?
                ORDER BY effective_date ASC
            """, (patient_id, code, cutoff)).fetchall()

            values = [row["value_quantity"] for row in rows if row["value_quantity"] is not None]
            # NULL dates cannot satisfy the >= predicate, so every row here
            # has a non-null effective_date.
            dates = [row["effective_date"][:10] for row in rows]

            if not values:
                continue

            expected["metrics"][label] = {
                "min": round(min(values), 1),
                "max": round(max(values), 1),
                "avg": round(statistics.mean(values), 1),
                "count": len(values),
                # Rows are in ascending date order, so the last value is newest.
                "latest": round(values[-1], 1),
                "earliest_date": dates[0] if dates else None,
                "latest_date": dates[-1] if dates else None,
                "all_values": [round(v, 1) for v in values],
                "all_dates": dates,
            }

        return expected
    finally:
        conn.close()
def compute_medication_expected(patient_id: str, status: Optional[str] = None) -> Dict[str, Any]:
    """
    Compute ground-truth values for medication queries.

    Args:
        patient_id: Patient whose medications are listed.
        status: Optional status filter (e.g. "active"); None returns all rows.

    Returns:
        Dict with the filter, the count, full medication rows, and the
        display-name list used by the evaluator.
    """
    conn = get_db()
    try:
        # Build a single query and append the status predicate only when
        # requested, instead of duplicating the whole SELECT statement.
        sql = """
            SELECT code, display, status, start_date
            FROM medications
            WHERE patient_id = ?
        """
        params: List[Any] = [patient_id]
        if status:
            sql += " AND status = ?"
            params.append(status)
        sql += " ORDER BY start_date DESC"

        medications = [
            {
                "code": row["code"],
                "display": row["display"],
                "status": row["status"],
                # Dates may be NULL; keep only the YYYY-MM-DD prefix otherwise.
                "start_date": row["start_date"][:10] if row["start_date"] else None,
            }
            for row in conn.execute(sql, params)
        ]

        return {
            "query_type": "medication_list",
            "status_filter": status,
            "count": len(medications),
            "medications": medications,
            "medication_names": [m["display"] for m in medications],
        }
    finally:
        conn.close()
def compute_condition_expected(patient_id: str) -> Dict[str, Any]:
    """Compute ground-truth values for condition-list queries."""
    conn = get_db()
    try:
        rows = conn.execute("""
            SELECT code, display, clinical_status, onset_date
            FROM conditions
            WHERE patient_id = ?
            ORDER BY onset_date DESC
        """, (patient_id,)).fetchall()

        conditions = [
            {
                "code": row["code"],
                "display": row["display"],
                "clinical_status": row["clinical_status"],
                # Onset may be NULL; otherwise keep the YYYY-MM-DD prefix.
                "onset_date": row["onset_date"][:10] if row["onset_date"] else None,
            }
            for row in rows
        ]

        return {
            "query_type": "condition_list",
            "count": len(conditions),
            "conditions": conditions,
            "condition_names": [c["display"] for c in conditions],
        }
    finally:
        conn.close()
def compute_allergy_expected(patient_id: str) -> Dict[str, Any]:
    """Compute ground-truth values for allergy-list queries."""
    conn = get_db()
    try:
        rows = conn.execute("""
            SELECT substance, reaction_display, criticality, category
            FROM allergies
            WHERE patient_id = ?
        """, (patient_id,)).fetchall()

        allergies = [
            {
                "substance": row["substance"],
                "reaction": row["reaction_display"],
                "criticality": row["criticality"],
                "category": row["category"],
            }
            for row in rows
        ]

        return {
            "query_type": "allergy_list",
            "count": len(allergies),
            "allergies": allergies,
            "substances": [a["substance"] for a in allergies],
        }
    finally:
        conn.close()
def compute_immunization_expected(patient_id: str) -> Dict[str, Any]:
    """Compute ground-truth values for immunization-list queries."""
    conn = get_db()
    try:
        rows = conn.execute("""
            SELECT vaccine_code, vaccine_display, status, occurrence_date
            FROM immunizations
            WHERE patient_id = ?
            ORDER BY occurrence_date DESC
        """, (patient_id,)).fetchall()

        immunizations = [
            {
                "vaccine_code": row["vaccine_code"],
                "vaccine_display": row["vaccine_display"],
                "status": row["status"],
                # Date may be NULL; otherwise keep the YYYY-MM-DD prefix.
                "occurrence_date": row["occurrence_date"][:10] if row["occurrence_date"] else None,
            }
            for row in rows
        ]

        return {
            "query_type": "immunization_list",
            "count": len(immunizations),
            "immunizations": immunizations,
            "vaccine_names": [i["vaccine_display"] for i in immunizations],
        }
    finally:
        conn.close()
def compute_procedure_expected(patient_id: str) -> Dict[str, Any]:
    """Compute ground-truth values for procedure-list queries."""
    conn = get_db()
    try:
        rows = conn.execute("""
            SELECT code, display, status, performed_date
            FROM procedures
            WHERE patient_id = ?
            ORDER BY performed_date DESC
        """, (patient_id,)).fetchall()

        procedures = [
            {
                "code": row["code"],
                "display": row["display"],
                "status": row["status"],
                # Date may be NULL; otherwise keep the YYYY-MM-DD prefix.
                "performed_date": row["performed_date"][:10] if row["performed_date"] else None,
            }
            for row in rows
        ]

        return {
            "query_type": "procedure_list",
            "count": len(procedures),
            "procedures": procedures,
            "procedure_names": [p["display"] for p in procedures],
        }
    finally:
        conn.close()
def compute_encounter_expected(patient_id: str, limit: int = 5) -> Dict[str, Any]:
    """Compute ground-truth values for encounter-list queries (most recent `limit` rows)."""
    conn = get_db()
    try:
        rows = conn.execute("""
            SELECT type_display, reason_display, period_start, period_end, class_display
            FROM encounters
            WHERE patient_id = ?
            ORDER BY period_start DESC
            LIMIT ?
        """, (patient_id, limit)).fetchall()

        encounters = [
            {
                "type": row["type_display"],
                "reason": row["reason_display"],
                "class": row["class_display"],
                # Period boundaries may be NULL; otherwise keep YYYY-MM-DD.
                "start_date": row["period_start"][:10] if row["period_start"] else None,
                "end_date": row["period_end"][:10] if row["period_end"] else None,
            }
            for row in rows
        ]

        return {
            "query_type": "encounter_list",
            "count": len(encounters),
            "limit": limit,
            "encounters": encounters,
        }
    finally:
        conn.close()
def compute_lab_trend_expected(patient_id: str, lab_type: str, code: str,
                               periods: int = 4) -> Dict[str, Any]:
    """
    Compute ground-truth values for lab-trend queries.

    Fetches the most recent `periods` observations for `code` (rows come
    back newest-first) and summarizes them.
    """
    conn = get_db()
    try:
        rows = conn.execute("""
            SELECT value_quantity, effective_date, unit
            FROM observations
            WHERE patient_id = ? AND code = ?
            ORDER BY effective_date DESC
            LIMIT ?
        """, (patient_id, code, periods)).fetchall()

        values = [r["value_quantity"] for r in rows if r["value_quantity"] is not None]
        # Unlike the vital-trend query there is no date predicate here, so
        # effective_date can be NULL; guard before slicing to avoid a TypeError.
        dates = [r["effective_date"][:10] for r in rows if r["effective_date"]]
        unit = rows[0]["unit"] if rows else None

        result = {
            "query_type": "lab_trend",
            "lab_type": lab_type,
            "code": code,
            "unit": unit,
            "count": len(values),
        }

        if values:
            result["metrics"] = {
                "min": round(min(values), 1),
                "max": round(max(values), 1),
                "avg": round(statistics.mean(values), 1),
                "latest": round(values[0], 1),  # rows are newest-first
                "latest_date": dates[0] if dates else None,
                "all_values": [round(v, 1) for v in values],
                "all_dates": dates,
            }

        return result
    finally:
        conn.close()
def compute_expected_values(test_case: Dict) -> Dict[str, Any]:
    """
    Compute expected values for any test case type.

    Dispatches on test_case["query_type"] to the matching computation
    function; an unknown type yields an {"error": ...} dict.
    """
    query_type = test_case["query_type"]
    patient_id = test_case["patient_id"]
    params = test_case.get("parameters", {})

    # The two trend types take several required parameters, so handle them
    # explicitly with early returns.
    if query_type == "vital_trend":
        return compute_vital_trend_expected(
            patient_id,
            params["vital_type"],
            params["codes"],
            params["labels"],
            params.get("days", 30),
        )
    if query_type == "lab_trend":
        return compute_lab_trend_expected(
            patient_id,
            params["lab_type"],
            params["code"],
            params.get("periods", 4),
        )

    # The remaining query types need at most one optional parameter, so a
    # dispatch table keeps the fan-out flat.
    simple = {
        "medication_list": lambda: compute_medication_expected(patient_id, params.get("status")),
        "condition_list": lambda: compute_condition_expected(patient_id),
        "allergy_list": lambda: compute_allergy_expected(patient_id),
        "immunization_list": lambda: compute_immunization_expected(patient_id),
        "procedure_list": lambda: compute_procedure_expected(patient_id),
        "encounter_list": lambda: compute_encounter_expected(patient_id, params.get("limit", 5)),
    }
    handler = simple.get(query_type)
    if handler is not None:
        return handler()

    return {"error": f"Unknown query type: {query_type}"}
if __name__ == "__main__":
    # Smoke test: generate cases for one patient and print the computed
    # ground truth for the first few.
    from test_generator import generate_all_test_cases
    import json

    print("Generating test cases...")
    cases = generate_all_test_cases(num_patients=1)

    print(f"\nComputing expected values for {len(cases)} test cases...")

    for case in cases[:3]:  # Show first 3
        print(f"\n{'='*60}")
        print(f"Case: {case['case_id']}")
        print(f"Query: {case['query']}")

        expected = compute_expected_values(case)
        print(f"Expected values:")
        print(json.dumps(expected, indent=2, default=str))
evaluation/facts_schema.py
ADDED
|
@@ -0,0 +1,232 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Facts Schema
|
| 4 |
+
|
| 5 |
+
Defines the structured facts format that the agent should return
|
| 6 |
+
alongside its text responses. These facts are used for evaluation.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
from typing import Dict, List, Any, Optional
|
| 10 |
+
from dataclasses import dataclass, asdict
|
| 11 |
+
import json
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
@dataclass
class VitalTrendFacts:
    """Structured facts for vital sign trend queries."""
    vital_type: str  # normalized identifier (lowercased, spaces -> underscores)
    days: int  # look-back window in days
    metrics: Dict[str, Dict[str, Any]]  # {label: {min, max, avg, count, dates...}}

    def to_dict(self) -> Dict:
        """Return a plain dict (via dataclasses.asdict) for JSON output."""
        return asdict(self)
@dataclass
class MedicationFacts:
    """Structured facts for medication queries."""
    status_filter: Optional[str]  # status the list was filtered by, or None for all
    count: int  # number of medications reported
    medication_names: List[str]  # display names, one per medication

    def to_dict(self) -> Dict:
        """Return a plain dict (via dataclasses.asdict) for JSON output."""
        return asdict(self)
@dataclass
class ConditionFacts:
    """Structured facts for condition queries."""
    count: int  # number of conditions reported
    condition_names: List[str]  # display names, one per condition

    def to_dict(self) -> Dict:
        """Return a plain dict (via dataclasses.asdict) for JSON output."""
        return asdict(self)
@dataclass
class AllergyFacts:
    """Structured facts for allergy queries."""
    count: int  # number of allergies reported
    substances: List[str]  # allergen substance names, one per allergy

    def to_dict(self) -> Dict:
        """Return a plain dict (via dataclasses.asdict) for JSON output."""
        return asdict(self)
@dataclass
class ImmunizationFacts:
    """Structured facts for immunization queries."""
    count: int  # number of immunizations reported
    vaccine_names: List[str]  # vaccine display names, one per immunization

    def to_dict(self) -> Dict:
        """Return a plain dict (via dataclasses.asdict) for JSON output."""
        return asdict(self)
@dataclass
class ProcedureFacts:
    """Structured facts for procedure queries."""
    count: int  # number of procedures reported
    procedure_names: List[str]  # procedure display names, one per procedure

    def to_dict(self) -> Dict:
        """Return a plain dict (via dataclasses.asdict) for JSON output."""
        return asdict(self)
@dataclass
class EncounterFacts:
    """Structured facts for encounter queries."""
    count: int  # number of encounters reported
    limit: int  # the query's row limit (count can never exceed this)

    def to_dict(self) -> Dict:
        """Return a plain dict (via dataclasses.asdict) for JSON output."""
        return asdict(self)
@dataclass
class LabTrendFacts:
    """Structured facts for lab trend queries."""
    lab_type: str  # lowercased lab label
    code: str  # lab code; may be empty when not recoverable from the tool result
    unit: Optional[str]  # measurement unit, if reported
    count: int  # number of observations summarized
    metrics: Dict[str, Any]  # {min, max, avg, latest, dates...}

    def to_dict(self) -> Dict:
        """Return a plain dict (via dataclasses.asdict) for JSON output."""
        return asdict(self)
def extract_vital_facts_from_tool_result(tool_result: Dict) -> Optional[VitalTrendFacts]:
    """
    Extract structured facts from a vital chart tool result.

    The tool already returns structured JSON; this reshapes each dataset
    into summary statistics. Returns None for error results or for chart
    types other than "line"/"line_dual".
    """
    # Hoisted out of the per-dataset loop: importing once per call is enough.
    import statistics

    if "error" in tool_result:
        return None

    chart_type = tool_result.get("chart_type", "")
    if chart_type not in ["line", "line_dual"]:
        return None

    metrics = {}

    for dataset in tool_result.get("datasets", []):
        label = dataset.get("label", "unknown").lower().replace(" ", "_")
        data_points = dataset.get("data", [])

        if not data_points:
            continue

        values = [p["value"] for p in data_points if p.get("value") is not None]
        dates = [p["date"] for p in data_points if p.get("date")]

        if values:
            metrics[label] = {
                "min": round(min(values), 1),
                "max": round(max(values), 1),
                "avg": round(statistics.mean(values), 1),
                "count": len(values),
                # values is non-empty here, so [-1] always exists (the old
                # "if values else None" guard was redundant).
                "latest": round(values[-1], 1),
                "earliest_date": dates[0] if dates else None,
                "latest_date": dates[-1] if dates else None,
            }

    return VitalTrendFacts(
        vital_type=tool_result.get("title", "").lower().replace(" ", "_"),
        days=30,  # Default, could be extracted from title
        metrics=metrics
    )
def extract_lab_facts_from_tool_result(tool_result: Dict) -> Optional[LabTrendFacts]:
    """Extract structured facts from a lab chart tool result (None on error/empty)."""
    if "error" in tool_result:
        return None

    datasets = tool_result.get("datasets", [])
    if not datasets:
        return None

    # Only the first dataset is summarized.
    points = datasets[0].get("data", [])
    if not points:
        return None

    values = [p["value"] for p in points if p.get("value") is not None]
    dates = [p["date"] for p in points if p.get("date")]

    metrics = {}
    if values:
        import statistics
        metrics = {
            "min": round(min(values), 1),
            "max": round(max(values), 1),
            "avg": round(statistics.mean(values), 1),
            # NOTE(review): assumes data points are oldest-first so the
            # last element is the most recent — confirm against the chart
            # tool's ordering.
            "latest": round(values[-1], 1),
            "latest_date": dates[-1] if dates else None,
        }

    return LabTrendFacts(
        lab_type=datasets[0].get("label", "unknown").lower(),
        code="",  # Not in tool result
        unit=tool_result.get("unit"),
        count=len(values),
        metrics=metrics,
    )
def extract_medication_facts(medications: List[Dict], status_filter: Optional[str] = None) -> MedicationFacts:
    """Build MedicationFacts (count plus display names) from raw medication rows."""
    return MedicationFacts(
        status_filter=status_filter,
        count=len(medications),
        medication_names=[med.get("display", "") for med in medications],
    )
def extract_condition_facts(conditions: List[Dict]) -> ConditionFacts:
    """Build ConditionFacts (count plus display names) from raw condition rows."""
    return ConditionFacts(
        count=len(conditions),
        condition_names=[cond.get("display", "") for cond in conditions],
    )
def extract_allergy_facts(allergies: List[Dict]) -> AllergyFacts:
    """Build AllergyFacts (count plus substance names) from raw allergy rows."""
    return AllergyFacts(
        count=len(allergies),
        substances=[entry.get("substance", "") for entry in allergies],
    )
def extract_immunization_facts(immunizations: List[Dict]) -> ImmunizationFacts:
    """Build ImmunizationFacts (count plus vaccine names) from raw immunization rows."""
    return ImmunizationFacts(
        count=len(immunizations),
        vaccine_names=[imm.get("vaccine_display", "") for imm in immunizations],
    )
def extract_procedure_facts(procedures: List[Dict]) -> ProcedureFacts:
    """Build ProcedureFacts (count plus display names) from raw procedure rows."""
    return ProcedureFacts(
        count=len(procedures),
        procedure_names=[proc.get("display", "") for proc in procedures],
    )
def extract_encounter_facts(encounters: List[Dict], limit: int = 5) -> EncounterFacts:
    """Build EncounterFacts from raw encounter rows; only the count and limit are kept."""
    return EncounterFacts(count=len(encounters), limit=limit)
evaluation/metrics.py
ADDED
|
@@ -0,0 +1,265 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Metrics Calculator
|
| 4 |
+
|
| 5 |
+
Aggregates evaluation results across multiple test cases
|
| 6 |
+
and computes summary statistics.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
from typing import Dict, List, Any
|
| 10 |
+
from dataclasses import dataclass, field
|
| 11 |
+
from datetime import datetime
|
| 12 |
+
import json
|
| 13 |
+
|
| 14 |
+
from .evaluator import CaseEvaluation
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
@dataclass
class EvaluationMetrics:
    """Aggregated evaluation metrics rolled up across all test cases."""

    # Overall case counts
    total_cases: int = 0
    successful_cases: int = 0
    failed_cases: int = 0

    # Field-level tallies across every case
    total_fields: int = 0
    correct_fields: int = 0
    total_hallucinations: int = 0
    total_omissions: int = 0
    total_mismatches: int = 0

    # Breakdown keyed by query type (e.g. "vital_trend", "medication_list")
    by_query_type: Dict[str, Dict[str, Any]] = field(default_factory=dict)

    # One summary row per evaluated case
    case_results: List[Dict] = field(default_factory=list)

    @staticmethod
    def _ratio(numerator: int, denominator: int) -> float:
        """Safe division helper: returns 0.0 when the denominator is zero."""
        return numerator / denominator if denominator else 0.0

    def success_rate(self) -> float:
        """Fraction of test cases that succeeded (0.0 when no cases)."""
        return self._ratio(self.successful_cases, self.total_cases)

    def field_accuracy(self) -> float:
        """Fraction of checked fields that were correct (0.0 when no fields)."""
        return self._ratio(self.correct_fields, self.total_fields)

    def hallucination_rate(self) -> float:
        """Hallucinated fields as a fraction of all checked fields."""
        return self._ratio(self.total_hallucinations, self.total_fields)

    def omission_rate(self) -> float:
        """Omitted fields as a fraction of all checked fields."""
        return self._ratio(self.total_omissions, self.total_fields)

    def to_dict(self) -> Dict:
        """Serialize the metrics into a JSON-friendly dictionary."""
        summary = {
            "total_cases": self.total_cases,
            "successful_cases": self.successful_cases,
            "failed_cases": self.failed_cases,
            "success_rate": f"{self.success_rate():.1%}",
            "field_accuracy": f"{self.field_accuracy():.1%}",
            "hallucination_rate": f"{self.hallucination_rate():.1%}",
            "omission_rate": f"{self.omission_rate():.1%}",
        }
        field_level = {
            "total_fields": self.total_fields,
            "correct_fields": self.correct_fields,
            "hallucinations": self.total_hallucinations,
            "omissions": self.total_omissions,
            "mismatches": self.total_mismatches,
        }
        return {
            "summary": summary,
            "field_level": field_level,
            "by_query_type": self.by_query_type,
            "case_results": self.case_results,
        }
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def aggregate_metrics(evaluations: List[CaseEvaluation]) -> EvaluationMetrics:
    """
    Aggregate metrics from multiple case evaluations.
    """
    rollup = EvaluationMetrics()

    for case in evaluations:
        rollup.total_cases += 1
        if case.success:
            rollup.successful_cases += 1
        else:
            rollup.failed_cases += 1

        # Roll field-level counts into the overall totals
        rollup.total_fields += case.total_fields
        rollup.correct_fields += case.correct_fields
        rollup.total_hallucinations += case.hallucinations
        rollup.total_omissions += case.omissions
        rollup.total_mismatches += case.mismatches

        # Per-query-type bucket, created lazily the first time a type is seen
        bucket = rollup.by_query_type.setdefault(case.query_type, {
            "total": 0,
            "successful": 0,
            "failed": 0,
            "total_fields": 0,
            "correct_fields": 0,
            "hallucinations": 0,
            "omissions": 0,
        })
        bucket["total"] += 1
        bucket["successful" if case.success else "failed"] += 1
        bucket["total_fields"] += case.total_fields
        bucket["correct_fields"] += case.correct_fields
        bucket["hallucinations"] += case.hallucinations
        bucket["omissions"] += case.omissions

        # Record a per-case summary row for the detailed report
        rollup.case_results.append({
            "case_id": case.case_id,
            "query_type": case.query_type,
            "success": case.success,
            "accuracy": case.accuracy(),
            "fields": f"{case.correct_fields}/{case.total_fields}",
            "hallucinations": case.hallucinations,
            "omissions": case.omissions,
        })

    # Derive per-type percentage strings once all counts are in
    for bucket in rollup.by_query_type.values():
        if bucket["total"] > 0:
            bucket["success_rate"] = f"{bucket['successful'] / bucket['total']:.1%}"
        if bucket["total_fields"] > 0:
            bucket["field_accuracy"] = f"{bucket['correct_fields'] / bucket['total_fields']:.1%}"

    return rollup
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
def format_report(metrics: EvaluationMetrics) -> str:
    """Render the aggregated metrics as a human-readable text report."""
    out = []
    add = out.append

    # Header banner
    add("=" * 60)
    add("PRE-VISIT SUMMARY EVALUATION REPORT")
    add(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    add("=" * 60)

    # Overall results section
    out.extend([
        "",
        "OVERALL RESULTS",
        "-" * 40,
        f"Total Test Cases: {metrics.total_cases}",
        f"Successful: {metrics.successful_cases}",
        f"Failed: {metrics.failed_cases}",
        f"Success Rate: {metrics.success_rate():.1%}",
        "",
        f"Total Fields Checked: {metrics.total_fields}",
        f"Correct Fields: {metrics.correct_fields}",
        f"Field Accuracy: {metrics.field_accuracy():.1%}",
        "",
        f"Hallucinations: {metrics.total_hallucinations} ({metrics.hallucination_rate():.1%})",
        f"Omissions: {metrics.total_omissions} ({metrics.omission_rate():.1%})",
        f"Mismatches: {metrics.total_mismatches}",
        "",
        "BY QUERY TYPE",
        "-" * 40,
    ])

    # Per-type table, most frequent query types first
    ranked = sorted(metrics.by_query_type.items(),
                    key=lambda item: item[1]["total"], reverse=True)

    add(f"{'Query Type':<25} {'Success':<12} {'Accuracy':<12} {'Hall.':<8}")
    add("-" * 60)
    for qtype, stats in ranked:
        rate = stats.get("success_rate", "N/A")
        acc = stats.get("field_accuracy", "N/A")
        add(f"{qtype:<25} {rate:<12} {acc:<12} {stats['hallucinations']:<8}")

    # Detail for failed cases (capped at the first 10)
    failures = [row for row in metrics.case_results if not row["success"]]
    if failures:
        add("")
        add("FAILED CASES")
        add("-" * 40)
        for row in failures[:10]:
            add(f"  {row['case_id']}")
            add(f"    Type: {row['query_type']}, Accuracy: {row['accuracy']:.1%}")
            add(f"    Hallucinations: {row['hallucinations']}, Omissions: {row['omissions']}")

    add("")
    add("=" * 60)

    return "\n".join(out)
|
| 204 |
+
|
| 205 |
+
|
| 206 |
+
def save_report(metrics: EvaluationMetrics, output_dir: str = "."):
    """Write the evaluation report to timestamped text and JSON files.

    Args:
        metrics: Aggregated evaluation metrics to persist.
        output_dir: Directory for the report files (created if missing).

    Returns:
        Tuple of (text_path, json_path) for the two files written.
    """
    import os

    os.makedirs(output_dir, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Save text report. Explicit UTF-8 avoids platform-dependent default
    # encodings (e.g. cp1252 on Windows) mangling non-ASCII report content.
    text_path = os.path.join(output_dir, f"eval_report_{timestamp}.txt")
    with open(text_path, "w", encoding="utf-8") as f:
        f.write(format_report(metrics))

    # Save JSON report; default=str stringifies any non-serializable values.
    json_path = os.path.join(output_dir, f"eval_report_{timestamp}.json")
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(metrics.to_dict(), f, indent=2, default=str)

    return text_path, json_path
|
| 224 |
+
|
| 225 |
+
|
| 226 |
+
if __name__ == "__main__":
    # Demo: aggregate a few hand-built evaluations and print the report.
    # NOTE: CaseEvaluation is already imported at module top via the relative
    # `from .evaluator import CaseEvaluation`; the previous absolute
    # `from evaluator import ...` re-import here failed under
    # `python -m evaluation.metrics` and also pulled in the unused
    # ComparisonResult, so it has been removed.

    # Create some sample evaluations
    evaluations = [
        CaseEvaluation(
            case_id="patient1_vital_bp",
            query_type="vital_trend",
            success=True,
            total_fields=10,
            correct_fields=9,
            hallucinations=0,
            omissions=1,
            mismatches=0
        ),
        CaseEvaluation(
            case_id="patient1_meds",
            query_type="medication_list",
            success=True,
            total_fields=5,
            correct_fields=5,
            hallucinations=0,
            omissions=0,
            mismatches=0
        ),
        CaseEvaluation(
            case_id="patient1_conditions",
            query_type="condition_list",
            success=False,
            total_fields=8,
            correct_fields=5,
            hallucinations=2,
            omissions=1,
            mismatches=0
        ),
    ]

    metrics = aggregate_metrics(evaluations)
    print(format_report(metrics))
|
evaluation/reports/eval_report_20260127_174121.json
ADDED
|
@@ -0,0 +1,380 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"summary": {
|
| 3 |
+
"total_cases": 30,
|
| 4 |
+
"successful_cases": 30,
|
| 5 |
+
"failed_cases": 0,
|
| 6 |
+
"success_rate": "100.0%",
|
| 7 |
+
"field_accuracy": "100.0%",
|
| 8 |
+
"hallucination_rate": "0.0%",
|
| 9 |
+
"omission_rate": "0.0%"
|
| 10 |
+
},
|
| 11 |
+
"field_level": {
|
| 12 |
+
"total_fields": 128,
|
| 13 |
+
"correct_fields": 128,
|
| 14 |
+
"hallucinations": 0,
|
| 15 |
+
"omissions": 0,
|
| 16 |
+
"mismatches": 0
|
| 17 |
+
},
|
| 18 |
+
"by_query_type": {
|
| 19 |
+
"vital_trend": {
|
| 20 |
+
"total": 6,
|
| 21 |
+
"successful": 6,
|
| 22 |
+
"failed": 0,
|
| 23 |
+
"total_fields": 63,
|
| 24 |
+
"correct_fields": 63,
|
| 25 |
+
"hallucinations": 0,
|
| 26 |
+
"omissions": 0,
|
| 27 |
+
"success_rate": "100.0%",
|
| 28 |
+
"field_accuracy": "100.0%"
|
| 29 |
+
},
|
| 30 |
+
"medication_list": {
|
| 31 |
+
"total": 6,
|
| 32 |
+
"successful": 6,
|
| 33 |
+
"failed": 0,
|
| 34 |
+
"total_fields": 18,
|
| 35 |
+
"correct_fields": 18,
|
| 36 |
+
"hallucinations": 0,
|
| 37 |
+
"omissions": 0,
|
| 38 |
+
"success_rate": "100.0%",
|
| 39 |
+
"field_accuracy": "100.0%"
|
| 40 |
+
},
|
| 41 |
+
"condition_list": {
|
| 42 |
+
"total": 3,
|
| 43 |
+
"successful": 3,
|
| 44 |
+
"failed": 0,
|
| 45 |
+
"total_fields": 9,
|
| 46 |
+
"correct_fields": 9,
|
| 47 |
+
"hallucinations": 0,
|
| 48 |
+
"omissions": 0,
|
| 49 |
+
"success_rate": "100.0%",
|
| 50 |
+
"field_accuracy": "100.0%"
|
| 51 |
+
},
|
| 52 |
+
"allergy_list": {
|
| 53 |
+
"total": 3,
|
| 54 |
+
"successful": 3,
|
| 55 |
+
"failed": 0,
|
| 56 |
+
"total_fields": 7,
|
| 57 |
+
"correct_fields": 7,
|
| 58 |
+
"hallucinations": 0,
|
| 59 |
+
"omissions": 0,
|
| 60 |
+
"success_rate": "100.0%",
|
| 61 |
+
"field_accuracy": "100.0%"
|
| 62 |
+
},
|
| 63 |
+
"immunization_list": {
|
| 64 |
+
"total": 3,
|
| 65 |
+
"successful": 3,
|
| 66 |
+
"failed": 0,
|
| 67 |
+
"total_fields": 9,
|
| 68 |
+
"correct_fields": 9,
|
| 69 |
+
"hallucinations": 0,
|
| 70 |
+
"omissions": 0,
|
| 71 |
+
"success_rate": "100.0%",
|
| 72 |
+
"field_accuracy": "100.0%"
|
| 73 |
+
},
|
| 74 |
+
"procedure_list": {
|
| 75 |
+
"total": 3,
|
| 76 |
+
"successful": 3,
|
| 77 |
+
"failed": 0,
|
| 78 |
+
"total_fields": 7,
|
| 79 |
+
"correct_fields": 7,
|
| 80 |
+
"hallucinations": 0,
|
| 81 |
+
"omissions": 0,
|
| 82 |
+
"success_rate": "100.0%",
|
| 83 |
+
"field_accuracy": "100.0%"
|
| 84 |
+
},
|
| 85 |
+
"encounter_list": {
|
| 86 |
+
"total": 3,
|
| 87 |
+
"successful": 3,
|
| 88 |
+
"failed": 0,
|
| 89 |
+
"total_fields": 3,
|
| 90 |
+
"correct_fields": 3,
|
| 91 |
+
"hallucinations": 0,
|
| 92 |
+
"omissions": 0,
|
| 93 |
+
"success_rate": "100.0%",
|
| 94 |
+
"field_accuracy": "100.0%"
|
| 95 |
+
},
|
| 96 |
+
"lab_trend": {
|
| 97 |
+
"total": 3,
|
| 98 |
+
"successful": 3,
|
| 99 |
+
"failed": 0,
|
| 100 |
+
"total_fields": 12,
|
| 101 |
+
"correct_fields": 12,
|
| 102 |
+
"hallucinations": 0,
|
| 103 |
+
"omissions": 0,
|
| 104 |
+
"success_rate": "100.0%",
|
| 105 |
+
"field_accuracy": "100.0%"
|
| 106 |
+
}
|
| 107 |
+
},
|
| 108 |
+
"case_results": [
|
| 109 |
+
{
|
| 110 |
+
"case_id": "patient-001_vital_blood_pressure",
|
| 111 |
+
"query_type": "vital_trend",
|
| 112 |
+
"success": true,
|
| 113 |
+
"accuracy": 1.0,
|
| 114 |
+
"fields": "14/14",
|
| 115 |
+
"hallucinations": 0,
|
| 116 |
+
"omissions": 0
|
| 117 |
+
},
|
| 118 |
+
{
|
| 119 |
+
"case_id": "patient-001_vital_heart_rate",
|
| 120 |
+
"query_type": "vital_trend",
|
| 121 |
+
"success": true,
|
| 122 |
+
"accuracy": 1.0,
|
| 123 |
+
"fields": "7/7",
|
| 124 |
+
"hallucinations": 0,
|
| 125 |
+
"omissions": 0
|
| 126 |
+
},
|
| 127 |
+
{
|
| 128 |
+
"case_id": "patient-001_meds_all",
|
| 129 |
+
"query_type": "medication_list",
|
| 130 |
+
"success": true,
|
| 131 |
+
"accuracy": 1.0,
|
| 132 |
+
"fields": "3/3",
|
| 133 |
+
"hallucinations": 0,
|
| 134 |
+
"omissions": 0
|
| 135 |
+
},
|
| 136 |
+
{
|
| 137 |
+
"case_id": "patient-001_meds_active",
|
| 138 |
+
"query_type": "medication_list",
|
| 139 |
+
"success": true,
|
| 140 |
+
"accuracy": 1.0,
|
| 141 |
+
"fields": "3/3",
|
| 142 |
+
"hallucinations": 0,
|
| 143 |
+
"omissions": 0
|
| 144 |
+
},
|
| 145 |
+
{
|
| 146 |
+
"case_id": "patient-001_conditions",
|
| 147 |
+
"query_type": "condition_list",
|
| 148 |
+
"success": true,
|
| 149 |
+
"accuracy": 1.0,
|
| 150 |
+
"fields": "3/3",
|
| 151 |
+
"hallucinations": 0,
|
| 152 |
+
"omissions": 0
|
| 153 |
+
},
|
| 154 |
+
{
|
| 155 |
+
"case_id": "patient-001_allergies",
|
| 156 |
+
"query_type": "allergy_list",
|
| 157 |
+
"success": true,
|
| 158 |
+
"accuracy": 1.0,
|
| 159 |
+
"fields": "2/2",
|
| 160 |
+
"hallucinations": 0,
|
| 161 |
+
"omissions": 0
|
| 162 |
+
},
|
| 163 |
+
{
|
| 164 |
+
"case_id": "patient-001_immunizations",
|
| 165 |
+
"query_type": "immunization_list",
|
| 166 |
+
"success": true,
|
| 167 |
+
"accuracy": 1.0,
|
| 168 |
+
"fields": "3/3",
|
| 169 |
+
"hallucinations": 0,
|
| 170 |
+
"omissions": 0
|
| 171 |
+
},
|
| 172 |
+
{
|
| 173 |
+
"case_id": "patient-001_procedures",
|
| 174 |
+
"query_type": "procedure_list",
|
| 175 |
+
"success": true,
|
| 176 |
+
"accuracy": 1.0,
|
| 177 |
+
"fields": "2/2",
|
| 178 |
+
"hallucinations": 0,
|
| 179 |
+
"omissions": 0
|
| 180 |
+
},
|
| 181 |
+
{
|
| 182 |
+
"case_id": "patient-001_encounters",
|
| 183 |
+
"query_type": "encounter_list",
|
| 184 |
+
"success": true,
|
| 185 |
+
"accuracy": 1.0,
|
| 186 |
+
"fields": "1/1",
|
| 187 |
+
"hallucinations": 0,
|
| 188 |
+
"omissions": 0
|
| 189 |
+
},
|
| 190 |
+
{
|
| 191 |
+
"case_id": "patient-001_lab_a1c",
|
| 192 |
+
"query_type": "lab_trend",
|
| 193 |
+
"success": true,
|
| 194 |
+
"accuracy": 1.0,
|
| 195 |
+
"fields": "4/4",
|
| 196 |
+
"hallucinations": 0,
|
| 197 |
+
"omissions": 0
|
| 198 |
+
},
|
| 199 |
+
{
|
| 200 |
+
"case_id": "patient-002_vital_blood_pressure",
|
| 201 |
+
"query_type": "vital_trend",
|
| 202 |
+
"success": true,
|
| 203 |
+
"accuracy": 1.0,
|
| 204 |
+
"fields": "14/14",
|
| 205 |
+
"hallucinations": 0,
|
| 206 |
+
"omissions": 0
|
| 207 |
+
},
|
| 208 |
+
{
|
| 209 |
+
"case_id": "patient-002_vital_heart_rate",
|
| 210 |
+
"query_type": "vital_trend",
|
| 211 |
+
"success": true,
|
| 212 |
+
"accuracy": 1.0,
|
| 213 |
+
"fields": "7/7",
|
| 214 |
+
"hallucinations": 0,
|
| 215 |
+
"omissions": 0
|
| 216 |
+
},
|
| 217 |
+
{
|
| 218 |
+
"case_id": "patient-002_meds_all",
|
| 219 |
+
"query_type": "medication_list",
|
| 220 |
+
"success": true,
|
| 221 |
+
"accuracy": 1.0,
|
| 222 |
+
"fields": "3/3",
|
| 223 |
+
"hallucinations": 0,
|
| 224 |
+
"omissions": 0
|
| 225 |
+
},
|
| 226 |
+
{
|
| 227 |
+
"case_id": "patient-002_meds_active",
|
| 228 |
+
"query_type": "medication_list",
|
| 229 |
+
"success": true,
|
| 230 |
+
"accuracy": 1.0,
|
| 231 |
+
"fields": "3/3",
|
| 232 |
+
"hallucinations": 0,
|
| 233 |
+
"omissions": 0
|
| 234 |
+
},
|
| 235 |
+
{
|
| 236 |
+
"case_id": "patient-002_conditions",
|
| 237 |
+
"query_type": "condition_list",
|
| 238 |
+
"success": true,
|
| 239 |
+
"accuracy": 1.0,
|
| 240 |
+
"fields": "3/3",
|
| 241 |
+
"hallucinations": 0,
|
| 242 |
+
"omissions": 0
|
| 243 |
+
},
|
| 244 |
+
{
|
| 245 |
+
"case_id": "patient-002_allergies",
|
| 246 |
+
"query_type": "allergy_list",
|
| 247 |
+
"success": true,
|
| 248 |
+
"accuracy": 1.0,
|
| 249 |
+
"fields": "3/3",
|
| 250 |
+
"hallucinations": 0,
|
| 251 |
+
"omissions": 0
|
| 252 |
+
},
|
| 253 |
+
{
|
| 254 |
+
"case_id": "patient-002_immunizations",
|
| 255 |
+
"query_type": "immunization_list",
|
| 256 |
+
"success": true,
|
| 257 |
+
"accuracy": 1.0,
|
| 258 |
+
"fields": "3/3",
|
| 259 |
+
"hallucinations": 0,
|
| 260 |
+
"omissions": 0
|
| 261 |
+
},
|
| 262 |
+
{
|
| 263 |
+
"case_id": "patient-002_procedures",
|
| 264 |
+
"query_type": "procedure_list",
|
| 265 |
+
"success": true,
|
| 266 |
+
"accuracy": 1.0,
|
| 267 |
+
"fields": "2/2",
|
| 268 |
+
"hallucinations": 0,
|
| 269 |
+
"omissions": 0
|
| 270 |
+
},
|
| 271 |
+
{
|
| 272 |
+
"case_id": "patient-002_encounters",
|
| 273 |
+
"query_type": "encounter_list",
|
| 274 |
+
"success": true,
|
| 275 |
+
"accuracy": 1.0,
|
| 276 |
+
"fields": "1/1",
|
| 277 |
+
"hallucinations": 0,
|
| 278 |
+
"omissions": 0
|
| 279 |
+
},
|
| 280 |
+
{
|
| 281 |
+
"case_id": "patient-002_lab_a1c",
|
| 282 |
+
"query_type": "lab_trend",
|
| 283 |
+
"success": true,
|
| 284 |
+
"accuracy": 1.0,
|
| 285 |
+
"fields": "4/4",
|
| 286 |
+
"hallucinations": 0,
|
| 287 |
+
"omissions": 0
|
| 288 |
+
},
|
| 289 |
+
{
|
| 290 |
+
"case_id": "patient-003_vital_blood_pressure",
|
| 291 |
+
"query_type": "vital_trend",
|
| 292 |
+
"success": true,
|
| 293 |
+
"accuracy": 1.0,
|
| 294 |
+
"fields": "14/14",
|
| 295 |
+
"hallucinations": 0,
|
| 296 |
+
"omissions": 0
|
| 297 |
+
},
|
| 298 |
+
{
|
| 299 |
+
"case_id": "patient-003_vital_heart_rate",
|
| 300 |
+
"query_type": "vital_trend",
|
| 301 |
+
"success": true,
|
| 302 |
+
"accuracy": 1.0,
|
| 303 |
+
"fields": "7/7",
|
| 304 |
+
"hallucinations": 0,
|
| 305 |
+
"omissions": 0
|
| 306 |
+
},
|
| 307 |
+
{
|
| 308 |
+
"case_id": "patient-003_meds_all",
|
| 309 |
+
"query_type": "medication_list",
|
| 310 |
+
"success": true,
|
| 311 |
+
"accuracy": 1.0,
|
| 312 |
+
"fields": "3/3",
|
| 313 |
+
"hallucinations": 0,
|
| 314 |
+
"omissions": 0
|
| 315 |
+
},
|
| 316 |
+
{
|
| 317 |
+
"case_id": "patient-003_meds_active",
|
| 318 |
+
"query_type": "medication_list",
|
| 319 |
+
"success": true,
|
| 320 |
+
"accuracy": 1.0,
|
| 321 |
+
"fields": "3/3",
|
| 322 |
+
"hallucinations": 0,
|
| 323 |
+
"omissions": 0
|
| 324 |
+
},
|
| 325 |
+
{
|
| 326 |
+
"case_id": "patient-003_conditions",
|
| 327 |
+
"query_type": "condition_list",
|
| 328 |
+
"success": true,
|
| 329 |
+
"accuracy": 1.0,
|
| 330 |
+
"fields": "3/3",
|
| 331 |
+
"hallucinations": 0,
|
| 332 |
+
"omissions": 0
|
| 333 |
+
},
|
| 334 |
+
{
|
| 335 |
+
"case_id": "patient-003_allergies",
|
| 336 |
+
"query_type": "allergy_list",
|
| 337 |
+
"success": true,
|
| 338 |
+
"accuracy": 1.0,
|
| 339 |
+
"fields": "2/2",
|
| 340 |
+
"hallucinations": 0,
|
| 341 |
+
"omissions": 0
|
| 342 |
+
},
|
| 343 |
+
{
|
| 344 |
+
"case_id": "patient-003_immunizations",
|
| 345 |
+
"query_type": "immunization_list",
|
| 346 |
+
"success": true,
|
| 347 |
+
"accuracy": 1.0,
|
| 348 |
+
"fields": "3/3",
|
| 349 |
+
"hallucinations": 0,
|
| 350 |
+
"omissions": 0
|
| 351 |
+
},
|
| 352 |
+
{
|
| 353 |
+
"case_id": "patient-003_procedures",
|
| 354 |
+
"query_type": "procedure_list",
|
| 355 |
+
"success": true,
|
| 356 |
+
"accuracy": 1.0,
|
| 357 |
+
"fields": "3/3",
|
| 358 |
+
"hallucinations": 0,
|
| 359 |
+
"omissions": 0
|
| 360 |
+
},
|
| 361 |
+
{
|
| 362 |
+
"case_id": "patient-003_encounters",
|
| 363 |
+
"query_type": "encounter_list",
|
| 364 |
+
"success": true,
|
| 365 |
+
"accuracy": 1.0,
|
| 366 |
+
"fields": "1/1",
|
| 367 |
+
"hallucinations": 0,
|
| 368 |
+
"omissions": 0
|
| 369 |
+
},
|
| 370 |
+
{
|
| 371 |
+
"case_id": "patient-003_lab_a1c",
|
| 372 |
+
"query_type": "lab_trend",
|
| 373 |
+
"success": true,
|
| 374 |
+
"accuracy": 1.0,
|
| 375 |
+
"fields": "4/4",
|
| 376 |
+
"hallucinations": 0,
|
| 377 |
+
"omissions": 0
|
| 378 |
+
}
|
| 379 |
+
]
|
| 380 |
+
}
|
evaluation/reports/eval_report_20260127_174121.txt
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
============================================================
|
| 2 |
+
PRE-VISIT SUMMARY EVALUATION REPORT
|
| 3 |
+
Generated: 2026-01-27 17:41:21
|
| 4 |
+
============================================================
|
| 5 |
+
|
| 6 |
+
OVERALL RESULTS
|
| 7 |
+
----------------------------------------
|
| 8 |
+
Total Test Cases: 30
|
| 9 |
+
Successful: 30
|
| 10 |
+
Failed: 0
|
| 11 |
+
Success Rate: 100.0%
|
| 12 |
+
|
| 13 |
+
Total Fields Checked: 128
|
| 14 |
+
Correct Fields: 128
|
| 15 |
+
Field Accuracy: 100.0%
|
| 16 |
+
|
| 17 |
+
Hallucinations: 0 (0.0%)
|
| 18 |
+
Omissions: 0 (0.0%)
|
| 19 |
+
Mismatches: 0
|
| 20 |
+
|
| 21 |
+
BY QUERY TYPE
|
| 22 |
+
----------------------------------------
|
| 23 |
+
Query Type Success Accuracy Hall.
|
| 24 |
+
------------------------------------------------------------
|
| 25 |
+
vital_trend 100.0% 100.0% 0
|
| 26 |
+
medication_list 100.0% 100.0% 0
|
| 27 |
+
condition_list 100.0% 100.0% 0
|
| 28 |
+
allergy_list 100.0% 100.0% 0
|
| 29 |
+
immunization_list 100.0% 100.0% 0
|
| 30 |
+
procedure_list 100.0% 100.0% 0
|
| 31 |
+
encounter_list 100.0% 100.0% 0
|
| 32 |
+
lab_trend 100.0% 100.0% 0
|
| 33 |
+
|
| 34 |
+
============================================================
|
evaluation/reports/eval_report_20260127_174147.json
ADDED
|
@@ -0,0 +1,380 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"summary": {
|
| 3 |
+
"total_cases": 30,
|
| 4 |
+
"successful_cases": 20,
|
| 5 |
+
"failed_cases": 10,
|
| 6 |
+
"success_rate": "66.7%",
|
| 7 |
+
"field_accuracy": "81.1%",
|
| 8 |
+
"hallucination_rate": "5.3%",
|
| 9 |
+
"omission_rate": "5.3%"
|
| 10 |
+
},
|
| 11 |
+
"field_level": {
|
| 12 |
+
"total_fields": 132,
|
| 13 |
+
"correct_fields": 107,
|
| 14 |
+
"hallucinations": 7,
|
| 15 |
+
"omissions": 7,
|
| 16 |
+
"mismatches": 3
|
| 17 |
+
},
|
| 18 |
+
"by_query_type": {
|
| 19 |
+
"vital_trend": {
|
| 20 |
+
"total": 6,
|
| 21 |
+
"successful": 5,
|
| 22 |
+
"failed": 1,
|
| 23 |
+
"total_fields": 63,
|
| 24 |
+
"correct_fields": 56,
|
| 25 |
+
"hallucinations": 3,
|
| 26 |
+
"omissions": 1,
|
| 27 |
+
"success_rate": "83.3%",
|
| 28 |
+
"field_accuracy": "88.9%"
|
| 29 |
+
},
|
| 30 |
+
"medication_list": {
|
| 31 |
+
"total": 6,
|
| 32 |
+
"successful": 4,
|
| 33 |
+
"failed": 2,
|
| 34 |
+
"total_fields": 19,
|
| 35 |
+
"correct_fields": 15,
|
| 36 |
+
"hallucinations": 1,
|
| 37 |
+
"omissions": 1,
|
| 38 |
+
"success_rate": "66.7%",
|
| 39 |
+
"field_accuracy": "78.9%"
|
| 40 |
+
},
|
| 41 |
+
"condition_list": {
|
| 42 |
+
"total": 3,
|
| 43 |
+
"successful": 1,
|
| 44 |
+
"failed": 2,
|
| 45 |
+
"total_fields": 9,
|
| 46 |
+
"correct_fields": 5,
|
| 47 |
+
"hallucinations": 0,
|
| 48 |
+
"omissions": 2,
|
| 49 |
+
"success_rate": "33.3%",
|
| 50 |
+
"field_accuracy": "55.6%"
|
| 51 |
+
},
|
| 52 |
+
"allergy_list": {
|
| 53 |
+
"total": 3,
|
| 54 |
+
"successful": 1,
|
| 55 |
+
"failed": 2,
|
| 56 |
+
"total_fields": 8,
|
| 57 |
+
"correct_fields": 4,
|
| 58 |
+
"hallucinations": 1,
|
| 59 |
+
"omissions": 2,
|
| 60 |
+
"success_rate": "33.3%",
|
| 61 |
+
"field_accuracy": "50.0%"
|
| 62 |
+
},
|
| 63 |
+
"immunization_list": {
|
| 64 |
+
"total": 3,
|
| 65 |
+
"successful": 1,
|
| 66 |
+
"failed": 2,
|
| 67 |
+
"total_fields": 10,
|
| 68 |
+
"correct_fields": 6,
|
| 69 |
+
"hallucinations": 1,
|
| 70 |
+
"omissions": 1,
|
| 71 |
+
"success_rate": "33.3%",
|
| 72 |
+
"field_accuracy": "60.0%"
|
| 73 |
+
},
|
| 74 |
+
"procedure_list": {
|
| 75 |
+
"total": 3,
|
| 76 |
+
"successful": 2,
|
| 77 |
+
"failed": 1,
|
| 78 |
+
"total_fields": 8,
|
| 79 |
+
"correct_fields": 6,
|
| 80 |
+
"hallucinations": 1,
|
| 81 |
+
"omissions": 0,
|
| 82 |
+
"success_rate": "66.7%",
|
| 83 |
+
"field_accuracy": "75.0%"
|
| 84 |
+
},
|
| 85 |
+
"encounter_list": {
|
| 86 |
+
"total": 3,
|
| 87 |
+
"successful": 3,
|
| 88 |
+
"failed": 0,
|
| 89 |
+
"total_fields": 3,
|
| 90 |
+
"correct_fields": 3,
|
| 91 |
+
"hallucinations": 0,
|
| 92 |
+
"omissions": 0,
|
| 93 |
+
"success_rate": "100.0%",
|
| 94 |
+
"field_accuracy": "100.0%"
|
| 95 |
+
},
|
| 96 |
+
"lab_trend": {
|
| 97 |
+
"total": 3,
|
| 98 |
+
"successful": 3,
|
| 99 |
+
"failed": 0,
|
| 100 |
+
"total_fields": 12,
|
| 101 |
+
"correct_fields": 12,
|
| 102 |
+
"hallucinations": 0,
|
| 103 |
+
"omissions": 0,
|
| 104 |
+
"success_rate": "100.0%",
|
| 105 |
+
"field_accuracy": "100.0%"
|
| 106 |
+
}
|
| 107 |
+
},
|
| 108 |
+
"case_results": [
|
| 109 |
+
{
|
| 110 |
+
"case_id": "patient-001_vital_blood_pressure",
|
| 111 |
+
"query_type": "vital_trend",
|
| 112 |
+
"success": true,
|
| 113 |
+
"accuracy": 0.9285714285714286,
|
| 114 |
+
"fields": "13/14",
|
| 115 |
+
"hallucinations": 1,
|
| 116 |
+
"omissions": 0
|
| 117 |
+
},
|
| 118 |
+
{
|
| 119 |
+
"case_id": "patient-001_vital_heart_rate",
|
| 120 |
+
"query_type": "vital_trend",
|
| 121 |
+
"success": true,
|
| 122 |
+
"accuracy": 0.8571428571428571,
|
| 123 |
+
"fields": "6/7",
|
| 124 |
+
"hallucinations": 0,
|
| 125 |
+
"omissions": 1
|
| 126 |
+
},
|
| 127 |
+
{
|
| 128 |
+
"case_id": "patient-001_meds_all",
|
| 129 |
+
"query_type": "medication_list",
|
| 130 |
+
"success": true,
|
| 131 |
+
"accuracy": 1.0,
|
| 132 |
+
"fields": "3/3",
|
| 133 |
+
"hallucinations": 0,
|
| 134 |
+
"omissions": 0
|
| 135 |
+
},
|
| 136 |
+
{
|
| 137 |
+
"case_id": "patient-001_meds_active",
|
| 138 |
+
"query_type": "medication_list",
|
| 139 |
+
"success": false,
|
| 140 |
+
"accuracy": 0.5,
|
| 141 |
+
"fields": "2/4",
|
| 142 |
+
"hallucinations": 1,
|
| 143 |
+
"omissions": 0
|
| 144 |
+
},
|
| 145 |
+
{
|
| 146 |
+
"case_id": "patient-001_conditions",
|
| 147 |
+
"query_type": "condition_list",
|
| 148 |
+
"success": true,
|
| 149 |
+
"accuracy": 1.0,
|
| 150 |
+
"fields": "3/3",
|
| 151 |
+
"hallucinations": 0,
|
| 152 |
+
"omissions": 0
|
| 153 |
+
},
|
| 154 |
+
{
|
| 155 |
+
"case_id": "patient-001_allergies",
|
| 156 |
+
"query_type": "allergy_list",
|
| 157 |
+
"success": false,
|
| 158 |
+
"accuracy": 0.3333333333333333,
|
| 159 |
+
"fields": "1/3",
|
| 160 |
+
"hallucinations": 1,
|
| 161 |
+
"omissions": 1
|
| 162 |
+
},
|
| 163 |
+
{
|
| 164 |
+
"case_id": "patient-001_immunizations",
|
| 165 |
+
"query_type": "immunization_list",
|
| 166 |
+
"success": false,
|
| 167 |
+
"accuracy": 0.5,
|
| 168 |
+
"fields": "2/4",
|
| 169 |
+
"hallucinations": 1,
|
| 170 |
+
"omissions": 0
|
| 171 |
+
},
|
| 172 |
+
{
|
| 173 |
+
"case_id": "patient-001_procedures",
|
| 174 |
+
"query_type": "procedure_list",
|
| 175 |
+
"success": true,
|
| 176 |
+
"accuracy": 1.0,
|
| 177 |
+
"fields": "2/2",
|
| 178 |
+
"hallucinations": 0,
|
| 179 |
+
"omissions": 0
|
| 180 |
+
},
|
| 181 |
+
{
|
| 182 |
+
"case_id": "patient-001_encounters",
|
| 183 |
+
"query_type": "encounter_list",
|
| 184 |
+
"success": true,
|
| 185 |
+
"accuracy": 1.0,
|
| 186 |
+
"fields": "1/1",
|
| 187 |
+
"hallucinations": 0,
|
| 188 |
+
"omissions": 0
|
| 189 |
+
},
|
| 190 |
+
{
|
| 191 |
+
"case_id": "patient-001_lab_a1c",
|
| 192 |
+
"query_type": "lab_trend",
|
| 193 |
+
"success": true,
|
| 194 |
+
"accuracy": 1.0,
|
| 195 |
+
"fields": "4/4",
|
| 196 |
+
"hallucinations": 0,
|
| 197 |
+
"omissions": 0
|
| 198 |
+
},
|
| 199 |
+
{
|
| 200 |
+
"case_id": "patient-002_vital_blood_pressure",
|
| 201 |
+
"query_type": "vital_trend",
|
| 202 |
+
"success": false,
|
| 203 |
+
"accuracy": 0.7857142857142857,
|
| 204 |
+
"fields": "11/14",
|
| 205 |
+
"hallucinations": 1,
|
| 206 |
+
"omissions": 0
|
| 207 |
+
},
|
| 208 |
+
{
|
| 209 |
+
"case_id": "patient-002_vital_heart_rate",
|
| 210 |
+
"query_type": "vital_trend",
|
| 211 |
+
"success": true,
|
| 212 |
+
"accuracy": 0.8571428571428571,
|
| 213 |
+
"fields": "6/7",
|
| 214 |
+
"hallucinations": 1,
|
| 215 |
+
"omissions": 0
|
| 216 |
+
},
|
| 217 |
+
{
|
| 218 |
+
"case_id": "patient-002_meds_all",
|
| 219 |
+
"query_type": "medication_list",
|
| 220 |
+
"success": true,
|
| 221 |
+
"accuracy": 1.0,
|
| 222 |
+
"fields": "3/3",
|
| 223 |
+
"hallucinations": 0,
|
| 224 |
+
"omissions": 0
|
| 225 |
+
},
|
| 226 |
+
{
|
| 227 |
+
"case_id": "patient-002_meds_active",
|
| 228 |
+
"query_type": "medication_list",
|
| 229 |
+
"success": true,
|
| 230 |
+
"accuracy": 1.0,
|
| 231 |
+
"fields": "3/3",
|
| 232 |
+
"hallucinations": 0,
|
| 233 |
+
"omissions": 0
|
| 234 |
+
},
|
| 235 |
+
{
|
| 236 |
+
"case_id": "patient-002_conditions",
|
| 237 |
+
"query_type": "condition_list",
|
| 238 |
+
"success": false,
|
| 239 |
+
"accuracy": 0.3333333333333333,
|
| 240 |
+
"fields": "1/3",
|
| 241 |
+
"hallucinations": 0,
|
| 242 |
+
"omissions": 1
|
| 243 |
+
},
|
| 244 |
+
{
|
| 245 |
+
"case_id": "patient-002_allergies",
|
| 246 |
+
"query_type": "allergy_list",
|
| 247 |
+
"success": false,
|
| 248 |
+
"accuracy": 0.3333333333333333,
|
| 249 |
+
"fields": "1/3",
|
| 250 |
+
"hallucinations": 0,
|
| 251 |
+
"omissions": 1
|
| 252 |
+
},
|
| 253 |
+
{
|
| 254 |
+
"case_id": "patient-002_immunizations",
|
| 255 |
+
"query_type": "immunization_list",
|
| 256 |
+
"success": false,
|
| 257 |
+
"accuracy": 0.3333333333333333,
|
| 258 |
+
"fields": "1/3",
|
| 259 |
+
"hallucinations": 0,
|
| 260 |
+
"omissions": 1
|
| 261 |
+
},
|
| 262 |
+
{
|
| 263 |
+
"case_id": "patient-002_procedures",
|
| 264 |
+
"query_type": "procedure_list",
|
| 265 |
+
"success": true,
|
| 266 |
+
"accuracy": 1.0,
|
| 267 |
+
"fields": "2/2",
|
| 268 |
+
"hallucinations": 0,
|
| 269 |
+
"omissions": 0
|
| 270 |
+
},
|
| 271 |
+
{
|
| 272 |
+
"case_id": "patient-002_encounters",
|
| 273 |
+
"query_type": "encounter_list",
|
| 274 |
+
"success": true,
|
| 275 |
+
"accuracy": 1.0,
|
| 276 |
+
"fields": "1/1",
|
| 277 |
+
"hallucinations": 0,
|
| 278 |
+
"omissions": 0
|
| 279 |
+
},
|
| 280 |
+
{
|
| 281 |
+
"case_id": "patient-002_lab_a1c",
|
| 282 |
+
"query_type": "lab_trend",
|
| 283 |
+
"success": true,
|
| 284 |
+
"accuracy": 1.0,
|
| 285 |
+
"fields": "4/4",
|
| 286 |
+
"hallucinations": 0,
|
| 287 |
+
"omissions": 0
|
| 288 |
+
},
|
| 289 |
+
{
|
| 290 |
+
"case_id": "patient-003_vital_blood_pressure",
|
| 291 |
+
"query_type": "vital_trend",
|
| 292 |
+
"success": true,
|
| 293 |
+
"accuracy": 1.0,
|
| 294 |
+
"fields": "14/14",
|
| 295 |
+
"hallucinations": 0,
|
| 296 |
+
"omissions": 0
|
| 297 |
+
},
|
| 298 |
+
{
|
| 299 |
+
"case_id": "patient-003_vital_heart_rate",
|
| 300 |
+
"query_type": "vital_trend",
|
| 301 |
+
"success": true,
|
| 302 |
+
"accuracy": 0.8571428571428571,
|
| 303 |
+
"fields": "6/7",
|
| 304 |
+
"hallucinations": 0,
|
| 305 |
+
"omissions": 0
|
| 306 |
+
},
|
| 307 |
+
{
|
| 308 |
+
"case_id": "patient-003_meds_all",
|
| 309 |
+
"query_type": "medication_list",
|
| 310 |
+
"success": false,
|
| 311 |
+
"accuracy": 0.3333333333333333,
|
| 312 |
+
"fields": "1/3",
|
| 313 |
+
"hallucinations": 0,
|
| 314 |
+
"omissions": 1
|
| 315 |
+
},
|
| 316 |
+
{
|
| 317 |
+
"case_id": "patient-003_meds_active",
|
| 318 |
+
"query_type": "medication_list",
|
| 319 |
+
"success": true,
|
| 320 |
+
"accuracy": 1.0,
|
| 321 |
+
"fields": "3/3",
|
| 322 |
+
"hallucinations": 0,
|
| 323 |
+
"omissions": 0
|
| 324 |
+
},
|
| 325 |
+
{
|
| 326 |
+
"case_id": "patient-003_conditions",
|
| 327 |
+
"query_type": "condition_list",
|
| 328 |
+
"success": false,
|
| 329 |
+
"accuracy": 0.3333333333333333,
|
| 330 |
+
"fields": "1/3",
|
| 331 |
+
"hallucinations": 0,
|
| 332 |
+
"omissions": 1
|
| 333 |
+
},
|
| 334 |
+
{
|
| 335 |
+
"case_id": "patient-003_allergies",
|
| 336 |
+
"query_type": "allergy_list",
|
| 337 |
+
"success": true,
|
| 338 |
+
"accuracy": 1.0,
|
| 339 |
+
"fields": "2/2",
|
| 340 |
+
"hallucinations": 0,
|
| 341 |
+
"omissions": 0
|
| 342 |
+
},
|
| 343 |
+
{
|
| 344 |
+
"case_id": "patient-003_immunizations",
|
| 345 |
+
"query_type": "immunization_list",
|
| 346 |
+
"success": true,
|
| 347 |
+
"accuracy": 1.0,
|
| 348 |
+
"fields": "3/3",
|
| 349 |
+
"hallucinations": 0,
|
| 350 |
+
"omissions": 0
|
| 351 |
+
},
|
| 352 |
+
{
|
| 353 |
+
"case_id": "patient-003_procedures",
|
| 354 |
+
"query_type": "procedure_list",
|
| 355 |
+
"success": false,
|
| 356 |
+
"accuracy": 0.5,
|
| 357 |
+
"fields": "2/4",
|
| 358 |
+
"hallucinations": 1,
|
| 359 |
+
"omissions": 0
|
| 360 |
+
},
|
| 361 |
+
{
|
| 362 |
+
"case_id": "patient-003_encounters",
|
| 363 |
+
"query_type": "encounter_list",
|
| 364 |
+
"success": true,
|
| 365 |
+
"accuracy": 1.0,
|
| 366 |
+
"fields": "1/1",
|
| 367 |
+
"hallucinations": 0,
|
| 368 |
+
"omissions": 0
|
| 369 |
+
},
|
| 370 |
+
{
|
| 371 |
+
"case_id": "patient-003_lab_a1c",
|
| 372 |
+
"query_type": "lab_trend",
|
| 373 |
+
"success": true,
|
| 374 |
+
"accuracy": 1.0,
|
| 375 |
+
"fields": "4/4",
|
| 376 |
+
"hallucinations": 0,
|
| 377 |
+
"omissions": 0
|
| 378 |
+
}
|
| 379 |
+
]
|
| 380 |
+
}
|
evaluation/reports/eval_report_20260127_174147.txt
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
============================================================
|
| 2 |
+
PRE-VISIT SUMMARY EVALUATION REPORT
|
| 3 |
+
Generated: 2026-01-27 17:41:47
|
| 4 |
+
============================================================
|
| 5 |
+
|
| 6 |
+
OVERALL RESULTS
|
| 7 |
+
----------------------------------------
|
| 8 |
+
Total Test Cases: 30
|
| 9 |
+
Successful: 20
|
| 10 |
+
Failed: 10
|
| 11 |
+
Success Rate: 66.7%
|
| 12 |
+
|
| 13 |
+
Total Fields Checked: 132
|
| 14 |
+
Correct Fields: 107
|
| 15 |
+
Field Accuracy: 81.1%
|
| 16 |
+
|
| 17 |
+
Hallucinations: 7 (5.3%)
|
| 18 |
+
Omissions: 7 (5.3%)
|
| 19 |
+
Mismatches: 3
|
| 20 |
+
|
| 21 |
+
BY QUERY TYPE
|
| 22 |
+
----------------------------------------
|
| 23 |
+
Query Type Success Accuracy Hall.
|
| 24 |
+
------------------------------------------------------------
|
| 25 |
+
vital_trend 83.3% 88.9% 3
|
| 26 |
+
medication_list 66.7% 78.9% 1
|
| 27 |
+
condition_list 33.3% 55.6% 0
|
| 28 |
+
allergy_list 33.3% 50.0% 1
|
| 29 |
+
immunization_list 33.3% 60.0% 1
|
| 30 |
+
procedure_list 66.7% 75.0% 1
|
| 31 |
+
encounter_list 100.0% 100.0% 0
|
| 32 |
+
lab_trend 100.0% 100.0% 0
|
| 33 |
+
|
| 34 |
+
FAILED CASES
|
| 35 |
+
----------------------------------------
|
| 36 |
+
patient-001_meds_active
|
| 37 |
+
Type: medication_list, Accuracy: 50.0%
|
| 38 |
+
Hallucinations: 1, Omissions: 0
|
| 39 |
+
patient-001_allergies
|
| 40 |
+
Type: allergy_list, Accuracy: 33.3%
|
| 41 |
+
Hallucinations: 1, Omissions: 1
|
| 42 |
+
patient-001_immunizations
|
| 43 |
+
Type: immunization_list, Accuracy: 50.0%
|
| 44 |
+
Hallucinations: 1, Omissions: 0
|
| 45 |
+
patient-002_vital_blood_pressure
|
| 46 |
+
Type: vital_trend, Accuracy: 78.6%
|
| 47 |
+
Hallucinations: 1, Omissions: 0
|
| 48 |
+
patient-002_conditions
|
| 49 |
+
Type: condition_list, Accuracy: 33.3%
|
| 50 |
+
Hallucinations: 0, Omissions: 1
|
| 51 |
+
patient-002_allergies
|
| 52 |
+
Type: allergy_list, Accuracy: 33.3%
|
| 53 |
+
Hallucinations: 0, Omissions: 1
|
| 54 |
+
patient-002_immunizations
|
| 55 |
+
Type: immunization_list, Accuracy: 33.3%
|
| 56 |
+
Hallucinations: 0, Omissions: 1
|
| 57 |
+
patient-003_meds_all
|
| 58 |
+
Type: medication_list, Accuracy: 33.3%
|
| 59 |
+
Hallucinations: 0, Omissions: 1
|
| 60 |
+
patient-003_conditions
|
| 61 |
+
Type: condition_list, Accuracy: 33.3%
|
| 62 |
+
Hallucinations: 0, Omissions: 1
|
| 63 |
+
patient-003_procedures
|
| 64 |
+
Type: procedure_list, Accuracy: 50.0%
|
| 65 |
+
Hallucinations: 1, Omissions: 0
|
| 66 |
+
|
| 67 |
+
============================================================
|
evaluation/run_evaluation.py
ADDED
|
@@ -0,0 +1,246 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Evaluation Runner
|
| 4 |
+
|
| 5 |
+
Main entry point for running the pre-visit summary evaluation.
|
| 6 |
+
|
| 7 |
+
This can be run in two modes:
|
| 8 |
+
1. Direct mode: Directly compute expected vs actual from database (no LLM needed)
|
| 9 |
+
2. Agent mode: Run actual agent queries and extract facts from responses
|
| 10 |
+
|
| 11 |
+
For initial testing, we use direct mode to validate the evaluation framework.
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
import os
|
| 15 |
+
import sys
|
| 16 |
+
import json
|
| 17 |
+
import argparse
|
| 18 |
+
from datetime import datetime
|
| 19 |
+
from typing import Dict, List, Any
|
| 20 |
+
|
| 21 |
+
# Add parent directory to path for imports
|
| 22 |
+
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 23 |
+
|
| 24 |
+
from evaluation.test_generator import generate_all_test_cases, get_test_summary
|
| 25 |
+
from evaluation.expected_values import compute_expected_values
|
| 26 |
+
from evaluation.evaluator import evaluate_case, CaseEvaluation
|
| 27 |
+
from evaluation.metrics import aggregate_metrics, format_report, save_report
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def run_direct_evaluation(num_patients: int = 10, output_dir: str = None) -> Dict:
    """
    Run evaluation in direct mode.

    Expected values are computed from the database and then fed back in
    as the "agent" answer, simulating a perfect agent. This validates
    that the evaluation framework itself works correctly: a perfect
    agent should score 100%.

    Args:
        num_patients: Number of patients to generate test cases for.
        output_dir: Optional directory in which to save text/JSON reports.

    Returns:
        Aggregated metrics as a plain dict.
    """
    print("=" * 60)
    print("PRE-VISIT SUMMARY EVALUATION - DIRECT MODE")
    print("=" * 60)
    print(f"\nGenerating test cases for {num_patients} patients...")

    # Build the test suite from the database.
    cases = generate_all_test_cases(num_patients=num_patients)
    summary = get_test_summary(cases)

    print(f"Generated {summary['total_cases']} test cases")
    print("\nBy query type:")
    for qtype, count in sorted(summary["by_type"].items()):
        print(f"  {qtype}: {count}")

    print("\nRunning evaluation...")

    evaluations = []
    for idx, case in enumerate(cases, start=1):
        # Ground truth from the database.
        expected = compute_expected_values(case)

        # Perfect agent: echo the ground truth back unchanged.
        evaluations.append(evaluate_case(case, expected, expected.copy()))

        # Periodic progress indicator.
        if idx % 20 == 0:
            print(f"  Processed {idx}/{len(cases)} cases...")

    metrics = aggregate_metrics(evaluations)

    print("\n" + format_report(metrics))

    if output_dir:
        text_path, json_path = save_report(metrics, output_dir)
        print(f"\nReports saved to:")
        print(f"  {text_path}")
        print(f"  {json_path}")

    return metrics.to_dict()
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def run_simulated_evaluation(num_patients: int = 10, error_rate: float = 0.1,
                             output_dir: str = None) -> Dict:
    """
    Run evaluation with simulated errors.

    This mode introduces controlled errors (numeric perturbations,
    omissions, hallucinations) via introduce_errors() to test that the
    evaluation framework correctly detects and scores them.

    Args:
        num_patients: Number of patients to test
        error_rate: Fraction of values to corrupt (0.0 - 1.0)
        output_dir: Directory to save reports (None to skip saving)

    Returns:
        Aggregated metrics as a plain dict.
    """
    # NOTE: removed an unused local `import random` — randomness is handled
    # entirely inside introduce_errors(), which does its own imports.
    print("=" * 60)
    print("PRE-VISIT SUMMARY EVALUATION - SIMULATED ERROR MODE")
    print(f"Error rate: {error_rate:.0%}")
    print("=" * 60)

    print(f"\nGenerating test cases for {num_patients} patients...")

    test_cases = generate_all_test_cases(num_patients=num_patients)
    summary = get_test_summary(test_cases)

    print(f"Generated {summary['total_cases']} test cases")

    print("\nRunning evaluation with simulated errors...")

    evaluations = []
    for i, test_case in enumerate(test_cases):
        expected = compute_expected_values(test_case)

        # Corrupt a controlled fraction of the ground-truth facts.
        actual_facts = introduce_errors(expected, error_rate)

        evaluation = evaluate_case(test_case, expected, actual_facts)
        evaluations.append(evaluation)

        if (i + 1) % 20 == 0:
            print(f"  Processed {i + 1}/{len(test_cases)} cases...")

    metrics = aggregate_metrics(evaluations)

    print("\n" + format_report(metrics))

    if output_dir:
        text_path, json_path = save_report(metrics, output_dir)
        print(f"\nReports saved to:")
        print(f"  {text_path}")
        print(f"  {json_path}")

    return metrics.to_dict()
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
def introduce_errors(expected: Dict, error_rate: float) -> Dict:
    """
    Introduce controlled errors into expected values.

    Error types:
    - Numeric perturbation (add/subtract random amount)
    - Omission (remove items from lists)
    - Hallucination (add fake items to lists)

    The input dict is never mutated; a corrupted deep copy is returned.
    """
    import random
    import copy

    corrupted = copy.deepcopy(expected)

    # Trend facts (vital/lab): perturb numbers, occasionally null out counts.
    if "metrics" in corrupted:
        for per_label in corrupted["metrics"].values():
            if not isinstance(per_label, dict):
                continue
            for key, value in list(per_label.items()):
                if random.random() >= error_rate:
                    continue
                if isinstance(value, (int, float)) and key != "count":
                    # Numeric perturbation within +/- 5.
                    per_label[key] = round(value + random.uniform(-5, 5), 1)
                elif key == "count" and random.random() < 0.5:
                    # Sometimes omit the count entirely.
                    per_label[key] = None

    # List facts: drop items with probability error_rate, then maybe
    # append a fabricated ("hallucinated") entry.
    for field in ["medication_names", "condition_names", "substances",
                  "vaccine_names", "procedure_names"]:
        if field not in corrupted:
            continue

        kept = [item for item in corrupted[field]
                if random.random() >= error_rate]

        if random.random() < error_rate:
            kept.append(f"FAKE_ITEM_{random.randint(1000, 9999)}")

        corrupted[field] = kept
        corrupted["count"] = len(kept)

    return corrupted
|
| 194 |
+
|
| 195 |
+
|
| 196 |
+
def main():
    """Parse CLI arguments and dispatch to the selected evaluation mode.

    Supported flags:
        --mode       'direct' (perfect agent) or 'simulated' (injected errors)
        --patients   number of patients to test
        --error-rate corruption fraction, only used in simulated mode
        --output-dir where text/JSON reports are written
    """
    parser = argparse.ArgumentParser(description="Run pre-visit summary evaluation")

    parser.add_argument(
        "--mode",
        choices=["direct", "simulated"],
        default="direct",
        help="Evaluation mode: 'direct' for perfect agent, 'simulated' for errors"
    )

    parser.add_argument(
        "--patients",
        type=int,
        default=10,
        help="Number of patients to test (default: 10)"
    )

    parser.add_argument(
        "--error-rate",
        type=float,
        default=0.1,
        help="Error rate for simulated mode (default: 0.1)"
    )

    parser.add_argument(
        "--output-dir",
        type=str,
        default="evaluation/reports",
        help="Directory to save reports (default: evaluation/reports)"
    )

    args = parser.parse_args()

    # Ensure output directory exists
    os.makedirs(args.output_dir, exist_ok=True)

    if args.mode == "direct":
        run_direct_evaluation(
            num_patients=args.patients,
            output_dir=args.output_dir
        )
    else:
        # Any non-"direct" mode is "simulated" (argparse restricts choices).
        run_simulated_evaluation(
            num_patients=args.patients,
            error_rate=args.error_rate,
            output_dir=args.output_dir
        )
|
| 243 |
+
|
| 244 |
+
|
| 245 |
+
if __name__ == "__main__":
    # Script entry point: run the evaluation CLI.
    main()
|
evaluation/test_generator.py
ADDED
|
@@ -0,0 +1,357 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Test Case Generator for Pre-Visit Summary Evaluation
|
| 4 |
+
|
| 5 |
+
Generates test cases from Synthea patient data with known ground truth.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import sqlite3
|
| 9 |
+
import random
|
| 10 |
+
from datetime import datetime, timedelta
|
| 11 |
+
from typing import List, Dict, Any
|
| 12 |
+
import os
|
| 13 |
+
|
| 14 |
+
DB_PATH = os.getenv("DB_PATH", "data/fhir.db")
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def get_db():
    """Open a connection to the Synthea SQLite database.

    Rows are returned as sqlite3.Row so columns can be accessed by name.
    """
    connection = sqlite3.connect(DB_PATH)
    connection.row_factory = sqlite3.Row
    return connection
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def get_test_patients(limit: int = 10) -> List[Dict]:
    """Get patients that have sufficient data for testing.

    Selects up to ``limit`` patients that have more than 10 observations,
    ordered richest-first by observation count, and returns one summary
    dict per patient including per-table record counts.
    """
    conn = get_db()
    try:
        # Find patients with good data coverage.
        # Correlated subqueries count each clinical table per patient; the
        # ">10 observations" filter skips patients too sparse to test.
        cursor = conn.execute("""
            SELECT p.id, p.given_name, p.family_name, p.birth_date, p.gender,
                   (SELECT COUNT(*) FROM conditions WHERE patient_id = p.id) as condition_count,
                   (SELECT COUNT(*) FROM medications WHERE patient_id = p.id) as med_count,
                   (SELECT COUNT(*) FROM observations WHERE patient_id = p.id) as obs_count,
                   (SELECT COUNT(*) FROM allergies WHERE patient_id = p.id) as allergy_count,
                   (SELECT COUNT(*) FROM immunizations WHERE patient_id = p.id) as imm_count,
                   (SELECT COUNT(*) FROM procedures WHERE patient_id = p.id) as proc_count,
                   (SELECT COUNT(*) FROM encounters WHERE patient_id = p.id) as enc_count
            FROM patients p
            WHERE (SELECT COUNT(*) FROM observations WHERE patient_id = p.id) > 10
            ORDER BY obs_count DESC
            LIMIT ?
        """, (limit,))

        patients = []
        for row in cursor.fetchall():
            # Flatten each row into a summary dict the test generator expects.
            patients.append({
                "patient_id": row["id"],
                "name": f"{row['given_name']} {row['family_name']}",
                "birth_date": row["birth_date"],
                "gender": row["gender"],
                "data_counts": {
                    "conditions": row["condition_count"],
                    "medications": row["med_count"],
                    "observations": row["obs_count"],
                    "allergies": row["allergy_count"],
                    "immunizations": row["imm_count"],
                    "procedures": row["proc_count"],
                    "encounters": row["enc_count"]
                }
            })
        return patients
    finally:
        conn.close()
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def generate_vital_trend_cases(patient_id: str, days: int = 30) -> List[Dict]:
    """Generate test cases for vital sign trends (BP, heart rate, etc.).

    A case is emitted for each vital type the patient has at least three
    readings of, so every generated trend is meaningful.
    """
    vital_types = [
        ("blood_pressure", ["8480-6", "8462-4"], ["systolic", "diastolic"]),
        ("heart_rate", ["8867-4"], ["heart_rate"]),
        ("weight", ["29463-7"], ["weight"]),
        ("temperature", ["8310-5"], ["temperature"]),
        ("oxygen_saturation", ["2708-6"], ["oxygen_saturation"]),
    ]

    cases: List[Dict] = []
    conn = get_db()
    try:
        for name, loinc_codes, labels in vital_types:
            # One placeholder per LOINC code for the IN clause.
            marks = ",".join("?" for _ in loinc_codes)
            row = conn.execute(f"""
                SELECT COUNT(*) as cnt FROM observations
                WHERE patient_id = ? AND code IN ({marks})
            """, [patient_id] + loinc_codes).fetchone()

            # Need at least 3 readings for a meaningful trend test.
            if row["cnt"] >= 3:
                cases.append({
                    "case_id": f"{patient_id}_vital_{name}",
                    "patient_id": patient_id,
                    "query_type": "vital_trend",
                    "query": f"Show me my {name.replace('_', ' ')} chart",
                    "parameters": {
                        "vital_type": name,
                        "days": days,
                        "codes": loinc_codes,
                        "labels": labels
                    }
                })
    finally:
        conn.close()

    return cases
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
def generate_medication_cases(patient_id: str) -> List[Dict]:
    """Generate test cases for medication queries.

    Produces an "all medications" case when the patient has any
    medications, plus an "active only" case when at least one is active.
    """
    conn = get_db()
    try:
        # One pass: total medications and how many are currently active.
        row = conn.execute("""
            SELECT COUNT(*) as total,
                   SUM(CASE WHEN status = 'active' THEN 1 ELSE 0 END) as active
            FROM medications WHERE patient_id = ?
        """, (patient_id,)).fetchone()
    finally:
        conn.close()

    if not row["total"]:
        return []

    cases = [{
        "case_id": f"{patient_id}_meds_all",
        "patient_id": patient_id,
        "query_type": "medication_list",
        "query": "What medications am I taking?",
        "parameters": {"status": None}
    }]

    if row["active"] > 0:
        cases.append({
            "case_id": f"{patient_id}_meds_active",
            "patient_id": patient_id,
            "query_type": "medication_list",
            "query": "What are my current active medications?",
            "parameters": {"status": "active"}
        })

    return cases
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
def generate_condition_cases(patient_id: str) -> List[Dict]:
    """Generate test cases for condition queries.

    Returns a single condition-list case when the patient has any
    recorded conditions, otherwise an empty list.
    """
    conn = get_db()
    try:
        row = conn.execute("""
            SELECT COUNT(*) as cnt FROM conditions WHERE patient_id = ?
        """, (patient_id,)).fetchone()
    finally:
        conn.close()

    if row["cnt"] == 0:
        return []

    return [{
        "case_id": f"{patient_id}_conditions",
        "patient_id": patient_id,
        "query_type": "condition_list",
        "query": "What are my medical conditions?",
        "parameters": {}
    }]
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
def generate_allergy_cases(patient_id: str) -> List[Dict]:
    """Generate test cases for allergy queries.

    Returns a single allergy-list case when the patient has any recorded
    allergies, otherwise an empty list.
    """
    conn = get_db()
    try:
        row = conn.execute("""
            SELECT COUNT(*) as cnt FROM allergies WHERE patient_id = ?
        """, (patient_id,)).fetchone()
    finally:
        conn.close()

    if row["cnt"] == 0:
        return []

    return [{
        "case_id": f"{patient_id}_allergies",
        "patient_id": patient_id,
        "query_type": "allergy_list",
        "query": "What are my allergies?",
        "parameters": {}
    }]
|
| 193 |
+
|
| 194 |
+
|
| 195 |
+
def generate_immunization_cases(patient_id: str) -> List[Dict]:
    """Generate test cases for immunization queries.

    Returns a single immunization-list case when the patient has any
    recorded immunizations, otherwise an empty list.
    """
    conn = get_db()
    try:
        row = conn.execute("""
            SELECT COUNT(*) as cnt FROM immunizations WHERE patient_id = ?
        """, (patient_id,)).fetchone()
    finally:
        conn.close()

    if row["cnt"] == 0:
        return []

    return [{
        "case_id": f"{patient_id}_immunizations",
        "patient_id": patient_id,
        "query_type": "immunization_list",
        "query": "What immunizations have I had?",
        "parameters": {}
    }]
|
| 217 |
+
|
| 218 |
+
|
| 219 |
+
def generate_procedure_cases(patient_id: str) -> List[Dict]:
    """Generate test cases for procedure/surgical history queries.

    Returns a single procedure-list case when the patient has any
    recorded procedures, otherwise an empty list.
    """
    conn = get_db()
    try:
        row = conn.execute("""
            SELECT COUNT(*) as cnt FROM procedures WHERE patient_id = ?
        """, (patient_id,)).fetchone()
    finally:
        conn.close()

    if row["cnt"] == 0:
        return []

    return [{
        "case_id": f"{patient_id}_procedures",
        "patient_id": patient_id,
        "query_type": "procedure_list",
        "query": "What procedures or surgeries have I had?",
        "parameters": {}
    }]
|
| 241 |
+
|
| 242 |
+
|
| 243 |
+
def generate_encounter_cases(patient_id: str) -> List[Dict]:
    """Generate test cases for encounter history queries.

    Returns a single encounter-list case (capped at 5 recent visits)
    when the patient has any recorded encounters, otherwise an empty list.
    """
    conn = get_db()
    try:
        row = conn.execute("""
            SELECT COUNT(*) as cnt FROM encounters WHERE patient_id = ?
        """, (patient_id,)).fetchone()
    finally:
        conn.close()

    if row["cnt"] == 0:
        return []

    return [{
        "case_id": f"{patient_id}_encounters",
        "patient_id": patient_id,
        "query_type": "encounter_list",
        "query": "Show me my recent visits",
        "parameters": {"limit": 5}
    }]
|
| 265 |
+
|
| 266 |
+
|
| 267 |
+
def generate_lab_cases(patient_id: str) -> List[Dict]:
    """Generate test cases for lab result queries."""
    # (internal name, LOINC code, human-readable label) per supported lab.
    lab_types = [
        ("a1c", "4548-4", "HbA1c"),
        ("cholesterol", "2093-3", "Total Cholesterol"),
        ("glucose", "2345-7", "Glucose"),
    ]

    cases: List[Dict] = []
    conn = get_db()
    try:
        for lab_name, code, display in lab_types:
            row = conn.execute(
                """
            SELECT COUNT(*) as cnt FROM observations
            WHERE patient_id = ? AND code = ?
        """,
                (patient_id, code),
            ).fetchone()
            # A trend query needs at least two data points to be meaningful.
            if row["cnt"] < 2:
                continue
            cases.append(
                {
                    "case_id": f"{patient_id}_lab_{lab_name}",
                    "patient_id": patient_id,
                    "query_type": "lab_trend",
                    "query": f"Show me my {display} history",
                    "parameters": {"lab_type": lab_name, "code": code},
                }
            )
    finally:
        conn.close()

    return cases
|
| 300 |
+
|
| 301 |
+
|
| 302 |
+
def generate_all_test_cases(num_patients: int = 10) -> List[Dict]:
    """Generate complete test suite from available patients."""
    # One generator per data type; order matches the original call sequence
    # so the resulting case list is ordered identically.
    generators = (
        generate_vital_trend_cases,
        generate_medication_cases,
        generate_condition_cases,
        generate_allergy_cases,
        generate_immunization_cases,
        generate_procedure_cases,
        generate_encounter_cases,
        generate_lab_cases,
    )

    all_cases: List[Dict] = []
    for patient in get_test_patients(num_patients):
        pid = patient["patient_id"]
        for make_cases in generators:
            all_cases.extend(make_cases(pid))

    return all_cases
|
| 321 |
+
|
| 322 |
+
|
| 323 |
+
def get_test_summary(test_cases: List[Dict]) -> Dict:
    """Summarize generated test cases.

    Args:
        test_cases: Test-case dicts, each containing at least the
            "query_type" and "patient_id" keys.

    Returns:
        Dict with "total_cases" (int), "by_type" (query_type -> count),
        and "by_patient" (patient_id -> count).
    """
    # Counter replaces the manual dict.get(key, 0) + 1 tallying; convert
    # back to plain dicts so the return shape is unchanged for callers.
    from collections import Counter

    return {
        "total_cases": len(test_cases),
        "by_type": dict(Counter(case["query_type"] for case in test_cases)),
        "by_patient": dict(Counter(case["patient_id"] for case in test_cases)),
    }
|
| 341 |
+
|
| 342 |
+
|
| 343 |
+
if __name__ == "__main__":
    # Smoke-test the generator: build a small suite and print a breakdown.
    print("Generating test cases...")
    generated = generate_all_test_cases(num_patients=5)
    stats = get_test_summary(generated)

    print(f"\nTotal test cases: {stats['total_cases']}")
    print("\nBy query type:")
    for qtype, count in sorted(stats["by_type"].items()):
        print(f" {qtype}: {count}")

    print("\nSample test case:")
    if generated:
        import json
        print(json.dumps(generated[0], indent=2))