|
|
""" |
|
|
Synthetic Medical Test Data Generator |
|
|
Creates realistic medical test cases for validation without real PHI |
|
|
""" |
|
|
|
|
|
import json |
|
|
import random |
|
|
from datetime import datetime, timedelta |
|
|
from typing import Dict, List, Any |
|
|
|
|
|
class MedicalTestDataGenerator:
    """Generate synthetic medical test data for validation.

    Every value is drawn from a private ``random.Random`` instance seeded in
    ``__init__``, so a generator is reproducible for a given seed and does
    not disturb (and is not disturbed by) the module-level ``random`` state
    or other generator instances.
    """

    # Inclusive heart-rate range (bpm) sampled for each ECG pathology.
    _ECG_HEART_RATE_RANGES = {
        "normal": (60, 100),
        "atrial_fibrillation": (80, 150),
        "ventricular_tachycardia": (150, 250),
        "heart_block": (30, 60),
        "st_elevation": (60, 100),
        "st_depression": (60, 100),
        "qt_prolongation": (60, 90),
        "bundle_branch_block": (60, 100),
    }

    # Share of the ECG cases given to each pathology, in whole percent.
    _ECG_DISTRIBUTION = [
        ("normal", 20),
        ("atrial_fibrillation", 16),
        ("ventricular_tachycardia", 12),
        ("heart_block", 10),
        ("st_elevation", 14),
        ("st_depression", 12),
        ("qt_prolongation", 8),
        ("bundle_branch_block", 8),
    ]

    # Share of the radiology cases given to each pathology, in whole percent.
    _RADIOLOGY_DISTRIBUTION = [
        ("normal", 25),
        ("pneumonia", 30),
        ("fracture", 20),
        ("tumor", 15),
        ("organomegaly", 10),
    ]

    # Canned report text per radiology pathology.
    _RADIOLOGY_FINDINGS = {
        "normal": "No acute findings",
        "pneumonia": "Focal consolidation in right lower lobe",
        "fracture": "Transverse fracture of distal radius",
        "tumor": "3.2 cm mass in left upper lobe",
        "organomegaly": "Hepatomegaly with liver span 18 cm",
    }

    # Modalities sampled by generate_test_dataset and the imaging type each
    # one maps to, so that "modality" and "imaging_type" never contradict
    # each other within a case.
    _RADIOLOGY_MODALITIES = ["Chest X-ray", "CT", "MRI", "Ultrasound"]
    _IMAGING_TYPE_BY_MODALITY = {
        "Chest X-ray": "Chest X-ray",
        "CT": "CT Chest",
        "MRI": "MRI Brain",
        "Ultrasound": "Ultrasound Abdomen",
    }
    _IMAGING_TYPES = ["Chest X-ray", "CT Chest", "MRI Brain", "Ultrasound Abdomen"]

    _SEVERITIES = ["mild", "moderate", "severe"]

    _CLINICAL_SIGNIFICANCE = {
        "normal": "None",
        "atrial_fibrillation": "High - stroke risk",
        "ventricular_tachycardia": "Critical - life-threatening",
        "heart_block": "High - may require pacemaker",
        "st_elevation": "Critical - acute MI",
        "st_depression": "High - ischemia",
        "qt_prolongation": "Moderate - arrhythmia risk",
        "bundle_branch_block": "Moderate - conduction disorder",
        "pneumonia": "High - infectious process",
        "fracture": "Moderate - structural injury",
        "tumor": "High - potential malignancy",
        "organomegaly": "Moderate - systemic disease",
    }

    _ANATOMICAL_LOCATION = {
        "pneumonia": "Right lower lobe",
        "fracture": "Distal radius",
        "tumor": "Left upper lobe",
        "organomegaly": "Liver",
    }

    def __init__(self, seed=42):
        """Create a generator whose output is reproducible for ``seed``."""
        # A private RNG yields the same sequence as seeding the global one,
        # without the cross-instance interference of shared global state.
        self._rng = random.Random(seed)

    @staticmethod
    def _allocate_counts(total, weights):
        """Split ``total`` cases across pathologies by integer weights.

        Uses largest-remainder rounding in pure integer arithmetic so the
        returned counts always sum to ``total`` (truncating each share
        independently can drop cases for small or awkward totals).

        Args:
            total: Requested number of cases; values below zero count as 0.
            weights: List of ``(pathology, weight)`` pairs.

        Returns:
            List of ``(pathology, count)`` pairs in the order of ``weights``.
        """
        total = max(int(total), 0)
        weight_sum = sum(weight for _, weight in weights)
        counts = []
        by_remainder = []
        for index, (_, weight) in enumerate(weights):
            quota, remainder = divmod(total * weight, weight_sum)
            counts.append(quota)
            # Largest remainder first; ties go to the earlier pathology.
            by_remainder.append((-remainder, index))
        leftover = total - sum(counts)
        for _, index in sorted(by_remainder)[:leftover]:
            counts[index] += 1
        return [(name, count) for (name, _), count in zip(weights, counts)]

    def generate_ecg_test_case(self, case_id: int, pathology: str) -> Dict[str, Any]:
        """Generate a synthetic ECG test case.

        Args:
            case_id: Numeric id, rendered as ``ECG_<id zero-padded to 4>``.
            pathology: Pathology label; unknown labels fall back to the
                normal heart-rate range and default interval ranges.

        Returns:
            Dict with patient demographics, ``measurements``,
            ``ground_truth``, ``confidence_expected`` and ``review_required``.
        """
        rng = self._rng

        low, high = self._ECG_HEART_RATE_RANGES.get(pathology, (60, 100))
        heart_rate = rng.randint(low, high)

        # Each interval is drawn from an abnormal range only for the
        # pathology that is defined by it.
        if pathology != "heart_block":
            pr_interval = rng.randint(120, 200)
        else:
            pr_interval = rng.randint(200, 350)

        if pathology != "bundle_branch_block":
            qrs_duration = rng.randint(80, 100)
        else:
            qrs_duration = rng.randint(120, 160)

        if pathology != "qt_prolongation":
            qt_interval = rng.randint(350, 450)
        else:
            qt_interval = rng.randint(450, 550)

        # QT divided by the square root of the RR interval, where RR is
        # 60 / heart_rate seconds. heart_rate is always >= 30 here.
        qtc = qt_interval / (60 / heart_rate) ** 0.5

        return {
            "case_id": f"ECG_{case_id:04d}",
            "modality": "ECG",
            "patient_age": rng.randint(30, 80),
            "patient_sex": rng.choice(["M", "F"]),
            "pathology": pathology,
            "measurements": {
                "heart_rate": heart_rate,
                "pr_interval_ms": pr_interval,
                "qrs_duration_ms": qrs_duration,
                "qt_interval_ms": qt_interval,
                "qtc_ms": round(qtc, 1),
                "axis": rng.choice(["normal", "left", "right"]),
            },
            "ground_truth": {
                "diagnosis": pathology,
                "severity": rng.choice(self._SEVERITIES),
                "clinical_significance": self._get_clinical_significance(pathology),
                "requires_immediate_action": pathology in ["ventricular_tachycardia", "st_elevation"],
            },
            "confidence_expected": self._get_expected_confidence(pathology),
            "review_required": pathology in ["heart_block", "qt_prolongation"],
        }

    def generate_radiology_test_case(self, case_id: int, pathology: str, modality: str) -> Dict[str, Any]:
        """Generate a synthetic radiology test case.

        Args:
            case_id: Numeric id, rendered as ``RAD_<id zero-padded to 4>``.
            pathology: Pathology label; unknown labels get
                ``"Unknown findings"``.
            modality: Imaging modality stored on the case. Known modalities
                determine ``imaging_type``; for any other value an imaging
                type is picked at random.

        Returns:
            Dict with patient demographics, ``findings``, ``ground_truth``,
            ``confidence_expected`` and ``review_required``.
        """
        rng = self._rng

        # Keep imaging_type consistent with the requested modality instead of
        # drawing it independently (which could pair "CT" with "MRI Brain").
        imaging_type = self._IMAGING_TYPE_BY_MODALITY.get(modality)
        if imaging_type is None:
            imaging_type = rng.choice(self._IMAGING_TYPES)

        return {
            "case_id": f"RAD_{case_id:04d}",
            "modality": modality,
            "imaging_type": imaging_type,
            "patient_age": rng.randint(20, 85),
            "patient_sex": rng.choice(["M", "F"]),
            "pathology": pathology,
            "findings": self._RADIOLOGY_FINDINGS.get(pathology, "Unknown findings"),
            "ground_truth": {
                "primary_diagnosis": pathology,
                "anatomical_location": self._get_anatomical_location(pathology),
                "severity": rng.choice(self._SEVERITIES),
                "clinical_significance": self._get_clinical_significance(pathology),
                "requires_follow_up": pathology != "normal",
            },
            "confidence_expected": self._get_expected_confidence(pathology),
            "review_required": pathology in ["tumor", "fracture"],
        }

    def _get_clinical_significance(self, pathology: str) -> str:
        """Return the significance label for ``pathology`` or ``"Unknown"``."""
        return self._CLINICAL_SIGNIFICANCE.get(pathology, "Unknown")

    def _get_anatomical_location(self, pathology: str) -> str:
        """Return the anatomical location for ``pathology`` or ``"N/A"``."""
        return self._ANATOMICAL_LOCATION.get(pathology, "N/A")

    def _get_expected_confidence(self, pathology: str) -> float:
        """Expected confidence score for validation.

        Draws uniformly from one of three bands chosen by pathology:
        0.85-0.95, 0.65-0.85, or 0.50-0.70 for everything else.
        """
        if pathology in ["normal", "st_elevation", "ventricular_tachycardia", "fracture"]:
            return self._rng.uniform(0.85, 0.95)
        if pathology in ["qt_prolongation", "heart_block", "pneumonia", "tumor"]:
            return self._rng.uniform(0.65, 0.85)
        return self._rng.uniform(0.50, 0.70)

    def generate_test_dataset(self, num_ecg=500, num_radiology=200) -> Dict[str, List[Dict]]:
        """Generate complete test dataset.

        Args:
            num_ecg: Number of ECG cases to generate.
            num_radiology: Number of radiology cases to generate.

        Returns:
            Dict with ``ecg_cases``, ``radiology_cases`` and ``metadata``
            (generation timestamp, total count and the per-pathology counts
            actually used). Progress is printed to stdout.
        """
        print("Generating synthetic medical test dataset...")
        print(f"ECG cases: {num_ecg}")
        print(f"Radiology cases: {num_radiology}")

        ecg_pathologies = self._allocate_counts(num_ecg, self._ECG_DISTRIBUTION)
        ecg_cases = []
        case_id = 1
        for pathology, count in ecg_pathologies:
            for _ in range(count):
                ecg_cases.append(self.generate_ecg_test_case(case_id, pathology))
                case_id += 1

        rad_pathologies = self._allocate_counts(num_radiology, self._RADIOLOGY_DISTRIBUTION)
        rad_cases = []
        case_id = 1  # radiology ids restart at 1; the RAD_ prefix keeps them distinct
        for pathology, count in rad_pathologies:
            for _ in range(count):
                modality = self._rng.choice(self._RADIOLOGY_MODALITIES)
                rad_cases.append(self.generate_radiology_test_case(case_id, pathology, modality))
                case_id += 1

        total_cases = len(ecg_cases) + len(rad_cases)
        print("\nGenerated:")
        print(f" ECG cases: {len(ecg_cases)}")
        print(f" Radiology cases: {len(rad_cases)}")
        print(f" Total: {total_cases}")

        return {
            "ecg_cases": ecg_cases,
            "radiology_cases": rad_cases,
            "metadata": {
                "generated_date": datetime.now().isoformat(),
                "total_cases": total_cases,
                "ecg_distribution": dict(ecg_pathologies),
                "radiology_distribution": dict(rad_pathologies),
            },
        }
|
|
|
|
|
class ValidationMetricsCalculator:
    """Calculate clinical validation metrics"""

    def calculate_metrics(self, predictions: List[Dict], ground_truth: List[Dict]) -> Dict[str, Any]:
        """Calculate the confusion matrix, sensitivity, specificity, precision and F1.

        Cases are paired by position. A case is a *positive* when its ground
        truth ``"pathology"`` is anything other than ``"normal"``:

        * true positive  - positive case whose predicted ``"diagnosis"``
          equals the true pathology;
        * false negative - positive case predicted as anything else
          (including a different pathology or a missing diagnosis);
        * true negative  - normal case predicted as ``"normal"``;
        * false positive - normal case predicted as anything else.

        Args:
            predictions: Dicts carrying the model output under ``"diagnosis"``.
            ground_truth: Dicts carrying the true label under ``"pathology"``.

        Returns:
            Dict with ``confusion_matrix``, ``metrics`` (each rounded to four
            decimals; a ratio with a zero denominator is reported as 0.0)
            and ``total_cases``.

        Raises:
            ValueError: If the two lists differ in length, since pairing by
                position would otherwise silently drop cases.
        """
        if len(predictions) != len(ground_truth):
            raise ValueError(
                f"predictions ({len(predictions)}) and ground_truth "
                f"({len(ground_truth)}) must have the same length"
            )

        tp = fp = tn = fn = 0

        for pred, truth in zip(predictions, ground_truth):
            predicted = pred.get("diagnosis")
            actual = truth.get("pathology")

            if actual != "normal":
                # Diseased case: only the exact pathology counts as a hit.
                if predicted == actual:
                    tp += 1
                else:
                    fn += 1
            elif predicted == "normal":
                tn += 1
            else:
                fp += 1

        sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        specificity = tn / (tn + fp) if (tn + fp) > 0 else 0.0
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall = sensitivity  # recall is another name for sensitivity
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

        return {
            "confusion_matrix": {
                "true_positives": tp,
                "false_positives": fp,
                "true_negatives": tn,
                "false_negatives": fn,
            },
            "metrics": {
                "sensitivity": round(sensitivity, 4),
                "specificity": round(specificity, 4),
                "precision": round(precision, 4),
                "recall": round(recall, 4),
                "f1_score": round(f1_score, 4),
            },
            "total_cases": len(predictions),
        }
|
|
|
|
|
def main(output_dir: str = "/workspace/medical-ai-platform/test_data") -> None:
    """Generate test dataset and save to files.

    Builds a 500-ECG / 200-radiology dataset with seed 42 and writes four
    JSON files into ``output_dir`` (created if missing): the complete
    dataset, the ECG cases, the radiology cases and a summary. Progress and
    the final statistics are printed to stdout.

    Args:
        output_dir: Directory that receives the JSON files.
    """
    import os

    banner = "=" * 60

    def save_json(payload, filename, message_prefix):
        """Write ``payload`` as indented JSON and report where it went."""
        path = f"{output_dir}/{filename}"
        with open(path, "w", encoding="utf-8") as f:
            json.dump(payload, f, indent=2)
        print(f"{message_prefix}{path}")

    print(banner)
    print("SYNTHETIC MEDICAL TEST DATA GENERATION")
    print(banner)
    print(f"Started: {datetime.now().isoformat()}\n")

    generator = MedicalTestDataGenerator(seed=42)
    dataset = generator.generate_test_dataset(num_ecg=500, num_radiology=200)

    os.makedirs(output_dir, exist_ok=True)

    save_json(dataset, "complete_test_dataset.json", "\nSaved complete dataset to: ")
    save_json(dataset["ecg_cases"], "ecg_test_cases.json", "Saved ECG cases to: ")
    save_json(dataset["radiology_cases"], "radiology_test_cases.json", "Saved radiology cases to: ")

    metadata = dataset["metadata"]
    summary = {
        "total_cases": metadata["total_cases"],
        "ecg_cases": len(dataset["ecg_cases"]),
        "radiology_cases": len(dataset["radiology_cases"]),
        "ecg_distribution": metadata["ecg_distribution"],
        "radiology_distribution": metadata["radiology_distribution"],
        "generated_date": metadata["generated_date"],
    }
    save_json(summary, "dataset_summary.json", "Saved summary to: ")

    print("\n" + banner)
    print("DATA GENERATION COMPLETE")
    print(banner)
    print("\nDataset Statistics:")
    print(f" Total Cases: {summary['total_cases']}")
    print(f" ECG Cases: {summary['ecg_cases']}")
    print(f" Radiology Cases: {summary['radiology_cases']}")
    print("\nECG Pathology Distribution:")
    for pathology, count in summary["ecg_distribution"].items():
        print(f" {pathology}: {count} cases")
    print("\nRadiology Pathology Distribution:")
    for pathology, count in summary["radiology_distribution"].items():
        print(f" {pathology}: {count} cases")
|
|
|
|
|
# Run the generator only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
|
|
|