snikhilesh committed on
Commit
5d407bd
·
verified ·
1 Parent(s): 9c479c4

Deploy medical_schemas.py to backend/ directory

Browse files
Files changed (1) hide show
  1. backend/medical_schemas.py +534 -0
backend/medical_schemas.py ADDED
@@ -0,0 +1,534 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Medical Data Schemas - Phase 1 Implementation
3
+ Canonical JSON schemas for medical data modalities with validation rules and confidence scoring.
4
+
5
+ This module defines the structured data contracts that ensure proper input/output
6
+ formats across the medical AI pipeline, replacing unstructured PDF processing.
7
+
8
+ Author: MiniMax Agent
9
+ Date: 2025-10-29
10
+ Version: 1.0.0
11
+ """
12
+
13
+ from typing import List, Optional, Dict, Any, Union, Literal
14
+ from pydantic import BaseModel, Field, validator, confloat
15
+ from datetime import datetime
16
+ import uuid
17
+ import numpy as np
18
+
19
+
20
+ # ================================
21
+ # BASE TYPES AND ENUMS
22
+ # ================================
23
+
24
class ConfidenceScore(BaseModel):
    """Composite confidence scoring for medical data extraction and analysis"""
    # The three component scores are required and each is constrained to the
    # closed interval [0.0, 1.0].
    extraction_confidence: confloat(ge=0.0, le=1.0) = Field(
        description="Confidence in data extraction from source document (0.0-1.0)"
    )
    model_confidence: confloat(ge=0.0, le=1.0) = Field(
        description="Confidence in AI model analysis/output (0.0-1.0)"
    )
    data_quality: confloat(ge=0.0, le=1.0) = Field(
        description="Quality of source data (completeness, clarity, resolution) (0.0-1.0)"
    )

    @property
    def overall_confidence(self) -> float:
        """Return the weighted blend 0.5 * extraction + 0.3 * model + 0.2 * quality."""
        extraction_term = 0.5 * self.extraction_confidence
        model_term = 0.3 * self.model_confidence
        quality_term = 0.2 * self.data_quality
        return extraction_term + model_term + quality_term

    @property
    def requires_review(self) -> bool:
        """Return True when the overall confidence is below the 0.85 review threshold."""
        return self.overall_confidence < 0.85
48
+
49
+
50
class MedicalDocumentMetadata(BaseModel):
    """Common metadata for all medical documents"""
    # Defaults to a freshly generated UUID4 rendered as a string.
    document_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    # Required; must be one of the listed literals. validate_document_schema and
    # route_to_specialized_model both dispatch on this value.
    source_type: Literal["ECG", "radiology", "laboratory", "clinical_notes", "unknown"]
    document_date: Optional[datetime] = None
    patient_id_hash: Optional[str] = None  # Anonymized identifier
    facility: Optional[str] = None
    provider: Optional[str] = None
    # Defaults to datetime.now() called without a tz argument, i.e. a naive
    # local-time timestamp.
    extraction_timestamp: datetime = Field(default_factory=datetime.now)
    # Required (no default) and constrained to [0.0, 1.0].
    data_completeness: confloat(ge=0.0, le=1.0) = Field(
        description="Overall completeness of extracted data (0.0-1.0)"
    )
62
+
63
+
64
+ # ================================
65
+ # ECG SCHEMA (PHASE 1 PRIORITY)
66
+ # ================================
67
+
68
class ECGSignalData(BaseModel):
    """ECG signal array data for rhythm analysis"""
    lead_names: List[str] = Field(
        description="List of ECG lead names (I, II, III, aVR, aVL, aVF, V1-V6)"
    )
    sampling_rate_hz: int = Field(ge=100, le=1000, description="Sampling rate in Hz")
    signal_arrays: Dict[str, List[float]] = Field(
        description="Dictionary mapping lead names to signal arrays (mV values)"
    )
    duration_seconds: float = Field(gt=0, description="Recording duration in seconds")
    num_samples: int = Field(gt=0, description="Number of samples per lead")

    @validator('signal_arrays')
    def validate_signal_arrays(cls, v):
        """Check that every lead is a non-empty, equally long list of in-range samples.

        Args:
            v: Mapping of lead name to its list of samples (mV).

        Returns:
            The mapping unchanged when every check passes.

        Raises:
            ValueError: If the mapping is empty, a lead is empty or not a list,
                a sample is NaN or outside -5..+5 mV, or lead lengths differ.
        """
        # NOTE(review): this validator only inspects signal_arrays; it does not
        # cross-check the keys against lead_names or the length against
        # num_samples.
        if not v:
            raise ValueError("Signal arrays cannot be empty")

        expected_length = None
        for lead_name, signal in v.items():
            if not isinstance(signal, list) or not signal:
                raise ValueError(f"Lead {lead_name} must be non-empty list")

            # Check for valid mV range (-5 to +5 mV). Written as a positive
            # "inside the range" test so NaN (for which every comparison is
            # False) is rejected instead of slipping past `abs(val) > 5.0`.
            if not all(-5.0 <= val <= 5.0 for val in signal):
                raise ValueError(f"Lead {lead_name} contains values outside valid ECG range (-5 to +5 mV)")

            # Ensure consistent array length: the first lead sets the expectation.
            if expected_length is None:
                expected_length = len(signal)
            elif len(signal) != expected_length:
                raise ValueError("All leads must have same array length")

        return v
102
+
103
+
104
class ECGIntervals(BaseModel):
    """ECG timing intervals for arrhythmia detection"""
    # Every interval is optional; when supplied it must fall inside the
    # inclusive bounds declared on its Field.
    pr_ms: Optional[float] = Field(None, ge=0, le=400, description="PR interval in milliseconds")
    qrs_ms: Optional[float] = Field(None, ge=0, le=200, description="QRS duration in milliseconds")
    qt_ms: Optional[float] = Field(None, ge=200, le=600, description="QT interval in milliseconds")
    qtc_ms: Optional[float] = Field(None, ge=200, le=600, description="QTc interval in milliseconds")
    rr_ms: Optional[float] = Field(None, ge=300, le=2000, description="RR interval in milliseconds")

    @property
    def is_bradycardia(self) -> Optional[bool]:
        """Return True when RR exceeds 1000 ms (HR < 60 bpm); None without an RR value."""
        rr_interval = self.rr_ms
        return rr_interval > 1000 if rr_interval else None

    @property
    def is_tachycardia(self) -> Optional[bool]:
        """Return True when RR is under 600 ms (HR > 100 bpm); None without an RR value."""
        rr_interval = self.rr_ms
        return rr_interval < 600 if rr_interval else None
125
+
126
+
127
class ECGRhythmClassification(BaseModel):
    """ECG rhythm classification results"""
    # All fields are optional; arrhythmia_types defaults to a new empty list.
    primary_rhythm: Optional[str] = Field(None, description="Primary rhythm classification")
    # When supplied, must lie in [0.0, 1.0].
    rhythm_confidence: Optional[confloat(ge=0.0, le=1.0)] = None
    arrhythmia_types: List[str] = Field(default_factory=list, description="Detected arrhythmia types")
    # Integer rate; when supplied, must lie in the inclusive range 20-300.
    heart_rate_bpm: Optional[int] = Field(None, ge=20, le=300, description="Heart rate in beats per minute")
    heart_rate_regularity: Optional[Literal["regular", "irregular", "variable"]] = None
134
+
135
+
136
class ECGArrhythmiaProbabilities(BaseModel):
    """Probabilities for specific arrhythmia conditions"""
    # Each probability is optional and, when supplied, constrained to
    # [0.0, 1.0]. Nothing here requires the values to sum to 1.
    normal_rhythm: Optional[confloat(ge=0.0, le=1.0)] = Field(None, description="Normal sinus rhythm probability")
    atrial_fibrillation: Optional[confloat(ge=0.0, le=1.0)] = Field(None, description="Atrial fibrillation probability")
    atrial_flutter: Optional[confloat(ge=0.0, le=1.0)] = Field(None, description="Atrial flutter probability")
    ventricular_tachycardia: Optional[confloat(ge=0.0, le=1.0)] = Field(None, description="Ventricular tachycardia probability")
    heart_block: Optional[confloat(ge=0.0, le=1.0)] = Field(None, description="Heart block probability")
    premature_beats: Optional[confloat(ge=0.0, le=1.0)] = Field(None, description="Premature beat probability")
144
+
145
+
146
class ECGDerivedFeatures(BaseModel):
    """ECG-derived clinical features for downstream analysis"""
    # ST measurements are optional mappings keyed by string (per the
    # descriptions, the lead name) with float values.
    st_elevation_mm: Optional[Dict[str, float]] = Field(None, description="ST elevation by lead (mm)")
    st_depression_mm: Optional[Dict[str, float]] = Field(None, description="ST depression by lead (mm)")
    # Free-form string flags; both default to a new empty list.
    t_wave_abnormalities: List[str] = Field(default_factory=list, description="T-wave abnormality flags")
    q_wave_indicators: List[str] = Field(default_factory=list, description="Pathological Q-wave indicators")
    # Unstructured mapping; no schema is enforced on its values.
    voltage_criteria: Optional[Dict[str, Any]] = Field(None, description="Voltage criteria for hypertrophy")
    axis_deviation: Optional[Literal["normal", "left", "right", "extreme"]] = None
154
+
155
+
156
class ECGAnalysis(BaseModel):
    """Complete ECG analysis results with structured output"""
    # NOTE(review): `source_type` is handed to Field() as an extra keyword
    # argument. That does not appear to default or constrain
    # metadata.source_type to "ECG" -- confirm the intent.
    metadata: MedicalDocumentMetadata = Field(source_type="ECG")
    # The six nested models below are all required (no defaults).
    signal_data: ECGSignalData
    intervals: ECGIntervals
    rhythm_classification: ECGRhythmClassification
    arrhythmia_probabilities: ECGArrhythmiaProbabilities
    derived_features: ECGDerivedFeatures
    confidence: ConfidenceScore
    clinical_summary: Optional[str] = Field(None, description="Human-readable clinical summary")
    recommendations: List[str] = Field(default_factory=list, description="Clinical recommendations")

    class Config:
        # Illustrative example attached to the generated schema. It is not a
        # complete instance: required members such as
        # metadata.data_completeness, signal_data.signal_arrays,
        # rhythm_classification, arrhythmia_probabilities and derived_features
        # are absent, and "overall_confidence" is a computed property of
        # ConfidenceScore rather than a declared field.
        schema_extra = {
            "example": {
                "metadata": {
                    "document_id": "ecg-12345",
                    "source_type": "ECG",
                    "document_date": "2025-10-29T10:38:55Z",
                    "facility": "General Hospital",
                    "extraction_timestamp": "2025-10-29T10:38:55Z"
                },
                "signal_data": {
                    "lead_names": ["I", "II", "III", "aVR", "aVL", "aVF", "V1", "V2", "V3", "V4", "V5", "V6"],
                    "sampling_rate_hz": 500,
                    "duration_seconds": 10.0,
                    "num_samples": 5000
                },
                "intervals": {
                    "pr_ms": 160.0,
                    "qrs_ms": 88.0,
                    "qt_ms": 380.0,
                    "qtc_ms": 420.0
                },
                "confidence": {
                    "extraction_confidence": 0.92,
                    "model_confidence": 0.89,
                    "data_quality": 0.95,
                    "overall_confidence": 0.917
                }
            }
        }
198
+
199
+
200
+ # ================================
201
+ # RADIOLOGY SCHEMA
202
+ # ================================
203
+
204
class RadiologyImageReference(BaseModel):
    """Reference to radiology images with metadata"""
    # image_id, modality and body_part are required; the rest are optional.
    image_id: str = Field(description="Unique image identifier")
    modality: Literal["CT", "MRI", "XRAY", "ULTRASOUND", "MAMMOGRAPHY", "NUCLEAR"] = Field(
        description="Imaging modality"
    )
    body_part: str = Field(description="Anatomical region imaged")
    view_orientation: Optional[str] = Field(None, description="Image orientation/plane")
    # No lower bound is declared, so zero or negative values are not rejected.
    slice_thickness_mm: Optional[float] = Field(None, description="Slice thickness in mm")
    # String-keyed mapping of ints; the key names are not enforced here.
    resolution: Optional[Dict[str, int]] = Field(None, description="Image resolution (width, height)")
214
+
215
+
216
class RadiologySegmentation(BaseModel):
    """Medical image segmentation results"""
    organ_name: str = Field(description="Name of segmented organ/structure")
    # Volume and surface area are optional but must be >= 0 when supplied.
    volume_ml: Optional[float] = Field(None, ge=0, description="Volume in milliliters")
    surface_area_cm2: Optional[float] = Field(None, ge=0, description="Surface area in cm²")
    # Intensities carry no range constraint.
    mean_intensity: Optional[float] = Field(None, description="Mean pixel intensity")
    max_intensity: Optional[float] = Field(None, description="Maximum pixel intensity")
    # Unstructured per-lesion dictionaries; defaults to a new empty list.
    lesions: List[Dict[str, Any]] = Field(default_factory=list, description="Detected lesions")
224
+
225
+
226
class RadiologyFindings(BaseModel):
    """Structured radiology findings extraction"""
    # The two text sections are required; everything else is optional or
    # defaults to a new empty list.
    findings_text: str = Field(description="Raw findings text from report")
    impression_text: str = Field(description="Impression/conclusion section")
    critical_findings: List[str] = Field(default_factory=list, description="Urgent/critical findings")
    incidental_findings: List[str] = Field(default_factory=list, description="Incidental findings")
    comparison_prior: Optional[str] = Field(None, description="Comparison with prior studies")
    technique_description: Optional[str] = Field(None, description="Imaging technique details")
234
+
235
+
236
class RadiologyMetrics(BaseModel):
    """Quantitative metrics from imaging analysis"""
    # Every field has a default (empty container or None), so this model can
    # be built with no arguments.
    organ_volumes: Dict[str, float] = Field(default_factory=dict, description="Organ volumes in ml")
    # Each entry is a string-keyed mapping of floats; key names are not enforced.
    lesion_measurements: List[Dict[str, float]] = Field(
        default_factory=list,
        description="Lesion size measurements"
    )
    enhancement_patterns: List[str] = Field(default_factory=list, description="Contrast enhancement patterns")
    calcification_scores: Dict[str, float] = Field(default_factory=dict, description="Calcification severity scores")
    tissue_density: Optional[Dict[str, float]] = Field(None, description="Tissue density measurements")
246
+
247
+
248
class RadiologyAnalysis(BaseModel):
    """Complete radiology analysis results"""
    # NOTE(review): `source_type` is handed to Field() as an extra keyword
    # argument. That does not appear to default or constrain
    # metadata.source_type to "radiology" -- confirm the intent.
    metadata: MedicalDocumentMetadata = Field(source_type="radiology")
    # image_references, findings, metrics and confidence are required.
    image_references: List[RadiologyImageReference]
    findings: RadiologyFindings
    segmentations: List[RadiologySegmentation] = Field(default_factory=list)
    metrics: RadiologyMetrics
    confidence: ConfidenceScore
    criticality_level: Literal["routine", "urgent", "stat"] = Field(default="routine")
    follow_up_recommendations: List[str] = Field(default_factory=list)

    class Config:
        # Illustrative example attached to the generated schema. It is not a
        # complete instance: required members such as
        # metadata.data_completeness, image_references and metrics are absent.
        schema_extra = {
            "example": {
                "metadata": {
                    "document_id": "rad-67890",
                    "source_type": "radiology",
                    "document_date": "2025-10-29T10:38:55Z",
                    "facility": "Imaging Center"
                },
                "findings": {
                    "findings_text": "Chest CT shows bilateral pulmonary nodules...",
                    "impression_text": "Bilateral pulmonary nodules, likely benign",
                    "critical_findings": [],
                    "incidental_findings": ["Thyroid nodule", "Hepatic cyst"]
                },
                "confidence": {
                    "extraction_confidence": 0.88,
                    "model_confidence": 0.91,
                    "data_quality": 0.94
                }
            }
        }
281
+
282
+
283
+ # ================================
284
+ # LABORATORY SCHEMA
285
+ # ================================
286
+
287
class LabTestResult(BaseModel):
    """Individual laboratory test result"""
    # Only test_name is required. The value and both reference limits may be
    # numeric or textual.
    test_name: str = Field(description="Full name of the laboratory test")
    test_code: Optional[str] = Field(None, description="Standard test code (LOINC, etc.)")
    value: Optional[Union[float, str]] = Field(None, description="Test result value")
    unit: Optional[str] = Field(None, description="Units of measurement")
    reference_range_low: Optional[Union[float, str]] = Field(None, description="Lower reference limit")
    reference_range_high: Optional[Union[float, str]] = Field(None, description="Upper reference limit")
    flags: List[str] = Field(default_factory=list, description="Abnormal value flags (H, L, HH, LL)")
    test_date: Optional[datetime] = Field(None, description="Date/time test was performed")

    @property
    def is_abnormal(self) -> Optional[bool]:
        """Report whether a numeric value lies outside its reference range.

        Returns:
            True or False when the value is numeric and both limits are
            usable numbers; None when the value is missing or non-numeric,
            either limit is missing, or a textual limit cannot be parsed.
        """
        measured = self.value
        # None is not an int/float either, so this guard also covers a missing value.
        if not isinstance(measured, (int, float)):
            return None

        lower, upper = self.reference_range_low, self.reference_range_high
        if lower is None or upper is None:
            return None

        try:
            # Textual limits are parsed; numeric limits are used as they are.
            lower_bound = float(lower) if isinstance(lower, str) else lower
            upper_bound = float(upper) if isinstance(upper, str) else upper
            measured_value = float(measured)
            return measured_value < lower_bound or measured_value > upper_bound
        except (ValueError, TypeError):
            return None
318
+
319
+
320
class LaboratoryResults(BaseModel):
    """Complete laboratory results analysis"""
    # NOTE(review): `source_type` is handed to Field() as an extra keyword
    # argument. That does not appear to default or constrain
    # metadata.source_type to "laboratory" -- confirm the intent.
    metadata: MedicalDocumentMetadata = Field(source_type="laboratory")
    tests: List[LabTestResult] = Field(description="List of all test results")
    critical_values: List[str] = Field(default_factory=list, description="Critical values requiring immediate attention")
    panel_name: Optional[str] = Field(None, description="Name of test panel (CMP, CBC, etc.)")
    fasting_status: Optional[Literal["fasting", "non_fasting", "unknown"]] = None
    collection_date: Optional[datetime] = Field(None, description="Specimen collection date")
    confidence: ConfidenceScore
    # Plain stored counters defaulting to 0; nothing in this class derives
    # them from `tests`, so the producer has to populate them.
    abnormal_count: int = Field(default=0, description="Number of abnormal results")
    critical_count: int = Field(default=0, description="Number of critical results")

    class Config:
        # Illustrative example attached to the generated schema. It omits the
        # required metadata.data_completeness.
        schema_extra = {
            "example": {
                "metadata": {
                    "document_id": "lab-11111",
                    "source_type": "laboratory",
                    "document_date": "2025-10-29T10:38:55Z"
                },
                "tests": [
                    {
                        "test_name": "Glucose",
                        "test_code": "2345-7",
                        "value": 110.0,
                        "unit": "mg/dL",
                        "reference_range_low": 70.0,
                        "reference_range_high": 99.0,
                        "flags": ["H"]
                    }
                ],
                "confidence": {
                    "extraction_confidence": 0.95,
                    "model_confidence": 0.92,
                    "data_quality": 0.97
                }
            }
        }
358
+
359
+
360
+ # ================================
361
+ # CLINICAL NOTES SCHEMA
362
+ # ================================
363
+
364
class ClinicalSection(BaseModel):
    """Structured clinical note sections"""
    # All three fields are required. section_type must be one of the listed
    # literals.
    section_type: Literal["chief_complaint", "history_present_illness", "past_medical_history",
                          "medications", "allergies", "review_of_systems", "physical_exam",
                          "assessment", "plan", "discharge_summary"] = Field(
        description="Type of clinical section"
    )
    content: str = Field(description="Section content text")
    # A single score in [0.0, 1.0], not the composite ConfidenceScore model.
    confidence: confloat(ge=0.0, le=1.0) = Field(description="Confidence in section extraction")
373
+
374
+
375
class ClinicalEntity(BaseModel):
    """Medical entities extracted from clinical notes"""
    # entity_type, text and confidence are required; value, unit and context
    # are optional.
    entity_type: Literal["diagnosis", "medication", "procedure", "symptom", "anatomy", "date", "lab_value"] = Field(
        description="Type of medical entity"
    )
    text: str = Field(description="Entity text")
    value: Optional[Union[str, float]] = Field(None, description="Entity value if applicable")
    unit: Optional[str] = Field(None, description="Unit if applicable")
    # A single score in [0.0, 1.0], not the composite ConfidenceScore model.
    confidence: confloat(ge=0.0, le=1.0) = Field(description="Confidence in entity extraction")
    context: Optional[str] = Field(None, description="Surrounding context for entity")
385
+
386
+
387
class ClinicalNotesAnalysis(BaseModel):
    """Complete clinical notes analysis"""
    # NOTE(review): `source_type` is handed to Field() as an extra keyword
    # argument. That does not appear to default or constrain
    # metadata.source_type to "clinical_notes" -- confirm the intent.
    metadata: MedicalDocumentMetadata = Field(source_type="clinical_notes")
    sections: List[ClinicalSection] = Field(description="Extracted clinical sections")
    entities: List[ClinicalEntity] = Field(default_factory=list, description="Extracted medical entities")
    # Plain string lists, each defaulting to a new empty list.
    diagnoses: List[str] = Field(default_factory=list, description="Primary diagnoses")
    medications: List[str] = Field(default_factory=list, description="Current medications")
    procedures: List[str] = Field(default_factory=list, description="Recent procedures")
    confidence: ConfidenceScore
    note_type: Optional[Literal["progress_note", "consultation", "discharge_summary", "history_physical"]] = None

    class Config:
        # Illustrative example attached to the generated schema. It omits the
        # required metadata.data_completeness.
        schema_extra = {
            "example": {
                "metadata": {
                    "document_id": "note-22222",
                    "source_type": "clinical_notes",
                    "document_date": "2025-10-29T10:38:55Z"
                },
                "sections": [
                    {
                        "section_type": "chief_complaint",
                        "content": "Patient presents with chest pain",
                        "confidence": 0.98
                    }
                ],
                "entities": [
                    {
                        "entity_type": "symptom",
                        "text": "chest pain",
                        "confidence": 0.95
                    }
                ],
                "confidence": {
                    "extraction_confidence": 0.90,
                    "model_confidence": 0.87,
                    "data_quality": 0.93
                }
            }
        }
427
+
428
+
429
+ # ================================
430
+ # PIPELINE VALIDATION AND ROUTING
431
+ # ================================
432
+
433
class DocumentClassification(BaseModel):
    """Document type classification with confidence"""
    # Accepts the same literal set as MedicalDocumentMetadata.source_type.
    predicted_type: Literal["ECG", "radiology", "laboratory", "clinical_notes", "unknown"]
    # Required score in [0.0, 1.0].
    confidence: confloat(ge=0.0, le=1.0)
    # Each entry is a string-keyed mapping of floats; presumably type name to
    # score -- verify against the producer.
    alternative_types: List[Dict[str, float]] = Field(default_factory=list, description="Alternative classifications")
    # Required flag supplied by the caller; it is not derived from `confidence` here.
    requires_human_review: bool = Field(description="Whether human review is recommended")
439
+
440
+
441
class ValidationResult(BaseModel):
    """Validation result for schema compliance"""
    is_valid: bool
    # Both message lists default to a new empty list.
    validation_errors: List[str] = Field(default_factory=list)
    warnings: List[str] = Field(default_factory=list)
    # Required (no default) and constrained to [0.0, 1.0]; every construction
    # site has to pass it explicitly.
    compliance_score: confloat(ge=0.0, le=1.0) = Field(description="Overall compliance score")
447
+
448
+
449
def validate_document_schema(data: Dict[str, Any]) -> ValidationResult:
    """
    Validate document against appropriate schema based on document type

    The schema is selected from ``data["metadata"]["source_type"]`` and the
    document is validated by constructing that schema from ``data``.

    Args:
        data: Document data dictionary

    Returns:
        ValidationResult with validation status and any errors. A valid
        document gets ``compliance_score=1.0``; an unrecognized document type
        or any failure while validating gets ``compliance_score=0.0``.
    """
    try:
        doc_type = data.get("metadata", {}).get("source_type", "unknown")

        schema_by_type = {
            "ECG": ECGAnalysis,
            "radiology": RadiologyAnalysis,
            "laboratory": LaboratoryResults,
            "clinical_notes": ClinicalNotesAnalysis,
        }
        # Only string types are looked up, so an unhashable source_type falls
        # through to the "unknown" branch instead of raising inside dict.get.
        schema = schema_by_type.get(doc_type) if isinstance(doc_type, str) else None

        if schema is None:
            # compliance_score is a required field of ValidationResult. Leaving
            # it out made this constructor raise, and the handler below then
            # reported that error instead of the unknown-type message.
            return ValidationResult(
                is_valid=False,
                validation_errors=[f"Unknown document type: {doc_type}"],
                warnings=["Document type not recognized"],
                compliance_score=0.0
            )

        # Construction runs the schema's validation; the instance itself is
        # not needed.
        schema(**data)

        return ValidationResult(
            is_valid=True,
            compliance_score=1.0
        )

    except Exception as e:
        # Deliberate boundary: any failure (schema violation, malformed input)
        # is reported to the caller as an invalid result rather than raised.
        return ValidationResult(
            is_valid=False,
            validation_errors=[str(e)],
            compliance_score=0.0
        )
488
+
489
+
490
def _resolve_overall_confidence(confidence: Dict[str, Any]) -> float:
    """Return the explicit overall confidence, or derive it from the components."""
    explicit = confidence.get("overall_confidence")
    if explicit is not None:
        return explicit
    # Same weighting as ConfidenceScore.overall_confidence. A missing
    # component counts as 0, which keeps the result conservative.
    return (0.5 * (confidence.get("extraction_confidence") or 0) +
            0.3 * (confidence.get("model_confidence") or 0) +
            0.2 * (confidence.get("data_quality") or 0))


def route_to_specialized_model(document_data: Dict[str, Any]) -> str:
    """
    Route document to appropriate specialized model based on validated schema

    For ECG documents the choice depends on the overall confidence. An explicit
    ``confidence["overall_confidence"]`` is used when present; otherwise it is
    derived from the three component scores, because ``overall_confidence`` is
    a computed property of ConfidenceScore and is not stored as a field.

    Args:
        document_data: Validated document data

    Returns:
        Model name for specialized processing
    """
    # `or {}` tolerates an explicit None for either section.
    metadata = document_data.get("metadata") or {}
    doc_type = metadata.get("source_type", "unknown")
    confidence = document_data.get("confidence") or {}

    # Route based on document type and confidence
    if doc_type == "ECG":
        if _resolve_overall_confidence(confidence) >= 0.85:
            return "hubert-ecg"  # HuBERT-ECG for high-confidence ECG
        return "bio-clinicalbert"  # Fallback for lower confidence
    elif doc_type == "radiology":
        return "monai-unetr"  # MONAI UNETR for radiology segmentation
    elif doc_type == "laboratory":
        return "biomedical-ner"  # Biomedical NER for lab value extraction
    elif doc_type == "clinical_notes":
        return "medgemma"  # MedGemma for clinical text generation
    else:
        return "scibert"  # Default fallback model
517
+
518
+
519
+ # ================================
520
+ # EXPORT SCHEMAS FOR PIPELINE
521
+ # ================================
522
+
523
# Names exported by a star-import of this module. Only the shared base models,
# the four top-level analysis models, the classification/validation models and
# the two pipeline functions are listed; component models such as
# ECGSignalData or LabTestResult are not, so they need an explicit import.
__all__ = [
    "ConfidenceScore",
    "MedicalDocumentMetadata",
    "ECGAnalysis",
    "RadiologyAnalysis",
    "LaboratoryResults",
    "ClinicalNotesAnalysis",
    "DocumentClassification",
    "ValidationResult",
    "validate_document_schema",
    "route_to_specialized_model"
]