snikhilesh committed
Commit a4fbae6 · verified · 1 Parent(s): e942eb2

Deploy pdf_extractor.py to backend/ directory

Files changed (1)
  1. backend/pdf_extractor.py +670 -0
backend/pdf_extractor.py ADDED
@@ -0,0 +1,670 @@
"""
PDF Medical Extractor - Phase 2
Structured PDF extraction using Donut/LayoutLMv3 for medical documents.

This module provides specialized extraction for medical PDFs including
radiology reports, laboratory results, clinical notes, and ECG reports.

Author: MiniMax Agent
Date: 2025-10-29
Version: 1.0.0
"""

import os
import json
import io
import logging
from typing import Dict, List, Optional, Any, Tuple
from dataclasses import dataclass
from pathlib import Path
import numpy as np
from PIL import Image
import fitz  # PyMuPDF
import pytesseract
from transformers import DonutProcessor, VisionEncoderDecoderModel
import torch
from tqdm import tqdm

from medical_schemas import (
    MedicalDocumentMetadata, ConfidenceScore, RadiologyAnalysis,
    LaboratoryResults, ClinicalNotesAnalysis, ValidationResult,
    validate_document_schema
)

logger = logging.getLogger(__name__)


@dataclass
class ExtractionResult:
    """Result of PDF extraction with confidence scoring"""
    raw_text: str
    structured_data: Dict[str, Any]
    confidence_scores: Dict[str, float]
    extraction_method: str  # "donut", "ocr", "hybrid"
    processing_time: float
    tables_extracted: List[Dict[str, Any]]
    images_extracted: List[str]
    metadata: Dict[str, Any]


class DonutMedicalExtractor:
    """Medical PDF extraction using Donut model for structured output"""

    def __init__(self, model_name: str = "naver-clova-ix/donut-base-finetuned-rvlcdip"):
        self.model_name = model_name
        self.processor = None
        self.model = None
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self._load_model()

    def _load_model(self):
        """Load Donut model and processor"""
        try:
            logger.info(f"Loading Donut model: {self.model_name}")
            self.processor = DonutProcessor.from_pretrained(self.model_name)
            self.model = VisionEncoderDecoderModel.from_pretrained(self.model_name)
            self.model.to(self.device)
            self.model.eval()
            logger.info("Donut model loaded successfully")
        except Exception as e:
            logger.error(f"Failed to load Donut model: {str(e)}")
            raise

    def extract_from_image(self, image: Image.Image, task_prompt: Optional[str] = None) -> Dict[str, Any]:
        """Extract structured data from image using Donut"""
        if task_prompt is None:
            task_prompt = "<s_rvlcdip>"

        try:
            # Prepare image for Donut
            pixel_values = self.processor(images=image, return_tensors="pt").pixel_values
            pixel_values = pixel_values.to(self.device)

            # Tokenize the task prompt; it is fed to generate() as decoder input ids
            task_prompt_ids = self.processor.tokenizer(task_prompt, add_special_tokens=False,
                                                       return_tensors="pt").input_ids
            task_prompt_ids = task_prompt_ids.to(self.device)

            # Generate structured output
            with torch.no_grad():
                outputs = self.model.generate(
                    pixel_values,
                    decoder_input_ids=task_prompt_ids,
                    max_length=512,
                    early_stopping=False,
                    pad_token_id=self.processor.tokenizer.pad_token_id,
                    eos_token_id=self.processor.tokenizer.eos_token_id,
                    use_cache=True,
                )

            # Decode output
            output_sequence = outputs.cpu().numpy()[0]
            decoded_output = self.processor.tokenizer.decode(output_sequence, skip_special_tokens=True)

            # Parse JSON from decoded output (rfind returns -1 when missing, so the
            # +1 offset makes json_end 0; require json_end > json_start instead)
            json_start = decoded_output.find('{')
            json_end = decoded_output.rfind('}') + 1

            if json_start != -1 and json_end > json_start:
                json_str = decoded_output[json_start:json_end]
                structured_data = json.loads(json_str)
            else:
                structured_data = {"raw_text": decoded_output}

            return structured_data

        except Exception as e:
            logger.error(f"Donut extraction error: {str(e)}")
            return {"raw_text": "", "error": str(e)}


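# Usage sketch for the extractor above, kept commented out so importing the module
# stays side-effect free. It assumes a local page image "sample_page.png"; with the
# default RVL-CDIP checkpoint the decoded output is typically a document-class token
# rather than rich medical fields, so treat the parsed dictionary as a starting point.
#
#   from PIL import Image
#   extractor = DonutMedicalExtractor()
#   page = Image.open("sample_page.png").convert("RGB")
#   result = extractor.extract_from_image(page, task_prompt="<s_rvlcdip>")
#   print(result)
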
class MedicalPDFProcessor:
    """Medical PDF processing with multiple extraction methods"""

    def __init__(self):
        self.donut_extractor = None
        self.ocr_enabled = True

        # Initialize Donut extractor
        try:
            self.donut_extractor = DonutMedicalExtractor()
        except Exception as e:
            logger.warning(f"Donut extractor not available: {str(e)}")
            self.donut_extractor = None

    def process_pdf(self, pdf_path: str, document_type: str = "unknown") -> ExtractionResult:
        """
        Process medical PDF with multiple extraction methods

        Args:
            pdf_path: Path to PDF file
            document_type: Type of medical document

        Returns:
            ExtractionResult with structured data
        """
        import time
        start_time = time.time()

        try:
            # Open PDF and extract basic info
            doc = fitz.open(pdf_path)
            page_count = len(doc)
            metadata = {
                "page_count": page_count,
                "pdf_metadata": doc.metadata,
                "file_size": os.path.getsize(pdf_path)
            }

            # Extract text using multiple methods
            raw_text = ""
            tables = []
            images = []

            for page_num in range(page_count):
                page = doc.load_page(page_num)

                # Extract text
                page_text = page.get_text()
                raw_text += f"\n--- Page {page_num + 1} ---\n{page_text}"

                # Extract tables using different methods
                page_tables = self._extract_tables(page)
                tables.extend(page_tables)

                # Extract images
                page_images = self._extract_images(page, pdf_path, page_num)
                images.extend(page_images)

            doc.close()

            # Determine extraction method based on content
            extraction_method = self._determine_extraction_method(raw_text, document_type)

            # Extract structured data based on document type
            if extraction_method == "donut" and self.donut_extractor:
                structured_data = self._extract_with_donut(pdf_path, document_type)
            else:
                structured_data = self._extract_with_fallback(raw_text, document_type)

            # Calculate confidence scores
            confidence_scores = self._calculate_extraction_confidence(
                raw_text, structured_data, tables, images
            )

            processing_time = time.time() - start_time

            return ExtractionResult(
                raw_text=raw_text,
                structured_data=structured_data,
                confidence_scores=confidence_scores,
                extraction_method=extraction_method,
                processing_time=processing_time,
                tables_extracted=tables,
                images_extracted=images,
                metadata=metadata
            )

        except Exception as e:
            logger.error(f"PDF processing error: {str(e)}")
            return ExtractionResult(
                raw_text="",
                structured_data={"error": str(e)},
                confidence_scores={"overall": 0.0},
                extraction_method="error",
                processing_time=time.time() - start_time,
                tables_extracted=[],
                images_extracted=[],
                metadata={"error": str(e)}
            )

    def _determine_extraction_method(self, text: str, document_type: str) -> str:
        """Determine best extraction method based on content and type"""
        # High confidence cases for Donut
        if document_type in ["radiology", "ecg_report"] and len(text) > 500:
            return "donut"

        # Check for structured content indicators
        structured_indicators = [
            "findings:", "impression:", "technique:", "results:",
            "normal ranges:", "reference values:", "patient information:"
        ]

        indicator_count = sum(1 for indicator in structured_indicators if indicator.lower() in text.lower())

        if indicator_count >= 3 and len(text) > 1000:
            return "donut"

        # Fallback to text-based extraction
        return "fallback"

    def _extract_with_donut(self, pdf_path: str, document_type: str) -> Dict[str, Any]:
        """Extract structured data using Donut model"""
        if not self.donut_extractor:
            return self._extract_with_fallback("", document_type)

        try:
            # Convert PDF to images (first page for now, can be extended)
            images = self._pdf_to_images(pdf_path)

            if not images:
                return self._extract_with_fallback("", document_type)

            # Define task prompt based on document type
            task_prompts = {
                "radiology": "<s_radiology_report>",
                "laboratory": "<s_laboratory_report>",
                "clinical_notes": "<s_clinical_note>",
                "ecg_report": "<s_ecg_report>",
                "unknown": "<s_medical_document>"
            }

            task_prompt = task_prompts.get(document_type, "<s_medical_document>")

            # Extract using Donut
            structured_data = self.donut_extractor.extract_from_image(images[0], task_prompt)

            # Post-process based on document type
            if document_type == "radiology":
                structured_data = self._postprocess_radiology(structured_data)
            elif document_type == "laboratory":
                structured_data = self._postprocess_laboratory(structured_data)
            elif document_type == "clinical_notes":
                structured_data = self._postprocess_clinical_notes(structured_data)
            elif document_type == "ecg_report":
                structured_data = self._postprocess_ecg(structured_data)

            return structured_data

        except Exception as e:
            logger.error(f"Donut extraction error: {str(e)}")
            return self._extract_with_fallback("", document_type)

    def _extract_with_fallback(self, text: str, document_type: str) -> Dict[str, Any]:
        """Fallback extraction using text processing and OCR if needed"""
        try:
            # Basic text cleaning
            cleaned_text = text.strip()

            # Document-type specific extraction
            if document_type == "radiology":
                return self._extract_radiology_from_text(cleaned_text)
            elif document_type == "laboratory":
                return self._extract_laboratory_from_text(cleaned_text)
            elif document_type == "clinical_notes":
                return self._extract_clinical_notes_from_text(cleaned_text)
            elif document_type == "ecg_report":
                return self._extract_ecg_from_text(cleaned_text)
            else:
                return {
                    "raw_text": cleaned_text,
                    "document_type": document_type,
                    "extraction_method": "fallback_text"
                }

        except Exception as e:
            logger.error(f"Fallback extraction error: {str(e)}")
            return {"raw_text": text, "error": str(e), "extraction_method": "fallback"}

    def _extract_radiology_from_text(self, text: str) -> Dict[str, Any]:
        """Extract radiology information from text"""
        lines = text.split('\n')
        findings = []
        impression = []
        technique = []

        current_section = None

        for line in lines:
            line = line.strip()
            if not line:
                continue

            line_lower = line.lower()

            if any(keyword in line_lower for keyword in ["findings:", "findings"]):
                current_section = "findings"
                continue
            elif any(keyword in line_lower for keyword in ["impression:", "impression", "conclusion:"]):
                current_section = "impression"
                continue
            elif any(keyword in line_lower for keyword in ["technique:", "protocol:"]):
                current_section = "technique"
                continue

            if current_section == "findings":
                findings.append(line)
            elif current_section == "impression":
                impression.append(line)
            elif current_section == "technique":
                technique.append(line)

        return {
            "findings": " ".join(findings),
            "impression": " ".join(impression),
            "technique": " ".join(technique),
            "document_type": "radiology",
            "extraction_method": "text_pattern_matching"
        }

    def _extract_laboratory_from_text(self, text: str) -> Dict[str, Any]:
        """Extract laboratory results from text"""
        lines = text.split('\n')
        tests = []

        for line in lines:
            line = line.strip()
            if not line:
                continue

            # Look for test patterns
            # Pattern: Test Name  Value  Units  Reference Range  Flag
            parts = line.split()
            if len(parts) >= 3:
                # Try to identify test components
                test_data = {
                    "raw_line": line,
                    "potential_test": parts[0] if len(parts) > 0 else "",
                    "potential_value": parts[1] if len(parts) > 1 else "",
                    "potential_unit": parts[2] if len(parts) > 2 else "",
                }
                tests.append(test_data)

        return {
            "tests": tests,
            "document_type": "laboratory",
            "extraction_method": "text_pattern_matching"
        }

    def _extract_clinical_notes_from_text(self, text: str) -> Dict[str, Any]:
        """Extract clinical notes sections from text"""
        lines = text.split('\n')
        sections = {}
        current_section = "general"

        for line in lines:
            line = line.strip()
            if not line:
                continue

            line_lower = line.lower()

            # Identify section headers
            if any(keyword in line_lower for keyword in ["chief complaint:", "chief complaint", "cc:"]):
                current_section = "chief_complaint"
                continue
            elif any(keyword in line_lower for keyword in ["history of present illness:", "hpi:", "history:"]):
                current_section = "history_present_illness"
                continue
            elif any(keyword in line_lower for keyword in ["assessment:", "diagnosis:", "impression:"]):
                current_section = "assessment"
                continue
            elif any(keyword in line_lower for keyword in ["plan:", "treatment:", "recommendations:"]):
                current_section = "plan"
                continue

            # Add line to current section
            if current_section not in sections:
                sections[current_section] = []
            sections[current_section].append(line)

        # Convert lists to text
        for section in sections:
            sections[section] = " ".join(sections[section])

        return {
            "sections": sections,
            "document_type": "clinical_notes",
            "extraction_method": "text_pattern_matching"
        }

    def _extract_ecg_from_text(self, text: str) -> Dict[str, Any]:
        """Extract ECG information from text"""
        import re

        lines = text.split('\n')
        ecg_data = {}

        for line in lines:
            line = line.strip().lower()

            # Extract ECG measurements
            if "heart rate" in line or "hr" in line:
                hr_match = re.search(r'(\d+)', line)
                if hr_match:
                    ecg_data["heart_rate"] = int(hr_match.group(1))

            if "rhythm" in line:
                ecg_data["rhythm"] = line

            if any(interval in line for interval in ["pr interval", "qrs", "qt"]):
                ecg_data[line.split(':')[0]] = line

        return {
            "ecg_data": ecg_data,
            "document_type": "ecg_report",
            "extraction_method": "text_pattern_matching"
        }

    def _postprocess_radiology(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Post-process radiology extraction results"""
        # Ensure required fields exist
        if "findings" not in data:
            data["findings"] = ""
        if "impression" not in data:
            data["impression"] = ""

        data["document_type"] = "radiology"
        return data

    def _postprocess_laboratory(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Post-process laboratory extraction results"""
        # Ensure tests array exists
        if "tests" not in data:
            data["tests"] = []

        data["document_type"] = "laboratory"
        return data

    def _postprocess_clinical_notes(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Post-process clinical notes extraction results"""
        # Ensure sections exist
        if "sections" not in data:
            data["sections"] = {}

        data["document_type"] = "clinical_notes"
        return data

    def _postprocess_ecg(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Post-process ECG extraction results"""
        # Ensure ecg_data exists
        if "ecg_data" not in data:
            data["ecg_data"] = {}

        data["document_type"] = "ecg_report"
        return data

    def _pdf_to_images(self, pdf_path: str) -> List[Image.Image]:
        """Convert PDF pages to images for Donut processing"""
        images = []
        try:
            doc = fitz.open(pdf_path)
            for page_num in range(min(3, len(doc))):  # Process first 3 pages
                page = doc.load_page(page_num)
                mat = fitz.Matrix(2.0, 2.0)  # 2x zoom for better OCR
                pix = page.get_pixmap(matrix=mat)
                img_data = pix.tobytes("png")
                image = Image.open(io.BytesIO(img_data))
                images.append(image)
            doc.close()
        except Exception as e:
            logger.error(f"PDF to image conversion error: {str(e)}")

        return images

    def _extract_tables(self, page) -> List[Dict[str, Any]]:
        """Extract tables from PDF page"""
        tables = []
        try:
            # Use PyMuPDF table extraction if available
            tables_data = page.find_tables()
            for table in tables_data:
                table_dict = table.extract()
                tables.append({
                    "rows": len(table_dict),
                    "columns": len(table_dict[0]) if table_dict else 0,
                    "data": table_dict
                })
        except Exception as e:
            logger.debug(f"Table extraction failed: {str(e)}")

        return tables

    def _extract_images(self, page, pdf_path: str, page_num: int) -> List[str]:
        """Extract images from PDF page"""
        images = []
        try:
            image_list = page.get_images()
            for img_index, img in enumerate(image_list):
                xref = img[0]
                pix = fitz.Pixmap(page.parent, xref)
                if pix.n - pix.alpha < 4:  # GRAY or RGB
                    img_path = f"{Path(pdf_path).stem}_page{page_num+1}_img{img_index+1}.png"
                    pix.save(img_path)
                    images.append(img_path)
                pix = None
        except Exception as e:
            logger.debug(f"Image extraction failed: {str(e)}")

        return images

    def _calculate_extraction_confidence(self, raw_text: str, structured_data: Dict[str, Any],
                                         tables: List[Dict], images: List[str]) -> Dict[str, float]:
        """Calculate confidence scores for extraction quality"""
        confidence_scores = {}

        # Text extraction confidence
        text_length = len(raw_text.strip())
        confidence_scores["text_extraction"] = min(1.0, text_length / 1000) if text_length > 0 else 0.0

        # Structured data completeness
        required_fields = 0
        present_fields = 0

        if "findings" in structured_data or "impression" in structured_data:
            required_fields += 1
            if structured_data.get("findings") or structured_data.get("impression"):
                present_fields += 1

        if "tests" in structured_data:
            required_fields += 1
            if structured_data.get("tests"):
                present_fields += 1

        if "sections" in structured_data:
            required_fields += 1
            if structured_data.get("sections"):
                present_fields += 1

        confidence_scores["structural_completeness"] = present_fields / max(required_fields, 1)

        # Table extraction confidence
        confidence_scores["table_extraction"] = min(1.0, len(tables) * 0.3)

        # Image extraction confidence
        confidence_scores["image_extraction"] = min(1.0, len(images) * 0.2)

        # Overall confidence (weighted average)
        overall = (
            0.4 * confidence_scores["text_extraction"] +
            0.4 * confidence_scores["structural_completeness"] +
            0.1 * confidence_scores["table_extraction"] +
            0.1 * confidence_scores["image_extraction"]
        )
        confidence_scores["overall"] = overall

        return confidence_scores

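    # Worked example of the weighting above (input values assumed for illustration):
    # a 2,500-character report with a populated "findings" field, one extracted table,
    # and no extracted images scores text_extraction = 1.0, structural_completeness = 1.0,
    # table_extraction = 0.3, image_extraction = 0.0, so
    # overall = 0.4*1.0 + 0.4*1.0 + 0.1*0.3 + 0.1*0.0 = 0.83.
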
    def convert_to_schema_format(self, extraction_result: ExtractionResult,
                                 document_type: str) -> Optional[Dict[str, Any]]:
        """Convert extraction result to canonical schema format"""
        try:
            # Create metadata
            metadata = MedicalDocumentMetadata(
                source_type=document_type,
                data_completeness=extraction_result.confidence_scores.get("overall", 0.0)
            )

            # Create confidence score
            confidence = ConfidenceScore(
                extraction_confidence=extraction_result.confidence_scores.get("overall", 0.0),
                model_confidence=0.8,  # Default assumption
                data_quality=extraction_result.confidence_scores.get("text_extraction", 0.0)
            )

            # Convert based on document type
            if document_type == "radiology":
                return self._convert_to_radiology_schema(extraction_result, metadata, confidence)
            elif document_type == "laboratory":
                return self._convert_to_laboratory_schema(extraction_result, metadata, confidence)
            elif document_type == "clinical_notes":
                return self._convert_to_clinical_notes_schema(extraction_result, metadata, confidence)
            else:
                return None

        except Exception as e:
            logger.error(f"Schema conversion error: {str(e)}")
            return None

    def _convert_to_radiology_schema(self, result: ExtractionResult, metadata: MedicalDocumentMetadata,
                                     confidence: ConfidenceScore) -> Dict[str, Any]:
        """Convert to radiology schema format"""
        data = result.structured_data

        return {
            "metadata": metadata.dict(),
            "image_references": [],
            "findings": {
                "findings_text": data.get("findings", ""),
                "impression_text": data.get("impression", ""),
                "technique_description": data.get("technique", "")
            },
            "segmentations": [],
            "metrics": {},
            "confidence": confidence.dict(),
            "criticality_level": "routine",
            "follow_up_recommendations": []
        }

    def _convert_to_laboratory_schema(self, result: ExtractionResult, metadata: MedicalDocumentMetadata,
                                      confidence: ConfidenceScore) -> Dict[str, Any]:
        """Convert to laboratory schema format"""
        data = result.structured_data

        return {
            "metadata": metadata.dict(),
            "tests": data.get("tests", []),
            "confidence": confidence.dict(),
            "critical_values": [],
            "abnormal_count": 0,
            "critical_count": 0
        }

    def _convert_to_clinical_notes_schema(self, result: ExtractionResult, metadata: MedicalDocumentMetadata,
                                          confidence: ConfidenceScore) -> Dict[str, Any]:
        """Convert to clinical notes schema format"""
        data = result.structured_data
        sections = data.get("sections", {})

        return {
            "metadata": metadata.dict(),
            "sections": [{"section_type": k, "content": v, "confidence": 0.8} for k, v in sections.items()],
            "entities": [],
            "confidence": confidence.dict()
        }


# Export main classes
__all__ = [
    "MedicalPDFProcessor",
    "DonutMedicalExtractor",
    "ExtractionResult"
]
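
Usage sketch (assumptions: the module is importable from the backend/ directory, "report.pdf" is a local radiology PDF, medical_schemas is on the path, and the Donut checkpoint downloads on first run):

    from pdf_extractor import MedicalPDFProcessor

    processor = MedicalPDFProcessor()
    result = processor.process_pdf("report.pdf", document_type="radiology")
    print(result.extraction_method, result.confidence_scores.get("overall"))

    schema_doc = processor.convert_to_schema_format(result, "radiology")
    if schema_doc is not None:
        print(schema_doc["findings"]["impression_text"])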