MakPr016 committed on
Commit
e158d2f
·
0 Parent(s):

Initial phase

Browse files
.gitignore ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __pycache__/
2
+ *.py[cod]
3
+ *$py.class
4
+ *.so
5
+ .Python
6
+ build/
7
+ develop-eggs/
8
+ dist/
9
+ downloads/
10
+ eggs/
11
+ .eggs/
12
+ lib/
13
+ lib64/
14
+ parts/
15
+ sdist/
16
+ var/
17
+ wheels/
18
+ share/python-wheels/
19
+ *.egg-info/
20
+ .installed.cfg
21
+ *.egg
22
+ MANIFEST
23
+
24
+ .pytest_cache/
25
+ .coverage
26
+ .coverage.*
27
+ htmlcov/
28
+ .tox/
29
+ .nox/
30
+ .hypothesis/
31
+ pytestdebug.log
32
+
33
+ *.log
34
+ *.pot
35
+ *.pyc
36
+
37
+ .env
38
+ .venv
39
+ env/
40
+ venv/
41
+ ENV/
42
+ env.bak/
43
+ venv.bak/
44
+
45
+ .spyderproject
46
+ .spyproject
47
+ .ropeproject
48
+
49
+ instance/
50
+ .webassets-cache
51
+
52
+ .mypy_cache/
53
+ .dmypy.json
54
+ dmypy.json
55
+ .pyre/
56
+ .pytype/
57
+ cython_debug/
58
+
59
+ .vscode/
60
+ .idea/
61
+ *.swp
62
+ *.swo
63
+ *~
64
+ .DS_Store
65
+
66
+ models/
67
+ *.pkl
68
+ *.pth
69
+ *.pt
70
+ *.bin
71
+ *.h5
72
+ *.onnx
73
+ *.pb
74
+ *.caffemodel
75
+ *.weights
76
+
77
+ data/
78
+ datasets/
79
+ *.csv
80
+ *.json
81
+ *.jsonl
82
+ *.tsv
83
+
84
+ *.pdf
85
+ *.jpg
86
+ *.jpeg
87
+ *.png
88
+ *.gif
89
+ *.bmp
90
+ *.tiff
91
+ *.svg
92
+ *.ico
93
+
94
+ test_files/
95
+ uploads/
96
+ temp/
97
+ tmp/
98
+ cache/
99
+
100
+ .ipynb_checkpoints/
101
+ *.ipynb
102
+
103
+ node_modules/
104
+ package-lock.json
105
+ yarn.lock
106
+
107
+ flagged/
108
+ .env
Dockerfile ADDED
File without changes
app/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ """
2
+ Lab Report NER API
3
+ Extracts structured entities from medical reports using spaCy NER + EasyOCR + ClinicalDistilBERT
4
+ """
5
+
6
+ __version__ = "1.0.0"
app/crypto_utils.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Encryption utilities using NaCl (libsodium)
3
+ """
4
+ import base64
5
+ import gzip
6
+ import json
7
+ from nacl.secret import SecretBox
8
+ from nacl.utils import random
9
+
10
class CryptoManager:
    """Symmetric encrypt/decrypt helper built around a NaCl ``SecretBox``.

    The key is supplied as a string (normally read from ``.env``) and must
    resolve to exactly 32 bytes.
    """

    def __init__(self, secret_key_hex: str):
        """
        Build the ``SecretBox`` from a key string.

        Args:
            secret_key_hex: 64 hexadecimal characters, decoded to 32 bytes.
                A 32-character string is still accepted for backward
                compatibility; it is used as raw UTF-8 bytes after a
                warning is printed.

        Raises:
            ValueError: if the key is empty, has an unsupported length,
                contains non-hex characters, or does not yield 32 bytes.
        """
        if not secret_key_hex:
            raise ValueError("Secret key is required")

        key_length = len(secret_key_hex)
        if key_length == 64:
            # 64 hex chars = 32 bytes (the expected form)
            try:
                self.secret_key = bytes.fromhex(secret_key_hex)
            except ValueError as exc:
                raise ValueError(
                    "Secret key must be 64 hex characters (found non-hex characters)"
                ) from exc
        elif key_length == 32:
            print("⚠️ WARNING: Key is only 32 characters (16 bytes)")
            print(" Should be 64 hex characters for 32 bytes")
            # The characters are used verbatim as key bytes. 32 ASCII
            # characters encode to 32 bytes and pass the length check below;
            # non-ASCII characters encode to more bytes and are rejected.
            self.secret_key = secret_key_hex.encode('utf-8')
        else:
            raise ValueError(f"Secret key must be 64 hex characters (got {key_length})")

        if len(self.secret_key) != 32:
            raise ValueError(f"Secret key must be 32 bytes (got {len(self.secret_key)} bytes)")

        self.box = SecretBox(self.secret_key)
        print(f"✓ CryptoManager initialized (key: {len(self.secret_key)} bytes)")

    def encrypt(self, plaintext: bytes, nonce: "bytes | None" = None) -> bytes:
        """Encrypt plaintext bytes, generating a random nonce when none is given."""
        if nonce is None:
            nonce = random(SecretBox.NONCE_SIZE)
        return self.box.encrypt(plaintext, nonce)

    def decrypt(self, ciphertext: str, nonce: str) -> bytes:
        """
        Decrypt base64-encoded ciphertext with a base64-encoded nonce.

        Raises:
            ValueError: on any base64 or decryption failure; the original
                exception is chained as ``__cause__``.
        """
        try:
            ciphertext_bytes = base64.b64decode(ciphertext)
            nonce_bytes = base64.b64decode(nonce)
            return self.box.decrypt(ciphertext_bytes, nonce_bytes)
        except Exception as e:
            raise ValueError(f"Decryption failed. {e}") from e

    def encrypt_json(self, data: dict) -> dict:
        """
        Encrypt JSON data with compression.

        Pipeline: JSON -> gzip -> base64 text -> SecretBox encryption.
        Returns a dict with base64-encoded ``ciphertext`` and ``nonce``.
        """
        # Serialize and compress
        json_data = json.dumps(data).encode('utf-8')
        compressed = gzip.compress(json_data, compresslevel=6)
        # The compressed payload is base64-encoded *before* encryption;
        # decrypt_json reverses exactly this layering, so it must stay in sync.
        compressed_b64 = base64.b64encode(compressed).decode('utf-8')

        # Encrypt with a fresh random nonce
        nonce = random(SecretBox.NONCE_SIZE)
        ciphertext = self.box.encrypt(compressed_b64.encode('utf-8'), nonce)

        return {
            # .ciphertext is used so the nonce travels in its own field
            "ciphertext": base64.b64encode(ciphertext.ciphertext).decode('utf-8'),
            "nonce": base64.b64encode(nonce).decode('utf-8')
        }

    def decrypt_json(self, ciphertext: str, nonce: str) -> dict:
        """
        Decrypt and decompress JSON data produced by ``encrypt_json``.

        Raises:
            ValueError: if decryption, base64 decoding, decompression or
                JSON parsing fails; the original exception is chained.
        """
        try:
            # Decrypt
            decrypted = self.decrypt(ciphertext, nonce)

            # Undo the base64 + gzip layers
            compressed_b64 = decrypted.decode('utf-8')
            compressed_bytes = base64.b64decode(compressed_b64)
            decompressed = gzip.decompress(compressed_bytes)

            # Parse JSON
            return json.loads(decompressed.decode('utf-8'))
        except Exception as e:
            raise ValueError(f"Decryption/decompression failed. {e}") from e
app/image_extractor.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Extract embedded images from PDF files
3
+ """
4
+
5
+ import fitz # PyMuPDF
6
+ import base64
7
+ from PIL import Image
8
+ import io
9
+ from typing import List, Dict
10
+
11
def extract_images_from_pdf(pdf_bytes: bytes) -> List[Dict]:
    """
    Extract all embedded images from a PDF.

    Args:
        pdf_bytes: raw PDF content. Empty or ``None`` input yields ``[]``.

    Returns:
        One dict per successfully extracted image with the keys ``page``
        (1-based), ``format``, ``width``, ``height`` and ``data`` (a
        ``data:image/...;base64,`` URI). Images that fail to extract are
        skipped; any document-level failure yields ``[]``.
    """
    if not pdf_bytes:
        return []

    try:
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    except Exception as e:
        print(f"✗ Image extraction error: {e}")
        return []

    images = []
    try:
        for page_number, page in enumerate(doc, start=1):
            for img_index, img in enumerate(page.get_images(full=True)):
                try:
                    xref = img[0]
                    base_image = doc.extract_image(xref)

                    image_bytes = base_image["image"]
                    image_ext = base_image["ext"]

                    # Open only to read the dimensions; the context manager
                    # releases the underlying buffer right away.
                    with Image.open(io.BytesIO(image_bytes)) as pil_image:
                        width, height = pil_image.width, pil_image.height

                    image_b64 = base64.b64encode(image_bytes).decode('utf-8')

                    images.append({
                        "page": page_number,
                        "format": image_ext,
                        "width": width,
                        "height": height,
                        "data": f"data:image/{image_ext};base64,{image_b64}"
                    })

                except Exception as e:
                    # Best effort: one broken image must not abort the rest.
                    print(f"⚠ Failed to extract image {img_index} from page {page_number}: {e}")
                    continue

        print(f"✓ Extracted {len(images)} images from PDF")
        return images

    except Exception as e:
        print(f"✗ Image extraction error: {e}")
        return []
    finally:
        # Always release the document, including on the error path.
        doc.close()
60
+
61
def create_thumbnail(image_bytes: bytes, size: tuple = (200, 200)) -> str:
    """
    Create a JPEG thumbnail of an image and return it as a base64 data URI.

    Args:
        image_bytes: encoded source image.
        size: maximum (width, height) bounding box passed to
            ``Image.thumbnail``.

    Returns:
        A ``data:image/jpeg;base64,...`` string, or ``""`` if the image
        cannot be decoded or encoded.
    """
    try:
        with Image.open(io.BytesIO(image_bytes)) as image:
            image.thumbnail(size, Image.Resampling.LANCZOS)

            # JPEG cannot store alpha or palette modes (RGBA, LA, P, ...);
            # without this conversion such images always failed to save.
            if image.mode not in ("RGB", "L"):
                image = image.convert("RGB")

            buffered = io.BytesIO()
            image.save(buffered, format="JPEG", quality=85)

        img_str = base64.b64encode(buffered.getvalue()).decode('utf-8')
        return f"data:image/jpeg;base64,{img_str}"

    except Exception as e:
        print(f"✗ Thumbnail creation failed: {e}")
        return ""
app/lab_processor.py ADDED
@@ -0,0 +1,501 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Lab Report Processing with Smart NER + Regex + ClinicalDistilBERT
3
+ Based on your proven local implementation
4
+ """
5
+
6
+ import spacy
7
+ import re
8
+ import time
9
+ import torch
10
+ from datetime import datetime
11
+ from typing import Dict, List, Set
12
+ from collections import defaultdict
13
+ from transformers import AutoTokenizer, AutoModel
14
+
15
# Reference interval per canonical test name: inclusive lower bound, inclusive
# upper bound and reporting unit.
# NOTE(review): a single interval per test is stored, with no age/sex
# stratification — confirm this matches the intended clinical use.
REFERENCE_RANGES = {
    test_name: {"min": lower, "max": upper, "unit": unit}
    for test_name, lower, upper, unit in (
        ("White Blood Cell Count", 4.0, 11.0, "x10^9/L"),
        ("Red Blood Cell Count", 4.2, 5.9, "x10^12/L"),
        ("Hemoglobin", 13.5, 17.5, "g/dL"),
        ("Hematocrit", 38.3, 48.6, "%"),
        ("Platelet Count", 150, 450, "x10^9/L"),
        ("Glucose", 70, 99, "mg/dL"),
        ("Creatinine", 0.6, 1.2, "mg/dL"),
        ("Urea", 15, 50, "mg/dL"),
        ("Cholesterol", 0, 200, "mg/dL"),
        ("Alanine Aminotransferase", 7, 56, "U/L"),
        ("Aspartate Aminotransferase", 8, 48, "U/L"),
        ("Alkaline Phosphatase", 40, 129, "U/L"),
        ("Bilirubin", 0.3, 1.9, "mg/dL"),
        ("Albumin", 3.5, 5.5, "g/dL"),
        ("Thyroid Stimulating Hormone", 0.5, 4.5, "mIU/L"),
        ("Free Thyroxine", 0.9, 1.7, "ng/dL"),
    )
}
33
+
34
+ class RadioloLabProcessor:
35
+
36
def __init__(self, ner_model_path: str):
    """
    Load the NER pipeline and ClinicalDistilBERT, then set up the filters.

    Args:
        ner_model_path: path handed to ``spacy.load`` for the custom lab
            NER model.
    """
    # --- custom spaCy NER model ---
    self.nlp = spacy.load(ner_model_path)
    print(f"✓ Lab NER model loaded: {ner_model_path}")

    # --- ClinicalDistilBERT tokenizer + encoder ---
    print("Loading ClinicalDistilBERT...")
    clinical_checkpoint = "nlpie/clinical-distilbert"
    self.clinical_tokenizer = AutoTokenizer.from_pretrained(clinical_checkpoint)
    self.clinical_model = AutoModel.from_pretrained(clinical_checkpoint)

    # Prefer the GPU when torch reports one, otherwise stay on CPU
    if torch.cuda.is_available():
        self.device = torch.device("cuda")
    else:
        self.device = torch.device("cpu")
    self.clinical_model = self.clinical_model.to(self.device)
    self.clinical_model.eval()
    print(f"✓ ClinicalDistilBERT loaded on {self.device}")

    # Lower-cased entity texts that are rejected outright as false positives
    document_structure = (
        'hemolab', 'central', 'medicity', 'wellbeing', 'healthland',
        'laboratory', 'health', 'ave', 'page',
    )
    metadata_fields = (
        'age', 'gender', 'email', 'sample', 'results', 'verified by',
        'processing', 'details',
    )
    table_headers = (
        'test', 'result', 'unit', 'normal', 'range', 'status',
        'normal range', 'result status',
    )
    section_headers = (
        'hematology', 'biochemistry', 'liver function', 'thyroid function',
        'kidney function', 'lipid profile',
    )
    person_names = ('john', 'doe', 'johnatan', 'emily', 'johnson', 'dr')
    bare_numbers = ('30', '123', '12345')
    self.stopwords = set(
        document_structure + metadata_fields + table_headers
        + section_headers + person_names + bare_numbers
    )

    # Keywords an NER TEST_NAME entity must contain to be kept
    self.valid_tests = {
        'white blood cell count', 'wbc', 'red blood cell count', 'rbc',
        'hemoglobin', 'hgb', 'hb', 'hematocrit', 'hct',
        'platelet count', 'platelets', 'plt',
        'mcv', 'mch', 'mchc',
        'glucose', 'glu', 'creatinine', 'urea', 'bun',
        'cholesterol', 'ldl', 'hdl', 'triglycerides',
        'alt', 'ast', 'alp', 'bilirubin', 'albumin',
        'tsh', 'ft4', 'free thyroxine', 'hba1c', 'a1c',
        'sodium', 'potassium', 'calcium', 'chloride',
        'aminotransferase', 'phosphatase',
    }

    # "<test name> (':' or newline) <number> [unit]" — case-insensitive.
    # The alternation order is kept exactly as listed.
    recognised_names = '|'.join([
        'White Blood Cell Count', 'Red Blood Cell Count', 'Hemoglobin', 'Hematocrit',
        'Platelet Count', 'Glucose', 'Creatinine', 'Urea', 'Cholesterol',
        'Alanine Aminotransferase', 'Aspartate Aminotransferase',
        'Alkaline Phosphatase', 'Bilirubin', 'Albumin',
        'Thyroid Stimulating Hormone', 'Free Thyroxine',
        'WBC', 'RBC', 'HGB', 'HCT', 'PLT', 'ALT', 'AST', 'ALP', 'TSH', 'FT4', 'HbA1c',
    ])
    self.lab_value_pattern = re.compile(
        '(' + recognised_names + ')'
        + r'\s*[:\n]\s*'
        + r'(\d+\.?\d*)'
        + r'\s*'
        + r'([a-zA-Z/%^0-9]+)?',
        re.IGNORECASE
    )

    # Status words used as interpretations (case-sensitive match)
    self.status_pattern = re.compile(r'\b(Elevated|High|Low|Normal|Critical|Abnormal)\b')
110
+
111
+ def _normalize_test_name(self, name: str) -> str:
112
+ """Normalize test abbreviations to full names"""
113
+ name_lower = name.lower().strip()
114
+
115
+ mapping = {
116
+ 'wbc': 'White Blood Cell Count',
117
+ 'rbc': 'Red Blood Cell Count',
118
+ 'hgb': 'Hemoglobin',
119
+ 'hb': 'Hemoglobin',
120
+ 'hct': 'Hematocrit',
121
+ 'plt': 'Platelet Count',
122
+ 'platelets': 'Platelet Count',
123
+ 'glu': 'Glucose',
124
+ 'alt': 'Alanine Aminotransferase',
125
+ 'ast': 'Aspartate Aminotransferase',
126
+ 'alp': 'Alkaline Phosphatase',
127
+ 'tsh': 'Thyroid Stimulating Hormone',
128
+ 'ft4': 'Free Thyroxine',
129
+ }
130
+
131
+ return mapping.get(name_lower, name)
132
+
133
def _calculate_status(self, test_name: str, value: float) -> Dict:
    """
    Classify a value against the reference range stored for ``test_name``.

    Returns a dict with ``status`` ("unknown", "normal", "low", "high",
    "critical_low" or "critical_high"), ``deviation_percentage`` (percent
    distance from the violated bound, 0.0 when in range) and a
    ``clinical_significance`` sentence. A deviation above 50% is "critical".
    """
    bounds = REFERENCE_RANGES.get(test_name)
    if not bounds:
        return {
            "status": "unknown",
            "deviation_percentage": 0.0,
            "clinical_significance": "Reference range not available"
        }

    lower = bounds['min']
    upper = bounds['max']

    if value < lower:
        # Shortfall relative to the lower bound
        percent_off = ((lower - value) / lower) * 100
        label = "low" if not percent_off > 50 else "critical_low"
        note = f"Below normal range (↓{percent_off:.1f}%)"
    elif value > upper:
        # Excess relative to the upper bound
        percent_off = ((value - upper) / upper) * 100
        label = "high" if not percent_off > 50 else "critical_high"
        note = f"Above normal range (↑{percent_off:.1f}%)"
    else:
        percent_off = 0.0
        label = "normal"
        note = "Within normal limits"

    return {
        "status": label,
        "deviation_percentage": round(percent_off, 2),
        "clinical_significance": note
    }
164
+
165
def _get_clinical_embeddings(self, text: str) -> torch.Tensor:
    """
    Encode ``text`` with ClinicalDistilBERT and return one vector per input.

    The text is tokenized (truncated to at most 512 tokens), run through the
    model without gradient tracking, and the hidden state at sequence
    position 0 (the [CLS] token) is returned.
    """
    tokenizer_options = {
        "return_tensors": "pt",
        "truncation": True,
        "max_length": 512,
        "padding": True,
        "return_token_type_ids": False,
    }
    encoded = self.clinical_tokenizer(text, **tokenizer_options)
    encoded = encoded.to(self.device)

    # Inference only — no autograd graph is needed
    with torch.no_grad():
        model_output = self.clinical_model(**encoded)
        cls_vectors = model_output.last_hidden_state[:, 0, :]

    return cls_vectors
181
+
182
+ def _generate_clinical_insights(self, text: str, abnormal_results: List[Dict],
183
+ diseases: Set[str], interpretations: Set[str]) -> Dict:
184
+ """Generate clinical insights using ClinicalDistilBERT"""
185
+ # Get embeddings
186
+ embeddings = self._get_clinical_embeddings(text[:512])
187
+
188
+ insights = {
189
+ "embedding_dimension": embeddings.shape[1],
190
+ "clinical_context_captured": True,
191
+ "embeddings_generated": True,
192
+ "diseases_detected": list(diseases),
193
+ "status_flags": list(interpretations),
194
+ "abnormality_patterns": [],
195
+ "clinical_relevance_score": 0.0
196
+ }
197
+
198
+ # Analyze patterns
199
+ if len(abnormal_results) > 0:
200
+ critical_count = sum(1 for r in abnormal_results if r.get('severity') == 'critical')
201
+ moderate_count = len(abnormal_results) - critical_count
202
+
203
+ relevance_score = min(100.0, (critical_count * 30.0) + (moderate_count * 10.0))
204
+ insights["clinical_relevance_score"] = round(relevance_score, 2)
205
+
206
+ insights["abnormality_patterns"].append(
207
+ f"Detected {len(abnormal_results)} abnormal parameter(s)"
208
+ )
209
+
210
+ if critical_count > 0:
211
+ insights["abnormality_patterns"].append(
212
+ f"{critical_count} critical finding(s) require immediate attention"
213
+ )
214
+ else:
215
+ insights["clinical_relevance_score"] = 0.0
216
+ insights["abnormality_patterns"].append("All parameters within normal clinical ranges")
217
+
218
+ return insights
219
+
220
+ def _smart_ner_extraction(self, doc, extracted_test_names: Set[str]) -> tuple:
221
+ """Smart NER extraction with strict filtering"""
222
+ additional_tests = []
223
+ diseases = set()
224
+ interpretations = set()
225
+ ner_stats = defaultdict(int)
226
+
227
+ for ent in doc.ents:
228
+ ner_stats[ent.label_] += 1
229
+
230
+ if ent.label_ == 'TEST_NAME':
231
+ ent_lower = ent.text.lower()
232
+
233
+ # Skip if in stopwords
234
+ if ent_lower in self.stopwords:
235
+ continue
236
+
237
+ # Skip if looks like date
238
+ if re.match(r'\d+/\d+/\d+', ent.text):
239
+ continue
240
+
241
+ # Skip if just numbers
242
+ if re.match(r'^\d+$', ent.text):
243
+ continue
244
+
245
+ # Skip if already extracted by regex
246
+ if ent_lower in extracted_test_names:
247
+ continue
248
+
249
+ # Only add if contains valid medical keywords
250
+ if any(keyword in ent_lower for keyword in self.valid_tests):
251
+ additional_tests.append({
252
+ 'testname': ent.text,
253
+ 'value': None,
254
+ 'unit': None,
255
+ 'source': 'ner'
256
+ })
257
+
258
+ elif ent.label_ == 'DISEASE':
259
+ if ent.text.lower() not in self.stopwords:
260
+ diseases.add(ent.text)
261
+
262
+ elif ent.label_ == 'INTERPRETATION':
263
+ interpretations.add(ent.text)
264
+
265
+ return additional_tests, diseases, interpretations, ner_stats
266
+
267
def extract_and_format(self, text: str, report_id: str = None, patient_id: str = None) -> Dict:
    """
    Run the hybrid (regex + filtered NER) extraction and build the report.

    Args:
        text: raw report text.
        report_id: identifier echoed in the result; defaults to
            ``rep_<unix time>`` when omitted.
        patient_id: accepted for API compatibility; not used by this method.

    Returns:
        A dict with classification, extraction statistics, entities (first
        20), per-test results, abnormal results, summary, clinical insights,
        panels, visualization data and metadata.
    """
    start_time = time.time()

    raw_tests = []
    seen_tests = set()

    # Step 1: Regex extraction (most reliable for structured data)
    for match in self.lab_value_pattern.finditer(text):
        test_name = self._normalize_test_name(match.group(1).strip())
        try:
            value = float(match.group(2))
        except ValueError:
            # Only the numeric conversion is guarded; anything else failing
            # here is a real bug and should surface.
            continue
        unit = match.group(3) or None

        # De-duplicate on (name, value) so repeated lines collapse
        test_key = (test_name.lower(), value)
        if test_key in seen_tests:
            continue
        seen_tests.add(test_key)
        raw_tests.append({
            'testname': test_name,
            'value': value,
            'unit': unit,
            'source': 'regex'
        })

    extracted_test_names = {t['testname'].lower() for t in raw_tests}

    # Step 2: Smart NER extraction with filtering
    doc = self.nlp(text)
    additional_tests, diseases, interpretations, ner_stats = self._smart_ner_extraction(
        doc, extracted_test_names
    )

    # Extract status flags from text
    for match in self.status_pattern.finditer(text):
        context = text[max(0, match.start()-10):match.end()+10]
        if 'Range' not in context:  # Avoid "Normal Range"
            interpretations.add(match.group(1))

    # Collect entities for output
    entities_for_output = [
        {
            "text": ent.text,
            "label": ent.label_,
            "start_char": ent.start_char,
            "end_char": ent.end_char,
            # NOTE: fixed constant, not a score produced by the model
            "confidence": 0.92
        }
        for ent in doc.ents
    ]

    # Step 3: Build test results with reference ranges
    test_results = []
    abnormal_results = []

    for test in raw_tests:
        test_name = test['testname']
        value = test['value']
        unit = test['unit']

        ref_range = REFERENCE_RANGES.get(test_name, {})
        status_info = self._calculate_status(test_name, value)

        test_result = {
            "test_name": test_name,
            "value": value,
            "unit": unit or ref_range.get('unit', ''),
            "reference_range": {
                "min": ref_range.get('min'),
                "max": ref_range.get('max'),
                "unit": ref_range.get('unit', unit or '')
            } if ref_range else None,
            "status": status_info['status'],
            "deviation_percentage": status_info['deviation_percentage'],
            "clinical_significance": status_info['clinical_significance'],
            "trend": None,
            "source": test['source']
        }

        test_results.append(test_result)

        if status_info['status'] in ['low', 'high', 'critical_low', 'critical_high']:
            severity = "critical" if 'critical' in status_info['status'] else "moderate"
            abnormal_results.append({
                "test_name": test_name,
                "severity": severity,
                "requires_attention": True
            })

    # Step 4: Generate summaries and insights
    ai_summary = self._generate_summary(test_results, abnormal_results)
    test_panels = self._group_into_panels(test_results)
    visualization_data = self._generate_visualization_data(test_results)

    # Step 5: Generate clinical insights with ClinicalDistilBERT
    clinical_insights = self._generate_clinical_insights(
        text, abnormal_results, diseases, interpretations
    )

    # Urgency: any critical result wins, then any abnormal, else routine
    if any(r['severity'] == 'critical' for r in abnormal_results):
        urgency_level = "critical"
    elif abnormal_results:
        urgency_level = "abnormal"
    else:
        urgency_level = "routine"

    processing_time = int((time.time() - start_time) * 1000)

    return {
        "report_id": report_id or f"rep_{int(time.time())}",
        "report_type": "laboratory",
        "processing_time_ms": processing_time,

        "classification": {
            "test_category": self._determine_category(test_results),
            # NOTE: sub_category and confidence are fixed constants
            "sub_category": "complete_blood_count",
            "urgency_level": urgency_level,
            "confidence": 0.96
        },

        "extraction_stats": {
            "tests_with_values": len(test_results),
            "additional_tests_found": len(additional_tests),
            "diseases_detected": len(diseases),
            "interpretations_found": len(interpretations),
            "ner_model_stats": dict(ner_stats)
        },

        "entities": entities_for_output[:20],
        "test_results": test_results,
        "abnormal_results": abnormal_results,
        "ai_summary": ai_summary,
        "clinical_insights": clinical_insights,
        "test_panels": test_panels,
        "visualization_data": visualization_data,

        "metadata": {
            "model_version": "radiolo_smart_ner_v2.0.0",
            "processing_date": datetime.utcnow().isoformat() + "Z",
            "tests_extracted": len(test_results),
            "confidence_score": 0.94,
            "nlp_models": {
                "ner": "Custom Lab NER (Smart Filtered)",
                "clinical_bert": "ClinicalDistilBERT",
                "extraction_method": "Hybrid (Regex + Filtered NER)"
            }
        }
    }
408
+
409
+ def _determine_category(self, test_results: List[Dict]) -> str:
410
+ test_names = {t['test_name'].lower() for t in test_results}
411
+
412
+ if any('blood cell' in name or name in ['hemoglobin', 'hematocrit', 'platelet'] for name in test_names):
413
+ return "hematology"
414
+ elif any(name in ['alanine aminotransferase', 'aspartate aminotransferase', 'alkaline phosphatase', 'bilirubin', 'albumin'] for name in test_names):
415
+ return "liver_function"
416
+ elif any('thyroid' in name or name in ['thyroid stimulating hormone', 'free thyroxine'] for name in test_names):
417
+ return "thyroid_function"
418
+ else:
419
+ return "general_chemistry"
420
+
421
+ def _generate_summary(self, test_results: List[Dict], abnormal_results: List[Dict]) -> Dict:
422
+ normal_tests = [t['test_name'] for t in test_results if t['status'] == 'normal']
423
+ abnormal_tests = [a['test_name'] for a in abnormal_results]
424
+
425
+ if not abnormal_tests:
426
+ overall = "All test results are within normal limits."
427
+ recommendations = ["No immediate action required", "Continue regular health monitoring"]
428
+ else:
429
+ overall = f"Detected {len(abnormal_tests)} abnormal result(s). {len(normal_tests)} parameters within normal limits."
430
+ recommendations = [
431
+ "Correlate with clinical symptoms",
432
+ "Consider follow-up testing if symptoms persist",
433
+ "Consult with healthcare provider for interpretation"
434
+ ]
435
+
436
+ key_abnormalities = []
437
+ for result in abnormal_results:
438
+ test_detail = next((t for t in test_results if t['test_name'] == result['test_name']), None)
439
+ if test_detail:
440
+ key_abnormalities.append(
441
+ f"{result['test_name']}: {test_detail['clinical_significance']}"
442
+ )
443
+
444
+ return {
445
+ "overall_assessment": overall,
446
+ "key_abnormalities": key_abnormalities,
447
+ "normal_parameters": normal_tests,
448
+ "recommendations": recommendations
449
+ }
450
+
451
+ def _group_into_panels(self, test_results: List[Dict]) -> List[Dict]:
452
+ panels = defaultdict(list)
453
+
454
+ cbc_tests = {'White Blood Cell Count', 'Red Blood Cell Count', 'Hemoglobin', 'Hematocrit', 'Platelet Count'}
455
+ liver_tests = {'Alanine Aminotransferase', 'Aspartate Aminotransferase', 'Alkaline Phosphatase', 'Bilirubin', 'Albumin'}
456
+ thyroid_tests = {'Thyroid Stimulating Hormone', 'Free Thyroxine'}
457
+
458
+ for test in test_results:
459
+ name = test['test_name']
460
+ if name in cbc_tests:
461
+ panels['Complete Blood Count'].append(test)
462
+ elif name in liver_tests:
463
+ panels['Liver Function Panel'].append(test)
464
+ elif name in thyroid_tests:
465
+ panels['Thyroid Function Panel'].append(test)
466
+ else:
467
+ panels['General Chemistry'].append(test)
468
+
469
+ panel_list = []
470
+ for panel_name, tests in panels.items():
471
+ abnormal_count = sum(1 for t in tests if t['status'] != 'normal')
472
+ panel_list.append({
473
+ "panel_name": panel_name,
474
+ "tests_included": [t['test_name'] for t in tests],
475
+ "panel_status": "abnormal" if abnormal_count > 0 else "normal",
476
+ "abnormal_count": abnormal_count,
477
+ "total_tests": len(tests)
478
+ })
479
+
480
+ return panel_list
481
+
482
+ def _generate_visualization_data(self, test_results: List[Dict]) -> Dict:
483
+ chart_data = []
484
+
485
+ for test in test_results:
486
+ if test['reference_range']:
487
+ chart_data.append({
488
+ "test": test['test_name'],
489
+ "value": test['value'],
490
+ "ref_min": test['reference_range']['min'],
491
+ "ref_max": test['reference_range']['max']
492
+ })
493
+
494
+ return {
495
+ "charts": [{
496
+ "chart_type": "bar",
497
+ "title": "Lab Results vs Reference Range",
498
+ "data": chart_data
499
+ }],
500
+ "trend_data": []
501
+ }
app/main.py ADDED
@@ -0,0 +1,288 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, HTTPException, Request, File, UploadFile
2
+ from fastapi.responses import JSONResponse
3
+ from fastapi.middleware.cors import CORSMiddleware
4
+ from starlette.middleware.gzip import GZipMiddleware
5
+ import time
6
+ import os
7
+ import gzip
8
+ import base64
9
+ import json
10
+
11
+ from .text_extractor import extract_text_from_pdf, extract_text_from_image
12
+ from .image_extractor import extract_images_from_pdf
13
+ from .lab_processor import RadioloLabProcessor
14
+ from .models import EncryptedRequest
15
+ from .crypto_utils import CryptoManager
16
+ from dotenv import load_dotenv
17
+
18
+ load_dotenv()
19
+
20
# Application object; docs_url / redoc_url are set to None.
app = FastAPI(
    docs_url=None,
    redoc_url=None,
    version="2.0.0",
    title="Medical Lab Report Analysis API",
    description="Extract structured lab test data from medical reports using NER + Regex with end-to-end encryption",
)

# NOTE(review): every origin, method and header is allowed *and* credentials
# are enabled. For an API that handles medical reports this is very
# permissive — confirm whether an explicit origin allow-list is intended.
app.add_middleware(
    CORSMiddleware,
    allow_headers=["*"],
    allow_methods=["*"],
    allow_credentials=True,
    allow_origins=["*"],
)

# Response compression (minimum_size=1000)
app.add_middleware(GZipMiddleware, minimum_size=1000)

# Filled in by the startup hook once the NER model has loaded
lab_processor = None

# CryptoManager raises ValueError for a missing/invalid key, so importing
# this module fails fast when ENCRYPTION_KEY is not configured.
SECRET_KEY = os.environ.get("ENCRYPTION_KEY")
crypto_manager = CryptoManager(SECRET_KEY)
41
+
42
@app.on_event("startup")
async def startup_event():
    """
    Load the lab NER processor into the module-level ``lab_processor``.

    The model path comes from ``LAB_NER_MODEL_PATH`` (default
    ``./models/radiolo_clinic_ner``).

    Raises:
        RuntimeError: if the model path does not exist.
        Exception: any model-loading failure is logged and re-raised.
    """
    global lab_processor

    banner = "=" * 70
    print("\n" + banner)
    print("MEDICAL LAB REPORT ANALYSIS API - STARTING UP")
    print(banner)

    model_path = os.getenv("LAB_NER_MODEL_PATH", "./models/radiolo_clinic_ner")
    print(f"\nLoading Lab NER model from: {model_path}")

    model_present = os.path.exists(model_path)
    if not model_present:
        print(f"✗ ERROR: Model not found at {model_path}")
        raise RuntimeError("Lab NER model not found")

    try:
        lab_processor = RadioloLabProcessor(model_path)
    except Exception as e:
        print(f"✗ FATAL ERROR: Failed to load model: {e}")
        raise
    else:
        print("✅ API READY!")
        print(banner + "\n")
64
+
65
@app.on_event("shutdown")
async def shutdown_event():
    """Log that the API is shutting down; no resources are released here."""
    print("\nAPI SHUTTING DOWN\n")
68
+
69
@app.get("/")
async def root():
    """Return a static service description plus whether the model is loaded."""
    feature_summary = {
        "encryption": "NaCl (XSalsa20-Poly1305)",
        "compression": "gzip",
        "ocr_engine": "EasyOCR",
        "ner_model": "Custom Lab NER",
        "supported_tests": 16
    }
    endpoint_map = {
        "health": "/health",
        "analyze": "/analyze-lab-secure",
        "test": "/test-analyze"
    }
    lab_test_groups = [
        "Complete Blood Count (WBC, RBC, Hemoglobin, Hematocrit, Platelets)",
        "Liver Function (ALT, AST, ALP, Bilirubin, Albumin)",
        "Thyroid Function (TSH, Free T4)",
        "Metabolic Panel (Glucose, Creatinine, Urea, Cholesterol)"
    ]

    return {
        "status": "online",
        "api": "Medical Lab Report Analysis API",
        "version": "2.0.0",
        "model_loaded": lab_processor is not None,
        "features": feature_summary,
        "endpoints": endpoint_map,
        "supported_formats": ["pdf", "image"],
        "supported_lab_tests": lab_test_groups
    }
96
+
97
@app.get("/health")
async def health_check():
    """Report liveness; ``model_loaded`` reflects whether startup finished loading."""
    model_ready = lab_processor is not None
    return dict(
        status="healthy",
        model_loaded=model_ready,
        model_type="Lab Report NER",
        ocr_engine="EasyOCR",
        encryption="NaCl (XSalsa20-Poly1305)",
        compression="gzip",
        version="2.0.0",
        supported_tests=16,
    )
109
+
110
+ # ============================================================================
111
+ # NEW: UNENCRYPTED TEST ENDPOINT (for testing only)
112
+ # ============================================================================
113
+
114
@app.post("/test-analyze", tags=["Testing"])
async def test_analyze(file: UploadFile = File(...)):
    """
    Test endpoint without encryption - upload file directly
    ⚠️ WARNING: For testing only! No encryption!

    The file type is chosen from the filename extension (PDF or common image
    formats). Returns the lab analysis merged with extraction metadata.

    Raises:
        HTTPException: 503 if the processor is not loaded, 400 for an
            unsupported type or too little extracted text, 500 otherwise.
    """
    # NOTE(review): this route returns extracted report content unencrypted
    # and has no access control in this function — confirm it is disabled
    # or protected outside of test environments.
    start_time = time.time()

    try:
        if not lab_processor:
            raise HTTPException(status_code=503, detail="Lab processor not loaded")

        # Read uploaded file
        file_bytes = await file.read()
        # filename can be None when the client omits it; treating it as an
        # empty name yields a clean 400 below instead of an AttributeError.
        filename = file.filename or ""
        lowered_name = filename.lower()

        print(f"\n📄 Processing test file: {filename} ({len(file_bytes)} bytes)")

        # Determine file type from extension
        if lowered_name.endswith('.pdf'):
            file_type = "pdf"
            extracted_text, ocr_used = extract_text_from_pdf(file_bytes)
            images = extract_images_from_pdf(file_bytes)
        elif lowered_name.endswith(('.jpg', '.jpeg', '.png', '.tiff', '.bmp')):
            file_type = "image"
            extracted_text = extract_text_from_image(file_bytes)
            ocr_used = True
            images = []
        else:
            raise HTTPException(status_code=400, detail="Unsupported file type. Use PDF or image files.")

        if not extracted_text or len(extracted_text.strip()) < 10:
            raise HTTPException(status_code=400, detail="Could not extract sufficient text from file")

        print(f"✓ Extracted {len(extracted_text)} characters (OCR: {ocr_used})")

        # Process with lab processor
        print("🧠 Processing with NER + ClinicalDistilBERT...")
        lab_analysis = lab_processor.extract_and_format(
            extracted_text,
            report_id=f"test_{int(time.time())}",
            patient_id="TEST_PATIENT"
        )

        processing_time = time.time() - start_time

        print(f"✅ Processing complete in {processing_time:.2f}s")
        print(f" Tests extracted: {lab_analysis.get('metadata', {}).get('tests_extracted', 0)}\n")

        # Return unencrypted response; keys from lab_analysis are merged last
        response_data = {
            "status": "success",
            "processing_time": round(processing_time, 3),
            "filename": filename,
            "input_type": file_type,
            "ocr_used": ocr_used,
            "ocr_engine": "EasyOCR" if ocr_used else "PyMuPDF",
            "raw_text_preview": extracted_text[:500] + "..." if len(extracted_text) > 500 else extracted_text,
            "text_length": len(extracted_text),
            "images": images,
            **lab_analysis
        }

        return response_data

    except HTTPException:
        # Deliberate HTTP errors pass through with their traceback intact
        raise
    except Exception as e:
        import traceback
        print(f"❌ Error: {e}")
        traceback.print_exc()
        raise HTTPException(status_code=500, detail=f"Internal error: {str(e)}") from e
186
+
187
+ # ============================================================================
188
+ # ENCRYPTED ENDPOINT (production)
189
+ # ============================================================================
190
+
191
+ @app.post("/analyze-lab-secure", tags=["Lab Analysis"])
192
+ async def analyze_lab_secure(request: EncryptedRequest):
193
+ start_time = time.time()
194
+
195
+ try:
196
+ if not lab_processor:
197
+ raise HTTPException(status_code=503, detail="Lab processor not loaded")
198
+
199
+ # Decrypt request
200
+ decrypted_data = crypto_manager.decrypt(request.ciphertext, request.nonce)
201
+ compressed_b64 = decrypted_data.decode('utf-8')
202
+ compressed_bytes = base64.b64decode(compressed_b64)
203
+ decompressed_data = gzip.decompress(compressed_bytes)
204
+
205
+ payload = json.loads(decompressed_data.decode('utf-8'))
206
+ filename = payload.get('filename', 'unknown')
207
+ file_data_b64 = payload['file_data']
208
+ file_type = payload['file_type']
209
+ file_bytes = base64.b64decode(file_data_b64)
210
+
211
+ # Extract text
212
+ if file_type == "pdf":
213
+ extracted_text, ocr_used = extract_text_from_pdf(file_bytes)
214
+ if not extracted_text or len(extracted_text.strip()) < 10:
215
+ raise HTTPException(status_code=400, detail="Could not extract text from PDF")
216
+ images = extract_images_from_pdf(file_bytes)
217
+ elif file_type == "image":
218
+ extracted_text = extract_text_from_image(file_bytes)
219
+ ocr_used = True
220
+ images = []
221
+ if not extracted_text or len(extracted_text.strip()) < 10:
222
+ raise HTTPException(status_code=400, detail="Could not extract text from image")
223
+ else:
224
+ raise HTTPException(status_code=400, detail="Invalid file_type. Must be 'pdf' or 'image'")
225
+
226
+ # Process with lab processor
227
+ lab_analysis = lab_processor.extract_and_format(
228
+ extracted_text,
229
+ report_id=f"lab_{int(time.time())}",
230
+ patient_id=payload.get('patient_id', 'unknown')
231
+ )
232
+
233
+ processing_time = time.time() - start_time
234
+
235
+ response_data = {
236
+ "status": "success",
237
+ "processing_time": round(processing_time, 3),
238
+ "filename": filename,
239
+ "input_type": file_type,
240
+ "ocr_used": ocr_used,
241
+ "ocr_engine": "EasyOCR" if ocr_used else "PyMuPDF",
242
+ "raw_text": extracted_text[:500] + "..." if len(extracted_text) > 500 else extracted_text,
243
+ "text_length": len(extracted_text),
244
+ "images": images,
245
+ **lab_analysis
246
+ }
247
+
248
+ # Encrypt response
249
+ encrypted_response = crypto_manager.encrypt_json(response_data)
250
+
251
+ return {
252
+ "status": "success",
253
+ "ciphertext": encrypted_response['ciphertext'],
254
+ "nonce": encrypted_response['nonce']
255
+ }
256
+
257
+ except HTTPException as he:
258
+ raise he
259
+ except Exception as e:
260
+ raise HTTPException(status_code=500, detail=f"Internal error: {str(e)}")
261
+
262
+ @app.exception_handler(404)
263
+ async def not_found_handler(request: Request, exc):
264
+ return JSONResponse(
265
+ status_code=404,
266
+ content={
267
+ "status": "error",
268
+ "message": "Endpoint not found",
269
+ "available_endpoints": ["/", "/health", "/test-analyze", "/analyze-lab-secure"]
270
+ }
271
+ )
272
+
273
+ @app.exception_handler(500)
274
+ async def internal_error_handler(request: Request, exc):
275
+ return JSONResponse(
276
+ status_code=500,
277
+ content={
278
+ "status": "error",
279
+ "message": "Internal server error",
280
+ "error_type": type(exc).__name__
281
+ }
282
+ )
283
+
284
+ if __name__ == "__main__":
285
+ import uvicorn
286
+ host = os.getenv("HOST", "0.0.0.0")
287
+ port = int(os.getenv("PORT", 7860))
288
+ uvicorn.run("app.main:app", host=host, port=port, reload=False, log_level="info")
app/models.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Pydantic models for request/response validation
3
+ """
4
+
5
+ from pydantic import BaseModel, Field
6
+ from typing import List, Dict, Optional
7
+
8
+ class TextRequest(BaseModel):
9
+ """Request model for text-only analysis"""
10
+ text: str = Field(..., min_length=10, description="Radiology report text")
11
+
12
+ class Config:
13
+ json_schema_extra = {
14
+ "example": {
15
+ "text": "FINDINGS: The cardiac silhouette is within normal limits. The lungs are clear. No pleural effusion or pneumothorax."
16
+ }
17
+ }
18
+
19
+ class Entity(BaseModel):
20
+ """Individual entity detected by NER"""
21
+ text: str
22
+ label: str
23
+ start: int
24
+ end: int
25
+ confidence: float = 0.99
26
+
27
+ class StructuredReport(BaseModel):
28
+ """Structured representation of report findings"""
29
+ anatomy: List[str]
30
+ all_observations: List[str]
31
+ positive_findings: List[str]
32
+ negative_findings: List[str]
33
+ critical_findings: List[str]
34
+
35
+ class Summary(BaseModel):
36
+ """Summary statistics of the analysis"""
37
+ total_entities: int
38
+ anatomy_count: int
39
+ observations_count: int
40
+ has_critical_findings: bool
41
+ has_abnormalities: bool
42
+
43
+ class ImageData(BaseModel):
44
+ """Extracted image from PDF"""
45
+ page: int
46
+ format: str
47
+ width: int
48
+ height: int
49
+ data: str # base64 encoded
50
+
51
+ class AnalysisResponse(BaseModel):
52
+ """Complete analysis response"""
53
+ status: str
54
+ processing_time: float
55
+ input_type: str
56
+ ocr_used: bool
57
+ ocr_engine: Optional[str] = None
58
+ raw_text: str
59
+ text_length: int
60
+ entities: List[Entity]
61
+ structured_report: StructuredReport
62
+ summary: Summary
63
+ recommendations: List[str]
64
+ images: Optional[List[ImageData]] = None
65
+
66
+ class EncryptedRequest(BaseModel):
67
+ """Encrypted and compressed file request"""
68
+ ciphertext: str
69
+ nonce: str
70
+
71
+ class Config:
72
+ json_schema_extra = {
73
+ "example": {
74
+ "ciphertext": "mJXnK8p9VGhpN...",
75
+ "nonce": "Y2FzZGFzZGFzZA=="
76
+ }
77
+ }
78
+
79
+ class EncryptedResponse(BaseModel):
80
+ """Encrypted response"""
81
+ ciphertext: str
82
+ nonce: str
83
+ status: str = "success"
app/text_extractor.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Text extraction from PDFs and images using EasyOCR
3
+ Smart extraction: tries text layer first, falls back to OCR
4
+ """
5
+
6
+ import fitz # PyMuPDF
7
+ import easyocr
8
+ from PIL import Image
9
+ from pdf2image import convert_from_bytes
10
+ import io
11
+ import numpy as np
12
+ from typing import Tuple, Optional
13
+
14
+ print("Initializing EasyOCR Reader...")
15
+ try:
16
+ reader = easyocr.Reader(['en'], gpu=False, verbose=False)
17
+ print("✓ EasyOCR Reader initialized successfully")
18
+ except Exception as e:
19
+ print(f"✗ EasyOCR initialization failed: {e}")
20
+ reader = None
21
+
22
+ def extract_text_from_pdf(pdf_bytes: bytes) -> Tuple[Optional[str], bool]:
23
+ """
24
+ Extract text from PDF with smart OCR fallback
25
+
26
+ Returns:
27
+ (extracted_text, ocr_used)
28
+ """
29
+ if not pdf_bytes:
30
+ return None, False
31
+
32
+ try:
33
+ # Try extracting text layer first (fast)
34
+ doc = fitz.open(stream=pdf_bytes, filetype="pdf")
35
+ full_text = ""
36
+
37
+ for page in doc:
38
+ full_text += page.get_text()
39
+
40
+ doc.close()
41
+
42
+ # Check if meaningful text was extracted
43
+ if len(full_text.strip()) > 50:
44
+ print(f"✓ Extracted {len(full_text)} chars from text layer")
45
+ return full_text.strip(), False
46
+
47
+ # No text layer - use OCR
48
+ print("⚠ No text layer detected, using EasyOCR...")
49
+ text = extract_text_from_pdf_via_ocr(pdf_bytes)
50
+ return text, True
51
+
52
+ except Exception as e:
53
+ print(f"✗ Error in PDF text extraction: {e}")
54
+ return None, False
55
+
56
+ def extract_text_from_pdf_via_ocr(pdf_bytes: bytes) -> Optional[str]:
57
+ """
58
+ Extract text using EasyOCR on PDF pages converted to images
59
+ """
60
+ if not reader:
61
+ raise RuntimeError("EasyOCR not initialized")
62
+
63
+ try:
64
+ # Convert PDF to images
65
+ images = convert_from_bytes(pdf_bytes, dpi=300)
66
+ full_text = ""
67
+
68
+ for i, image in enumerate(images):
69
+ print(f" OCR processing page {i+1}/{len(images)}...")
70
+
71
+ # Convert PIL to numpy array
72
+ img_array = np.array(image)
73
+
74
+ # Run EasyOCR
75
+ results = reader.readtext(img_array, detail=0, paragraph=True)
76
+ page_text = ' '.join(results)
77
+ full_text += page_text + "\n\n"
78
+
79
+ print(f"✓ EasyOCR extracted {len(full_text)} chars from {len(images)} pages")
80
+ return full_text.strip()
81
+
82
+ except Exception as e:
83
+ print(f"✗ OCR failed: {e}")
84
+ return None
85
+
86
+ def extract_text_from_image(image_bytes: bytes) -> Optional[str]:
87
+ """
88
+ Extract text from image file using EasyOCR
89
+ """
90
+ if not reader:
91
+ raise RuntimeError("EasyOCR not initialized")
92
+
93
+ try:
94
+ print("Processing image with EasyOCR...")
95
+
96
+ # Open and prepare image
97
+ image = Image.open(io.BytesIO(image_bytes))
98
+
99
+ if image.mode != 'RGB':
100
+ image = image.convert('RGB')
101
+
102
+ # Convert to numpy
103
+ img_array = np.array(image)
104
+
105
+ # Run EasyOCR
106
+ results = reader.readtext(img_array, detail=0, paragraph=True)
107
+ text = ' '.join(results)
108
+
109
+ print(f"✓ EasyOCR extracted {len(text)} chars from image")
110
+ return text.strip()
111
+
112
+ except Exception as e:
113
+ print(f"✗ Image OCR failed: {e}")
114
+ return None
115
+
116
+ def get_ocr_confidence(image_array: np.ndarray) -> list:
117
+ """
118
+ Get detailed OCR results with confidence scores
119
+ """
120
+ if not reader:
121
+ return []
122
+
123
+ try:
124
+ results = reader.readtext(image_array, detail=1)
125
+ return [
126
+ {
127
+ "text": text,
128
+ "confidence": round(conf, 3),
129
+ "bbox": bbox
130
+ }
131
+ for bbox, text, conf in results
132
+ ]
133
+ except:
134
+ return []
requirements.txt ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi==0.115.4
2
+ uvicorn[standard]==0.32.0
3
+ python-multipart==0.0.19
4
+ starlette==0.41.3
5
+ # pip install fastapi uvicorn python-multipart starlette
6
+
7
+ PyNaCl==1.5.0
8
+ python-dotenv==1.0.1
9
+ # pip install PyNaCl python-dotenv
10
+
11
+ PyMuPDF==1.24.13
12
+ Pillow==11.0.0
13
+ easyocr==1.7.2
14
+ opencv-python-headless==4.10.0.84
15
+ # pip install PyMuPDF Pillow easyocr opencv-python-headless
16
+
17
+ spacy==3.8.2
18
+ transformers==4.46.3
19
+ torch==2.5.1
20
+ sentencepiece==0.2.0
21
+ # pip install spacy transformers torch sentencepiece
22
+
23
+ easyocr
24
+ pdf2image
25
+ # pip install easyocr pdf2image
26
+
27
+ # Utilities
28
+ numpy<2.0
29
+ pydantic==2.9.2
30
+ pydantic-settings==2.6.1
31
+ aiofiles==24.1.0
32
+ # pip install pydantic pydantic-settings aiofiles python-json-logger
33
+
34
+ # Monitoring & Logging
35
+ python-json-logger==3.2.1
36
+
37
+ # Testing (optional, for development)
38
+ pytest==8.3.3
39
+ pytest-asyncio==0.24.0
40
+ httpx==0.28.0
41
+ # pip install pytest pytest-asyncio httpx