Spaces:

MakPr016
/

clinical-analysis-api

Sleeping

App Files Files Community

MakPr016 committed on Oct 19, 2025

Commit

e158d2f

0 Parent(s):

Initial phase

Browse files

Files changed (10) hide show

.gitignore +108 -0
Dockerfile +0 -0
app/__init__.py +6 -0
app/crypto_utils.py +88 -0
app/image_extractor.py +77 -0
app/lab_processor.py +501 -0
app/main.py +288 -0
app/models.py +83 -0
app/text_extractor.py +134 -0
requirements.txt +41 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,108 @@

+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+.pytest_cache/
+.coverage
+.coverage.*
+htmlcov/
+.tox/
+.nox/
+.hypothesis/
+pytestdebug.log
+*.log
+*.pot
+*.pyc
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+.spyderproject
+.spyproject
+.ropeproject
+instance/
+.webassets-cache
+.mypy_cache/
+.dmypy.json
+dmypy.json
+.pyre/
+.pytype/
+cython_debug/
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+.DS_Store
+models/
+*.pkl
+*.pth
+*.pt
+*.bin
+*.h5
+*.onnx
+*.pb
+*.caffemodel
+*.weights
+data/
+datasets/
+*.csv
+*.json
+*.jsonl
+*.tsv
+*.pdf
+*.jpg
+*.jpeg
+*.png
+*.gif
+*.bmp
+*.tiff
+*.svg
+*.ico
+test_files/
+uploads/
+temp/
+tmp/
+cache/
+.ipynb_checkpoints/
+*.ipynb
+node_modules/
+package-lock.json
+yarn.lock
+flagged/
+.env

Dockerfile ADDED Viewed

File without changes

app/__init__.py ADDED Viewed

	@@ -0,0 +1,6 @@

+"""
+Lab Report NER API
+Extracts structured entities from medical reports using spaCy NER + EasyOCR + ClinicalDistilBERT
+"""
+# Package version string.
+# NOTE(review): app/main.py reports version "2.0.0" to clients - confirm which value is authoritative.
+__version__ = "1.0.0"

app/crypto_utils.py ADDED Viewed

	@@ -0,0 +1,88 @@

+"""
+Encryption utilities using NaCl (libsodium)
+"""
+import base64
+import gzip
+import json
+from nacl.secret import SecretBox
+from nacl.utils import random
+class CryptoManager:
+    def __init__(self, secret_key_hex: str):
+        """
+        Initialize with hex key string from .env
+        Converts 64-character hex string to 32 bytes
+        """
+        if not secret_key_hex:
+            raise ValueError("Secret key is required")
+        # Check if it's already the right length
+        if len(secret_key_hex) == 64:
+            # 64 hex chars = 32 bytes (correct)
+            self.secret_key = bytes.fromhex(secret_key_hex)
+        elif len(secret_key_hex) == 32:
+            # If someone passes 32 chars thinking it's bytes, warn them
+            print(f"⚠️  WARNING: Key is only 32 characters (16 bytes)")
+            print(f"   Should be 64 hex characters for 32 bytes")
+            # Try to use as-is but it will fail
+            self.secret_key = secret_key_hex.encode('utf-8')
+        else:
+            raise ValueError(f"Secret key must be 64 hex characters (got {len(secret_key_hex)})")
+        if len(self.secret_key) != 32:
+            raise ValueError(f"Secret key must be 32 bytes (got {len(self.secret_key)} bytes)")
+        self.box = SecretBox(self.secret_key)
+        print(f"✓ CryptoManager initialized (key: {len(self.secret_key)} bytes)")
+    def encrypt(self, plaintext: bytes, nonce: bytes = None) -> bytes:
+        """Encrypt plaintext bytes"""
+        if nonce is None:
+            nonce = random(SecretBox.NONCE_SIZE)
+        return self.box.encrypt(plaintext, nonce)
+    def decrypt(self, ciphertext: str, nonce: str) -> bytes:
+        """Decrypt base64-encoded ciphertext with base64-encoded nonce"""
+        try:
+            ciphertext_bytes = base64.b64decode(ciphertext)
+            nonce_bytes = base64.b64decode(nonce)
+            return self.box.decrypt(ciphertext_bytes, nonce_bytes)
+        except Exception as e:
+            raise ValueError(f"Decryption failed. {e}")
+    def encrypt_json(self, data: dict) -> dict:
+        """
+        Encrypt JSON data with compression
+        Returns dict with base64-encoded ciphertext and nonce
+        """
+        # Convert to JSON and compress
+        json_data = json.dumps(data).encode('utf-8')
+        compressed = gzip.compress(json_data, compresslevel=6)
+        compressed_b64 = base64.b64encode(compressed).decode('utf-8')
+        # Encrypt
+        nonce = random(SecretBox.NONCE_SIZE)
+        ciphertext = self.box.encrypt(compressed_b64.encode('utf-8'), nonce)
+        return {
+            "ciphertext": base64.b64encode(ciphertext.ciphertext).decode('utf-8'),
+            "nonce": base64.b64encode(nonce).decode('utf-8')
+        }
+    def decrypt_json(self, ciphertext: str, nonce: str) -> dict:
+        """
+        Decrypt and decompress JSON data
+        """
+        try:
+            # Decrypt
+            decrypted = self.decrypt(ciphertext, nonce)
+            # Decompress
+            compressed_b64 = decrypted.decode('utf-8')
+            compressed_bytes = base64.b64decode(compressed_b64)
+            decompressed = gzip.decompress(compressed_bytes)
+            # Parse JSON
+            return json.loads(decompressed.decode('utf-8'))
+        except Exception as e:
+            raise ValueError(f"Decryption/decompression failed. {e}")

app/image_extractor.py ADDED Viewed

	@@ -0,0 +1,77 @@

+"""
+Extract embedded images from PDF files
+"""
+import fitz  # PyMuPDF
+import base64
+from PIL import Image
+import io
+from typing import List, Dict
+def extract_images_from_pdf(pdf_bytes: bytes) -> List[Dict]:
+    """
+    Extract all embedded images from PDF
+    Returns list of image dictionaries with base64 data
+    """
+    if not pdf_bytes:
+        return []
+    try:
+        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
+        images = []
+        for page_num in range(len(doc)):
+            page = doc[page_num]
+            image_list = page.get_images(full=True)
+            for img_index, img in enumerate(image_list):
+                try:
+                    xref = img[0]
+                    base_image = doc.extract_image(xref)
+                    image_bytes = base_image["image"]
+                    image_ext = base_image["ext"]
+                    # Get dimensions
+                    pil_image = Image.open(io.BytesIO(image_bytes))
+                    # Convert to base64
+                    image_b64 = base64.b64encode(image_bytes).decode('utf-8')
+                    images.append({
+                        "page": page_num + 1,
+                        "format": image_ext,
+                        "width": pil_image.width,
+                        "height": pil_image.height,
+                        "data": f"data:image/{image_ext};base64,{image_b64}"
+                    })
+                except Exception as e:
+                    print(f"⚠ Failed to extract image {img_index} from page {page_num + 1}: {e}")
+                    continue
+        doc.close()
+        print(f"✓ Extracted {len(images)} images from PDF")
+        return images
+    except Exception as e:
+        print(f"✗ Image extraction error: {e}")
+        return []
+def create_thumbnail(image_bytes: bytes, size: tuple = (200, 200)) -> str:
+    """
+    Create thumbnail version of image (base64)
+    """
+    try:
+        image = Image.open(io.BytesIO(image_bytes))
+        image.thumbnail(size, Image.Resampling.LANCZOS)
+        buffered = io.BytesIO()
+        image.save(buffered, format="JPEG", quality=85)
+        img_str = base64.b64encode(buffered.getvalue()).decode('utf-8')
+        return f"data:image/jpeg;base64,{img_str}"
+    except Exception as e:
+        print(f"✗ Thumbnail creation failed: {e}")
+        return ""

app/lab_processor.py ADDED Viewed

	@@ -0,0 +1,501 @@

+"""
+Lab Report Processing with Smart NER + Regex + ClinicalDistilBERT
+Based on your proven local implementation
+"""
+import spacy
+import re
+import time
+import torch
+from datetime import datetime
+from typing import Dict, List, Set
+from collections import defaultdict
+from transformers import AutoTokenizer, AutoModel
+# Reference range per canonical test name: inclusive "min"/"max" bounds and the
+# unit they are expressed in. Keys are the full, title-cased test names that
+# RadioloLabProcessor._normalize_test_name maps abbreviations onto.
+# NOTE(review): one range per test - not stratified by sex or age; confirm
+# these bounds are appropriate for the intended patient population.
+REFERENCE_RANGES = {
+    "White Blood Cell Count": {"min": 4.0, "max": 11.0, "unit": "x10^9/L"},
+    "Red Blood Cell Count": {"min": 4.2, "max": 5.9, "unit": "x10^12/L"},
+    "Hemoglobin": {"min": 13.5, "max": 17.5, "unit": "g/dL"},
+    "Hematocrit": {"min": 38.3, "max": 48.6, "unit": "%"},
+    "Platelet Count": {"min": 150, "max": 450, "unit": "x10^9/L"},
+    "Glucose": {"min": 70, "max": 99, "unit": "mg/dL"},
+    "Creatinine": {"min": 0.6, "max": 1.2, "unit": "mg/dL"},
+    "Urea": {"min": 15, "max": 50, "unit": "mg/dL"},
+    "Cholesterol": {"min": 0, "max": 200, "unit": "mg/dL"},
+    "Alanine Aminotransferase": {"min": 7, "max": 56, "unit": "U/L"},
+    "Aspartate Aminotransferase": {"min": 8, "max": 48, "unit": "U/L"},
+    "Alkaline Phosphatase": {"min": 40, "max": 129, "unit": "U/L"},
+    "Bilirubin": {"min": 0.3, "max": 1.9, "unit": "mg/dL"},
+    "Albumin": {"min": 3.5, "max": 5.5, "unit": "g/dL"},
+    "Thyroid Stimulating Hormone": {"min": 0.5, "max": 4.5, "unit": "mIU/L"},
+    "Free Thyroxine": {"min": 0.9, "max": 1.7, "unit": "ng/dL"},
+}
+class RadioloLabProcessor:
+    def __init__(self, ner_model_path: str):
+        """Initialize smart lab processor with NER, stopwords, and ClinicalDistilBERT"""
+        # Load custom NER model
+        self.nlp = spacy.load(ner_model_path)
+        print(f"✓ Lab NER model loaded: {ner_model_path}")
+        # Load ClinicalDistilBERT
+        print("Loading ClinicalDistilBERT...")
+        self.clinical_tokenizer = AutoTokenizer.from_pretrained("nlpie/clinical-distilbert")
+        self.clinical_model = AutoModel.from_pretrained("nlpie/clinical-distilbert")
+        # Set device
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.clinical_model = self.clinical_model.to(self.device)
+        self.clinical_model.eval()
+        print(f"✓ ClinicalDistilBERT loaded on {self.device}")
+        # Strict stopwords to filter false positives
+        self.stopwords = {
+            # Document structure
+            'hemolab', 'central', 'medicity', 'wellbeing', 'healthland',
+            'laboratory', 'health', 'ave', 'page',
+            # Metadata fields
+            'age', 'gender', 'email', 'sample', 'results', 'verified by',
+            'processing', 'details',
+            # Table headers
+            'test', 'result', 'unit', 'normal', 'range', 'status',
+            'normal range', 'result status',
+            # Section headers
+            'hematology', 'biochemistry', 'liver function', 'thyroid function',
+            'kidney function', 'lipid profile',
+            # Names (common in reports)
+            'john', 'doe', 'johnatan', 'emily', 'johnson', 'dr',
+            # Standalone numbers
+            '30', '123', '12345',
+        }
+        # Valid lab tests for NER filtering
+        self.valid_tests = {
+            'white blood cell count', 'wbc', 'red blood cell count', 'rbc',
+            'hemoglobin', 'hgb', 'hb', 'hematocrit', 'hct',
+            'platelet count', 'platelets', 'plt',
+            'mcv', 'mch', 'mchc',
+            'glucose', 'glu', 'creatinine', 'urea', 'bun',
+            'cholesterol', 'ldl', 'hdl', 'triglycerides',
+            'alt', 'ast', 'alp', 'bilirubin', 'albumin',
+            'tsh', 'ft4', 'free thyroxine', 'hba1c', 'a1c',
+            'sodium', 'potassium', 'calcium', 'chloride',
+            'aminotransferase', 'phosphatase',
+        }
+        # Targeted regex for structured lab values
+        self.lab_value_pattern = re.compile(
+            r'(White Blood Cell Count|Red Blood Cell Count|Hemoglobin|Hematocrit|'
+            r'Platelet Count|Glucose|Creatinine|Urea|Cholesterol|'
+            r'Alanine Aminotransferase|Aspartate Aminotransferase|'
+            r'Alkaline Phosphatase|Bilirubin|Albumin|'
+            r'Thyroid Stimulating Hormone|Free Thyroxine|'
+            r'WBC|RBC|HGB|HCT|PLT|ALT|AST|ALP|TSH|FT4|HbA1c)'
+            r'\s*[:\n]\s*'
+            r'(\d+\.?\d*)'
+            r'\s*'
+            r'([a-zA-Z/%^0-9]+)?',
+            re.IGNORECASE
+        )
+        # Status pattern for interpretations
+        self.status_pattern = re.compile(r'\b(Elevated|High|Low|Normal|Critical|Abnormal)\b')
+    def _normalize_test_name(self, name: str) -> str:
+        """Normalize test abbreviations to full names"""
+        name_lower = name.lower().strip()
+        mapping = {
+            'wbc': 'White Blood Cell Count',
+            'rbc': 'Red Blood Cell Count',
+            'hgb': 'Hemoglobin',
+            'hb': 'Hemoglobin',
+            'hct': 'Hematocrit',
+            'plt': 'Platelet Count',
+            'platelets': 'Platelet Count',
+            'glu': 'Glucose',
+            'alt': 'Alanine Aminotransferase',
+            'ast': 'Aspartate Aminotransferase',
+            'alp': 'Alkaline Phosphatase',
+            'tsh': 'Thyroid Stimulating Hormone',
+            'ft4': 'Free Thyroxine',
+        }
+        return mapping.get(name_lower, name)
+    def _calculate_status(self, test_name: str, value: float) -> Dict:
+        """Calculate test status and deviation from reference range"""
+        ref_range = REFERENCE_RANGES.get(test_name)
+        if not ref_range:
+            return {
+                "status": "unknown",
+                "deviation_percentage": 0.0,
+                "clinical_significance": "Reference range not available"
+            }
+        min_val, max_val = ref_range['min'], ref_range['max']
+        if value < min_val:
+            deviation = ((min_val - value) / min_val) * 100
+            status = "critical_low" if deviation > 50 else "low"
+            significance = f"Below normal range (↓{deviation:.1f}%)"
+        elif value > max_val:
+            deviation = ((value - max_val) / max_val) * 100
+            status = "critical_high" if deviation > 50 else "high"
+            significance = f"Above normal range (↑{deviation:.1f}%)"
+        else:
+            deviation = 0.0
+            status = "normal"
+            significance = "Within normal limits"
+        return {
+            "status": status,
+            "deviation_percentage": round(deviation, 2),
+            "clinical_significance": significance
+        }
+    def _get_clinical_embeddings(self, text: str) -> torch.Tensor:
+        """Get clinical embeddings using ClinicalDistilBERT"""
+        inputs = self.clinical_tokenizer(
+            text,
+            return_tensors="pt",
+            truncation=True,
+            max_length=512,
+            padding=True,
+            return_token_type_ids=False
+        ).to(self.device)
+        with torch.no_grad():
+            outputs = self.clinical_model(**inputs)
+            embeddings = outputs.last_hidden_state[:, 0, :]  # [CLS] token
+        return embeddings
+    def _generate_clinical_insights(self, text: str, abnormal_results: List[Dict],
+                                    diseases: Set[str], interpretations: Set[str]) -> Dict:
+        """Generate clinical insights using ClinicalDistilBERT"""
+        # Get embeddings
+        embeddings = self._get_clinical_embeddings(text[:512])
+        insights = {
+            "embedding_dimension": embeddings.shape[1],
+            "clinical_context_captured": True,
+            "embeddings_generated": True,
+            "diseases_detected": list(diseases),
+            "status_flags": list(interpretations),
+            "abnormality_patterns": [],
+            "clinical_relevance_score": 0.0
+        }
+        # Analyze patterns
+        if len(abnormal_results) > 0:
+            critical_count = sum(1 for r in abnormal_results if r.get('severity') == 'critical')
+            moderate_count = len(abnormal_results) - critical_count
+            relevance_score = min(100.0, (critical_count * 30.0) + (moderate_count * 10.0))
+            insights["clinical_relevance_score"] = round(relevance_score, 2)
+            insights["abnormality_patterns"].append(
+                f"Detected {len(abnormal_results)} abnormal parameter(s)"
+            )
+            if critical_count > 0:
+                insights["abnormality_patterns"].append(
+                    f"{critical_count} critical finding(s) require immediate attention"
+                )
+        else:
+            insights["clinical_relevance_score"] = 0.0
+            insights["abnormality_patterns"].append("All parameters within normal clinical ranges")
+        return insights
+    def _smart_ner_extraction(self, doc, extracted_test_names: Set[str]) -> tuple:
+        """Smart NER extraction with strict filtering"""
+        additional_tests = []
+        diseases = set()
+        interpretations = set()
+        ner_stats = defaultdict(int)
+        for ent in doc.ents:
+            ner_stats[ent.label_] += 1
+            if ent.label_ == 'TEST_NAME':
+                ent_lower = ent.text.lower()
+                # Skip if in stopwords
+                if ent_lower in self.stopwords:
+                    continue
+                # Skip if looks like date
+                if re.match(r'\d+/\d+/\d+', ent.text):
+                    continue
+                # Skip if just numbers
+                if re.match(r'^\d+$', ent.text):
+                    continue
+                # Skip if already extracted by regex
+                if ent_lower in extracted_test_names:
+                    continue
+                # Only add if contains valid medical keywords
+                if any(keyword in ent_lower for keyword in self.valid_tests):
+                    additional_tests.append({
+                        'testname': ent.text,
+                        'value': None,
+                        'unit': None,
+                        'source': 'ner'
+                    })
+            elif ent.label_ == 'DISEASE':
+                if ent.text.lower() not in self.stopwords:
+                    diseases.add(ent.text)
+            elif ent.label_ == 'INTERPRETATION':
+                interpretations.add(ent.text)
+        return additional_tests, diseases, interpretations, ner_stats
+    def extract_and_format(self, text: str, report_id: str = None, patient_id: str = None) -> Dict:
+        """Smart extraction using hybrid approach"""
+        start_time = time.time()
+        raw_tests = []
+        seen_tests = set()
+        # Step 1: Regex extraction (most reliable for structured data)
+        for match in self.lab_value_pattern.finditer(text):
+            test_name = self._normalize_test_name(match.group(1).strip())
+            try:
+                value = float(match.group(2))
+                unit = match.group(3) if match.group(3) else None
+                test_key = (test_name.lower(), value)
+                if test_key not in seen_tests:
+                    raw_tests.append({
+                        'testname': test_name,
+                        'value': value,
+                        'unit': unit,
+                        'source': 'regex'
+                    })
+                    seen_tests.add(test_key)
+            except:
+                continue
+        extracted_test_names = {t['testname'].lower() for t in raw_tests}
+        # Step 2: Smart NER extraction with filtering
+        doc = self.nlp(text)
+        additional_tests, diseases, interpretations, ner_stats = self._smart_ner_extraction(
+            doc, extracted_test_names
+        )
+        # Extract status flags from text
+        for match in self.status_pattern.finditer(text):
+            context = text[max(0, match.start()-10):match.end()+10]
+            if 'Range' not in context:  # Avoid "Normal Range"
+                interpretations.add(match.group(1))
+        # Collect entities for output
+        entities_for_output = []
+        for ent in doc.ents:
+            entities_for_output.append({
+                "text": ent.text,
+                "label": ent.label_,
+                "start_char": ent.start_char,
+                "end_char": ent.end_char,
+                "confidence": 0.92
+            })
+        # Step 3: Build test results with reference ranges
+        test_results = []
+        abnormal_results = []
+        for test in raw_tests:
+            test_name = test['testname']
+            value = test['value']
+            unit = test['unit']
+            ref_range = REFERENCE_RANGES.get(test_name, {})
+            status_info = self._calculate_status(test_name, value)
+            test_result = {
+                "test_name": test_name,
+                "value": value,
+                "unit": unit or ref_range.get('unit', ''),
+                "reference_range": {
+                    "min": ref_range.get('min'),
+                    "max": ref_range.get('max'),
+                    "unit": ref_range.get('unit', unit or '')
+                } if ref_range else None,
+                "status": status_info['status'],
+                "deviation_percentage": status_info['deviation_percentage'],
+                "clinical_significance": status_info['clinical_significance'],
+                "trend": None,
+                "source": test['source']
+            }
+            test_results.append(test_result)
+            if status_info['status'] in ['low', 'high', 'critical_low', 'critical_high']:
+                severity = "critical" if 'critical' in status_info['status'] else "moderate"
+                abnormal_results.append({
+                    "test_name": test_name,
+                    "severity": severity,
+                    "requires_attention": True
+                })
+        # Step 4: Generate summaries and insights
+        ai_summary = self._generate_summary(test_results, abnormal_results)
+        test_panels = self._group_into_panels(test_results)
+        visualization_data = self._generate_visualization_data(test_results)
+        # Step 5: Generate clinical insights with ClinicalDistilBERT
+        clinical_insights = self._generate_clinical_insights(
+            text, abnormal_results, diseases, interpretations
+        )
+        processing_time = int((time.time() - start_time) * 1000)
+        return {
+            "report_id": report_id or f"rep_{int(time.time())}",
+            "report_type": "laboratory",
+            "processing_time_ms": processing_time,
+            "classification": {
+                "test_category": self._determine_category(test_results),
+                "sub_category": "complete_blood_count",
+                "urgency_level": "critical" if any(r['severity'] == 'critical' for r in abnormal_results) else "abnormal" if abnormal_results else "routine",
+                "confidence": 0.96
+            },
+            "extraction_stats": {
+                "tests_with_values": len(test_results),
+                "additional_tests_found": len(additional_tests),
+                "diseases_detected": len(diseases),
+                "interpretations_found": len(interpretations),
+                "ner_model_stats": dict(ner_stats)
+            },
+            "entities": entities_for_output[:20],
+            "test_results": test_results,
+            "abnormal_results": abnormal_results,
+            "ai_summary": ai_summary,
+            "clinical_insights": clinical_insights,
+            "test_panels": test_panels,
+            "visualization_data": visualization_data,
+            "metadata": {
+                "model_version": "radiolo_smart_ner_v2.0.0",
+                "processing_date": datetime.utcnow().isoformat() + "Z",
+                "tests_extracted": len(test_results),
+                "confidence_score": 0.94,
+                "nlp_models": {
+                    "ner": "Custom Lab NER (Smart Filtered)",
+                    "clinical_bert": "ClinicalDistilBERT",
+                    "extraction_method": "Hybrid (Regex + Filtered NER)"
+                }
+            }
+        }
+    def _determine_category(self, test_results: List[Dict]) -> str:
+        test_names = {t['test_name'].lower() for t in test_results}
+        if any('blood cell' in name or name in ['hemoglobin', 'hematocrit', 'platelet'] for name in test_names):
+            return "hematology"
+        elif any(name in ['alanine aminotransferase', 'aspartate aminotransferase', 'alkaline phosphatase', 'bilirubin', 'albumin'] for name in test_names):
+            return "liver_function"
+        elif any('thyroid' in name or name in ['thyroid stimulating hormone', 'free thyroxine'] for name in test_names):
+            return "thyroid_function"
+        else:
+            return "general_chemistry"
+    def _generate_summary(self, test_results: List[Dict], abnormal_results: List[Dict]) -> Dict:
+        normal_tests = [t['test_name'] for t in test_results if t['status'] == 'normal']
+        abnormal_tests = [a['test_name'] for a in abnormal_results]
+        if not abnormal_tests:
+            overall = "All test results are within normal limits."
+            recommendations = ["No immediate action required", "Continue regular health monitoring"]
+        else:
+            overall = f"Detected {len(abnormal_tests)} abnormal result(s). {len(normal_tests)} parameters within normal limits."
+            recommendations = [
+                "Correlate with clinical symptoms",
+                "Consider follow-up testing if symptoms persist",
+                "Consult with healthcare provider for interpretation"
+            ]
+        key_abnormalities = []
+        for result in abnormal_results:
+            test_detail = next((t for t in test_results if t['test_name'] == result['test_name']), None)
+            if test_detail:
+                key_abnormalities.append(
+                    f"{result['test_name']}: {test_detail['clinical_significance']}"
+                )
+        return {
+            "overall_assessment": overall,
+            "key_abnormalities": key_abnormalities,
+            "normal_parameters": normal_tests,
+            "recommendations": recommendations
+        }
+    def _group_into_panels(self, test_results: List[Dict]) -> List[Dict]:
+        panels = defaultdict(list)
+        cbc_tests = {'White Blood Cell Count', 'Red Blood Cell Count', 'Hemoglobin', 'Hematocrit', 'Platelet Count'}
+        liver_tests = {'Alanine Aminotransferase', 'Aspartate Aminotransferase', 'Alkaline Phosphatase', 'Bilirubin', 'Albumin'}
+        thyroid_tests = {'Thyroid Stimulating Hormone', 'Free Thyroxine'}
+        for test in test_results:
+            name = test['test_name']
+            if name in cbc_tests:
+                panels['Complete Blood Count'].append(test)
+            elif name in liver_tests:
+                panels['Liver Function Panel'].append(test)
+            elif name in thyroid_tests:
+                panels['Thyroid Function Panel'].append(test)
+            else:
+                panels['General Chemistry'].append(test)
+        panel_list = []
+        for panel_name, tests in panels.items():
+            abnormal_count = sum(1 for t in tests if t['status'] != 'normal')
+            panel_list.append({
+                "panel_name": panel_name,
+                "tests_included": [t['test_name'] for t in tests],
+                "panel_status": "abnormal" if abnormal_count > 0 else "normal",
+                "abnormal_count": abnormal_count,
+                "total_tests": len(tests)
+            })
+        return panel_list
+    def _generate_visualization_data(self, test_results: List[Dict]) -> Dict:
+        chart_data = []
+        for test in test_results:
+            if test['reference_range']:
+                chart_data.append({
+                    "test": test['test_name'],
+                    "value": test['value'],
+                    "ref_min": test['reference_range']['min'],
+                    "ref_max": test['reference_range']['max']
+                })
+        return {
+            "charts": [{
+                "chart_type": "bar",
+                "title": "Lab Results vs Reference Range",
+                "data": chart_data
+            }],
+            "trend_data": []
+        }

app/main.py ADDED Viewed

	@@ -0,0 +1,288 @@

+from fastapi import FastAPI, HTTPException, Request, File, UploadFile
+from fastapi.responses import JSONResponse
+from fastapi.middleware.cors import CORSMiddleware
+from starlette.middleware.gzip import GZipMiddleware
+import time
+import os
+import gzip
+import base64
+import json
+from .text_extractor import extract_text_from_pdf, extract_text_from_image
+from .image_extractor import extract_images_from_pdf
+from .lab_processor import RadioloLabProcessor
+from .models import EncryptedRequest
+from .crypto_utils import CryptoManager
+from dotenv import load_dotenv
+load_dotenv()
+# FastAPI application; the interactive docs (Swagger UI and ReDoc) are disabled.
+app = FastAPI(
+    title="Medical Lab Report Analysis API",
+    description="Extract structured lab test data from medical reports using NER + Regex with end-to-end encryption",
+    version="2.0.0",
+    docs_url=None,
+    redoc_url=None
+)
+# NOTE(review): every origin, method and header is allowed together with
+# credentials - confirm this permissive CORS policy is intended outside testing.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+# Compress responses of at least 1000 bytes
+app.add_middleware(GZipMiddleware, minimum_size=1000)
+# Set by startup_event(); stays None until the NER model has loaded
+lab_processor = None
+# Built at import time: CryptoManager raises ValueError if ENCRYPTION_KEY is
+# missing or malformed, so the module fails to import without a valid key.
+SECRET_KEY = os.getenv("ENCRYPTION_KEY")
+crypto_manager = CryptoManager(SECRET_KEY)
+@app.on_event("startup")
+async def startup_event():
+    global lab_processor
+    print("\n" + "=" * 70)
+    print("MEDICAL LAB REPORT ANALYSIS API - STARTING UP")
+    print("=" * 70)
+    model_path = os.getenv("LAB_NER_MODEL_PATH", "./models/radiolo_clinic_ner")
+    print(f"\nLoading Lab NER model from: {model_path}")
+    if not os.path.exists(model_path):
+        print(f"✗ ERROR: Model not found at {model_path}")
+        raise RuntimeError("Lab NER model not found")
+    try:
+        lab_processor = RadioloLabProcessor(model_path)
+        print("✅ API READY!")
+        print("=" * 70 + "\n")
+    except Exception as e:
+        print(f"✗ FATAL ERROR: Failed to load model: {e}")
+        raise
+@app.on_event("shutdown")
+async def shutdown_event():
+    """Print a shutdown notice; no resources are released here."""
+    print("\nAPI SHUTTING DOWN\n")
+@app.get("/")
+async def root():
+    return {
+        "status": "online",
+        "api": "Medical Lab Report Analysis API",
+        "version": "2.0.0",
+        "model_loaded": lab_processor is not None,
+        "features": {
+            "encryption": "NaCl (XSalsa20-Poly1305)",
+            "compression": "gzip",
+            "ocr_engine": "EasyOCR",
+            "ner_model": "Custom Lab NER",
+            "supported_tests": 16
+        },
+        "endpoints": {
+            "health": "/health",
+            "analyze": "/analyze-lab-secure",
+            "test": "/test-analyze"  # NEW
+        },
+        "supported_formats": ["pdf", "image"],
+        "supported_lab_tests": [
+            "Complete Blood Count (WBC, RBC, Hemoglobin, Hematocrit, Platelets)",
+            "Liver Function (ALT, AST, ALP, Bilirubin, Albumin)",
+            "Thyroid Function (TSH, Free T4)",
+            "Metabolic Panel (Glucose, Creatinine, Urea, Cholesterol)"
+        ]
+    }
+@app.get("/health")
+async def health_check():
+    return {
+        "status": "healthy",
+        "model_loaded": lab_processor is not None,
+        "model_type": "Lab Report NER",
+        "ocr_engine": "EasyOCR",
+        "encryption": "NaCl (XSalsa20-Poly1305)",
+        "compression": "gzip",
+        "version": "2.0.0",
+        "supported_tests": 16
+    }
+# ============================================================================
+# NEW: UNENCRYPTED TEST ENDPOINT (for testing only)
+# ============================================================================
+@app.post("/test-analyze", tags=["Testing"])
+async def test_analyze(file: UploadFile = File(...)):
+    """
+    Test endpoint without encryption - upload file directly
+    ⚠️ WARNING: For testing only! No encryption!
+    """
+    start_time = time.time()
+    try:
+        if not lab_processor:
+            raise HTTPException(status_code=503, detail="Lab processor not loaded")
+        # Read uploaded file
+        file_bytes = await file.read()
+        filename = file.filename
+        print(f"\n📄 Processing test file: {filename} ({len(file_bytes)} bytes)")
+        # Determine file type from extension
+        if filename.lower().endswith('.pdf'):
+            file_type = "pdf"
+            extracted_text, ocr_used = extract_text_from_pdf(file_bytes)
+            images = extract_images_from_pdf(file_bytes)
+        elif filename.lower().endswith(('.jpg', '.jpeg', '.png', '.tiff', '.bmp')):
+            file_type = "image"
+            extracted_text = extract_text_from_image(file_bytes)
+            ocr_used = True
+            images = []
+        else:
+            raise HTTPException(status_code=400, detail="Unsupported file type. Use PDF or image files.")
+        if not extracted_text or len(extracted_text.strip()) < 10:
+            raise HTTPException(status_code=400, detail="Could not extract sufficient text from file")
+        print(f"✓ Extracted {len(extracted_text)} characters (OCR: {ocr_used})")
+        # Process with lab processor
+        print("🧠 Processing with NER + ClinicalDistilBERT...")
+        lab_analysis = lab_processor.extract_and_format(
+            extracted_text,
+            report_id=f"test_{int(time.time())}",
+            patient_id="TEST_PATIENT"
+        )
+        processing_time = time.time() - start_time
+        print(f"✅ Processing complete in {processing_time:.2f}s")
+        print(f"   Tests extracted: {lab_analysis.get('metadata', {}).get('tests_extracted', 0)}\n")
+        # Return unencrypted response
+        response_data = {
+            "status": "success",
+            "processing_time": round(processing_time, 3),
+            "filename": filename,
+            "input_type": file_type,
+            "ocr_used": ocr_used,
+            "ocr_engine": "EasyOCR" if ocr_used else "PyMuPDF",
+            "raw_text_preview": extracted_text[:500] + "..." if len(extracted_text) > 500 else extracted_text,
+            "text_length": len(extracted_text),
+            "images": images,
+            **lab_analysis
+        }
+        return response_data
+    except HTTPException as he:
+        raise he
+    except Exception as e:
+        import traceback
+        print(f"❌ Error: {e}")
+        traceback.print_exc()
+        raise HTTPException(status_code=500, detail=f"Internal error: {str(e)}")
+# ============================================================================
+# ENCRYPTED ENDPOINT (production)
+# ============================================================================
+@app.post("/analyze-lab-secure", tags=["Lab Analysis"])
+async def analyze_lab_secure(request: EncryptedRequest):
+    start_time = time.time()
+    try:
+        if not lab_processor:
+            raise HTTPException(status_code=503, detail="Lab processor not loaded")
+        # Decrypt request
+        decrypted_data = crypto_manager.decrypt(request.ciphertext, request.nonce)
+        compressed_b64 = decrypted_data.decode('utf-8')
+        compressed_bytes = base64.b64decode(compressed_b64)
+        decompressed_data = gzip.decompress(compressed_bytes)
+        payload = json.loads(decompressed_data.decode('utf-8'))
+        filename = payload.get('filename', 'unknown')
+        file_data_b64 = payload['file_data']
+        file_type = payload['file_type']
+        file_bytes = base64.b64decode(file_data_b64)
+        # Extract text
+        if file_type == "pdf":
+            extracted_text, ocr_used = extract_text_from_pdf(file_bytes)
+            if not extracted_text or len(extracted_text.strip()) < 10:
+                raise HTTPException(status_code=400, detail="Could not extract text from PDF")
+            images = extract_images_from_pdf(file_bytes)
+        elif file_type == "image":
+            extracted_text = extract_text_from_image(file_bytes)
+            ocr_used = True
+            images = []
+            if not extracted_text or len(extracted_text.strip()) < 10:
+                raise HTTPException(status_code=400, detail="Could not extract text from image")
+        else:
+            raise HTTPException(status_code=400, detail="Invalid file_type. Must be 'pdf' or 'image'")
+        # Process with lab processor
+        lab_analysis = lab_processor.extract_and_format(
+            extracted_text,
+            report_id=f"lab_{int(time.time())}",
+            patient_id=payload.get('patient_id', 'unknown')
+        )
+        processing_time = time.time() - start_time
+        response_data = {
+            "status": "success",
+            "processing_time": round(processing_time, 3),
+            "filename": filename,
+            "input_type": file_type,
+            "ocr_used": ocr_used,
+            "ocr_engine": "EasyOCR" if ocr_used else "PyMuPDF",
+            "raw_text": extracted_text[:500] + "..." if len(extracted_text) > 500 else extracted_text,
+            "text_length": len(extracted_text),
+            "images": images,
+            **lab_analysis
+        }
+        # Encrypt response
+        encrypted_response = crypto_manager.encrypt_json(response_data)
+        return {
+            "status": "success",
+            "ciphertext": encrypted_response['ciphertext'],
+            "nonce": encrypted_response['nonce']
+        }
+    except HTTPException as he:
+        raise he
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Internal error: {str(e)}")
+@app.exception_handler(404)
+async def not_found_handler(request: Request, exc):
+    return JSONResponse(
+        status_code=404,
+        content={
+            "status": "error",
+            "message": "Endpoint not found",
+            "available_endpoints": ["/", "/health", "/test-analyze", "/analyze-lab-secure"]
+        }
+    )
+@app.exception_handler(500)
+async def internal_error_handler(request: Request, exc):
+    return JSONResponse(
+        status_code=500,
+        content={
+            "status": "error",
+            "message": "Internal server error",
+            "error_type": type(exc).__name__
+        }
+    )
+if __name__ == "__main__":
+    import uvicorn
+    host = os.getenv("HOST", "0.0.0.0")
+    port = int(os.getenv("PORT", 7860))
+    uvicorn.run("app.main:app", host=host, port=port, reload=False, log_level="info")

app/models.py ADDED Viewed

	@@ -0,0 +1,83 @@

+"""
+Pydantic models for request/response validation
+"""
+from pydantic import BaseModel, Field
+from typing import List, Dict, Optional
+class TextRequest(BaseModel):
+    """Request model for text-only analysis"""
+    # Required free-text body; values shorter than 10 characters fail validation.
+    # NOTE(review): the description and example below talk about radiology
+    # reports, while the package describes itself as a lab report API — confirm
+    # this model is still in use.
+    text: str = Field(..., min_length=10, description="Radiology report text")
+    # Example payload attached to the model's JSON schema.
+    class Config:
+        json_schema_extra = {
+            "example": {
+                "text": "FINDINGS: The cardiac silhouette is within normal limits. The lungs are clear. No pleural effusion or pneumothorax."
+            }
+        }
+class Entity(BaseModel):
+    """Individual entity detected by NER"""
+    # Surface text of the entity and the label assigned to it.
+    text: str
+    label: str
+    # Span boundaries — presumably character offsets into the analysed text; TODO confirm.
+    start: int
+    end: int
+    # Falls back to 0.99 when the producer supplies no score.
+    confidence: float = 0.99
+class StructuredReport(BaseModel):
+    """Structured representation of report findings"""
+    # All five fields are required lists of strings; none has a default, so an
+    # empty category must be sent as an explicit empty list.
+    anatomy: List[str]
+    all_observations: List[str]
+    positive_findings: List[str]
+    negative_findings: List[str]
+    critical_findings: List[str]
+class Summary(BaseModel):
+    """Summary statistics of the analysis"""
+    # Required counters.
+    total_entities: int
+    anatomy_count: int
+    observations_count: int
+    # Required boolean flags.
+    has_critical_findings: bool
+    has_abnormalities: bool
+class ImageData(BaseModel):
+    """Extracted image from PDF"""
+    # Page the image came from — NOTE(review): 0- or 1-based is not visible here; confirm.
+    page: int
+    # Image format name and pixel dimensions.
+    format: str
+    width: int
+    height: int
+    data: str  # base64 encoded
+class AnalysisResponse(BaseModel):
+    """Complete analysis response"""
+    # Request outcome and processing metadata.
+    status: str
+    processing_time: float
+    input_type: str
+    ocr_used: bool
+    # Optional; defaults to None when no engine name is supplied.
+    ocr_engine: Optional[str] = None
+    # Extracted text and its length.
+    raw_text: str
+    text_length: int
+    # Analysis results, expressed with the models defined in this module.
+    entities: List[Entity]
+    structured_report: StructuredReport
+    summary: Summary
+    recommendations: List[str]
+    # Optional; defaults to None when no images accompany the response.
+    images: Optional[List[ImageData]] = None
+class EncryptedRequest(BaseModel):
+    """Encrypted and compressed file request"""
+    # Both fields are required strings.
+    # NOTE(review): the example values look base64-encoded — confirm the
+    # expected encoding against the crypto utilities.
+    ciphertext: str
+    nonce: str
+    # Example payload attached to the model's JSON schema.
+    class Config:
+        json_schema_extra = {
+            "example": {
+                "ciphertext": "mJXnK8p9VGhpN...",
+                "nonce": "Y2FzZGFzZGFzZA=="
+            }
+        }
+class EncryptedResponse(BaseModel):
+    """Encrypted response"""
+    # Encrypted payload and the nonce that accompanies it.
+    ciphertext: str
+    nonce: str
+    # Defaults to "success" when not set explicitly.
+    status: str = "success"

app/text_extractor.py ADDED Viewed

	@@ -0,0 +1,134 @@

+"""
+Text extraction from PDFs and images using EasyOCR
+Smart extraction: tries text layer first, falls back to OCR
+"""
+import fitz  # PyMuPDF
+import easyocr
+from PIL import Image
+from pdf2image import convert_from_bytes
+import io
+import numpy as np
+from typing import Tuple, Optional
+# Build one shared EasyOCR reader at import time (English only, CPU, quiet).
+# If construction fails for any reason, `reader` is left as None instead of
+# aborting the import; the functions below check for that before using it.
+print("Initializing EasyOCR Reader...")
+try:
+    reader = easyocr.Reader(['en'], gpu=False, verbose=False)
+    print("✓ EasyOCR Reader initialized successfully")
+except Exception as e:
+    print(f"✗ EasyOCR initialization failed: {e}")
+    reader = None
+def extract_text_from_pdf(pdf_bytes: bytes) -> Tuple[Optional[str], bool]:
+    """
+    Extract text from PDF with smart OCR fallback
+    Returns:
+        (extracted_text, ocr_used)
+    """
+    if not pdf_bytes:
+        return None, False
+    try:
+        # Try extracting text layer first (fast)
+        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
+        full_text = ""
+        for page in doc:
+            full_text += page.get_text()
+        doc.close()
+        # Check if meaningful text was extracted
+        if len(full_text.strip()) > 50:
+            print(f"✓ Extracted {len(full_text)} chars from text layer")
+            return full_text.strip(), False
+        # No text layer - use OCR
+        print("⚠ No text layer detected, using EasyOCR...")
+        text = extract_text_from_pdf_via_ocr(pdf_bytes)
+        return text, True
+    except Exception as e:
+        print(f"✗ Error in PDF text extraction: {e}")
+        return None, False
+def extract_text_from_pdf_via_ocr(pdf_bytes: bytes) -> Optional[str]:
+    """
+    Extract text using EasyOCR on PDF pages converted to images
+    """
+    if not reader:
+        raise RuntimeError("EasyOCR not initialized")
+    try:
+        # Convert PDF to images
+        images = convert_from_bytes(pdf_bytes, dpi=300)
+        full_text = ""
+        for i, image in enumerate(images):
+            print(f"   OCR processing page {i+1}/{len(images)}...")
+            # Convert PIL to numpy array
+            img_array = np.array(image)
+            # Run EasyOCR
+            results = reader.readtext(img_array, detail=0, paragraph=True)
+            page_text = ' '.join(results)
+            full_text += page_text + "\n\n"
+        print(f"✓ EasyOCR extracted {len(full_text)} chars from {len(images)} pages")
+        return full_text.strip()
+    except Exception as e:
+        print(f"✗ OCR failed: {e}")
+        return None
+def extract_text_from_image(image_bytes: bytes) -> Optional[str]:
+    """
+    Extract text from image file using EasyOCR
+    """
+    if not reader:
+        raise RuntimeError("EasyOCR not initialized")
+    try:
+        print("Processing image with EasyOCR...")
+        # Open and prepare image
+        image = Image.open(io.BytesIO(image_bytes))
+        if image.mode != 'RGB':
+            image = image.convert('RGB')
+        # Convert to numpy
+        img_array = np.array(image)
+        # Run EasyOCR
+        results = reader.readtext(img_array, detail=0, paragraph=True)
+        text = ' '.join(results)
+        print(f"✓ EasyOCR extracted {len(text)} chars from image")
+        return text.strip()
+    except Exception as e:
+        print(f"✗ Image OCR failed: {e}")
+        return None
+def get_ocr_confidence(image_array: np.ndarray) -> list:
+    """
+    Get detailed OCR results with confidence scores
+    """
+    if not reader:
+        return []
+    try:
+        results = reader.readtext(image_array, detail=1)
+        return [
+            {
+                "text": text,
+                "confidence": round(conf, 3),
+                "bbox": bbox
+            }
+            for bbox, text, conf in results
+        ]
+    except:
+        return []

requirements.txt ADDED Viewed

	@@ -0,0 +1,41 @@

+fastapi==0.115.4
+uvicorn[standard]==0.32.0
+python-multipart==0.0.19
+starlette==0.41.3
+# pip install fastapi uvicorn python-multipart starlette
+PyNaCl==1.5.0
+python-dotenv==1.0.1
+# pip install PyNaCl python-dotenv
+PyMuPDF==1.24.13
+Pillow==11.0.0
+easyocr==1.7.2
+opencv-python-headless==4.10.0.84
+# pip install PyMuPDF Pillow easyocr opencv-python-headless
+spacy==3.8.2
+transformers==4.46.3
+torch==2.5.1
+sentencepiece==0.2.0
+# pip install spacy transformers torch sentencepiece
+easyocr
+pdf2image
+# pip install easyocr pdf2image
+# Utilities
+numpy<2.0
+pydantic==2.9.2
+pydantic-settings==2.6.1
+aiofiles==24.1.0
+# pip install pydantic pydantic-settings aiofiles python-json-logger
+# Monitoring & Logging
+python-json-logger==3.2.1
+# Testing (optional, for development)
+pytest==8.3.3
+pytest-asyncio==0.24.0
+httpx==0.28.0
+# pip install pytest pytest-asyncio httpx