# Spaces: snikhilesh / medical-report-analyzer (Running) - File size: 12,895 Bytes - 13d5ab4

"""
Document Classifier - Layer 1: Medical Document Classification with Real AI Models
Routes documents to appropriate specialized models using Bio_ClinicalBERT
"""

import logging
from typing import Dict, List, Any, Optional
import re
from model_loader import get_model_loader

logger = logging.getLogger(__name__)


class DocumentClassifier:
    """
    Classifies medical documents into types for intelligent routing
    
    Supported document types:
    - Radiology Report
    - Pathology Report
    - Laboratory Results
    - Clinical Notes
    - Discharge Summary
    - ECG/Cardiology Report
    - Operative Note
    - Medication List
    - Consultation Note
    """
    
    def __init__(self):
        """
        Obtain the model loader and build the static classification tables.

        Attributes set:
            model_loader: whatever get_model_loader() returns
            document_types: every label a classification result may carry
            classification_keywords: per-type keyword lists used by the
                keyword-based fallback classifier
        """
        self.model_loader = get_model_loader()

        # "unknown" is the catch-all label used when nothing else applies.
        self.document_types = [
            "radiology", "pathology", "laboratory",
            "clinical_notes", "discharge_summary", "cardiology",
            "operative_note", "medication_list", "consultation",
            "unknown",
        ]

        # Per-type keyword lists for the fallback classifier.
        radiology_terms = [
            "ct scan", "mri", "x-ray", "radiograph", "ultrasound", "imaging",
            "radiology", "chest xray", "chest x-ray", "ct", "pet scan",
            "mammogram", "fluoroscopy",
        ]
        pathology_terms = [
            "pathology", "biopsy", "histopathology", "cytology", "tissue",
            "slide", "specimen", "microscopic", "immunohistochemistry",
            "tumor grade", "malignant",
        ]
        laboratory_terms = [
            "lab results", "laboratory", "complete blood count", "cbc",
            "chemistry panel", "metabolic panel", "lipid panel", "glucose",
            "hemoglobin", "platelet", "wbc", "rbc", "test results",
            "reference range",
        ]
        cardiology_terms = [
            "ecg", "ekg", "electrocardiogram", "echo", "echocardiogram",
            "stress test", "cardiac", "heart", "arrhythmia",
            "ejection fraction", "coronary", "myocardial",
        ]
        discharge_terms = [
            "discharge summary", "discharge diagnosis", "hospital course",
            "admission date", "discharge date", "discharge medications",
            "discharge instructions", "follow-up",
        ]
        operative_terms = [
            "operative note", "operation", "surgery", "surgical procedure",
            "procedure performed", "anesthesia", "incision",
            "operative findings", "post-operative", "surgeon",
        ]
        medication_terms = [
            "medication list", "current medications", "prescriptions",
            "drug list", "rx", "dosage", "frequency",
        ]
        consultation_terms = [
            "consultation", "consulted", "specialist", "referred",
            "opinion", "evaluation", "assessment and plan",
        ]

        # Insertion order is significant: when two types score the same, the
        # one listed first here ranks first in the keyword classifier.
        # NOTE(review): there is no "clinical_notes" entry, so a lookup driven
        # purely by this table can never yield that type - confirm intended.
        self.classification_keywords = {
            "radiology": radiology_terms,
            "pathology": pathology_terms,
            "laboratory": laboratory_terms,
            "cardiology": cardiology_terms,
            "discharge_summary": discharge_terms,
            "operative_note": operative_terms,
            "medication_list": medication_terms,
            "consultation": consultation_terms,
        }

        logger.info("Document Classifier initialized")
    
    async def classify(self, pdf_content: Dict[str, Any]) -> Dict[str, Any]:
        """
        Classify medical document using AI model + keyword fallback

        Args:
            pdf_content: parsed document mapping. Only its "text" entry is
                read here; the whole mapping is forwarded to
                _generate_routing_hints.

        Returns:
            Classification result with:
            - document_type: primary classification
            - confidence: confidence score of the method that was chosen
            - secondary_types: up to three other possible classifications,
              in a stable order and never repeating document_type
            - routing_hints: suggestions for model routing
            - classification_method: "ai_model" or "keyword_based"
            - ai_confidence / keyword_confidence: per-method scores

            If anything raises, a result with document_type "unknown",
            confidence 0.0, generic routing hints and an "error" message is
            returned instead of propagating the exception.
        """
        try:
            # A missing or None "text" entry is treated as an empty document.
            text = pdf_content.get("text") or ""

            # The AI model only sees the first 1000 characters; the keyword
            # classifier sees the whole (lower-cased) text.
            ai_result = await self._ai_classification(text[:1000])
            keyword_result = self._keyword_classification(text.lower())

            ai_type = ai_result.get("document_type", "unknown")
            ai_confidence = ai_result.get("confidence", 0)

            # The AI result wins only when it is confident AND names a real
            # type; a confident "unknown" must not shadow the keyword result.
            if ai_confidence > 0.6 and ai_type != "unknown":
                primary_type = ai_type
                confidence = ai_confidence
                method = "ai_model"
            else:
                primary_type = keyword_result["document_type"]
                confidence = keyword_result["confidence"]
                method = "keyword_based"
                # Zero confidence means no keyword matched at all, so the
                # reported type would carry no evidence.
                if confidence <= 0:
                    primary_type = "unknown"

            # Merge secondary types from both methods: AI first, duplicates
            # removed in first-seen order, primary type excluded.
            candidates = (
                ai_result.get("secondary_types", [])
                + keyword_result.get("secondary_types", [])
            )
            secondary_types = [
                doc_type
                for doc_type in dict.fromkeys(candidates)
                if doc_type != primary_type
            ][:3]

            routing_hints = self._generate_routing_hints(
                primary_type,
                secondary_types,
                pdf_content
            )

            result = {
                "document_type": primary_type,
                "confidence": confidence,
                "secondary_types": secondary_types,
                "routing_hints": routing_hints,
                "classification_method": method,
                "ai_confidence": ai_confidence,
                "keyword_confidence": keyword_result.get("confidence", 0)
            }

            logger.info(
                "Document classified as: %s (confidence: %.2f, method: %s)",
                primary_type, confidence, method
            )

            return result

        except Exception as e:
            logger.error("Classification failed: %s", e)
            return {
                "document_type": "unknown",
                "confidence": 0.0,
                "secondary_types": [],
                "routing_hints": {
                    # "models" is the key this fallback has always carried;
                    # the remaining keys mirror the normal routing-hint shape
                    # so consumers can read either form.
                    "models": ["general"],
                    "primary_models": ["general"],
                    "secondary_models": [],
                    "extract_images": False,
                    "extract_tables": False,
                    "priority": "standard"
                },
                "error": str(e)
            }
    
    async def _ai_classification(self, text: str) -> Dict[str, Any]:
        """Use Bio_ClinicalBERT for document classification"""
        try:
            # Use model loader for classification
            import asyncio
            loop = asyncio.get_event_loop()
            
            result = await loop.run_in_executor(
                None,
                lambda: self.model_loader.run_inference(
                    "document_classifier",
                    text,
                    {}
                )
            )
            
            if result.get("success") and result.get("result"):
                model_output = result["result"]
                
                # Handle different output formats
                if isinstance(model_output, list) and len(model_output) > 0:
                    top_prediction = model_output[0]
                    
                    # Map model labels to our document types
                    label = top_prediction.get("label", "").lower()
                    score = top_prediction.get("score", 0.5)
                    
                    # Map common labels to document types
                    label_mapping = {
                        "radiology": "radiology",
                        "pathology": "pathology",
                        "laboratory": "laboratory",
                        "lab": "laboratory",
                        "cardiology": "cardiology",
                        "clinical": "clinical_notes",
                        "discharge": "discharge_summary",
                        "operative": "operative_note",
                        "surgery": "operative_note",
                        "medication": "medication_list",
                        "consultation": "consultation"
                    }
                    
                    doc_type = "unknown"
                    for key, value in label_mapping.items():
                        if key in label:
                            doc_type = value
                            break
                    
                    # Get secondary types from other predictions
                    secondary_types = []
                    for pred in model_output[1:4]:
                        sec_label = pred.get("label", "").lower()
                        for key, value in label_mapping.items():
                            if key in sec_label and value != doc_type:
                                secondary_types.append(value)
                                break
                    
                    return {
                        "document_type": doc_type,
                        "confidence": score,
                        "secondary_types": secondary_types
                    }
            
            # Fallback if model doesn't return expected format
            return {"document_type": "unknown", "confidence": 0.0, "secondary_types": []}
            
        except Exception as e:
            logger.warning(f"AI classification failed: {str(e)}, falling back to keywords")
            return {"document_type": "unknown", "confidence": 0.0, "secondary_types": []}
    
    def _keyword_classification(self, text: str) -> Dict[str, Any]:
        """Keyword-based classification as fallback"""
        # Score each document type
        scores = {}
        for doc_type, keywords in self.classification_keywords.items():
            score = self._calculate_type_score(text, keywords)
            scores[doc_type] = score
        
        # Get top classifications
        sorted_types = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        
        primary_type = sorted_types[0][0] if sorted_types else "unknown"
        primary_score = sorted_types[0][1] if sorted_types else 0.0
        
        # Confidence calculation
        confidence = min(primary_score / 10.0, 1.0)  # Normalize to 0-1
        
        # Secondary types (score > 3)
        secondary_types = [
            doc_type for doc_type, score in sorted_types[1:4]
            if score > 3
        ]
        
        return {
            "document_type": primary_type,
            "confidence": confidence,
            "secondary_types": secondary_types
        }
    
    def _calculate_type_score(self, text: str, keywords: List[str]) -> float:
        """Calculate relevance score for a document type"""
        score = 0.0
        
        for keyword in keywords:
            # Count occurrences (weighted by keyword importance)
            count = text.count(keyword.lower())
            
            # Keyword at beginning of document = higher weight
            if keyword.lower() in text[:500]:
                score += count * 2
            else:
                score += count
        
        return score
    
    def _generate_routing_hints(
        self,
        primary_type: str,
        secondary_types: List[str],
        pdf_content: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Generate hints for intelligent model routing
        """
        hints = {
            "primary_models": [],
            "secondary_models": [],
            "extract_images": False,
            "extract_tables": False,
            "priority": "standard"
        }
        
        # Map document types to model domains
        type_to_models = {
            "radiology": ["radiology_vqa", "report_generation", "segmentation"],
            "pathology": ["pathology_classification", "slide_analysis"],
            "laboratory": ["lab_normalization", "result_interpretation"],
            "cardiology": ["ecg_analysis", "cardiac_imaging"],
            "discharge_summary": ["clinical_summarization", "coding_extraction"],
            "operative_note": ["procedure_extraction", "coding"],
            "clinical_notes": ["clinical_ner", "summarization"],
            "consultation": ["clinical_ner", "diagnosis_extraction"],
            "medication_list": ["medication_extraction", "drug_interaction"]
        }
        
        # Set primary models
        hints["primary_models"] = type_to_models.get(primary_type, ["general"])
        
        # Set secondary models
        for sec_type in secondary_types:
            if sec_type in type_to_models:
                hints["secondary_models"].extend(type_to_models[sec_type])
        
        # Special processing hints
        if primary_type == "radiology":
            hints["extract_images"] = True
            hints["priority"] = "high"
        
        if primary_type == "laboratory":
            hints["extract_tables"] = True
        
        if primary_type == "pathology":
            hints["extract_images"] = True
        
        # Check if document has images
        if pdf_content.get("images"):
            hints["has_images"] = True
        
        # Check if document has tables
        if pdf_content.get("tables"):
            hints["has_tables"] = True
        
        return hints