Spaces:

moazx
/

Lung-Cancer-AI-Advisor

Sleeping

File size: 19,313 Bytes

ddc9c77

"""
Medical Terminology Module with Dynamic Learning

This module provides intelligent handling of medical linguistic variability including:
- Synonyms and alternate terms
- Abbreviations and acronyms (with context awareness)
- Regional spelling variations (US/UK/International)
- Specialty-specific terminology
- Dynamic learning from corpus
"""

import re
import json
from typing import List, Dict, Set, Tuple, Optional
from collections import defaultdict
from pathlib import Path
from .config import logger

# ============================================================================
# CORE MEDICAL TERMINOLOGY MAPPINGS
# ============================================================================

# Common medical abbreviations and their expansions.
# Keys are lowercase abbreviations; each value lists one or more lowercase
# full forms. The lookup helpers in this module lowercase their input before
# consulting this table, so new keys must be lowercase to be found.
# NOTE(review): several short keys ("met", "pet", "doc", "os", "io", "pr", ...)
# are also ordinary English words, so whole-word matching can still expand
# them in non-medical text. "cr" and "rt" additionally appear in
# CONTEXT_DEPENDENT_ABBREVS.
MEDICAL_ABBREVIATIONS = {
    # Cancer types
    "nsclc": ["non-small cell lung cancer", "non small cell lung cancer"],
    "sclc": ["small cell lung cancer"],
    "nscl": ["non-small cell lung"],
    # Genes / molecular markers
    "alk": ["anaplastic lymphoma kinase"],
    "egfr": ["epidermal growth factor receptor"],
    "ros1": ["ros proto-oncogene 1", "c-ros oncogene 1"],
    "braf": ["b-raf proto-oncogene"],
    "kras": ["kirsten rat sarcoma viral oncogene"],
    "met": ["mesenchymal epithelial transition", "met proto-oncogene"],
    "her2": ["human epidermal growth factor receptor 2"],
    "ret": ["ret proto-oncogene", "rearranged during transfection"],
    "ntrk": ["neurotrophic tyrosine receptor kinase", "neurotrophic tropomyosin receptor kinase"],

    # Treatments, imaging and drug classes
    "chemo": ["chemotherapy"],
    "rt": ["radiation therapy", "radiotherapy"],
    "sbrt": ["stereotactic body radiation therapy", "stereotactic body radiotherapy"],
    "imrt": ["intensity-modulated radiation therapy"],
    "ct": ["computed tomography", "ct scan"],
    "pet": ["positron emission tomography"],
    "mri": ["magnetic resonance imaging"],
    "io": ["immunotherapy", "immune-oncology"],
    "ici": ["immune checkpoint inhibitor", "immune checkpoint inhibitors"],
    "tki": ["tyrosine kinase inhibitor", "tyrosine kinase inhibitors"],
    "pd-1": ["programmed death-1", "programmed cell death protein 1"],
    "pd-l1": ["programmed death-ligand 1"],
    "ctla-4": ["cytotoxic t-lymphocyte-associated protein 4"],

    # Clinical endpoints and status terms
    "os": ["overall survival"],
    "pfs": ["progression-free survival"],
    "dfs": ["disease-free survival"],
    "orr": ["overall response rate", "objective response rate"],
    "cr": ["complete response"],
    "pr": ["partial response"],
    "sd": ["stable disease"],
    "pd": ["progressive disease"],
    "ecog": ["eastern cooperative oncology group"],
    "ps": ["performance status"],
    "aes": ["adverse events"],
    "sae": ["serious adverse event", "serious adverse events"],
    "qol": ["quality of life"],

    # Staging
    "tnm": ["tumor node metastasis", "tnm staging"],
    "ajcc": ["american joint committee on cancer"],

    # Drugs (common abbreviations)
    "cddp": ["cisplatin"],
    "cbdca": ["carboplatin"],
    "pem": ["pemetrexed"],
    "gem": ["gemcitabine"],
    "doc": ["docetaxel"],
    "pac": ["paclitaxel"],
    "vin": ["vinorelbine"],
    "eto": ["etoposide"],
}

# Synonym mappings for medical terms.
# Each key is a lowercase canonical term and each value lists lowercase
# alternates. get_synonyms() consults the table in both directions: a term
# that appears in a value list also yields the key and its sibling alternates.
# Order inside each list matters: expand_query_with_variations() only uses the
# first two alternates when rewriting multi-word phrases.
# NOTE(review): some entries are broader/narrower terms rather than strict
# synonyms (e.g. "metastatic" -> "advanced", "wedge resection" ->
# "segmentectomy", "remission" -> "response") - confirm they are intended for
# recall-oriented query expansion.
MEDICAL_SYNONYMS = {
    # Cancer terminology
    "lung cancer": ["pulmonary cancer", "lung carcinoma", "pulmonary carcinoma", "bronchogenic carcinoma"],
    "non-small cell lung cancer": ["nsclc", "non small cell lung cancer", "non-small-cell lung cancer"],
    "small cell lung cancer": ["sclc", "small-cell lung cancer", "oat cell carcinoma"],
    "adenocarcinoma": ["adeno", "glandular cancer"],
    "squamous cell carcinoma": ["squamous carcinoma", "scc", "epidermoid carcinoma"],
    "metastatic": ["advanced", "stage iv", "stage 4", "metastases", "mets"],
    "locally advanced": ["stage iii", "stage 3", "regional spread"],
    "early stage": ["stage i", "stage ii", "stage 1", "stage 2", "localized"],

    # Treatment terms
    "chemotherapy": ["chemo", "cytotoxic therapy", "systemic therapy"],
    "radiation therapy": ["radiotherapy", "rt", "radiation treatment", "irradiation"],
    "immunotherapy": ["immune therapy", "io", "immune-oncology", "checkpoint inhibitor"],
    "targeted therapy": ["molecular therapy", "precision medicine", "targeted treatment"],
    "surgery": ["surgical resection", "resection", "operative treatment", "surgical intervention"],
    "lobectomy": ["lobe resection", "pulmonary lobectomy"],
    "pneumonectomy": ["lung removal", "complete lung resection"],
    "wedge resection": ["segmentectomy", "limited resection"],

    # Molecular markers
    "mutation": ["alteration", "variant", "genetic change", "molecular alteration"],
    "biomarker": ["molecular marker", "tumor marker", "genetic marker"],
    "driver mutation": ["oncogenic driver", "actionable mutation", "targetable mutation"],

    # Clinical outcomes
    "survival": ["survival rate", "survival outcome"],
    "response": ["treatment response", "tumor response", "clinical response"],
    "progression": ["disease progression", "tumor progression", "cancer progression"],
    "recurrence": ["relapse", "disease recurrence", "tumor recurrence"],
    # "response" is both a key above and an alternate here, so a reverse
    # lookup of "response" also returns "remission" and "disease control".
    "remission": ["response", "disease control"],

    # Side effects
    "adverse event": ["side effect", "adverse reaction", "toxicity", "adverse drug reaction"],
    "neutropenia": ["low white blood cell count", "low neutrophil count"],
    "anemia": ["low red blood cell count", "low hemoglobin"],
    "thrombocytopenia": ["low platelet count"],
    "nausea": ["feeling sick", "queasiness"],
    "fatigue": ["tiredness", "exhaustion", "weakness"],

    # Diagnostic terms
    "biopsy": ["tissue sample", "tissue sampling"],
    "imaging": ["radiology", "diagnostic imaging", "medical imaging"],
    "screening": ["early detection", "cancer screening"],
}

# Regional spelling variations (US/UK/International).
# Keys are the US spelling, values the UK/International spellings.
# get_spelling_variations() searches both directions, so a UK spelling also
# resolves back to its US key.
SPELLING_VARIATIONS = {
    # US -> UK/International variants
    "tumor": ["tumour"],
    "tumors": ["tumours"],
    # NOTE(review): singular -> plural, not a regional spelling difference;
    # confirm this entry is intentional.
    "metastasis": ["metastases"],
    "anemia": ["anaemia"],
    "edema": ["oedema"],
    "esophageal": ["oesophageal"],
    "pediatric": ["paediatric"],
    "hematology": ["haematology"],
    "hemoglobin": ["haemoglobin"],
    "leukemia": ["leukaemia"],
    # Identity mapping: get_spelling_variations() discards the queried term
    # itself, so this entry never produces a variation.
    "lymphoma": ["lymphoma"],  # Same in both
    "optimize": ["optimise"],
    "randomized": ["randomised"],
    "analyze": ["analyse"],
    "center": ["centre"],
    "fiber": ["fibre"],
}

# Context-dependent abbreviations (require disambiguation).
# Shape: abbreviation -> {sense label -> list of expansions for that sense}.
# NOTE(review): no code visible in this file reads this table; in particular
# expand_abbreviations() accepts a `context` argument but does not consult it.
# "cr" and "rt" are also present in MEDICAL_ABBREVIATIONS with a single sense.
CONTEXT_DEPENDENT_ABBREVS = {
    "ca": {
        "cancer": ["cancer", "carcinoma"],
        "calcium": ["calcium"],
    },
    "cr": {
        "complete_response": ["complete response", "complete remission"],
        "creatinine": ["creatinine"],
    },
    "pt": {
        "patient": ["patient"],
        "prothrombin_time": ["prothrombin time"],
    },
    "rt": {
        "radiation_therapy": ["radiation therapy", "radiotherapy"],
        "reverse_transcriptase": ["reverse transcriptase"],
    },
}

# ============================================================================
# DYNAMIC LEARNING COMPONENTS
# ============================================================================

class MedicalTerminologyLearner:
    """
    Dynamically learns medical term variations from the corpus.
    Builds co-occurrence patterns and semantic relationships.

    Learned state:
    - learned_abbreviations: abbreviation -> set of full forms
    - learned_synonyms: term -> set of synonyms (stored in both directions)
    - term_cooccurrence: term -> {neighbouring term -> count}

    Abbreviations and synonyms are persisted as JSON at ``cache_path``;
    co-occurrence counts are kept in memory only.
    """

    # Pattern: "Full Term (ABBR)" or "Full Term [ABBR]"
    _ABBREVIATION_PATTERN = re.compile(
        r'([A-Z][a-z]+(?:\s+[A-Z]?[a-z]+)*)\s*[\(\[]([A-Z]{2,}|[A-Z][a-z]*(?:-[A-Z][a-z]*)*)[\)\]]'
    )

    # Patterns: "X, also known as Y", "X, also called Y" and "X (Y)".
    # They are applied to lowercased text.
    _SYNONYM_PATTERNS = (
        re.compile(r'([a-z\s\-]+),?\s+also\s+known\s+as\s+([a-z\s\-]+)'),
        re.compile(r'([a-z\s\-]+),?\s+also\s+called\s+([a-z\s\-]+)'),
        re.compile(r'([a-z\s\-]+)\s+\(([a-z\s\-]+)\)'),
    )

    # Runs of one to four lowercase words of at least three letters each
    _TERM_PATTERN = re.compile(r'\b[a-z]{3,}(?:\s+[a-z]{3,}){0,3}\b')

    def __init__(self, cache_path: Optional[str] = None):
        """
        Create a learner and load any previously cached terms.

        Args:
            cache_path: Location of the JSON cache; defaults to
                "data/medical_terms_cache.json".
        """
        self.cache_path = cache_path or "data/medical_terms_cache.json"
        self.term_cooccurrence = defaultdict(lambda: defaultdict(int))
        self.learned_synonyms = defaultdict(set)
        self.learned_abbreviations = defaultdict(set)
        self.context_patterns = defaultdict(list)
        self._load_cache()

    def _load_cache(self):
        """Load previously learned terms from cache (best effort, never raises)."""
        try:
            cache_file = Path(self.cache_path)
            if cache_file.exists():
                with open(cache_file, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                # Build both tables before assigning so a malformed section
                # cannot leave the learner half-loaded.
                synonyms = defaultdict(set, {k: set(v) for k, v in data.get('synonyms', {}).items()})
                abbreviations = defaultdict(set, {k: set(v) for k, v in data.get('abbreviations', {}).items()})
                self.learned_synonyms = synonyms
                self.learned_abbreviations = abbreviations
                logger.info(f"Loaded {len(self.learned_synonyms)} learned synonyms from cache")
        except Exception as e:
            # A missing or corrupt cache must not prevent the module from loading.
            logger.warning(f"Could not load term cache: {e}")

    def _save_cache(self):
        """Save learned terms to cache (best effort, never raises)."""
        try:
            cache_file = Path(self.cache_path)
            cache_file.parent.mkdir(parents=True, exist_ok=True)
            # Sets have no stable order; sort so the cache file is reproducible.
            data = {
                'synonyms': {k: sorted(v) for k, v in self.learned_synonyms.items()},
                'abbreviations': {k: sorted(v) for k, v in self.learned_abbreviations.items()}
            }
            with open(cache_file, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=2)
            logger.info("Saved learned terms to cache")
        except Exception as e:
            logger.warning(f"Could not save term cache: {e}")

    def learn_from_documents(self, documents: List[Dict[str, str]]):
        """
        Learn term variations from a corpus of documents.
        Identifies patterns like:
        - "X (Y)" -> Y is abbreviation of X
        - "X, also known as Y" -> X and Y are synonyms
        - co-occurrence of terms within a sliding window

        Args:
            documents: Dicts whose 'content' entry holds the document text.
                A missing or empty 'content' is skipped.

        The cache file is rewritten once after all documents are processed.
        """
        for doc in documents:
            # `or ''` also covers an explicit None value, which would
            # otherwise crash the regex calls below.
            content = doc.get('content') or ''
            if not content:
                continue
            self._extract_abbreviation_patterns(content)
            self._extract_synonym_patterns(content)
            self._build_cooccurrence(content)

        self._save_cache()

    @staticmethod
    def _trim_to_abbreviation(full_term: str, abbrev: str) -> Optional[str]:
        """
        Cut a captured phrase down to the words the abbreviation stands for.

        The capture regex is greedy and usually swallows leading sentence
        words ("patients received tyrosine kinase inhibitor"). Both arguments
        are expected in lowercase.

        Returns:
            The trailing words whose initials spell the abbreviation; failing
            that, the phrase starting at the last word that begins with the
            abbreviation's first character; None when no word does.
        """
        words = full_term.split()
        letters = [ch for ch in abbrev if ch.isalnum()]
        if not words or not letters:
            return None

        # Best case: one trailing word per abbreviation letter.
        count = len(letters)
        if count <= len(words):
            tail = words[-count:]
            if all(word[0] == ch for word, ch in zip(tail, letters)):
                return ' '.join(tail)

        # Fallback for abbreviations that skip words ("ajcc") or truncate a
        # single word ("chemo"): anchor on the abbreviation's first letter.
        for idx in range(len(words) - 1, -1, -1):
            if words[idx][0] == letters[0]:
                return ' '.join(words[idx:])
        return None

    def _extract_abbreviation_patterns(self, text: str):
        """Extract abbreviations from patterns like 'Full Term (ABBR)'"""
        for match in self._ABBREVIATION_PATTERN.finditer(text):
            abbrev = match.group(2).strip().lower()
            full_term = self._trim_to_abbreviation(match.group(1).lower(), abbrev)

            # Validate: abbreviation should be shorter and contain initials
            if full_term is None:
                continue
            if 2 <= len(abbrev) < len(full_term):
                self.learned_abbreviations[abbrev].add(full_term)
                logger.debug(f"Learned: {abbrev} -> {full_term}")

    def _extract_synonym_patterns(self, text: str):
        """Extract synonyms from patterns like 'X, also known as Y' or 'X (Y)'"""
        lowered = text.lower()

        for pattern in self._SYNONYM_PATTERNS:
            for match in pattern.finditer(lowered):
                # The character classes include \s, so captures can span line
                # breaks; collapse whitespace so stored terms are searchable.
                term1 = ' '.join(match.group(1).split())
                term2 = ' '.join(match.group(2).split())

                # A term is never its own synonym.
                if term1 == term2:
                    continue

                # Validate: both should be reasonable length
                if 3 <= len(term1) <= 50 and 3 <= len(term2) <= 50:
                    self.learned_synonyms[term1].add(term2)
                    self.learned_synonyms[term2].add(term1)

    def _build_cooccurrence(self, text: str):
        """Build co-occurrence matrix for terms"""
        # Extract medical terms (simplified)
        terms = self._TERM_PATTERN.findall(text.lower())

        # Build co-occurrence within a window of 10 terms on either side
        window_size = 10
        total = len(terms)
        for i, term in enumerate(terms):
            for j in range(max(0, i - window_size), min(total, i + window_size + 1)):
                if i != j:
                    self.term_cooccurrence[term][terms[j]] += 1

    def get_related_terms(self, term: str, threshold: int = 3) -> Set[str]:
        """
        Get terms that frequently co-occur with the given term.

        Args:
            term: Term to look up (case-insensitive).
            threshold: Minimum co-occurrence count for a term to be returned.

        Returns:
            Set of co-occurring terms; empty if the term was never seen. The
            term itself can be included when it repeats inside the window.
        """
        term_lower = term.lower()

        # Membership test first so the lookup does not create an empty entry.
        if term_lower not in self.term_cooccurrence:
            return set()

        return {
            related_term
            for related_term, count in self.term_cooccurrence[term_lower].items()
            if count >= threshold
        }

# Global learner instance shared by every helper function below.
# It is constructed at import time: the constructor tries to load the cache
# file (default "data/medical_terms_cache.json") and logs a warning instead of
# raising if that fails.
_terminology_learner = MedicalTerminologyLearner()

# ============================================================================
# QUERY NORMALIZATION AND EXPANSION FUNCTIONS
# ============================================================================

def normalize_query(query: str) -> str:
    """
    Normalize a query by:
    - Converting to lowercase
    - Mapping Unicode hyphens, dashes and the minus sign to ASCII '-'
    - Collapsing whitespace runs to a single space and trimming the ends

    Args:
        query: Raw query text.

    Returns:
        The normalized query string.
    """
    # Convert to lowercase
    normalized = query.lower()

    # Standardize hyphens and dashes. U+2010-U+2015 covers hyphen,
    # non-breaking hyphen, figure dash, en dash, em dash and horizontal bar;
    # U+2212 is the minus sign. Text copied from PDFs or web pages often
    # carries these, and they would otherwise stop "non-small" from matching.
    normalized = re.sub(r'[\u2010-\u2015\u2212]', '-', normalized)

    # Remove extra whitespace
    normalized = re.sub(r'\s+', ' ', normalized).strip()

    return normalized


def expand_abbreviations(text: str, context: Optional[str] = None) -> List[str]:
    """
    Expand abbreviations in text to their full forms.

    Learned abbreviations are applied first, then the predefined table. Both
    are matched as whole words only, so a short abbreviation such as "ct" is
    never replaced inside a longer word such as "factor".

    Args:
        text: Text that may contain abbreviations.
        context: Accepted for API compatibility; currently not used for
            disambiguation.

    Returns:
        The original text followed by one lowercased variant per applicable
        expansion, without duplicates and in a deterministic order.
    """
    expansions = [text]
    text_lower = text.lower()

    def _collect(abbrev: str, full_forms) -> None:
        # An empty key would match at every word boundary.
        if not abbrev:
            return
        pattern = re.compile(rf'\b{re.escape(abbrev)}\b')
        if not pattern.search(text_lower):
            return
        for full_form in full_forms:
            # A function replacement inserts the expansion literally, so a
            # backslash in a cached term is not read as a regex escape.
            expanded = pattern.sub(lambda _m, repl=full_form: repl, text_lower)
            if expanded != text_lower:
                expansions.append(expanded)

    # Check learned abbreviations first (sets are sorted for a stable order)
    for abbrev, full_forms in _terminology_learner.learned_abbreviations.items():
        _collect(abbrev, sorted(full_forms))

    # Check predefined abbreviations
    for abbrev, full_forms in MEDICAL_ABBREVIATIONS.items():
        _collect(abbrev, full_forms)

    # Remove duplicates while preserving order
    return list(dict.fromkeys(expansions))


def get_synonyms(term: str) -> Set[str]:
    """
    Collect every known synonym of a medical term.

    Looks the lowercased term up in the predefined table in both directions
    (as a key, and as a member of another key's list) and in the learned
    synonyms. The term itself is never part of the result.
    """
    needle = term.lower()

    # Forward lookup: the term is a canonical key
    found = set(MEDICAL_SYNONYMS.get(needle, ()))

    # Reverse lookup: the term is listed under another canonical key
    for canonical, alternates in MEDICAL_SYNONYMS.items():
        if needle in alternates:
            found.add(canonical)
            found.update(alternates)

    # Synonyms learned from the corpus
    found.update(_terminology_learner.learned_synonyms.get(needle, ()))

    found.discard(needle)
    return found


def get_spelling_variations(term: str) -> Set[str]:
    """
    Return regional spelling variants of a term.

    The lowercased term is matched against SPELLING_VARIATIONS both as a key
    and as a listed variant; the term itself is excluded from the result.
    """
    needle = term.lower()

    # Direct mapping: the term is a key
    variants = set(SPELLING_VARIATIONS.get(needle, ()))

    # Reverse mapping: the term is one of a key's listed variants
    for base, alternates in SPELLING_VARIATIONS.items():
        if needle in alternates:
            variants.add(base)
            variants.update(alternates)

    variants.discard(needle)
    return variants


def extract_medical_entities(text: str) -> List[Tuple[str, str]]:
    """
    Find known medical entities in text.

    Returns (entity, type) tuples: first every predefined abbreviation that
    occurs as a whole word (type 'abbreviation'), then every MEDICAL_SYNONYMS
    key that occurs as a substring (type 'medical_term'). Matching is
    case-insensitive.
    """
    haystack = text.lower()

    # Abbreviations must stand alone as words
    abbreviation_hits = [
        (abbrev, 'abbreviation')
        for abbrev in MEDICAL_ABBREVIATIONS
        if re.search(rf'\b{re.escape(abbrev)}\b', haystack)
    ]

    # Canonical terms are matched by plain substring containment
    term_hits = [
        (term, 'medical_term')
        for term in MEDICAL_SYNONYMS
        if term in haystack
    ]

    return abbreviation_hits + term_hits


def is_medical_abbreviation(text: str) -> bool:
    """Return True if text (lowercased, trimmed) is a predefined or learned abbreviation."""
    candidate = text.lower().strip()
    if candidate in MEDICAL_ABBREVIATIONS:
        return True
    return candidate in _terminology_learner.learned_abbreviations


def get_abbreviation_expansion(abbrev: str) -> List[str]:
    """
    Get all possible expansions for an abbreviation.

    Args:
        abbrev: Abbreviation to look up (case and surrounding whitespace
            are ignored).

    Returns:
        Predefined expansions in table order, followed by learned expansions
        in sorted order, with duplicates removed. Empty if the abbreviation
        is unknown.
    """
    abbrev_lower = abbrev.lower().strip()

    # Predefined expansions come first
    expansions = list(MEDICAL_ABBREVIATIONS.get(abbrev_lower, ()))

    # Learned expansions are stored in a set; sort them for a stable order.
    # .get() avoids creating an empty entry in the learner's defaultdict.
    expansions.extend(sorted(_terminology_learner.learned_abbreviations.get(abbrev_lower, ())))

    # The corpus often re-teaches a predefined expansion; report it once.
    return list(dict.fromkeys(expansions))


def expand_query_with_variations(query: str, max_variations: int = 5) -> List[str]:
    """
    Generate query variations by expanding abbreviations, adding synonyms,
    and including spelling variations.

    Variations are produced in a fixed priority order (original query,
    abbreviation expansions, single-word synonyms, spelling variants,
    multi-word phrase synonyms) and are deterministic for a given input and
    learner state.

    Args:
        query: Original query string
        max_variations: Maximum number of variations to generate; the
            original query is always returned even if this is below 1

    Returns:
        List of unique query variations including the original
    """
    variations = [query]
    query_lower = normalize_query(query)

    # 1. Expand abbreviations
    variations.extend(expand_abbreviations(query_lower))

    # 2. Add synonym variations
    words = query_lower.split()
    for i, word in enumerate(words):
        # get_synonyms returns a set; sort before truncating so the same two
        # synonyms are chosen on every run (limit: 2 synonyms per word).
        for syn in sorted(get_synonyms(word))[:2]:
            variations.append(' '.join(words[:i] + [syn] + words[i + 1:]))

    # 3. Add spelling variations
    for word in words:
        for var in sorted(get_spelling_variations(word)):
            # Replace whole tokens only: a substring replace would also
            # rewrite longer words that merely contain the token.
            variations.append(' '.join(var if w == word else w for w in words))

    # 4. Add multi-word phrase variations
    for term, synonyms in MEDICAL_SYNONYMS.items():
        if term in query_lower:
            for syn in synonyms[:2]:
                variations.append(query_lower.replace(term, syn))

    # Remove duplicates (keeping first occurrence) and limit
    unique_variations = list(dict.fromkeys(variations))
    return unique_variations[:max(1, max_variations)]


def learn_from_corpus(documents: List[Dict[str, str]]):
    """
    Learn medical term variations from a corpus of documents.
    Should be called during system initialization.

    Delegates to the global learner's learn_from_documents(), which reads the
    'content' entry of each document and then writes the term cache to disk.

    Args:
        documents: Document dicts; the text is taken from the 'content' key.
    """
    _terminology_learner.learn_from_documents(documents)


def get_terminology_learner() -> MedicalTerminologyLearner:
    """
    Get the global terminology learner instance.

    Returns the module-level object itself (not a copy), so changes made
    through it are seen by the helper functions in this module.
    """
    return _terminology_learner