"""
Medical Terminology Module with Dynamic Learning

This module provides intelligent handling of medical linguistic variability
including:
- Synonyms and alternate terms
- Abbreviations and acronyms
- Regional spelling variations (US/UK/International)
- Specialty-specific terminology
- Dynamic learning from corpus
"""

import json
import logging
import re
from collections import defaultdict
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Set, Tuple

if __package__:
    from .config import logger
else:
    # Imported or executed outside its package (e.g. as a stand-alone module):
    # the relative import cannot resolve, so fall back to a standard logger.
    logger = logging.getLogger(__name__)


# ============================================================================
# CORE MEDICAL TERMINOLOGY MAPPINGS
# ============================================================================

# Common medical abbreviations mapped to their possible expansions
MEDICAL_ABBREVIATIONS = {
    # Cancer Types
    "nsclc": ["non-small cell lung cancer", "non small cell lung cancer"],
    "sclc": ["small cell lung cancer"],
    "nscl": ["non-small cell lung"],
    "alk": ["anaplastic lymphoma kinase"],
    "egfr": ["epidermal growth factor receptor"],
    "ros1": ["ros proto-oncogene 1", "c-ros oncogene 1"],
    "braf": ["b-raf proto-oncogene"],
    "kras": ["kirsten rat sarcoma viral oncogene"],
    "met": ["mesenchymal epithelial transition", "met proto-oncogene"],
    "her2": ["human epidermal growth factor receptor 2"],
    "ret": ["ret proto-oncogene", "rearranged during transfection"],
    "ntrk": ["neurotrophic tyrosine receptor kinase", "neurotrophic tropomyosin receptor kinase"],

    # Treatment & Procedures
    "chemo": ["chemotherapy"],
    "rt": ["radiation therapy", "radiotherapy"],
    "sbrt": ["stereotactic body radiation therapy", "stereotactic body radiotherapy"],
    "imrt": ["intensity-modulated radiation therapy"],
    "ct": ["computed tomography", "ct scan"],
    "pet": ["positron emission tomography"],
    "mri": ["magnetic resonance imaging"],
    "io": ["immunotherapy", "immune-oncology"],
    "ici": ["immune checkpoint inhibitor", "immune checkpoint inhibitors"],
    "tki": ["tyrosine kinase inhibitor", "tyrosine kinase inhibitors"],
    "pd-1": ["programmed death-1", "programmed cell death protein 1"],
    "pd-l1": ["programmed death-ligand 1"],
    "ctla-4": ["cytotoxic t-lymphocyte-associated protein 4"],

    # Clinical Terms
    "os": ["overall survival"],
    "pfs": ["progression-free survival"],
    "dfs": ["disease-free survival"],
    "orr": ["overall response rate", "objective response rate"],
    "cr": ["complete response"],
    "pr": ["partial response"],
    "sd": ["stable disease"],
    "pd": ["progressive disease"],
    "ecog": ["eastern cooperative oncology group"],
    "ps": ["performance status"],
    "aes": ["adverse events"],
    "sae": ["serious adverse event", "serious adverse events"],
    "qol": ["quality of life"],

    # Staging
    "tnm": ["tumor node metastasis", "tnm staging"],
    "ajcc": ["american joint committee on cancer"],

    # Drugs (common abbreviations)
    "cddp": ["cisplatin"],
    "cbdca": ["carboplatin"],
    "pem": ["pemetrexed"],
    "gem": ["gemcitabine"],
    "doc": ["docetaxel"],
    "pac": ["paclitaxel"],
    "vin": ["vinorelbine"],
    "eto": ["etoposide"],
}

# Synonym mappings for medical terms
MEDICAL_SYNONYMS = {
    # Cancer terminology
    "lung cancer": ["pulmonary cancer", "lung carcinoma", "pulmonary carcinoma", "bronchogenic carcinoma"],
    "non-small cell lung cancer": ["nsclc", "non small cell lung cancer", "non-small-cell lung cancer"],
    "small cell lung cancer": ["sclc", "small-cell lung cancer", "oat cell carcinoma"],
    "adenocarcinoma": ["adeno", "glandular cancer"],
    "squamous cell carcinoma": ["squamous carcinoma", "scc", "epidermoid carcinoma"],
    "metastatic": ["advanced", "stage iv", "stage 4", "metastases", "mets"],
    "locally advanced": ["stage iii", "stage 3", "regional spread"],
    "early stage": ["stage i", "stage ii", "stage 1", "stage 2", "localized"],

    # Treatment terms
    "chemotherapy": ["chemo", "cytotoxic therapy", "systemic therapy"],
    "radiation therapy": ["radiotherapy", "rt", "radiation treatment", "irradiation"],
    "immunotherapy": ["immune therapy", "io", "immune-oncology", "checkpoint inhibitor"],
    "targeted therapy": ["molecular therapy", "precision medicine", "targeted treatment"],
    "surgery": ["surgical resection", "resection", "operative treatment", "surgical intervention"],
    "lobectomy": ["lobe resection", "pulmonary lobectomy"],
    "pneumonectomy": ["lung removal", "complete lung resection"],
    "wedge resection": ["segmentectomy", "limited resection"],

    # Molecular markers
    "mutation": ["alteration", "variant", "genetic change", "molecular alteration"],
    "biomarker": ["molecular marker", "tumor marker", "genetic marker"],
    "driver mutation": ["oncogenic driver", "actionable mutation", "targetable mutation"],

    # Clinical outcomes
    "survival": ["survival rate", "survival outcome"],
    "response": ["treatment response", "tumor response", "clinical response"],
    "progression": ["disease progression", "tumor progression", "cancer progression"],
    "recurrence": ["relapse", "disease recurrence", "tumor recurrence"],
    "remission": ["response", "disease control"],

    # Side effects
    "adverse event": ["side effect", "adverse reaction", "toxicity", "adverse drug reaction"],
    "neutropenia": ["low white blood cell count", "low neutrophil count"],
    "anemia": ["low red blood cell count", "low hemoglobin"],
    "thrombocytopenia": ["low platelet count"],
    "nausea": ["feeling sick", "queasiness"],
    "fatigue": ["tiredness", "exhaustion", "weakness"],

    # Diagnostic terms
    "biopsy": ["tissue sample", "tissue sampling"],
    "imaging": ["radiology", "diagnostic imaging", "medical imaging"],
    "screening": ["early detection", "cancer screening"],
}

# Regional spelling variations (US/UK/International)
SPELLING_VARIATIONS = {
    # US -> UK/International variants
    "tumor": ["tumour"],
    "tumors": ["tumours"],
    "metastasis": ["metastases"],
    "anemia": ["anaemia"],
    "edema": ["oedema"],
    "esophageal": ["oesophageal"],
    "pediatric": ["paediatric"],
    "hematology": ["haematology"],
    "hemoglobin": ["haemoglobin"],
    "leukemia": ["leukaemia"],
    "lymphoma": ["lymphoma"],  # Same in both
    "optimize": ["optimise"],
    "randomized": ["randomised"],
    "analyze": ["analyse"],
    "center": ["centre"],
    "fiber": ["fibre"],
}

# Context-dependent abbreviations (require disambiguation).
# NOTE: nothing in this module consumes this table yet.
CONTEXT_DEPENDENT_ABBREVS = {
    "ca": {
        "cancer": ["cancer", "carcinoma"],
        "calcium": ["calcium"],
    },
    "cr": {
        "complete_response": ["complete response", "complete remission"],
        "creatinine": ["creatinine"],
    },
    "pt": {
        "patient": ["patient"],
        "prothrombin_time": ["prothrombin time"],
    },
    "rt": {
        "radiation_therapy": ["radiation therapy", "radiotherapy"],
        "reverse_transcriptase": ["reverse transcriptase"],
    },
}


# ============================================================================
# INTERNAL HELPERS
# ============================================================================

def _whole_word_pattern(term: str) -> "re.Pattern":
    """
    Compile a regex matching `term` only as a whole token.

    Look-arounds are used instead of ``\\b`` so the pattern also behaves for
    terms that start or end with a non-word character; for terms that start
    and end with a word character the two forms are equivalent.
    """
    return re.compile(rf'(?<!\w){re.escape(term)}(?!\w)')


def _unique_in_order(items: Iterable[str], limit: Optional[int] = None) -> List[str]:
    """
    Return `items` without duplicates, keeping first-seen order.

    When `limit` is given, collection stops as soon as the result holds at
    least `limit` entries (the first item is always kept).
    """
    seen = set()
    unique = []
    for item in items:
        if item in seen:
            continue
        seen.add(item)
        unique.append(item)
        if limit is not None and len(unique) >= limit:
            break
    return unique


# ============================================================================
# DYNAMIC LEARNING COMPONENTS
# ============================================================================

class MedicalTerminologyLearner:
    """
    Dynamically learns medical term variations from the corpus.
    Builds co-occurrence patterns and semantic relationships.

    Learned synonyms and abbreviations are persisted to a JSON cache file;
    co-occurrence counts are kept in memory only.
    """

    def __init__(self, cache_path: Optional[str] = None):
        """
        Create a learner and load any previously cached terms.

        Args:
            cache_path: Location of the JSON cache; defaults to
                "data/medical_terms_cache.json".
        """
        self.cache_path = cache_path or "data/medical_terms_cache.json"
        self.term_cooccurrence = defaultdict(lambda: defaultdict(int))
        self.learned_synonyms = defaultdict(set)
        self.learned_abbreviations = defaultdict(set)
        self.context_patterns = defaultdict(list)
        self._load_cache()

    def _load_cache(self):
        """Load previously learned terms from cache (best effort)."""
        try:
            cache_file = Path(self.cache_path)
            if not cache_file.exists():
                return
            with open(cache_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            # Build both tables before assigning either one, so a malformed
            # cache cannot leave the learner half-loaded.
            synonyms = {k: set(v) for k, v in data.get('synonyms', {}).items()}
            abbreviations = {k: set(v) for k, v in data.get('abbreviations', {}).items()}
            self.learned_synonyms = defaultdict(set, synonyms)
            self.learned_abbreviations = defaultdict(set, abbreviations)
            logger.info(f"Loaded {len(self.learned_synonyms)} learned synonyms from cache")
        except Exception as e:
            # Deliberately broad: an unreadable or corrupt cache must never
            # prevent the learner (created at import time) from starting.
            logger.warning(f"Could not load term cache: {e}")

    def _save_cache(self):
        """Save learned terms to cache (best effort)."""
        try:
            cache_file = Path(self.cache_path)
            cache_file.parent.mkdir(parents=True, exist_ok=True)
            # Sets are written sorted so the cache file is stable across runs.
            data = {
                'synonyms': {k: sorted(v) for k, v in self.learned_synonyms.items()},
                'abbreviations': {k: sorted(v) for k, v in self.learned_abbreviations.items()},
            }
            with open(cache_file, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=2)
            logger.info("Saved learned terms to cache")
        except Exception as e:
            # Deliberately broad: failing to persist is not fatal to learning.
            logger.warning(f"Could not save term cache: {e}")

    def learn_from_documents(self, documents: List[Dict[str, str]]):
        """
        Learn term variations from a corpus of documents.

        Identifies patterns like:
        - "X (Y)" -> Y is abbreviation of X
        - "X, also known as Y" -> X and Y are synonyms

        Each document's 'content' entry is scanned; documents whose content is
        missing, empty, or not a string are skipped. The cache is written once
        after all documents have been processed.
        """
        for doc in documents:
            content = doc.get('content')
            if not content or not isinstance(content, str):
                continue
            self._extract_abbreviation_patterns(content)
            self._extract_synonym_patterns(content)
            self._build_cooccurrence(content)

        self._save_cache()

    @staticmethod
    def _trim_to_initials(full_term: str, abbrev: str) -> str:
        """
        Narrow `full_term` to the trailing words whose initials spell `abbrev`.

        The capture regex is greedy and may swallow the start of the sentence
        ("the patient received stereotactic body radiation therapy" for
        "sbrt"). If the last N words (N = number of alphanumeric characters in
        `abbrev`) have initials equal to the abbreviation, only those words
        are returned; otherwise `full_term` is returned unchanged.
        """
        letters = [ch for ch in abbrev if ch.isalnum()]
        words = full_term.split()
        if letters and len(words) >= len(letters):
            tail = words[-len(letters):]
            if [w[0] for w in tail] == letters:
                return ' '.join(tail)
        return full_term

    def _extract_abbreviation_patterns(self, text: str):
        """Extract abbreviations from patterns like 'Full Term (ABBR)'"""
        # Pattern: "Full Term (ABBR)" or "Full Term [ABBR]"
        pattern = r'([A-Z][a-z]+(?:\s+[A-Z]?[a-z]+)*)\s*[\(\[]([A-Z]{2,}|[A-Z][a-z]*(?:-[A-Z][a-z]*)*)[\)\]]'

        for match in re.finditer(pattern, text):
            abbrev = match.group(2).strip().lower()
            full_term = self._trim_to_initials(match.group(1).strip().lower(), abbrev)

            # Validate: abbreviation must be at least 2 chars and shorter
            # than the term it stands for
            if 2 <= len(abbrev) < len(full_term):
                self.learned_abbreviations[abbrev].add(full_term)
                logger.debug(f"Learned: {abbrev} -> {full_term}")

    def _extract_synonym_patterns(self, text: str):
        """Extract synonyms from patterns like 'X, also known as Y' or 'X (Y)'"""
        patterns = [
            r'([a-z\s\-]+),?\s+also\s+known\s+as\s+([a-z\s\-]+)',
            r'([a-z\s\-]+),?\s+also\s+called\s+([a-z\s\-]+)',
            r'([a-z\s\-]+)\s+\(([a-z\s\-]+)\)',
        ]
        lowered = text.lower()

        for pattern in patterns:
            for match in re.finditer(pattern, lowered):
                term1 = match.group(1).strip()
                term2 = match.group(2).strip()

                # Validate: both should be reasonable length
                if 3 <= len(term1) <= 50 and 3 <= len(term2) <= 50:
                    # Synonymy is recorded in both directions
                    self.learned_synonyms[term1].add(term2)
                    self.learned_synonyms[term2].add(term1)

    def _build_cooccurrence(self, text: str):
        """Build co-occurrence matrix for terms"""
        # Extract candidate terms (simplified): runs of 1-4 lowercase words,
        # each at least 3 letters long
        terms = re.findall(r'\b[a-z]{3,}(?:\s+[a-z]{3,}){0,3}\b', text.lower())

        # Count every other term within `window_size` positions on either side
        window_size = 10
        total = len(terms)
        for i, term in enumerate(terms):
            start = max(0, i - window_size)
            stop = min(total, i + window_size + 1)
            counts = self.term_cooccurrence[term]
            for j in range(start, stop):
                if j != i:
                    counts[terms[j]] += 1

    def get_related_terms(self, term: str, threshold: int = 3) -> Set[str]:
        """
        Get terms that frequently co-occur with the given term.

        Args:
            term: Term to look up (case-insensitive).
            threshold: Minimum co-occurrence count for a term to be returned.

        Returns:
            Set of co-occurring terms; empty if `term` was never seen.
        """
        term_lower = term.lower()
        # Membership test first: indexing the defaultdict would create an
        # empty entry for every unknown term that is queried.
        if term_lower not in self.term_cooccurrence:
            return set()
        neighbours = self.term_cooccurrence[term_lower]
        return {other for other, count in neighbours.items() if count >= threshold}


# Global learner instance
_terminology_learner = MedicalTerminologyLearner()


# ============================================================================
# QUERY NORMALIZATION AND EXPANSION FUNCTIONS
# ============================================================================

def normalize_query(query: str) -> str:
    """
    Normalize a query by:
    - Converting to lowercase
    - Standardizing Unicode hyphens/dashes to an ASCII hyphen
    - Removing extra whitespace
    """
    normalized = query.lower()

    # Standardize hyphens and dashes: U+2010..U+2015 (hyphen, non-breaking
    # hyphen, figure/en/em dash, horizontal bar) and U+2212 (minus sign)
    normalized = re.sub(r'[\u2010-\u2015\u2212]', '-', normalized)

    # Remove extra whitespace
    normalized = re.sub(r'\s+', ' ', normalized).strip()

    return normalized


def expand_abbreviations(text: str, context: Optional[str] = None) -> List[str]:
    """
    Expand abbreviations in text to their full forms.

    Learned abbreviations are tried first, then the predefined table. Each
    abbreviation is matched as a whole token only, and every expansion is
    applied to the lowercased original text (expansions are not combined).

    Args:
        text: Text that may contain abbreviations.
        context: Currently unused; accepted for interface compatibility.

    Returns:
        The original text followed by each distinct expanded (lowercased)
        variant, in the order they were generated.
    """
    expansions = [text]
    text_lower = text.lower()

    # Learned forms are stored in sets; sort them for reproducible output.
    learned = [
        (abbrev, sorted(full_forms))
        for abbrev, full_forms in _terminology_learner.learned_abbreviations.items()
    ]
    candidates = learned + list(MEDICAL_ABBREVIATIONS.items())

    for abbrev, full_forms in candidates:
        if not abbrev:
            continue
        pattern = _whole_word_pattern(abbrev)
        if not pattern.search(text_lower):
            continue
        for full_form in full_forms:
            # A callable replacement inserts the full form literally, with no
            # backslash or group-reference interpretation.
            expanded = pattern.sub(lambda _m, repl=full_form: repl, text_lower)
            if expanded != text_lower:
                expansions.append(expanded)

    return _unique_in_order(expansions)


def get_synonyms(term: str) -> Set[str]:
    """
    Get all known synonyms for a medical term.

    Looks the term up as a key of the predefined table, as a member of any
    predefined synonym list (returning the key and its siblings), and in the
    learned synonyms. The term itself is never included in the result.
    """
    term_lower = term.lower()
    synonyms = set(MEDICAL_SYNONYMS.get(term_lower, ()))

    # Check if term is a synonym of something else
    for key, syn_list in MEDICAL_SYNONYMS.items():
        if term_lower in syn_list:
            synonyms.add(key)
            synonyms.update(syn_list)

    # Check learned synonyms (membership test avoids creating empty entries)
    learned = _terminology_learner.learned_synonyms
    if term_lower in learned:
        synonyms.update(learned[term_lower])

    # Remove the original term
    synonyms.discard(term_lower)

    return synonyms


def get_spelling_variations(term: str) -> Set[str]:
    """
    Get regional spelling variations for a term.

    Works in both directions (US -> UK and UK -> US); the term itself is
    never included in the result.
    """
    term_lower = term.lower()
    variations = set(SPELLING_VARIATIONS.get(term_lower, ()))

    # Check reverse mapping
    for key, var_list in SPELLING_VARIATIONS.items():
        if term_lower in var_list:
            variations.add(key)
            variations.update(var_list)

    variations.discard(term_lower)
    return variations


def extract_medical_entities(text: str) -> List[Tuple[str, str]]:
    """
    Extract medical entities from text.

    Predefined abbreviations are matched as whole tokens; predefined medical
    terms are matched as substrings (so inflected forms such as plurals are
    also detected).

    Returns list of (entity, type) tuples, abbreviations first.
    """
    text_lower = text.lower()

    entities = [
        (abbrev, 'abbreviation')
        for abbrev in MEDICAL_ABBREVIATIONS
        if re.search(rf'\b{re.escape(abbrev)}\b', text_lower)
    ]
    entities.extend(
        (term, 'medical_term')
        for term in MEDICAL_SYNONYMS
        if term in text_lower
    )

    return entities


def is_medical_abbreviation(text: str) -> bool:
    """Check if text is a known (predefined or learned) medical abbreviation"""
    text_lower = text.lower().strip()
    return (
        text_lower in MEDICAL_ABBREVIATIONS
        or text_lower in _terminology_learner.learned_abbreviations
    )


def get_abbreviation_expansion(abbrev: str) -> List[str]:
    """
    Get all possible expansions for an abbreviation.

    Returns:
        Predefined expansions first (in table order), then learned ones in
        sorted order, without duplicates. Empty if the abbreviation is
        unknown.
    """
    abbrev_lower = abbrev.lower().strip()
    expansions = list(MEDICAL_ABBREVIATIONS.get(abbrev_lower, ()))

    learned = _terminology_learner.learned_abbreviations
    if abbrev_lower in learned:
        expansions.extend(sorted(learned[abbrev_lower]))

    return _unique_in_order(expansions)


def expand_query_with_variations(query: str, max_variations: int = 5) -> List[str]:
    """
    Generate query variations by expanding abbreviations, adding synonyms,
    and including spelling variations.

    Args:
        query: Original query string
        max_variations: Maximum number of variations to generate

    Returns:
        List of query variations including the original (always first)
    """
    variations = [query]
    query_lower = normalize_query(query)

    # 1. Expand abbreviations
    variations.extend(expand_abbreviations(query_lower))

    # 2. Add synonym variations (at most 2 per word; sorted so the choice is
    #    reproducible rather than dependent on set iteration order)
    words = query_lower.split()
    for i, word in enumerate(words):
        for syn in sorted(get_synonyms(word))[:2]:
            variations.append(' '.join(words[:i] + [syn] + words[i + 1:]))

    # 3. Add spelling variations
    for word in words:
        for var in sorted(get_spelling_variations(word)):
            variations.append(query_lower.replace(word, var))

    # 4. Add multi-word phrase variations
    for term, synonyms in MEDICAL_SYNONYMS.items():
        if term in query_lower:
            for syn in synonyms[:2]:
                variations.append(query_lower.replace(term, syn))

    # Remove duplicates and limit
    return _unique_in_order(variations, limit=max_variations)


def learn_from_corpus(documents: List[Dict[str, str]]):
    """
    Learn medical term variations from a corpus of documents.
    Should be called during system initialization.
    """
    _terminology_learner.learn_from_documents(documents)


def get_terminology_learner() -> MedicalTerminologyLearner:
    """Get the global terminology learner instance"""
    return _terminology_learner