Spaces:
Running
Running
Enhance API security and functionality by adding authentication middleware and session management. Updated app.py to include the new auth router and integrated authentication checks for protected endpoints. Modified requirements.txt to include necessary libraries for session handling. Updated .env.example to include authentication credentials. Improved retrieval functions with query expansion for better medical term matching and enriched context in responses.
ddc9c77
| """ | |
| Medical Terminology Module with Dynamic Learning | |
| This module provides intelligent handling of medical linguistic variability including: | |
| - Synonyms and alternate terms | |
| - Abbreviations and acronyms (with context awareness) | |
| - Regional spelling variations (US/UK/International) | |
| - Specialty-specific terminology | |
| - Dynamic learning from corpus | |
| """ | |
| import re | |
| import json | |
| from typing import List, Dict, Set, Tuple, Optional | |
| from collections import defaultdict | |
| from pathlib import Path | |
| from .config import logger | |
| # ============================================================================ | |
| # CORE MEDICAL TERMINOLOGY MAPPINGS | |
| # ============================================================================ | |
# Common medical abbreviations mapped to their full-form expansions.
# Keys are lowercase abbreviations; each value is a list of one or more
# lowercase full forms. The mapping itself carries no context information.
# NOTE(review): "cr" and "rt" also appear in CONTEXT_DEPENDENT_ABBREVS with
# additional meanings that are not represented here.
# NOTE(review): several keys ("met", "os", "pr", "ret", "doc", "pet", "io")
# are also ordinary English words or fragments; lookups elsewhere in this
# module match them case-insensitively on word boundaries.
MEDICAL_ABBREVIATIONS = {
    # Cancer types, genes and molecular targets
    "nsclc": ["non-small cell lung cancer", "non small cell lung cancer"],
    "sclc": ["small cell lung cancer"],
    "nscl": ["non-small cell lung"],
    "alk": ["anaplastic lymphoma kinase"],
    "egfr": ["epidermal growth factor receptor"],
    "ros1": ["ros proto-oncogene 1", "c-ros oncogene 1"],
    "braf": ["b-raf proto-oncogene"],
    "kras": ["kirsten rat sarcoma viral oncogene"],
    "met": ["mesenchymal epithelial transition", "met proto-oncogene"],
    "her2": ["human epidermal growth factor receptor 2"],
    "ret": ["ret proto-oncogene", "rearranged during transfection"],
    "ntrk": ["neurotrophic tyrosine receptor kinase", "neurotrophic tropomyosin receptor kinase"],
    # Treatments, imaging procedures and immunotherapy targets
    "chemo": ["chemotherapy"],
    "rt": ["radiation therapy", "radiotherapy"],
    "sbrt": ["stereotactic body radiation therapy", "stereotactic body radiotherapy"],
    "imrt": ["intensity-modulated radiation therapy"],
    "ct": ["computed tomography", "ct scan"],
    "pet": ["positron emission tomography"],
    "mri": ["magnetic resonance imaging"],
    "io": ["immunotherapy", "immune-oncology"],
    "ici": ["immune checkpoint inhibitor", "immune checkpoint inhibitors"],
    "tki": ["tyrosine kinase inhibitor", "tyrosine kinase inhibitors"],
    "pd-1": ["programmed death-1", "programmed cell death protein 1"],
    "pd-l1": ["programmed death-ligand 1"],
    "ctla-4": ["cytotoxic t-lymphocyte-associated protein 4"],
    # Clinical endpoints and status terms
    "os": ["overall survival"],
    "pfs": ["progression-free survival"],
    "dfs": ["disease-free survival"],
    "orr": ["overall response rate", "objective response rate"],
    "cr": ["complete response"],
    "pr": ["partial response"],
    "sd": ["stable disease"],
    "pd": ["progressive disease"],
    "ecog": ["eastern cooperative oncology group"],
    "ps": ["performance status"],
    "aes": ["adverse events"],
    "sae": ["serious adverse event", "serious adverse events"],
    "qol": ["quality of life"],
    # Staging
    "tnm": ["tumor node metastasis", "tnm staging"],
    "ajcc": ["american joint committee on cancer"],
    # Drugs (common abbreviations)
    "cddp": ["cisplatin"],
    "cbdca": ["carboplatin"],
    "pem": ["pemetrexed"],
    "gem": ["gemcitabine"],
    "doc": ["docetaxel"],
    "pac": ["paclitaxel"],
    "vin": ["vinorelbine"],
    "eto": ["etoposide"],
}
# Synonym mappings for medical terms.
# Keys are lowercase canonical terms (single words or multi-word phrases);
# each value lists alternate wordings for that term.
# Some alternates are abbreviations that are also keys of
# MEDICAL_ABBREVIATIONS ("nsclc", "sclc", "chemo", "rt", "io").
# "response" is both a key and an alternate of "remission", so a reverse
# lookup of "response" also reaches the "remission" entry.
MEDICAL_SYNONYMS = {
    # Cancer terminology
    "lung cancer": ["pulmonary cancer", "lung carcinoma", "pulmonary carcinoma", "bronchogenic carcinoma"],
    "non-small cell lung cancer": ["nsclc", "non small cell lung cancer", "non-small-cell lung cancer"],
    "small cell lung cancer": ["sclc", "small-cell lung cancer", "oat cell carcinoma"],
    "adenocarcinoma": ["adeno", "glandular cancer"],
    "squamous cell carcinoma": ["squamous carcinoma", "scc", "epidermoid carcinoma"],
    "metastatic": ["advanced", "stage iv", "stage 4", "metastases", "mets"],
    "locally advanced": ["stage iii", "stage 3", "regional spread"],
    "early stage": ["stage i", "stage ii", "stage 1", "stage 2", "localized"],
    # Treatment terms
    "chemotherapy": ["chemo", "cytotoxic therapy", "systemic therapy"],
    "radiation therapy": ["radiotherapy", "rt", "radiation treatment", "irradiation"],
    "immunotherapy": ["immune therapy", "io", "immune-oncology", "checkpoint inhibitor"],
    "targeted therapy": ["molecular therapy", "precision medicine", "targeted treatment"],
    "surgery": ["surgical resection", "resection", "operative treatment", "surgical intervention"],
    "lobectomy": ["lobe resection", "pulmonary lobectomy"],
    "pneumonectomy": ["lung removal", "complete lung resection"],
    "wedge resection": ["segmentectomy", "limited resection"],
    # Molecular markers
    "mutation": ["alteration", "variant", "genetic change", "molecular alteration"],
    "biomarker": ["molecular marker", "tumor marker", "genetic marker"],
    "driver mutation": ["oncogenic driver", "actionable mutation", "targetable mutation"],
    # Clinical outcomes
    "survival": ["survival rate", "survival outcome"],
    "response": ["treatment response", "tumor response", "clinical response"],
    "progression": ["disease progression", "tumor progression", "cancer progression"],
    "recurrence": ["relapse", "disease recurrence", "tumor recurrence"],
    "remission": ["response", "disease control"],
    # Side effects
    "adverse event": ["side effect", "adverse reaction", "toxicity", "adverse drug reaction"],
    "neutropenia": ["low white blood cell count", "low neutrophil count"],
    "anemia": ["low red blood cell count", "low hemoglobin"],
    "thrombocytopenia": ["low platelet count"],
    "nausea": ["feeling sick", "queasiness"],
    "fatigue": ["tiredness", "exhaustion", "weakness"],
    # Diagnostic terms
    "biopsy": ["tissue sample", "tissue sampling"],
    "imaging": ["radiology", "diagnostic imaging", "medical imaging"],
    "screening": ["early detection", "cancer screening"],
}
# Regional spelling variations (US/UK/International).
# Keys are the US spellings; each value lists the alternative spellings.
# Only exact lowercase word forms listed here are covered (e.g. "tumor" and
# "tumors" are separate entries; other inflections are not derived).
SPELLING_VARIATIONS = {
    # US -> UK/International variants
    "tumor": ["tumour"],
    "tumors": ["tumours"],
    # NOTE(review): "metastases" looks like the plural of "metastasis" rather
    # than a regional spelling - confirm this entry is intended.
    "metastasis": ["metastases"],
    "anemia": ["anaemia"],
    "edema": ["oedema"],
    "esophageal": ["oesophageal"],
    "pediatric": ["paediatric"],
    "hematology": ["haematology"],
    "hemoglobin": ["haemoglobin"],
    "leukemia": ["leukaemia"],
    "lymphoma": ["lymphoma"],  # Same in both, so this entry yields no variation
    "optimize": ["optimise"],
    "randomized": ["randomised"],
    "analyze": ["analyse"],
    "center": ["centre"],
    "fiber": ["fibre"],
}
# Context-dependent abbreviations (require disambiguation).
# Structure: abbreviation -> {sense identifier -> list of full forms}.
# NOTE(review): no function in the visible part of this module reads this
# mapping; "cr" and "rt" are expanded via MEDICAL_ABBREVIATIONS using only
# their oncology sense. Confirm whether disambiguation happens elsewhere.
CONTEXT_DEPENDENT_ABBREVS = {
    "ca": {
        "cancer": ["cancer", "carcinoma"],
        "calcium": ["calcium"],
    },
    "cr": {
        "complete_response": ["complete response", "complete remission"],
        "creatinine": ["creatinine"],
    },
    "pt": {
        "patient": ["patient"],
        "prothrombin_time": ["prothrombin time"],
    },
    "rt": {
        "radiation_therapy": ["radiation therapy", "radiotherapy"],
        "reverse_transcriptase": ["reverse transcriptase"],
    },
}
| # ============================================================================ | |
| # DYNAMIC LEARNING COMPONENTS | |
| # ============================================================================ | |
class MedicalTerminologyLearner:
    """
    Dynamically learns medical term variations from the corpus.

    Three kinds of information are collected from document text:
    - abbreviations, from "Full Term (ABBR)" / "Full Term [ABBR]" patterns
    - synonyms, from "X, also known as Y", "X, also called Y" and "X (Y)"
    - term co-occurrence counts within a sliding window

    Learned synonyms and abbreviations are persisted to a JSON cache file;
    co-occurrence counts are kept in memory only.
    """

    # Patterns are compiled once at class creation instead of on every call.
    # "Full Term (ABBR)" or "Full Term [ABBR]"
    _ABBREVIATION_PATTERN = re.compile(
        r'([A-Z][a-z]+(?:\s+[A-Z]?[a-z]+)*)\s*[\(\[]([A-Z]{2,}|[A-Z][a-z]*(?:-[A-Z][a-z]*)*)[\)\]]'
    )
    # Applied to lowercased text, in this order.
    _SYNONYM_PATTERNS = tuple(
        re.compile(p) for p in (
            r'([a-z\s\-]+),?\s+also\s+known\s+as\s+([a-z\s\-]+)',
            r'([a-z\s\-]+),?\s+also\s+called\s+([a-z\s\-]+)',
            r'([a-z\s\-]+)\s+\(([a-z\s\-]+)\)',
        )
    )
    # A "term" is a run of one to four lowercase words of 3+ letters each.
    _TERM_PATTERN = re.compile(r'\b[a-z]{3,}(?:\s+[a-z]{3,}){0,3}\b')
    # Number of neighbouring terms counted on each side of a term.
    _COOCCURRENCE_WINDOW = 10

    def __init__(self, cache_path: Optional[str] = None):
        """
        Create a learner and load any previously cached terms.

        Args:
            cache_path: Path of the JSON cache file. Defaults to
                "data/medical_terms_cache.json".
        """
        self.cache_path = cache_path or "data/medical_terms_cache.json"
        self.term_cooccurrence = defaultdict(lambda: defaultdict(int))
        self.learned_synonyms = defaultdict(set)
        self.learned_abbreviations = defaultdict(set)
        self.context_patterns = defaultdict(list)
        self._load_cache()

    def _load_cache(self):
        """Load previously learned terms from cache.

        Best effort: any failure is logged as a warning and leaves the
        learner's current state untouched.
        """
        try:
            cache_file = Path(self.cache_path)
            if not cache_file.exists():
                return
            with open(cache_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            # Build both mappings before assigning either, so a malformed
            # cache cannot leave the learner half-loaded.
            synonyms = {k: set(v) for k, v in data.get('synonyms', {}).items()}
            abbreviations = {k: set(v) for k, v in data.get('abbreviations', {}).items()}
        except Exception as e:
            # Deliberately broad: the learner is constructed at import time
            # and a bad cache file must not prevent the module from loading.
            logger.warning(f"Could not load term cache: {e}")
            return
        self.learned_synonyms = defaultdict(set, synonyms)
        self.learned_abbreviations = defaultdict(set, abbreviations)
        logger.info(f"Loaded {len(self.learned_synonyms)} learned synonyms from cache")

    def _save_cache(self):
        """Save learned terms to cache.

        Best effort: failures are logged as warnings, never raised.
        """
        try:
            cache_file = Path(self.cache_path)
            cache_file.parent.mkdir(parents=True, exist_ok=True)
            # Sets are written as sorted lists so the file content is stable
            # from run to run.
            data = {
                'synonyms': {k: sorted(v) for k, v in self.learned_synonyms.items()},
                'abbreviations': {k: sorted(v) for k, v in self.learned_abbreviations.items()}
            }
            with open(cache_file, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=2)
            logger.info("Saved learned terms to cache")
        except Exception as e:
            logger.warning(f"Could not save term cache: {e}")

    def learn_from_documents(self, documents: List[Dict[str, str]]):
        """
        Learn term variations from a corpus of documents.

        Identifies patterns like:
        - "X (Y)" -> Y is abbreviation of X
        - "X, also known as Y" -> X and Y are synonyms
        and counts which terms appear near each other.

        Args:
            documents: Dicts whose 'content' value holds the document text.
                Documents with missing, empty or non-string content are
                skipped.

        The cache file is rewritten once after all documents are processed.
        """
        for doc in documents:
            content = doc.get('content')
            if not isinstance(content, str) or not content:
                continue
            self._extract_abbreviation_patterns(content)
            self._extract_synonym_patterns(content)
            self._build_cooccurrence(content)
        self._save_cache()

    def _extract_abbreviation_patterns(self, text: str):
        """Extract abbreviations from patterns like 'Full Term (ABBR)'.

        Both sides are lowercased before being stored in
        ``learned_abbreviations`` (abbreviation -> set of full terms).
        """
        for match in self._ABBREVIATION_PATTERN.finditer(text):
            full_term = match.group(1).strip().lower()
            abbrev = match.group(2).strip().lower()
            # Validate: the abbreviation must have at least two characters
            # and be shorter than the full term. Whether its letters are the
            # initials of the full term is NOT checked.
            if 2 <= len(abbrev) < len(full_term):
                self.learned_abbreviations[abbrev].add(full_term)
                logger.debug(f"Learned: {abbrev} -> {full_term}")

    def _extract_synonym_patterns(self, text: str):
        """Extract synonyms from patterns like 'X, also known as Y' or 'X (Y)'.

        Matching is done on the lowercased text. Each accepted pair is
        recorded in both directions in ``learned_synonyms``.
        """
        lowered = text.lower()  # invariant across patterns, computed once
        for pattern in self._SYNONYM_PATTERNS:
            for match in pattern.finditer(lowered):
                term1 = match.group(1).strip()
                term2 = match.group(2).strip()
                # A term is not a synonym of itself; recording it would only
                # add noise to the cache.
                if term1 == term2:
                    continue
                # Validate: both should be reasonable length
                if 3 <= len(term1) <= 50 and 3 <= len(term2) <= 50:
                    self.learned_synonyms[term1].add(term2)
                    self.learned_synonyms[term2].add(term1)

    def _build_cooccurrence(self, text: str):
        """Build co-occurrence matrix for terms.

        For every extracted term, increments the count of each other term
        position within ``_COOCCURRENCE_WINDOW`` positions on either side.
        """
        # Extract medical terms (simplified)
        terms = self._TERM_PATTERN.findall(text.lower())
        window = self._COOCCURRENCE_WINDOW
        total = len(terms)
        for i, term in enumerate(terms):
            neighbours = self.term_cooccurrence[term]
            for j in range(max(0, i - window), min(total, i + window + 1)):
                if i != j:
                    neighbours[terms[j]] += 1

    def get_related_terms(self, term: str, threshold: int = 3) -> Set[str]:
        """Get terms that frequently co-occur with the given term.

        Args:
            term: Term to look up (case-insensitive).
            threshold: Minimum co-occurrence count for a term to be returned.

        Returns:
            Set of co-occurring terms; empty if the term was never seen.
        """
        # .get() avoids inserting an empty entry into the defaultdict.
        counts = self.term_cooccurrence.get(term.lower(), {})
        return {other for other, count in counts.items() if count >= threshold}
# Global learner instance shared by the query helpers in this module.
# It is constructed at import time with the default cache path
# ("data/medical_terms_cache.json"); the constructor calls _load_cache(),
# which reads that file if it exists.
_terminology_learner = MedicalTerminologyLearner()
| # ============================================================================ | |
| # QUERY NORMALIZATION AND EXPANSION FUNCTIONS | |
| # ============================================================================ | |
def normalize_query(query: str) -> str:
    """
    Return a canonical form of *query* for matching.

    The result is lowercased, has en/em dashes replaced by a plain hyphen,
    and has every run of whitespace collapsed to a single space with the
    ends trimmed.
    """
    # Lowercase first, then map typographic dashes onto '-'.
    dashes_unified = re.sub(r'[–—]', '-', query.lower())
    # Collapse whitespace runs, then drop leading/trailing space.
    single_spaced = re.sub(r'\s+', ' ', dashes_unified)
    return single_spaced.strip()
def expand_abbreviations(text: str, context: Optional[str] = None) -> List[str]:
    """
    Expand abbreviations in text to their full forms.

    Learned abbreviations (from the module-level learner) are applied first,
    then the predefined MEDICAL_ABBREVIATIONS. Both are matched
    case-insensitively as whole words, so an abbreviation such as "os" is not
    expanded inside a longer word such as "dose".

    Args:
        text: Text that may contain abbreviations.
        context: Currently unused; accepted for interface compatibility.
            No context-based disambiguation is performed.

    Returns:
        The original text followed by one lowercased variant per
        (abbreviation, full form) pair that matched, without duplicates and
        in the order they were produced. Each variant expands a single
        abbreviation (all of its occurrences).
    """
    text_lower = text.lower()
    expansions = [text]

    def _collect(abbrev: str, full_forms) -> None:
        """Append one expanded variant per full form if abbrev occurs as a word."""
        if not abbrev:
            # An empty key (e.g. from a hand-edited cache) would match at
            # every word boundary.
            return
        pattern = re.compile(rf'\b{re.escape(abbrev)}\b')
        if not pattern.search(text_lower):
            return
        for full_form in full_forms:
            # A callable replacement inserts the full form literally, so
            # backslashes or group references in it are never interpreted.
            expanded = pattern.sub(lambda _match, _form=full_form: _form, text_lower)
            if expanded != text_lower:
                expansions.append(expanded)

    # Check learned abbreviations first; their full forms are stored in sets,
    # so sort them to keep the output order reproducible.
    for abbrev, full_forms in _terminology_learner.learned_abbreviations.items():
        _collect(abbrev, sorted(full_forms))
    # Check predefined abbreviations
    for abbrev, full_forms in MEDICAL_ABBREVIATIONS.items():
        _collect(abbrev, full_forms)

    # Remove duplicates while preserving order
    return list(dict.fromkeys(expansions))
def get_synonyms(term: str) -> Set[str]:
    """
    Collect every known synonym of *term* (case-insensitive).

    Sources, combined into one set:
    - the MEDICAL_SYNONYMS entry keyed by the term, if any
    - every MEDICAL_SYNONYMS entry that lists the term as an alternate
      (the entry's key and all of its alternates are included)
    - synonyms learned by the module-level learner

    The term itself is never part of the result.
    """
    needle = term.lower()
    # Direct lookup in the predefined table.
    found = set(MEDICAL_SYNONYMS.get(needle, ()))
    # Reverse lookup: the term appears as somebody else's alternate.
    for canonical, alternates in MEDICAL_SYNONYMS.items():
        if needle in alternates:
            found.add(canonical)
            found.update(alternates)
    # Learned synonyms; .get() does not create an entry in the defaultdict.
    found.update(_terminology_learner.learned_synonyms.get(needle, ()))
    found.discard(needle)
    return found
def get_spelling_variations(term: str) -> Set[str]:
    """
    Return the alternative regional spellings of *term* (case-insensitive).

    SPELLING_VARIATIONS is consulted in both directions: the term may be a
    key, or it may be listed among another key's variants. The term itself
    is excluded from the result.
    """
    needle = term.lower()
    # Forward direction: term is a key.
    found = set(SPELLING_VARIATIONS.get(needle, ()))
    # Reverse direction: term is one of the listed variants.
    for base_spelling, variants in SPELLING_VARIATIONS.items():
        if needle in variants:
            found.add(base_spelling)
            found.update(variants)
    found.discard(needle)
    return found
def extract_medical_entities(text: str) -> List[Tuple[str, str]]:
    """
    Find known medical entities in *text* (case-insensitive).

    Returns:
        A list of (entity, type) tuples. Predefined abbreviations found as
        whole words come first with type 'abbreviation', followed by
        MEDICAL_SYNONYMS keys found as substrings with type 'medical_term'.
        Each group follows the order of its source mapping.
    """
    haystack = text.lower()
    # Abbreviations must stand alone as words.
    abbreviation_hits = [
        (abbrev, 'abbreviation')
        for abbrev in MEDICAL_ABBREVIATIONS
        if re.search(rf'\b{re.escape(abbrev)}\b', haystack)
    ]
    # Canonical terms are matched as plain substrings.
    term_hits = [
        (term, 'medical_term')
        for term in MEDICAL_SYNONYMS
        if term in haystack
    ]
    return abbreviation_hits + term_hits
def is_medical_abbreviation(text: str) -> bool:
    """Tell whether *text* is a predefined or learned medical abbreviation.

    The comparison ignores case and surrounding whitespace.
    """
    candidate = text.lower().strip()
    if candidate in MEDICAL_ABBREVIATIONS:
        return True
    return candidate in _terminology_learner.learned_abbreviations
def get_abbreviation_expansion(abbrev: str) -> List[str]:
    """
    Get all possible expansions for an abbreviation.

    Args:
        abbrev: Abbreviation to look up; case and surrounding whitespace
            are ignored.

    Returns:
        Predefined expansions (in their declared order) followed by learned
        expansions (sorted, because they are stored in a set). An expansion
        known from both sources is returned only once. Empty if the
        abbreviation is unknown.
    """
    key = abbrev.lower().strip()
    predefined = MEDICAL_ABBREVIATIONS.get(key, [])
    # .get() does not create an entry in the learner's defaultdict.
    learned = sorted(_terminology_learner.learned_abbreviations.get(key, ()))
    # dict.fromkeys drops duplicates while keeping first-seen order.
    return list(dict.fromkeys([*predefined, *learned]))
def expand_query_with_variations(query: str, max_variations: int = 5) -> List[str]:
    """
    Generate query variations by expanding abbreviations, adding synonyms,
    and including spelling variations.

    Candidates are produced in this order and then de-duplicated:
    the original query, the normalized query and its abbreviation
    expansions, single-word synonym substitutions, single-word spelling
    substitutions, and phrase-level synonym substitutions.

    Args:
        query: Original query string
        max_variations: Maximum number of variations to generate

    Returns:
        List of query variations including the original. The original query
        is always the first element.
    """
    variations = [query]
    query_lower = normalize_query(query)

    # 1. Expand abbreviations
    variations.extend(expand_abbreviations(query_lower))

    # 2. Add synonym variations
    words = query_lower.split()
    for i, word in enumerate(words):
        # Synonyms come back as a set; sort before slicing so the two that
        # are kept do not depend on hash ordering and stay the same from
        # run to run.
        for syn in sorted(get_synonyms(word))[:2]:  # Limit to 2 synonyms per word
            variations.append(' '.join(words[:i] + [syn] + words[i + 1:]))

    # 3. Add spelling variations
    for word in words:
        for var in sorted(get_spelling_variations(word)):
            # Substitute whole tokens only, so the word is not rewritten
            # where it merely occurs inside a longer word.
            variations.append(' '.join(var if w == word else w for w in words))

    # 4. Add multi-word phrase variations
    for term, synonyms in MEDICAL_SYNONYMS.items():
        if term in query_lower:
            for syn in synonyms[:2]:
                variations.append(query_lower.replace(term, syn))

    # Remove duplicates and limit
    seen = set()
    unique_variations = []
    for var in variations:
        if var not in seen:
            seen.add(var)
            unique_variations.append(var)
            if len(unique_variations) >= max_variations:
                break
    return unique_variations
def learn_from_corpus(documents: List[Dict[str, str]]) -> None:
    """
    Learn medical term variations from a corpus of documents.

    Delegates to the module-level learner's ``learn_from_documents``, which
    reads each document's 'content' value and then saves the learned terms
    to the learner's cache file.

    Should be called during system initialization.

    Args:
        documents: Document dicts; the text is taken from the 'content' key.
    """
    _terminology_learner.learn_from_documents(documents)
def get_terminology_learner() -> MedicalTerminologyLearner:
    """Get the global terminology learner instance.

    Returns the same module-level object on every call, i.e. the one used by
    the abbreviation and synonym helpers in this module.
    """
    return _terminology_learner