# Lung-Cancer-AI-Advisor / core / medical_terminology.py
# Provenance note (from the hosting page; author moazx, commit ddc9c77):
#   "Enhance API security and functionality by adding authentication middleware
#   and session management. Updated app.py to include the new auth router and
#   integrated authentication checks for protected endpoints. Modified
#   requirements.txt to include necessary libraries for session handling.
#   Updated .env.example to include authentication credentials. Improved
#   retrieval functions with query expansion for better medical term matching
#   and enriched context in responses."
"""
Medical Terminology Module with Dynamic Learning
This module provides intelligent handling of medical linguistic variability including:
- Synonyms and alternate terms
- Abbreviations and acronyms (with context awareness)
- Regional spelling variations (US/UK/International)
- Specialty-specific terminology
- Dynamic learning from corpus
"""
import re
import json
from typing import List, Dict, Set, Tuple, Optional
from collections import defaultdict
from pathlib import Path
from .config import logger
# ============================================================================
# CORE MEDICAL TERMINOLOGY MAPPINGS
# ============================================================================
# Common medical abbreviations with context-aware expansions
# Maps a lowercase abbreviation -> list of lowercase full-form expansions.
# Keys are matched as whole words, case-insensitively, by
# expand_abbreviations() and extract_medical_entities(); each listed full
# form produces one query variant during expansion.
MEDICAL_ABBREVIATIONS: Dict[str, List[str]] = {
    # Cancer Types
    "nsclc": ["non-small cell lung cancer", "non small cell lung cancer"],
    "sclc": ["small cell lung cancer"],
    "nscl": ["non-small cell lung"],
    "alk": ["anaplastic lymphoma kinase"],
    "egfr": ["epidermal growth factor receptor"],
    "ros1": ["ros proto-oncogene 1", "c-ros oncogene 1"],
    "braf": ["b-raf proto-oncogene"],
    "kras": ["kirsten rat sarcoma viral oncogene"],
    "met": ["mesenchymal epithelial transition", "met proto-oncogene"],
    "her2": ["human epidermal growth factor receptor 2"],
    "ret": ["ret proto-oncogene", "rearranged during transfection"],
    "ntrk": ["neurotrophic tyrosine receptor kinase", "neurotrophic tropomyosin receptor kinase"],
    # Treatment & Procedures
    "chemo": ["chemotherapy"],
    "rt": ["radiation therapy", "radiotherapy"],
    "sbrt": ["stereotactic body radiation therapy", "stereotactic body radiotherapy"],
    "imrt": ["intensity-modulated radiation therapy"],
    "ct": ["computed tomography", "ct scan"],
    "pet": ["positron emission tomography"],
    "mri": ["magnetic resonance imaging"],
    "io": ["immunotherapy", "immune-oncology"],
    "ici": ["immune checkpoint inhibitor", "immune checkpoint inhibitors"],
    "tki": ["tyrosine kinase inhibitor", "tyrosine kinase inhibitors"],
    "pd-1": ["programmed death-1", "programmed cell death protein 1"],
    "pd-l1": ["programmed death-ligand 1"],
    "ctla-4": ["cytotoxic t-lymphocyte-associated protein 4"],
    # Clinical Terms
    "os": ["overall survival"],
    "pfs": ["progression-free survival"],
    "dfs": ["disease-free survival"],
    "orr": ["overall response rate", "objective response rate"],
    "cr": ["complete response"],
    "pr": ["partial response"],
    "sd": ["stable disease"],
    "pd": ["progressive disease"],
    "ecog": ["eastern cooperative oncology group"],
    "ps": ["performance status"],
    "aes": ["adverse events"],
    "sae": ["serious adverse event", "serious adverse events"],
    "qol": ["quality of life"],
    # Staging
    "tnm": ["tumor node metastasis", "tnm staging"],
    "ajcc": ["american joint committee on cancer"],
    # Drugs (common abbreviations)
    "cddp": ["cisplatin"],
    "cbdca": ["carboplatin"],
    "pem": ["pemetrexed"],
    "gem": ["gemcitabine"],
    "doc": ["docetaxel"],
    "pac": ["paclitaxel"],
    "vin": ["vinorelbine"],
    "eto": ["etoposide"],
}
# Synonym mappings for medical terms
# Maps a canonical medical term -> list of synonyms / alternate phrasings.
# get_synonyms() resolves this table in BOTH directions (key -> list, and
# list member -> key plus siblings), so each synonym cluster is stored once.
# expand_query_with_variations() also substitutes multi-word keys directly.
MEDICAL_SYNONYMS: Dict[str, List[str]] = {
    # Cancer terminology
    "lung cancer": ["pulmonary cancer", "lung carcinoma", "pulmonary carcinoma", "bronchogenic carcinoma"],
    "non-small cell lung cancer": ["nsclc", "non small cell lung cancer", "non-small-cell lung cancer"],
    "small cell lung cancer": ["sclc", "small-cell lung cancer", "oat cell carcinoma"],
    "adenocarcinoma": ["adeno", "glandular cancer"],
    "squamous cell carcinoma": ["squamous carcinoma", "scc", "epidermoid carcinoma"],
    "metastatic": ["advanced", "stage iv", "stage 4", "metastases", "mets"],
    "locally advanced": ["stage iii", "stage 3", "regional spread"],
    "early stage": ["stage i", "stage ii", "stage 1", "stage 2", "localized"],
    # Treatment terms
    "chemotherapy": ["chemo", "cytotoxic therapy", "systemic therapy"],
    "radiation therapy": ["radiotherapy", "rt", "radiation treatment", "irradiation"],
    "immunotherapy": ["immune therapy", "io", "immune-oncology", "checkpoint inhibitor"],
    "targeted therapy": ["molecular therapy", "precision medicine", "targeted treatment"],
    "surgery": ["surgical resection", "resection", "operative treatment", "surgical intervention"],
    "lobectomy": ["lobe resection", "pulmonary lobectomy"],
    "pneumonectomy": ["lung removal", "complete lung resection"],
    "wedge resection": ["segmentectomy", "limited resection"],
    # Molecular markers
    "mutation": ["alteration", "variant", "genetic change", "molecular alteration"],
    "biomarker": ["molecular marker", "tumor marker", "genetic marker"],
    "driver mutation": ["oncogenic driver", "actionable mutation", "targetable mutation"],
    # Clinical outcomes
    "survival": ["survival rate", "survival outcome"],
    "response": ["treatment response", "tumor response", "clinical response"],
    "progression": ["disease progression", "tumor progression", "cancer progression"],
    "recurrence": ["relapse", "disease recurrence", "tumor recurrence"],
    # NOTE(review): "remission" -> "response" is a loose equivalence kept for
    # retrieval recall; it makes "remission" and "response" mutual synonyms.
    "remission": ["response", "disease control"],
    # Side effects
    "adverse event": ["side effect", "adverse reaction", "toxicity", "adverse drug reaction"],
    "neutropenia": ["low white blood cell count", "low neutrophil count"],
    "anemia": ["low red blood cell count", "low hemoglobin"],
    "thrombocytopenia": ["low platelet count"],
    "nausea": ["feeling sick", "queasiness"],
    "fatigue": ["tiredness", "exhaustion", "weakness"],
    # Diagnostic terms
    "biopsy": ["tissue sample", "tissue sampling"],
    "imaging": ["radiology", "diagnostic imaging", "medical imaging"],
    "screening": ["early detection", "cancer screening"],
}
# Regional spelling variations (US/UK/International)
# Regional spelling variations, US spelling -> list of UK/International forms.
# get_spelling_variations() resolves both directions, so only one direction is
# stored. Terms spelled identically in all regions (e.g. "lymphoma") are simply
# omitted — the old self-mapping entry was a no-op that every consumer had to
# discard again.
SPELLING_VARIATIONS: Dict[str, List[str]] = {
    # US -> UK/International variants
    "tumor": ["tumour"],
    "tumors": ["tumours"],
    # NOTE(review): "metastases" is the plural of "metastasis", not a regional
    # spelling; kept deliberately for query-expansion recall — confirm intended.
    "metastasis": ["metastases"],
    "anemia": ["anaemia"],
    "edema": ["oedema"],
    "esophageal": ["oesophageal"],
    "pediatric": ["paediatric"],
    "hematology": ["haematology"],
    "hemoglobin": ["haemoglobin"],
    "leukemia": ["leukaemia"],
    "optimize": ["optimise"],
    "randomized": ["randomised"],
    "analyze": ["analyse"],
    "center": ["centre"],
    "fiber": ["fibre"],
}
# Context-dependent abbreviations (require disambiguation)
# Ambiguous abbreviations whose expansion depends on surrounding context,
# keyed as: abbreviation -> {sense_id -> phrasings for that sense}.
# NOTE(review): not referenced by any function in this module's visible code —
# presumably consumed by a disambiguation step elsewhere (expand_abbreviations
# accepts a `context` arg it does not yet use). Confirm before removing.
CONTEXT_DEPENDENT_ABBREVS: Dict[str, Dict[str, List[str]]] = {
    "ca": {
        "cancer": ["cancer", "carcinoma"],
        "calcium": ["calcium"],
    },
    "cr": {
        "complete_response": ["complete response", "complete remission"],
        "creatinine": ["creatinine"],
    },
    "pt": {
        "patient": ["patient"],
        "prothrombin_time": ["prothrombin time"],
    },
    "rt": {
        "radiation_therapy": ["radiation therapy", "radiotherapy"],
        "reverse_transcriptase": ["reverse transcriptase"],
    },
}
# ============================================================================
# DYNAMIC LEARNING COMPONENTS
# ============================================================================
class MedicalTerminologyLearner:
    """
    Dynamically learns medical term variations from a corpus.

    Three signals are mined from raw document text:

    * ``Full Term (ABBR)`` / ``Full Term [ABBR]`` -> abbreviation mappings
    * ``X, also known as Y`` / ``X, also called Y`` / ``X (Y)`` -> synonym pairs
    * windowed term co-occurrence counts -> related-term suggestions

    Learned synonyms and abbreviations are persisted to a JSON cache so they
    survive restarts; co-occurrence counts are kept in memory only.
    """

    def __init__(self, cache_path: Optional[str] = None):
        """
        Args:
            cache_path: Location of the JSON cache file. Defaults to
                "data/medical_terms_cache.json" (relative to the CWD).
        """
        self.cache_path = cache_path or "data/medical_terms_cache.json"
        # term -> {co-occurring term -> count}; accumulated per corpus pass.
        self.term_cooccurrence = defaultdict(lambda: defaultdict(int))
        # term -> set of synonyms (stored symmetrically, both directions).
        self.learned_synonyms = defaultdict(set)
        # abbreviation -> set of full forms.
        self.learned_abbreviations = defaultdict(set)
        # Reserved for future context mining; never populated in this module.
        self.context_patterns = defaultdict(list)
        self._load_cache()

    def _load_cache(self):
        """Restore learned synonyms/abbreviations from the JSON cache, if it exists."""
        try:
            cache_file = Path(self.cache_path)
            if cache_file.exists():
                with open(cache_file, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                # JSON stores lists; convert back to sets inside defaultdicts.
                self.learned_synonyms = defaultdict(
                    set, {k: set(v) for k, v in data.get('synonyms', {}).items()})
                self.learned_abbreviations = defaultdict(
                    set, {k: set(v) for k, v in data.get('abbreviations', {}).items()})
                logger.info(f"Loaded {len(self.learned_synonyms)} learned synonyms from cache")
        except Exception as e:
            # Best effort: a missing or corrupt cache must never block startup.
            logger.warning(f"Could not load term cache: {e}")

    def _save_cache(self):
        """Persist learned synonyms/abbreviations to the JSON cache (best effort)."""
        try:
            cache_file = Path(self.cache_path)
            cache_file.parent.mkdir(parents=True, exist_ok=True)
            data = {
                'synonyms': {k: list(v) for k, v in self.learned_synonyms.items()},
                'abbreviations': {k: list(v) for k, v in self.learned_abbreviations.items()},
            }
            with open(cache_file, 'w', encoding='utf-8') as f:
                # ensure_ascii=False keeps accented medical terms human-readable.
                json.dump(data, f, indent=2, ensure_ascii=False)
            logger.info("Saved learned terms to cache")
        except Exception as e:
            logger.warning(f"Could not save term cache: {e}")

    def learn_from_documents(self, documents: List[Dict[str, str]]):
        """
        Learn term variations from a corpus and persist the results.

        Args:
            documents: Dicts with a 'content' key holding raw text; a missing
                or empty 'content' is skipped gracefully.
        """
        for doc in documents:
            content = doc.get('content', '')
            self._extract_abbreviation_patterns(content)
            self._extract_synonym_patterns(content)
            self._build_cooccurrence(content)
        self._save_cache()

    def _extract_abbreviation_patterns(self, text: str):
        """Learn abbreviations from 'Full Term (ABBR)' or 'Full Term [ABBR]' patterns."""
        pattern = r'([A-Z][a-z]+(?:\s+[A-Z]?[a-z]+)*)\s*[\(\[]([A-Z]{2,}|[A-Z][a-z]*(?:-[A-Z][a-z]*)*)[\)\]]'
        for match in re.finditer(pattern, text):
            full_term = match.group(1).strip().lower()
            abbrev = match.group(2).strip().lower()
            # Validate: the abbreviation must be shorter, at least 2 chars, and
            # share the full term's initial letter. (The initials check was
            # previously documented but never enforced, so unrelated pairs such
            # as "Cancer (RT)" were learned as abbreviations.)
            if 2 <= len(abbrev) < len(full_term) and abbrev[0] == full_term[0]:
                self.learned_abbreviations[abbrev].add(full_term)
                logger.debug(f"Learned: {abbrev} -> {full_term}")

    def _extract_synonym_patterns(self, text: str):
        """Learn synonym pairs from 'also known as' / 'also called' / parenthetical patterns."""
        patterns = [
            r'([a-z\s\-]+),?\s+also\s+known\s+as\s+([a-z\s\-]+)',
            r'([a-z\s\-]+),?\s+also\s+called\s+([a-z\s\-]+)',
            r'([a-z\s\-]+)\s+\(([a-z\s\-]+)\)',
        ]
        lowered = text.lower()
        for pattern in patterns:
            for match in re.finditer(pattern, lowered):
                term1 = match.group(1).strip()
                term2 = match.group(2).strip()
                # Filter degenerate captures (too short/long to be real terms).
                if 3 <= len(term1) <= 50 and 3 <= len(term2) <= 50:
                    # Store symmetrically so lookup works from either side.
                    self.learned_synonyms[term1].add(term2)
                    self.learned_synonyms[term2].add(term1)

    def _build_cooccurrence(self, text: str):
        """Accumulate windowed co-occurrence counts between extracted terms."""
        # Simplified term extraction; the greedy {0,3} quantifier merges up to
        # four consecutive lowercase words into one multi-word "term".
        terms = re.findall(r'\b[a-z]{3,}(?:\s+[a-z]{3,}){0,3}\b', text.lower())
        window_size = 10
        for i, term in enumerate(terms):
            for j in range(max(0, i - window_size), min(len(terms), i + window_size + 1)):
                if i != j:
                    self.term_cooccurrence[term][terms[j]] += 1

    def get_related_terms(self, term: str, threshold: int = 3) -> Set[str]:
        """
        Return terms that co-occur with *term* at least *threshold* times.

        Args:
            term: Term to look up (case-insensitive).
            threshold: Minimum co-occurrence count to count as related.
        """
        term_lower = term.lower()
        related = set()
        if term_lower in self.term_cooccurrence:
            for related_term, count in self.term_cooccurrence[term_lower].items():
                if count >= threshold:
                    related.add(related_term)
        return related
# Module-level singleton learner shared by every helper below; created at
# import time (which loads any existing JSON cache) and extended later via
# learn_from_corpus().
_terminology_learner = MedicalTerminologyLearner()
# ============================================================================
# QUERY NORMALIZATION AND EXPANSION FUNCTIONS
# ============================================================================
def normalize_query(query: str) -> str:
    """
    Normalize a free-text query for matching.

    Lowercases the text, maps en/em dashes to plain hyphens, and collapses
    every run of whitespace to a single space (trimming both ends).
    """
    lowered = query.lower()
    dashed = lowered.replace("–", "-").replace("—", "-")
    return " ".join(dashed.split())
def expand_abbreviations(text: str, context: Optional[str] = None) -> List[str]:
    """
    Return candidate expansions of *text* with known abbreviations spelled out.

    Both the corpus-learned and the predefined abbreviation tables are applied.
    All matching is whole-word and case-insensitive: previously the learned
    table used plain substring replacement, so a learned abbreviation such as
    "rt" would fire inside unrelated words like "heart" and corrupt the text.

    Args:
        text: Input text that may contain abbreviations.
        context: Reserved for context-based disambiguation; currently unused.

    Returns:
        De-duplicated list of variants, beginning with the original text.
    """
    expansions = [text]
    text_lower = text.lower()

    def _apply(table) -> None:
        # For each whole-word abbreviation hit, append one variant per full form.
        for abbrev, full_forms in table.items():
            word_pattern = rf'\b{re.escape(abbrev)}\b'
            if re.search(word_pattern, text_lower):
                for full_form in full_forms:
                    expanded = re.sub(word_pattern, full_form, text_lower)
                    if expanded != text_lower:
                        expansions.append(expanded)

    # Learned abbreviations first, then predefined — same order as before.
    _apply(_terminology_learner.learned_abbreviations)
    _apply(MEDICAL_ABBREVIATIONS)

    # De-duplicate while preserving insertion order.
    return list(dict.fromkeys(expansions))
def get_synonyms(term: str) -> Set[str]:
    """
    Collect every known synonym of *term*.

    Combines the predefined MEDICAL_SYNONYMS table (resolved in both
    directions) with corpus-learned synonyms; the term itself is excluded.
    """
    key = term.lower()
    found: Set[str] = set(MEDICAL_SYNONYMS.get(key, ()))
    # Reverse direction: if the term appears inside some entry's synonym
    # list, that entry's key and its whole list are synonyms too.
    for canonical, alternates in MEDICAL_SYNONYMS.items():
        if key in alternates:
            found.add(canonical)
            found.update(alternates)
    found |= set(_terminology_learner.learned_synonyms.get(key, ()))
    found.discard(key)
    return found
def get_spelling_variations(term: str) -> Set[str]:
    """Return US/UK/International spelling variants of *term*, excluding the term itself."""
    key = term.lower()
    found = set(SPELLING_VARIATIONS.get(key, ()))
    # Reverse lookup: a UK variant maps back to its US key and siblings.
    for us_form, variants in SPELLING_VARIATIONS.items():
        if key in variants:
            found.add(us_form)
            found.update(variants)
    found.discard(key)
    return found
def extract_medical_entities(text: str) -> List[Tuple[str, str]]:
    """
    Scan *text* for known medical vocabulary.

    Abbreviations are matched as whole words; multi-word terms are matched
    as plain (case-insensitive) substrings. Returns (entity, type) pairs
    with all abbreviations listed before medical terms.
    """
    text_lower = text.lower()
    found: List[Tuple[str, str]] = []
    for abbrev in MEDICAL_ABBREVIATIONS:
        if re.search(rf'\b{re.escape(abbrev)}\b', text_lower):
            found.append((abbrev, 'abbreviation'))
    found.extend(
        (term, 'medical_term') for term in MEDICAL_SYNONYMS if term in text_lower
    )
    return found
def is_medical_abbreviation(text: str) -> bool:
    """Report whether *text* (trimmed, case-insensitive) is a known abbreviation."""
    candidate = text.lower().strip()
    if candidate in MEDICAL_ABBREVIATIONS:
        return True
    return candidate in _terminology_learner.learned_abbreviations
def get_abbreviation_expansion(abbrev: str) -> List[str]:
    """
    List every known full form of *abbrev*.

    Predefined expansions come first, followed by any corpus-learned ones;
    an unknown abbreviation yields an empty list.
    """
    key = abbrev.lower().strip()
    expansions: List[str] = []
    expansions.extend(MEDICAL_ABBREVIATIONS.get(key, []))
    expansions.extend(_terminology_learner.learned_abbreviations.get(key, []))
    return expansions
def expand_query_with_variations(query: str, max_variations: int = 5) -> List[str]:
    """
    Generate retrieval-friendly variations of *query*.

    Variations come from, in order: abbreviation expansion, per-word synonyms,
    regional spelling variants, and multi-word phrase synonyms. All
    substitutions are now whole-word / whole-phrase bounded — the previous
    bare ``str.replace`` could corrupt words (e.g. a variant of "center"
    also rewrote "epicenter").

    Args:
        query: Original query string.
        max_variations: Maximum number of variations to return.

    Returns:
        Up to *max_variations* unique strings, the original query first.
    """
    variations = [query]
    query_lower = normalize_query(query)

    # 1. Expand abbreviations (includes the normalized query itself).
    variations.extend(expand_abbreviations(query_lower))

    words = query_lower.split()

    # 2. Single-word synonym substitutions (at most 2 synonyms per word).
    for i, word in enumerate(words):
        for syn in list(get_synonyms(word))[:2]:
            variations.append(' '.join(words[:i] + [syn] + words[i + 1:]))

    # 3. Regional spelling variants, substituted positionally so only the
    #    exact word at that position changes.
    for i, word in enumerate(words):
        for var in get_spelling_variations(word):
            variations.append(' '.join(words[:i] + [var] + words[i + 1:]))

    # 4. Multi-word phrase synonyms, bounded so phrases match whole words only.
    for term, synonyms in MEDICAL_SYNONYMS.items():
        phrase = rf'\b{re.escape(term)}\b'
        if re.search(phrase, query_lower):
            for syn in list(synonyms)[:2]:
                variations.append(re.sub(phrase, syn, query_lower))

    # De-duplicate preserving order, truncated to the requested limit.
    seen = set()
    unique_variations: List[str] = []
    for var in variations:
        if var not in seen:
            seen.add(var)
            unique_variations.append(var)
            if len(unique_variations) >= max_variations:
                break
    return unique_variations
def learn_from_corpus(documents: List[Dict[str, str]]):
    """
    Feed a document corpus to the module-wide terminology learner.

    Intended to run once during system initialization; the learner persists
    what it extracts to its JSON cache file as a side effect.

    Args:
        documents: Dicts with a 'content' key holding raw document text.
    """
    _terminology_learner.learn_from_documents(documents)
def get_terminology_learner() -> MedicalTerminologyLearner:
    """Expose the module-wide singleton learner (useful for inspection and tests)."""
    return _terminology_learner