# Lung-Cancer-AI-Advisor / core / medical_terminology.py
# Provenance note (from the hosting page; author moazx, commit ddc9c77):
#   "Enhance API security and functionality by adding authentication middleware
#   and session management. Updated app.py to include the new auth router and
#   integrated authentication checks for protected endpoints. Modified
#   requirements.txt to include necessary libraries for session handling.
#   Updated .env.example to include authentication credentials. Improved
#   retrieval functions with query expansion for better medical term matching
#   and enriched context in responses."
"""
Medical Terminology Module with Dynamic Learning
This module provides intelligent handling of medical linguistic variability including:
- Synonyms and alternate terms
- Abbreviations and acronyms (with context awareness)
- Regional spelling variations (US/UK/International)
- Specialty-specific terminology
- Dynamic learning from corpus
"""
import re
import json
from typing import List, Dict, Set, Tuple, Optional
from collections import defaultdict
from pathlib import Path
from .config import logger
# ============================================================================
# CORE MEDICAL TERMINOLOGY MAPPINGS
# ============================================================================
# Common medical abbreviations with context-aware expansions
# Maps a lowercase abbreviation -> list of lowercase full-form expansions.
# Keys are matched as whole words, case-insensitively, by
# expand_abbreviations() and extract_medical_entities(); each listed full
# form produces one query variant during expansion.
MEDICAL_ABBREVIATIONS: Dict[str, List[str]] = {
    # Cancer Types
    "nsclc": ["non-small cell lung cancer", "non small cell lung cancer"],
    "sclc": ["small cell lung cancer"],
    "nscl": ["non-small cell lung"],
    "alk": ["anaplastic lymphoma kinase"],
    "egfr": ["epidermal growth factor receptor"],
    "ros1": ["ros proto-oncogene 1", "c-ros oncogene 1"],
    "braf": ["b-raf proto-oncogene"],
    "kras": ["kirsten rat sarcoma viral oncogene"],
    "met": ["mesenchymal epithelial transition", "met proto-oncogene"],
    "her2": ["human epidermal growth factor receptor 2"],
    "ret": ["ret proto-oncogene", "rearranged during transfection"],
    "ntrk": ["neurotrophic tyrosine receptor kinase", "neurotrophic tropomyosin receptor kinase"],
    # Treatment & Procedures
    "chemo": ["chemotherapy"],
    "rt": ["radiation therapy", "radiotherapy"],
    "sbrt": ["stereotactic body radiation therapy", "stereotactic body radiotherapy"],
    "imrt": ["intensity-modulated radiation therapy"],
    "ct": ["computed tomography", "ct scan"],
    "pet": ["positron emission tomography"],
    "mri": ["magnetic resonance imaging"],
    "io": ["immunotherapy", "immune-oncology"],
    "ici": ["immune checkpoint inhibitor", "immune checkpoint inhibitors"],
    "tki": ["tyrosine kinase inhibitor", "tyrosine kinase inhibitors"],
    "pd-1": ["programmed death-1", "programmed cell death protein 1"],
    "pd-l1": ["programmed death-ligand 1"],
    "ctla-4": ["cytotoxic t-lymphocyte-associated protein 4"],
    # Clinical Terms
    "os": ["overall survival"],
    "pfs": ["progression-free survival"],
    "dfs": ["disease-free survival"],
    "orr": ["overall response rate", "objective response rate"],
    "cr": ["complete response"],
    "pr": ["partial response"],
    "sd": ["stable disease"],
    "pd": ["progressive disease"],
    "ecog": ["eastern cooperative oncology group"],
    "ps": ["performance status"],
    "aes": ["adverse events"],
    "sae": ["serious adverse event", "serious adverse events"],
    "qol": ["quality of life"],
    # Staging
    "tnm": ["tumor node metastasis", "tnm staging"],
    "ajcc": ["american joint committee on cancer"],
    # Drugs (common abbreviations)
    "cddp": ["cisplatin"],
    "cbdca": ["carboplatin"],
    "pem": ["pemetrexed"],
    "gem": ["gemcitabine"],
    "doc": ["docetaxel"],
    "pac": ["paclitaxel"],
    "vin": ["vinorelbine"],
    "eto": ["etoposide"],
}
# Synonym mappings for medical terms
# Maps a canonical medical term -> list of synonyms / alternate phrasings.
# get_synonyms() resolves this table in BOTH directions (key -> list, and
# list member -> key plus siblings), so each synonym cluster is stored once.
# expand_query_with_variations() also substitutes multi-word keys directly.
MEDICAL_SYNONYMS: Dict[str, List[str]] = {
    # Cancer terminology
    "lung cancer": ["pulmonary cancer", "lung carcinoma", "pulmonary carcinoma", "bronchogenic carcinoma"],
    "non-small cell lung cancer": ["nsclc", "non small cell lung cancer", "non-small-cell lung cancer"],
    "small cell lung cancer": ["sclc", "small-cell lung cancer", "oat cell carcinoma"],
    "adenocarcinoma": ["adeno", "glandular cancer"],
    "squamous cell carcinoma": ["squamous carcinoma", "scc", "epidermoid carcinoma"],
    "metastatic": ["advanced", "stage iv", "stage 4", "metastases", "mets"],
    "locally advanced": ["stage iii", "stage 3", "regional spread"],
    "early stage": ["stage i", "stage ii", "stage 1", "stage 2", "localized"],
    # Treatment terms
    "chemotherapy": ["chemo", "cytotoxic therapy", "systemic therapy"],
    "radiation therapy": ["radiotherapy", "rt", "radiation treatment", "irradiation"],
    "immunotherapy": ["immune therapy", "io", "immune-oncology", "checkpoint inhibitor"],
    "targeted therapy": ["molecular therapy", "precision medicine", "targeted treatment"],
    "surgery": ["surgical resection", "resection", "operative treatment", "surgical intervention"],
    "lobectomy": ["lobe resection", "pulmonary lobectomy"],
    "pneumonectomy": ["lung removal", "complete lung resection"],
    "wedge resection": ["segmentectomy", "limited resection"],
    # Molecular markers
    "mutation": ["alteration", "variant", "genetic change", "molecular alteration"],
    "biomarker": ["molecular marker", "tumor marker", "genetic marker"],
    "driver mutation": ["oncogenic driver", "actionable mutation", "targetable mutation"],
    # Clinical outcomes
    "survival": ["survival rate", "survival outcome"],
    "response": ["treatment response", "tumor response", "clinical response"],
    "progression": ["disease progression", "tumor progression", "cancer progression"],
    "recurrence": ["relapse", "disease recurrence", "tumor recurrence"],
    # NOTE(review): "remission" -> "response" is a loose equivalence kept for
    # retrieval recall; it makes "remission" and "response" mutual synonyms.
    "remission": ["response", "disease control"],
    # Side effects
    "adverse event": ["side effect", "adverse reaction", "toxicity", "adverse drug reaction"],
    "neutropenia": ["low white blood cell count", "low neutrophil count"],
    "anemia": ["low red blood cell count", "low hemoglobin"],
    "thrombocytopenia": ["low platelet count"],
    "nausea": ["feeling sick", "queasiness"],
    "fatigue": ["tiredness", "exhaustion", "weakness"],
    # Diagnostic terms
    "biopsy": ["tissue sample", "tissue sampling"],
    "imaging": ["radiology", "diagnostic imaging", "medical imaging"],
    "screening": ["early detection", "cancer screening"],
}
# Regional spelling variations (US/UK/International)
# Regional spelling variations, US spelling -> list of UK/International forms.
# get_spelling_variations() resolves both directions, so only one direction is
# stored. Terms spelled identically in all regions (e.g. "lymphoma") are simply
# omitted — the old self-mapping entry was a no-op that every consumer had to
# discard again.
SPELLING_VARIATIONS: Dict[str, List[str]] = {
    # US -> UK/International variants
    "tumor": ["tumour"],
    "tumors": ["tumours"],
    # NOTE(review): "metastases" is the plural of "metastasis", not a regional
    # spelling; kept deliberately for query-expansion recall — confirm intended.
    "metastasis": ["metastases"],
    "anemia": ["anaemia"],
    "edema": ["oedema"],
    "esophageal": ["oesophageal"],
    "pediatric": ["paediatric"],
    "hematology": ["haematology"],
    "hemoglobin": ["haemoglobin"],
    "leukemia": ["leukaemia"],
    "optimize": ["optimise"],
    "randomized": ["randomised"],
    "analyze": ["analyse"],
    "center": ["centre"],
    "fiber": ["fibre"],
}
# Context-dependent abbreviations (require disambiguation)
# Ambiguous abbreviations whose expansion depends on surrounding context,
# keyed as: abbreviation -> {sense_id -> phrasings for that sense}.
# NOTE(review): not referenced by any function in this module's visible code —
# presumably consumed by a disambiguation step elsewhere (expand_abbreviations
# accepts a `context` arg it does not yet use). Confirm before removing.
CONTEXT_DEPENDENT_ABBREVS: Dict[str, Dict[str, List[str]]] = {
    "ca": {
        "cancer": ["cancer", "carcinoma"],
        "calcium": ["calcium"],
    },
    "cr": {
        "complete_response": ["complete response", "complete remission"],
        "creatinine": ["creatinine"],
    },
    "pt": {
        "patient": ["patient"],
        "prothrombin_time": ["prothrombin time"],
    },
    "rt": {
        "radiation_therapy": ["radiation therapy", "radiotherapy"],
        "reverse_transcriptase": ["reverse transcriptase"],
    },
}
# ============================================================================
# DYNAMIC LEARNING COMPONENTS
# ============================================================================
class MedicalTerminologyLearner:
    """
    Dynamically learns medical term variations from a corpus.

    Three signals are mined from raw document text:

    * ``Full Term (ABBR)`` / ``Full Term [ABBR]`` -> abbreviation mappings
    * ``X, also known as Y`` / ``X, also called Y`` / ``X (Y)`` -> synonym pairs
    * windowed term co-occurrence counts -> related-term suggestions

    Learned synonyms and abbreviations are persisted to a JSON cache so they
    survive restarts; co-occurrence counts are kept in memory only.
    """

    def __init__(self, cache_path: Optional[str] = None):
        """
        Args:
            cache_path: Location of the JSON cache file. Defaults to
                "data/medical_terms_cache.json" (relative to the CWD).
        """
        self.cache_path = cache_path or "data/medical_terms_cache.json"
        # term -> {co-occurring term -> count}; accumulated per corpus pass.
        self.term_cooccurrence = defaultdict(lambda: defaultdict(int))
        # term -> set of synonyms (stored symmetrically, both directions).
        self.learned_synonyms = defaultdict(set)
        # abbreviation -> set of full forms.
        self.learned_abbreviations = defaultdict(set)
        # Reserved for future context mining; never populated in this module.
        self.context_patterns = defaultdict(list)
        self._load_cache()

    def _load_cache(self):
        """Restore learned synonyms/abbreviations from the JSON cache, if it exists."""
        try:
            cache_file = Path(self.cache_path)
            if cache_file.exists():
                with open(cache_file, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                # JSON stores lists; convert back to sets inside defaultdicts.
                self.learned_synonyms = defaultdict(
                    set, {k: set(v) for k, v in data.get('synonyms', {}).items()})
                self.learned_abbreviations = defaultdict(
                    set, {k: set(v) for k, v in data.get('abbreviations', {}).items()})
                logger.info(f"Loaded {len(self.learned_synonyms)} learned synonyms from cache")
        except Exception as e:
            # Best effort: a missing or corrupt cache must never block startup.
            logger.warning(f"Could not load term cache: {e}")

    def _save_cache(self):
        """Persist learned synonyms/abbreviations to the JSON cache (best effort)."""
        try:
            cache_file = Path(self.cache_path)
            cache_file.parent.mkdir(parents=True, exist_ok=True)
            data = {
                'synonyms': {k: list(v) for k, v in self.learned_synonyms.items()},
                'abbreviations': {k: list(v) for k, v in self.learned_abbreviations.items()},
            }
            with open(cache_file, 'w', encoding='utf-8') as f:
                # ensure_ascii=False keeps accented medical terms human-readable.
                json.dump(data, f, indent=2, ensure_ascii=False)
            logger.info("Saved learned terms to cache")
        except Exception as e:
            logger.warning(f"Could not save term cache: {e}")

    def learn_from_documents(self, documents: List[Dict[str, str]]):
        """
        Learn term variations from a corpus and persist the results.

        Args:
            documents: Dicts with a 'content' key holding raw text; a missing
                or empty 'content' is skipped gracefully.
        """
        for doc in documents:
            content = doc.get('content', '')
            self._extract_abbreviation_patterns(content)
            self._extract_synonym_patterns(content)
            self._build_cooccurrence(content)
        self._save_cache()

    def _extract_abbreviation_patterns(self, text: str):
        """Learn abbreviations from 'Full Term (ABBR)' or 'Full Term [ABBR]' patterns."""
        pattern = r'([A-Z][a-z]+(?:\s+[A-Z]?[a-z]+)*)\s*[\(\[]([A-Z]{2,}|[A-Z][a-z]*(?:-[A-Z][a-z]*)*)[\)\]]'
        for match in re.finditer(pattern, text):
            full_term = match.group(1).strip().lower()
            abbrev = match.group(2).strip().lower()
            # Validate: the abbreviation must be shorter, at least 2 chars, and
            # share the full term's initial letter. (The initials check was
            # previously documented but never enforced, so unrelated pairs such
            # as "Cancer (RT)" were learned as abbreviations.)
            if 2 <= len(abbrev) < len(full_term) and abbrev[0] == full_term[0]:
                self.learned_abbreviations[abbrev].add(full_term)
                logger.debug(f"Learned: {abbrev} -> {full_term}")

    def _extract_synonym_patterns(self, text: str):
        """Learn synonym pairs from 'also known as' / 'also called' / parenthetical patterns."""
        patterns = [
            r'([a-z\s\-]+),?\s+also\s+known\s+as\s+([a-z\s\-]+)',
            r'([a-z\s\-]+),?\s+also\s+called\s+([a-z\s\-]+)',
            r'([a-z\s\-]+)\s+\(([a-z\s\-]+)\)',
        ]
        lowered = text.lower()
        for pattern in patterns:
            for match in re.finditer(pattern, lowered):
                term1 = match.group(1).strip()
                term2 = match.group(2).strip()
                # Filter degenerate captures (too short/long to be real terms).
                if 3 <= len(term1) <= 50 and 3 <= len(term2) <= 50:
                    # Store symmetrically so lookup works from either side.
                    self.learned_synonyms[term1].add(term2)
                    self.learned_synonyms[term2].add(term1)

    def _build_cooccurrence(self, text: str):
        """Accumulate windowed co-occurrence counts between extracted terms."""
        # Simplified term extraction; the greedy {0,3} quantifier merges up to
        # four consecutive lowercase words into one multi-word "term".
        terms = re.findall(r'\b[a-z]{3,}(?:\s+[a-z]{3,}){0,3}\b', text.lower())
        window_size = 10
        for i, term in enumerate(terms):
            for j in range(max(0, i - window_size), min(len(terms), i + window_size + 1)):
                if i != j:
                    self.term_cooccurrence[term][terms[j]] += 1

    def get_related_terms(self, term: str, threshold: int = 3) -> Set[str]:
        """
        Return terms that co-occur with *term* at least *threshold* times.

        Args:
            term: Term to look up (case-insensitive).
            threshold: Minimum co-occurrence count to count as related.
        """
        term_lower = term.lower()
        related = set()
        if term_lower in self.term_cooccurrence:
            for related_term, count in self.term_cooccurrence[term_lower].items():
                if count >= threshold:
                    related.add(related_term)
        return related
# Module-level singleton learner shared by every helper below; created at
# import time (which loads any existing JSON cache) and extended later via
# learn_from_corpus().
_terminology_learner = MedicalTerminologyLearner()
# ============================================================================
# QUERY NORMALIZATION AND EXPANSION FUNCTIONS
# ============================================================================
def normalize_query(query: str) -> str:
    """
    Normalize a free-text query for matching.

    Lowercases the text, maps en/em dashes to plain hyphens, and collapses
    every run of whitespace to a single space (trimming both ends).
    """
    lowered = query.lower()
    dashed = lowered.replace("–", "-").replace("—", "-")
    return " ".join(dashed.split())
def expand_abbreviations(text: str, context: Optional[str] = None) -> List[str]:
    """
    Return candidate expansions of *text* with known abbreviations spelled out.

    Both the corpus-learned and the predefined abbreviation tables are applied.
    All matching is whole-word and case-insensitive: previously the learned
    table used plain substring replacement, so a learned abbreviation such as
    "rt" would fire inside unrelated words like "heart" and corrupt the text.

    Args:
        text: Input text that may contain abbreviations.
        context: Reserved for context-based disambiguation; currently unused.

    Returns:
        De-duplicated list of variants, beginning with the original text.
    """
    expansions = [text]
    text_lower = text.lower()

    def _apply(table) -> None:
        # For each whole-word abbreviation hit, append one variant per full form.
        for abbrev, full_forms in table.items():
            word_pattern = rf'\b{re.escape(abbrev)}\b'
            if re.search(word_pattern, text_lower):
                for full_form in full_forms:
                    expanded = re.sub(word_pattern, full_form, text_lower)
                    if expanded != text_lower:
                        expansions.append(expanded)

    # Learned abbreviations first, then predefined — same order as before.
    _apply(_terminology_learner.learned_abbreviations)
    _apply(MEDICAL_ABBREVIATIONS)

    # De-duplicate while preserving insertion order.
    return list(dict.fromkeys(expansions))
def get_synonyms(term: str) -> Set[str]:
    """
    Collect every known synonym of *term*.

    Combines the predefined MEDICAL_SYNONYMS table (resolved in both
    directions) with corpus-learned synonyms; the term itself is excluded.
    """
    key = term.lower()
    found: Set[str] = set(MEDICAL_SYNONYMS.get(key, ()))
    # Reverse direction: if the term appears inside some entry's synonym
    # list, that entry's key and its whole list are synonyms too.
    for canonical, alternates in MEDICAL_SYNONYMS.items():
        if key in alternates:
            found.add(canonical)
            found.update(alternates)
    found |= set(_terminology_learner.learned_synonyms.get(key, ()))
    found.discard(key)
    return found
def get_spelling_variations(term: str) -> Set[str]:
    """Return US/UK/International spelling variants of *term*, excluding the term itself."""
    key = term.lower()
    found = set(SPELLING_VARIATIONS.get(key, ()))
    # Reverse lookup: a UK variant maps back to its US key and siblings.
    for us_form, variants in SPELLING_VARIATIONS.items():
        if key in variants:
            found.add(us_form)
            found.update(variants)
    found.discard(key)
    return found
def extract_medical_entities(text: str) -> List[Tuple[str, str]]:
    """
    Scan *text* for known medical vocabulary.

    Abbreviations are matched as whole words; multi-word terms are matched
    as plain (case-insensitive) substrings. Returns (entity, type) pairs
    with all abbreviations listed before medical terms.
    """
    text_lower = text.lower()
    found: List[Tuple[str, str]] = []
    for abbrev in MEDICAL_ABBREVIATIONS:
        if re.search(rf'\b{re.escape(abbrev)}\b', text_lower):
            found.append((abbrev, 'abbreviation'))
    found.extend(
        (term, 'medical_term') for term in MEDICAL_SYNONYMS if term in text_lower
    )
    return found
def is_medical_abbreviation(text: str) -> bool:
    """Report whether *text* (trimmed, case-insensitive) is a known abbreviation."""
    candidate = text.lower().strip()
    if candidate in MEDICAL_ABBREVIATIONS:
        return True
    return candidate in _terminology_learner.learned_abbreviations
def get_abbreviation_expansion(abbrev: str) -> List[str]:
    """
    List every known full form of *abbrev*.

    Predefined expansions come first, followed by any corpus-learned ones;
    an unknown abbreviation yields an empty list.
    """
    key = abbrev.lower().strip()
    expansions: List[str] = []
    expansions.extend(MEDICAL_ABBREVIATIONS.get(key, []))
    expansions.extend(_terminology_learner.learned_abbreviations.get(key, []))
    return expansions
def expand_query_with_variations(query: str, max_variations: int = 5) -> List[str]:
    """
    Generate retrieval-friendly variations of *query*.

    Variations come from, in order: abbreviation expansion, per-word synonyms,
    regional spelling variants, and multi-word phrase synonyms. All
    substitutions are now whole-word / whole-phrase bounded — the previous
    bare ``str.replace`` could corrupt words (e.g. a variant of "center"
    also rewrote "epicenter").

    Args:
        query: Original query string.
        max_variations: Maximum number of variations to return.

    Returns:
        Up to *max_variations* unique strings, the original query first.
    """
    variations = [query]
    query_lower = normalize_query(query)

    # 1. Expand abbreviations (includes the normalized query itself).
    variations.extend(expand_abbreviations(query_lower))

    words = query_lower.split()

    # 2. Single-word synonym substitutions (at most 2 synonyms per word).
    for i, word in enumerate(words):
        for syn in list(get_synonyms(word))[:2]:
            variations.append(' '.join(words[:i] + [syn] + words[i + 1:]))

    # 3. Regional spelling variants, substituted positionally so only the
    #    exact word at that position changes.
    for i, word in enumerate(words):
        for var in get_spelling_variations(word):
            variations.append(' '.join(words[:i] + [var] + words[i + 1:]))

    # 4. Multi-word phrase synonyms, bounded so phrases match whole words only.
    for term, synonyms in MEDICAL_SYNONYMS.items():
        phrase = rf'\b{re.escape(term)}\b'
        if re.search(phrase, query_lower):
            for syn in list(synonyms)[:2]:
                variations.append(re.sub(phrase, syn, query_lower))

    # De-duplicate preserving order, truncated to the requested limit.
    seen = set()
    unique_variations: List[str] = []
    for var in variations:
        if var not in seen:
            seen.add(var)
            unique_variations.append(var)
            if len(unique_variations) >= max_variations:
                break
    return unique_variations
def learn_from_corpus(documents: List[Dict[str, str]]):
    """
    Feed a document corpus to the module-wide terminology learner.

    Intended to run once during system initialization; the learner persists
    what it extracts to its JSON cache file as a side effect.

    Args:
        documents: Dicts with a 'content' key holding raw document text.
    """
    _terminology_learner.learn_from_documents(documents)
def get_terminology_learner() -> MedicalTerminologyLearner:
    """Expose the module-wide singleton learner (useful for inspection and tests)."""
    return _terminology_learner