File size: 19,313 Bytes
ddc9c77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
"""
Medical Terminology Module with Dynamic Learning

This module provides intelligent handling of medical linguistic variability including:
- Synonyms and alternate terms
- Abbreviations and acronyms (with context awareness)
- Regional spelling variations (US/UK/International)
- Specialty-specific terminology
- Dynamic learning from corpus
"""

import re
import json
from typing import List, Dict, Set, Tuple, Optional
from collections import defaultdict
from pathlib import Path
from .config import logger

# ============================================================================
# CORE MEDICAL TERMINOLOGY MAPPINGS
# ============================================================================

# Common medical abbreviations mapped to their known expansions.
#
# Keys are lowercase abbreviations; each value is a list of lowercase full
# forms. expand_abbreviations() and extract_medical_entities() match these
# keys against lowercased text on word boundaries, so every key here is
# effectively case-insensitive.
#
# NOTE(review): several keys are also ordinary words or very short letter
# pairs ("met", "pet", "doc", "gem", "os", "pr", "io", ...). Because matching
# is case-insensitive, a query containing the plain word will be expanded too.
# "cr" and "rt" additionally appear in CONTEXT_DEPENDENT_ABBREVS with
# alternative meanings.
MEDICAL_ABBREVIATIONS = {
    # Cancer types and molecular markers / genes
    "nsclc": ["non-small cell lung cancer", "non small cell lung cancer"],
    "sclc": ["small cell lung cancer"],
    "nscl": ["non-small cell lung"],
    "alk": ["anaplastic lymphoma kinase"],
    "egfr": ["epidermal growth factor receptor"],
    "ros1": ["ros proto-oncogene 1", "c-ros oncogene 1"],
    "braf": ["b-raf proto-oncogene"],
    "kras": ["kirsten rat sarcoma viral oncogene"],
    "met": ["mesenchymal epithelial transition", "met proto-oncogene"],
    "her2": ["human epidermal growth factor receptor 2"],
    "ret": ["ret proto-oncogene", "rearranged during transfection"],
    "ntrk": ["neurotrophic tyrosine receptor kinase", "neurotrophic tropomyosin receptor kinase"],
    
    # Treatment, imaging and procedures
    "chemo": ["chemotherapy"],
    "rt": ["radiation therapy", "radiotherapy"],
    "sbrt": ["stereotactic body radiation therapy", "stereotactic body radiotherapy"],
    "imrt": ["intensity-modulated radiation therapy"],
    "ct": ["computed tomography", "ct scan"],
    "pet": ["positron emission tomography"],
    "mri": ["magnetic resonance imaging"],
    "io": ["immunotherapy", "immune-oncology"],
    "ici": ["immune checkpoint inhibitor", "immune checkpoint inhibitors"],
    "tki": ["tyrosine kinase inhibitor", "tyrosine kinase inhibitors"],
    "pd-1": ["programmed death-1", "programmed cell death protein 1"],
    "pd-l1": ["programmed death-ligand 1"],
    "ctla-4": ["cytotoxic t-lymphocyte-associated protein 4"],
    
    # Clinical terms (endpoints, response categories, patient status)
    "os": ["overall survival"],
    "pfs": ["progression-free survival"],
    "dfs": ["disease-free survival"],
    "orr": ["overall response rate", "objective response rate"],
    "cr": ["complete response"],
    "pr": ["partial response"],
    "sd": ["stable disease"],
    "pd": ["progressive disease"],
    "ecog": ["eastern cooperative oncology group"],
    "ps": ["performance status"],
    "aes": ["adverse events"],
    "sae": ["serious adverse event", "serious adverse events"],
    "qol": ["quality of life"],
    
    # Staging
    "tnm": ["tumor node metastasis", "tnm staging"],
    "ajcc": ["american joint committee on cancer"],
    
    # Drugs (common abbreviations)
    "cddp": ["cisplatin"],
    "cbdca": ["carboplatin"],
    "pem": ["pemetrexed"],
    "gem": ["gemcitabine"],
    "doc": ["docetaxel"],
    "pac": ["paclitaxel"],
    "vin": ["vinorelbine"],
    "eto": ["etoposide"],
}

# Synonym mappings for medical terms.
#
# Each key is a lowercase canonical term and each value is a list of lowercase
# alternates. get_synonyms() consults the table in both directions (key ->
# alternates, and alternate -> key plus its sibling alternates).
#
# The order of each list matters: expand_query_with_variations() only uses the
# first two alternates of a matching phrase, so put the most useful ones first.
#
# Some strings appear under more than one entry (e.g. "response" is both a key
# and an alternate of "remission"); get_synonyms() merges all matching groups.
MEDICAL_SYNONYMS = {
    # Cancer terminology
    "lung cancer": ["pulmonary cancer", "lung carcinoma", "pulmonary carcinoma", "bronchogenic carcinoma"],
    "non-small cell lung cancer": ["nsclc", "non small cell lung cancer", "non-small-cell lung cancer"],
    "small cell lung cancer": ["sclc", "small-cell lung cancer", "oat cell carcinoma"],
    "adenocarcinoma": ["adeno", "glandular cancer"],
    "squamous cell carcinoma": ["squamous carcinoma", "scc", "epidermoid carcinoma"],
    "metastatic": ["advanced", "stage iv", "stage 4", "metastases", "mets"],
    "locally advanced": ["stage iii", "stage 3", "regional spread"],
    "early stage": ["stage i", "stage ii", "stage 1", "stage 2", "localized"],
    
    # Treatment terms
    "chemotherapy": ["chemo", "cytotoxic therapy", "systemic therapy"],
    "radiation therapy": ["radiotherapy", "rt", "radiation treatment", "irradiation"],
    "immunotherapy": ["immune therapy", "io", "immune-oncology", "checkpoint inhibitor"],
    "targeted therapy": ["molecular therapy", "precision medicine", "targeted treatment"],
    "surgery": ["surgical resection", "resection", "operative treatment", "surgical intervention"],
    "lobectomy": ["lobe resection", "pulmonary lobectomy"],
    "pneumonectomy": ["lung removal", "complete lung resection"],
    "wedge resection": ["segmentectomy", "limited resection"],
    
    # Molecular markers
    "mutation": ["alteration", "variant", "genetic change", "molecular alteration"],
    "biomarker": ["molecular marker", "tumor marker", "genetic marker"],
    "driver mutation": ["oncogenic driver", "actionable mutation", "targetable mutation"],
    
    # Clinical outcomes
    "survival": ["survival rate", "survival outcome"],
    "response": ["treatment response", "tumor response", "clinical response"],
    "progression": ["disease progression", "tumor progression", "cancer progression"],
    "recurrence": ["relapse", "disease recurrence", "tumor recurrence"],
    "remission": ["response", "disease control"],
    
    # Side effects
    "adverse event": ["side effect", "adverse reaction", "toxicity", "adverse drug reaction"],
    "neutropenia": ["low white blood cell count", "low neutrophil count"],
    "anemia": ["low red blood cell count", "low hemoglobin"],
    "thrombocytopenia": ["low platelet count"],
    "nausea": ["feeling sick", "queasiness"],
    "fatigue": ["tiredness", "exhaustion", "weakness"],
    
    # Diagnostic terms
    "biopsy": ["tissue sample", "tissue sampling"],
    "imaging": ["radiology", "diagnostic imaging", "medical imaging"],
    "screening": ["early detection", "cancer screening"],
}

# Regional spelling variations (US/UK/International).
#
# Keys are the US spelling; values list the alternative spellings.
# get_spelling_variations() searches both directions, so a UK spelling maps
# back to its US key as well.
SPELLING_VARIATIONS = {
    # US -> UK/International variants
    "tumor": ["tumour"],
    "tumors": ["tumours"],
    # NOTE(review): this pair is singular/plural rather than a regional
    # spelling difference - confirm it is intended to live in this table.
    "metastasis": ["metastases"],
    "anemia": ["anaemia"],
    "edema": ["oedema"],
    "esophageal": ["oesophageal"],
    "pediatric": ["paediatric"],
    "hematology": ["haematology"],
    "hemoglobin": ["haemoglobin"],
    "leukemia": ["leukaemia"],
    # Maps to itself, so get_spelling_variations("lymphoma") yields an empty
    # set (the function discards the queried term from its result).
    "lymphoma": ["lymphoma"],  # Same in both
    "optimize": ["optimise"],
    "randomized": ["randomised"],
    "analyze": ["analyse"],
    "center": ["centre"],
    "fiber": ["fibre"],
}

# Context-dependent abbreviations (require disambiguation).
#
# Structure: abbreviation -> {sense identifier -> list of expansions for that
# sense}. Sense identifiers are snake_case labels, not display text.
#
# NOTE(review): none of the functions visible in this module read this table;
# expand_abbreviations() accepts a `context` argument but does not consult it.
# Presumably this is intended for future disambiguation - confirm with callers.
CONTEXT_DEPENDENT_ABBREVS = {
    "ca": {
        "cancer": ["cancer", "carcinoma"],
        "calcium": ["calcium"],
    },
    "cr": {
        "complete_response": ["complete response", "complete remission"],
        "creatinine": ["creatinine"],
    },
    "pt": {
        "patient": ["patient"],
        "prothrombin_time": ["prothrombin time"],
    },
    "rt": {
        "radiation_therapy": ["radiation therapy", "radiotherapy"],
        "reverse_transcriptase": ["reverse transcriptase"],
    },
}

# ============================================================================
# DYNAMIC LEARNING COMPONENTS
# ============================================================================

class MedicalTerminologyLearner:
    """
    Dynamically learns medical term variations from the corpus.

    Three kinds of knowledge are accumulated from document text:

    - ``learned_abbreviations``: abbreviation -> set of full forms, taken from
      "Full Term (ABBR)" / "Full Term [ABBR]" patterns.
    - ``learned_synonyms``: term -> set of synonyms, taken from
      "X, also known as Y", "X, also called Y" and "x (y)" patterns.
    - ``term_cooccurrence``: term -> {neighbouring term: count}.

    Only the synonym and abbreviation tables are persisted to the JSON cache;
    co-occurrence counts live in memory only.
    """

    # Failures that mean "the cache is unusable": I/O errors, malformed JSON
    # (json.JSONDecodeError is a ValueError) and unexpected JSON shapes.
    _CACHE_ERRORS = (OSError, ValueError, TypeError, AttributeError)

    def __init__(self, cache_path: Optional[str] = None):
        """
        Create a learner and load any previously cached terms.

        Args:
            cache_path: Location of the JSON cache file. Defaults to
                "data/medical_terms_cache.json" when omitted or empty.
        """
        self.cache_path = cache_path or "data/medical_terms_cache.json"
        self.term_cooccurrence = defaultdict(lambda: defaultdict(int))
        self.learned_synonyms = defaultdict(set)
        self.learned_abbreviations = defaultdict(set)
        self.context_patterns = defaultdict(list)
        self._load_cache()

    def _load_cache(self):
        """
        Load previously learned terms from the cache file, if it exists.

        Best-effort: a missing cache is ignored silently; an unreadable or
        malformed cache is logged as a warning and the current in-memory
        tables are left untouched.
        """
        try:
            cache_file = Path(self.cache_path)
            if not cache_file.exists():
                return
            with open(cache_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            # Build both tables before assigning anything so that a malformed
            # section cannot leave the learner half-updated.
            synonyms = {k: set(v) for k, v in data.get('synonyms', {}).items()}
            abbreviations = {k: set(v) for k, v in data.get('abbreviations', {}).items()}
        except self._CACHE_ERRORS as e:
            logger.warning(f"Could not load term cache: {e}")
            return

        self.learned_synonyms = defaultdict(set, synonyms)
        self.learned_abbreviations = defaultdict(set, abbreviations)
        logger.info(f"Loaded {len(self.learned_synonyms)} learned synonyms from cache")

    def _save_cache(self):
        """
        Write the learned synonym and abbreviation tables to the cache file.

        Parent directories are created as needed. Failures are logged as a
        warning rather than raised.
        """
        try:
            cache_file = Path(self.cache_path)
            cache_file.parent.mkdir(parents=True, exist_ok=True)
            # Sets have no stable iteration order; sort so the cache file is
            # identical from run to run for identical content.
            data = {
                'synonyms': {k: sorted(v) for k, v in self.learned_synonyms.items()},
                'abbreviations': {k: sorted(v) for k, v in self.learned_abbreviations.items()},
            }
            with open(cache_file, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=2)
        except self._CACHE_ERRORS as e:
            logger.warning(f"Could not save term cache: {e}")
            return

        logger.info("Saved learned terms to cache")

    def learn_from_documents(self, documents: List[Dict[str, str]]):
        """
        Learn term variations from a corpus of documents.
        Identifies patterns like:
        - "X (Y)" -> Y is abbreviation of X
        - "X, also known as Y" -> X and Y are synonyms

        Co-occurrence counts between nearby terms are updated as well. The
        cache is saved once after all documents have been processed.

        Args:
            documents: Dicts whose 'content' entry holds the document text.
                A missing, empty or ``None`` content is skipped.
        """
        for doc in documents:
            # `or ''` also covers an explicit None, which the regex helpers
            # cannot handle.
            content = doc.get('content') or ''
            if not content:
                continue
            self._extract_abbreviation_patterns(content)
            self._extract_synonym_patterns(content)
            self._build_cooccurrence(content)

        self._save_cache()

    @staticmethod
    def _trim_to_initials(full_term: str, abbrev: str) -> str:
        """
        Drop leading context words from a candidate full term.

        When the last ``len(abbrev)`` words of ``full_term`` have initials that
        spell ``abbrev``, only those words are returned (so "the patient
        underwent computed tomography" / "ct" becomes "computed tomography").
        Otherwise ``full_term`` is returned unchanged.
        """
        words = full_term.split()
        size = len(abbrev)
        if abbrev.isalpha() and 2 <= size <= len(words):
            tail = words[-size:]
            if ''.join(word[0] for word in tail) == abbrev:
                return ' '.join(tail)
        return full_term

    def _extract_abbreviation_patterns(self, text: str):
        """Extract abbreviations from patterns like 'Full Term (ABBR)'"""
        # Pattern: "Full Term (ABBR)" or "Full Term [ABBR]".
        # Group 1 is greedy and may start well before the real term (it
        # accepts any run of words that begins with a capitalised word).
        pattern = r'([A-Z][a-z]+(?:\s+[A-Z]?[a-z]+)*)\s*[\(\[]([A-Z]{2,}|[A-Z][a-z]*(?:-[A-Z][a-z]*)*)[\)\]]'

        for match in re.finditer(pattern, text):
            full_term = match.group(1).strip().lower()
            abbrev = match.group(2).strip().lower()

            # Validate: abbreviation must be at least 2 chars and shorter than
            # the text it abbreviates.
            if not (2 <= len(abbrev) < len(full_term)):
                continue

            # Prefer the trailing words whose initials spell the abbreviation;
            # fall back to the whole capture when they do not line up.
            full_term = self._trim_to_initials(full_term, abbrev)
            self.learned_abbreviations[abbrev].add(full_term)
            logger.debug(f"Learned: {abbrev} -> {full_term}")

    def _extract_synonym_patterns(self, text: str):
        """Extract synonyms from patterns like 'X, also known as Y' or 'X (Y)'"""
        # Pattern: "X, also known as Y", "X, also called Y" or "x (y)"
        patterns = [
            r'([a-z\s\-]+),?\s+also\s+known\s+as\s+([a-z\s\-]+)',
            r'([a-z\s\-]+),?\s+also\s+called\s+([a-z\s\-]+)',
            r'([a-z\s\-]+)\s+\(([a-z\s\-]+)\)',
        ]

        # Lowercase once; the patterns only accept lowercase letters.
        lowered = text.lower()

        for pattern in patterns:
            for match in re.finditer(pattern, lowered):
                term1 = match.group(1).strip()
                term2 = match.group(2).strip()

                # A term is never recorded as its own synonym.
                if term1 == term2:
                    continue

                # Validate: both should be reasonable length
                if 3 <= len(term1) <= 50 and 3 <= len(term2) <= 50:
                    self.learned_synonyms[term1].add(term2)
                    self.learned_synonyms[term2].add(term1)

    def _build_cooccurrence(self, text: str):
        """Build co-occurrence matrix for terms"""
        # Extract terms (simplified): each "term" is a run of one to four
        # whitespace-separated lowercase words of 3+ letters.
        terms = re.findall(r'\b[a-z]{3,}(?:\s+[a-z]{3,}){0,3}\b', text.lower())

        # Count every other term within `window_size` positions on either side.
        window_size = 10
        total = len(terms)
        for i, term in enumerate(terms):
            start = max(0, i - window_size)
            stop = min(total, i + window_size + 1)
            for j in range(start, stop):
                if i != j:
                    self.term_cooccurrence[term][terms[j]] += 1

    def get_related_terms(self, term: str, threshold: int = 3) -> Set[str]:
        """
        Get terms that frequently co-occur with the given term.

        Args:
            term: Term to look up (case-insensitive).
            threshold: Minimum co-occurrence count for a term to be returned.

        Returns:
            Set of terms whose count with ``term`` is at least ``threshold``;
            empty when ``term`` has never been seen.
        """
        # .get() does not trigger the defaultdict factory, so lookups of
        # unknown terms do not grow the table.
        neighbours = self.term_cooccurrence.get(term.lower(), {})
        return {other for other, count in neighbours.items() if count >= threshold}

# Global learner instance shared by the module-level helper functions below.
# It is created at import time with the default cache path; the constructor
# calls _load_cache(), so importing this module attempts to read
# "data/medical_terms_cache.json" (a relative path, hence resolved against the
# current working directory).
_terminology_learner = MedicalTerminologyLearner()

# ============================================================================
# QUERY NORMALIZATION AND EXPANSION FUNCTIONS
# ============================================================================

def normalize_query(query: str) -> str:
    """
    Return a canonical form of ``query`` for matching.

    The result is lowercased, has en/em dashes replaced by a plain hyphen,
    and has every run of whitespace collapsed to a single space with leading
    and trailing whitespace removed.
    """
    # Lowercase first, then map typographic dashes onto ASCII '-'.
    with_plain_hyphens = re.sub(r'[–—]', '-', query.lower())

    # Collapse whitespace runs and trim the ends.
    collapsed = re.sub(r'\s+', ' ', with_plain_hyphens)
    return collapsed.strip()


def expand_abbreviations(text: str, context: Optional[str] = None) -> List[str]:
    """
    Expand abbreviations in text to their full forms.

    Learned abbreviations are applied first, then the predefined
    MEDICAL_ABBREVIATIONS table. Both are matched as whole tokens against the
    lowercased text, so an abbreviation embedded in a longer word (for
    example "rt" inside "heart") is left alone.

    Args:
        text: Text that may contain abbreviations.
        context: Accepted for API compatibility; it is not currently used for
            disambiguation.

    Returns:
        A list whose first element is ``text`` exactly as given, followed by
        one lowercased variant per (abbreviation, full form) pair that occurs
        in the text. Duplicates are removed, keeping first-seen order.
    """
    text_lower = text.lower()
    expansions = [text]

    def _expand(abbrev: str, full_forms) -> None:
        """Append one variant of text_lower per full form of `abbrev`."""
        if not abbrev:
            # An empty key would match at every position.
            return
        # Lookarounds behave like \b for keys that start and end with a word
        # character, and still work for keys that do not.
        token = re.compile(rf'(?<!\w){re.escape(abbrev)}(?!\w)')
        if not token.search(text_lower):
            return
        for full_form in full_forms:
            # A callable replacement inserts the full form verbatim; a string
            # replacement would interpret backslashes and group references.
            expanded = token.sub(lambda _match, repl=full_form: repl, text_lower)
            if expanded != text_lower:
                expansions.append(expanded)

    # Check learned abbreviations first. Their full forms are stored in sets,
    # so sort them to keep the output order stable between runs.
    for abbrev, full_forms in _terminology_learner.learned_abbreviations.items():
        _expand(abbrev, sorted(full_forms))

    # Check predefined abbreviations
    for abbrev, full_forms in MEDICAL_ABBREVIATIONS.items():
        _expand(abbrev, full_forms)

    # Remove duplicates while preserving order
    return list(dict.fromkeys(expansions))


def get_synonyms(term: str) -> Set[str]:
    """
    Collect every known synonym of ``term`` (case-insensitive).

    Sources, merged into one set:
    - the alternates listed under ``term`` in MEDICAL_SYNONYMS;
    - for every MEDICAL_SYNONYMS entry that lists ``term`` as an alternate,
      that entry's key and all of its alternates;
    - synonyms learned from the corpus.

    The lowercased term itself is never part of the result.
    """
    key = term.lower()

    # Direct lookup: term is a canonical key.
    found: Set[str] = set(MEDICAL_SYNONYMS.get(key, ()))

    # Reverse lookup: term appears among another key's alternates.
    for canonical, alternates in MEDICAL_SYNONYMS.items():
        if key in alternates:
            found.add(canonical)
            found.update(alternates)

    # Learned synonyms; .get() avoids creating an entry for unknown terms.
    found.update(_terminology_learner.learned_synonyms.get(key, ()))

    found.discard(key)
    return found


def get_spelling_variations(term: str) -> Set[str]:
    """
    Return the regional spellings known for ``term`` (case-insensitive).

    SPELLING_VARIATIONS is searched in both directions: by key, and by
    membership in any key's variant list (in which case the key and its other
    variants are returned). The lowercased term itself is excluded.
    """
    key = term.lower()

    # Forward lookup: term is the table key.
    result: Set[str] = set(SPELLING_VARIATIONS.get(key, ()))

    # Reverse lookup: term is listed as a variant of some key.
    for base, variants in SPELLING_VARIATIONS.items():
        if key in variants:
            result.add(base)
            result.update(variants)

    result.discard(key)
    return result


def extract_medical_entities(text: str) -> List[Tuple[str, str]]:
    """
    Find known abbreviations and medical terms in ``text``.

    Matching is done on the lowercased text. Abbreviations from
    MEDICAL_ABBREVIATIONS must appear on word boundaries; terms from
    MEDICAL_SYNONYMS (keys only) are matched as plain substrings.

    Returns:
        ``(entity, type)`` tuples, all 'abbreviation' hits first followed by
        all 'medical_term' hits, each group in table order.
    """
    lowered = text.lower()

    abbreviation_hits = [
        (abbrev, 'abbreviation')
        for abbrev in MEDICAL_ABBREVIATIONS
        if re.search(rf'\b{re.escape(abbrev)}\b', lowered)
    ]
    term_hits = [
        (term, 'medical_term')
        for term in MEDICAL_SYNONYMS
        if term in lowered
    ]

    return abbreviation_hits + term_hits


def is_medical_abbreviation(text: str) -> bool:
    """
    Tell whether ``text`` is a known abbreviation.

    The text is lowercased and stripped, then looked up in the predefined
    table and in the abbreviations learned from the corpus.
    """
    candidate = text.lower().strip()
    if candidate in MEDICAL_ABBREVIATIONS:
        return True
    return candidate in _terminology_learner.learned_abbreviations


def get_abbreviation_expansion(abbrev: str) -> List[str]:
    """
    Get all possible expansions for an abbreviation.

    Args:
        abbrev: Abbreviation to look up; case and surrounding whitespace are
            ignored.

    Returns:
        Predefined expansions (in table order) followed by learned expansions
        (sorted, because they are stored in a set). An expansion known to
        both sources is listed once. Empty when the abbreviation is unknown.
    """
    key = abbrev.lower().strip()

    predefined = MEDICAL_ABBREVIATIONS.get(key, ())
    # .get() does not trigger the defaultdict factory, so unknown keys are not
    # added to the learned table as a side effect of a lookup.
    learned = sorted(_terminology_learner.learned_abbreviations.get(key, ()))

    # dict.fromkeys de-duplicates while keeping first-seen order.
    return list(dict.fromkeys([*predefined, *learned]))


def expand_query_with_variations(query: str, max_variations: int = 5) -> List[str]:
    """
    Generate query variations by expanding abbreviations, adding synonyms,
    and including spelling variations.

    Candidates are produced in this order and then de-duplicated:
    1. the original query, then its normalized form and abbreviation
       expansions;
    2. single-word synonym substitutions (at most two per word);
    3. regional spelling substitutions, applied to whole words only;
    4. substitutions for MEDICAL_SYNONYMS phrases found in the query (first
       two alternates per phrase).

    Args:
        query: Original query string
        max_variations: Maximum number of variations to generate

    Returns:
        List of query variations including the original. The original query
        is always returned, even when ``max_variations`` is less than 1.
    """
    variations = [query]
    query_lower = normalize_query(query)

    # 1. Expand abbreviations (the first entry is the normalized query itself)
    variations.extend(expand_abbreviations(query_lower))

    # normalize_query collapsed whitespace, so ' '.join(words) == query_lower.
    words = query_lower.split()

    # 2. Add synonym variations. get_synonyms returns a set; sort before
    #    slicing so the two chosen synonyms are the same on every run.
    for i, word in enumerate(words):
        for syn in sorted(get_synonyms(word))[:2]:
            variations.append(' '.join(words[:i] + [syn] + words[i + 1:]))

    # 3. Add spelling variations. Swap whole tokens only, so the variant of
    #    "center" does not rewrite an unrelated word such as "epicenter".
    for word in dict.fromkeys(words):
        for var in sorted(get_spelling_variations(word)):
            variations.append(' '.join(var if w == word else w for w in words))

    # 4. Add multi-word phrase variations (first two alternates per phrase)
    for term, synonyms in MEDICAL_SYNONYMS.items():
        if term in query_lower:
            for syn in synonyms[:2]:
                variations.append(query_lower.replace(term, syn))

    # Remove duplicates (keeping first-seen order) and limit. At least one
    # entry, the original query, is always kept.
    unique_variations = list(dict.fromkeys(variations))
    return unique_variations[:max(max_variations, 1)]


def learn_from_corpus(documents: List[Dict[str, str]]) -> None:
    """
    Learn medical term variations from a corpus of documents.
    Should be called during system initialization.

    Delegates to the module-level learner's learn_from_documents(), which
    reads each document's 'content' entry and saves the cache afterwards.

    Args:
        documents: Dicts holding the document text under the 'content' key.
    """
    _terminology_learner.learn_from_documents(documents)


def get_terminology_learner() -> MedicalTerminologyLearner:
    """
    Get the global terminology learner instance.

    This is the same object the module-level helper functions consult, so
    terms learned through it are visible to them.
    """
    return _terminology_learner