| """Query preprocessing utilities for biomedical search.""" |
|
|
| import re |
|
|
| |
# Lower-case tokens that strip_question_words() drops from a query.
QUESTION_WORDS: set[str] = {
    # Interrogatives
    "what", "which", "how", "why", "when", "where", "who", "whom",
    # Auxiliary and modal verbs
    "is", "are", "was", "were", "do", "does", "did",
    "can", "could", "would", "should", "will", "shall", "may", "might",
    # Filler verbs and adjectives
    "show", "promise", "help", "believe", "think", "suggest",
    "possible", "potential", "effective", "useful", "good",
    # Articles
    "the", "a", "an",
}
|
|
| |
# Maps a lower-case search term to the phrases that expand_synonyms() joins
# into a quoted OR group. Insertion order is the order terms are tried in.
SYNONYMS: dict[str, list[str]] = {
    "long covid": [
        "long COVID", "PASC", "post-acute sequelae of SARS-CoV-2",
        "post-COVID syndrome", "post-COVID-19 condition",
    ],
    "alzheimer": ["Alzheimer's disease", "Alzheimer disease", "AD", "Alzheimer dementia"],
    "parkinson": ["Parkinson's disease", "Parkinson disease", "PD"],
    "diabetes": ["diabetes mellitus", "type 2 diabetes", "T2DM", "diabetic"],
    "cancer": ["cancer", "neoplasm", "tumor", "malignancy", "carcinoma"],
    "heart disease": [
        "cardiovascular disease", "CVD", "coronary artery disease", "heart failure",
    ],
}
|
|
|
|
def strip_question_words(query: str) -> str:
    """
    Drop question words and filler terms from a query.

    The query is lower-cased and split on whitespace; every token that is a
    member of QUESTION_WORDS is discarded and the rest are re-joined with
    single spaces. Tokens are compared verbatim, so a word with attached
    punctuation (e.g. "what,") is kept.

    Args:
        query: Raw query string

    Returns:
        Lower-cased query containing only the tokens not in QUESTION_WORDS
    """
    return " ".join(
        token for token in query.lower().split() if token not in QUESTION_WORDS
    )
|
|
|
|
def expand_synonyms(
    query: str,
    synonyms: "dict[str, list[str]] | None" = None,
) -> str:
    """
    Expand medical terms to include synonyms.

    The query is lower-cased, then every whole-word occurrence of a known
    term is replaced by a parenthesised group of its quoted expansions
    joined with " OR ". Matching is done in a single pass, so:

    - a term embedded in a longer word ("anticancer", "cancers") is left
      untouched instead of producing a group glued to stray letters;
    - a trailing possessive ("alzheimer's") is absorbed into the match
      rather than left dangling after the group;
    - text inserted by one expansion is never re-expanded by another term.

    Args:
        query: Query string
        synonyms: Optional term -> expansions mapping; defaults to the
            module-level SYNONYMS table

    Returns:
        Lower-cased query with synonym expansions in OR groups
    """
    table = SYNONYMS if synonyms is None else synonyms
    result = query.lower()

    # Matching runs against lower-cased text, so key the lookup the same way.
    # Empty keys are skipped: they would match at every position.
    lookup = {term.lower(): expansions for term, expansions in table.items() if term}
    if not lookup:
        return result

    # Longest alternatives first so a multi-word term wins over any shorter
    # term it happens to contain.
    alternatives = "|".join(
        re.escape(term) for term in sorted(lookup, key=len, reverse=True)
    )
    # Lookarounds enforce whole-word matches; the optional group swallows a
    # possessive suffix written with a straight or curly apostrophe.
    pattern = re.compile(rf"(?<!\w)({alternatives})(?:['\u2019]s)?(?!\w)")

    def _or_group(match: re.Match[str]) -> str:
        quoted = " OR ".join(f'"{exp}"' for exp in lookup[match.group(1)])
        return f"({quoted})"

    return pattern.sub(_or_group, result)
|
|
|
|
def preprocess_query(raw_query: str) -> str:
    """
    Run the full preprocessing pipeline for PubMed queries.

    Pipeline:
    1. Remove question marks, trim, and collapse whitespace runs
    2. Remove question words (strip_question_words)
    3. Expand medical synonyms (expand_synonyms)

    Only "?" is removed in step 1; other punctuation is passed through
    unchanged to the later steps.

    Args:
        raw_query: Natural language query from user

    Returns:
        Optimized query for PubMed, or "" when the input is empty or
        whitespace-only
    """
    # Nothing to do for empty / blank input.
    if not (raw_query and raw_query.strip()):
        return ""

    # Step 1: drop question marks and normalise whitespace.
    without_marks = raw_query.replace("?", "").strip()
    normalized = re.sub(r"\s+", " ", without_marks)

    # Step 2: remove question words and filler terms.
    keywords = strip_question_words(normalized)

    # Step 3: expand known terms into OR groups.
    expanded = expand_synonyms(keywords)

    return expanded.strip()
|
|