|
|
"""Query preprocessing utilities for biomedical search.""" |
|
|
|
|
|
import re |
|
|
|
|
|
|
|
|
# Lowercase tokens that are dropped from natural-language queries before
# searching. Membership is tested per whitespace-separated token.
QUESTION_WORDS: set[str] = {
    # Interrogatives
    "what", "which", "how", "why", "when", "where", "who", "whom",
    # Auxiliary and modal verbs
    "is", "are", "was", "were", "do", "does", "did", "can", "could",
    "would", "should", "will", "shall", "may", "might",
    # Filler verbs and vague qualifiers
    "show", "promise", "help", "believe", "think", "suggest",
    "possible", "potential", "effective", "useful", "good",
    # Articles
    "the", "a", "an",
}
|
|
|
|
|
|
|
|
# Maps a lowercase trigger term to the phrases that replace it in a query.
# The phrases are emitted verbatim (case preserved) inside quoted OR groups.
# NOTE(review): some lists ("diabetes", "heart disease") do not contain the
# trigger term itself, so the user's original wording is dropped when the
# term is substituted -- confirm this narrowing is intended.
SYNONYMS: dict[str, list[str]] = {
    "long covid": [
        "long COVID",
        "PASC",
        "post-acute sequelae of SARS-CoV-2",
        "post-COVID syndrome",
        "post-COVID-19 condition",
    ],
    "alzheimer": [
        "Alzheimer's disease",
        "Alzheimer disease",
        "AD",
        "Alzheimer dementia",
    ],
    "parkinson": ["Parkinson's disease", "Parkinson disease", "PD"],
    "diabetes": ["diabetes mellitus", "type 2 diabetes", "T2DM", "diabetic"],
    "cancer": ["cancer", "neoplasm", "tumor", "malignancy", "carcinoma"],
    "heart disease": [
        "cardiovascular disease",
        "CVD",
        "coronary artery disease",
        "heart failure",
    ],
}
|
|
|
|
|
|
|
|
def strip_question_words(query: str) -> str:
    """
    Drop question words and filler terms from a query.

    The query is lowercased and split on whitespace; every token that is an
    exact member of QUESTION_WORDS is discarded and the remaining tokens are
    re-joined with single spaces. Tokens with attached punctuation
    (e.g. ``"what,"``) are not exact members and are therefore kept.

    Args:
        query: Raw query string

    Returns:
        Lowercased query with question words removed
    """
    kept_tokens = []
    for token in query.lower().split():
        if token in QUESTION_WORDS:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)
|
|
|
|
|
|
|
|
def expand_synonyms(
    query: str,
    synonyms: "dict[str, list[str]] | None" = None,
) -> str:
    """
    Expand medical terms to include synonyms.

    The query is lowercased, then each occurrence of a known term is
    replaced by a parenthesised OR group of its quoted synonyms, e.g.
    ``cancer`` -> ``("cancer" OR "neoplasm" OR ...)``.

    Matching rules:
        * A term only matches as a whole word, so ``cancerous`` is left
          untouched instead of becoming ``(...)ous``.
        * A trailing plural/possessive suffix (``s``, ``'s``) is absorbed
          into the match, so ``alzheimer's`` does not leave a dangling
          ``'s`` behind the OR group.
        * All terms are substituted in one pass, longest term first, so
          text inserted by one expansion is never rewritten by another
          term and longer terms win over terms they start with.

    Args:
        query: Query string
        synonyms: Optional mapping of term -> list of synonyms to use
            instead of the module-level SYNONYMS table

    Returns:
        Lowercased query with synonym expansions in OR groups
    """
    table = SYNONYMS if synonyms is None else synonyms
    result = query.lower()

    # Pre-render the OR group for every usable term. Terms are lowercased
    # because they are matched against the lowercased query; empty terms are
    # skipped since they would match at every position.
    or_groups = {}
    for term, expansions in table.items():
        key = term.lower()
        if key:
            quoted = " OR ".join(f'"{exp}"' for exp in expansions)
            or_groups[key] = f"({quoted})"

    if not or_groups:
        return result

    # Longest-first alternation: regex alternatives are tried in order, so
    # "heart disease" must precede a shorter term such as "heart".
    alternation = "|".join(
        re.escape(term) for term in sorted(or_groups, key=len, reverse=True)
    )
    pattern = re.compile(rf"(?<!\w)({alternation})(?:['\u2019]?s)?(?!\w)")

    # A function replacement avoids backslash/group-reference processing of
    # the synonym text that a replacement string would be subject to.
    return pattern.sub(lambda match: or_groups[match.group(1)], result)
|
|
|
|
|
|
|
|
def preprocess_query(raw_query: str) -> str:
    """
    Run the full preprocessing pipeline for PubMed queries.

    Pipeline:
        1. Remove question marks, trim the ends and collapse runs of
           whitespace into single spaces (no other punctuation is removed)
        2. Remove question words via strip_question_words
        3. Expand medical synonyms via expand_synonyms

    Args:
        raw_query: Natural language query from user

    Returns:
        Optimized query for PubMed, or an empty string when the input is
        empty, None or whitespace-only
    """
    # Nothing to process for falsy or blank input.
    if not (raw_query and raw_query.strip()):
        return ""

    # Step 1: normalise the raw text.
    without_marks = raw_query.replace("?", "").strip()
    collapsed = re.sub(r"\s+", " ", without_marks)

    # Step 2: keep only the content-bearing tokens.
    keywords = strip_question_words(collapsed)

    # Step 3: broaden known medical terms, then trim the final result.
    return expand_synonyms(keywords).strip()
|
|
|