|
|
"""Query preprocessing utilities for biomedical search.""" |
|
|
|
|
|
import re |
|
|
|
|
|
|
|
|
# Lowercase tokens that strip_question_words() drops from a query.
# Grouped by role for readability; membership is all that matters at runtime.
_INTERROGATIVES = ("what", "which", "how", "why", "when", "where", "who", "whom")
_AUXILIARY_VERBS = (
    "is", "are", "was", "were",
    "do", "does", "did",
    "can", "could", "would", "should", "will", "shall", "may", "might",
)
_FILLER_TERMS = (
    "show", "promise", "help", "believe", "think", "suggest",
    "possible", "potential", "effective", "useful", "good",
)
_ARTICLES = ("the", "a", "an")

QUESTION_WORDS: set[str] = {
    *_INTERROGATIVES,
    *_AUXILIARY_VERBS,
    *_FILLER_TERMS,
    *_ARTICLES,
}
|
|
|
|
|
|
|
|
# Search term -> alternative phrasings used to build OR groups.
# Keys are lowercase because expand_synonyms() looks them up in a lowercased
# query; the synonym strings keep their own casing (e.g. acronyms).
SYNONYMS: dict[str, list[str]] = {
    "erectile dysfunction": ["ED", "impotence", "sexual dysfunction"],
    "low libido": [
        "hypoactive sexual desire disorder",
        "HSDD",
        "low sexual desire",
        "loss of libido",
    ],
    "menopause": ["postmenopausal", "climacteric", "perimenopause"],
    "testosterone": ["androgen", "testosterone therapy", "TRT"],
    "premature ejaculation": ["PE", "rapid ejaculation", "early ejaculation"],
    "pcos": ["polycystic ovary syndrome", "Stein-Leventhal syndrome"],
}
|
|
|
|
|
|
|
|
def strip_question_words(query: str) -> str:
    """
    Drop question words and filler terms from a query.

    The query is lowercased and split on whitespace; every token that is a
    member of QUESTION_WORDS is discarded and the remaining tokens are
    re-joined with single spaces. Tokens are compared whole, so a word with
    punctuation attached (e.g. "is,") is kept.

    Args:
        query: Raw query string

    Returns:
        Lowercased query without question/filler tokens ("" if none remain)
    """
    lowered = query.lower()
    return " ".join(
        token for token in lowered.split() if token not in QUESTION_WORDS
    )
|
|
|
|
|
|
|
|
def expand_synonyms(query: str) -> str:
    """
    Expand known medical terms into OR groups of the term and its synonyms.

    The query is lowercased, then every whole-word occurrence of a SYNONYMS
    key is replaced by a parenthesised group of quoted phrases, e.g.
    ``pcos`` -> ``("pcos" OR "polycystic ovary syndrome" OR
    "Stein-Leventhal syndrome")``.

    Args:
        query: Search query (e.g., "testosterone libido")

    Returns:
        Lowercased query in which each recognised term is replaced by an OR
        group holding the term itself followed by its synonyms. A query with
        no recognised term is returned lowercased and otherwise unchanged.
    """
    result = query.lower()
    if not SYNONYMS:
        return result

    # Longest keys first: if one key is ever a prefix of another, the more
    # specific phrase must win the alternation.
    terms = sorted(SYNONYMS, key=len, reverse=True)

    # The lookarounds require the term to stand alone as a word, so
    # "perimenopause" or "dihydrotestosterone" are not split apart by a
    # match on "menopause" / "testosterone".
    pattern = re.compile(
        r"(?<!\w)(?:" + "|".join(re.escape(term) for term in terms) + r")(?!\w)"
    )

    def _or_group(match: "re.Match[str]") -> str:
        """Build the quoted OR group for one matched term."""
        term = match.group(0)
        # Keep the term the user typed so the expansion widens the search
        # instead of swapping the term for its synonyms.
        phrases = [term]
        seen = {term}
        for synonym in SYNONYMS[term]:
            folded = synonym.lower()
            if folded not in seen:
                seen.add(folded)
                phrases.append(synonym)
        return "(" + " OR ".join(f'"{phrase}"' for phrase in phrases) + ")"

    # One substitution pass over the text: inserted synonyms (such as
    # "testosterone therapy") are never rescanned and expanded again.
    return pattern.sub(_or_group, result)
|
|
|
|
|
|
|
|
def preprocess_query(raw_query: str) -> str:
    """
    Turn a natural-language question into a PubMed search string.

    Steps, in order:
        1. Return "" for empty or whitespace-only input
        2. Delete every "?" and collapse whitespace runs to single spaces
        3. Remove question words (strip_question_words)
        4. Expand medical synonyms (expand_synonyms)

    Note that only question marks are removed; other punctuation is left
    in place. Both helper steps lowercase the text.

    Args:
        raw_query: Natural language query from user

    Returns:
        Processed query with surrounding whitespace removed, or "" when
        raw_query is empty or blank
    """
    # Guard clause: nothing to process for empty / blank input.
    if not (raw_query and raw_query.strip()):
        return ""

    without_marks = raw_query.replace("?", "").strip()
    normalized = re.sub(r"\s+", " ", without_marks)

    return expand_synonyms(strip_question_words(normalized)).strip()
|
|
|