# NiranjanSathish's picture
# Upload 12 files
# 5e9bfb5 verified
"""
Query Processing Pipeline for Retrieval-based QA Chatbot
========================================================
This module handles:
1. Query preprocessing
2. Intent and sub-intent classification
3. Named Entity Recognition (NER) using SciSpaCy
"""
import spacy
import re
from typing import List, Tuple
# Load pre-trained SciSpaCy model for biomedical NER.
# NOTE: this runs at import time, so importing this module requires the
# "en_core_sci_md" model package to be installed; the loaded pipeline is
# shared by every call to extract_entities_spacy().
ner_model = spacy.load("en_core_sci_md")
# -------------------------------
# Rule-Based Intent Classification
# -------------------------------
# Ordered (intent, pattern) rules for classify_intent; the first match wins.
# Patterns are applied to the lower-cased question, so they must be written in
# lower case. Stems such as "pregnan" intentionally have no trailing \b so that
# inflected forms ("pregnant", "pregnancy", "breastfeeding") match as well.
_INTENT_RULES = (
    ("description", re.compile(r"\bwhat is\b|\bused for\b|\bdefine\b")),
    ("before_using", re.compile(r"\bbefore using\b|\bshould i tell\b|\bdoctor know\b")),
    ("proper_use", re.compile(r"\bhow to\b|\bdosage\b|\btake\b|\binstructions\b")),
    ("precautions", re.compile(r"\bprecautions?\b|\bpregnan|\bbreastfeed|\brisks?\b")),
    ("side_effects", re.compile(r"\bside effects?\b|\badverse\b|\bnausea\b|\bdizziness\b")),
)


def classify_intent(question: str) -> str:
    """
    Classify the user's query into a high-level intent based on keywords.
    Replace this rule-based system with ML-based intent detection for scalability.

    Matching is case-insensitive (the question is lower-cased first) and the
    rules are tried in a fixed order, so a question containing keywords for
    several intents is assigned the earliest matching one.

    Parameters:
        question (str): The user's question.
    Returns:
        str: One of ['description', 'before_using', 'proper_use', 'precautions', 'side_effects'].
             'description' is also the fallback when no rule matches.
    """
    q = question.lower()
    for intent, pattern in _INTENT_RULES:
        if pattern.search(q):
            return intent
    return "description"  # default fallback
# -------------------------------
# Subsection Classification
# -------------------------------
# Ordered (sub-intent, pattern) rules for classify_subsection; first match wins.
# Patterns are applied to the lower-cased question. Stems such as "interact"
# intentionally have no trailing \b so that inflected forms ("interactions",
# "interacts", "breastfeeding", "contraindicated") match as well.
_SUBSECTION_RULES = (
    ("more common", re.compile(r"\bcommon side effects\b|\busual symptoms\b")),
    ("incidence not known", re.compile(r"\bunknown\b|\brare\b|\bincidence\b")),
    ("pediatric", re.compile(r"\bchild(?:ren)?\b|\bpediatric\b|\bkids?\b")),
    ("breastfeeding", re.compile(r"\bbreastfeed|\bnursing\b|\blactat")),
    ("geriatric", re.compile(r"\belderly\b|\bgeriatric\b")),
    ("drug interactions", re.compile(r"\binteract|\bcombinations?\b|\bcontraindicat")),
)


def classify_subsection(question: str) -> str:
    """
    Identify more granular subtopics within each main intent.

    Matching is case-insensitive (the question is lower-cased first) and the
    rules are tried in a fixed order; the earliest matching rule wins.

    Parameters:
        question (str): The user's question.
    Returns:
        str: Sub-intent such as 'more common', 'incidence not known', etc.,
             or an empty string when no rule matches.
    """
    q = question.lower()
    for sub_intent, pattern in _SUBSECTION_RULES:
        if pattern.search(q):
            return sub_intent
    return ""
# -------------------------------
# Named Entity Extraction
# -------------------------------
def extract_entities_spacy(question: str) -> List[str]:
    """
    Use SciSpaCy NER model to extract biomedical entities.

    Duplicate mentions are collapsed, and the entities are returned in the
    order in which they first appear in the question so that the result is
    reproducible from run to run.

    Parameters:
        question (str): User query.
    Returns:
        List[str]: Unique list of extracted entity texts, in first-seen order.
    """
    doc = ner_model(question)
    # dict.fromkeys de-duplicates while keeping insertion order; a plain set
    # would yield an arbitrary order that changes with string hash randomization.
    return list(dict.fromkeys(ent.text for ent in doc.ents))
# -------------------------------
# Query Preprocessing Wrapper
# -------------------------------
def preprocess_query(raw_query: str) -> Tuple[Tuple[str, str], List[str]]:
    """
    Run the full preprocessing pipeline on a user question.

    The question is passed, in order, through intent classification,
    subsection classification and entity extraction. Progress is reported
    with print(). Any exception raised along the way is reported and turned
    into an empty result rather than propagated.

    Parameters:
        raw_query (str): The raw user question.
    Returns:
        Tuple[Tuple[str, str], List[str]]: ((intent, sub_intent), list of entities).
        The entity list is empty when nothing was recognised, and the result is
        (("", ""), []) when preprocessing fails.
    """
    try:
        top_level = classify_intent(raw_query)
        granular = classify_subsection(raw_query)
        found = extract_entities_spacy(raw_query)

        # Falsy labels are normalised to empty strings in the returned pair.
        labels = (top_level or "", granular or "")

        if found:
            print(f"[Query Processed] Intent = {top_level} | Subsection = {granular} | Entities = {found}")
            return labels, found

        print("[NER fallback] No entities found. Using raw query.")
        return labels, []
    except Exception as err:
        # Best-effort: report the failure and hand back an empty result.
        print(f"[Preprocessing failed] {err}")
        return ("", ""), []