""" Query Processing Pipeline for Retrieval-based QA Chatbot ======================================================== This module handles: 1. Query preprocessing 2. Intent and sub-intent classification 3. Named Entity Recognition (NER) using SciSpaCy """ import spacy import re from typing import List, Tuple # Load pre-trained SciSpaCy model for biomedical NER ner_model = spacy.load("en_core_sci_md") # ------------------------------- # Rule-Based Intent Classification # ------------------------------- def classify_intent(question: str) -> str: """ Classify the user's query into a high-level intent based on keywords. Replace this rule-based system with ML-based intent detection for scalability. Parameters: question (str): The user's question. Returns: str: One of ['description', 'before_using', 'proper_use', 'precautions', 'side_effects'] """ q = question.lower() if re.search(r"\bwhat is\b|\bused for\b|\bdefine\b", q): return "description" elif re.search(r"\bbefore using\b|\bshould I tell\b|\bdoctor know\b", q): return "before_using" elif re.search(r"\bhow to\b|\bdosage\b|\btake\b|\binstructions\b", q): return "proper_use" elif re.search(r"\bprecaution\b|\bpregnan\b|\bbreastfeed\b|\brisk\b", q): return "precautions" elif re.search(r"\bside effect\b|\badverse\b|\bnausea\b|\bdizziness\b", q): return "side_effects" else: return "description" # default fallback # ------------------------------- # Subsection Classification # ------------------------------- def classify_subsection(question: str) -> str: """ Identify more granular subtopics within each main intent. Parameters: question (str): The user's question. Returns: str: Sub-intent such as 'more common', 'incidence not known', etc. """ q = question.lower() if re.search(r"\bcommon side effects\b|\busual symptoms\b", q): return "more common" elif re.search(r"\bunknown\b|\brare\b|\bincidence\b", q): return "incidence not known" elif re.search(r"\bchildren\b|\bpediatric\b|\bkids\b", q): return "pediatric" elif re.search(r"\bbreastfeed\b|\bnursing\b|\blactation\b", q): return "breastfeeding" elif re.search(r"\belderly\b|\bgeriatric\b", q): return "geriatric" elif re.search(r"\binteract\b|\bcombination\b|\bcontraindications\b", q): return "drug interactions" else: return "" # ------------------------------- # Named Entity Extraction # ------------------------------- def extract_entities_spacy(question: str) -> List[str]: """ Use SciSpaCy NER model to extract biomedical entities. Parameters: question (str): User query. Returns: List[str]: Unique list of extracted entities. """ doc = ner_model(question) return list(set(ent.text for ent in doc.ents)) # ------------------------------- # Query Preprocessing Wrapper # ------------------------------- def preprocess_query(raw_query: str) -> Tuple[Tuple[str, str], List[str]]: """ Main preprocessing function that extracts: - Intent - Subsection - Named Entities Parameters: raw_query (str): The raw user question. Returns: Tuple[Tuple[str, str], List[str]]: ((intent, sub_intent), list of entities) """ try: intent = classify_intent(raw_query) sub_intent = classify_subsection(raw_query) entities = extract_entities_spacy(raw_query) if not entities: print("[NER fallback] No entities found. Using raw query.") return (intent or "", sub_intent or ""), [] print(f"[Query Processed] Intent = {intent} | Subsection = {sub_intent} | Entities = {entities}") return (intent or "", sub_intent or ""), entities except Exception as e: print(f"[Preprocessing failed] {e}") return ("", ""), []