Spaces:
Runtime error
Runtime error
File size: 4,101 Bytes
5e9bfb5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 |
"""
Query Processing Pipeline for Retrieval-based QA Chatbot
========================================================
This module handles:
1. Query preprocessing
2. Intent and sub-intent classification
3. Named Entity Recognition (NER) using SciSpaCy
"""
import spacy
import re
from typing import List, Tuple
# Load pre-trained SciSpaCy model for biomedical NER
ner_model = spacy.load("en_core_sci_md")
# -------------------------------
# Rule-Based Intent Classification
# -------------------------------
# Ordered (pattern, intent) rules: the first pattern that matches wins.
# Patterns are written in lower case because the question is lower-cased
# before matching. Stems such as "pregnan" deliberately have no trailing
# word boundary so that "pregnant" / "pregnancy" are recognised.
_INTENT_RULES = (
    (re.compile(r"\bwhat is\b|\bused for\b|\bdefine\b"), "description"),
    (re.compile(r"\bbefore using\b|\bshould i tell\b|\bdoctor know\b"), "before_using"),
    (re.compile(r"\bhow to\b|\bdosage\b|\btake\b|\binstructions\b"), "proper_use"),
    (re.compile(r"\bprecautions?\b|\bpregnan|\bbreast[\s-]?feed|\brisks?\b"), "precautions"),
    (re.compile(r"\bside[ -]effects?\b|\badverse\b|\bnausea\b|\bdizziness\b"), "side_effects"),
)


def classify_intent(question: str) -> str:
    """
    Classify the user's query into a high-level intent based on keywords.

    The question is lower-cased and tested against an ordered list of keyword
    patterns; the intent of the first matching pattern is returned. This is a
    rule-based placeholder that can be replaced with ML-based intent detection
    for scalability.

    Parameters:
        question (str): The user's question.

    Returns:
        str: One of ['description', 'before_using', 'proper_use',
        'precautions', 'side_effects']. Falls back to 'description' when no
        keyword matches.
    """
    q = question.lower()
    for pattern, intent in _INTENT_RULES:
        if pattern.search(q):
            return intent
    return "description"  # default fallback
# -------------------------------
# Subsection Classification
# -------------------------------
# Ordered (pattern, sub-intent) rules: the first pattern that matches wins.
# Stems such as "breastfeed" and "interact" have no trailing word boundary so
# that inflected forms ("breastfeeding", "interactions") are recognised.
_SUBSECTION_RULES = (
    (re.compile(r"\bcommon side effects\b|\busual symptoms\b"), "more common"),
    (re.compile(r"\bunknown\b|\brare\b|\bincidence\b"), "incidence not known"),
    (re.compile(r"\bchildren\b|\bpediatric\b|\bkids\b"), "pediatric"),
    (re.compile(r"\bbreast[\s-]?feed|\bnursing\b|\blactation\b"), "breastfeeding"),
    (re.compile(r"\belderly\b|\bgeriatric\b"), "geriatric"),
    (re.compile(r"\binteract|\bcombination\b|\bcontraindications?\b"), "drug interactions"),
)


def classify_subsection(question: str) -> str:
    """
    Identify more granular subtopics within each main intent.

    The question is lower-cased and tested against an ordered list of keyword
    patterns; the sub-intent of the first matching pattern is returned.

    Parameters:
        question (str): The user's question.

    Returns:
        str: Sub-intent such as 'more common', 'incidence not known',
        'pediatric', 'breastfeeding', 'geriatric' or 'drug interactions';
        an empty string when no keyword matches.
    """
    q = question.lower()
    for pattern, sub_intent in _SUBSECTION_RULES:
        if pattern.search(q):
            return sub_intent
    return ""
# -------------------------------
# Named Entity Extraction
# -------------------------------
def extract_entities_spacy(question: str) -> List[str]:
    """
    Use the module-level SciSpaCy NER model to extract biomedical entities.

    Parameters:
        question (str): User query.

    Returns:
        List[str]: Unique entity texts, in order of first appearance in the
        query.
    """
    doc = ner_model(question)
    # dict.fromkeys de-duplicates while keeping insertion order, so the result
    # is stable across runs (a plain set would yield an arbitrary order that
    # changes with string hash randomisation).
    return list(dict.fromkeys(ent.text for ent in doc.ents))
# -------------------------------
# Query Preprocessing Wrapper
# -------------------------------
def preprocess_query(raw_query: str) -> Tuple[Tuple[str, str], List[str]]:
    """
    Run the full preprocessing pipeline on a raw user question.

    The query is passed through intent classification, subsection
    classification and entity extraction, and a status line is printed
    describing the outcome.

    Parameters:
        raw_query (str): The raw user question.

    Returns:
        Tuple[Tuple[str, str], List[str]]: ((intent, sub_intent), entities).
        The entity list is empty when none were found; if any step raises,
        the error is printed and (("", ""), []) is returned.
    """
    try:
        intent = classify_intent(raw_query)
        sub_intent = classify_subsection(raw_query)
        entities = extract_entities_spacy(raw_query)
        # Normalise falsy labels to empty strings once for both outcomes.
        labels = (intent or "", sub_intent or "")
        if entities:
            print(f"[Query Processed] Intent = {intent} | Subsection = {sub_intent} | Entities = {entities}")
            return labels, entities
        print("[NER fallback] No entities found. Using raw query.")
        return labels, []
    except Exception as e:
        # Best-effort boundary: report the failure and hand back empty results.
        print(f"[Preprocessing failed] {e}")
        return ("", ""), []