| from __future__ import annotations | |
| import re | |
| from pathlib import Path | |
| from threading import Lock | |
| from sentence_transformers import SentenceTransformer | |
# Canonical example phrases per intent label. Each phrase is embedded once at
# service startup (NLPService._bootstrap) and incoming text is scored against
# the embeddings by cosine similarity. The raw strings are ALSO used for
# direct keyword matching (e.g. the "greeting", "educational" and
# "assessment" lists), so edit values with care.
INTENT_PHRASES: dict[str, list[str]] = {
    "greeting": ["hi", "hello", "hey", "good morning", "good evening", "how are you"],
    "emergency": [
        "emergency",
        "call ambulance",
        "heart attack",
        "can't breathe",
        "dying",
        "severe chest pain",
        "unconscious",
        "not breathing",
    ],
    "educational": [
        "what is",
        "what causes",
        "how does",
        "explain",
        "tell me about",
        "symptoms of",
        "treatment for",
        "prevention of",
    ],
    "assessment": [
        "i have",
        "i feel",
        "i am experiencing",
        "my chest",
        "i've been",
        "i noticed",
        "i'm suffering",
        "i got diagnosed",
    ],
}
# Canonical phrases per disease topic, embedded the same way as
# INTENT_PHRASES and matched in NLPService.detect_disease.
DISEASE_PHRASES: dict[str, list[str]] = {
    "heart": [
        "chest pain",
        "heart attack",
        "cardiac",
        "palpitations",
        "angina",
        "heart disease",
        "coronary",
        "arrhythmia",
    ],
    "diabetes": ["blood sugar", "diabetes", "insulin", "glucose", "diabetic"],
    "asthma": ["asthma", "can't breathe", "inhaler", "wheezing", "shortness of breath"],
    "liver": ["liver pain", "jaundice", "hepatitis", "liver disease"],
    "kidney": ["kidney pain", "kidney stone", "renal", "dialysis"],
    "mental_health": ["depression", "anxiety", "mental health", "stress", "panic attack"],
    "cancer": ["cancer", "tumor", "chemotherapy", "malignant"],
}
# Question-style words that suggest an informational query rather than a
# personal symptom report (used by NLPService._looks_educational).
EDUCATIONAL_QUERY_MARKERS: list[str] = [
    "what",
    "why",
    "how",
    "explain",
    "tell me",
    "symptoms",
    "causes",
    "treatment",
    "prevention",
]
# Phrases that by themselves are enough to classify text as an emergency,
# regardless of embedding scores (NLPService._looks_emergency, rule 1).
EMERGENCY_STRONG_PHRASES: list[str] = [
    "i think i'm having a heart attack",
    "i think i am having a heart attack",
    "i am having a heart attack",
    "not breathing",
    "unconscious",
    "passed out",
    "severe chest pain",
]
# Severe symptoms that signal an emergency when combined with an urgency cue
# or a high "emergency" embedding score.
EMERGENCY_SEVERE_SIGNS: list[str] = [
    "can't breathe",
    "cannot breathe",
    "severe chest pain",
    "unconscious",
    "not breathing",
    "passing out",
    "passed out",
    "heart attack",
]
# Words expressing immediacy; a severe sign + an urgency cue => emergency.
EMERGENCY_URGENCY_CUES: list[str] = [
    "right now",
    "now",
    "urgent",
    "help me",
    "help",
    "immediately",
    "emergency",
]
# First-person markers: their presence biases toward "assessment" (the user is
# describing themselves), their absence biases toward "educational".
FIRST_PERSON_MARKERS: list[str] = [
    "i have",
    "i feel",
    "i am",
    "i'm",
    "my",
    "i've",
    "i noticed",
]
# Generic symptom words; first-person marker + symptom => assessment hint
# (NLPService._looks_assessment).
SYMPTOM_MARKERS: list[str] = [
    "pain",
    "hurt",
    "hurts",
    "breathing",
    "wheezing",
    "dizzy",
    "fatigue",
    "vomit",
    "nausea",
    "fever",
    "cough",
]
class NLPService:
    """Singleton NLP router that classifies user text into an intent
    (greeting / emergency / educational / assessment) and a disease topic.

    Classification combines two signals:

    * embedding similarity against the canonical phrases in
      ``INTENT_PHRASES`` / ``DISEASE_PHRASES`` (sentence-transformers with
      normalized embeddings, so dot product == cosine similarity), and
    * keyword heuristics (the ``_looks_*`` helpers) that short-circuit the
      embedding scores for high-confidence patterns such as greetings and
      emergencies.

    The class is a thread-safe lazy singleton: ``NLPService()`` always returns
    the same instance, and the embedding model is loaded on first use.
    """

    _instance: "NLPService | None" = None
    _instance_lock = Lock()  # guards singleton creation
    _model_lock = Lock()  # guards lazy model loading
    _model_name = "sentence-transformers/all-MiniLM-L6-v2"
    # Prefer a vendored copy of the model (offline deployments); fall back to
    # resolving the model by name (e.g. a hub download) when it is absent.
    _local_model_dir = (
        Path(__file__).resolve().parents[2] / "models" / "sentence-transformers__all-MiniLM-L6-v2"
    )

    def __new__(cls) -> "NLPService":
        # Double-checked locking: cheap unlocked read first, then re-check
        # under the lock so concurrent first callers create exactly one
        # instance.
        if cls._instance is None:
            with cls._instance_lock:
                if cls._instance is None:
                    cls._instance = super().__new__(cls)
                    cls._instance._initialized = False
        return cls._instance

    def __init__(self) -> None:
        """Bootstrap reference embeddings exactly once for the singleton."""
        # __init__ runs on every NLPService() call against the shared
        # instance; skip re-initialization after the first time.
        if self._initialized:
            return
        self._model: SentenceTransformer | None = None
        self._intent_embeddings: dict[str, list[list[float]]] = {}
        self._disease_embeddings: dict[str, list[list[float]]] = {}
        self._bootstrap()
        self._initialized = True

    def _bootstrap(self) -> None:
        """Pre-compute embeddings for every canonical intent/disease phrase."""
        self._intent_embeddings = self._encode_phrase_groups(INTENT_PHRASES)
        self._disease_embeddings = self._encode_phrase_groups(DISEASE_PHRASES)

    def _load_model(self) -> SentenceTransformer:
        """Lazily load the sentence-transformer model (double-checked lock)."""
        if self._model is None:
            with self._model_lock:
                if self._model is None:
                    model_source = (
                        str(self._local_model_dir)
                        if self._local_model_dir.exists()
                        else self._model_name
                    )
                    self._model = SentenceTransformer(model_source)
                    print(f"NLP routing model loaded: {self._model_name}")
        return self._model

    def _encode(self, texts: list[str]) -> list[list[float]]:
        """Encode *texts* into L2-normalized embedding vectors.

        Normalization means a plain dot product equals cosine similarity
        (see ``_cosine_similarity``).
        """
        model = self._load_model()
        embeddings = model.encode(
            texts,
            normalize_embeddings=True,
            show_progress_bar=False,
        )
        return embeddings.tolist()

    def _encode_phrase_groups(
        self, phrase_groups: dict[str, list[str]]
    ) -> dict[str, list[list[float]]]:
        """Encode all phrases in one batch, then regroup embeddings by label."""
        labels: list[str] = []
        phrases: list[str] = []
        for label, canonical_phrases in phrase_groups.items():
            for phrase in canonical_phrases:
                labels.append(label)
                phrases.append(phrase)
        # Single batched encode call instead of one call per label.
        encoded_phrases = self._encode(phrases)
        grouped_embeddings: dict[str, list[list[float]]] = {label: [] for label in phrase_groups}
        for label, embedding in zip(labels, encoded_phrases):
            grouped_embeddings[label].append(embedding)
        return grouped_embeddings

    @staticmethod
    def _cosine_similarity(vector_a: list[float], vector_b: list[float]) -> float:
        """Dot product of two vectors; equals cosine similarity because the
        encoder returns normalized embeddings.

        BUG FIX: was declared without ``self`` yet called via
        ``self._cosine_similarity(...)``, which raised TypeError; it must be a
        ``@staticmethod``.
        """
        return float(sum(value_a * value_b for value_a, value_b in zip(vector_a, vector_b)))

    def _best_match(
        self,
        text: str,
        reference_embeddings: dict[str, list[list[float]]],
    ) -> tuple[str, float]:
        """Return the (label, score) pair with the highest similarity score."""
        label_scores = self._label_scores(text, reference_embeddings)
        best_label, best_score = max(label_scores.items(), key=lambda item: item[1])
        return best_label, best_score

    def _label_scores(
        self,
        text: str,
        reference_embeddings: dict[str, list[list[float]]],
    ) -> dict[str, float]:
        """Score *text* against every label.

        Each label's score is the max similarity over that label's canonical
        phrase embeddings (best-phrase match, not an average).
        """
        query_embedding = self._encode([text or ""])[0]
        label_scores: dict[str, float] = {}
        for label, embeddings in reference_embeddings.items():
            label_scores[label] = max(
                self._cosine_similarity(query_embedding, embedding) for embedding in embeddings
            )
        return label_scores

    @staticmethod
    def _has_phrase(text: str, phrases: list[str]) -> bool:
        """True when any of *phrases* occurs in *text* as a whole-word match.

        BUG FIX: missing ``@staticmethod`` — called via ``self._has_phrase``.
        """
        normalized_text = (text or "").lower()
        return any(NLPService._phrase_in_text(normalized_text, phrase) for phrase in phrases)

    @staticmethod
    def _phrase_in_text(text: str, phrase: str) -> bool:
        """Whole-word, whitespace-tolerant search for *phrase* inside *text*.

        Both sides are lowercased and whitespace-collapsed; internal spaces in
        the phrase match any run of whitespace. Word boundaries prevent
        substring hits (e.g. "pain" does not match "painless").

        BUG FIX: missing ``@staticmethod``.
        """
        normalized_text = " ".join((text or "").lower().split())
        normalized_phrase = " ".join((phrase or "").lower().split())
        if not normalized_phrase:
            return False
        pattern = r"\b" + re.escape(normalized_phrase).replace(r"\ ", r"\s+") + r"\b"
        return re.search(pattern, normalized_text) is not None

    @staticmethod
    def _contains_any(text: str, phrases: list[str]) -> bool:
        """True when at least one of *phrases* whole-word-matches *text*.

        BUG FIX: missing ``@staticmethod``.
        """
        return any(NLPService._phrase_in_text(text, phrase) for phrase in phrases)

    @classmethod
    def _looks_educational(cls, normalized_text: str) -> bool:
        """Heuristic: does the text read like an informational question?

        BUG FIX: takes ``cls`` but lacked ``@classmethod``, so the inner
        ``cls._contains_any(...)`` calls raised TypeError at runtime.
        """
        if cls._contains_any(normalized_text, INTENT_PHRASES["educational"]):
            return True
        # Question words without first-person markers suggest a general query,
        # not a personal symptom report.
        if cls._contains_any(normalized_text, EDUCATIONAL_QUERY_MARKERS) and not cls._contains_any(
            normalized_text, FIRST_PERSON_MARKERS
        ):
            return True
        if "?" in normalized_text and cls._contains_any(normalized_text, EDUCATIONAL_QUERY_MARKERS):
            return True
        return False

    @classmethod
    def _looks_assessment(cls, normalized_text: str) -> bool:
        """Heuristic: is the user describing their own symptoms?

        BUG FIX: missing ``@classmethod`` (see ``_looks_educational``).
        """
        has_first_person = cls._contains_any(normalized_text, FIRST_PERSON_MARKERS)
        has_assessment_phrase = cls._contains_any(normalized_text, INTENT_PHRASES["assessment"])
        has_symptom = cls._contains_any(normalized_text, SYMPTOM_MARKERS)
        return has_assessment_phrase or (has_first_person and has_symptom)

    @classmethod
    def _looks_emergency(cls, normalized_text: str, label_scores: dict[str, float]) -> bool:
        """Heuristic emergency detector, biased toward recall.

        Order matters: strong phrases always win; educational phrasing (e.g.
        "what causes heart attacks?") suppresses the weaker rules; otherwise a
        severe sign plus either an urgency cue or a high "emergency" embedding
        score triggers.

        BUG FIX: missing ``@classmethod`` (see ``_looks_educational``).
        """
        if cls._contains_any(normalized_text, EMERGENCY_STRONG_PHRASES):
            return True
        if cls._looks_educational(normalized_text):
            return False
        has_severe_sign = cls._contains_any(normalized_text, EMERGENCY_SEVERE_SIGNS)
        has_urgency_cue = cls._contains_any(normalized_text, EMERGENCY_URGENCY_CUES)
        if has_severe_sign and has_urgency_cue:
            return True
        return has_severe_sign and label_scores["emergency"] >= 0.78

    def classify_intent(self, text: str) -> dict:
        """Classify *text* into one intent.

        Returns a dict with ``intent``, ``confidence`` and
        ``needs_llm_tiebreaker`` (True when the embedding scores are too close
        or too low to trust, so a downstream LLM should arbitrate).
        """
        label_scores = self._label_scores(text, self._intent_embeddings)
        normalized_text = (text or "").lower().strip()
        sorted_scores = sorted(label_scores.values(), reverse=True)
        # Gap between the top two embedding scores; a small margin means the
        # embedding signal alone is ambiguous.
        best_margin = sorted_scores[0] - sorted_scores[1] if len(sorted_scores) > 1 else sorted_scores[0]
        # Rule 1: explicit greeting keywords win outright.
        if self._has_phrase(normalized_text, INTENT_PHRASES["greeting"]):
            confidence = max(label_scores["greeting"], 0.90)
            return {
                "intent": "greeting",
                "confidence": confidence,
                "needs_llm_tiebreaker": False,
            }
        # Rule 2: emergencies take precedence over everything below.
        if self._looks_emergency(normalized_text, label_scores):
            confidence = max(label_scores["emergency"], 0.86)
            return {
                "intent": "emergency",
                "confidence": confidence,
                "needs_llm_tiebreaker": False,
            }
        educational_hint = self._looks_educational(normalized_text)
        assessment_hint = self._looks_assessment(normalized_text)
        # Rule 3: exactly one heuristic fired — trust it.
        if educational_hint and not assessment_hint:
            confidence = max(label_scores["educational"], 0.85)
            return {
                "intent": "educational",
                "confidence": confidence,
                "needs_llm_tiebreaker": False,
            }
        if assessment_hint and not educational_hint:
            confidence = max(label_scores["assessment"], 0.85)
            return {
                "intent": "assessment",
                "confidence": confidence,
                "needs_llm_tiebreaker": False,
            }
        # Rule 4: both heuristics fired — prefer assessment only when its
        # embedding score clearly beats educational; flag close calls.
        if educational_hint and assessment_hint:
            if label_scores["assessment"] >= label_scores["educational"] + 0.04:
                intent = "assessment"
                confidence = max(label_scores["assessment"], 0.80)
            else:
                intent = "educational"
                confidence = max(label_scores["educational"], 0.80)
            return {
                "intent": intent,
                "confidence": confidence,
                "needs_llm_tiebreaker": best_margin < 0.06,
            }
        # Fallback: pure embedding vote; mid-band confidence or a thin margin
        # requests an LLM tiebreak.
        intent, confidence = max(label_scores.items(), key=lambda item: item[1])
        return {
            "intent": intent,
            "confidence": confidence,
            "needs_llm_tiebreaker": (0.30 <= confidence <= 0.65) or best_margin < 0.05,
        }

    def detect_disease(self, text: str) -> dict:
        """Map *text* to the closest disease topic, or "general" below 0.30."""
        disease_id, confidence = self._best_match(text, self._disease_embeddings)
        if confidence < 0.30:
            return {"disease_id": "general", "confidence": 0.0}
        return {"disease_id": disease_id, "confidence": confidence}

    def process(self, text: str) -> dict:
        """Run intent classification and disease detection; merge the results."""
        intent_result = self.classify_intent(text)
        disease_result = self.detect_disease(text)
        return {
            "intent": intent_result["intent"],
            "intent_confidence": intent_result["confidence"],
            "disease_id": disease_result["disease_id"],
            "disease_confidence": disease_result["confidence"],
            "needs_llm_tiebreaker": intent_result["needs_llm_tiebreaker"],
        }
# NOTE(review): the lines below were hosting-page residue ("Xet Storage
# Details") fused into the file during extraction, not part of the module.
# Preserved as a comment so the file remains valid Python:
#   Size: 11.7 kB
#   Xet hash: 30a48b62b57a1cd5017197e40431ea2661ec0a5e6166b58e6b651ce00acfd299