# meet4150/ALIV_AI — app/nlp/nlp_service.py
from __future__ import annotations

import logging
import re
from pathlib import Path
from threading import Lock

from sentence_transformers import SentenceTransformer
INTENT_PHRASES = {
"greeting": ["hi", "hello", "hey", "good morning", "good evening", "how are you"],
"emergency": [
"emergency",
"call ambulance",
"heart attack",
"can't breathe",
"dying",
"severe chest pain",
"unconscious",
"not breathing",
],
"educational": [
"what is",
"what causes",
"how does",
"explain",
"tell me about",
"symptoms of",
"treatment for",
"prevention of",
],
"assessment": [
"i have",
"i feel",
"i am experiencing",
"my chest",
"i've been",
"i noticed",
"i'm suffering",
"i got diagnosed",
],
}
DISEASE_PHRASES = {
"heart": [
"chest pain",
"heart attack",
"cardiac",
"palpitations",
"angina",
"heart disease",
"coronary",
"arrhythmia",
],
"diabetes": ["blood sugar", "diabetes", "insulin", "glucose", "diabetic"],
"asthma": ["asthma", "can't breathe", "inhaler", "wheezing", "shortness of breath"],
"liver": ["liver pain", "jaundice", "hepatitis", "liver disease"],
"kidney": ["kidney pain", "kidney stone", "renal", "dialysis"],
"mental_health": ["depression", "anxiety", "mental health", "stress", "panic attack"],
"cancer": ["cancer", "tumor", "chemotherapy", "malignant"],
}
EDUCATIONAL_QUERY_MARKERS = [
"what",
"why",
"how",
"explain",
"tell me",
"symptoms",
"causes",
"treatment",
"prevention",
]
EMERGENCY_STRONG_PHRASES = [
"i think i'm having a heart attack",
"i think i am having a heart attack",
"i am having a heart attack",
"not breathing",
"unconscious",
"passed out",
"severe chest pain",
]
EMERGENCY_SEVERE_SIGNS = [
"can't breathe",
"cannot breathe",
"severe chest pain",
"unconscious",
"not breathing",
"passing out",
"passed out",
"heart attack",
]
EMERGENCY_URGENCY_CUES = [
"right now",
"now",
"urgent",
"help me",
"help",
"immediately",
"emergency",
]
FIRST_PERSON_MARKERS = [
"i have",
"i feel",
"i am",
"i'm",
"my",
"i've",
"i noticed",
]
SYMPTOM_MARKERS = [
"pain",
"hurt",
"hurts",
"breathing",
"wheezing",
"dizzy",
"fatigue",
"vomit",
"nausea",
"fever",
"cough",
]
class NLPService:
    """Singleton service that routes user messages by intent and disease topic.

    Routing combines two signals:

    1. Word-boundary keyword heuristics over the module-level phrase lists
       (fast, deterministic, catches the obvious cases first).
    2. Cosine similarity between the message embedding and pre-computed
       reference-phrase embeddings from a MiniLM sentence-transformer,
       used as the score/fallback when heuristics are ambiguous.

    ``NLPService()`` always returns the same instance (thread-safe,
    double-checked locking); the embedding model is loaded lazily.
    """

    _instance: "NLPService | None" = None
    _instance_lock = Lock()  # guards singleton construction
    _model_lock = Lock()  # guards lazy model loading
    _model_name = "sentence-transformers/all-MiniLM-L6-v2"
    # Vendored model directory, preferred over a hub download when present.
    # NOTE(review): parents[2] assumes this file lives two levels below the
    # project root (app/nlp/) — confirm if the file is ever moved.
    _local_model_dir = (
        Path(__file__).resolve().parents[2] / "models" / "sentence-transformers__all-MiniLM-L6-v2"
    )

    def __new__(cls) -> "NLPService":
        # Double-checked locking: cheap unlocked test first, then re-check
        # under the lock so concurrent first callers build only one instance.
        if cls._instance is None:
            with cls._instance_lock:
                if cls._instance is None:
                    cls._instance = super().__new__(cls)
                    cls._instance._initialized = False
        return cls._instance

    def __init__(self) -> None:
        # __init__ runs on every NLPService() call; only initialize once.
        if self._initialized:
            return
        self._model: SentenceTransformer | None = None
        self._intent_embeddings: dict[str, list[list[float]]] = {}
        self._disease_embeddings: dict[str, list[list[float]]] = {}
        self._bootstrap()
        self._initialized = True

    def _bootstrap(self) -> None:
        """Pre-compute reference embeddings for all intent/disease phrases."""
        self._intent_embeddings = self._encode_phrase_groups(INTENT_PHRASES)
        self._disease_embeddings = self._encode_phrase_groups(DISEASE_PHRASES)

    def _load_model(self) -> SentenceTransformer:
        """Load the sentence-transformer lazily; thread-safe and idempotent."""
        if self._model is None:
            with self._model_lock:
                if self._model is None:
                    model_source = (
                        str(self._local_model_dir)
                        if self._local_model_dir.exists()
                        else self._model_name
                    )
                    self._model = SentenceTransformer(model_source)
                    # Fix: report through logging (with lazy %-args) instead
                    # of print() so the host application controls verbosity.
                    logging.getLogger(__name__).info(
                        "NLP routing model loaded: %s", self._model_name
                    )
        return self._model

    def _encode(self, texts: list[str]) -> list[list[float]]:
        """Embed *texts*; vectors are L2-normalized so dot product == cosine."""
        model = self._load_model()
        embeddings = model.encode(
            texts,
            normalize_embeddings=True,
            show_progress_bar=False,
        )
        return embeddings.tolist()

    def _encode_phrase_groups(
        self, phrase_groups: dict[str, list[str]]
    ) -> dict[str, list[list[float]]]:
        """Encode every phrase in one batched call, then regroup by label.

        Flattening first keeps the model call to a single batch instead of
        one call per label.
        """
        labels: list[str] = []
        phrases: list[str] = []
        for label, canonical_phrases in phrase_groups.items():
            for phrase in canonical_phrases:
                labels.append(label)
                phrases.append(phrase)
        encoded_phrases = self._encode(phrases)
        grouped_embeddings: dict[str, list[list[float]]] = {label: [] for label in phrase_groups}
        for label, embedding in zip(labels, encoded_phrases):
            grouped_embeddings[label].append(embedding)
        return grouped_embeddings

    @staticmethod
    def _cosine_similarity(vector_a: list[float], vector_b: list[float]) -> float:
        """Dot product of two vectors; equals cosine when both are unit-norm."""
        return float(sum(value_a * value_b for value_a, value_b in zip(vector_a, vector_b)))

    def _best_match(
        self,
        text: str,
        reference_embeddings: dict[str, list[list[float]]],
    ) -> tuple[str, float]:
        """Return the (label, score) pair with the highest similarity to *text*."""
        label_scores = self._label_scores(text, reference_embeddings)
        best_label, best_score = max(label_scores.items(), key=lambda item: item[1])
        return best_label, best_score

    def _label_scores(
        self,
        text: str,
        reference_embeddings: dict[str, list[list[float]]],
    ) -> dict[str, float]:
        """Score *text* against each label as the max similarity to its phrases."""
        query_embedding = self._encode([text or ""])[0]
        label_scores: dict[str, float] = {}
        for label, embeddings in reference_embeddings.items():
            label_scores[label] = max(
                self._cosine_similarity(query_embedding, embedding) for embedding in embeddings
            )
        return label_scores

    @staticmethod
    def _has_phrase(text: str, phrases: list[str]) -> bool:
        """True if any of *phrases* occurs in *text* (word-boundary match).

        Fix: previously duplicated _contains_any and lowercased the text a
        second time (``_phrase_in_text`` already normalizes case); now it
        simply delegates. Kept for backward compatibility with callers.
        """
        return NLPService._contains_any(text, phrases)

    @staticmethod
    def _phrase_in_text(text: str, phrase: str) -> bool:
        """Word-boundary, case- and whitespace-insensitive phrase search."""
        normalized_text = " ".join((text or "").lower().split())
        normalized_phrase = " ".join((phrase or "").lower().split())
        if not normalized_phrase:
            return False
        # Escape the phrase, then let any run of whitespace match between words.
        pattern = r"\b" + re.escape(normalized_phrase).replace(r"\ ", r"\s+") + r"\b"
        return re.search(pattern, normalized_text) is not None

    @staticmethod
    def _contains_any(text: str, phrases: list[str]) -> bool:
        """True if any phrase in *phrases* appears in *text*."""
        return any(NLPService._phrase_in_text(text, phrase) for phrase in phrases)

    @classmethod
    def _looks_educational(cls, normalized_text: str) -> bool:
        """Heuristic: the message asks a general question about a condition."""
        if cls._contains_any(normalized_text, INTENT_PHRASES["educational"]):
            return True
        # Question words without first-person markers read as educational.
        if cls._contains_any(normalized_text, EDUCATIONAL_QUERY_MARKERS) and not cls._contains_any(
            normalized_text, FIRST_PERSON_MARKERS
        ):
            return True
        if "?" in normalized_text and cls._contains_any(normalized_text, EDUCATIONAL_QUERY_MARKERS):
            return True
        return False

    @classmethod
    def _looks_assessment(cls, normalized_text: str) -> bool:
        """Heuristic: the user describes their own symptoms/condition."""
        has_first_person = cls._contains_any(normalized_text, FIRST_PERSON_MARKERS)
        has_assessment_phrase = cls._contains_any(normalized_text, INTENT_PHRASES["assessment"])
        has_symptom = cls._contains_any(normalized_text, SYMPTOM_MARKERS)
        return has_assessment_phrase or (has_first_person and has_symptom)

    @classmethod
    def _looks_emergency(cls, normalized_text: str, label_scores: dict[str, float]) -> bool:
        """Heuristic emergency check; errs toward not flagging educational text."""
        if cls._contains_any(normalized_text, EMERGENCY_STRONG_PHRASES):
            return True
        # A question *about* emergencies is educational, not an emergency.
        if cls._looks_educational(normalized_text):
            return False
        has_severe_sign = cls._contains_any(normalized_text, EMERGENCY_SEVERE_SIGNS)
        has_urgency_cue = cls._contains_any(normalized_text, EMERGENCY_URGENCY_CUES)
        if has_severe_sign and has_urgency_cue:
            return True
        # Severe sign alone needs strong embedding support (0.78 threshold).
        return has_severe_sign and label_scores["emergency"] >= 0.78

    def classify_intent(self, text: str) -> dict:
        """Classify *text* into greeting/emergency/educational/assessment.

        Returns a dict with ``intent``, ``confidence`` and
        ``needs_llm_tiebreaker`` (True when the heuristics and embedding
        scores are too close to call and an LLM should arbitrate).
        """
        label_scores = self._label_scores(text, self._intent_embeddings)
        normalized_text = (text or "").lower().strip()
        sorted_scores = sorted(label_scores.values(), reverse=True)
        # Margin between top-2 scores; degenerate single-label case keeps top score.
        best_margin = sorted_scores[0] - sorted_scores[1] if len(sorted_scores) > 1 else sorted_scores[0]
        # Cascade: greeting > emergency > unambiguous hints > hinted tie > embeddings.
        if self._has_phrase(normalized_text, INTENT_PHRASES["greeting"]):
            confidence = max(label_scores["greeting"], 0.90)
            return {
                "intent": "greeting",
                "confidence": confidence,
                "needs_llm_tiebreaker": False,
            }
        if self._looks_emergency(normalized_text, label_scores):
            confidence = max(label_scores["emergency"], 0.86)
            return {
                "intent": "emergency",
                "confidence": confidence,
                "needs_llm_tiebreaker": False,
            }
        educational_hint = self._looks_educational(normalized_text)
        assessment_hint = self._looks_assessment(normalized_text)
        if educational_hint and not assessment_hint:
            confidence = max(label_scores["educational"], 0.85)
            return {
                "intent": "educational",
                "confidence": confidence,
                "needs_llm_tiebreaker": False,
            }
        if assessment_hint and not educational_hint:
            confidence = max(label_scores["assessment"], 0.85)
            return {
                "intent": "assessment",
                "confidence": confidence,
                "needs_llm_tiebreaker": False,
            }
        if educational_hint and assessment_hint:
            # Both hints fired: prefer assessment only with a clear 0.04 lead.
            if label_scores["assessment"] >= label_scores["educational"] + 0.04:
                intent = "assessment"
                confidence = max(label_scores["assessment"], 0.80)
            else:
                intent = "educational"
                confidence = max(label_scores["educational"], 0.80)
            return {
                "intent": intent,
                "confidence": confidence,
                "needs_llm_tiebreaker": best_margin < 0.06,
            }
        # No heuristic fired: fall back to the best embedding score.
        intent, confidence = max(label_scores.items(), key=lambda item: item[1])
        return {
            "intent": intent,
            "confidence": confidence,
            "needs_llm_tiebreaker": (0.30 <= confidence <= 0.65) or best_margin < 0.05,
        }

    def detect_disease(self, text: str) -> dict:
        """Return ``{"disease_id", "confidence"}`` for *text*.

        Scores below 0.30 are treated as no match and reported as the
        "general" topic with zero confidence.
        """
        disease_id, confidence = self._best_match(text, self._disease_embeddings)
        if confidence < 0.30:
            return {"disease_id": "general", "confidence": 0.0}
        return {"disease_id": disease_id, "confidence": confidence}

    def process(self, text: str) -> dict:
        """Run intent + disease routing and return one flat result dict."""
        intent_result = self.classify_intent(text)
        disease_result = self.detect_disease(text)
        return {
            "intent": intent_result["intent"],
            "intent_confidence": intent_result["confidence"],
            "disease_id": disease_result["disease_id"],
            "disease_confidence": disease_result["confidence"],
            "needs_llm_tiebreaker": intent_result["needs_llm_tiebreaker"],
        }

Xet Storage Details

Size:
11.7 kB
·
Xet hash:
30a48b62b57a1cd5017197e40431ea2661ec0a5e6166b58e6b651ce00acfd299

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.