# meet4150/ALIV_AI — app/nlp/nlp_service.py
from __future__ import annotations

import logging
import re
from pathlib import Path
from threading import Lock

from sentence_transformers import SentenceTransformer
INTENT_PHRASES = {
"greeting": ["hi", "hello", "hey", "good morning", "good evening", "how are you"],
"emergency": [
"emergency",
"call ambulance",
"heart attack",
"can't breathe",
"dying",
"severe chest pain",
"unconscious",
"not breathing",
],
"educational": [
"what is",
"what causes",
"how does",
"explain",
"tell me about",
"symptoms of",
"treatment for",
"prevention of",
],
"assessment": [
"i have",
"i feel",
"i am experiencing",
"my chest",
"i've been",
"i noticed",
"i'm suffering",
"i got diagnosed",
],
}
DISEASE_PHRASES = {
"heart": [
"chest pain",
"heart attack",
"cardiac",
"palpitations",
"angina",
"heart disease",
"coronary",
"arrhythmia",
],
"diabetes": ["blood sugar", "diabetes", "insulin", "glucose", "diabetic"],
"asthma": ["asthma", "can't breathe", "inhaler", "wheezing", "shortness of breath"],
"liver": ["liver pain", "jaundice", "hepatitis", "liver disease"],
"kidney": ["kidney pain", "kidney stone", "renal", "dialysis"],
"mental_health": ["depression", "anxiety", "mental health", "stress", "panic attack"],
"cancer": ["cancer", "tumor", "chemotherapy", "malignant"],
}
EDUCATIONAL_QUERY_MARKERS = [
"what",
"why",
"how",
"explain",
"tell me",
"symptoms",
"causes",
"treatment",
"prevention",
]
EMERGENCY_STRONG_PHRASES = [
"i think i'm having a heart attack",
"i think i am having a heart attack",
"i am having a heart attack",
"not breathing",
"unconscious",
"passed out",
"severe chest pain",
]
EMERGENCY_SEVERE_SIGNS = [
"can't breathe",
"cannot breathe",
"severe chest pain",
"unconscious",
"not breathing",
"passing out",
"passed out",
"heart attack",
]
EMERGENCY_URGENCY_CUES = [
"right now",
"now",
"urgent",
"help me",
"help",
"immediately",
"emergency",
]
FIRST_PERSON_MARKERS = [
"i have",
"i feel",
"i am",
"i'm",
"my",
"i've",
"i noticed",
]
SYMPTOM_MARKERS = [
"pain",
"hurt",
"hurts",
"breathing",
"wheezing",
"dizzy",
"fatigue",
"vomit",
"nausea",
"fever",
"cough",
]
class NLPService:
    """Singleton service that routes user messages by intent and disease topic.

    Routing combines two signals:

    1. Word-boundary keyword heuristics over the module-level phrase lists
       (fast, deterministic, catches the obvious cases first).
    2. Cosine similarity between the message embedding and pre-computed
       reference-phrase embeddings from a MiniLM sentence-transformer,
       used as the score/fallback when heuristics are ambiguous.

    ``NLPService()`` always returns the same instance (thread-safe,
    double-checked locking); the embedding model is loaded lazily.
    """

    _instance: "NLPService | None" = None
    _instance_lock = Lock()  # guards singleton construction
    _model_lock = Lock()  # guards lazy model loading
    _model_name = "sentence-transformers/all-MiniLM-L6-v2"
    # Vendored model directory, preferred over a hub download when present.
    # NOTE(review): parents[2] assumes this file lives two levels below the
    # project root (app/nlp/) — confirm if the file is ever moved.
    _local_model_dir = (
        Path(__file__).resolve().parents[2] / "models" / "sentence-transformers__all-MiniLM-L6-v2"
    )

    def __new__(cls) -> "NLPService":
        # Double-checked locking: cheap unlocked test first, then re-check
        # under the lock so concurrent first callers build only one instance.
        if cls._instance is None:
            with cls._instance_lock:
                if cls._instance is None:
                    cls._instance = super().__new__(cls)
                    cls._instance._initialized = False
        return cls._instance

    def __init__(self) -> None:
        # __init__ runs on every NLPService() call; only initialize once.
        if self._initialized:
            return
        self._model: SentenceTransformer | None = None
        self._intent_embeddings: dict[str, list[list[float]]] = {}
        self._disease_embeddings: dict[str, list[list[float]]] = {}
        self._bootstrap()
        self._initialized = True

    def _bootstrap(self) -> None:
        """Pre-compute reference embeddings for all intent/disease phrases."""
        self._intent_embeddings = self._encode_phrase_groups(INTENT_PHRASES)
        self._disease_embeddings = self._encode_phrase_groups(DISEASE_PHRASES)

    def _load_model(self) -> SentenceTransformer:
        """Load the sentence-transformer lazily; thread-safe and idempotent."""
        if self._model is None:
            with self._model_lock:
                if self._model is None:
                    model_source = (
                        str(self._local_model_dir)
                        if self._local_model_dir.exists()
                        else self._model_name
                    )
                    self._model = SentenceTransformer(model_source)
                    # Fix: report through logging (with lazy %-args) instead
                    # of print() so the host application controls verbosity.
                    logging.getLogger(__name__).info(
                        "NLP routing model loaded: %s", self._model_name
                    )
        return self._model

    def _encode(self, texts: list[str]) -> list[list[float]]:
        """Embed *texts*; vectors are L2-normalized so dot product == cosine."""
        model = self._load_model()
        embeddings = model.encode(
            texts,
            normalize_embeddings=True,
            show_progress_bar=False,
        )
        return embeddings.tolist()

    def _encode_phrase_groups(
        self, phrase_groups: dict[str, list[str]]
    ) -> dict[str, list[list[float]]]:
        """Encode every phrase in one batched call, then regroup by label.

        Flattening first keeps the model call to a single batch instead of
        one call per label.
        """
        labels: list[str] = []
        phrases: list[str] = []
        for label, canonical_phrases in phrase_groups.items():
            for phrase in canonical_phrases:
                labels.append(label)
                phrases.append(phrase)
        encoded_phrases = self._encode(phrases)
        grouped_embeddings: dict[str, list[list[float]]] = {label: [] for label in phrase_groups}
        for label, embedding in zip(labels, encoded_phrases):
            grouped_embeddings[label].append(embedding)
        return grouped_embeddings

    @staticmethod
    def _cosine_similarity(vector_a: list[float], vector_b: list[float]) -> float:
        """Dot product of two vectors; equals cosine when both are unit-norm."""
        return float(sum(value_a * value_b for value_a, value_b in zip(vector_a, vector_b)))

    def _best_match(
        self,
        text: str,
        reference_embeddings: dict[str, list[list[float]]],
    ) -> tuple[str, float]:
        """Return the (label, score) pair with the highest similarity to *text*."""
        label_scores = self._label_scores(text, reference_embeddings)
        best_label, best_score = max(label_scores.items(), key=lambda item: item[1])
        return best_label, best_score

    def _label_scores(
        self,
        text: str,
        reference_embeddings: dict[str, list[list[float]]],
    ) -> dict[str, float]:
        """Score *text* against each label as the max similarity to its phrases."""
        query_embedding = self._encode([text or ""])[0]
        label_scores: dict[str, float] = {}
        for label, embeddings in reference_embeddings.items():
            label_scores[label] = max(
                self._cosine_similarity(query_embedding, embedding) for embedding in embeddings
            )
        return label_scores

    @staticmethod
    def _has_phrase(text: str, phrases: list[str]) -> bool:
        """True if any of *phrases* occurs in *text* (word-boundary match).

        Fix: previously duplicated _contains_any and lowercased the text a
        second time (``_phrase_in_text`` already normalizes case); now it
        simply delegates. Kept for backward compatibility with callers.
        """
        return NLPService._contains_any(text, phrases)

    @staticmethod
    def _phrase_in_text(text: str, phrase: str) -> bool:
        """Word-boundary, case- and whitespace-insensitive phrase search."""
        normalized_text = " ".join((text or "").lower().split())
        normalized_phrase = " ".join((phrase or "").lower().split())
        if not normalized_phrase:
            return False
        # Escape the phrase, then let any run of whitespace match between words.
        pattern = r"\b" + re.escape(normalized_phrase).replace(r"\ ", r"\s+") + r"\b"
        return re.search(pattern, normalized_text) is not None

    @staticmethod
    def _contains_any(text: str, phrases: list[str]) -> bool:
        """True if any phrase in *phrases* appears in *text*."""
        return any(NLPService._phrase_in_text(text, phrase) for phrase in phrases)

    @classmethod
    def _looks_educational(cls, normalized_text: str) -> bool:
        """Heuristic: the message asks a general question about a condition."""
        if cls._contains_any(normalized_text, INTENT_PHRASES["educational"]):
            return True
        # Question words without first-person markers read as educational.
        if cls._contains_any(normalized_text, EDUCATIONAL_QUERY_MARKERS) and not cls._contains_any(
            normalized_text, FIRST_PERSON_MARKERS
        ):
            return True
        if "?" in normalized_text and cls._contains_any(normalized_text, EDUCATIONAL_QUERY_MARKERS):
            return True
        return False

    @classmethod
    def _looks_assessment(cls, normalized_text: str) -> bool:
        """Heuristic: the user describes their own symptoms/condition."""
        has_first_person = cls._contains_any(normalized_text, FIRST_PERSON_MARKERS)
        has_assessment_phrase = cls._contains_any(normalized_text, INTENT_PHRASES["assessment"])
        has_symptom = cls._contains_any(normalized_text, SYMPTOM_MARKERS)
        return has_assessment_phrase or (has_first_person and has_symptom)

    @classmethod
    def _looks_emergency(cls, normalized_text: str, label_scores: dict[str, float]) -> bool:
        """Heuristic emergency check; errs toward not flagging educational text."""
        if cls._contains_any(normalized_text, EMERGENCY_STRONG_PHRASES):
            return True
        # A question *about* emergencies is educational, not an emergency.
        if cls._looks_educational(normalized_text):
            return False
        has_severe_sign = cls._contains_any(normalized_text, EMERGENCY_SEVERE_SIGNS)
        has_urgency_cue = cls._contains_any(normalized_text, EMERGENCY_URGENCY_CUES)
        if has_severe_sign and has_urgency_cue:
            return True
        # Severe sign alone needs strong embedding support (0.78 threshold).
        return has_severe_sign and label_scores["emergency"] >= 0.78

    def classify_intent(self, text: str) -> dict:
        """Classify *text* into greeting/emergency/educational/assessment.

        Returns a dict with ``intent``, ``confidence`` and
        ``needs_llm_tiebreaker`` (True when the heuristics and embedding
        scores are too close to call and an LLM should arbitrate).
        """
        label_scores = self._label_scores(text, self._intent_embeddings)
        normalized_text = (text or "").lower().strip()
        sorted_scores = sorted(label_scores.values(), reverse=True)
        # Margin between top-2 scores; degenerate single-label case keeps top score.
        best_margin = sorted_scores[0] - sorted_scores[1] if len(sorted_scores) > 1 else sorted_scores[0]
        # Cascade: greeting > emergency > unambiguous hints > hinted tie > embeddings.
        if self._has_phrase(normalized_text, INTENT_PHRASES["greeting"]):
            confidence = max(label_scores["greeting"], 0.90)
            return {
                "intent": "greeting",
                "confidence": confidence,
                "needs_llm_tiebreaker": False,
            }
        if self._looks_emergency(normalized_text, label_scores):
            confidence = max(label_scores["emergency"], 0.86)
            return {
                "intent": "emergency",
                "confidence": confidence,
                "needs_llm_tiebreaker": False,
            }
        educational_hint = self._looks_educational(normalized_text)
        assessment_hint = self._looks_assessment(normalized_text)
        if educational_hint and not assessment_hint:
            confidence = max(label_scores["educational"], 0.85)
            return {
                "intent": "educational",
                "confidence": confidence,
                "needs_llm_tiebreaker": False,
            }
        if assessment_hint and not educational_hint:
            confidence = max(label_scores["assessment"], 0.85)
            return {
                "intent": "assessment",
                "confidence": confidence,
                "needs_llm_tiebreaker": False,
            }
        if educational_hint and assessment_hint:
            # Both hints fired: prefer assessment only with a clear 0.04 lead.
            if label_scores["assessment"] >= label_scores["educational"] + 0.04:
                intent = "assessment"
                confidence = max(label_scores["assessment"], 0.80)
            else:
                intent = "educational"
                confidence = max(label_scores["educational"], 0.80)
            return {
                "intent": intent,
                "confidence": confidence,
                "needs_llm_tiebreaker": best_margin < 0.06,
            }
        # No heuristic fired: fall back to the best embedding score.
        intent, confidence = max(label_scores.items(), key=lambda item: item[1])
        return {
            "intent": intent,
            "confidence": confidence,
            "needs_llm_tiebreaker": (0.30 <= confidence <= 0.65) or best_margin < 0.05,
        }

    def detect_disease(self, text: str) -> dict:
        """Return ``{"disease_id", "confidence"}`` for *text*.

        Scores below 0.30 are treated as no match and reported as the
        "general" topic with zero confidence.
        """
        disease_id, confidence = self._best_match(text, self._disease_embeddings)
        if confidence < 0.30:
            return {"disease_id": "general", "confidence": 0.0}
        return {"disease_id": disease_id, "confidence": confidence}

    def process(self, text: str) -> dict:
        """Run intent + disease routing and return one flat result dict."""
        intent_result = self.classify_intent(text)
        disease_result = self.detect_disease(text)
        return {
            "intent": intent_result["intent"],
            "intent_confidence": intent_result["confidence"],
            "disease_id": disease_result["disease_id"],
            "disease_confidence": disease_result["confidence"],
            "needs_llm_tiebreaker": intent_result["needs_llm_tiebreaker"],
        }

Xet Storage Details

Size:
11.7 kB
·
Xet hash:
30a48b62b57a1cd5017197e40431ea2661ec0a5e6166b58e6b651ce00acfd299

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.