SmartHeal committed
Commit bc7b1d8 · verified · 1 parent: a4ec7ff

Update src/ai_processor.py

Files changed (1)
  1. src/ai_processor.py +119 -149
src/ai_processor.py CHANGED
@@ -7,8 +7,9 @@ import logging
 from datetime import datetime
 from typing import Optional, Dict, List, Tuple
 
-# ---- Environment defaults (do NOT globally hint CUDA here) ----
+# ---- Environment defaults (mask CUDA in main process) ----
 os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
+os.environ.setdefault("CUDA_VISIBLE_DEVICES", "")  # ensure main never touches CUDA
 LOGLEVEL = os.getenv("LOGLEVEL", "INFO").upper()
 SMARTHEAL_DEBUG = os.getenv("SMARTHEAL_DEBUG", "0") == "1"
 
@@ -26,14 +27,9 @@ logging.basicConfig(
 def _log_kv(prefix: str, kv: Dict):
     logging.debug(prefix + " | " + " | ".join(f"{k}={v}" for k, v in kv.items()))
 
-# --- Spaces GPU decorator (REQUIRED) ---
-from spaces import GPU as _SPACES_GPU
+# --- Spaces GPU (non-optional) ---
+import spaces  # required; do not stub/optionalize
 
-@_SPACES_GPU(enable_queue=True)  # enable_queue ignored by ZeroGPU but explicit is fine
-def smartheal_gpu_stub(ping: int = 0) -> str:
-    return "ready"
-
-# ---- Paths / constants ----
 UPLOADS_DIR = "uploads"
 os.makedirs(UPLOADS_DIR, exist_ok=True)
 
@@ -53,37 +49,15 @@ SEG_THRESH = float(os.getenv("SEG_THRESH", "0.5"))
 models_cache: Dict[str, object] = {}
 knowledge_base_cache: Dict[str, object] = {}
 
-# ---------- Utilities to prevent CUDA in main process ----------
-from contextlib import contextmanager
-
-@contextmanager
-def _no_cuda_env():
-    """
-    Mask GPUs so any library imported/constructed in the main process
-    cannot see CUDA (required for Spaces Stateless GPU).
-    """
-    prev = os.environ.get("CUDA_VISIBLE_DEVICES")
-    os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
-    try:
-        yield
-    finally:
-        if prev is None:
-            os.environ.pop("CUDA_VISIBLE_DEVICES", None)
-        else:
-            os.environ["CUDA_VISIBLE_DEVICES"] = prev
-
-# ---------- Lazy imports (wrapped where needed) ----------
+# ---------- Lazy imports ----------
 def _import_ultralytics():
-    # Prevent Ultralytics from probing CUDA on import
-    with _no_cuda_env():
-        from ultralytics import YOLO
+    from ultralytics import YOLO
     return YOLO
 
 def _import_tf_loader():
     import tensorflow as tf
     try:
-        # Keep TF on CPU only
-        tf.config.set_visible_devices([], "GPU")
+        tf.config.set_visible_devices([], "GPU")  # keep TF on CPU
     except Exception:
         pass
     from tensorflow.keras.models import load_model
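Note on the hunks above: the commit drops the `_no_cuda_env` masking helper and the module-level GPU stub because, on ZeroGPU Spaces, the main process is expected to stay CPU-only and CUDA is touched only inside functions decorated with `spaces.GPU`. A minimal sketch of that pattern (illustrative only, not part of this diff; `gpu_sum` is a made-up example):

import spaces
import torch

@spaces.GPU  # a GPU is attached only while this function runs
def gpu_sum(values):
    # Safe to use CUDA here; never at module import time in the main process.
    return torch.tensor(values, device="cuda").sum().item()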
@@ -94,11 +68,8 @@ def _import_hf_cls():
     return pipeline
 
 def _import_embeddings():
-    # Prefer the new package if available, fallback to community to avoid deprecation warnings
-    try:
-        from langchain_huggingface import HuggingFaceEmbeddings  # type: ignore
-    except Exception:
-        from langchain_community.embeddings import HuggingFaceEmbeddings  # type: ignore
+    # updated per LangChain deprecations
+    from langchain_huggingface import HuggingFaceEmbeddings
     return HuggingFaceEmbeddings
 
 def _import_langchain_pdf():
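The `_import_embeddings` change above switches to the `langchain-huggingface` partner package, which replaces the deprecated `langchain_community` import. A minimal usage sketch (the model name is illustrative, assuming the package is installed):

from langchain_huggingface import HuggingFaceEmbeddings

emb = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vector = emb.embed_query("dressing selection for a heavily exuding wound")
print(len(vector))  # embedding dimensionality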
@@ -113,85 +84,107 @@ def _import_hf_hub():
     from huggingface_hub import HfApi, HfFolder
     return HfApi, HfFolder
 
-# ---------- SmartHeal prompts (system + user prefix) ----------
-SMARTHEAL_SYSTEM_PROMPT = """\
-You are SmartHeal Clinical Assistant, a wound-care decision-support system.
-You analyze wound photographs and brief patient context to produce careful,
-specific, guideline-informed recommendations WITHOUT diagnosing. You always:
-- Use the measurements calculated by the vision pipeline as ground truth.
-- Prefer concise, actionable steps tailored to exudate level, infection risk, and pain.
-- Flag uncertainties and red flags that need escalation to a clinician.
-- Avoid contraindicated advice; do not infer unseen comorbidities.
-- Keep under 300 words and use the requested headings exactly.
-- Tone: professional, clear, and conservative; no definitive medical claims.
-- Safety: remind the user to seek clinician review for changes or red flags.
-"""
-
-SMARTHEAL_USER_PREFIX = """\
-Patient: {patient_info}
-Visual findings: type={wound_type}, size={length_cm}x{breadth_cm} cm, area={area_cm2} cm^2,
-detection_conf={det_conf:.2f}, calibration={px_per_cm} px/cm.
-
-Guideline context (snippets you can draw principles from; do not quote at length):
-{guideline_context}
+# ---------- VLM (MedGemma replacement under the same public function name) ----------
+SMARTHEAL_VLM_ID = os.getenv("SMARTHEAL_VLM_ID", "Qwen/Qwen2-VL-2B-Instruct")
+SMARTHEAL_VLM_MAX_NEW_TOKENS = int(os.getenv("SMARTHEAL_VLM_MAX_NEW_TOKENS", "600"))
+SMARTHEAL_VLM_TEMPERATURE = float(os.getenv("SMARTHEAL_VLM_TEMPERATURE", "0.2"))
+
+SMARTHEAL_SYSTEM_PROMPT = """You are SmartHeal, a medical decision-support assistant specialized in wound assessment.
+You are given: (1) a wound photograph, (2) basic patient context, and (3) visual measurements (length, width, area)
+estimated from computer vision. You must:
+
+- Summarize clinically-relevant visual cues (tissue type, exudate amount, slough/necrosis, peri-wound condition).
+- Interpret in context of diabetes/infection/moisture/bleeding risks.
+- Provide clear next-step care: cleansing, debridement criteria, dressing selection, offloading, escalation triggers.
+- Include risk flags (ischemia, cellulitis, osteomyelitis suspicion) and monitoring frequency.
+- Be concise, structured, and avoid speculation beyond the image and given data.
+- Always add a short disclaimer: “Decision-support only; verify clinically.”"""
+
+def _build_vlm_messages(patient_info: str, visual_results: Dict, guideline_context: str) -> list:
+    wt = visual_results.get("wound_type", "Unknown")
+    L = visual_results.get("length_cm", 0)
+    W = visual_results.get("breadth_cm", 0)
+    A = visual_results.get("surface_area_cm2", 0)
+    ppcm = visual_results.get("px_per_cm", "?")
+
+    ctx = (guideline_context or "")
+    if ctx:
+        ctx = f"\n\nRelevant guideline snippets:\n{ctx[:1200]}{'...' if len(ctx)>1200 else ''}"
+
+    text = (
+        f"{SMARTHEAL_SYSTEM_PROMPT}\n\n"
+        f"Patient: {patient_info}\n"
+        f"Wound visual summary (from CV): type={wt}, length={L} cm, width={W} cm, "
+        f"area={A} cm² (calibration {ppcm} px/cm)."
+        f"{ctx}\n\n"
+        "Analyze the image and provide:\n"
+        "1) Clinical Summary\n2) Dressing & Treatment Plan\n"
+        "3) Risk/Red Flags\n4) Monitoring Plan\n"
+        "Format with short headings and bullets.\n"
+    )
+    return [{"role": "user", "content": [{"type": "text", "text": text}]}]
 
-Write a structured answer with these headings exactly:
-1. Clinical Summary (max 4 bullet points)
-2. Likely Stage/Type (if uncertain, say 'uncertain')
-3. Treatment Plan (specific dressing choices and frequency based on exudate/infection risk)
-4. Red Flags (what to escalate and when)
-5. Follow-up Cadence (days)
-6. Notes (assumptions/uncertainties)
+@spaces.GPU  # non-optional: ensure CUDA work happens only inside the ZeroGPU worker
+def _vlm_infer_gpu(
+    image_pil: Image.Image,
+    messages: list,
+    max_new_tokens: int,
+    temperature: float,
+    model_id: str,
+    token: Optional[str],
+) -> str:
+    import torch
+    from transformers import AutoProcessor, AutoModelForCausalLM
 
-Keep to 220–300 words. Do NOT provide diagnosis. Avoid contraindicated advice.
-"""
+    torch.backends.cuda.matmul.allow_tf32 = True
+    device = "cuda"
 
-# ---------- VLM (MedGemma replaced with Qwen2-VL) ----------
-@_SPACES_GPU(enable_queue=True)
-def _vlm_infer_gpu(messages, model_id: str, max_new_tokens: int, token: Optional[str]):
-    """
-    Runs entirely inside a Spaces GPU worker. It's the ONLY place we allow CUDA init.
-    """
-    import torch
-    if not torch.cuda.is_available():
-        raise RuntimeError("CUDA not available in worker (check ZeroGPU torch version).")
-    from transformers import pipeline
-    pipe = pipeline(
-        task="image-text-to-text",
-        model=model_id,
-        device_map={"": 0},  # be explicit: put everything on cuda:0
-        token=token,
+    processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True, use_fast=True, token=token)
+    model = AutoModelForCausalLM.from_pretrained(
+        model_id,
+        torch_dtype=torch.float16,
         trust_remote_code=True,
-        model_kwargs={"low_cpu_mem_usage": True},
+        token=token,
+    ).to(device)
+
+    inputs = processor(messages=messages, images=[image_pil], return_tensors="pt").to(device)
+    gen_ids = model.generate(
+        **inputs,
+        max_new_tokens=max_new_tokens,
+        do_sample=False,
+        temperature=temperature,
     )
-    out = pipe(text=messages, max_new_tokens=max_new_tokens, do_sample=False, temperature=0.2)
-    try:
-        txt = out[0]["generated_text"][-1].get("content", "")
-    except Exception:
-        txt = out[0].get("generated_text", "")
-    return (txt or "").strip() or "⚠️ Empty response"
+    out = processor.batch_decode(gen_ids, skip_special_tokens=True)[0]
+    return out.strip()
 
-def _vlm_infer_cpu(messages, model_id: str, max_new_tokens: int, token: Optional[str]) -> str:
-    """
-    CPU fallback path when ZeroGPU grant fails or CUDA wheel is unavailable.
-    """
+def _vlm_infer_cpu(
+    image_pil: Image.Image,
+    messages: list,
+    max_new_tokens: int,
+    temperature: float,
+    model_id: str,
+    token: Optional[str],
+) -> str:
     from transformers import pipeline
     pipe = pipeline(
         task="image-text-to-text",
         model=model_id,
-        device_map="cpu",
-        token=token,
+        device="cpu",
         trust_remote_code=True,
-        model_kwargs={"low_cpu_mem_usage": True},
+        token=token,
+    )
+    out = pipe(
+        text=[{"role": "user", "content": [{"type": "image", "image": image_pil}, *messages[0]["content"]]}],
+        max_new_tokens=max_new_tokens,
+        do_sample=False,
+        temperature=temperature,
     )
-    out = pipe(text=messages, max_new_tokens=max_new_tokens, do_sample=False, temperature=0.2)
     try:
-        txt = out[0]["generated_text"][-1].get("content", "")
+        return (out[0]["generated_text"][-1].get("content", "") or "").strip()
     except Exception:
-        txt = out[0].get("generated_text", "")
-    return (txt or "").strip() or "⚠️ Empty response"
+        return (out[0].get("generated_text", "") or "").strip()
 
-def generate_medgemma_report(  # kept name so callers don't change
+def generate_medgemma_report(  # <-- keep the original PUBLIC NAME
     patient_info: str,
     visual_results: Dict,
     guideline_context: str,
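For reference, the new `_vlm_infer_gpu` above loads the model with `AutoModelForCausalLM` and passes `messages=` straight to the processor; the calling convention documented for Qwen2-VL in Transformers instead uses `Qwen2VLForConditionalGeneration` and `apply_chat_template`. A rough sketch of that documented pattern (not this repo's code; the image path is a placeholder):

import torch
from PIL import Image
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

model_id = "Qwen/Qwen2-VL-2B-Instruct"
processor = AutoProcessor.from_pretrained(model_id)
model = Qwen2VLForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")

messages = [{"role": "user", "content": [{"type": "image"},
                                         {"type": "text", "text": "Describe the wound."}]}]
prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(text=[prompt], images=[Image.open("wound.jpg")], return_tensors="pt").to("cuda")
ids = model.generate(**inputs, max_new_tokens=256, do_sample=False)
print(processor.batch_decode(ids[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True)[0])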
@@ -199,53 +192,34 @@ def generate_medgemma_report(  # kept name so callers don't change
     max_new_tokens: Optional[int] = None,
 ) -> str:
     """
-    MedGemma replacement using Qwen/Qwen2-VL-2B-Instruct via image-text-to-text.
-    Loads & runs ONLY inside a GPU worker to satisfy Stateless GPU constraints.
-    Falls back to CPU pipeline if a GPU grant/initialization fails.
+    Re-implemented to use Qwen/Qwen2-VL-* via ZeroGPU (@spaces.GPU) with CPU fallback.
+    Name preserved for compatibility with existing callers.
     """
-    if os.getenv("SMARTHEAL_ENABLE_VLM", "1") != "1":
-        return "⚠️ VLM disabled"
-
-    model_id = os.getenv("SMARTHEAL_VLM_MODEL", "Qwen/Qwen2-VL-2B-Instruct")
-    max_new_tokens = max_new_tokens or int(os.getenv("SMARTHEAL_VLM_MAX_TOKENS", "600"))
-
-    uprompt = SMARTHEAL_USER_PREFIX.format(
-        patient_info=patient_info,
-        wound_type=visual_results.get("wound_type", "Unknown"),
-        length_cm=visual_results.get("length_cm", 0),
-        breadth_cm=visual_results.get("breadth_cm", 0),
-        area_cm2=visual_results.get("surface_area_cm2", 0),
-        det_conf=float(visual_results.get("detection_confidence", 0.0)),
-        px_per_cm=visual_results.get("px_per_cm", "?"),
-        guideline_context=(guideline_context or "")[:900],
-    )
-
-    messages = [
-        {"role": "system", "content": [{"type": "text", "text": SMARTHEAL_SYSTEM_PROMPT}]},
-        {"role": "user", "content": [
-            {"type": "image", "image": image_pil},
-            {"type": "text", "text": uprompt},
-        ]},
-    ]
-
-    # Try GPU worker first, then CPU fallback
+    msgs = _build_vlm_messages(patient_info, visual_results, guideline_context)
     try:
-        return _vlm_infer_gpu(messages, model_id, max_new_tokens, HF_TOKEN)
+        return _vlm_infer_gpu(
+            image_pil=image_pil,
+            messages=msgs,
+            max_new_tokens=max_new_tokens or SMARTHEAL_VLM_MAX_NEW_TOKENS,
+            temperature=SMARTHEAL_VLM_TEMPERATURE,
+            model_id=SMARTHEAL_VLM_ID,
+            token=HF_TOKEN,
+        )
     except Exception as e:
-        logging.warning(f"GPU VLM failed; falling back to CPU: {e}")
-        try:
-            return _vlm_infer_cpu(messages, model_id, max_new_tokens, HF_TOKEN)
-        except Exception as e2:
-            logging.error(f"CPU VLM also failed: {e2}")
-            return "⚠️ VLM error"
+        logging.warning(f"GPU VLM failed; falling back to CPU: {e!r}")
+        return _vlm_infer_cpu(
+            image_pil=image_pil,
+            messages=msgs,
+            max_new_tokens=max_new_tokens or SMARTHEAL_VLM_MAX_NEW_TOKENS,
+            temperature=SMARTHEAL_VLM_TEMPERATURE,
+            model_id=SMARTHEAL_VLM_ID,
+            token=HF_TOKEN,
+        ) or "⚠️ VLM returned empty output"
 
 # ---------- Initialize CPU models ----------
 def load_yolo_model():
     YOLO = _import_ultralytics()
-    # Construct model with CUDA masked to avoid auto-selecting cuda:0
-    with _no_cuda_env():
-        model = YOLO(YOLO_MODEL_PATH)
-    return model
+    return YOLO(YOLO_MODEL_PATH)
 
 def load_segmentation_model():
     load_model = _import_tf_loader()
@@ -287,7 +261,6 @@ def initialize_cpu_models() -> None:
             models_cache["seg"] = None
             logging.warning("Segmentation model file missing; skipping.")
     except Exception as e:
-        # Typical with Keras/TF version mismatch; pin TF/Keras 2.15 in requirements.
         models_cache["seg"] = None
         logging.warning(f"Segmentation unavailable: {e}")
 
@@ -452,7 +425,6 @@ def _grabcut_refine(bgr: np.ndarray, seed01: np.ndarray, iters: int = 3) -> np.n
     seed_dil = cv2.dilate(seed01, k, iterations=1)
     gc[seed01.astype(bool)] = cv2.GC_PR_FGD
     gc[seed_dil.astype(bool)] = cv2.GC_FGD
-    # force borders to background
     gc[0, :], gc[-1, :], gc[:, 0], gc[:, -1] = cv2.GC_BGD, cv2.GC_BGD, cv2.GC_BGD, cv2.GC_BGD
     bgdModel = np.zeros((1, 65), np.float64)
     fgdModel = np.zeros((1, 65), np.float64)
@@ -485,9 +457,7 @@ def _clean_mask(mask01: np.ndarray) -> np.ndarray:
         mask01 = (labels == largest_idx).astype(np.uint8)
     return (mask01 > 0).astype(np.uint8)
 
-# Global last debug dict (per-process)
-_last_seg_debug: Dict[str, object] = {}
-
+# ---------- Segmentation pipeline ----------
 def segment_wound(image_bgr: np.ndarray, ts: str, out_dir: str) -> Tuple[np.ndarray, Dict[str, object]]:
     """
     TF model → adaptive threshold on prob → GrabCut grow → cleanup.
@@ -710,7 +680,6 @@ class AIProcessor:
         det_model = self.models_cache.get("det")
         if det_model is None:
             raise RuntimeError("YOLO model not loaded")
-        # Force CPU inference and avoid CUDA touch
        results = det_model.predict(image_cv, verbose=False, device="cpu")
         if (not results) or (not getattr(results[0], "boxes", None)) or (len(results[0].boxes) == 0):
             try:
@@ -856,7 +825,7 @@
         if not vs:
             return "Knowledge base is not available."
         retriever = vs.as_retriever(search_kwargs={"k": 5})
-        # Modern API (avoid get_relevant_documents deprecation)
+        # LangChain deprecation fix: use invoke()
         docs = retriever.invoke(query)
         lines: List[str] = []
         for d in docs:
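The comment change above reflects that LangChain retrievers are now Runnables: `retriever.invoke(query)` returns a list of `Document` objects, replacing the deprecated `get_relevant_documents`. A small sketch, assuming `vs` is an already-built vector store:

retriever = vs.as_retriever(search_kwargs={"k": 5})
docs = retriever.invoke("negative pressure wound therapy indications")
for d in docs:
    print(d.metadata.get("source"), d.page_content[:120])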
@@ -914,6 +883,7 @@ Automated analysis provides quantitative measurements; verify via clinical exami
         max_new_tokens: Optional[int] = None,
     ) -> str:
         try:
+            # call the preserved function name (now backed by Qwen2-VL)
             report = generate_medgemma_report(
                 patient_info, visual_results, guideline_context, image_pil, max_new_tokens
             )
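Since the public name `generate_medgemma_report` is preserved, existing callers keep working. A hypothetical call for illustration (values and file path are made up; only the argument order comes from this diff):

from PIL import Image

visual_results = {
    "wound_type": "pressure ulcer",
    "length_cm": 3.2,
    "breadth_cm": 2.1,
    "surface_area_cm2": 5.4,
    "px_per_cm": 38.5,
}
report = generate_medgemma_report(
    "68-year-old with diabetes, sacral wound",   # patient_info
    visual_results,
    "(retrieved guideline snippets)",            # guideline_context
    Image.open("uploads/sample.jpg"),            # image_pil
    600,                                         # max_new_tokens
)
print(report)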