Update src/ai_processor.py

src/ai_processor.py  CHANGED  (+42 -8)

@@ -29,7 +29,7 @@ def _log_kv(prefix: str, kv: Dict):
 # --- Spaces GPU decorator (REQUIRED) ---
 from spaces import GPU as _SPACES_GPU
 
-@_SPACES_GPU(enable_queue=True)
+@_SPACES_GPU(enable_queue=True)  # enable_queue ignored by ZeroGPU but explicit is fine
 def smartheal_gpu_stub(ping: int = 0) -> str:
     return "ready"
 
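
Note (not part of the diff): the stub above follows the standard ZeroGPU pattern, in which any function that touches CUDA must run inside a spaces.GPU-decorated worker. A minimal sketch, with the worker function and the duration value as illustrative assumptions:

import spaces  # preinstalled on ZeroGPU Spaces

@spaces.GPU(duration=60)  # optional hint: expected runtime in seconds (assumed value)
def run_on_gpu(prompt: str) -> str:
    import torch  # CUDA may only be initialized inside the GPU worker
    device = "cuda" if torch.cuda.is_available() else "cpu"
    return f"{prompt!r} handled on {device}"
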
@@ -94,7 +94,11 @@ def _import_hf_cls():
     return pipeline
 
 def _import_embeddings():
-
+    # Prefer the new package if available; fall back to community to avoid deprecation warnings
+    try:
+        from langchain_huggingface import HuggingFaceEmbeddings  # type: ignore
+    except Exception:
+        from langchain_community.embeddings import HuggingFaceEmbeddings  # type: ignore
     return HuggingFaceEmbeddings
 
 def _import_langchain_pdf():
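
Note (not part of the diff): a typical call site for the import shim above; the embedding model name is only an illustrative assumption:

HuggingFaceEmbeddings = _import_embeddings()
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")  # assumed model
vector = embeddings.embed_query("wound care follow-up instructions")
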
@@ -148,11 +152,34 @@ def _vlm_infer_gpu(messages, model_id: str, max_new_tokens: int, token: Optional[str]) -> str:
     """
     Runs entirely inside a Spaces GPU worker. It's the ONLY place we allow CUDA init.
     """
+    import torch
+    if not torch.cuda.is_available():
+        raise RuntimeError("CUDA not available in worker (check ZeroGPU torch version).")
     from transformers import pipeline
     pipe = pipeline(
         task="image-text-to-text",
         model=model_id,
-        device_map="
+        device_map={"": 0},  # be explicit: put everything on cuda:0
+        token=token,
+        trust_remote_code=True,
+        model_kwargs={"low_cpu_mem_usage": True},
+    )
+    out = pipe(text=messages, max_new_tokens=max_new_tokens, do_sample=False, temperature=0.2)
+    try:
+        txt = out[0]["generated_text"][-1].get("content", "")
+    except Exception:
+        txt = out[0].get("generated_text", "")
+    return (txt or "").strip() or "⚠️ Empty response"
+
+def _vlm_infer_cpu(messages, model_id: str, max_new_tokens: int, token: Optional[str]) -> str:
+    """
+    CPU fallback path when ZeroGPU grant fails or CUDA wheel is unavailable.
+    """
+    from transformers import pipeline
+    pipe = pipeline(
+        task="image-text-to-text",
+        model=model_id,
+        device_map="cpu",
         token=token,
         trust_remote_code=True,
         model_kwargs={"low_cpu_mem_usage": True},
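
Note (not part of the diff): the messages argument consumed by _vlm_infer_gpu / _vlm_infer_cpu is the chat-style payload that transformers image-text-to-text pipelines accept. A rough sketch, with the image reference and prompt as placeholders:

messages = [
    {"role": "user", "content": [
        {"type": "image", "image": "wound_photo.jpg"},  # placeholder; a PIL image or URL also works
        {"type": "text", "text": "Describe the wound and suggest next steps."},
    ]},
]
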
@@ -174,6 +201,7 @@ def generate_medgemma_report(  # kept name so callers don't change
     """
     MedGemma replacement using Qwen/Qwen2-VL-2B-Instruct via image-text-to-text.
     Loads & runs ONLY inside a GPU worker to satisfy Stateless GPU constraints.
+    Falls back to CPU pipeline if a GPU grant/initialization fails.
     """
     if os.getenv("SMARTHEAL_ENABLE_VLM", "1") != "1":
         return "⚠️ VLM disabled"
@@ -200,12 +228,16 @@ def generate_medgemma_report(  # kept name so callers don't change
         ]},
     ]
 
+    # Try GPU worker first, then CPU fallback
     try:
-        # IMPORTANT: do not import transformers or touch CUDA here. Only call the GPU worker.
         return _vlm_infer_gpu(messages, model_id, max_new_tokens, HF_TOKEN)
     except Exception as e:
-        logging.
-
+        logging.warning(f"GPU VLM failed; falling back to CPU: {e}")
+        try:
+            return _vlm_infer_cpu(messages, model_id, max_new_tokens, HF_TOKEN)
+        except Exception as e2:
+            logging.error(f"CPU VLM also failed: {e2}")
+            return "⚠️ VLM error"
 
 # ---------- Initialize CPU models ----------
 def load_yolo_model():
@@ -217,7 +249,7 @@ def load_yolo_model():
 
 def load_segmentation_model():
     load_model = _import_tf_loader()
-    return load_model(SEG_MODEL_PATH, compile=False
+    return load_model(SEG_MODEL_PATH, compile=False)
 
 def load_classification_pipeline():
     pipe = _import_hf_cls()
@@ -255,6 +287,7 @@ def initialize_cpu_models() -> None:
             models_cache["seg"] = None
             logging.warning("Segmentation model file missing; skipping.")
     except Exception as e:
+        # Typical with Keras/TF version mismatch; pin TF/Keras 2.15 in requirements.
         models_cache["seg"] = None
         logging.warning(f"Segmentation unavailable: {e}")
 
@@ -419,7 +452,8 @@ def _grabcut_refine(bgr: np.ndarray, seed01: np.ndarray, iters: int = 3) -> np.ndarray:
     seed_dil = cv2.dilate(seed01, k, iterations=1)
     gc[seed01.astype(bool)] = cv2.GC_PR_FGD
     gc[seed_dil.astype(bool)] = cv2.GC_FGD
-
+    # force borders to background
+    gc[0, :], gc[-1, :], gc[:, 0], gc[:, -1] = cv2.GC_BGD, cv2.GC_BGD, cv2.GC_BGD, cv2.GC_BGD
     bgdModel = np.zeros((1, 65), np.float64)
     fgdModel = np.zeros((1, 65), np.float64)
     cv2.grabCut(bgr, gc, None, bgdModel, fgdModel, iters, cv2.GC_INIT_WITH_MASK)
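
Note (not part of the diff): _grabcut_refine presumably finishes by collapsing the GrabCut labels back into a binary mask (not shown in this hunk); the conventional way to do that is:

mask01 = np.where((gc == cv2.GC_FGD) | (gc == cv2.GC_PR_FGD), 1, 0).astype(np.uint8)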