SmartHeal committed
Commit 83e490e · verified · 1 Parent(s): f85c4fc

Update src/ai_processor.py

Files changed (1)
  1. src/ai_processor.py +98 -33
src/ai_processor.py CHANGED
@@ -140,50 +140,101 @@ Keep to 220–300 words. Do NOT provide diagnosis. Avoid contraindicated advice.

# ---------- MedGemma-only text generator ----------
@_SPACES_GPU(enable_queue=True)
- def _medgemma_generate_gpu(prompt: str, model_id: str, max_new_tokens: int, token: Optional[str]):
+ def _medgemma_generate_gpu_with_pipeline(
+     prompt: str,
+     image_pil,                      # PIL.Image (the wound image)
+     model_id: str | None = None,    # e.g. "unsloth/medgemma-4b-it-bnb-4bit"
+     max_new_tokens: int = 256,
+     token: str | None = None,
+ ) -> str:
    """
-     Runs entirely inside a Spaces GPU worker. Uses Med-Gemma (text-only) to draft the report.
+     Vision LLM via Transformers pipeline using the "messages" format:
+     [{"role": "user", "content": [{"type": "image", "image": PIL}, {"type": "text", "text": "..."}]}]
+     Returns a generated string.
    """
-     import torch
+     import os, torch
    from transformers import pipeline
-
-     pipe = pipeline(
-         "image-text-to-text",
-         model="unsloth/medgemma-4b-it-unsloth-bnb-4bit",
-         torch_dtype=torch.bfloat16,
-         device="cuda",
+     try:
+         from transformers import BitsAndBytesConfig  # only needed for 4-bit
+     except Exception:
+         BitsAndBytesConfig = None
+
+     hf_token = token or os.getenv("HF_TOKEN")
+     mid = model_id or "unsloth/medgemma-4b-it-bnb-4bit"
+
+     # device / dtype
+     use_cuda = torch.cuda.is_available()
+     device = 0 if use_cuda else -1
+     dtype = torch.bfloat16 if use_cuda else torch.float32
+
+     # Build messages in the doc format
+     messages = [{
+         "role": "user",
+         "content": [
+             {"type": "image", "image": image_pil},  # local PIL image
+             {"type": "text", "text": prompt},
+         ],
+     }]
+
+     pipe_kwargs = dict(
+         task="image-text-to-text",
+         model=mid,
+         torch_dtype=dtype,
+         device=device,              # GPU=0 or CPU=-1
+         trust_remote_code=True,
    )
-     out = pipe(
-         prompt,
-         max_new_tokens=max_new_tokens,
+
+     # Pass HF token (newer Transformers uses `token`; older uses `use_auth_token`)
+     if hf_token:
+         try:
+             pipe_kwargs["token"] = hf_token
+         except TypeError:
+             pipe_kwargs["use_auth_token"] = hf_token
+
+     # If this is the 4-bit Unsloth build, attach quantization (requires CUDA + bitsandbytes)
+     if "bnb-4bit" in mid.lower():
+         if not use_cuda or BitsAndBytesConfig is None:
+             raise RuntimeError("Unsloth 4-bit requires CUDA + bitsandbytes; no GPU available.")
+         bnb = BitsAndBytesConfig(
+             load_in_4bit=True,
+             bnb_4bit_quant_type="nf4",
+             bnb_4bit_use_double_quant=True,
+             bnb_4bit_compute_dtype=torch.bfloat16,
+         )
+         pipe_kwargs["model_kwargs"] = {"quantization_config": bnb}
+
+     # Create pipeline and run with messages
+     p = pipeline(**pipe_kwargs)
+     out = p(
+         text=messages,
+         max_new_tokens=int(max_new_tokens or 256),
        do_sample=False,
        temperature=0.2,
-         return_full_text=True,
+         return_full_text=False,  # we just want the answer
    )
-     text = (out[0].get("generated_text") if isinstance(out, list) else out).strip()
-     # Remove the prompt echo if present
-     if text.startswith(prompt):
-         text = text[len(prompt):].lstrip()
-     return text or "⚠️ Empty response"

+     # Normalize output to a string
+     if isinstance(out, list):
+         # pipelines often return a list of strings or dicts; handle both
+         first = out[0]
+         text = first.get("generated_text") if isinstance(first, dict) else str(first)
+     else:
+         text = str(out)
+
+     return (text or "").strip() or "⚠️ Empty response"
+
+
- def generate_medgemma_report(  # kept name so callers don't change
+ def generate_medgemma_report(
    patient_info: str,
    visual_results: Dict,
    guideline_context: str,
-     image_pil: Image.Image,         # kept for signature compatibility; not used by MedGemma
-     max_new_tokens: Optional[int] = None,
+     image_pil,                      # keep passing the PIL image
+     max_new_tokens: int | None = None,
) -> str:
-     """
-     MedGemma (text-only) report generation.
-     The image is analyzed by the vision pipeline; MedGemma formats clinical guidance text.
-     """
    if os.getenv("SMARTHEAL_ENABLE_VLM", "1") != "1":
        return "⚠️ VLM disabled"

-     # Default to a public Med-Gemma instruction-tuned model (update via env if you have access to another).
-     model_id = os.getenv("SMARTHEAL_MEDGEMMA_MODEL", "google/med-gemma-2-2b-it")
-     max_new_tokens = max_new_tokens or int(os.getenv("SMARTHEAL_VLM_MAX_TOKENS", "600"))
-
+     # Build your prompt as before
    uprompt = SMARTHEAL_USER_PREFIX.format(
        patient_info=patient_info,
        wound_type=visual_results.get("wound_type", "Unknown"),
@@ -194,16 +245,30 @@ def generate_medgemma_report(  # kept name so callers don't change
        px_per_cm=visual_results.get("px_per_cm", "?"),
        guideline_context=(guideline_context or "")[:900],
    )
-
-     # Compose a single text prompt
    prompt = f"{SMARTHEAL_SYSTEM_PROMPT}\n\n{uprompt}\n\nAnswer:"

+     model_id = os.getenv("SMARTHEAL_MEDGEMMA_MODEL", "unsloth/medgemma-4b-it-bnb-4bit")
+     max_new_tokens = max_new_tokens or int(os.getenv("SMARTHEAL_VLM_MAX_TOKENS", "600"))
+
    try:
-         return _medgemma_generate_gpu(prompt, model_id, max_new_tokens, HF_TOKEN)
+         return _medgemma_generate_gpu_with_pipeline(prompt, image_pil, model_id, max_new_tokens, HF_TOKEN)
    except Exception as e:
-         logging.error(f"MedGemma call failed: {e}")
+         # Optional: automatic tiny fallback if CUDA/bnb/space issues show up
+         err = str(e)
+         if any(s in err for s in ("No space left", "bitsandbytes", "CUDA", "requires CUDA")):
+             try:
+                 return _medgemma_generate_gpu_with_pipeline(
+                     prompt, image_pil,
+                     model_id="bczhou/tiny-llava-v1-hf",  # ~1GB; CPU OK
+                     max_new_tokens=max_new_tokens,
+                     token=HF_TOKEN,
+                 )
+             except Exception:
+                 pass
+         logging.error(f"MedGemma pipeline failed: {e}", exc_info=True)
        return "⚠️ VLM error"

+
# ---------- Input-shape helpers (avoid `.as_list()` on strings) ----------
def _shape_to_hw(shape) -> Tuple[Optional[int], Optional[int]]:
    try:
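
For reference, a minimal caller sketch for the updated generate_medgemma_report (a sketch only: the patient details, image path, and dictionary values below are hypothetical; the signature, environment variables, and visual_results keys are taken from the diff above):

# Hypothetical caller; assumes the SmartHeal module and HF_TOKEN are already set up.
import os
from PIL import Image

os.environ.setdefault("SMARTHEAL_ENABLE_VLM", "1")
os.environ.setdefault("SMARTHEAL_MEDGEMMA_MODEL", "unsloth/medgemma-4b-it-bnb-4bit")
os.environ.setdefault("SMARTHEAL_VLM_MAX_TOKENS", "600")

wound_image = Image.open("wound.jpg")            # hypothetical local image
visual_results = {                               # subset of the keys the prompt template reads
    "wound_type": "Pressure ulcer",
    "px_per_cm": 35.2,
}

report = generate_medgemma_report(
    patient_info="68-year-old, diabetic, sacral wound",   # hypothetical
    visual_results=visual_results,
    guideline_context="(retrieved guideline excerpt)",
    image_pil=wound_image,
)
print(report)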
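
Similarly, a rough smoke test of the CPU fallback path in the new except branch (a sketch assuming no GPU is available: the fallback model id "bczhou/tiny-llava-v1-hf" comes from the diff; the prompt and placeholder image are made up):

# Exercise the tiny fallback model directly on CPU (hypothetical test code).
import os
from PIL import Image

placeholder = Image.new("RGB", (224, 224), "gray")   # stand-in image for the test
answer = _medgemma_generate_gpu_with_pipeline(
    prompt="Describe the visible wound characteristics.",
    image_pil=placeholder,
    model_id="bczhou/tiny-llava-v1-hf",
    max_new_tokens=64,
    token=os.getenv("HF_TOKEN"),
)
print(answer)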