ColdSlim committed · Commit a2e0d44 · verified · 1 Parent(s): 8ac03b2

Update app.py

Files changed (1): app.py (+106 -108)
app.py CHANGED
@@ -1,10 +1,9 @@
  # app.py
- # Dermatology-AI-Assistant — HF Spaces (ZeroGPU, Qwen2.5-VL multimodal)
- # - GUARANTEES multimodal: loads processor from base with trust_remote_code + use_fast=False
- # - Asserts processor supports images at startup (clear error if deps are wrong)
- # - Tries FT model first; falls back to base model on load/generation issues
- # - Uses qwen-vl-utils for vision inputs
- # - ZeroGPU only during inference; no runtime pip installs
+ # Dermatology-AI-Assistant — HF Spaces (ZeroGPU, Qwen2.5-VL + LoRA adapters)
+ # - Loads base model, then applies LoRA/PEFT adapters from MODEL_ID, merges, and runs multimodal inference
+ # - Uses qwen-vl-utils + AutoProcessor (multimodal) with trust_remote_code, use_fast=False
+ # - Deterministic decoding for stable eval
+ # - ZeroGPU only during inference
  
  import os
  import logging
@@ -14,6 +13,7 @@ import gradio as gr
  import spaces
  import torch
  from PIL import Image
+ from peft import PeftModel  # <-- LoRA/PEFT
  from transformers import AutoProcessor, AutoModelForVision2Seq
  from qwen_vl_utils import process_vision_info
  
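The added `from peft import PeftModel` means the Space's requirements must now list `peft` alongside the stack the old startup error message named (transformers>=4.56.1, qwen-vl-utils>=0.0.10, torch>=2.2.0). A minimal requirements.txt sketch; the peft pin is an assumption, not taken from this commit:

    # requirements.txt (sketch; peft pin is illustrative)
    transformers>=4.56.1
    qwen-vl-utils>=0.0.10
    torch>=2.2.0
    peft>=0.11.0   # assumed; any recent PEFT release with merge_and_unload()
    gradio
    spaces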
@@ -23,65 +23,65 @@ logger = logging.getLogger(__name__)
  # ---------------------------
  # Config
  # ---------------------------
- FT_MODEL_ID = os.environ.get("MODEL_ID", "ColdSlim/Dermatology-Qwen2.5-VL-3B")
+ FT_MODEL_ID = os.environ.get("MODEL_ID", "ColdSlim/Dermatology-Qwen2.5-VL-3B")  # LoRA adapters repo
  BASE_MODEL_ID = os.environ.get("FALLBACK_BASE_MODEL_ID", "Qwen/Qwen2.5-VL-3B-Instruct")
  
  GEN_KW = dict(
-     max_new_tokens=512,
-     do_sample=True,
-     temperature=0.7,
-     top_p=0.9,
+     max_new_tokens=256,
+     do_sample=False,  # deterministic for evaluation
+     temperature=0.0,
+     top_p=1.0,
  )
  
  ZGPU_DURATION = int(os.environ.get("ZGPU_DURATION", "180"))
  
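With do_sample=False, transformers runs greedy decoding and ignores temperature and top_p (recent versions typically log a warning that these flags are unused). A functionally identical sketch without the ignored knobs:

    GEN_KW = dict(
        max_new_tokens=256,
        do_sample=False,  # greedy decoding; temperature/top_p would be ignored anyway
    )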
  # ---------------------------
- # Load MULTIMODAL processor from BASE (NOT FT) and validate it
+ # Processor (try FT first; fall back to base). Must be multimodal.
  # ---------------------------
- logger.info(f"Loading processor from base model (multimodal expected): {BASE_MODEL_ID}")
- processor = AutoProcessor.from_pretrained(
-     BASE_MODEL_ID,
-     trust_remote_code=True,
-     use_fast=False,  # critical: ensure multimodal __call__ supports images/videos
- )
- logger.info(f"Processor class: {processor.__class__.__name__}")
-
- # Validate that processor can handle images
- proc_sig = getattr(processor.__call__, "__signature__", None)
- accepts_images = ("images" in str(proc_sig)) if proc_sig else hasattr(processor, "image_processor")
- if not accepts_images or not hasattr(processor, "image_processor"):
-     raise RuntimeError(
-         "Loaded processor is not multimodal. Ensure requirements include: "
-         "transformers>=4.56.1, qwen-vl-utils>=0.0.10, torch>=2.2.0, and do a Factory reboot."
-     )
-
- # Optional: stabilize tiling/token placeholders
- if hasattr(processor, "image_processor"):
-     try:
-         processor.image_processor.max_pixels = int(os.environ.get("QWEN_MAX_PIXELS", "1500000"))  # ~1.5MP
-         processor.image_processor.min_pixels = int(os.environ.get("QWEN_MIN_PIXELS", "262144"))  # 512x512
-     except Exception:
-         pass
+ def _load_multimodal_processor() -> AutoProcessor:
+     tried = []
+     for mid in (FT_MODEL_ID, BASE_MODEL_ID):
+         try:
+             proc = AutoProcessor.from_pretrained(mid, trust_remote_code=True, use_fast=False)
+             sig = getattr(proc.__call__, "__signature__", None)
+             accepts_images = ("images" in str(sig)) if sig else hasattr(proc, "image_processor")
+             if accepts_images and hasattr(proc, "image_processor"):
+                 logger.info(f"Loaded multimodal processor from: {mid} ({proc.__class__.__name__})")
+                 # optional: stabilize tiling
+                 try:
+                     proc.image_processor.max_pixels = int(os.environ.get("QWEN_MAX_PIXELS", "1500000"))
+                     proc.image_processor.min_pixels = int(os.environ.get("QWEN_MIN_PIXELS", "262144"))
+                 except Exception:
+                     pass
+                 return proc
+             tried.append(f"{mid} => {proc.__class__.__name__} (no images support)")
+         except Exception as e:
+             tried.append(f"{mid} => ERROR: {e}")
+     raise RuntimeError("Failed to load a multimodal processor. Tried:\n" + "\n".join(tried))
+
+ processor = _load_multimodal_processor()
  
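The images-support check is indirect (it inspects __call__'s signature, then falls back to looking for an image_processor attribute). A quick interactive sanity check of the same property, assuming the stock Qwen2.5-VL processor class in transformers:

    from transformers import AutoProcessor

    proc = AutoProcessor.from_pretrained(
        "Qwen/Qwen2.5-VL-3B-Instruct", trust_remote_code=True, use_fast=False
    )
    print(type(proc).__name__)               # expect a multimodal class, e.g. Qwen2_5_VLProcessor
    print(hasattr(proc, "image_processor"))  # True when image inputs are supported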
  # ---------------------------
  # Helpers
  # ---------------------------
+ SYSTEM_PROMPT = (
+     "You are a dermatology assistant. First, look carefully at the IMAGE.\n"
+     "If the image is NOT a close-up of human skin or a dermatologic lesion, "
+     "respond EXACTLY with: 'The image does not appear to show a skin condition; I cannot analyze it.' "
+     "Do not invent findings.\n"
+     "If it IS a skin/lesion photo, provide a concise description, 3–5 likely differentials, "
+     "and prudent next steps (including red flags). Avoid definitive diagnoses."
+ )
+
  def _messages(image: Image.Image, question: str):
      if image.mode != "RGB":
          image = image.convert("RGB")
-     return [{
-         "role": "user",
-         "content": [
-             {"type": "image", "image": image},
-             {"type": "text", "text": question},
-         ],
-     }]
+     return [
+         {"role": "system", "content": [{"type": "text", "text": SYSTEM_PROMPT}]},
+         {"role": "user", "content": [{"type": "image", "image": image}, {"type": "text", "text": question}]},
+     ]
  
  def build_inputs(image: Image.Image, question: str):
-     """
-     Build Qwen2.5-VL multimodal inputs using processor + qwen-vl-utils.
-     Single-sample, no padding (reduces placeholder mask edge cases).
-     """
      messages = _messages(image, question)
      text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
      image_inputs, video_inputs = process_vision_info(messages)
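The hunk ends before build_inputs returns. For reference, the documented qwen-vl-utils pattern continues roughly as below (a sketch, not the committed code; the removed docstring said single-sample with no padding, so padding=True is omitted):

    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        return_tensors="pt",
    )
    return inputs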
@@ -89,98 +89,99 @@ def build_inputs(image: Image.Image, question: str):
  
  def _pad_token_id(model):
      tid = getattr(getattr(processor, "tokenizer", None), "eos_token_id", None)
-     if tid is not None:
-         return tid
-     return getattr(getattr(model, "config", None), "eos_token_id", 0) or 0
+     return tid if tid is not None else (getattr(getattr(model, "config", None), "eos_token_id", 0) or 0)
  
  def _generate_text(model, inputs: dict) -> str:
-     # move tensors to CUDA
      inputs = {k: v.to("cuda") if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}
      with torch.no_grad():
-         out_ids = model.generate(
-             **inputs,
-             **GEN_KW,
-             pad_token_id=_pad_token_id(model),
-         )
+         out_ids = model.generate(**inputs, **GEN_KW, pad_token_id=_pad_token_id(model))
      trimmed = [o[len(i):] for i, o in zip(inputs["input_ids"], out_ids)]
      text = processor.batch_decode(trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
      return text
  
  def format_derm_disclaimer(ans: str) -> str:
-     tail = (
-         "\n\n---\n"
-         "_Disclaimer: This AI is not a medical device. The output is informational and may be inaccurate. "
-         "Consult a qualified dermatologist for diagnosis and treatment._"
+     return (
+         ans
+         + "\n\n---\n"
+         "_Disclaimer: This AI is not a medical device. The output is informational and may be inaccurate. "
+         "Consult a qualified dermatologist for diagnosis and treatment._"
      )
-     return ans + tail
  
- def try_load_model(model_id: str, *, allow_mismatch: bool):
+ # ---------------------------
+ # Model loading (LoRA first, then full weights fallback, then base)
+ # ---------------------------
+ def try_load_model() -> Tuple[Optional[AutoModelForVision2Seq], Optional[str]]:
      """
-     Load Qwen2.5-VL via AutoModelForVision2Seq with trust_remote_code (multimodal weights).
+     Preferred path: load BASE, then apply LoRA adapters from FT repo, merge, unload.
+     Fallbacks: full FT weights -> pure base.
      """
+     # 1) BASE + LoRA adapters (PEFT)
      try:
-         logger.info(f"Loading model on GPU: {model_id}")
+         logger.info(f"Loading BASE model: {BASE_MODEL_ID}")
+         base = AutoModelForVision2Seq.from_pretrained(
+             BASE_MODEL_ID,
+             torch_dtype=torch.float16,
+             device_map="cuda",
+             trust_remote_code=True,
+             low_cpu_mem_usage=True,
+         )
+         logger.info(f"Attaching LoRA adapters from: {FT_MODEL_ID}")
+         model = PeftModel.from_pretrained(base, FT_MODEL_ID, is_trainable=False)
+         try:
+             model = model.merge_and_unload()
+             logger.info("Merged LoRA adapters into base (inference-optimized).")
+         except Exception as e:
+             logger.info(f"Adapters active without merge (PEFT runtime). Reason: {e}")
+         return model, None
+     except Exception as peft_e:
+         logger.warning(f"PEFT adapters load failed: {peft_e}")
+
+     # 2) Try full FT weights (in case you exported merged weights)
+     try:
+         logger.info(f"Loading full FT weights from: {FT_MODEL_ID}")
          model = AutoModelForVision2Seq.from_pretrained(
-             model_id,
+             FT_MODEL_ID,
              torch_dtype=torch.float16,
              device_map="cuda",
              trust_remote_code=True,
              low_cpu_mem_usage=True,
-             ignore_mismatched_sizes=False,
-             offload_state_dict=False,
+             ignore_mismatched_sizes=False,  # strict: do not silently re-init layers
          )
-         logger.info(f"Model loaded: {model_id} ({model.__class__.__name__})")
          return model, None
      except Exception as e:
-         logger.warning(f"Model load failed for {model_id}: {e}")
-         return None, str(e)
+         logger.warning(f"Full FT load failed: {e}")
+
+     # 3) Final fallback: base only (so app still works)
+     try:
+         logger.info("Falling back to BASE model only.")
+         model = AutoModelForVision2Seq.from_pretrained(
+             BASE_MODEL_ID,
+             torch_dtype=torch.float16,
+             device_map="cuda",
+             trust_remote_code=True,
+             low_cpu_mem_usage=True,
+         )
+         return model, "Using base model only (FT not applied)."
+     except Exception as e:
+         return None, f"Base load failed too: {e}"
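Two notes on try_load_model. First, the new annotation uses Tuple and Optional; the unchanged import block at the top of the file is not shown in this diff, so confirm `from typing import Optional, Tuple` is present, or the module will fail at definition time. Second, the three-step cascade could be short-circuited by checking up front whether FT_MODEL_ID is actually an adapter repo; a hedged sketch using huggingface_hub (an editor suggestion, not part of this commit):

    from huggingface_hub import file_exists

    def looks_like_adapter_repo(repo_id: str) -> bool:
        # PEFT adapter repos ship adapter_config.json instead of full model weights
        try:
            return file_exists(repo_id, "adapter_config.json")
        except Exception:
            return False  # offline/gated repo: fall back to the try/except cascade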
 
  # ---------------------------
  # Inference (ZeroGPU)
  # ---------------------------
  @spaces.GPU(duration=ZGPU_DURATION)
  def analyze_skin_condition(image: Optional[Image.Image], question: str) -> str:
-     """
-     STRICT multimodal: requires processor with images support (asserted at startup).
-     Try FT model first; on ANY load/generation error, fall back to base model.
-     """
      if image is None:
          return "❌ Please upload an image first."
-
      model = None
      try:
          inputs = build_inputs(image, question)
-
-         # Attempt 1: fine-tuned model
-         model, ft_err = try_load_model(FT_MODEL_ID, allow_mismatch=True)
-         if model is not None:
-             try:
-                 text = _generate_text(model, inputs)
-                 return format_derm_disclaimer(text)
-             except ValueError as ve:
-                 if "Image features and image tokens do not match" in str(ve):
-                     logger.warning("Token/feature mismatch on FT model — falling back to base.")
-                 else:
-                     logger.warning(f"FT generation error: {ve}. Falling back to base.")
-             except Exception as gen_e:
-                 logger.warning(f"FT generation failed: {gen_e}. Falling back to base.")
-         else:
-             logger.warning(f"FT model unavailable, error: {ft_err}. Falling back to base.")
-
-         # Free FT before base
-         if model is not None:
-             del model
-             model = None
-         torch.cuda.empty_cache()
-
-         # Attempt 2: base model
-         model, base_err = try_load_model(BASE_MODEL_ID, allow_mismatch=False)
+         model, warn = try_load_model()
          if model is None:
-             return f"❌ Error loading models.\n- FT: {ft_err}\n- BASE: {base_err}"
-
+             return "❌ Could not load any model (see logs)."
+         if warn:
+             logger.warning(warn)
          text = _generate_text(model, inputs)
          return format_derm_disclaimer(text)
-
      except Exception as e:
          logger.exception("Error during inference")
          return f"❌ Error analyzing image: {e}"
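As committed, try_load_model runs inside the @spaces.GPU handler, so weights are re-instantiated on every request (Hub caching softens but does not remove this). The usual ZeroGPU pattern is to build the model once at import time and enter the GPU only inside the decorated function; a rough sketch under that assumption (it would require loading with a CPU device_map outside the handler, which this commit does not do):

    # sketch: one-time load at startup, GPU transfer per request
    MODEL, LOAD_WARN = try_load_model()  # would need device_map="cpu" at import time

    @spaces.GPU(duration=ZGPU_DURATION)
    def analyze_skin_condition(image: Optional[Image.Image], question: str) -> str:
        if image is None:
            return "❌ Please upload an image first."
        model = MODEL.to("cuda")  # ZeroGPU attaches the GPU only inside this call
        inputs = build_inputs(image, question)
        return format_derm_disclaimer(_generate_text(model, inputs))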
@@ -198,7 +199,6 @@ def create_interface() -> gr.Blocks:
          "# Dermatology AI Assistant\n"
          "Upload a skin photo and ask a question. The model will provide an informational response."
      )
-
      with gr.Row():
          image_input = gr.Image(type="pil", label="Upload Image (JPG/PNG)")
          question_input = gr.Textbox(
@@ -206,17 +206,15 @@
              value="Describe this skin condition in detail and suggest possible next steps.",
              lines=3,
          )
-
      with gr.Row():
          submit_btn = gr.Button("Analyze", variant="primary")
          clear_btn = gr.Button("Clear")
-
      output_box = gr.Textbox(label="Response", lines=16)
  
      submit_btn.click(fn=analyze_skin_condition, inputs=[image_input, question_input], outputs=output_box, queue=True)
      clear_btn.click(fn=lambda: (None, ""), inputs=None, outputs=[image_input, question_input])
  
-     demo.queue()  # Gradio 4.44.1: no kwargs
+     demo.queue()
      gr.Markdown("Tips: Ensure good lighting and focus. Avoid uploading personally identifying information.")
      return demo
  
@@ -229,7 +227,7 @@ def main():
          show_error=True,
          inbrowser=False,
          quiet=False,
-         ssr_mode=False,  # avoid Node requirement in container
+         ssr_mode=False,
      )
  
  if __name__ == "__main__":