Spaces:

aadarsh99
/

ConvSeg

Running on Zero

App Files Community

aadarsh99 committed 27 days ago

Commit

c84ea63

1 Parent(s): 96c10ec

update app

Browse files

Files changed (1) hide show

app.py +117 -131

app.py CHANGED Viewed

@@ -20,7 +20,10 @@ from sam2.sam2_image_predictor import SAM2ImagePredictor
 from plm_adapter_lora_with_image_input_only_text_positions import PLMLanguageAdapter
 # ----------------- Configuration -----------------
-HF_REPO_ID = "aadarsh99/ConvSeg-Stage1"
 SAM2_CONFIG = "sam2_hiera_l.yaml"
 BASE_CKPT_NAME = "sam2_hiera_large.pt"
@@ -31,24 +34,25 @@ LORA_CKPT_NAME = None
 SQUARE_DIM = 1024
 logging.basicConfig(level=logging.INFO)
-MODEL_SAM_CPU = None
-PLM_CPU = None
-# ----------------- Helper Functions -----------------
-def download_if_needed(filename):
-    if os.path.exists(filename):
-        return filename
     try:
-        return hf_hub_download(repo_id=HF_REPO_ID, filename=filename)
     except Exception as e:
-        raise FileNotFoundError(f"Could not find {filename} in HF repo {HF_REPO_ID}. Error: {e}")
 def _hex_to_rgb(h: str):
     h = h.lstrip("#")
     return tuple(int(h[i : i + 2], 16) for i in (0, 2, 4))
-EDGE_COLORS_HEX = ["#3A86FF", "#FF006E", "#43AA8B", "#F3722C", "#8338EC", "#90BE6D"]
 EDGE_COLORS = [_hex_to_rgb(h) for h in EDGE_COLORS_HEX]
 def stable_color(key: str):
@@ -58,79 +62,42 @@ def stable_color(key: str):
 def tint(rgb, amt: float = 0.1):
     return tuple(int(255 - (255 - c) * (1 - amt)) for c in rgb)
-def edge_map(mask_bool: np.ndarray, width_px: int = 2) -> Image.Image:
-    m = Image.fromarray((mask_bool.astype(np.uint8) * 255), "L")
-    edges = ImageChops.difference(m.filter(ImageFilter.MaxFilter(3)), m.filter(ImageFilter.MinFilter(3)))
-    for _ in range(max(0, width_px - 1)):
-        edges = edges.filter(ImageFilter.MaxFilter(3))
-    return edges.point(lambda p: 255 if p > 0 else 0)
-def _apply_rounded_corners(img_rgb: Image.Image, radius: int) -> Image.Image:
-    w, h = img_rgb.size
-    mask = Image.new("L", (w, h), 0)
-    ImageDraw.Draw(mask).rounded_rectangle([0, 0, w - 1, h - 1], radius=radius, fill=255)
-    bg = Image.new("RGB", (w, h), "white")
-    img_rgba = img_rgb.convert("RGBA")
-    img_rgba.putalpha(mask)
-    bg.paste(img_rgba.convert("RGB"), (0, 0), mask)
-    return bg
 def make_overlay(rgb: np.ndarray, mask: np.ndarray, key: str = "mask") -> Image.Image:
     base = Image.fromarray(rgb.astype(np.uint8)).convert("RGB")
-    H, W = mask.shape[:2]
-    if base.size != (W, H):
-        base = base.resize((W, H), Image.BICUBIC)
     base_rgba = base.convert("RGBA")
     mask_bool = mask > 0
     color = stable_color(key)
     fill_rgb = tint(color, 0.1)
-    fill_layer = Image.new("RGBA", base_rgba.size, fill_rgb + (0,))
     fill_alpha = Image.fromarray((mask_bool.astype(np.uint8) * 178), "L")
     fill_layer.putalpha(fill_alpha)
-    edgesL = edge_map(mask_bool, width_px=2)
-    stroke = Image.new("RGBA", base_rgba.size, color + (0,))
-    stroke.putalpha(edgesL)
-    out = Image.alpha_composite(base_rgba, fill_layer)
-    out = Image.alpha_composite(out, stroke)
-    return _apply_rounded_corners(out.convert("RGB"), max(12, int(0.06 * min(out.size))))
-# ----------------- Image Processing -----------------
-def _resize_pad_square(arr: np.ndarray, max_dim: int, *, is_mask: bool) -> np.ndarray:
-    h, w = arr.shape[:2]
-    scale = float(max_dim) / float(max(h, w))
-    new_w, new_h = max(1, int(round(w * scale))), max(1, int(round(h * scale)))
-    interp = cv2.INTER_NEAREST if is_mask else (cv2.INTER_AREA if scale < 1.0 else cv2.INTER_LINEAR)
-    arr = cv2.resize(arr, (new_w, new_h), interpolation=interp)
-    pad_w, pad_h = max_dim - new_w, max_dim - new_h
-    left, top = pad_w // 2, pad_h // 2
-    return np.ascontiguousarray(cv2.copyMakeBorder(arr, top, pad_h - top, left, pad_w - left, cv2.BORDER_CONSTANT, value=0))
-def _resize_pad_square_meta(h: int, w: int, max_dim: int):
-    scale = float(max_dim) / float(max(h, w))
-    new_w, new_h = max(1, int(round(w * scale))), max(1, int(round(h * scale)))
-    return {"scale": scale, "new_w": new_w, "new_h": new_h, "left": (max_dim - new_w) // 2, "top": (max_dim - new_h) // 2}
-def _unpad_and_resize_pred_to_gt(logit_sq: torch.Tensor, meta: dict, out_hw: tuple[int, int]) -> torch.Tensor:
-    top, left = meta["top"], meta["left"]
-    nh, nw = meta["new_h"], meta["new_w"]
-    crop = logit_sq[top : top + nh, left : left + nw].unsqueeze(0).unsqueeze(0)
-    return F.interpolate(crop, size=out_hw, mode="bilinear", align_corners=False)[0, 0]
-# ----------------- Prediction Logic -----------------
-def ensure_models_loaded_on_cpu():
-    global MODEL_SAM_CPU, PLM_CPU
-    if MODEL_SAM_CPU is not None and PLM_CPU is not None:
         return
-    logging.info("Loading models into CPU RAM...")
-    base_path = download_if_needed(BASE_CKPT_NAME)
     model = build_sam2(SAM2_CONFIG, base_path, device="cpu")
-    final_path = download_if_needed(FINAL_CKPT_NAME)
     sd = torch.load(final_path, map_location="cpu")
     model.load_state_dict(sd.get("model", sd), strict=True)
-    MODEL_SAM_CPU = model
     plm = PLMLanguageAdapter(
         model_name="Qwen/Qwen2.5-VL-3B-Instruct",
         transformer_dim=model.sam_mask_decoder.transformer_dim,
@@ -138,99 +105,118 @@ def ensure_models_loaded_on_cpu():
         lora_r=16, lora_alpha=32, lora_dropout=0.05,
         dtype=torch.bfloat16, device="cpu",
     )
-    plm_path = download_if_needed(PLM_CKPT_NAME)
     plm_sd = torch.load(plm_path, map_location="cpu")
     plm.load_state_dict(plm_sd["plm"], strict=True)
     plm.eval()
-    PLM_CPU = plm
 @spaces.GPU(duration=120)
-def run_prediction(image_pil, text_prompt, threshold):
     if image_pil is None or not text_prompt:
         return None, None
-    ensure_models_loaded_on_cpu()
-    MODEL_SAM_CPU.to("cuda")
-    PLM_CPU.to("cuda")
-    predictor = None
     try:
-        predictor = SAM2ImagePredictor(MODEL_SAM_CPU)
-        rgb_orig = np.array(image_pil.convert("RGB"))
-        Hgt, Wgt = rgb_orig.shape[:2]
-        meta = _resize_pad_square_meta(Hgt, Wgt, SQUARE_DIM)
-        rgb_sq = _resize_pad_square(rgb_orig, SQUARE_DIM, is_mask=False)
-        predictor.set_image(rgb_sq)
-        image_emb = predictor._features["image_embed"][-1].unsqueeze(0)
-        hi = [lvl[-1].unsqueeze(0) for lvl in predictor._features["high_res_feats"]]
-        temp_path = "temp_input.jpg"
-        image_pil.save(temp_path)
-        sp, dp = PLM_CPU([text_prompt], image_emb.shape[2], image_emb.shape[3], [temp_path])
-        dec = predictor.model.sam_mask_decoder
-        dev, dtype = next(dec.parameters()).device, next(dec.parameters()).dtype
-        low, scores, _, _ = dec(
-            image_embeddings=image_emb.to(dev, dtype),
-            image_pe=predictor.model.sam_prompt_encoder.get_dense_pe().to(dev, dtype),
-            sparse_prompt_embeddings=sp.to(dev, dtype),
-            dense_prompt_embeddings=dp.to(dev, dtype),
-            multimask_output=True, repeat_image=False,
-            high_res_features=[h.to(dev, dtype) for h in hi],
-        )
-        logits_sq = predictor._transforms.postprocess_masks(low, (SQUARE_DIM, SQUARE_DIM))
-        logit_gt = _unpad_and_resize_pred_to_gt(logits_sq[0, scores.argmax(dim=1).item()], meta, (Hgt, Wgt))
-        # 1. Calculate Probabilities (Heatmap)
-        prob = torch.sigmoid(logit_gt).cpu().numpy()
-        # 2. Apply dynamic threshold for overlay
         mask = (prob > threshold).astype(np.uint8) * 255
         overlay_img = make_overlay(rgb_orig, mask, key=text_prompt)
-        # 3. Create Heatmap Visualization
-        # Scale 0.0-1.0 to 0-255
-        prob_uint8 = (prob * 255).astype(np.uint8)
-        heatmap_color = cv2.applyColorMap(prob_uint8, cv2.COLORMAP_JET)
-        heatmap_color = cv2.cvtColor(heatmap_color, cv2.COLOR_BGR2RGB)
-        heatmap_pil = Image.fromarray(heatmap_color)
-        return overlay_img, heatmap_pil
-    except Exception as e:
         traceback.print_exc()
-        raise e
     finally:
-        MODEL_SAM_CPU.to("cpu")
-        PLM_CPU.to("cpu")
-        if predictor: del predictor
         torch.cuda.empty_cache()
 # ----------------- Gradio UI -----------------
-with gr.Blocks(title="SAM2 + PLM Segmentation") as demo:
     gr.Markdown("# SAM2 + PLM Interactive Segmentation")
     with gr.Row():
         with gr.Column():
             input_image = gr.Image(type="pil", label="Input Image")
-            text_prompt = gr.Textbox(label="Text Prompt", placeholder="e.g., 'the red car'")
-            threshold_slider = gr.Slider(
-                minimum=0.0, maximum=1.0, value=0.5, step=0.01,
-                label="Confidence Threshold", info="Adjust to include more/less of the object"
-            )
-            run_btn = gr.Button("Segment", variant="primary")
         with gr.Column():
             out_overlay = gr.Image(label="Segmentation Overlay", type="pil")
-            out_heatmap = gr.Image(label="Probability Heatmap", type="pil")
     run_btn.click(
         fn=run_prediction,
-        inputs=[input_image, text_prompt, threshold_slider],
         outputs=[out_overlay, out_heatmap]
     )

 from plm_adapter_lora_with_image_input_only_text_positions import PLMLanguageAdapter
 # ----------------- Configuration -----------------
+REPO_MAP = {
+    "Stage 1": "aadarsh99/ConvSeg-Stage1",
+    "Stage 2": "aadarsh99/ConvSeg-Stage2"
+}
 SAM2_CONFIG = "sam2_hiera_l.yaml"
 BASE_CKPT_NAME = "sam2_hiera_large.pt"
 SQUARE_DIM = 1024
 logging.basicConfig(level=logging.INFO)
+# ----------------- Globals (Ram Cache) -----------------
+MODEL_CACHE = {
+    "Stage 1": {"sam": None, "plm": None},
+    "Stage 2": {"sam": None, "plm": None}
+}
+# ----------------- Helper: Download Logic -----------------
+def download_if_needed(repo_id, filename):
     try:
+        logging.info(f"Downloading {filename} from {repo_id}...")
+        return hf_hub_download(repo_id=repo_id, filename=filename)
     except Exception as e:
+        raise FileNotFoundError(f"Could not find {filename} in {repo_id}. Error: {e}")
+# ----------------- Overlay & Heatmap Helpers -----------------
+EDGE_COLORS_HEX = ["#3A86FF", "#FF006E", "#43AA8B", "#F3722C", "#8338EC", "#90BE6D"]
 def _hex_to_rgb(h: str):
     h = h.lstrip("#")
     return tuple(int(h[i : i + 2], 16) for i in (0, 2, 4))
 EDGE_COLORS = [_hex_to_rgb(h) for h in EDGE_COLORS_HEX]
 def stable_color(key: str):
 def tint(rgb, amt: float = 0.1):
     return tuple(int(255 - (255 - c) * (1 - amt)) for c in rgb)
 def make_overlay(rgb: np.ndarray, mask: np.ndarray, key: str = "mask") -> Image.Image:
     base = Image.fromarray(rgb.astype(np.uint8)).convert("RGB")
     base_rgba = base.convert("RGBA")
     mask_bool = mask > 0
     color = stable_color(key)
     fill_rgb = tint(color, 0.1)
+    fill_layer = Image.new("RGBA", base.size, fill_rgb + (0,))
     fill_alpha = Image.fromarray((mask_bool.astype(np.uint8) * 178), "L")
     fill_layer.putalpha(fill_alpha)
+    m = Image.fromarray((mask_bool.astype(np.uint8) * 255), "L")
+    edges = ImageChops.difference(m.filter(ImageFilter.MaxFilter(3)), m.filter(ImageFilter.MinFilter(3)))
+    stroke = Image.new("RGBA", base.size, color + (0,))
+    stroke.putalpha(edges)
+    out = Image.alpha_composite(base_rgba, fill_layer)
+    out = Image.alpha_composite(out, stroke)
+    return out.convert("RGB")
+# ----------------- Model Loading (CPU Caching) -----------------
+def ensure_models_loaded(stage):
+    global MODEL_CACHE
+    if MODEL_CACHE[stage]["sam"] is not None:
         return
+    repo_id = REPO_MAP[stage]
+    logging.info(f"Loading {stage} models from {repo_id}...")
+    base_path = download_if_needed(repo_id, BASE_CKPT_NAME)
     model = build_sam2(SAM2_CONFIG, base_path, device="cpu")
+    final_path = download_if_needed(repo_id, FINAL_CKPT_NAME)
     sd = torch.load(final_path, map_location="cpu")
     model.load_state_dict(sd.get("model", sd), strict=True)
     plm = PLMLanguageAdapter(
         model_name="Qwen/Qwen2.5-VL-3B-Instruct",
         transformer_dim=model.sam_mask_decoder.transformer_dim,
         lora_r=16, lora_alpha=32, lora_dropout=0.05,
         dtype=torch.bfloat16, device="cpu",
     )
+    plm_path = download_if_needed(repo_id, PLM_CKPT_NAME)
     plm_sd = torch.load(plm_path, map_location="cpu")
     plm.load_state_dict(plm_sd["plm"], strict=True)
     plm.eval()
+    MODEL_CACHE[stage]["sam"] = model
+    MODEL_CACHE[stage]["plm"] = plm
+def _resize_pad_square(arr, max_dim):
+    h, w = arr.shape[:2]
+    scale = float(max_dim) / float(max(h, w))
+    nw, nh = max(1, int(round(w * scale))), max(1, int(round(h * scale)))
+    arr = cv2.resize(arr, (nw, nh), interpolation=cv2.INTER_LINEAR)
+    pad_w, pad_h = max_dim - nw, max_dim - nh
+    return cv2.copyMakeBorder(arr, pad_h//2, pad_h-pad_h//2, pad_w//2, pad_w-pad_w//2, cv2.BORDER_CONSTANT, value=0)
+# ----------------- Main Prediction -----------------
 @spaces.GPU(duration=120)
+def run_prediction(image_pil, text_prompt, threshold, stage_choice):
     if image_pil is None or not text_prompt:
         return None, None
+    ensure_models_loaded(stage_choice)
+    sam_model = MODEL_CACHE[stage_choice]["sam"]
+    plm_model = MODEL_CACHE[stage_choice]["plm"]
+    sam_model.to("cuda")
+    plm_model.to("cuda")
     try:
+        # 1. Use Inference Mode to avoid grad errors and save memory
+        with torch.inference_mode():
+            predictor = SAM2ImagePredictor(sam_model)
+            rgb_orig = np.array(image_pil.convert("RGB"))
+            Hgt, Wgt = rgb_orig.shape[:2]
+            # Setup crop/padding metadata
+            scale = SQUARE_DIM / max(Hgt, Wgt)
+            nw, nh = int(Wgt * scale), int(Hgt * scale)
+            top, left = (SQUARE_DIM - nh) // 2, (SQUARE_DIM - nw) // 2
+            rgb_sq = _resize_pad_square(rgb_orig, SQUARE_DIM)
+            predictor.set_image(rgb_sq)
+            image_emb = predictor._features["image_embed"][-1].unsqueeze(0)
+            hi = [lvl[-1].unsqueeze(0) for lvl in predictor._features["high_res_feats"]]
+            # PLM Inference
+            temp_path = "temp_input.jpg"
+            image_pil.save(temp_path)
+            sp, dp = plm_model([text_prompt], image_emb.shape[2], image_emb.shape[3], [temp_path])
+            # SAM2 Decoding
+            dec = predictor.model.sam_mask_decoder
+            dev, dtype = next(dec.parameters()).device, next(dec.parameters()).dtype
+            low, scores, _, _ = dec(
+                image_embeddings=image_emb.to(dev, dtype),
+                image_pe=predictor.model.sam_prompt_encoder.get_dense_pe().to(dev, dtype),
+                sparse_prompt_embeddings=sp.to(dev, dtype),
+                dense_prompt_embeddings=dp.to(dev, dtype),
+                multimask_output=True, repeat_image=False,
+                high_res_features=[h.to(dev, dtype) for h in hi],
+            )
+            # Postprocess to full image size
+            logits_sq = predictor._transforms.postprocess_masks(low, (SQUARE_DIM, SQUARE_DIM))
+            best_idx = scores.argmax(dim=1).item()
+            logit_crop = logits_sq[0, best_idx, top:top+nh, left:left+nw].unsqueeze(0).unsqueeze(0)
+            logit_full = F.interpolate(logit_crop, size=(Hgt, Wgt), mode="bilinear", align_corners=False)[0, 0]
+            # FIX: Detach and convert to float before moving to cpu/numpy
+            prob = torch.sigmoid(logit_full).float().detach().cpu().numpy()
+        # 2. Visualizations
+        heatmap_cv = cv2.applyColorMap((prob * 255).astype(np.uint8), cv2.COLORMAP_JET)
+        heatmap_rgb = cv2.cvtColor(heatmap_cv, cv2.COLOR_BGR2RGB)
         mask = (prob > threshold).astype(np.uint8) * 255
         overlay_img = make_overlay(rgb_orig, mask, key=text_prompt)
+        return overlay_img, Image.fromarray(heatmap_rgb)
+    except Exception:
         traceback.print_exc()
+        return None, None
     finally:
+        sam_model.to("cpu")
+        plm_model.to("cpu")
         torch.cuda.empty_cache()
 # ----------------- Gradio UI -----------------
+with gr.Blocks(title="SAM2 + PLM Multi-Stage") as demo:
     gr.Markdown("# SAM2 + PLM Interactive Segmentation")
     with gr.Row():
         with gr.Column():
             input_image = gr.Image(type="pil", label="Input Image")
+            text_prompt = gr.Textbox(label="Text Prompt", placeholder="e.g., 'the surgical tool'")
+            with gr.Row():
+                stage_select = gr.Radio(choices=["Stage 1", "Stage 2"], value="Stage 1", label="Model Stage")
+                threshold_slider = gr.Slider(0.0, 1.0, value=0.5, step=0.01, label="Confidence Threshold")
+            run_btn = gr.Button("Run Inference", variant="primary")
         with gr.Column():
             out_overlay = gr.Image(label="Segmentation Overlay", type="pil")
+            out_heatmap = gr.Image(label="Pixel-wise Probability Heatmap", type="pil")
     run_btn.click(
         fn=run_prediction,
+        inputs=[input_image, text_prompt, threshold_slider, stage_select],
         outputs=[out_overlay, out_heatmap]
     )