Spaces:

aadarsh99
/

ConverSeg

Running on Zero

App Files Files Community

aadarsh99 committed on Jan 7

Commit

96c10ec

1 Parent(s): 9d1694a

update app

Browse files

Files changed (1) hide show

app.py +64 -171

app.py CHANGED Viewed

@@ -23,7 +23,6 @@ from plm_adapter_lora_with_image_input_only_text_positions import PLMLanguageAda
 HF_REPO_ID = "aadarsh99/ConvSeg-Stage1"
 SAM2_CONFIG = "sam2_hiera_l.yaml"
-# Filenames
 BASE_CKPT_NAME = "sam2_hiera_large.pt"
 FINAL_CKPT_NAME = "fine_tuned_sam2_batched_100000.torch"
 PLM_CKPT_NAME = "fine_tuned_sam2_batched_plm_100000.torch"
@@ -32,38 +31,24 @@ LORA_CKPT_NAME = None
 SQUARE_DIM = 1024
 logging.basicConfig(level=logging.INFO)
-# ----------------- Globals (Ram Cache) -----------------
-# We keep these on CPU globally so they persist between runs
-# without taking up GPU memory (which gets reset).
 MODEL_SAM_CPU = None
 PLM_CPU = None
-# ----------------- Helper: Download Logic -----------------
 def download_if_needed(filename):
-    """
-    Checks if file exists locally. If not, downloads from HF Repo.
-    Returns the valid path to the file.
-    """
     if os.path.exists(filename):
-        logging.info(f"Found local file: {filename}")
         return filename
-    # hf_hub_download checks the cache automatically.
-    # It won't re-download if the file is already in the HF cache.
-    logging.info(f"Checking HF Cache for {filename}...")
     try:
-        path = hf_hub_download(repo_id=HF_REPO_ID, filename=filename)
-        return path
     except Exception as e:
-        raise FileNotFoundError(f"Could not find {filename} locally or in HF repo {HF_REPO_ID}. Error: {e}")
-# ----------------- Overlay Style Helpers -----------------
-EDGE_COLORS_HEX = ["#3A86FF", "#FF006E", "#43AA8B", "#F3722C", "#8338EC", "#90BE6D"]
 def _hex_to_rgb(h: str):
     h = h.lstrip("#")
     return tuple(int(h[i : i + 2], 16) for i in (0, 2, 4))
 EDGE_COLORS = [_hex_to_rgb(h) for h in EDGE_COLORS_HEX]
 def stable_color(key: str):
@@ -75,9 +60,7 @@ def tint(rgb, amt: float = 0.1):
 def edge_map(mask_bool: np.ndarray, width_px: int = 2) -> Image.Image:
     m = Image.fromarray((mask_bool.astype(np.uint8) * 255), "L")
-    edges = ImageChops.difference(
-        m.filter(ImageFilter.MaxFilter(3)), m.filter(ImageFilter.MinFilter(3))
-    )
     for _ in range(max(0, width_px - 1)):
         edges = edges.filter(ImageFilter.MaxFilter(3))
     return edges.point(lambda p: 255 if p > 0 else 0)
@@ -97,248 +80,158 @@ def make_overlay(rgb: np.ndarray, mask: np.ndarray, key: str = "mask") -> Image.
     H, W = mask.shape[:2]
     if base.size != (W, H):
         base = base.resize((W, H), Image.BICUBIC)
     base_rgba = base.convert("RGBA")
     mask_bool = mask > 0
     color = stable_color(key)
     fill_rgb = tint(color, 0.1)
-    alpha_fill = 0.7
-    edge_width = 2
-    a = int(round(alpha_fill * 255))
-    tgt_w, tgt_h = base_rgba.size
-    fill_layer = Image.new("RGBA", (tgt_w, tgt_h), fill_rgb + (0,))
-    fill_alpha = Image.fromarray((mask_bool.astype(np.uint8) * a), "L")
     fill_layer.putalpha(fill_alpha)
-    edgesL = edge_map(mask_bool, width_px=edge_width)
-    stroke = Image.new("RGBA", (tgt_w, tgt_h), color + (0,))
     stroke.putalpha(edgesL)
     out = Image.alpha_composite(base_rgba, fill_layer)
     out = Image.alpha_composite(out, stroke)
-    out = out.convert("RGB")
-    return _apply_rounded_corners(out, max(12, int(0.06 * min(out.size))))
-# ----------------- Image Processing Helpers -----------------
 def _resize_pad_square(arr: np.ndarray, max_dim: int, *, is_mask: bool) -> np.ndarray:
     h, w = arr.shape[:2]
     scale = float(max_dim) / float(max(h, w))
-    new_w = max(1, int(round(w * scale)))
-    new_h = max(1, int(round(h * scale)))
-    if is_mask:
-        interp = cv2.INTER_NEAREST
-    else:
-        interp = cv2.INTER_AREA if scale < 1.0 else cv2.INTER_LINEAR
     arr = cv2.resize(arr, (new_w, new_h), interpolation=interp)
-    pad_w = max_dim - new_w
-    pad_h = max_dim - new_h
-    left = pad_w // 2
-    right = pad_w - left
-    top = pad_h // 2
-    bottom = pad_h - top
-    border_val = 0 if is_mask else (0, 0, 0)
-    arr = cv2.copyMakeBorder(
-        arr, top, bottom, left, right, borderType=cv2.BORDER_CONSTANT, value=border_val
-    )
-    return np.ascontiguousarray(arr)
 def _resize_pad_square_meta(h: int, w: int, max_dim: int):
     scale = float(max_dim) / float(max(h, w))
-    new_w = max(1, int(round(w * scale)))
-    new_h = max(1, int(round(h * scale)))
-    pad_w = max_dim - new_w
-    pad_h = max_dim - new_h
-    left = pad_w // 2
-    right = pad_w - left
-    top = pad_h // 2
-    bottom = pad_h - top
-    return {
-        "scale": scale, "new_w": new_w, "new_h": new_h,
-        "left": left, "right": right, "top": top, "bottom": bottom,
-    }
 def _unpad_and_resize_pred_to_gt(logit_sq: torch.Tensor, meta: dict, out_hw: tuple[int, int]) -> torch.Tensor:
     top, left = meta["top"], meta["left"]
     nh, nw = meta["new_h"], meta["new_w"]
-    crop = logit_sq[top : top + nh, left : left + nw]
-    crop = crop.unsqueeze(0).unsqueeze(0)
-    up = F.interpolate(crop, size=out_hw, mode="bilinear", align_corners=False)
-    return up[0, 0]
-# ----------------- Model Loading (CPU Caching) -----------------
 def ensure_models_loaded_on_cpu():
-    """
-    Ensures models are loaded in Global CPU RAM.
-    This avoids re-reading from disk/cache on every run.
-    """
     global MODEL_SAM_CPU, PLM_CPU
     if MODEL_SAM_CPU is not None and PLM_CPU is not None:
-        return # Already loaded in RAM
-    logging.info("Loading models into CPU RAM (this happens once)...")
-    # 1. Base SAM2 Model
     base_path = download_if_needed(BASE_CKPT_NAME)
-    # Build on CPU
     model = build_sam2(SAM2_CONFIG, base_path, device="cpu")
-    # 2. Fine-tuned Weights
     final_path = download_if_needed(FINAL_CKPT_NAME)
     sd = torch.load(final_path, map_location="cpu")
     model.load_state_dict(sd.get("model", sd), strict=True)
-    # Save to Global (CPU)
     MODEL_SAM_CPU = model
-    # 3. PLM Adapter
-    C = model.sam_mask_decoder.transformer_dim
     plm = PLMLanguageAdapter(
         model_name="Qwen/Qwen2.5-VL-3B-Instruct",
-        transformer_dim=C,
-        n_sparse_tokens=0,
-        use_dense_bias=True,
-        use_lora=True,
-        lora_r=16,
-        lora_alpha=32,
-        lora_dropout=0.05,
-        dtype=torch.bfloat16,
-        device="cpu",
     )
     plm_path = download_if_needed(PLM_CKPT_NAME)
     plm_sd = torch.load(plm_path, map_location="cpu")
     plm.load_state_dict(plm_sd["plm"], strict=True)
-    if LORA_CKPT_NAME:
-        lora_path = download_if_needed(LORA_CKPT_NAME)
-        plm.load_lora(lora_path)
     plm.eval()
     PLM_CPU = plm
-    logging.info("Models successfully loaded into CPU RAM.")
 @spaces.GPU(duration=120)
-def run_prediction(image_pil, text_prompt):
     if image_pil is None or not text_prompt:
         return None, None
-    # 1. Ensure models are in RAM (Fast check)
     ensure_models_loaded_on_cpu()
-    # 2. Move to GPU (The only 'loading' cost per run)
-    # We rely on the global variables
-    logging.info("Moving models to GPU...")
     MODEL_SAM_CPU.to("cuda")
     PLM_CPU.to("cuda")
     predictor = None
     try:
-        # Instantiate Predictor on GPU
         predictor = SAM2ImagePredictor(MODEL_SAM_CPU)
-        # 3. Preprocess Image
         rgb_orig = np.array(image_pil.convert("RGB"))
         Hgt, Wgt = rgb_orig.shape[:2]
         meta = _resize_pad_square_meta(Hgt, Wgt, SQUARE_DIM)
         rgb_sq = _resize_pad_square(rgb_orig, SQUARE_DIM, is_mask=False)
-        # 4. SAM2 Image Encoding
         predictor.set_image(rgb_sq)
         image_emb = predictor._features["image_embed"][-1].unsqueeze(0)
         hi = [lvl[-1].unsqueeze(0) for lvl in predictor._features["high_res_feats"]]
-        _, _, H_feat, W_feat = image_emb.shape
-        # 5. PLM Inference
         temp_path = "temp_input.jpg"
         image_pil.save(temp_path)
-        sp, dp = PLM_CPU([text_prompt], H_feat, W_feat, [temp_path])
-        # 6. Prepare SAM2 Decoder inputs
         dec = predictor.model.sam_mask_decoder
-        dev = next(dec.parameters()).device
-        dtype = next(dec.parameters()).dtype
-        image_pe = predictor.model.sam_prompt_encoder.get_dense_pe().to(dev, dtype)
-        image_emb = image_emb.to(dev, dtype)
-        hi = [h.to(dev, dtype) for h in hi]
-        sp, dp = sp.to(dev, dtype), dp.to(dev, dtype)
-        # 7. SAM2 Decoding
         low, scores, _, _ = dec(
-            image_embeddings=image_emb,
-            image_pe=image_pe,
-            sparse_prompt_embeddings=sp,
-            dense_prompt_embeddings=dp,
-            multimask_output=True,
-            repeat_image=False,
-            high_res_features=hi,
         )
         logits_sq = predictor._transforms.postprocess_masks(low, (SQUARE_DIM, SQUARE_DIM))
-        best = scores.argmax(dim=1).item()
-        logit_sq = logits_sq[0, best]
-        logit_gt = _unpad_and_resize_pred_to_gt(logit_sq, meta, (Hgt, Wgt))
-        prob = torch.sigmoid(logit_gt)
-        mask = (prob > 0.5).cpu().numpy().astype(np.uint8) * 255
-        # 8. Visualization
         overlay_img = make_overlay(rgb_orig, mask, key=text_prompt)
-        mask_img = Image.fromarray(mask, mode="L")
-        return overlay_img, mask_img
     except Exception as e:
-        print("An error occurred during inference:")
         traceback.print_exc()
         raise e
     finally:
-        # CRITICAL: Move models back to CPU
-        # This preserves the Global Variable on CPU RAM for the next run.
-        # If we leave them on CUDA, they might be lost when ZeroGPU releases the device.
-        logging.info("Moving models back to CPU...")
         MODEL_SAM_CPU.to("cpu")
         PLM_CPU.to("cpu")
-        if predictor:
-            del predictor
         torch.cuda.empty_cache()
 # ----------------- Gradio UI -----------------
-with gr.Blocks(title="SAM2 + PLM Interactive Segmentation") as demo:
     gr.Markdown("# SAM2 + PLM Interactive Segmentation")
-    gr.Markdown("Enter a text prompt to segment objects in the image.")
     with gr.Row():
         with gr.Column():
             input_image = gr.Image(type="pil", label="Input Image")
             text_prompt = gr.Textbox(label="Text Prompt", placeholder="e.g., 'the red car'")
             run_btn = gr.Button("Segment", variant="primary")
         with gr.Column():
             out_overlay = gr.Image(label="Segmentation Overlay", type="pil")
-            out_mask = gr.Image(label="Binary Mask", type="pil")
     run_btn.click(
         fn=run_prediction,
-        inputs=[input_image, text_prompt],
-        outputs=[out_overlay, out_mask]
     )
 if __name__ == "__main__":

 HF_REPO_ID = "aadarsh99/ConvSeg-Stage1"
 SAM2_CONFIG = "sam2_hiera_l.yaml"
 BASE_CKPT_NAME = "sam2_hiera_large.pt"
 FINAL_CKPT_NAME = "fine_tuned_sam2_batched_100000.torch"
 PLM_CKPT_NAME = "fine_tuned_sam2_batched_plm_100000.torch"
 SQUARE_DIM = 1024
 logging.basicConfig(level=logging.INFO)
 MODEL_SAM_CPU = None
 PLM_CPU = None
+# ----------------- Helper Functions -----------------
 def download_if_needed(filename):
     if os.path.exists(filename):
         return filename
     try:
+        return hf_hub_download(repo_id=HF_REPO_ID, filename=filename)
     except Exception as e:
+        raise FileNotFoundError(f"Could not find {filename} in HF repo {HF_REPO_ID}. Error: {e}")
 def _hex_to_rgb(h: str):
     h = h.lstrip("#")
     return tuple(int(h[i : i + 2], 16) for i in (0, 2, 4))
+EDGE_COLORS_HEX = ["#3A86FF", "#FF006E", "#43AA8B", "#F3722C", "#8338EC", "#90BE6D"]
 EDGE_COLORS = [_hex_to_rgb(h) for h in EDGE_COLORS_HEX]
 def stable_color(key: str):
 def edge_map(mask_bool: np.ndarray, width_px: int = 2) -> Image.Image:
     m = Image.fromarray((mask_bool.astype(np.uint8) * 255), "L")
+    edges = ImageChops.difference(m.filter(ImageFilter.MaxFilter(3)), m.filter(ImageFilter.MinFilter(3)))
     for _ in range(max(0, width_px - 1)):
         edges = edges.filter(ImageFilter.MaxFilter(3))
     return edges.point(lambda p: 255 if p > 0 else 0)
     H, W = mask.shape[:2]
     if base.size != (W, H):
         base = base.resize((W, H), Image.BICUBIC)
     base_rgba = base.convert("RGBA")
     mask_bool = mask > 0
     color = stable_color(key)
     fill_rgb = tint(color, 0.1)
+    fill_layer = Image.new("RGBA", base_rgba.size, fill_rgb + (0,))
+    fill_alpha = Image.fromarray((mask_bool.astype(np.uint8) * 178), "L")
     fill_layer.putalpha(fill_alpha)
+    edgesL = edge_map(mask_bool, width_px=2)
+    stroke = Image.new("RGBA", base_rgba.size, color + (0,))
     stroke.putalpha(edgesL)
     out = Image.alpha_composite(base_rgba, fill_layer)
     out = Image.alpha_composite(out, stroke)
+    return _apply_rounded_corners(out.convert("RGB"), max(12, int(0.06 * min(out.size))))
+# ----------------- Image Processing -----------------
 def _resize_pad_square(arr: np.ndarray, max_dim: int, *, is_mask: bool) -> np.ndarray:
     h, w = arr.shape[:2]
     scale = float(max_dim) / float(max(h, w))
+    new_w, new_h = max(1, int(round(w * scale))), max(1, int(round(h * scale)))
+    interp = cv2.INTER_NEAREST if is_mask else (cv2.INTER_AREA if scale < 1.0 else cv2.INTER_LINEAR)
     arr = cv2.resize(arr, (new_w, new_h), interpolation=interp)
+    pad_w, pad_h = max_dim - new_w, max_dim - new_h
+    left, top = pad_w // 2, pad_h // 2
+    return np.ascontiguousarray(cv2.copyMakeBorder(arr, top, pad_h - top, left, pad_w - left, cv2.BORDER_CONSTANT, value=0))
 def _resize_pad_square_meta(h: int, w: int, max_dim: int):
     scale = float(max_dim) / float(max(h, w))
+    new_w, new_h = max(1, int(round(w * scale))), max(1, int(round(h * scale)))
+    return {"scale": scale, "new_w": new_w, "new_h": new_h, "left": (max_dim - new_w) // 2, "top": (max_dim - new_h) // 2}
 def _unpad_and_resize_pred_to_gt(logit_sq: torch.Tensor, meta: dict, out_hw: tuple[int, int]) -> torch.Tensor:
     top, left = meta["top"], meta["left"]
     nh, nw = meta["new_h"], meta["new_w"]
+    crop = logit_sq[top : top + nh, left : left + nw].unsqueeze(0).unsqueeze(0)
+    return F.interpolate(crop, size=out_hw, mode="bilinear", align_corners=False)[0, 0]
+# ----------------- Prediction Logic -----------------
 def ensure_models_loaded_on_cpu():
     global MODEL_SAM_CPU, PLM_CPU
     if MODEL_SAM_CPU is not None and PLM_CPU is not None:
+        return
+    logging.info("Loading models into CPU RAM...")
     base_path = download_if_needed(BASE_CKPT_NAME)
     model = build_sam2(SAM2_CONFIG, base_path, device="cpu")
     final_path = download_if_needed(FINAL_CKPT_NAME)
     sd = torch.load(final_path, map_location="cpu")
     model.load_state_dict(sd.get("model", sd), strict=True)
     MODEL_SAM_CPU = model
     plm = PLMLanguageAdapter(
         model_name="Qwen/Qwen2.5-VL-3B-Instruct",
+        transformer_dim=model.sam_mask_decoder.transformer_dim,
+        n_sparse_tokens=0, use_dense_bias=True, use_lora=True,
+        lora_r=16, lora_alpha=32, lora_dropout=0.05,
+        dtype=torch.bfloat16, device="cpu",
     )
     plm_path = download_if_needed(PLM_CKPT_NAME)
     plm_sd = torch.load(plm_path, map_location="cpu")
     plm.load_state_dict(plm_sd["plm"], strict=True)
     plm.eval()
     PLM_CPU = plm
 @spaces.GPU(duration=120)
+def run_prediction(image_pil, text_prompt, threshold):
     if image_pil is None or not text_prompt:
         return None, None
     ensure_models_loaded_on_cpu()
     MODEL_SAM_CPU.to("cuda")
     PLM_CPU.to("cuda")
     predictor = None
     try:
         predictor = SAM2ImagePredictor(MODEL_SAM_CPU)
         rgb_orig = np.array(image_pil.convert("RGB"))
         Hgt, Wgt = rgb_orig.shape[:2]
         meta = _resize_pad_square_meta(Hgt, Wgt, SQUARE_DIM)
         rgb_sq = _resize_pad_square(rgb_orig, SQUARE_DIM, is_mask=False)
         predictor.set_image(rgb_sq)
         image_emb = predictor._features["image_embed"][-1].unsqueeze(0)
         hi = [lvl[-1].unsqueeze(0) for lvl in predictor._features["high_res_feats"]]
         temp_path = "temp_input.jpg"
         image_pil.save(temp_path)
+        sp, dp = PLM_CPU([text_prompt], image_emb.shape[2], image_emb.shape[3], [temp_path])
         dec = predictor.model.sam_mask_decoder
+        dev, dtype = next(dec.parameters()).device, next(dec.parameters()).dtype
         low, scores, _, _ = dec(
+            image_embeddings=image_emb.to(dev, dtype),
+            image_pe=predictor.model.sam_prompt_encoder.get_dense_pe().to(dev, dtype),
+            sparse_prompt_embeddings=sp.to(dev, dtype),
+            dense_prompt_embeddings=dp.to(dev, dtype),
+            multimask_output=True, repeat_image=False,
+            high_res_features=[h.to(dev, dtype) for h in hi],
         )
         logits_sq = predictor._transforms.postprocess_masks(low, (SQUARE_DIM, SQUARE_DIM))
+        logit_gt = _unpad_and_resize_pred_to_gt(logits_sq[0, scores.argmax(dim=1).item()], meta, (Hgt, Wgt))
+        # 1. Calculate Probabilities (Heatmap)
+        prob = torch.sigmoid(logit_gt).cpu().numpy()
+        # 2. Apply dynamic threshold for overlay
+        mask = (prob > threshold).astype(np.uint8) * 255
         overlay_img = make_overlay(rgb_orig, mask, key=text_prompt)
+        # 3. Create Heatmap Visualization
+        # Scale 0.0-1.0 to 0-255
+        prob_uint8 = (prob * 255).astype(np.uint8)
+        heatmap_color = cv2.applyColorMap(prob_uint8, cv2.COLORMAP_JET)
+        heatmap_color = cv2.cvtColor(heatmap_color, cv2.COLOR_BGR2RGB)
+        heatmap_pil = Image.fromarray(heatmap_color)
+        return overlay_img, heatmap_pil
     except Exception as e:
         traceback.print_exc()
         raise e
     finally:
         MODEL_SAM_CPU.to("cpu")
         PLM_CPU.to("cpu")
+        if predictor: del predictor
         torch.cuda.empty_cache()
 # ----------------- Gradio UI -----------------
+with gr.Blocks(title="SAM2 + PLM Segmentation") as demo:
     gr.Markdown("# SAM2 + PLM Interactive Segmentation")
     with gr.Row():
         with gr.Column():
             input_image = gr.Image(type="pil", label="Input Image")
             text_prompt = gr.Textbox(label="Text Prompt", placeholder="e.g., 'the red car'")
+            threshold_slider = gr.Slider(
+                minimum=0.0, maximum=1.0, value=0.5, step=0.01,
+                label="Confidence Threshold", info="Adjust to include more/less of the object"
+            )
             run_btn = gr.Button("Segment", variant="primary")
         with gr.Column():
             out_overlay = gr.Image(label="Segmentation Overlay", type="pil")
+            out_heatmap = gr.Image(label="Probability Heatmap", type="pil")
     run_btn.click(
         fn=run_prediction,
+        inputs=[input_image, text_prompt, threshold_slider],
+        outputs=[out_overlay, out_heatmap]
     )
 if __name__ == "__main__":