Spaces:

aadarsh99
/

ConverSeg

Running on Zero

App Files Community

aadarsh99 committed on Jan 7

Commit

7e46975

1 Parent(s): c84ea63

update app

Browse files

Files changed (1) hide show

app.py +62 -108

app.py CHANGED Viewed

@@ -29,47 +29,34 @@ SAM2_CONFIG = "sam2_hiera_l.yaml"
 BASE_CKPT_NAME = "sam2_hiera_large.pt"
 FINAL_CKPT_NAME = "fine_tuned_sam2_batched_100000.torch"
 PLM_CKPT_NAME = "fine_tuned_sam2_batched_plm_100000.torch"
-LORA_CKPT_NAME = None
 SQUARE_DIM = 1024
 logging.basicConfig(level=logging.INFO)
-# ----------------- Globals (Ram Cache) -----------------
 MODEL_CACHE = {
     "Stage 1": {"sam": None, "plm": None},
     "Stage 2": {"sam": None, "plm": None}
 }
-# ----------------- Helper: Download Logic -----------------
 def download_if_needed(repo_id, filename):
     try:
-        logging.info(f"Downloading {filename} from {repo_id}...")
         return hf_hub_download(repo_id=repo_id, filename=filename)
     except Exception as e:
         raise FileNotFoundError(f"Could not find {filename} in {repo_id}. Error: {e}")
-# ----------------- Overlay & Heatmap Helpers -----------------
-EDGE_COLORS_HEX = ["#3A86FF", "#FF006E", "#43AA8B", "#F3722C", "#8338EC", "#90BE6D"]
-def _hex_to_rgb(h: str):
-    h = h.lstrip("#")
-    return tuple(int(h[i : i + 2], 16) for i in (0, 2, 4))
-EDGE_COLORS = [_hex_to_rgb(h) for h in EDGE_COLORS_HEX]
 def stable_color(key: str):
     h = int(hashlib.sha256(str(key).encode("utf-8")).hexdigest(), 16)
-    return EDGE_COLORS[h % len(EDGE_COLORS)]
-def tint(rgb, amt: float = 0.1):
-    return tuple(int(255 - (255 - c) * (1 - amt)) for c in rgb)
 def make_overlay(rgb: np.ndarray, mask: np.ndarray, key: str = "mask") -> Image.Image:
-    base = Image.fromarray(rgb.astype(np.uint8)).convert("RGB")
-    base_rgba = base.convert("RGBA")
     mask_bool = mask > 0
     color = stable_color(key)
-    fill_rgb = tint(color, 0.1)
-    fill_layer = Image.new("RGBA", base.size, fill_rgb + (0,))
     fill_alpha = Image.fromarray((mask_bool.astype(np.uint8) * 178), "L")
     fill_layer.putalpha(fill_alpha)
@@ -78,146 +65,113 @@ def make_overlay(rgb: np.ndarray, mask: np.ndarray, key: str = "mask") -> Image.
     stroke = Image.new("RGBA", base.size, color + (0,))
     stroke.putalpha(edges)
-    out = Image.alpha_composite(base_rgba, fill_layer)
-    out = Image.alpha_composite(out, stroke)
-    return out.convert("RGB")
-# ----------------- Model Loading (CPU Caching) -----------------
 def ensure_models_loaded(stage):
     global MODEL_CACHE
-    if MODEL_CACHE[stage]["sam"] is not None:
-        return
     repo_id = REPO_MAP[stage]
-    logging.info(f"Loading {stage} models from {repo_id}...")
     base_path = download_if_needed(repo_id, BASE_CKPT_NAME)
     model = build_sam2(SAM2_CONFIG, base_path, device="cpu")
-    final_path = download_if_needed(repo_id, FINAL_CKPT_NAME)
-    sd = torch.load(final_path, map_location="cpu")
     model.load_state_dict(sd.get("model", sd), strict=True)
-    plm = PLMLanguageAdapter(
-        model_name="Qwen/Qwen2.5-VL-3B-Instruct",
-        transformer_dim=model.sam_mask_decoder.transformer_dim,
-        n_sparse_tokens=0, use_dense_bias=True, use_lora=True,
-        lora_r=16, lora_alpha=32, lora_dropout=0.05,
-        dtype=torch.bfloat16, device="cpu",
-    )
-    plm_path = download_if_needed(repo_id, PLM_CKPT_NAME)
-    plm_sd = torch.load(plm_path, map_location="cpu")
-    plm.load_state_dict(plm_sd["plm"], strict=True)
     plm.eval()
-    MODEL_CACHE[stage]["sam"] = model
-    MODEL_CACHE[stage]["plm"] = plm
-def _resize_pad_square(arr, max_dim):
-    h, w = arr.shape[:2]
-    scale = float(max_dim) / float(max(h, w))
-    nw, nh = max(1, int(round(w * scale))), max(1, int(round(h * scale)))
-    arr = cv2.resize(arr, (nw, nh), interpolation=cv2.INTER_LINEAR)
-    pad_w, pad_h = max_dim - nw, max_dim - nh
-    return cv2.copyMakeBorder(arr, pad_h//2, pad_h-pad_h//2, pad_w//2, pad_w-pad_w//2, cv2.BORDER_CONSTANT, value=0)
-# ----------------- Main Prediction -----------------
 @spaces.GPU(duration=120)
 def run_prediction(image_pil, text_prompt, threshold, stage_choice):
     if image_pil is None or not text_prompt:
-        return None, None
     ensure_models_loaded(stage_choice)
-    sam_model = MODEL_CACHE[stage_choice]["sam"]
-    plm_model = MODEL_CACHE[stage_choice]["plm"]
-    sam_model.to("cuda")
-    plm_model.to("cuda")
     try:
-        # 1. Use Inference Mode to avoid grad errors and save memory
         with torch.inference_mode():
             predictor = SAM2ImagePredictor(sam_model)
             rgb_orig = np.array(image_pil.convert("RGB"))
-            Hgt, Wgt = rgb_orig.shape[:2]
-            # Setup crop/padding metadata
-            scale = SQUARE_DIM / max(Hgt, Wgt)
-            nw, nh = int(Wgt * scale), int(Hgt * scale)
             top, left = (SQUARE_DIM - nh) // 2, (SQUARE_DIM - nw) // 2
-            rgb_sq = _resize_pad_square(rgb_orig, SQUARE_DIM)
             predictor.set_image(rgb_sq)
             image_emb = predictor._features["image_embed"][-1].unsqueeze(0)
             hi = [lvl[-1].unsqueeze(0) for lvl in predictor._features["high_res_feats"]]
-            # PLM Inference
-            temp_path = "temp_input.jpg"
             image_pil.save(temp_path)
             sp, dp = plm_model([text_prompt], image_emb.shape[2], image_emb.shape[3], [temp_path])
-            # SAM2 Decoding
-            dec = predictor.model.sam_mask_decoder
-            dev, dtype = next(dec.parameters()).device, next(dec.parameters()).dtype
-            low, scores, _, _ = dec(
-                image_embeddings=image_emb.to(dev, dtype),
-                image_pe=predictor.model.sam_prompt_encoder.get_dense_pe().to(dev, dtype),
-                sparse_prompt_embeddings=sp.to(dev, dtype),
-                dense_prompt_embeddings=dp.to(dev, dtype),
-                multimask_output=True, repeat_image=False,
-                high_res_features=[h.to(dev, dtype) for h in hi],
             )
-            # Postprocess to full image size
-            logits_sq = predictor._transforms.postprocess_masks(low, (SQUARE_DIM, SQUARE_DIM))
-            best_idx = scores.argmax(dim=1).item()
-            logit_crop = logits_sq[0, best_idx, top:top+nh, left:left+nw].unsqueeze(0).unsqueeze(0)
-            logit_full = F.interpolate(logit_crop, size=(Hgt, Wgt), mode="bilinear", align_corners=False)[0, 0]
-            # FIX: Detach and convert to float before moving to cpu/numpy
             prob = torch.sigmoid(logit_full).float().detach().cpu().numpy()
-        # 2. Visualizations
-        heatmap_cv = cv2.applyColorMap((prob * 255).astype(np.uint8), cv2.COLORMAP_JET)
-        heatmap_rgb = cv2.cvtColor(heatmap_cv, cv2.COLOR_BGR2RGB)
-        mask = (prob > threshold).astype(np.uint8) * 255
-        overlay_img = make_overlay(rgb_orig, mask, key=text_prompt)
-        return overlay_img, Image.fromarray(heatmap_rgb)
-    except Exception:
-        traceback.print_exc()
-        return None, None
     finally:
-        sam_model.to("cpu")
-        plm_model.to("cpu")
         torch.cuda.empty_cache()
 # ----------------- Gradio UI -----------------
-with gr.Blocks(title="SAM2 + PLM Multi-Stage") as demo:
-    gr.Markdown("# SAM2 + PLM Interactive Segmentation")
     with gr.Row():
         with gr.Column():
             input_image = gr.Image(type="pil", label="Input Image")
-            text_prompt = gr.Textbox(label="Text Prompt", placeholder="e.g., 'the surgical tool'")
             with gr.Row():
-                stage_select = gr.Radio(choices=["Stage 1", "Stage 2"], value="Stage 1", label="Model Stage")
-                threshold_slider = gr.Slider(0.0, 1.0, value=0.5, step=0.01, label="Confidence Threshold")
             run_btn = gr.Button("Run Inference", variant="primary")
         with gr.Column():
             out_overlay = gr.Image(label="Segmentation Overlay", type="pil")
-            out_heatmap = gr.Image(label="Pixel-wise Probability Heatmap", type="pil")
     run_btn.click(
         fn=run_prediction,
         inputs=[input_image, text_prompt, threshold_slider, stage_select],
-        outputs=[out_overlay, out_heatmap]
     )
 if __name__ == "__main__":

 BASE_CKPT_NAME = "sam2_hiera_large.pt"
 FINAL_CKPT_NAME = "fine_tuned_sam2_batched_100000.torch"
 PLM_CKPT_NAME = "fine_tuned_sam2_batched_plm_100000.torch"
 SQUARE_DIM = 1024
 logging.basicConfig(level=logging.INFO)
 MODEL_CACHE = {
     "Stage 1": {"sam": None, "plm": None},
     "Stage 2": {"sam": None, "plm": None}
 }
+# ----------------- Helper Functions -----------------
 def download_if_needed(repo_id, filename):
     try:
         return hf_hub_download(repo_id=repo_id, filename=filename)
     except Exception as e:
         raise FileNotFoundError(f"Could not find {filename} in {repo_id}. Error: {e}")
 def stable_color(key: str):
     h = int(hashlib.sha256(str(key).encode("utf-8")).hexdigest(), 16)
+    EDGE_COLORS_HEX = ["#3A86FF", "#FF006E", "#43AA8B", "#F3722C", "#8338EC", "#90BE6D"]
+    colors = [tuple(int(h.lstrip("#")[i:i+2], 16) for i in (0, 2, 4)) for h in EDGE_COLORS_HEX]
+    return colors[h % len(colors)]
 def make_overlay(rgb: np.ndarray, mask: np.ndarray, key: str = "mask") -> Image.Image:
+    base = Image.fromarray(rgb.astype(np.uint8)).convert("RGB").convert("RGBA")
     mask_bool = mask > 0
     color = stable_color(key)
+    fill_layer = Image.new("RGBA", base.size, color + (0,))
     fill_alpha = Image.fromarray((mask_bool.astype(np.uint8) * 178), "L")
     fill_layer.putalpha(fill_alpha)
     stroke = Image.new("RGBA", base.size, color + (0,))
     stroke.putalpha(edges)
+    return Image.alpha_composite(base, fill_layer).alpha_composite(stroke).convert("RGB")
 def ensure_models_loaded(stage):
     global MODEL_CACHE
+    if MODEL_CACHE[stage]["sam"] is not None: return
     repo_id = REPO_MAP[stage]
     base_path = download_if_needed(repo_id, BASE_CKPT_NAME)
     model = build_sam2(SAM2_CONFIG, base_path, device="cpu")
+    sd = torch.load(download_if_needed(repo_id, FINAL_CKPT_NAME), map_location="cpu")
     model.load_state_dict(sd.get("model", sd), strict=True)
+    plm = PLMLanguageAdapter(model_name="Qwen/Qwen2.5-VL-3B-Instruct", transformer_dim=model.sam_mask_decoder.transformer_dim, n_sparse_tokens=0, use_dense_bias=True, use_lora=True, lora_r=16, lora_alpha=32, lora_dropout=0.05, dtype=torch.bfloat16, device="cpu")
+    plm.load_state_dict(torch.load(download_if_needed(repo_id, PLM_CKPT_NAME), map_location="cpu")["plm"], strict=True)
     plm.eval()
+    MODEL_CACHE[stage]["sam"], MODEL_CACHE[stage]["plm"] = model, plm
+# ----------------- Core Logic -----------------
 @spaces.GPU(duration=120)
 def run_prediction(image_pil, text_prompt, threshold, stage_choice):
     if image_pil is None or not text_prompt:
+        return None, None, None
     ensure_models_loaded(stage_choice)
+    sam_model, plm_model = MODEL_CACHE[stage_choice]["sam"], MODEL_CACHE[stage_choice]["plm"]
+    sam_model.to("cuda"), plm_model.to("cuda")
     try:
         with torch.inference_mode():
             predictor = SAM2ImagePredictor(sam_model)
             rgb_orig = np.array(image_pil.convert("RGB"))
+            H, W = rgb_orig.shape[:2]
+            scale = SQUARE_DIM / max(H, W)
+            nw, nh = int(W * scale), int(H * scale)
             top, left = (SQUARE_DIM - nh) // 2, (SQUARE_DIM - nw) // 2
+            # Preprocess & Encode
+            rgb_sq = cv2.copyMakeBorder(cv2.resize(rgb_orig, (nw, nh)), top, SQUARE_DIM-nh-top, left, SQUARE_DIM-nw-left, cv2.BORDER_CONSTANT, value=0)
             predictor.set_image(rgb_sq)
             image_emb = predictor._features["image_embed"][-1].unsqueeze(0)
             hi = [lvl[-1].unsqueeze(0) for lvl in predictor._features["high_res_feats"]]
+            # PLM & SAM2 Decoder
+            temp_path = "temp.jpg"
             image_pil.save(temp_path)
             sp, dp = plm_model([text_prompt], image_emb.shape[2], image_emb.shape[3], [temp_path])
+            low, scores, _, _ = sam_model.sam_mask_decoder(
+                image_embeddings=image_emb.to("cuda"), image_pe=sam_model.sam_prompt_encoder.get_dense_pe().to("cuda"),
+                sparse_prompt_embeddings=sp.to("cuda"), dense_prompt_embeddings=dp.to("cuda"),
+                multimask_output=True, repeat_image=False, high_res_features=[h.to("cuda") for h in hi]
             )
+            # Postprocess to full size
+            logits = predictor._transforms.postprocess_masks(low, (SQUARE_DIM, SQUARE_DIM))
+            logit_crop = logits[0, scores.argmax().item(), top:top+nh, left:left+nw].unsqueeze(0).unsqueeze(0)
+            logit_full = F.interpolate(logit_crop, size=(H, W), mode="bilinear", align_corners=False)[0, 0]
             prob = torch.sigmoid(logit_full).float().detach().cpu().numpy()
+        # Initial visualization
+        heatmap = cv2.applyColorMap((prob * 255).astype(np.uint8), cv2.COLORMAP_JET)
+        overlay = make_overlay(rgb_orig, (prob > threshold).astype(np.uint8) * 255, key=text_prompt)
+        return overlay, Image.fromarray(cv2.cvtColor(heatmap, cv2.COLOR_BGR2RGB)), prob
     finally:
+        sam_model.to("cpu"), plm_model.to("cpu")
         torch.cuda.empty_cache()
+def update_threshold_ui(image_pil, text_prompt, threshold, cached_prob):
+    """Updates the overlay instantly without rerunning the GPU model."""
+    if image_pil is None or cached_prob is None:
+        return None
+    rgb_orig = np.array(image_pil.convert("RGB"))
+    mask = (cached_prob > threshold).astype(np.uint8) * 255
+    return make_overlay(rgb_orig, mask, key=text_prompt)
 # ----------------- Gradio UI -----------------
+with gr.Blocks(title="SAM2 + PLM Interactive") as demo:
+    prob_state = gr.State() # Caches the probability map
+    gr.Markdown("# SAM2 + PLM Segmentation\n*Change the model/prompt and click **Run Inference**. Then, adjust the **Threshold** slider for instant mask updates.*")
     with gr.Row():
         with gr.Column():
             input_image = gr.Image(type="pil", label="Input Image")
+            text_prompt = gr.Textbox(label="Text Prompt", placeholder="e.g., 'the blue scissors'")
             with gr.Row():
+                stage_select = gr.Radio(choices=["Stage 1", "Stage 2"], value="Stage 1", label="Model")
+                threshold_slider = gr.Slider(0.0, 1.0, value=0.5, step=0.01, label="Threshold")
             run_btn = gr.Button("Run Inference", variant="primary")
         with gr.Column():
             out_overlay = gr.Image(label="Segmentation Overlay", type="pil")
+            out_heatmap = gr.Image(label="Probability Heatmap", type="pil")
+    # 1. Clicking the button runs the heavy inference
     run_btn.click(
         fn=run_prediction,
         inputs=[input_image, text_prompt, threshold_slider, stage_select],
+        outputs=[out_overlay, out_heatmap, prob_state]
+    )
+    # 2. Moving the slider triggers only the lightweight update
+    threshold_slider.change(
+        fn=update_threshold_ui,
+        inputs=[input_image, text_prompt, threshold_slider, prob_state],
+        outputs=[out_overlay]
     )
 if __name__ == "__main__":