Spaces:

aadarsh99
/

ConverSeg

Running on Zero

App Files Files Community

aadarsh99 committed on Jan 6

Commit

aeaa431

1 Parent(s): edf7653

update app

Browse files

Files changed (1) hide show

app.py +107 -107

app.py CHANGED Viewed

@@ -45,7 +45,6 @@ def _hex_to_rgb(h: str):
 EDGE_COLORS = [_hex_to_rgb(h) for h in EDGE_COLORS_HEX]
 def stable_color(key: str):
-    # Use a fixed key if simple color is desired
     h = int(hashlib.sha256(str(key).encode("utf-8")).hexdigest(), 16)
     return EDGE_COLORS[h % len(EDGE_COLORS)]
@@ -197,34 +196,23 @@ def get_text_to_image_attention(decoder: MaskDecoder):
     text_attn = attn[..., n_output_tokens:, :]
     return text_attn
-def download_model_if_needed(filename):
-    """Checks local disk, else downloads from HF Hub."""
-    if os.path.exists(filename):
-        return filename
-    try:
-        print(f"Downloading {filename} from {HF_REPO_ID}...")
-        path = hf_hub_download(repo_id=HF_REPO_ID, filename=filename)
-        return path
-    except Exception as e:
-        print(f"Could not download {filename}. Ensure it exists locally or in the HF repo.")
-        # Fallback for Space: if files are uploaded directly to the Files tab,
-        # they are in the current working directory.
-        if os.path.exists(filename):
-            return filename
-        raise e
 def load_models():
-    print("Loading models...")
     # 1. Base SAM2 Model
-    base_ckpt_path = download_model_if_needed(BASE_CKPT_NAME)
-    model = build_sam2(SAM2_CONFIG, base_ckpt_path, device=DEVICE)
     predictor = SAM2ImagePredictor(model)
     predictor.model.eval()
     # 2. Fine-tuned Weights
-    final_ckpt_path = download_model_if_needed(FINAL_CKPT_NAME)
-    sd = torch.load(final_ckpt_path, map_location=DEVICE)
     predictor.model.load_state_dict(sd.get("model", sd), strict=True)
     # 3. PLM Adapter
@@ -239,30 +227,30 @@ def load_models():
         lora_alpha=32,
         lora_dropout=0.05,
         dtype=torch.bfloat16,
-        device=DEVICE,
-    ).to(DEVICE)
     plm.eval()
-    plm_ckpt_path = download_model_if_needed(PLM_CKPT_NAME)
-    plm_sd = torch.load(plm_ckpt_path, map_location=DEVICE)
     plm.load_state_dict(plm_sd["plm"], strict=True)
-    if LORA_CKPT_NAME:
-        lora_path = download_model_if_needed(LORA_CKPT_NAME)
-        plm.load_lora(lora_path)
-    print("Models loaded successfully.")
     return predictor, plm
-# Initialize global models
 try:
     PREDICTOR, PLM = load_models()
 except Exception as e:
     print(f"Error loading models: {e}")
-    print("Please check your checkpoint filenames and HF_REPO_ID in the script.")
     PREDICTOR, PLM = None, None
-@torch.no_grad()
 def run_prediction(image_pil, text_prompt):
     if PREDICTOR is None or PLM is None:
         return None, None, None
@@ -270,83 +258,95 @@ def run_prediction(image_pil, text_prompt):
     if image_pil is None or not text_prompt:
         return None, None, None
-    # Preprocess
-    rgb_orig = np.array(image_pil.convert("RGB"))
-    Hgt, Wgt = rgb_orig.shape[:2]
-    meta = _resize_pad_square_meta(Hgt, Wgt, SQUARE_DIM)
-    rgb_sq = _resize_pad_square(rgb_orig, SQUARE_DIM, is_mask=False)
-    PREDICTOR.set_image(rgb_sq)
-    image_emb = PREDICTOR._features["image_embed"][-1].unsqueeze(0)
-    hi = [lvl[-1].unsqueeze(0) for lvl in PREDICTOR._features["high_res_feats"]]
-    _, _, H_feat, W_feat = image_emb.shape
-    # PLM Inference
-    # Note: PLM expects a path list for 'images', but the Qwen adapter likely handles
-    # the internal logic. If your PLM adapter strictly requires disk paths,
-    # save 'image_pil' to a temp file here.
-    # Assuming PLM adapter needs a placeholder path or we save temp:
-    temp_path = "temp_input.jpg"
-    image_pil.save(temp_path)
-    sp, dp = PLM([text_prompt], H_feat, W_feat, [temp_path])
-    dec = PREDICTOR.model.sam_mask_decoder
-    dev, dtype = next(dec.parameters()).device, next(dec.parameters()).dtype
-    image_pe = PREDICTOR.model.sam_prompt_encoder.get_dense_pe().to(dev, dtype)
-    image_emb = image_emb.to(dev, dtype)
-    hi = [h.to(dev, dtype) for h in hi]
-    sp, dp = sp.to(dev, dtype), dp.to(dev, dtype)
-    # SAM2 Decoding
-    low, scores, _, _ = dec(
-        image_embeddings=image_emb,
-        image_pe=image_pe,
-        sparse_prompt_embeddings=sp,
-        dense_prompt_embeddings=dp,
-        multimask_output=True,
-        repeat_image=False,
-        high_res_features=hi,
-    )
-    logits_sq = PREDICTOR._transforms.postprocess_masks(low, (SQUARE_DIM, SQUARE_DIM))
-    best = scores.argmax(dim=1).item()
-    logit_sq = logits_sq[0, best]
-    logit_gt = _unpad_and_resize_pred_to_gt(logit_sq, meta, (Hgt, Wgt))
-    prob = torch.sigmoid(logit_gt)
-    mask = (prob > 0.5).cpu().numpy().astype(np.uint8) * 255
-    # Visualization: Overlay
-    overlay_img = make_overlay(rgb_orig, mask, key=text_prompt)
-    # Visualization: Attention
-    text_attn = get_text_to_image_attention(dec)
-    attn_overlay_img = None
-    if text_attn is not None:
-        L_layer, B, H_heads, N_text, N_img = text_attn.shape
-        attn_flat = text_attn.mean(dim=(0, 2, 3)) # Mean over layers, heads, text
-        global_flat = attn_flat[0]
-        a = global_flat.view(H_feat, W_feat)
-        # Upsample attention
-        a_sq = F.interpolate(
-            a.unsqueeze(0).unsqueeze(0),
-            size=(SQUARE_DIM, SQUARE_DIM),
-            mode="bilinear",
-            align_corners=False,
-        )[0, 0]
-        a_gt = _unpad_and_resize_pred_to_gt(a_sq, meta, (Hgt, Wgt))
-        global_attn_orig = a_gt.cpu().numpy()
-        attn_overlay_img = make_attn_overlay(rgb_orig, global_attn_orig)
-    # Return list of images for Gallery or individual blocks
-    # Mask as an image
-    mask_img = Image.fromarray(mask, mode="L")
-    return overlay_img, mask_img, attn_overlay_img
 # ----------------- Gradio UI -----------------

 EDGE_COLORS = [_hex_to_rgb(h) for h in EDGE_COLORS_HEX]
 def stable_color(key: str):
     h = int(hashlib.sha256(str(key).encode("utf-8")).hexdigest(), 16)
     return EDGE_COLORS[h % len(EDGE_COLORS)]
     text_attn = attn[..., n_output_tokens:, :]
     return text_attn
 def load_models():
+    print("Loading models on CPU...")
     # 1. Base SAM2 Model
+    # We assume files are present locally (uploaded via CLI or LFS)
+    if not os.path.exists(BASE_CKPT_NAME):
+        raise FileNotFoundError(f"{BASE_CKPT_NAME} not found")
+    model = build_sam2(SAM2_CONFIG, BASE_CKPT_NAME, device="cpu")
     predictor = SAM2ImagePredictor(model)
     predictor.model.eval()
     # 2. Fine-tuned Weights
+    if not os.path.exists(FINAL_CKPT_NAME):
+        raise FileNotFoundError(f"{FINAL_CKPT_NAME} not found")
+    sd = torch.load(FINAL_CKPT_NAME, map_location="cpu")
     predictor.model.load_state_dict(sd.get("model", sd), strict=True)
     # 3. PLM Adapter
         lora_alpha=32,
         lora_dropout=0.05,
         dtype=torch.bfloat16,
+        device="cpu",
+    ).to("cpu")
     plm.eval()
+    if not os.path.exists(PLM_CKPT_NAME):
+        raise FileNotFoundError(f"{PLM_CKPT_NAME} not found")
+    plm_sd = torch.load(PLM_CKPT_NAME, map_location="cpu")
     plm.load_state_dict(plm_sd["plm"], strict=True)
+    if LORA_CKPT_NAME and os.path.exists(LORA_CKPT_NAME):
+        plm.load_lora(LORA_CKPT_NAME)
+    print("Models loaded successfully (CPU).")
     return predictor, plm
+# Initialize global models on CPU
 try:
     PREDICTOR, PLM = load_models()
 except Exception as e:
     print(f"Error loading models: {e}")
     PREDICTOR, PLM = None, None
+@spaces.GPU # <--- REQUIRED FOR ZEROGPU
 def run_prediction(image_pil, text_prompt):
     if PREDICTOR is None or PLM is None:
         return None, None, None
     if image_pil is None or not text_prompt:
         return None, None, None
+    try:
+        # 1. Move models to GPU for this inference session
+        print("Moving models to CUDA...")
+        PREDICTOR.model.to("cuda")
+        PLM.to("cuda")
+        # 2. Preprocess
+        rgb_orig = np.array(image_pil.convert("RGB"))
+        Hgt, Wgt = rgb_orig.shape[:2]
+        meta = _resize_pad_square_meta(Hgt, Wgt, SQUARE_DIM)
+        rgb_sq = _resize_pad_square(rgb_orig, SQUARE_DIM, is_mask=False)
+        # 3. SAM2 Image Encoding
+        # set_image puts features on the model's device (now cuda)
+        PREDICTOR.set_image(rgb_sq)
+        image_emb = PREDICTOR._features["image_embed"][-1].unsqueeze(0)
+        hi = [lvl[-1].unsqueeze(0) for lvl in PREDICTOR._features["high_res_feats"]]
+        _, _, H_feat, W_feat = image_emb.shape
+        # 4. PLM Inference
+        temp_path = "temp_input.jpg"
+        image_pil.save(temp_path)
+        sp, dp = PLM([text_prompt], H_feat, W_feat, [temp_path])
+        # 5. Prepare SAM2 Decoder inputs (ensure they are on CUDA)
+        dec = PREDICTOR.model.sam_mask_decoder
+        dev = next(dec.parameters()).device # should be cuda now
+        dtype = next(dec.parameters()).dtype
+        image_pe = PREDICTOR.model.sam_prompt_encoder.get_dense_pe().to(dev, dtype)
+        image_emb = image_emb.to(dev, dtype)
+        hi = [h.to(dev, dtype) for h in hi]
+        sp, dp = sp.to(dev, dtype), dp.to(dev, dtype)
+        # 6. SAM2 Decoding
+        low, scores, _, _ = dec(
+            image_embeddings=image_emb,
+            image_pe=image_pe,
+            sparse_prompt_embeddings=sp,
+            dense_prompt_embeddings=dp,
+            multimask_output=True,
+            repeat_image=False,
+            high_res_features=hi,
+        )
+        logits_sq = PREDICTOR._transforms.postprocess_masks(low, (SQUARE_DIM, SQUARE_DIM))
+        best = scores.argmax(dim=1).item()
+        logit_sq = logits_sq[0, best]
+        logit_gt = _unpad_and_resize_pred_to_gt(logit_sq, meta, (Hgt, Wgt))
+        prob = torch.sigmoid(logit_gt)
+        mask = (prob > 0.5).cpu().numpy().astype(np.uint8) * 255
+        # 7. Visualization
+        overlay_img = make_overlay(rgb_orig, mask, key=text_prompt)
+        # Attention
+        text_attn = get_text_to_image_attention(dec)
+        attn_overlay_img = None
+        if text_attn is not None:
+            # Move attn back to CPU for numpy processing
+            text_attn = text_attn.cpu()
+            attn_flat = text_attn.mean(dim=(0, 2, 3))
+            global_flat = attn_flat[0]
+            a = global_flat.view(H_feat, W_feat)
+            a_sq = F.interpolate(
+                a.unsqueeze(0).unsqueeze(0),
+                size=(SQUARE_DIM, SQUARE_DIM),
+                mode="bilinear",
+                align_corners=False,
+            )[0, 0]
+            a_gt = _unpad_and_resize_pred_to_gt(a_sq, meta, (Hgt, Wgt))
+            global_attn_orig = a_gt.numpy()
+            attn_overlay_img = make_attn_overlay(rgb_orig, global_attn_orig)
+        mask_img = Image.fromarray(mask, mode="L")
+        return overlay_img, mask_img, attn_overlay_img
+    finally:
+        # Cleanup: Move models back to CPU to free GPU memory for other users/sessions
+        # This is courteous in ZeroGPU environment
+        print("Moving models back to CPU...")
+        PREDICTOR.model.to("cpu")
+        PLM.to("cpu")
 # ----------------- Gradio UI -----------------