Spaces:

aadarsh99
/

ConverSeg

Running on Zero

App Files Files Community

aadarsh99 committed on Jan 6

Commit

461d3a3

1 Parent(s): 39dd4a1

update app

Browse files

Files changed (1) hide show

app.py +48 -33

app.py CHANGED Viewed

@@ -197,27 +197,26 @@ def get_text_to_image_attention(decoder: MaskDecoder):
     text_attn = attn[..., n_output_tokens:, :]
     return text_attn
-def load_models():
     print("Loading models on CPU...")
-    # 1. Base SAM2 Model
-    # We assume files are present locally (uploaded via CLI or LFS)
     if not os.path.exists(BASE_CKPT_NAME):
         raise FileNotFoundError(f"{BASE_CKPT_NAME} not found")
     model = build_sam2(SAM2_CONFIG, BASE_CKPT_NAME, device="cpu")
-    predictor = SAM2ImagePredictor(model)
-    predictor.model.eval()
     # 2. Fine-tuned Weights
     if not os.path.exists(FINAL_CKPT_NAME):
         raise FileNotFoundError(f"{FINAL_CKPT_NAME} not found")
     sd = torch.load(FINAL_CKPT_NAME, map_location="cpu")
-    predictor.model.load_state_dict(sd.get("model", sd), strict=True)
     # 3. PLM Adapter
-    C = predictor.model.sam_mask_decoder.transformer_dim
     plm = PLMLanguageAdapter(
         model_name="Qwen/Qwen2.5-VL-3B-Instruct",
         transformer_dim=C,
@@ -242,59 +241,69 @@ def load_models():
         plm.load_lora(LORA_CKPT_NAME)
     print("Models loaded successfully (CPU).")
-    return predictor, plm
 # Initialize global models on CPU
 try:
-    PREDICTOR, PLM = load_models()
 except Exception as e:
     print(f"Error loading models: {e}")
-    PREDICTOR, PLM = None, None
-@spaces.GPU # <--- REQUIRED FOR ZEROGPU
 def run_prediction(image_pil, text_prompt):
-    if PREDICTOR is None or PLM is None:
         return None, None, None
     if image_pil is None or not text_prompt:
         return None, None, None
     try:
-        # 1. Move models to GPU for this inference session
         print("Moving models to CUDA...")
-        PREDICTOR.model.to("cuda")
         PLM.to("cuda")
-        # 2. Preprocess
         rgb_orig = np.array(image_pil.convert("RGB"))
         Hgt, Wgt = rgb_orig.shape[:2]
         meta = _resize_pad_square_meta(Hgt, Wgt, SQUARE_DIM)
         rgb_sq = _resize_pad_square(rgb_orig, SQUARE_DIM, is_mask=False)
-        # 3. SAM2 Image Encoding
-        # set_image puts features on the model's device (now cuda)
-        PREDICTOR.set_image(rgb_sq)
-        image_emb = PREDICTOR._features["image_embed"][-1].unsqueeze(0)
-        hi = [lvl[-1].unsqueeze(0) for lvl in PREDICTOR._features["high_res_feats"]]
         _, _, H_feat, W_feat = image_emb.shape
-        # 4. PLM Inference
         temp_path = "temp_input.jpg"
         image_pil.save(temp_path)
         sp, dp = PLM([text_prompt], H_feat, W_feat, [temp_path])
-        # 5. Prepare SAM2 Decoder inputs (ensure they are on CUDA)
-        dec = PREDICTOR.model.sam_mask_decoder
-        dev = next(dec.parameters()).device # should be cuda now
         dtype = next(dec.parameters()).dtype
-        image_pe = PREDICTOR.model.sam_prompt_encoder.get_dense_pe().to(dev, dtype)
         image_emb = image_emb.to(dev, dtype)
         hi = [h.to(dev, dtype) for h in hi]
         sp, dp = sp.to(dev, dtype), dp.to(dev, dtype)
-        # 6. SAM2 Decoding
         low, scores, _, _ = dec(
             image_embeddings=image_emb,
             image_pe=image_pe,
@@ -305,7 +314,7 @@ def run_prediction(image_pil, text_prompt):
             high_res_features=hi,
         )
-        logits_sq = PREDICTOR._transforms.postprocess_masks(low, (SQUARE_DIM, SQUARE_DIM))
         best = scores.argmax(dim=1).item()
         logit_sq = logits_sq[0, best]
         logit_gt = _unpad_and_resize_pred_to_gt(logit_sq, meta, (Hgt, Wgt))
@@ -313,7 +322,7 @@ def run_prediction(image_pil, text_prompt):
         prob = torch.sigmoid(logit_gt)
         mask = (prob > 0.5).cpu().numpy().astype(np.uint8) * 255
-        # 7. Visualization
         overlay_img = make_overlay(rgb_orig, mask, key=text_prompt)
         # Attention
@@ -321,7 +330,6 @@ def run_prediction(image_pil, text_prompt):
         attn_overlay_img = None
         if text_attn is not None:
-            # Move attn back to CPU for numpy processing
             text_attn = text_attn.cpu()
             attn_flat = text_attn.mean(dim=(0, 2, 3))
             global_flat = attn_flat[0]
@@ -341,13 +349,20 @@ def run_prediction(image_pil, text_prompt):
         mask_img = Image.fromarray(mask, mode="L")
         return overlay_img, mask_img, attn_overlay_img
     finally:
-        # Cleanup: Move models back to CPU to free GPU memory for other users/sessions
-        # This is courteous in ZeroGPU environment
         print("Moving models back to CPU...")
-        PREDICTOR.model.to("cpu")
         PLM.to("cpu")
 # ----------------- Gradio UI -----------------

     text_attn = attn[..., n_output_tokens:, :]
     return text_attn
+def load_models_cpu():
     print("Loading models on CPU...")
+    # 1. Base SAM2 Model (Raw Model, not Predictor)
     if not os.path.exists(BASE_CKPT_NAME):
         raise FileNotFoundError(f"{BASE_CKPT_NAME} not found")
     model = build_sam2(SAM2_CONFIG, BASE_CKPT_NAME, device="cpu")
     # 2. Fine-tuned Weights
     if not os.path.exists(FINAL_CKPT_NAME):
         raise FileNotFoundError(f"{FINAL_CKPT_NAME} not found")
     sd = torch.load(FINAL_CKPT_NAME, map_location="cpu")
+    # Load into the model directly
+    model.load_state_dict(sd.get("model", sd), strict=True)
+    model.eval()
     # 3. PLM Adapter
+    C = model.sam_mask_decoder.transformer_dim
     plm = PLMLanguageAdapter(
         model_name="Qwen/Qwen2.5-VL-3B-Instruct",
         transformer_dim=C,
         plm.load_lora(LORA_CKPT_NAME)
     print("Models loaded successfully (CPU).")
+    return model, plm
 # Initialize global models on CPU
 try:
+    # NOTE: We hold the raw MODEL_SAM here, not the predictor
+    MODEL_SAM, PLM = load_models_cpu()
 except Exception as e:
     print(f"Error loading models: {e}")
+    traceback.print_exc()
+    MODEL_SAM, PLM = None, None
+@spaces.GPU(duration=60) # Explicitly request a 60s GPU slot (the usual default); raise this if inference needs longer
 def run_prediction(image_pil, text_prompt):
+    if MODEL_SAM is None or PLM is None:
         return None, None, None
     if image_pil is None or not text_prompt:
         return None, None, None
+    predictor = None
     try:
+        # 1. Move models to GPU
         print("Moving models to CUDA...")
+        MODEL_SAM.to("cuda")
         PLM.to("cuda")
+        # 2. Instantiate Predictor ON GPU (Crucial Fix)
+        # This ensures the predictor knows it's on CUDA
+        predictor = SAM2ImagePredictor(MODEL_SAM)
+        # 3. Preprocess Image
         rgb_orig = np.array(image_pil.convert("RGB"))
         Hgt, Wgt = rgb_orig.shape[:2]
         meta = _resize_pad_square_meta(Hgt, Wgt, SQUARE_DIM)
         rgb_sq = _resize_pad_square(rgb_orig, SQUARE_DIM, is_mask=False)
+        # 4. SAM2 Image Encoding
+        # set_image puts features on the model's device
+        predictor.set_image(rgb_sq)
+        image_emb = predictor._features["image_embed"][-1].unsqueeze(0)
+        hi = [lvl[-1].unsqueeze(0) for lvl in predictor._features["high_res_feats"]]
         _, _, H_feat, W_feat = image_emb.shape
+        # 5. PLM Inference
         temp_path = "temp_input.jpg"
         image_pil.save(temp_path)
+        # PLM is expected to place its own inputs on its device; we pass only the
+        # prompt, the feature-map size, and the saved image path.
         sp, dp = PLM([text_prompt], H_feat, W_feat, [temp_path])
+        # 6. Prepare SAM2 Decoder inputs (ensure they are on CUDA)
+        dec = predictor.model.sam_mask_decoder
+        dev = next(dec.parameters()).device
         dtype = next(dec.parameters()).dtype
+        image_pe = predictor.model.sam_prompt_encoder.get_dense_pe().to(dev, dtype)
         image_emb = image_emb.to(dev, dtype)
         hi = [h.to(dev, dtype) for h in hi]
         sp, dp = sp.to(dev, dtype), dp.to(dev, dtype)
+        # 7. SAM2 Decoding
         low, scores, _, _ = dec(
             image_embeddings=image_emb,
             image_pe=image_pe,
             high_res_features=hi,
         )
+        logits_sq = predictor._transforms.postprocess_masks(low, (SQUARE_DIM, SQUARE_DIM))
         best = scores.argmax(dim=1).item()
         logit_sq = logits_sq[0, best]
         logit_gt = _unpad_and_resize_pred_to_gt(logit_sq, meta, (Hgt, Wgt))
         prob = torch.sigmoid(logit_gt)
         mask = (prob > 0.5).cpu().numpy().astype(np.uint8) * 255
+        # 8. Visualization
         overlay_img = make_overlay(rgb_orig, mask, key=text_prompt)
         # Attention
         attn_overlay_img = None
         if text_attn is not None:
             text_attn = text_attn.cpu()
             attn_flat = text_attn.mean(dim=(0, 2, 3))
             global_flat = attn_flat[0]
         mask_img = Image.fromarray(mask, mode="L")
         return overlay_img, mask_img, attn_overlay_img
+    except Exception as e:
+        print("An error occurred during inference:")
+        traceback.print_exc()
+        raise e # Let Gradio show the error
     finally:
+        # Cleanup: Move models back to CPU
         print("Moving models back to CPU...")
+        MODEL_SAM.to("cpu")
         PLM.to("cpu")
+        if predictor:
+            del predictor
+        torch.cuda.empty_cache()
 # ----------------- Gradio UI -----------------