Spaces:

aadarsh99
/

ConverSeg

Running on Zero

App Files Files Community

aadarsh99 committed on Jan 5

Commit

97c4d82

1 Parent(s): 07bd6be

added app.py

Browse files

Files changed (1) hide show

app.py +376 -0

app.py ADDED Viewed

	@@ -0,0 +1,376 @@

+import os
+import logging
+import hashlib
+import sys
+import cv2
+import numpy as np
+import torch
+import torch.nn.functional as F
+import gradio as gr
+from PIL import Image, ImageFilter, ImageChops, ImageDraw
+from huggingface_hub import hf_hub_download
+# --- IMPORT YOUR CUSTOM MODULES ---
+# Ensure the 'sam2' folder and 'plm_adapter_...' file are uploaded to your Space
+from sam2.build_sam import build_sam2
+from sam2.sam2_image_predictor import SAM2ImagePredictor
+from sam2.modeling.sam.mask_decoder import MaskDecoder
+from plm_adapter_lora_with_image_input_only_text_positions import PLMLanguageAdapter
# ----------------- Configuration -----------------
# Hub repository that download_model_if_needed() falls back to when a
# checkpoint is not found on local disk. Replace the placeholder with your repo.
HF_REPO_ID = "your-username/your-model-repo"
# Config name handed to build_sam2().
SAM2_CONFIG = "sam2_hiera_l.yaml"

# Checkpoint filenames (looked up locally first, otherwise downloaded).
BASE_CKPT_NAME = "sam2_hiera_large.pt"
FINAL_CKPT_NAME = "fine_tuned_sam2_batched_100000.torch"  # update with your filename
PLM_CKPT_NAME = "fine_tuned_sam2_batched_plm_100000.torch"  # update with your filename
LORA_CKPT_NAME = "lora_plm_adapter_100000"  # set to None when no LoRA weights are used

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Side length of the square canvas that input images are resized and padded to.
SQUARE_DIM = 1024

logging.basicConfig(level=logging.INFO)
+# ----------------- Overlay Style Helpers -----------------
+EDGE_COLORS_HEX = ["#3A86FF", "#FF006E", "#43AA8B", "#F3722C", "#8338EC", "#90BE6D"]
+def _hex_to_rgb(h: str):
+    h = h.lstrip("#")
+    return tuple(int(h[i : i + 2], 16) for i in (0, 2, 4))
+EDGE_COLORS = [_hex_to_rgb(h) for h in EDGE_COLORS_HEX]
def stable_color(key: str):
    """Pick a palette colour that is always the same for the same ``key``.

    The key is stringified, hashed with SHA-256, and the digest (read as one
    big-endian integer) is reduced modulo the palette size. Pass a constant
    key if a single fixed colour is wanted.
    """
    digest = hashlib.sha256(str(key).encode("utf-8")).digest()
    slot = int.from_bytes(digest, "big") % len(EDGE_COLORS)
    return EDGE_COLORS[slot]
def tint(rgb, amt: float = 0.1):
    """Lighten a colour by moving every channel toward 255.

    Args:
        rgb: Iterable of channel values.
        amt: Fraction of the distance to 255 to cover; 0 keeps the colour,
            1 yields white.

    Returns:
        Tuple of ints, one per input channel (fractions are truncated).
    """
    keep = 1 - amt
    return tuple(int(255 - (255 - channel) * keep) for channel in rgb)
def edge_map(mask_bool: np.ndarray, width_px: int = 2) -> Image.Image:
    """Return an ``L``-mode image that is 255 on the mask outline and 0 elsewhere.

    The outline is the difference between a 3x3 max-filtered and a 3x3
    min-filtered copy of the mask. Each unit of ``width_px`` above 1 applies
    one further 3x3 max filter to thicken the line.
    """
    binary = Image.fromarray((mask_bool.astype(np.uint8) * 255), "L")
    grown = binary.filter(ImageFilter.MaxFilter(3))
    shrunk = binary.filter(ImageFilter.MinFilter(3))
    outline = ImageChops.difference(grown, shrunk)
    extra_passes = max(0, width_px - 1)
    for _ in range(extra_passes):
        outline = outline.filter(ImageFilter.MaxFilter(3))
    # Force a strictly two-valued result.
    return outline.point(lambda value: 255 if value > 0 else 0)
def _apply_rounded_corners(img_rgb: Image.Image, radius: int) -> Image.Image:
    """Return an RGB copy of ``img_rgb`` with its corners rounded onto white.

    Args:
        img_rgb: Image to round; it is not modified.
        radius: Corner radius in pixels.

    Returns:
        A new RGB image of the same size. Pixels outside the rounded
        rectangle are white.
    """
    w, h = img_rgb.size
    # 255 inside the rounded rectangle, 0 in the cut-off corners.
    mask = Image.new("L", (w, h), 0)
    ImageDraw.Draw(mask).rounded_rectangle([0, 0, w - 1, h - 1], radius=radius, fill=255)
    bg = Image.new("RGB", (w, h), "white")
    # The mask is applied by paste() itself. The previous version also wrote it
    # into an RGBA copy and then dropped that alpha again via convert("RGB"),
    # which had no effect on the result.
    bg.paste(img_rgb.convert("RGB"), (0, 0), mask)
    return bg
def make_overlay(rgb: np.ndarray, mask: np.ndarray, key: str = "mask") -> Image.Image:
    """Draw ``mask`` over ``rgb`` as a translucent fill with a solid outline.

    Args:
        rgb: Image array; resized (bicubic) to the mask's size when they differ.
        mask: Array whose positive entries mark the segmented region.
        key: Value passed to ``stable_color`` to choose the overlay colour.

    Returns:
        RGB image with rounded corners.
    """
    photo = Image.fromarray(rgb.astype(np.uint8)).convert("RGB")
    mask_h, mask_w = mask.shape[:2]
    if photo.size != (mask_w, mask_h):
        photo = photo.resize((mask_w, mask_h), Image.BICUBIC)
    canvas = photo.convert("RGBA")
    inside = mask > 0

    outline_rgb = stable_color(key)
    fill_rgb = tint(outline_rgb, 0.1)
    fill_opacity = int(round(0.7 * 255))

    # Tinted fill: visible only inside the mask, at partial opacity.
    fill = Image.new("RGBA", canvas.size, fill_rgb + (0,))
    fill.putalpha(Image.fromarray((inside.astype(np.uint8) * fill_opacity), "L"))

    # Outline: fully opaque along the mask border.
    outline = Image.new("RGBA", canvas.size, outline_rgb + (0,))
    outline.putalpha(edge_map(inside, width_px=2))

    composed = Image.alpha_composite(canvas, fill)
    composed = Image.alpha_composite(composed, outline).convert("RGB")
    corner_radius = max(12, int(0.06 * min(composed.size)))
    return _apply_rounded_corners(composed, corner_radius)
def make_attn_overlay(rgb: np.ndarray, attn: np.ndarray, alpha: float = 0.6) -> Image.Image:
    """Blend a JET-coloured heatmap of ``attn`` over ``rgb``.

    Args:
        rgb: Image array; its first two dimensions give the output size.
        attn: 2-D attention map; resized (bilinear) to the image size if needed.
        alpha: Weight of the heatmap in the blend; the image gets ``1 - alpha``.

    Returns:
        RGB image of the blended result.
    """
    img_h, img_w = rgb.shape[:2]
    attn_h, attn_w = attn.shape
    heat = attn.astype(np.float32)
    if (attn_h, attn_w) != (img_h, img_w):
        heat = cv2.resize(heat, (img_w, img_h), interpolation=cv2.INTER_LINEAR)

    # Min-max normalise to 0..255; the floor on the divisor guards a flat map.
    heat = heat - heat.min()
    peak = heat.max()
    divisor = 1e-6 if peak < 1e-6 else peak
    heat_u8 = (heat / divisor * 255.0).clip(0, 255).astype(np.uint8)

    # applyColorMap yields BGR, so convert before mixing with the RGB image.
    colored = cv2.cvtColor(cv2.applyColorMap(heat_u8, cv2.COLORMAP_JET), cv2.COLOR_BGR2RGB)
    mixed = (1.0 - alpha) * rgb.astype(np.float32) + alpha * colored.astype(np.float32)
    return Image.fromarray(mixed.clip(0, 255).astype(np.uint8), mode="RGB")
# ----------------- Image Processing Helpers -----------------
def _resize_pad_square(arr: np.ndarray, max_dim: int, *, is_mask: bool) -> np.ndarray:
    """Resize ``arr`` so its longer side equals ``max_dim``, then pad to a square.

    The geometry comes from ``_resize_pad_square_meta`` so that this function
    and the code that later removes the padding always agree on the offsets.

    Args:
        arr: Image or mask array; the first two dimensions are height and width.
        max_dim: Side length of the square output.
        is_mask: Use nearest-neighbour resizing and a scalar zero border for
            masks. Images use area interpolation when shrinking and linear
            interpolation otherwise, with a black border.

    Returns:
        A C-contiguous ``max_dim`` x ``max_dim`` array with the content centred.
    """
    geom = _resize_pad_square_meta(arr.shape[0], arr.shape[1], max_dim)
    if is_mask:
        interp = cv2.INTER_NEAREST
    else:
        interp = cv2.INTER_AREA if geom["scale"] < 1.0 else cv2.INTER_LINEAR
    resized = cv2.resize(arr, (geom["new_w"], geom["new_h"]), interpolation=interp)
    padded = cv2.copyMakeBorder(
        resized,
        geom["top"],
        geom["bottom"],
        geom["left"],
        geom["right"],
        borderType=cv2.BORDER_CONSTANT,
        value=0 if is_mask else (0, 0, 0),
    )
    return np.ascontiguousarray(padded)
+def _resize_pad_square_meta(h: int, w: int, max_dim: int):
+    scale = float(max_dim) / float(max(h, w))
+    new_w = max(1, int(round(w * scale)))
+    new_h = max(1, int(round(h * scale)))
+    pad_w = max_dim - new_w
+    pad_h = max_dim - new_h
+    left = pad_w // 2
+    right = pad_w - left
+    top = pad_h // 2
+    bottom = pad_h - top
+    return {
+        "scale": scale, "new_w": new_w, "new_h": new_h,
+        "left": left, "right": right, "top": top, "bottom": bottom,
+    }
+def _unpad_and_resize_pred_to_gt(logit_sq: torch.Tensor, meta: dict, out_hw: tuple[int, int]) -> torch.Tensor:
+    top, left = meta["top"], meta["left"]
+    nh, nw = meta["new_h"], meta["new_w"]
+    crop = logit_sq[top : top + nh, left : left + nw]
+    crop = crop.unsqueeze(0).unsqueeze(0)
+    up = F.interpolate(crop, size=out_hw, mode="bilinear", align_corners=False)
+    return up[0, 0]
# ----------------- Model Logic -----------------
def get_text_to_image_attention(decoder: MaskDecoder):
    """Collect the token-to-image attention the decoder recorded for prompt tokens.

    Reads ``last_attn`` from every layer's ``cross_attn_token_to_image`` and
    from ``final_attn_token_to_image``, skipping entries that are ``None``.

    Returns:
        The recorded maps stacked along a new leading dimension, with the
        decoder's own output tokens (optional object-score token, one further
        token, and ``num_mask_tokens`` mask tokens) sliced off the token axis;
        or ``None`` when nothing was recorded.
    """
    transformer = decoder.transformer
    attn_modules = [layer.cross_attn_token_to_image for layer in transformer.layers]
    attn_modules.append(transformer.final_attn_token_to_image)

    recorded = []
    for module in attn_modules:
        attn = module.last_attn
        if attn is not None:
            recorded.append(attn)
    if not recorded:
        return None

    # Everything from this index onward on the token axis is a prompt token.
    obj_score_tokens = 1 if decoder.pred_obj_scores else 0
    first_prompt_token = obj_score_tokens + 1 + decoder.num_mask_tokens
    return torch.stack(recorded, dim=0)[..., first_prompt_token:, :]
def download_model_if_needed(filename):
    """Return a local path for ``filename``, downloading it from the Hub if absent.

    Args:
        filename: Checkpoint path relative to the working directory; the same
            string is used as the filename inside ``HF_REPO_ID``.

    Returns:
        ``filename`` itself when it exists locally, otherwise the path
        returned by ``hf_hub_download``.

    Raises:
        Exception: whatever ``hf_hub_download`` raised, re-raised unchanged
            when the file is still not available locally afterwards.
    """
    if os.path.exists(filename):
        return filename
    logging.info("Downloading %s from %s...", filename, HF_REPO_ID)
    try:
        return hf_hub_download(repo_id=HF_REPO_ID, filename=filename)
    except Exception as exc:
        logging.warning(
            "Could not download %s (%s). Ensure it exists locally or in the HF repo.",
            filename,
            exc,
        )
        # Fallback for Spaces: files uploaded through the Files tab live in
        # the current working directory, so check once more before giving up.
        if os.path.exists(filename):
            return filename
        # Bare raise keeps the original traceback intact.
        raise
def load_models():
    """Build the SAM2 predictor and the PLM language adapter and load their weights.

    Steps: build the base SAM2 model, overwrite its weights with the
    fine-tuned checkpoint, construct the PLM adapter, load its state dict,
    and load LoRA weights when ``LORA_CKPT_NAME`` is set.

    Returns:
        ``(predictor, plm)``, both in eval mode on ``DEVICE``.

    Raises:
        Exception: anything raised while locating, downloading or loading a
            checkpoint (including key mismatches, since loading is strict).
    """
    logging.info("Loading models...")

    # 1. Base SAM2 model.
    base_ckpt_path = download_model_if_needed(BASE_CKPT_NAME)
    model = build_sam2(SAM2_CONFIG, base_ckpt_path, device=DEVICE)
    predictor = SAM2ImagePredictor(model)
    predictor.model.eval()

    # 2. Fine-tuned weights.
    # NOTE(security): torch.load unpickles the file; only load checkpoints
    # from a source you trust. Consider weights_only=True if the checkpoints
    # contain nothing but tensors - TODO confirm against the actual files.
    final_ckpt_path = download_model_if_needed(FINAL_CKPT_NAME)
    sd = torch.load(final_ckpt_path, map_location=DEVICE)
    # Accept both {"model": state_dict} and a bare state dict.
    predictor.model.load_state_dict(sd.get("model", sd), strict=True)

    # 3. PLM adapter, sized to the mask decoder's transformer width.
    C = predictor.model.sam_mask_decoder.transformer_dim
    plm = PLMLanguageAdapter(
        model_name="Qwen/Qwen2.5-VL-3B-Instruct",
        transformer_dim=C,
        n_sparse_tokens=0,
        use_dense_bias=True,
        use_lora=True,
        lora_r=16,
        lora_alpha=32,
        lora_dropout=0.05,
        dtype=torch.bfloat16,
        device=DEVICE,
    ).to(DEVICE)
    plm.eval()

    plm_ckpt_path = download_model_if_needed(PLM_CKPT_NAME)
    # NOTE(security): same pickle caveat as above.
    plm_sd = torch.load(plm_ckpt_path, map_location=DEVICE)
    plm.load_state_dict(plm_sd["plm"], strict=True)

    if LORA_CKPT_NAME:
        lora_path = download_model_if_needed(LORA_CKPT_NAME)
        plm.load_lora(lora_path)

    logging.info("Models loaded successfully.")
    return predictor, plm
# Initialize global models. Loading is best-effort: on failure the app still
# starts, and run_prediction() returns empty outputs while both globals are None.
try:
    PREDICTOR, PLM = load_models()
except Exception:
    # logging.exception records the full traceback, not just the message.
    logging.exception("Error loading models")
    logging.error("Please check your checkpoint filenames and HF_REPO_ID in the script.")
    PREDICTOR, PLM = None, None
@torch.no_grad()
def run_prediction(image_pil, text_prompt):
    """Segment ``image_pil`` according to ``text_prompt``.

    Args:
        image_pil: PIL image from the UI, or ``None``.
        text_prompt: Text describing what to segment; empty or ``None``
            short-circuits.

    Returns:
        ``(overlay, mask, attention_overlay)`` as PIL images at the input
        image's resolution. All three are ``None`` when the models failed to
        load or an input is missing. ``attention_overlay`` alone is ``None``
        when the decoder recorded no attention.
    """
    # Function-scope import so this block needs no new file-level import.
    import tempfile

    if PREDICTOR is None or PLM is None:
        return None, None, None
    if image_pil is None or not text_prompt:
        return None, None, None

    # Preprocess: resize and pad to the square the image encoder is fed with.
    image_rgb = image_pil.convert("RGB")
    rgb_orig = np.array(image_rgb)
    Hgt, Wgt = rgb_orig.shape[:2]
    meta = _resize_pad_square_meta(Hgt, Wgt, SQUARE_DIM)
    rgb_sq = _resize_pad_square(rgb_orig, SQUARE_DIM, is_mask=False)

    PREDICTOR.set_image(rgb_sq)
    image_emb = PREDICTOR._features["image_embed"][-1].unsqueeze(0)
    hi = [lvl[-1].unsqueeze(0) for lvl in PREDICTOR._features["high_res_feats"]]
    _, _, H_feat, W_feat = image_emb.shape

    # PLM inference. The adapter is given a list of image paths, so the image
    # is written to disk first. A per-call temporary directory keeps
    # concurrent requests from overwriting each other's file and removes it
    # afterwards. The RGB copy is saved because JPEG cannot store modes such
    # as RGBA.
    # NOTE(review): assumes the adapter reads the file during the call and
    # does not need it afterwards - confirm against PLMLanguageAdapter.
    with tempfile.TemporaryDirectory() as tmp_dir:
        temp_path = os.path.join(tmp_dir, "input.jpg")
        image_rgb.save(temp_path)
        sp, dp = PLM([text_prompt], H_feat, W_feat, [temp_path])

    # Move every decoder input to the decoder's device and dtype.
    dec = PREDICTOR.model.sam_mask_decoder
    dec_param = next(dec.parameters())
    dev, dtype = dec_param.device, dec_param.dtype
    image_pe = PREDICTOR.model.sam_prompt_encoder.get_dense_pe().to(dev, dtype)
    image_emb = image_emb.to(dev, dtype)
    hi = [h.to(dev, dtype) for h in hi]
    sp, dp = sp.to(dev, dtype), dp.to(dev, dtype)

    # SAM2 decoding.
    low, scores, _, _ = dec(
        image_embeddings=image_emb,
        image_pe=image_pe,
        sparse_prompt_embeddings=sp,
        dense_prompt_embeddings=dp,
        multimask_output=True,
        repeat_image=False,
        high_res_features=hi,
    )
    logits_sq = PREDICTOR._transforms.postprocess_masks(low, (SQUARE_DIM, SQUARE_DIM))

    # Keep the highest-scoring candidate and map it back to the input size.
    best = scores.argmax(dim=1).item()
    logit_gt = _unpad_and_resize_pred_to_gt(logits_sq[0, best], meta, (Hgt, Wgt))
    prob = torch.sigmoid(logit_gt)
    mask = (prob > 0.5).cpu().numpy().astype(np.uint8) * 255

    # Visualization: overlay.
    overlay_img = make_overlay(rgb_orig, mask, key=text_prompt)

    # Visualization: attention.
    attn_overlay_img = None
    text_attn = get_text_to_image_attention(dec)
    if text_attn is not None:
        # Mean over layers, heads and text tokens; then take batch item 0.
        global_flat = text_attn.mean(dim=(0, 2, 3))[0]
        a = global_flat.view(H_feat, W_feat)
        # Upsample to the padded square, then undo the padding.
        a_sq = F.interpolate(
            a.unsqueeze(0).unsqueeze(0),
            size=(SQUARE_DIM, SQUARE_DIM),
            mode="bilinear",
            align_corners=False,
        )[0, 0]
        a_gt = _unpad_and_resize_pred_to_gt(a_sq, meta, (Hgt, Wgt))
        # Cast to float32 first: NumPy has no equivalent for reduced-precision
        # dtypes such as bfloat16.
        global_attn_orig = a_gt.float().cpu().numpy()
        attn_overlay_img = make_attn_overlay(rgb_orig, global_attn_orig)

    mask_img = Image.fromarray(mask, mode="L")
    return overlay_img, mask_img, attn_overlay_img
# ----------------- Gradio UI -----------------
# Layout: inputs (image, prompt, button) on the left; the overlay plus a row
# holding the binary mask and the attention heatmap on the right.
demo = gr.Blocks(title="SAM2 + PLM Interactive Segmentation")
with demo:
    gr.Markdown("# SAM2 + PLM Interactive Segmentation")
    gr.Markdown("Enter a text prompt to segment objects in the image.")

    with gr.Row():
        with gr.Column():
            input_image = gr.Image(type="pil", label="Input Image")
            text_prompt = gr.Textbox(label="Text Prompt", placeholder="e.g., 'the red car'")
            run_btn = gr.Button("Segment", variant="primary")

        with gr.Column():
            out_overlay = gr.Image(label="Segmentation Overlay", type="pil")
            with gr.Row():
                out_mask = gr.Image(label="Binary Mask", type="pil")
                out_attn = gr.Image(label="Attention Heatmap", type="pil")

    # The three outputs map, in order, onto run_prediction's return tuple.
    run_btn.click(
        fn=run_prediction,
        inputs=[input_image, text_prompt],
        outputs=[out_overlay, out_mask, out_attn],
    )

if __name__ == "__main__":
    demo.launch()