jerpelhan committed
Commit 0e137ec · 1 Parent(s): dcddf2d

Updated demo, added AMP for faster inference, added examples
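A note on the AMP change: the demo now runs the forward pass under `torch.inference_mode()` with `torch.autocast` enabled only on CUDA, so CPU Spaces take the same code path unchanged. A minimal sketch of the pattern, with an illustrative `run_inference` helper that is not part of this repo:

```python
import torch

@torch.inference_mode()  # drops autograd bookkeeping entirely during inference
def run_inference(model: torch.nn.Module, batch: torch.Tensor) -> torch.Tensor:
    use_amp = (batch.device.type == "cuda")
    # autocast runs eligible ops in float16 on CUDA; with enabled=False it is a no-op
    with torch.autocast(device_type="cuda", dtype=torch.float16, enabled=use_amp):
        return model(batch)
```

Gating on `device.type` keeps the context manager harmless when the Space is scheduled on CPU hardware.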
demo_gradio.py CHANGED
@@ -1,8 +1,8 @@
 import spaces
 import torch
+import torch.nn.functional as F
 import gradio as gr
 from gradio_image_annotation import image_annotator
-from torch.nn import DataParallel
 from models.counter_infer import build_model
 from utils.arg_parser import get_argparser
 from utils.data import resize_and_pad
@@ -13,55 +13,11 @@ from huggingface_hub import hf_hub_download
 import numpy as np
 import colorsys
 
-
-# -----------------------------
-# Minimal UI + force "Create" mode (press C a few times)
 # -----------------------------
-JS_FORCE_CREATE_MODE = r"""
-function () {
-    const pressC = () => {
-        const ev = new KeyboardEvent("keydown", {
-            key: "c",
-            code: "KeyC",
-            bubbles: true
-        });
-        document.dispatchEvent(ev);
-    };
-
-    let tries = 0;
-    const t = setInterval(() => {
-        tries++;
-        pressC();
-        if (tries > 20) clearInterval(t);
-    }, 200);
-}
-"""
-
-CSS_MINIMAL_UI = """
-/* Hide labels, instructions, help text */
-.gradio-container label,
-.gradio-container .block-label,
-.gradio-container .markdown,
-.gradio-container p {
-    display: none !important;
-}
-
-/* Reduce rounding of UI containers */
-.gradio-container [class*="rounded"] {
-    border-radius: 4px !important;
-}
-
-/* Reduce padding */
-.gradio-container [class*="p-4"] {
-    padding: 0.25rem !important;
-}
-"""
-
-
 _MODEL = None
 _ARGS = None
 _WEIGHTS_PATH = None
-
+# -----------------------------
 
 def _get_args():
     global _ARGS
@@ -83,6 +39,36 @@ def _get_weights_path():
     return _WEIGHTS_PATH
 
 
+def _strip_module_prefix(state_dict: dict) -> dict:
+    """
+    If weights were saved from torch.nn.DataParallel, keys are often prefixed with 'module.'.
+    When loading into a non-DataParallel model, strip that prefix.
+    """
+    if not isinstance(state_dict, dict) or len(state_dict) == 0:
+        return state_dict
+
+    # Only strip if it looks like DP
+    has_module = any(k.startswith("module.") for k in state_dict.keys())
+    if not has_module:
+        return state_dict
+
+    return {k[len("module.") :]: v for k, v in state_dict.items()}
+
+
+def _extract_state_dict(ckpt) -> dict:
+    """
+    Robustly extract a state_dict from typical checkpoint formats.
+    """
+    if isinstance(ckpt, dict):
+        # Common keys
+        if "model" in ckpt and isinstance(ckpt["model"], dict):
+            return ckpt["model"]
+        if "state_dict" in ckpt and isinstance(ckpt["state_dict"], dict):
+            return ckpt["state_dict"]
+        # Fallback: checkpoint itself is the state_dict
+        return ckpt
+
+
 def get_model_on_device(device: torch.device):
     """
     Lazily build and load model, then move to the requested device.
@@ -95,18 +81,19 @@ def get_model_on_device(device: torch.device):
 
     # Build on CPU first to avoid CUDA init in the wrong process
     model = build_model(args)
-    model = DataParallel(model)  # wrap before loading; matches your original
 
     weights_path = _get_weights_path()
-    ckpt = torch.load(weights_path, map_location="cpu", weights_only=True)
-    state = ckpt["model"] if isinstance(ckpt, dict) and "model" in ckpt else ckpt
-    model.load_state_dict(state, strict=False)
+    ckpt = torch.load(weights_path, map_location="cpu")  # keep compatibility across torch versions
+    state = _extract_state_dict(ckpt)
+    state = _strip_module_prefix(state)
 
+    model.load_state_dict(state, strict=False)
     model.eval()
     _MODEL = model
 
-    # Ensure correct device for this invocation
     _MODEL = _MODEL.to(device)
+    if device.type == "cuda":
+        torch.backends.cudnn.benchmark = True
     return _MODEL
 
 
@@ -114,11 +101,6 @@ def get_model_on_device(device: torch.device):
 # Rotation helper (in case annotator reports orientation)
 # -----------------------------
 def _rotate_image_and_boxes(image_np: np.ndarray, boxes: list[dict], angle: int):
-    """
-    angle is in 90-degree steps. The gradio_image_annotation README demonstrates:
-        np.rot90(image, k=-angle)
-    so angle=1 => rotate clockwise 90 deg.
-    """
     if angle is None:
         return image_np, boxes
 
@@ -136,7 +118,6 @@ def _rotate_image_and_boxes(image_np: np.ndarray, boxes: list[dict], angle: int)
         xmax = max(0, min(newW, xmax))
         ymin = max(0, min(newH, ymin))
         ymax = max(0, min(newH, ymax))
-        # ensure ordering
         if xmax < xmin:
             xmin, xmax = xmax, xmin
         if ymax < ymin:
@@ -212,27 +193,25 @@ def process_image_once(inputs, enable_mask):
     image = inputs["image"]
     boxes = inputs.get("boxes", []) or []
 
-    # Ensure numpy image
+    # Ensure numpy image (support numpy, PIL, OR local path string)
     if isinstance(image, Image.Image):
-        image = np.array(image)
+        image = np.array(image.convert("RGB"))
     elif isinstance(image, str):
-        # If you ever allow URL/path returns, you’d need to load it here.
-        # For now, enforce image_type="numpy" in the UI so this does not occur.
-        raise ValueError("Annotator returned image as str. Set image_type='numpy' on image_annotator.")
+        image = np.array(Image.open(image).convert("RGB"))
+    elif isinstance(image, np.ndarray):
+        pass
+    else:
+        raise ValueError(f"Unsupported image type from annotator: {type(image)}")
 
-    # Handle orientation if provided (rare but supported by component)
     angle = inputs.get("orientation", None)
    if angle is not None:
         image, boxes = _rotate_image_and_boxes(image, boxes, angle)
 
-    # Convert boxes dicts to your legacy list format so downstream code stays unchanged:
-    # drawn_boxes elements must support [0],[1],[3],[4] usage in your code.
-    # We'll encode as: [x1, y1, 0, x2, y2]
     drawn_boxes = []
     for b in boxes:
         drawn_boxes.append([float(b["xmin"]), float(b["ymin"]), 0.0, float(b["xmax"]), float(b["ymax"])])
 
-    # If no boxes, keep consistent behavior (model call would likely fail)
+    # If no boxes, do not call model (caller will handle warning)
     if len(drawn_boxes) == 0:
         return image, [{"pred_boxes": torch.empty(0, 4), "box_v": torch.empty(0)}], [None], torch.empty(1), 1.0, []
 
@@ -249,19 +228,17 @@ def process_image_once(inputs, enable_mask):
     img = img.unsqueeze(0).to(device)
     bboxes = bboxes.unsqueeze(0).to(device)
 
-    with torch.no_grad():
-        model.module.return_masks = enable_mask
+    # Faster inference mode
+    use_amp = (device.type == "cuda")
+    with torch.inference_mode(), torch.autocast(device_type="cuda", dtype=torch.float16, enabled=use_amp):
+        model.return_masks = enable_mask
         outputs, _, _, _, masks = model(img, bboxes)
 
     # Return ONLY CPU-native objects to main process.
     out0 = outputs[0]
     pred_boxes_cpu = out0["pred_boxes"].detach().float().cpu()
     box_v_cpu = out0["box_v"].detach().float().cpu()
-
-    outputs_cpu = [{
-        "pred_boxes": pred_boxes_cpu,
-        "box_v": box_v_cpu,
-    }]
+    outputs_cpu = [{"pred_boxes": pred_boxes_cpu, "box_v": box_v_cpu}]
 
     if enable_mask and masks is not None and masks[0] is not None:
         masks_cpu = [masks[0].detach().float().cpu()]
@@ -369,13 +346,25 @@ def post_process(image, outputs, masks, img, scale, drawn_boxes, enable_mask, th
     return image.convert("RGB"), len(pred_boxes)
 
 
+# -----------------------------
+# Examples: gallery click -> set annotator value
+# -----------------------------
+EXAMPLE_PATHS = ["material/1.jpg", "material/2.jpg", "material/3.jpg", "material/4.jpg", "material/5.jpg"]
+
+def load_example_from_gallery(evt: gr.SelectData):
+    """
+    When user clicks a thumbnail in the gallery, load that image into the annotator.
+    """
+    idx = int(evt.index)
+    path = EXAMPLE_PATHS[idx]
+    return {"image": path, "boxes": []}
+
+
 # -----------------------------
 # Gradio UI
 # -----------------------------
 iface = gr.Blocks(
     title="GeCo2 Gradio Demo",
-    # js=JS_FORCE_CREATE_MODE,
-    # css=CSS_MINIMAL_UI,
 )
 
 with iface:
@@ -383,7 +372,8 @@ with iface:
         """
 # GeCo2: Generalized-Scale Object Counting with Gradual Query Aggregation
 GeCo2 is a few-shot, category-agnostic detection counter. With only a small number of exemplars, GeCo2 can detect and count all instances of the target object in an image without any retraining.
-1) Upload an image.
+
+1) Upload an image or click an example below.
 2) Draw bounding boxes on the target object (preferably ~3 instances).
 3) Click **Count**.
 4) If needed, adjust the threshold.
@@ -399,7 +389,6 @@ GeCo2 is a few-shot, category-agnostic detection counter. With only a small numb
     drawn_boxes_state = gr.State()
 
     with gr.Row():
-        # New annotator component
         annotator = image_annotator(
             value=None,
             image_type="numpy",  # ensures inputs["image"] is a numpy array
@@ -408,7 +397,7 @@ GeCo2 is a few-shot, category-agnostic detection counter. With only a small numb
             use_default_label=True,
             enable_keyboard_shortcuts=True,
             interactive=True,
-            show_label=False,  # hide label text on boxes
+            show_label=False,
         )
         image_output = gr.Image(type="pil")
 
@@ -419,12 +408,52 @@ GeCo2 is a few-shot, category-agnostic detection counter. With only a small numb
 
     count_button = gr.Button("Count")
 
+    gallery = gr.Gallery(
+        value=EXAMPLE_PATHS,
+        columns=5,
+        height=450,
+        label="Examples (click an image to load it into the annotator)",
+        show_label=True,
+        allow_preview=False,
+    )
+
+    gallery.select(
+        fn=load_example_from_gallery,
+        inputs=None,
+        outputs=annotator,
+    )
+
     def initial_process(inputs, enable_mask, threshold):
+        # Validate: must have at least one box
+        if inputs is None or inputs.get("image", None) is None:
+            gr.Warning("please delineate at least one target category object")
+            return None, 0, None, None, None, None, None, None
+
+        img_val = inputs.get("image", None)
+        boxes = inputs.get("boxes", []) or []
+
+        if len(boxes) == 0:
+            # Try to show current image in the output even if no boxes
+            if isinstance(img_val, str):
+                preview = Image.open(img_val).convert("RGB")
+            elif isinstance(img_val, Image.Image):
+                preview = img_val.convert("RGB")
+            elif isinstance(img_val, np.ndarray):
+                preview = Image.fromarray(img_val.astype(np.uint8)).convert("RGB")
+            else:
+                preview = None
+
+            gr.Warning("please delineate at least one target category object")
+            return preview, 0, None, None, None, None, None, None
+
         image, outputs, masks, img, scale, drawn_boxes = process_image_once(inputs, enable_mask)
         if image is None:
             return None, 0, None, None, None, None, None, None
+
+        out_img, cnt = post_process(image, outputs, masks, img, scale, drawn_boxes, enable_mask, threshold)
         return (
-            *post_process(image, outputs, masks, img, scale, drawn_boxes, enable_mask, threshold),
+            out_img,
+            cnt,
             image,
             outputs,
             masks,
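The checkpoint-loading change above replaces the old `DataParallel` wrapper with two helpers, `_extract_state_dict` and `_strip_module_prefix`. Condensed into one illustrative function (the name `load_checkpoint` and the single `"model"` key are assumptions; the real helpers also try `"state_dict"`), the idea is:

```python
import torch

def load_checkpoint(model: torch.nn.Module, path: str) -> None:
    ckpt = torch.load(path, map_location="cpu")
    # Checkpoints may be a bare state_dict or nested under a key such as "model"
    state = ckpt.get("model", ckpt) if isinstance(ckpt, dict) else ckpt
    # Weights saved from nn.DataParallel carry a "module." prefix on every key
    state = {k.removeprefix("module."): v for k, v in state.items()}
    model.load_state_dict(state, strict=False)
```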
material/1.jpg ADDED
material/2.jpg ADDED
material/3.jpg ADDED
material/4.jpg ADDED
material/5.jpg ADDED
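These five material/*.jpg files back the new examples gallery. The wiring is a `gr.Gallery` whose `select` event reads the clicked index from `gr.SelectData` and pushes that path into another component; a standalone sketch follows (a plain `gr.Image` stands in here for the custom annotator, and the paths are illustrative):

```python
import gradio as gr

EXAMPLES = ["material/1.jpg", "material/2.jpg", "material/3.jpg"]

def on_select(evt: gr.SelectData) -> str:
    # evt.index is the position of the clicked thumbnail in the gallery
    return EXAMPLES[int(evt.index)]

with gr.Blocks() as demo:
    gallery = gr.Gallery(value=EXAMPLES, columns=3, allow_preview=False)
    picked = gr.Image(type="filepath")
    # Select events carry their payload in evt, so inputs=None suffices
    gallery.select(fn=on_select, inputs=None, outputs=picked)

demo.launch()
```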
models/counter_infer.py CHANGED
@@ -8,7 +8,7 @@ from torch import nn
 from torch.nn import functional as F
 from torchvision.ops import roi_align
 from torchvision.transforms import Resize
-
+from torch.cuda.amp import autocast
 from utils.box_ops import boxes_with_scores
 from .query_generator import C_base
 from .sam_mask import MaskProcessor
@@ -128,15 +128,23 @@ class CNT(nn.Module):
         prototype_embeddings_l2 = torch.cat([exemplars_l2, shape], dim=1)
         hq_prototype_embeddings = [prototype_embeddings_l1, prototype_embeddings_l2]
 
-        # adapt image feature with prototypes
-        adapted_f, adapted_f_aux = self.adapt_features(
-            image_embeddings=src,
-            image_pe=self.sam_prompt_encoder.get_dense_pe(),
-            prototype_embeddings=prototype_embeddings,
-            hq_features=feats['backbone_fpn'],
-            hq_prototypes=hq_prototype_embeddings,
-            hq_pos=feats['vision_pos_enc'],
-        )
+        with autocast(enabled=False):
+            if src.dtype != torch.float32:
+                src = src.float()
+            prototype_embeddings = prototype_embeddings.float()
+            hq_prototype_embeddings = [hq.float() for hq in hq_prototype_embeddings]
+            feats['backbone_fpn'] = [f.float() for f in feats['backbone_fpn']]
+            feats['vision_pos_enc'] = [f.float() for f in feats['vision_pos_enc']]
+
+            # adapt image feature with prototypes
+            adapted_f, adapted_f_aux = self.adapt_features(
+                image_embeddings=src,
+                image_pe=self.sam_prompt_encoder.get_dense_pe(),
+                prototype_embeddings=prototype_embeddings,
+                hq_features=feats['backbone_fpn'],
+                hq_prototypes=hq_prototype_embeddings,
+                hq_pos=feats['vision_pos_enc'],
+            )
         # Predict class [fg, bg] and l,r,t,b
         bs, c, w, h = adapted_f.shape
         adapted_f = adapted_f.view(bs, self.emb_dim, -1).permute(0, 2, 1)
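This hunk is the flip side of enabling AMP in the demo: feature adaptation is pinned to float32 with `autocast(enabled=False)` plus explicit `.float()` casts, because tensors arriving from the surrounding float16 autocast region may be half precision. A minimal sketch of the fp32-island pattern, using a toy two-layer module (names illustrative):

```python
import torch
from torch import nn
from torch.cuda.amp import autocast  # same import the diff uses

class Net(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.backbone = nn.Linear(16, 16)
        self.sensitive = nn.Linear(16, 4)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        h = self.backbone(x)  # under AMP this matmul may produce float16
        with autocast(enabled=False):  # fp32 island inside the AMP region
            return self.sensitive(h.float())  # upcast values created in fp16
```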