John Ho committed on
Commit aaa1b00 · 1 Parent(s): 579e65b

fixed bug when returning masks for multiple objects

Files changed (3)
  1. app.py +13 -3
  2. samv2_handler.py +94 -50
  3. visualizer.py +100 -0
app.py CHANGED
@@ -127,6 +127,7 @@ def process_video(
     masks: Union[list, str],
     drop_masks: bool = False,
     ref_frame_idx: int = 0,
+    async_frame_load: bool = True,
 ):
     """
     SAM2 Video Segmentation
@@ -153,7 +154,7 @@ def process_video(
         device="cuda",
         do_tidy_up=True,
         drop_mask=drop_masks,
-        async_frame_load=True,
+        async_frame_load=async_frame_load,
         ref_frame_idx=ref_frame_idx,
     )
 
@@ -202,12 +203,21 @@ with gr.Blocks() as demo:
             JSON list of base64 encoded masks, e.g.: ["b'iVBORw0KGgoAAAANSUhEUgAABDgAAAeAAQAAAAADGtqnAAAXz...'",...]
             """,
         ),
-        gr.Checkbox(label="remove base64 encoded masks from result JSON"),
+        gr.Checkbox(
+            label="Drop Masks",
+            info="remove base64 encoded masks from result JSON",
+            value=True,
+        ),
         gr.Number(
-            label="frame index for the provided object masks",
+            label="Reference Frame Index",
+            info="frame index for the provided object masks",
             value=0,
             precision=0,
         ),
+        gr.Checkbox(
+            label="async frame load",
+            info="start inference in parallel to frame loading",
+        ),
     ],
     outputs=gr.JSON(label="Output JSON"),
     title="SAM2 for Videos",
samv2_handler.py CHANGED
@@ -9,8 +9,10 @@ from sam2.automatic_mask_generator import SAM2AutomaticMaskGenerator
 from sam2.utils.misc import variant_to_config_mapping
 from sam2.utils.visualization import show_masks
 from ffmpeg_extractor import extract_frames, logger
-from toolbox.vid_utils import VidInfo
+from visualizer import annotate_masks, mask_to_xyxy
+from toolbox.vid_utils import VidInfo, VidReader
 from toolbox.mask_encoding import b64_mask_encode
+from toolbox.img_utils import get_pil_im
 
 variant_checkpoints_mapping = {
     "tiny": "checkpoints/sam2_hiera_tiny.pt",
@@ -32,23 +34,6 @@ class point_xy(BaseModel):
     y: Union[int, float]
 
 
-def mask_to_xyxy(mask: np.ndarray) -> tuple:
-    """Convert a binary mask of shape (h, w) to
-    xyxy bounding box format (top-left and bottom-right coordinates).
-    """
-    ys, xs = np.where(mask)
-    if len(xs) == 0 or len(ys) == 0:
-        logger.warning("mask_to_xyxy: No object found in the mask")
-        return None
-    x_min = np.min(xs)
-    y_min = np.min(ys)
-    x_max = np.max(xs)
-    y_max = np.max(ys)
-    xyxy = (x_min, y_min, x_max, y_max)
-    xyxy = tuple([int(i) for i in xyxy])
-    return xyxy
-
-
 def load_sam_image_model(
     # variant: Literal[*variant_checkpoints_mapping.keys()],
     variant: Literal["tiny", "small", "base_plus", "large"],
@@ -96,7 +81,8 @@ def run_sam_im_inference(
         point_labels
     ), f"{len(points)} points provided but {len(point_labels)} labels given."
 
-    # determine multimask_output
+    # multimask_output actually will provide 3 masks for each segmentation (see https://github.com/facebookresearch/sam2/blob/main/notebooks/image_predictor_example.ipynb)
+    # so should also be set to False
     has_multi = False
     if points and bboxes:
         has_multi = True
@@ -129,7 +115,7 @@ def run_sam_im_inference(
         box=box_coords,
         point_coords=point_coords,
         point_labels=point_labels,
-        multimask_output=has_multi,
+        multimask_output=False,  # has_multi,
     )
     # mask here is of shape (X, h, w) of np array, X = number of masks
 
@@ -138,11 +124,16 @@ def run_sam_im_inference(
     else:
         output_masks = []
         for i, mask in enumerate(masks):
-            if mask.ndim > 2:  # shape (3, h, w)
-                mask = np.transpose(mask, (1, 2, 0))  # shape (h,w,3)
-                mask = Image.fromarray((mask * 255).astype(np.uint8)).convert("L")
-                output_masks.append(np.array(mask))
+            if mask.ndim > 2:  # shape (1, h, w)
+                # logger.debug(f"found mask of shape {mask.shape}")
+                output_masks.append(mask.squeeze().astype(np.uint8))
+
+                # when multimask_output = True the mask is shape (3,h,w)
+                # mask = np.transpose(mask, (1, 2, 0))  # shape (h,w,3)
+                # mask = Image.fromarray((mask * 255).astype(np.uint8)).convert("L")
+                # output_masks.append(np.array(mask))
             else:
+                # logger.debug(f"found mask of shape {mask.shape}")
                 output_masks.append(mask.squeeze().astype(np.uint8))
     return (
         [b64_mask_encode(m).decode("ascii") for m in output_masks]
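
The new comment above is the reason multimask_output is now pinned to False: with multimask_output=True the SAM2 image predictor returns three candidate masks per prompt together with their predicted IoU scores, so downstream code expecting one mask per object suddenly sees a (3, h, w) stack. A short sketch of the two behaviours, assuming the standard SAM2ImagePredictor.predict interface from the linked notebook (predictor, point_coords and point_labels are placeholders from the surrounding function):

import numpy as np

# multimask_output=True: three candidates per prompt, pick the best by score
masks, scores, _ = predictor.predict(
    point_coords=point_coords,
    point_labels=point_labels,
    multimask_output=True,
)                                    # masks.shape == (3, h, w)
best_mask = masks[np.argmax(scores)]

# multimask_output=False (what this commit uses): a single mask per prompt
masks, scores, _ = predictor.predict(
    point_coords=point_coords,
    point_labels=point_labels,
    multimask_output=False,
)                                    # masks.shape == (1, h, w)
single_mask = masks.squeeze()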
@@ -151,6 +142,48 @@ def run_sam_im_inference(
     )
 
 
+def unpack_masks(
+    masks_generator,
+    frame_wh: tuple,
+    drop_mask: bool = False,
+):
+    """return a list of detections in Miro's format given a SAM2 mask generator"""
+    w, h = frame_wh
+    detections = []
+    for frame_idx, tracker_ids, mask_logits in masks_generator:
+        masks = (mask_logits > 0.0).cpu().numpy().astype(np.uint8)
+
+        # draw a couple frames for debug purpose
+        # if frame_idx % 15 == 0:
+        #     ann_masks = [m.squeeze() for m in masks if mask_to_xyxy(m.squeeze())]
+        #     if len(ann_masks) > 0:
+        #         annotate_masks(
+        #             get_pil_im(np.array(vr.get_data(frame_idx))),
+        #             masks=ann_masks,
+        #         ).save(os.path.join(vframes_dir, f"{frame_idx}.png"))
+
+        for id, mask in zip(tracker_ids, masks):
+            mask = mask.squeeze().astype(np.uint8)
+            xyxy = mask_to_xyxy(mask)
+            if not xyxy:  # mask is empty
+                # logger.debug(f"track_id {id} is missing mask at frame {frame_idx}")
+                continue
+            x0, y0, x1, y1 = xyxy
+            det = {  # miro's detections format for videos
+                "frame": frame_idx,
+                "track_id": id,
+                "x": x0 / w,
+                "y": y0 / h,
+                "w": (x1 - x0) / w,
+                "h": (y1 - y0) / h,
+                "conf": 1,
+            }
+            if not drop_mask:
+                det["mask_b64"] = b64_mask_encode(mask).decode("ascii")
+            detections.append(det)
+    return detections
+
+
 def run_sam_video_inference(
     model: Any,
     video_path: str,
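
unpack_masks is the heart of the fix: each (frame_idx, tracker_ids, mask_logits) batch yielded by propagate_in_video is split per object, so every track gets its own normalized box and, unless drop_mask is set, its own base64-encoded mask. For a 1920x1080 video one entry of the returned list would look roughly like this (values are made up; decoding assumes b64_mask_decode from toolbox.mask_encoding is the inverse of b64_mask_encode, as its use in visualizer.py suggests):

from toolbox.mask_encoding import b64_mask_decode

det = {
    "frame": 42,                      # sampled-frame index
    "track_id": 0,                    # obj_id given to add_new_mask
    "x": 0.25, "y": 0.40,             # top-left corner, normalized by frame size
    "w": 0.10, "h": 0.15,             # box size, normalized
    "conf": 1,
    "mask_b64": "iVBORw0KGgo...",     # omitted when drop_mask=True
}

# consumer side: back to pixel coordinates and a binary mask
x0, y0 = det["x"] * 1920, det["y"] * 1080
x1, y1 = x0 + det["w"] * 1920, y0 + det["h"] * 1080
mask = b64_mask_decode(det["mask_b64"])   # assumed to return an (h, w) binary array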
@@ -166,7 +199,6 @@ def run_sam_video_inference(
     # put video frames into directory
     # TODO:
     # change frame size
-    # async frame load
     l_frames_fp = extract_frames(
         video_path,
         fps=sample_fps,
@@ -176,43 +208,55 @@
     )
     vframes_dir = os.path.dirname(l_frames_fp[0])
     vinfo = VidInfo(video_path)
+    vr = VidReader(video_path, use_imageio=True)
     w = vinfo["frame_width"]
     h = vinfo["frame_height"]
 
     inference_state = model.init_state(
         video_path=vframes_dir, device=device, async_loading_frames=async_frame_load
     )
-    for i, mask in enumerate(masks):
-        model.add_new_mask(
+    for mask_idx, mask in enumerate(masks):
+        _, object_ids, mask_logits = model.add_new_mask(
             inference_state=inference_state,
             frame_idx=ref_frame_idx,
-            obj_id=i,
+            obj_id=mask_idx,
             mask=mask,
         )
+        # debug
+        logger.debug(
+            f"adding mask {mask_idx} of shape {mask.shape} for frame {ref_frame_idx}, xyxy: {mask_to_xyxy(mask)}"
+        )
+
+    # debug init state
+    logger.debug(f"model initiated with mask_logits of shape {mask_logits.shape}")
+    logger.debug(f"model initiated with object_ids of len {len(object_ids)}")
+    init_masks = (mask_logits > 0.0).cpu().numpy().astype(np.uint8)
+    init_masks = [m.squeeze() for m in init_masks]
+    ref_frame_im = get_pil_im(np.array(vr.get_data(ref_frame_idx)))
+    init_masks_im_fp = os.path.join(vframes_dir, f"model_init_masks.jpg")
+    input_masks_im_fp = os.path.join(vframes_dir, f"input_masks.jpg")
+    annotate_masks(ref_frame_im, init_masks).save(init_masks_im_fp)
+    annotate_masks(ref_frame_im, masks).save(input_masks_im_fp)
+    logger.debug(f"masks received by model visualized at {init_masks_im_fp}")
+    logger.debug(f"masks provided to model visualized at {input_masks_im_fp}")
+
     masks_generator = model.propagate_in_video(inference_state)
+    detections = unpack_masks(
+        masks_generator,
+        drop_mask=drop_mask,
+        frame_wh=(w, h),
+    )
 
-    detections = []
-    for i, tracker_ids, mask_logits in masks_generator:
-        masks = (mask_logits > 0.0).cpu().numpy().astype(np.uint8)
-        for id, mask in zip(tracker_ids, masks):
-            mask = mask.squeeze().astype(np.uint8)
-            xyxy = mask_to_xyxy(mask)
-            if not xyxy:  # mask is empty
-                logger.debug(f"track_id {id} is missing mask at frame {i}")
-                continue
-            x0, y0, x1, y1 = xyxy
-            det = {  # miro's detections format for videos
-                "frame": i,
-                "track_id": id,
-                "x": x0 / w,
-                "y": y0 / h,
-                "w": (x1 - x0) / w,
-                "h": (y1 - y0) / h,
-                "conf": 1,
-            }
-            if not drop_mask:
-                det["mask_b64"] = b64_mask_encode(mask).decode("ascii")
-            detections.append(det)
+    if ref_frame_idx != 0:
+        logger.debug(f"propagating in reverse now from {ref_frame_idx}")
+        # there's no need to reset state
+        # model.reset_state(inference_state)
+        masks_generator = model.propagate_in_video(inference_state, reverse=True)
+        detections += unpack_masks(
+            masks_generator,
+            drop_mask=drop_mask,
+            frame_wh=(w, h),
+        )
 
     if do_tidy_up:
         # remove vframes_dir
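
One consequence of the two-pass propagation: when ref_frame_idx is not 0, the returned list is the forward pass (ref_frame_idx to the end) followed by the reverse pass back towards frame 0, so it is not ordered by frame number, and depending on whether the reverse generator re-emits the reference frame there may be a duplicate entry per track at ref_frame_idx. A small post-processing sketch, not part of the commit, using only the keys shown above:

# order by frame, keep one detection per (frame, track_id)
detections.sort(key=lambda d: (d["frame"], d["track_id"]))
deduped, seen = [], set()
for det in detections:
    key = (det["frame"], det["track_id"])
    if key not in seen:
        seen.add(key)
        deduped.append(det)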
visualizer.py ADDED
@@ -0,0 +1,100 @@
+from PIL import Image, ImageColor
+import matplotlib.colors as mcolors
+import numpy as np
+from toolbox.mask_encoding import b64_mask_decode
+from toolbox.img_utils import im_draw_bbox, im_draw_point, im_color_mask
+
+
+def mask_to_xyxy(mask: np.ndarray, verbose: bool = False) -> tuple:
+    """Convert a binary mask of shape (h, w) to
+    xyxy bounding box format (top-left and bottom-right coordinates).
+    """
+    ys, xs = np.where(mask)
+    if len(xs) == 0 or len(ys) == 0:
+        if verbose:
+            logger.warning("mask_to_xyxy: No object found in the mask")
+        return None
+    x_min = np.min(xs)
+    y_min = np.min(ys)
+    x_max = np.max(xs)
+    y_max = np.max(ys)
+    xyxy = (x_min, y_min, x_max, y_max)
+    xyxy = tuple([int(i) for i in xyxy])
+    return xyxy
+
+
+def annotate_detections(
+    im: Image.Image,
+    l_obj: list,
+    color_key: str = "class",
+    bbox_width: int = 1,
+    label_key: str = "object_id",
+    color_dict: dict = {},
+):
+    # color_list is a list of tuple(name, color_hex)
+    color_list = list(
+        mcolors.XKCD_COLORS.items()
+    )  # list(mcolors.TABLEAU_COLORS.items())
+    unique_color_keys = list(
+        set([o[color_key] for o in l_obj if color_key in o.keys()])
+    )
+
+    for obj in l_obj:
+        color_index = unique_color_keys.index(obj[color_key])
+        bbox_color = (
+            color_dict[obj[color_key]] if color_dict else color_list[color_index][1]
+        )
+        im = (
+            im_draw_bbox(
+                im,
+                color=bbox_color,
+                width=bbox_width,
+                caption=(str(obj[label_key]) if label_key else None),
+                **obj["boundingBox"],
+                use_bbv=True,
+            )
+            if "boundingBox" in obj.keys()
+            else im_draw_point(
+                im,
+                **obj["point"],
+                width=bbox_width,
+                caption=(str(obj[label_key]) if label_key else None),
+                color=bbox_color,
+            )
+        )
+    return im
+
+
+def annotate_masks(
+    im: Image.Image, masks: list, mask_alpha: float = 0.9, bbox_width: int = 3
+) -> Image.Image:
+    """returns an annotated pillow image"""
+    masks = [
+        b64_mask_decode(m).astype(np.uint8) if isinstance(m, str) else m for m in masks
+    ]
+    segs = []
+    for i, m in enumerate(masks):
+        x0, y0, x1, y1 = mask_to_xyxy(m)
+        segs.append(
+            {
+                "object_id": i,
+                "boundingBox": {"x0": x0, "y0": y0, "x1": x1, "y1": y1},
+            }
+        )
+    ann_im = np.array(im)
+    for i, m in enumerate(masks):
+        m_color = list(mcolors.XKCD_COLORS.items())[i]
+        ann_im = im_color_mask(
+            ann_im,
+            mask_array=m,
+            alpha=mask_alpha,
+            rbg_tup=ImageColor.getrgb(m_color[1]),
+        )
+    ann_im = annotate_detections(
+        ann_im,
+        l_obj=segs,
+        color_key="object_id",
+        label_key="object_id",
+        bbox_width=bbox_width,
+    )
+    return ann_im
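
visualizer.py exists mainly so the handler can dump the annotated debug frames referenced above (model_init_masks.jpg / input_masks.jpg). A minimal usage sketch under the same assumptions the module itself makes about the toolbox drawing helpers; the frame path and mask values are placeholders, and calling .save() on the result matches how the handler already uses annotate_masks(...).save(...):

import numpy as np
from PIL import Image
from visualizer import annotate_masks, mask_to_xyxy

frame = Image.open("frame.jpg")                  # placeholder reference frame
mask = np.zeros((frame.height, frame.width), dtype=np.uint8)
mask[100:200, 150:300] = 1                       # fake binary object mask

print(mask_to_xyxy(mask))                        # -> (150, 100, 299, 199)
annotate_masks(frame, [mask]).save("frame_annotated.jpg")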