Spaces:

GF-John
/

sam3

Running on Zero

App Files Files Community

John Ho committed on Dec 22, 2025

Commit

2e155e5

1 Parent(s): 1345eda

updated demo json output

Browse files

Files changed (3) hide show

app.py +49 -8
toolbox/mask_encoding.py +43 -0
visualizer.py +102 -0

app.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import sys
 import tempfile
@@ -14,6 +15,10 @@ from transformers import (
     Sam3VideoProcessor,
 )
 logger.remove()
 logger.add(
     sys.stderr,
@@ -100,7 +105,7 @@ def apply_mask_overlay(base_image, mask_data, object_ids=None, opacity=0.5):
     return Image.alpha_composite(base_image, composite_layer).convert("RGB")
-print("Loading Models and Processors...")
 try:
     VID_MODEL = Sam3VideoModel.from_pretrained("facebook/sam3").to(DEVICE, dtype=DTYPE)
     VID_PROCESSOR = Sam3VideoProcessor.from_pretrained("facebook/sam3")
@@ -113,18 +118,23 @@ except Exception as e:
 # Our Inference Function
 @spaces.GPU(duration=120)
-def video_inference(input_video, prompt):
     """
     Segments objects in a video using a text prompt.
-    Returns a JSON with output video path and status.
     """
     if VID_MODEL is None or VID_PROCESSOR is None:
         return {
             "output_video": None,
             "status": "Video Models failed to load on startup.",
         }
     if input_video is None or not prompt:
-        return {"output_video": None, "status": "Missing video or prompt."}
     try:
         # Gradio passes a dict with 'name' key for uploaded files
         video_path = (
@@ -133,7 +143,11 @@ def video_inference(input_video, prompt):
             else input_video.get("name", None)
         )
         if not video_path:
-            return {"output_video": None, "status": "Invalid video input."}
         video_cap = cv2.VideoCapture(video_path)
         vid_fps = video_cap.get(cv2.CAP_PROP_FPS)
         vid_w = int(video_cap.get(cv2.CAP_PROP_FRAME_WIDTH))
@@ -146,7 +160,11 @@ def video_inference(input_video, prompt):
             video_frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
         video_cap.release()
         if len(video_frames) == 0:
-            return {"output_video": None, "status": "No frames found in video."}
         session = VID_PROCESSOR.init_video_session(
             video=video_frames, inference_device=DEVICE, dtype=DTYPE
         )
@@ -155,17 +173,38 @@ def video_inference(input_video, prompt):
         video_writer = cv2.VideoWriter(
             temp_out_path, cv2.VideoWriter_fourcc(*"mp4v"), vid_fps, (vid_w, vid_h)
         )
         for model_out in VID_MODEL.propagate_in_video_iterator(
             inference_session=session, max_frame_num_to_track=len(video_frames)
         ):
             post_processed = VID_PROCESSOR.postprocess_outputs(session, model_out)
             f_idx = model_out.frame_idx
             original_pil = Image.fromarray(video_frames[f_idx])
             if "masks" in post_processed:
                 detected_masks = post_processed["masks"]
                 object_ids = post_processed["object_ids"]
                 if detected_masks.ndim == 4:
                     detected_masks = detected_masks.squeeze(1)
                 final_frame = apply_mask_overlay(
                     original_pil, detected_masks, object_ids=object_ids
                 )
@@ -175,11 +214,13 @@ def video_inference(input_video, prompt):
         video_writer.release()
         return {
             "output_video": temp_out_path,
             "status": "Video processing completed successfully.✅",
         }
     except Exception as e:
         return {
             "output_video": None,
             "status": f"Error during video processing: {str(e)}",
         }
@@ -192,8 +233,8 @@ app = gr.Interface(
         gr.Textbox(
             label="Prompt",
             lines=3,
-            info="Some models like [cam motion](https://huggingface.co/chancharikm/qwen2.5-vl-7b-cam-motion-preview) are trained specific prompts",
-            value="Describe the camera motion in this video.",
         ),
     ],
     outputs=gr.JSON(label="Output JSON"),

+# Import helpers for mask encoding and bbox extraction
 import sys
 import tempfile
     Sam3VideoProcessor,
 )
+# import local helpers
+from toolbox.mask_encoding import b64_mask_encode
+from visualizer import mask_to_xyxy
 logger.remove()
 logger.add(
     sys.stderr,
     return Image.alpha_composite(base_image, composite_layer).convert("RGB")
+logger.info("Loading Models and Processors...")
 try:
     VID_MODEL = Sam3VideoModel.from_pretrained("facebook/sam3").to(DEVICE, dtype=DTYPE)
     VID_PROCESSOR = Sam3VideoProcessor.from_pretrained("facebook/sam3")
 # Our Inference Function
 @spaces.GPU(duration=120)
+def video_inference(input_video, prompt: str):
     """
     Segments objects in a video using a text prompt.
+    Returns a list of detection dicts (one per object per frame) and output video path/status.
     """
     if VID_MODEL is None or VID_PROCESSOR is None:
         return {
             "output_video": None,
+            "detections": [],
             "status": "Video Models failed to load on startup.",
         }
     if input_video is None or not prompt:
+        return {
+            "output_video": None,
+            "detections": [],
+            "status": "Missing video or prompt.",
+        }
     try:
         # Gradio passes a dict with 'name' key for uploaded files
         video_path = (
             else input_video.get("name", None)
         )
         if not video_path:
+            return {
+                "output_video": None,
+                "detections": [],
+                "status": "Invalid video input.",
+            }
         video_cap = cv2.VideoCapture(video_path)
         vid_fps = video_cap.get(cv2.CAP_PROP_FPS)
         vid_w = int(video_cap.get(cv2.CAP_PROP_FRAME_WIDTH))
             video_frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
         video_cap.release()
         if len(video_frames) == 0:
+            return {
+                "output_video": None,
+                "detections": [],
+                "status": "No frames found in video.",
+            }
         session = VID_PROCESSOR.init_video_session(
             video=video_frames, inference_device=DEVICE, dtype=DTYPE
         )
         video_writer = cv2.VideoWriter(
             temp_out_path, cv2.VideoWriter_fourcc(*"mp4v"), vid_fps, (vid_w, vid_h)
         )
+        detections = []
         for model_out in VID_MODEL.propagate_in_video_iterator(
             inference_session=session, max_frame_num_to_track=len(video_frames)
         ):
             post_processed = VID_PROCESSOR.postprocess_outputs(session, model_out)
             f_idx = model_out.frame_idx
             original_pil = Image.fromarray(video_frames[f_idx])
+            frame_detections = []
             if "masks" in post_processed:
                 detected_masks = post_processed["masks"]
                 object_ids = post_processed["object_ids"]
                 if detected_masks.ndim == 4:
                     detected_masks = detected_masks.squeeze(1)
+                # detected_masks: (num_objects, H, W)
+                for i, mask in enumerate(detected_masks):
+                    mask_bin = (mask > 0.0).astype(np.uint8)
+                    xyxy = mask_to_xyxy(mask_bin)
+                    if not xyxy:
+                        continue
+                    x0, y0, x1, y1 = xyxy
+                    det = {
+                        "frame": f_idx,
+                        "track_id": int(object_ids[i]) if object_ids is not None else i,
+                        "x": x0 / vid_w,
+                        "y": y0 / vid_h,
+                        "w": (x1 - x0) / vid_w,
+                        "h": (y1 - y0) / vid_h,
+                        "conf": 1,
+                        "mask_b64": b64_mask_encode(mask_bin).decode("ascii"),
+                    }
+                    detections.append(det)
                 final_frame = apply_mask_overlay(
                     original_pil, detected_masks, object_ids=object_ids
                 )
         video_writer.release()
         return {
             "output_video": temp_out_path,
+            "detections": detections,
             "status": "Video processing completed successfully.✅",
         }
     except Exception as e:
         return {
             "output_video": None,
+            "detections": [],
             "status": f"Error during video processing: {str(e)}",
         }
         gr.Textbox(
             label="Prompt",
             lines=3,
+            info="Describe the Object(s) you would like to track/ segmentate",
+            value="",
         ),
     ],
     outputs=gr.JSON(label="Output JSON"),

toolbox/mask_encoding.py ADDED Viewed

	@@ -0,0 +1,43 @@

+import base64, os, io, random, time
+from PIL import Image
+import numpy as np
+def b64_mask_encode(mask_np_arr, tmp_dir = '/tmp/miro/mask_encoding/'):
+    '''
+    turn a binary mask in numpy into a base64 string
+    '''
+    mask_im = Image.fromarray(np.array(mask_np_arr).astype(np.uint8)*255)
+    mask_im = mask_im.convert(mode = '1') # convert to 1bit image
+    if not os.path.isdir(tmp_dir):
+        print(f'b64_mask_encode: making tmp dir for mask encoding...')
+        os.makedirs(tmp_dir)
+    timestr = time.strftime("%Y%m%d-%H%M%S")
+    hash_str = random.getrandbits(128)
+    tmp_fname = tmp_dir + f'{timestr}_{hash_str}_mask.png'
+    mask_im.save(tmp_fname)
+    return base64.b64encode(open(tmp_fname, 'rb').read())
+def b64_mask_decode(b64_string):
+    '''
+    decode a base64 string back to a binary mask numpy array
+    '''
+    im_bytes = base64.b64decode(b64_string)
+    im_decode = Image.open(io.BytesIO(im_bytes))
+    return np.array(im_decode)
+def get_true_mask(mask_arr, im_w_h:tuple, x0, y0, x1, y1):
+    '''
+    decode the mask of CM output to get a mask that's the same size as source im
+    '''
+    if x0 > im_w_h[0] or x1 > im_w_h[0] or y0 > im_w_h[1] or y1 > im_w_h[1]:
+        raise ValueError(f'get_true_mask: Xs and Ys exceeded im_w_h bound: {im_w_h}')
+    if mask_arr.shape != (y1 - y0, x1 - x0):
+        raise ValueError(f'get_true_mask: Bounding Box h: {y1-y0} w: {x1-x0} does not match mask shape: {mask_arr.shape}')
+    w, h = im_w_h
+    mask = np.zeros((h,w), dtype = np.uint8)
+    mask[y0:y1, x0:x1] = mask_arr
+    return mask

visualizer.py ADDED Viewed

	@@ -0,0 +1,102 @@

+from PIL import Image, ImageColor
+# import matplotlib.colors as mcolors
+import numpy as np
+# from toolbox.mask_encoding import b64_mask_decode
+# from toolbox.img_utils import im_draw_bbox, im_draw_point, im_color_mask
+def mask_to_xyxy(mask: np.ndarray, verbose: bool = False) -> tuple:
+    """Convert a binary mask of shape (h, w) to
+    xyxy bounding box format (top-left and bottom-right coordinates).
+    """
+    ys, xs = np.where(mask)
+    if len(xs) == 0 or len(ys) == 0:
+        if verbose:
+            logger.warning("mask_to_xyxy: No object found in the mask")
+        return None
+    x_min = np.min(xs)
+    y_min = np.min(ys)
+    x_max = np.max(xs)
+    y_max = np.max(ys)
+    xyxy = (x_min, y_min, x_max, y_max)
+    xyxy = tuple([int(i) for i in xyxy])
+    return xyxy
+def annotate_detections(
+    im: Image.Image,
+    l_obj: list,
+    color_key: str = "class",
+    bbox_width: int = 1,
+    label_key: str = "object_id",
+    color_dict: dict = {},
+):
+    # color_list is  a list of tuple(name, color_hex)
+    color_list = list(
+        mcolors.XKCD_COLORS.items()
+    )  # list(mcolors.TABLEAU_COLORS.items())
+    unique_color_keys = list(
+        set([o[color_key] for o in l_obj if color_key in o.keys()])
+    )
+    for obj in l_obj:
+        color_index = unique_color_keys.index(obj[color_key])
+        bbox_color = (
+            color_dict[obj[color_key]] if color_dict else color_list[color_index][1]
+        )
+        im = (
+            im_draw_bbox(
+                im,
+                color=bbox_color,
+                width=bbox_width,
+                caption=(str(obj[label_key]) if label_key else None),
+                **obj["boundingBox"],
+                use_bbv=True,
+            )
+            if "boundingBox" in obj.keys()
+            else im_draw_point(
+                im,
+                **obj["point"],
+                width=bbox_width,
+                caption=(str(obj[label_key]) if label_key else None),
+                color=bbox_color,
+            )
+        )
+    return im
+def annotate_masks(
+    im: Image.Image, masks: list, mask_alpha: float = 0.9, bbox_width: int = 3
+) -> Image.Image:
+    """returns an annotated pillow image"""
+    masks = [
+        b64_mask_decode(m).astype(np.uint8) if isinstance(m, str) else m for m in masks
+    ]
+    segs = []
+    for i, m in enumerate(masks):
+        x0, y0, x1, y1 = mask_to_xyxy(m)
+        segs.append(
+            {
+                "object_id": i,
+                "boundingBox": {"x0": x0, "y0": y0, "x1": x1, "y1": y1},
+            }
+        )
+    ann_im = np.array(im)
+    for i, m in enumerate(masks):
+        m_color = list(mcolors.XKCD_COLORS.items())[i]
+        ann_im = im_color_mask(
+            ann_im,
+            mask_array=m,
+            alpha=mask_alpha,
+            rbg_tup=ImageColor.getrgb(m_color[1]),
+        )
+    ann_im = annotate_detections(
+        ann_im,
+        l_obj=segs,
+        color_key="object_id",
+        label_key="object_id",
+        bbox_width=bbox_width,
+    )
+    return ann_im