John Ho committed on
Commit
59822ae
·
1 Parent(s): 95ca774

trying async frame load

Browse files
Files changed (2) hide show
  1. app.py +6 -3
  2. samv2_handler.py +4 -1
app.py CHANGED
@@ -128,7 +128,7 @@ def process_video(video_path: str, variant: str, masks: Union[list, str]):
128
  variant: SAMv2's model variant
129
  masks: a list of b64 encoded masks for the first frame of the video, indicating the objects to be tracked
130
  Returns:
131
- list: a list of masks
132
  """
133
  model = load_vid_model(variant=variant)
134
  masks = json.loads(masks) if isinstance(masks, str) else masks
@@ -145,6 +145,7 @@ def process_video(video_path: str, variant: str, masks: Union[list, str]):
145
  device="cuda",
146
  do_tidy_up=True,
147
  drop_mask=False,
 
148
  )
149
 
150
 
@@ -185,10 +186,12 @@ with gr.Blocks() as demo:
185
  choices=["tiny", "small", "base_plus", "large"],
186
  ),
187
  gr.Textbox(
188
- label='Masks for Objects of Interest in the First Frame (JSON list of dicts: [{"x0":..., "y0":..., "x1":..., "y1":...}, ...])',
189
  value=None,
190
  lines=5,
191
- placeholder='JSON list of dicts: [{"x0":..., "y0":..., "x1":..., "y1":...}, ...]',
 
 
192
  ),
193
  ],
194
  outputs=gr.JSON(label="Output JSON"),
 
128
  variant: SAMv2's model variant
129
  masks: a list of b64 encoded masks for the first frame of the video, indicating the objects to be tracked
130
  Returns:
131
+ list: a list of tracked objects expressed as a list of dictionary [{"frame":..., "track_id":..., "x":..., "y":...,"w":...,"h":...,"conf":..., "mask_b64":...},...]
132
  """
133
  model = load_vid_model(variant=variant)
134
  masks = json.loads(masks) if isinstance(masks, str) else masks
 
145
  device="cuda",
146
  do_tidy_up=True,
147
  drop_mask=False,
148
+ async_frame_load=True,
149
  )
150
 
151
 
 
186
  choices=["tiny", "small", "base_plus", "large"],
187
  ),
188
  gr.Textbox(
189
+ label="Masks for Objects of Interest in the First Frame",
190
  value=None,
191
  lines=5,
192
+ placeholder="""
193
+ JSON list of base64 encoded masks, e.g.: ["b'iVBORw0KGgoAAAANSUhEUgAABDgAAAeAAQAAAAADGtqnAAAXz...'",...]
194
+ """,
195
  ),
196
  ],
197
  outputs=gr.JSON(label="Output JSON"),
samv2_handler.py CHANGED
@@ -160,6 +160,7 @@ def run_sam_video_inference(
160
  every_x: int = None,
161
  do_tidy_up: bool = False,
162
  drop_mask: bool = True,
 
163
  ):
164
  # put video frames into directory
165
  # TODO:
@@ -177,7 +178,9 @@ def run_sam_video_inference(
177
  w = vinfo["frame_width"]
178
  h = vinfo["frame_height"]
179
 
180
- inference_state = model.init_state(video_path=vframes_dir, device=device)
 
 
181
  for i, mask in enumerate(masks):
182
  model.add_new_mask(
183
  inference_state=inference_state, frame_idx=0, obj_id=i, mask=mask
 
160
  every_x: int = None,
161
  do_tidy_up: bool = False,
162
  drop_mask: bool = True,
163
+ async_frame_load: bool = False,
164
  ):
165
  # put video frames into directory
166
  # TODO:
 
178
  w = vinfo["frame_width"]
179
  h = vinfo["frame_height"]
180
 
181
+ inference_state = model.init_state(
182
+ video_path=vframes_dir, device=device, async_loading_frames=async_frame_load
183
+ )
184
  for i, mask in enumerate(masks):
185
  model.add_new_mask(
186
  inference_state=inference_state, frame_idx=0, obj_id=i, mask=mask