hysts HF Staff committed on
Commit
ffce5fe
·
1 Parent(s): 4e68fb4
Files changed (1) hide show
  1. app.py +118 -19
app.py CHANGED
@@ -1,11 +1,14 @@
1
  import colorsys
2
  import gc
3
  import tempfile
4
- from collections.abc import Iterator
 
 
5
 
6
  import cv2
7
  import gradio as gr
8
  import numpy as np
 
9
  import torch
10
  from gradio.themes import Soft
11
  from PIL import Image, ImageDraw, ImageFont
@@ -15,7 +18,7 @@ MODEL_ID = "facebook/sam3"
15
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
16
  DTYPE = torch.bfloat16
17
 
18
- TRACKER_MODEL = Sam3TrackerVideoModel.from_pretrained(MODEL_ID, torch_dtype=DTYPE, device_map=DEVICE).eval()
19
  TRACKER_PROCESSOR = Sam3TrackerVideoProcessor.from_pretrained(MODEL_ID)
20
 
21
  TEXT_VIDEO_MODEL = Sam3VideoModel.from_pretrained(MODEL_ID).to(DEVICE, dtype=DTYPE).eval()
@@ -25,6 +28,81 @@ print("Models loaded successfully!")
25
  MAX_SECONDS = 8.0
26
 
27
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  def try_load_video_frames(video_path_or_url: str) -> tuple[list[Image.Image], dict]:
29
  cap = cv2.VideoCapture(video_path_or_url)
30
  frames = []
@@ -175,8 +253,8 @@ def init_video_session(
175
  processor = TEXT_VIDEO_PROCESSOR
176
  state.inference_session = processor.init_video_session(
177
  video=frames,
178
- inference_device=DEVICE,
179
- inference_state_device=DEVICE,
180
  processing_device="cpu",
181
  video_storage_device="cpu",
182
  dtype=DTYPE,
@@ -185,13 +263,17 @@ def init_video_session(
185
  processor = TRACKER_PROCESSOR
186
  state.inference_session = processor.init_video_session(
187
  video=raw_video,
188
- inference_device=DEVICE,
189
- inference_state_device=DEVICE,
190
  processing_device="cpu",
191
  video_storage_device="cpu",
192
  dtype=DTYPE,
193
  )
194
 
 
 
 
 
195
  first_frame = frames[0]
196
  max_idx = len(frames) - 1
197
  if active_tab == "text":
@@ -362,6 +444,7 @@ def _ensure_color_for_obj(state: AppState, obj_id: int) -> None:
362
  state.color_by_obj[obj_id] = pastel_color_for_object(obj_id)
363
 
364
 
 
365
  def on_image_click(
366
  img: Image.Image | np.ndarray,
367
  state: AppState,
@@ -370,12 +453,13 @@ def on_image_click(
370
  label: str,
371
  clear_old: bool,
372
  evt: gr.SelectData,
373
- ) -> Image.Image:
374
  if state is None or state.inference_session is None:
375
  return img
376
 
377
  model = TRACKER_MODEL
378
  processor = TRACKER_PROCESSOR
 
379
 
380
  x = y = None
381
  if evt is not None:
@@ -471,14 +555,17 @@ def on_image_click(
471
 
472
  state.composited_frames.pop(ann_frame_idx, None)
473
 
474
- return update_frame_display(state, ann_frame_idx)
 
 
475
 
476
 
 
477
  def on_text_prompt(
478
  state: AppState,
479
  frame_idx: int,
480
  text_prompt: str,
481
- ) -> tuple[Image.Image, str, str]:
482
  if state is None or state.inference_session is None:
483
  return None, "Upload a video and enter text prompt.", "**Active prompts:** None"
484
 
@@ -487,7 +574,7 @@ def on_text_prompt(
487
 
488
  if not text_prompt or not text_prompt.strip():
489
  active_prompts = _get_active_prompts_display(state)
490
- return update_frame_display(state, int(frame_idx)), "Please enter a text prompt.", active_prompts
491
 
492
  frame_idx = int(np.clip(frame_idx, 0, len(state.video_frames) - 1))
493
 
@@ -495,7 +582,9 @@ def on_text_prompt(
495
  prompt_texts = [p.strip() for p in text_prompt.split(",") if p.strip()]
496
  if not prompt_texts:
497
  active_prompts = _get_active_prompts_display(state)
498
- return update_frame_display(state, int(frame_idx)), "Please enter a valid text prompt.", active_prompts
 
 
499
 
500
  # Add text prompt(s) - supports both single string and list of strings
501
  state.inference_session = processor.add_text_prompt(
@@ -579,7 +668,10 @@ def on_text_prompt(
579
  status = f"Processed text prompt(s) {prompts_str} on frame {frame_idx}. No objects detected."
580
 
581
  active_prompts = _get_active_prompts_display(state)
582
- return update_frame_display(state, int(frame_idx)), status, active_prompts
 
 
 
583
 
584
 
585
  def _get_active_prompts_display(state: AppState) -> str:
@@ -596,6 +688,7 @@ def _get_active_prompts_display(state: AppState) -> str:
596
  return "**Active prompts:** None"
597
 
598
 
 
599
  def propagate_masks(state: AppState) -> Iterator[tuple[AppState, str, dict]]:
600
  if state is None:
601
  return state, "Load a video first.", gr.update()
@@ -619,6 +712,8 @@ def propagate_masks(state: AppState) -> Iterator[tuple[AppState, str, dict]]:
619
  model = TEXT_VIDEO_MODEL
620
  processor = TEXT_VIDEO_PROCESSOR
621
 
 
 
622
  # Collect all unique prompts from existing frame annotations
623
  text_prompt_to_obj_ids = {}
624
  for frame_idx, frame_texts in state.text_prompts_by_frame_obj.items():
@@ -638,6 +733,7 @@ def propagate_masks(state: AppState) -> Iterator[tuple[AppState, str, dict]]:
638
  text_prompt_to_obj_ids[text_prompt].sort()
639
 
640
  if not text_prompt_to_obj_ids:
 
641
  yield state, "No text prompts found. Please add a text prompt first.", gr.update()
642
  return
643
 
@@ -705,7 +801,9 @@ def propagate_masks(state: AppState) -> Iterator[tuple[AppState, str, dict]]:
705
  last_frame_idx = frame_idx
706
  processed += 1
707
  if processed % 30 == 0 or processed == total:
 
708
  yield state, f"Propagating masks: {processed}/{total}", gr.update(value=frame_idx)
 
709
  else:
710
  if state.inference_session is None:
711
  yield state, "Tracker model not loaded.", gr.update()
@@ -714,6 +812,8 @@ def propagate_masks(state: AppState) -> Iterator[tuple[AppState, str, dict]]:
714
  model = TRACKER_MODEL
715
  processor = TRACKER_PROCESSOR
716
 
 
 
717
  for sam2_video_output in model.propagate_in_video_iterator(inference_session=state.inference_session):
718
  video_res_masks = processor.post_process_masks(
719
  [sam2_video_output.pred_masks],
@@ -731,9 +831,12 @@ def propagate_masks(state: AppState) -> Iterator[tuple[AppState, str, dict]]:
731
  last_frame_idx = frame_idx
732
  processed += 1
733
  if processed % 30 == 0 or processed == total:
 
734
  yield state, f"Propagating masks: {processed}/{total}", gr.update(value=frame_idx)
 
735
 
736
  text = f"Propagated masks across {processed} frames."
 
737
  yield state, text, gr.update(value=last_frame_idx)
738
 
739
 
@@ -1079,17 +1182,13 @@ with gr.Blocks(title="SAM3", theme=Soft(primary_hue="blue", secondary_hue="rose"
1079
  preview_pointbox.select(
1080
  fn=on_image_click,
1081
  inputs=[preview_pointbox, app_state, frame_slider_pointbox, obj_id_inp, label_radio, clear_old_chk],
1082
- outputs=preview_pointbox,
1083
  )
1084
 
1085
- def _on_text_apply(state: AppState, frame_idx: int, text: str) -> tuple[Image.Image, str, str]:
1086
- img, status, active_prompts = on_text_prompt(state, frame_idx, text)
1087
- return img, status, active_prompts
1088
-
1089
  text_apply_btn.click(
1090
- fn=_on_text_apply,
1091
  inputs=[app_state, frame_slider_text, text_prompt_input],
1092
- outputs=[preview_text, text_status, active_prompts_display],
1093
  )
1094
 
1095
  reset_prompts_btn.click(
 
1
  import colorsys
2
  import gc
3
  import tempfile
4
+ from collections import defaultdict
5
+ from collections.abc import Iterator, Mapping, Sequence
6
+ from typing import Any
7
 
8
  import cv2
9
  import gradio as gr
10
  import numpy as np
11
+ import spaces
12
  import torch
13
  from gradio.themes import Soft
14
  from PIL import Image, ImageDraw, ImageFont
 
18
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
19
  DTYPE = torch.bfloat16
20
 
21
+ TRACKER_MODEL = Sam3TrackerVideoModel.from_pretrained(MODEL_ID, torch_dtype=DTYPE).to(DEVICE).eval()
22
  TRACKER_PROCESSOR = Sam3TrackerVideoProcessor.from_pretrained(MODEL_ID)
23
 
24
  TEXT_VIDEO_MODEL = Sam3VideoModel.from_pretrained(MODEL_ID).to(DEVICE, dtype=DTYPE).eval()
 
28
  MAX_SECONDS = 8.0
29
 
30
 
31
+ def to_device_recursive(obj: Any, device: str | torch.device) -> Any: # noqa: ANN401
32
+ """Return a new object where all torch.Tensors reachable from `obj` are moved to the given device.
33
+
34
+ - Does NOT mutate the original object.
35
+ - Handles:
36
+ * torch.Tensor
37
+ * Mapping (e.g. dict, defaultdict, OrderedDict, etc.)
38
+ * Sequence (e.g. list, tuple) except str/bytes
39
+ * Custom classes with attributes (__dict__)
40
+ - Tries to preserve container types where reasonable.
41
+ """
42
+ device = torch.device(device)
43
+ memo = {}
44
+
45
+ def _convert(x: Any) -> Any: # noqa: ANN401, C901
46
+ obj_id = id(x)
47
+ if obj_id in memo:
48
+ return memo[obj_id]
49
+
50
+ # 1. Tensor
51
+ if isinstance(x, torch.Tensor):
52
+ y = x.to(device)
53
+ memo[obj_id] = y
54
+ return y
55
+
56
+ # 2. Mapping (dict, defaultdict, etc.)
57
+ if isinstance(x, Mapping):
58
+ # Special case: defaultdict
59
+ if isinstance(x, defaultdict):
60
+ y = defaultdict(x.default_factory)
61
+ memo[obj_id] = y
62
+ for k, v in x.items():
63
+ y[k] = _convert(v)
64
+ return y
65
+
66
+ # Try to rebuild the same type using (key, value) pairs
67
+ try:
68
+ y = type(x)((k, _convert(v)) for k, v in x.items())
69
+ memo[obj_id] = y
70
+ return y
71
+ except TypeError:
72
+ # Fallback: plain dict
73
+ y = {k: _convert(v) for k, v in x.items()}
74
+ memo[obj_id] = y
75
+ return y
76
+
77
+ # 3. Sequence (list/tuple/etc.) but not str/bytes
78
+ if isinstance(x, Sequence) and not isinstance(x, (str, bytes, bytearray)):
79
+ if isinstance(x, list):
80
+ y = [_convert(v) for v in x]
81
+ elif isinstance(x, tuple):
82
+ y = type(x)(_convert(v) for v in x)
83
+ else:
84
+ try:
85
+ y = type(x)(_convert(v) for v in x)
86
+ except TypeError:
87
+ y = [_convert(v) for v in x]
88
+ memo[obj_id] = y
89
+ return y
90
+
91
+ # 4. Custom object with attributes (__dict__)
92
+ if hasattr(x, "__dict__") and not isinstance(x, type):
93
+ new_obj = x.__class__.__new__(x.__class__)
94
+ memo[obj_id] = new_obj
95
+ for name, value in vars(x).items():
96
+ setattr(new_obj, name, _convert(value))
97
+ return new_obj
98
+
99
+ # 5. Everything else → keep as-is
100
+ memo[obj_id] = x
101
+ return x
102
+
103
+ return _convert(obj)
104
+
105
+
106
  def try_load_video_frames(video_path_or_url: str) -> tuple[list[Image.Image], dict]:
107
  cap = cv2.VideoCapture(video_path_or_url)
108
  frames = []
 
253
  processor = TEXT_VIDEO_PROCESSOR
254
  state.inference_session = processor.init_video_session(
255
  video=frames,
256
+ inference_device="cpu",
257
+ inference_state_device="cpu",
258
  processing_device="cpu",
259
  video_storage_device="cpu",
260
  dtype=DTYPE,
 
263
  processor = TRACKER_PROCESSOR
264
  state.inference_session = processor.init_video_session(
265
  video=raw_video,
266
+ inference_device="cpu",
267
+ inference_state_device="cpu",
268
  processing_device="cpu",
269
  video_storage_device="cpu",
270
  dtype=DTYPE,
271
  )
272
 
273
+ state.inference_session.inference_device = DEVICE
274
+ state.inference_session.processing_device = DEVICE
275
+ state.inference_session.cache.inference_device = DEVICE
276
+
277
  first_frame = frames[0]
278
  max_idx = len(frames) - 1
279
  if active_tab == "text":
 
444
  state.color_by_obj[obj_id] = pastel_color_for_object(obj_id)
445
 
446
 
447
+ @spaces.GPU
448
  def on_image_click(
449
  img: Image.Image | np.ndarray,
450
  state: AppState,
 
453
  label: str,
454
  clear_old: bool,
455
  evt: gr.SelectData,
456
+ ) -> tuple[Image.Image, AppState]:
457
  if state is None or state.inference_session is None:
458
  return img
459
 
460
  model = TRACKER_MODEL
461
  processor = TRACKER_PROCESSOR
462
+ state.inference_session = to_device_recursive(state.inference_session, DEVICE)
463
 
464
  x = y = None
465
  if evt is not None:
 
555
 
556
  state.composited_frames.pop(ann_frame_idx, None)
557
 
558
+ state.inference_session = to_device_recursive(state.inference_session, "cpu")
559
+
560
+ return update_frame_display(state, ann_frame_idx), state
561
 
562
 
563
+ @spaces.GPU
564
  def on_text_prompt(
565
  state: AppState,
566
  frame_idx: int,
567
  text_prompt: str,
568
+ ) -> tuple[Image.Image, str, str, AppState]:
569
  if state is None or state.inference_session is None:
570
  return None, "Upload a video and enter text prompt.", "**Active prompts:** None"
571
 
 
574
 
575
  if not text_prompt or not text_prompt.strip():
576
  active_prompts = _get_active_prompts_display(state)
577
+ return update_frame_display(state, int(frame_idx)), "Please enter a text prompt.", active_prompts, state
578
 
579
  frame_idx = int(np.clip(frame_idx, 0, len(state.video_frames) - 1))
580
 
 
582
  prompt_texts = [p.strip() for p in text_prompt.split(",") if p.strip()]
583
  if not prompt_texts:
584
  active_prompts = _get_active_prompts_display(state)
585
+ return update_frame_display(state, int(frame_idx)), "Please enter a valid text prompt.", active_prompts, state
586
+
587
+ state.inference_session = to_device_recursive(state.inference_session, DEVICE)
588
 
589
  # Add text prompt(s) - supports both single string and list of strings
590
  state.inference_session = processor.add_text_prompt(
 
668
  status = f"Processed text prompt(s) {prompts_str} on frame {frame_idx}. No objects detected."
669
 
670
  active_prompts = _get_active_prompts_display(state)
671
+
672
+ state.inference_session = to_device_recursive(state.inference_session, "cpu")
673
+
674
+ return update_frame_display(state, int(frame_idx)), status, active_prompts, state
675
 
676
 
677
  def _get_active_prompts_display(state: AppState) -> str:
 
688
  return "**Active prompts:** None"
689
 
690
 
691
+ @spaces.GPU
692
  def propagate_masks(state: AppState) -> Iterator[tuple[AppState, str, dict]]:
693
  if state is None:
694
  return state, "Load a video first.", gr.update()
 
712
  model = TEXT_VIDEO_MODEL
713
  processor = TEXT_VIDEO_PROCESSOR
714
 
715
+ state.inference_session = to_device_recursive(state.inference_session, DEVICE)
716
+
717
  # Collect all unique prompts from existing frame annotations
718
  text_prompt_to_obj_ids = {}
719
  for frame_idx, frame_texts in state.text_prompts_by_frame_obj.items():
 
733
  text_prompt_to_obj_ids[text_prompt].sort()
734
 
735
  if not text_prompt_to_obj_ids:
736
+ state.inference_session = to_device_recursive(state.inference_session, "cpu")
737
  yield state, "No text prompts found. Please add a text prompt first.", gr.update()
738
  return
739
 
 
801
  last_frame_idx = frame_idx
802
  processed += 1
803
  if processed % 30 == 0 or processed == total:
804
+ state.inference_session = to_device_recursive(state.inference_session, "cpu")
805
  yield state, f"Propagating masks: {processed}/{total}", gr.update(value=frame_idx)
806
+ state.inference_session = to_device_recursive(state.inference_session, DEVICE)
807
  else:
808
  if state.inference_session is None:
809
  yield state, "Tracker model not loaded.", gr.update()
 
812
  model = TRACKER_MODEL
813
  processor = TRACKER_PROCESSOR
814
 
815
+ state.inference_session = to_device_recursive(state.inference_session, DEVICE)
816
+
817
  for sam2_video_output in model.propagate_in_video_iterator(inference_session=state.inference_session):
818
  video_res_masks = processor.post_process_masks(
819
  [sam2_video_output.pred_masks],
 
831
  last_frame_idx = frame_idx
832
  processed += 1
833
  if processed % 30 == 0 or processed == total:
834
+ state.inference_session = to_device_recursive(state.inference_session, "cpu")
835
  yield state, f"Propagating masks: {processed}/{total}", gr.update(value=frame_idx)
836
+ state.inference_session = to_device_recursive(state.inference_session, DEVICE)
837
 
838
  text = f"Propagated masks across {processed} frames."
839
+ state.inference_session = to_device_recursive(state.inference_session, "cpu")
840
  yield state, text, gr.update(value=last_frame_idx)
841
 
842
 
 
1182
  preview_pointbox.select(
1183
  fn=on_image_click,
1184
  inputs=[preview_pointbox, app_state, frame_slider_pointbox, obj_id_inp, label_radio, clear_old_chk],
1185
+ outputs=[preview_pointbox, app_state],
1186
  )
1187
 
 
 
 
 
1188
  text_apply_btn.click(
1189
+ fn=on_text_prompt,
1190
  inputs=[app_state, frame_slider_text, text_prompt_input],
1191
+ outputs=[preview_text, text_status, active_prompts_display, app_state],
1192
  )
1193
 
1194
  reset_prompts_btn.click(