Spaces: Running on Zero

Commit · adb1841
Parent(s): 03b77e4

process frames one at a time
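The core of this commit: instead of handing the full frame stack to processor.init_video_session(video=...) up front, the session is created empty and each frame is preprocessed and fed to the model only when it is needed. A minimal sketch of that per-frame flow, assuming the Sam2 video session API exactly as it appears in the diff below (processed_frames, frame=, and frame_idx= are taken from this commit, not from library documentation):

import torch

# Session starts with no frames; they are streamed in one at a time.
inference_session = processor.init_video_session(
    inference_device=device,
    video_storage_device="cpu",
    dtype=dtype,
)

with torch.inference_mode():
    for frame_idx, frame in enumerate(video_frames):
        pixel_values = None
        # Preprocess only frames the session has not cached yet.
        if not inference_session.processed_frames or frame_idx not in inference_session.processed_frames:
            pixel_values = processor(images=frame, device=device, return_tensors="pt").pixel_values[0]
        output = model(inference_session=inference_session, frame=pixel_values, frame_idx=frame_idx)

This keeps only the tensors for visited frames in memory rather than materializing the whole clip at session init.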
app.py CHANGED
@@ -54,10 +54,12 @@ def try_load_video_frames(video_path_or_url: str) -> tuple[list[Image.Image], di
             cap.release()
             if fps_val and fps_val > 0:
                 info["fps"] = float(fps_val)
-        except Exception:
+        except Exception as e:
+            print(f"Failed to render video with cv2: {e}")
             pass
         return pil_frames, info
-    except Exception:
+    except Exception as e:
+        print(f"Failed to load video with transformers.video_utils: {e}")
         # Fallback to OpenCV
         try:
             import cv2  # type: ignore
@@ -180,14 +182,6 @@ def load_model_if_needed(GLOBAL_STATE: gr.State) -> tuple[AutoModel, Sam2VideoPr
     if GLOBAL_STATE.model_repo_id == desired_repo:
         return GLOBAL_STATE.model, GLOBAL_STATE.processor, GLOBAL_STATE.device, GLOBAL_STATE.dtype
     # Different repo requested: dispose current and reload
-    try:
-        del GLOBAL_STATE.model
-    except Exception:
-        pass
-    try:
-        del GLOBAL_STATE.processor
-    except Exception:
-        pass
     GLOBAL_STATE.model = None
     GLOBAL_STATE.processor = None
     print(f"Loading model from {desired_repo}")
@@ -219,16 +213,8 @@ def ensure_session_for_current_model(GLOBAL_STATE: gr.State) -> None:
     GLOBAL_STATE.clicks_by_frame_obj.clear()
     GLOBAL_STATE.boxes_by_frame_obj.clear()
     GLOBAL_STATE.composited_frames.clear()
-    # Dispose previous session cleanly
-    try:
-        if GLOBAL_STATE.inference_session is not None:
-            GLOBAL_STATE.inference_session.reset_inference_session()
-    except Exception:
-        pass
     GLOBAL_STATE.inference_session = None
-    gc.collect()
     GLOBAL_STATE.inference_session = processor.init_video_session(
-        video=GLOBAL_STATE.video_frames,
         inference_device=device,
         video_storage_device="cpu",
         dtype=dtype,
@@ -265,40 +251,18 @@ def init_video_session(GLOBAL_STATE: gr.State, video: str | dict) -> tuple[AppSt
     # Enforce max duration of 8 seconds (trim if longer)
     MAX_SECONDS = 8.0
     trimmed_note = ""
-    fps_in = None
-    if isinstance(info, dict) and info.get("fps"):
-        try:
-            fps_in = float(info["fps"]) or None
-        except Exception:
-            fps_in = None
-    if fps_in:
-        max_frames_allowed = int(MAX_SECONDS * fps_in)
-        if len(frames) > max_frames_allowed:
-            frames = frames[:max_frames_allowed]
-            trimmed_note = f" (trimmed to {int(MAX_SECONDS)}s = {len(frames)} frames)"
-        if isinstance(info, dict):
-            info["num_frames"] = len(frames)
-    else:
-        # Fallback when FPS unknown: assume ~30 FPS and cap to 240 frames (~8s)
-        max_frames_allowed = 240
-        if len(frames) > max_frames_allowed:
-            frames = frames[:max_frames_allowed]
-            trimmed_note = " (trimmed to 240 frames ~8s @30fps)"
-        if isinstance(info, dict):
-            info["num_frames"] = len(frames)
-
+    fps_in = info.get("fps")
+    max_frames_allowed = int(MAX_SECONDS * fps_in)
+    if len(frames) > max_frames_allowed:
+        frames = frames[:max_frames_allowed]
+        trimmed_note = f" (trimmed to {int(MAX_SECONDS)}s = {len(frames)} frames)"
+    if isinstance(info, dict):
+        info["num_frames"] = len(frames)
     GLOBAL_STATE.video_frames = frames
     # Try to capture original FPS if provided by loader
-    GLOBAL_STATE.video_fps = None
-    if isinstance(info, dict) and info.get("fps"):
-        try:
-            GLOBAL_STATE.video_fps = float(info["fps"]) or None
-        except Exception:
-            GLOBAL_STATE.video_fps = None
-
+    GLOBAL_STATE.video_fps = float(fps_in)
     # Initialize session
     inference_session = processor.init_video_session(
-        video=frames,
         inference_device=device,
         video_storage_device="cpu",
         dtype=dtype,
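As a concrete instance of the trim rule in this hunk: with fps_in = 30.0 and MAX_SECONDS = 8.0, max_frames_allowed = int(8.0 * 30.0) = 240, so a 300-frame clip keeps only its first 240 frames (~8 s). Note the new path assumes the loader reported an FPS. A self-contained check with illustrative values:

MAX_SECONDS = 8.0
fps_in = 30.0                                   # illustrative sample rate
frames = list(range(300))                       # stand-in for 300 decoded frames
max_frames_allowed = int(MAX_SECONDS * fps_in)  # 240
if len(frames) > max_frames_allowed:
    frames = frames[:max_frames_allowed]
assert len(frames) == 240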
@@ -412,6 +376,12 @@ def on_image_click(
     processor = state.processor
     model = state.model
     inference_session = state.inference_session
+    original_size = None
+    pixel_values = None
+    if not inference_session.processed_frames or frame_idx not in inference_session.processed_frames:
+        inputs = processor(images=state.video_frames[frame_idx], device=state.device, return_tensors="pt")
+        original_size = inputs.original_sizes[0]
+        pixel_values = inputs.pixel_values[0]
 
     if state.current_prompt_type == "Boxes":
         # Two-click box input
@@ -443,6 +413,7 @@ def on_image_click(
             obj_ids=int(obj_id),
             input_boxes=[[[x_min, y_min, x_max, y_max]]],
             clear_old_inputs=True,  # For boxes, always clear old inputs
+            original_size=original_size,
         )
 
         frame_boxes = state.boxes_by_frame_obj.setdefault(int(frame_idx), {})
@@ -465,6 +436,7 @@ def on_image_click(
             obj_ids=int(obj_id),
             input_points=[[[[int(x), int(y)]]]],
             input_labels=[[[int(label_int)]]],
+            original_size=original_size,
             clear_old_inputs=bool(clear_old),
         )
 
@@ -477,10 +449,7 @@ def on_image_click(
 
     # Forward on that frame
     with torch.inference_mode():
-        outputs = model(
-            inference_session=inference_session,
-            frame_idx=int(frame_idx),
-        )
+        outputs = model(inference_session=inference_session, frame=pixel_values, frame_idx=int(frame_idx))
 
     H = inference_session.video_height
     W = inference_session.video_width
@@ -509,8 +478,8 @@
 @spaces.GPU()
 def propagate_masks(GLOBAL_STATE: gr.State):
     if GLOBAL_STATE is None or GLOBAL_STATE.inference_session is None:
-        yield "Load a video first.", gr.update()
-        return
+        # yield GLOBAL_STATE, "Load a video first.", gr.update()
+        return GLOBAL_STATE, "Load a video first.", gr.update()
 
     processor = deepcopy(GLOBAL_STATE.processor)
     model = deepcopy(GLOBAL_STATE.model)
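One Python detail relevant to the early-exit path in this hunk: inside a generator function, return value does not emit the value to the consumer; iteration simply stops, and the value only travels on the StopIteration exception. A minimal demonstration (standalone, names are illustrative):

def gen():
    cond = True
    if cond:
        return "early"  # ends iteration; a normal for-loop never sees this value
    yield "never"

try:
    next(gen())
except StopIteration as e:
    print(e.value)  # -> early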
@@ -524,17 +493,19 @@ def propagate_masks(GLOBAL_STATE: gr.State):
     processed = 0
 
     # Initial status; no slider change yet
-    yield f"Propagating masks: {processed}/{total}", gr.update()
+    yield GLOBAL_STATE, f"Propagating masks: {processed}/{total}", gr.update()
 
     last_frame_idx = 0
     with torch.inference_mode():
-        for sam2_video_output in model.propagate_in_video_iterator(inference_session):
+        for frame_idx, frame in enumerate(GLOBAL_STATE.video_frames):
+            pixel_values = None
+            if not inference_session.processed_frames or frame_idx not in inference_session.processed_frames:
+                pixel_values = processor(images=frame, device="cuda", return_tensors="pt").pixel_values[0]
+            sam2_video_output = model(inference_session=inference_session, frame=pixel_values, frame_idx=frame_idx)
             H = inference_session.video_height
             W = inference_session.video_width
             pred_masks = sam2_video_output.pred_masks.detach().cpu()
             video_res_masks = processor.post_process_masks([pred_masks], original_sizes=[[H, W]])[0]
-
-            frame_idx = int(sam2_video_output.frame_idx)
             last_frame_idx = frame_idx
             masks_for_frame: dict[int, np.ndarray] = {}
             obj_ids_order = list(inference_session.obj_ids)
@@ -547,15 +518,15 @@ def propagate_masks(GLOBAL_STATE: gr.State):
 
             processed += 1
             # Every 15th frame (or last), move slider to current frame to update preview via slider binding
-            if processed % 15 == 0 or processed == total:
-                yield f"Propagating masks: {processed}/{total}", gr.update(value=frame_idx)
-            else:
-                yield f"Propagating masks: {processed}/{total}", gr.update()
+            if processed % 30 == 0 or processed == total:
+                yield GLOBAL_STATE, f"Propagating masks: {processed}/{total}", gr.update(value=frame_idx)
+            # else:
+            #     yield GLOBAL_STATE, f"Propagating masks: {processed}/{total}", gr.update()
 
     text = f"Propagated masks across {processed} frames for {len(inference_session.obj_ids)} objects."
 
     # Final status; ensure slider points to last processed frame
-    yield text, gr.update(value=last_frame_idx)
+    yield GLOBAL_STATE, text, gr.update(value=last_frame_idx)
 
 
 def reset_session(GLOBAL_STATE: gr.State) -> tuple[AppState, Image.Image, int, int, str]:
@@ -785,14 +756,16 @@ with gr.Blocks(title="SAM2 Video (Transformers) - Interactive Segmentation", the
 
             iio.imwrite(out_path, [fr[:, :, ::-1] for fr in frames_np], plugin="pyav", fps=fps)
             return out_path
-        except Exception:
+        except Exception as e:
+            print(f"Failed to render video with imageio.v3: {e}")
             # Fallbacks
             try:
                 import imageio.v2 as imageio  # type: ignore
 
                 imageio.mimsave(out_path, [fr[:, :, ::-1] for fr in frames_np], fps=fps)
                 return out_path
-            except Exception:
+            except Exception as e:
+                print(f"Failed to render video with imageio.v2: {e}")
                 try:
                     import cv2  # type: ignore
 
@@ -803,6 +776,7 @@ with gr.Blocks(title="SAM2 Video (Transformers) - Interactive Segmentation", the
                     writer.release()
                     return out_path
                 except Exception as e:
+                    print(f"Failed to render video with cv2: {e}")
                     raise gr.Error(f"Failed to render video: {e}")
 
     render_btn.click(_render_video, inputs=[GLOBAL_STATE], outputs=[playback_video])
@@ -811,7 +785,7 @@ with gr.Blocks(title="SAM2 Video (Transformers) - Interactive Segmentation", the
     propagate_btn.click(
         propagate_masks,
         inputs=[GLOBAL_STATE],
-        outputs=[propagate_status, frame_slider],
+        outputs=[GLOBAL_STATE, propagate_status, frame_slider],
     )
 
     reset_btn.click(
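Because propagate_masks now yields GLOBAL_STATE as its first output, the click wiring in the last hunk lists GLOBAL_STATE among the outputs as well. A minimal self-contained sketch of this Gradio pattern, where a generator handler streams state plus component updates on each yield (component names here are illustrative, not from app.py):

import gradio as gr

def run(state):
    # Each yield pushes one (state, status, slider) update to the UI.
    for i in range(3):
        yield state, f"step {i + 1}/3", gr.update(value=i)

with gr.Blocks() as demo:
    state = gr.State({})
    status = gr.Textbox(label="status")
    slider = gr.Slider(0, 2, step=1)
    btn = gr.Button("Run")
    btn.click(run, inputs=[state], outputs=[state, status, slider])

Launching the demo and clicking Run streams the three updates as the generator advances, which is how the propagation progress and slider position are surfaced incrementally.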