pizb committed on
Commit
e1a1811
·
1 Parent(s): 638f167
Files changed (1) hide show
  1. app.py +51 -43
app.py CHANGED
@@ -125,35 +125,30 @@ def get_prompt(click_state, click_input):
125
 
126
  def load_video(video_input, video_state):
127
  """
128
- Load video, store path, and extract first frame for mask generation
129
  """
130
  if video_input is None:
131
  return video_state, None, \
132
  gr.update(visible=False), gr.update(visible=False), \
133
  gr.update(visible=False), gr.update(visible=False)
134
 
135
- # Extract ONLY the first frame for the UI to save memory/bandwidth
136
- # We will load the full video inside the GPU function later
137
- cap = cv2.VideoCapture(video_input)
138
- ret, first_frame = cap.read()
139
- cap.release()
140
 
141
- if not ret:
142
  return video_state, None, \
143
  gr.update(visible=False), gr.update(visible=False), \
144
  gr.update(visible=False), gr.update(visible=False)
145
-
146
- first_frame_rgb = cv2.cvtColor(first_frame, cv2.COLOR_BGR2RGB)
147
 
148
- # Initialize video state with PATH, not full frames
149
  video_state = {
150
- "video_path": video_input, # <--- Store Path
151
- "first_frame": first_frame_rgb, # <--- Store only one frame
152
  "first_frame_mask": None,
153
  "masks": None,
154
  }
155
 
156
- first_frame_pil = Image.fromarray(first_frame_rgb)
157
 
158
  return video_state, first_frame_pil, \
159
  gr.update(visible=True), gr.update(visible=True), \
@@ -161,6 +156,17 @@ def load_video(video_input, video_state):
161
 
162
 
163
  @spaces.GPU
 
 
 
 
 
 
 
 
 
 
 
164
  def sam_refine(video_state, point_prompt, click_state, evt: gr.SelectData):
165
  """
166
  Add click and update mask on first frame
@@ -171,10 +177,7 @@ def sam_refine(video_state, point_prompt, click_state, evt: gr.SelectData):
171
  click_state: [[points], [labels]]
172
  evt: Gradio SelectData event with click coordinates
173
  """
174
- # Lazy load models on first use
175
- initialize_models()
176
-
177
- if video_state is None or "first_frame" not in video_state: # Check for first_frame
178
  return None, video_state, click_state
179
 
180
  # Add new click
@@ -186,13 +189,9 @@ def sam_refine(video_state, point_prompt, click_state, evt: gr.SelectData):
186
 
187
  print(f"Added {point_prompt} click at ({x}, {y}). Total clicks: {len(click_state[0])}")
188
 
189
- # Generate mask with SAM2
190
- first_frame = video_state["first_frame"]
191
- mask = sam2_tracker.get_first_frame_mask(
192
- frame=first_frame,
193
- points=click_state[0],
194
- labels=click_state[1]
195
- )
196
 
197
  # Store mask in video state
198
  video_state["first_frame_mask"] = mask
@@ -280,37 +279,46 @@ def propagate_masks(video_state, click_state):
280
 
281
 
282
  @spaces.GPU(duration=120)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
283
  def run_videomama_with_sam2(video_state, click_state):
284
  """
285
  Run SAM2 propagation and VideoMaMa inference together
286
  """
287
- # Lazy load models on first use
288
- initialize_models()
289
-
290
- if video_state is None or "video_path" not in video_state:
291
  return video_state, None, None, None, "⚠️ No video loaded"
292
 
293
  if len(click_state[0]) == 0:
294
  return video_state, None, None, None, "⚠️ Please add at least one point first"
295
 
296
- # RELOAD FRAMES HERE inside the GPU worker
297
- print(f"Loading frames from {video_state['video_path']}...")
298
- frames, fps = extract_frames_from_video(video_state["video_path"], max_frames=50)
299
 
300
- # Update state with FPS just in case (though we likely don't need to return it)
301
- video_state["fps"] = fps
302
- masks = sam2_tracker.track_video(
303
- frames=frames,
304
- points=click_state[0],
305
- labels=click_state[1]
306
  )
307
 
308
  video_state["masks"] = masks
309
- print(f"✓ Generated {len(masks)} masks")
310
-
311
- # Step 2: Run VideoMaMa
312
- print(f"🎨 Running VideoMaMa on {len(frames)} frames...")
313
- output_frames = videomama(videomama_pipeline, frames, masks)
314
 
315
  # Save output videos
316
  output_dir = Path("outputs")
@@ -513,4 +521,4 @@ if __name__ == "__main__":
513
  # server_port=7860,
514
  # share=True
515
  # )
516
- demo.launch()
 
125
 
126
  def load_video(video_input, video_state):
127
  """
128
+ Load video and extract first frame for mask generation
129
  """
130
  if video_input is None:
131
  return video_state, None, \
132
  gr.update(visible=False), gr.update(visible=False), \
133
  gr.update(visible=False), gr.update(visible=False)
134
 
135
+ # Extract frames
136
+ frames, fps = extract_frames_from_video(video_input, max_frames=50)
 
 
 
137
 
138
+ if len(frames) == 0:
139
  return video_state, None, \
140
  gr.update(visible=False), gr.update(visible=False), \
141
  gr.update(visible=False), gr.update(visible=False)
 
 
142
 
143
+ # Initialize video state
144
  video_state = {
145
+ "frames": frames,
146
+ "fps": fps,
147
  "first_frame_mask": None,
148
  "masks": None,
149
  }
150
 
151
+ first_frame_pil = Image.fromarray(frames[0])
152
 
153
  return video_state, first_frame_pil, \
154
  gr.update(visible=True), gr.update(visible=True), \
 
156
 
157
 
158
  @spaces.GPU
159
+ def generate_sam2_mask(first_frame, points, labels):
160
+ """GPU-intensive SAM2 mask generation"""
161
+ initialize_models()
162
+ mask = sam2_tracker.get_first_frame_mask(
163
+ frame=first_frame,
164
+ points=points,
165
+ labels=labels
166
+ )
167
+ return mask
168
+
169
+
170
  def sam_refine(video_state, point_prompt, click_state, evt: gr.SelectData):
171
  """
172
  Add click and update mask on first frame
 
177
  click_state: [[points], [labels]]
178
  evt: Gradio SelectData event with click coordinates
179
  """
180
+ if video_state is None or "frames" not in video_state:
 
 
 
181
  return None, video_state, click_state
182
 
183
  # Add new click
 
189
 
190
  print(f"Added {point_prompt} click at ({x}, {y}). Total clicks: {len(click_state[0])}")
191
 
192
+ # Generate mask with SAM2 (GPU operation)
193
+ first_frame = video_state["frames"][0]
194
+ mask = generate_sam2_mask(first_frame, click_state[0], click_state[1])
 
 
 
 
195
 
196
  # Store mask in video state
197
  video_state["first_frame_mask"] = mask
 
279
 
280
 
281
  @spaces.GPU(duration=120)
282
+ def process_video_with_models(frames, points, labels):
283
+ """GPU-intensive video processing with SAM2 and VideoMaMa"""
284
+ initialize_models()
285
+
286
+ # Step 1: Track through video with SAM2
287
+ print(f"🎯 Tracking object through {len(frames)} frames with SAM2...")
288
+ masks = sam2_tracker.track_video(
289
+ frames=frames,
290
+ points=points,
291
+ labels=labels
292
+ )
293
+ print(f"✓ Generated {len(masks)} masks")
294
+
295
+ # Step 2: Run VideoMaMa
296
+ print(f"🎨 Running VideoMaMa on {len(frames)} frames...")
297
+ output_frames = videomama(videomama_pipeline, frames, masks)
298
+
299
+ return masks, output_frames
300
+
301
+
302
  def run_videomama_with_sam2(video_state, click_state):
303
  """
304
  Run SAM2 propagation and VideoMaMa inference together
305
  """
306
+ if video_state is None or "frames" not in video_state:
 
 
 
307
  return video_state, None, None, None, "⚠️ No video loaded"
308
 
309
  if len(click_state[0]) == 0:
310
  return video_state, None, None, None, "⚠️ Please add at least one point first"
311
 
312
+ frames = video_state["frames"]
 
 
313
 
314
+ # Run GPU-intensive processing
315
+ masks, output_frames = process_video_with_models(
316
+ frames,
317
+ click_state[0],
318
+ click_state[1]
 
319
  )
320
 
321
  video_state["masks"] = masks
 
 
 
 
 
322
 
323
  # Save output videos
324
  output_dir = Path("outputs")
 
521
  # server_port=7860,
522
  # share=True
523
  # )
524
+ demo.launch()