pizb committed on
Commit
638f167
·
1 Parent(s): 6cec120
Files changed (1) hide show
  1. app.py +68 -125
app.py CHANGED
@@ -123,32 +123,37 @@ def get_prompt(click_state, click_input):
123
  return click_state
124
 
125
 
126
- def load_video(video_input):
127
  """
128
- Load video and extract first frame for mask generation
129
  """
130
  if video_input is None:
131
- return None, None, \
132
  gr.update(visible=False), gr.update(visible=False), \
133
  gr.update(visible=False), gr.update(visible=False)
134
 
135
- # Extract frames
136
- frames, fps = extract_frames_from_video(video_input, max_frames=50)
 
 
 
137
 
138
- if len(frames) == 0:
139
- return None, None, \
140
  gr.update(visible=False), gr.update(visible=False), \
141
  gr.update(visible=False), gr.update(visible=False)
 
 
142
 
143
- # Initialize video state - convert frames to list for pickling
144
  video_state = {
145
- "frames": [frame.tolist() for frame in frames], # Convert numpy to list
146
- "fps": float(fps), # Ensure JSON serializable
147
  "first_frame_mask": None,
148
  "masks": None,
149
  }
150
 
151
- first_frame_pil = Image.fromarray(frames[0])
152
 
153
  return video_state, first_frame_pil, \
154
  gr.update(visible=True), gr.update(visible=True), \
@@ -156,35 +161,6 @@ def load_video(video_input):
156
 
157
 
158
  @spaces.GPU
159
- def sam_refine_gpu(first_frame_list, points, labels):
160
- """
161
- GPU function: Generate mask with SAM2
162
-
163
- Args:
164
- first_frame_list: First frame as list
165
- points: List of [x, y] coordinates
166
- labels: List of labels (1=positive, 0=negative)
167
-
168
- Returns:
169
- mask as list
170
- """
171
- # Lazy load models on first use
172
- initialize_models()
173
-
174
- # Convert to numpy
175
- first_frame = np.array(first_frame_list, dtype=np.uint8)
176
-
177
- # Generate mask with SAM2
178
- mask = sam2_tracker.get_first_frame_mask(
179
- frame=first_frame,
180
- points=points,
181
- labels=labels
182
- )
183
-
184
- # Return as list for pickling
185
- return mask.tolist() if hasattr(mask, 'tolist') else mask
186
-
187
-
188
  def sam_refine(video_state, point_prompt, click_state, evt: gr.SelectData):
189
  """
190
  Add click and update mask on first frame
@@ -195,7 +171,10 @@ def sam_refine(video_state, point_prompt, click_state, evt: gr.SelectData):
195
  click_state: [[points], [labels]]
196
  evt: Gradio SelectData event with click coordinates
197
  """
198
- if video_state is None or "frames" not in video_state:
 
 
 
199
  return None, video_state, click_state
200
 
201
  # Add new click
@@ -207,20 +186,18 @@ def sam_refine(video_state, point_prompt, click_state, evt: gr.SelectData):
207
 
208
  print(f"Added {point_prompt} click at ({x}, {y}). Total clicks: {len(click_state[0])}")
209
 
210
- # Call GPU function with plain data (no Gradio State objects)
211
- mask_list = sam_refine_gpu(
212
- video_state["frames"][0],
213
- click_state[0],
214
- click_state[1]
 
215
  )
216
 
217
- # Store mask as list
218
- video_state["first_frame_mask"] = mask_list
219
 
220
  # Visualize mask and points
221
- first_frame = np.array(video_state["frames"][0], dtype=np.uint8)
222
- mask = np.array(mask_list, dtype=np.uint8)
223
-
224
  painted_image = mask_painter(
225
  first_frame.copy(),
226
  mask,
@@ -268,7 +245,7 @@ def clear_clicks(video_state, click_state):
268
  click_state = [[], []]
269
 
270
  if video_state is not None and "frames" in video_state:
271
- first_frame = np.array(video_state["frames"][0], dtype=np.uint8)
272
  video_state["first_frame_mask"] = None
273
  return Image.fromarray(first_frame), video_state, click_state
274
 
@@ -285,8 +262,7 @@ def propagate_masks(video_state, click_state):
285
  if len(click_state[0]) == 0:
286
  return video_state, "⚠️ Please add at least one point first", gr.update(visible=False)
287
 
288
- # Convert frames back to numpy arrays
289
- frames = [np.array(f, dtype=np.uint8) for f in video_state["frames"]]
290
 
291
  # Track through video
292
  print(f"Tracking object through {len(frames)} frames...")
@@ -296,8 +272,7 @@ def propagate_masks(video_state, click_state):
296
  labels=click_state[1]
297
  )
298
 
299
- # Convert masks to lists for pickling
300
- video_state["masks"] = [m.tolist() if hasattr(m, 'tolist') else m for m in masks]
301
 
302
  status_msg = f"✓ Generated {len(masks)} masks. Ready to run VideoMaMa!"
303
 
@@ -305,88 +280,38 @@ def propagate_masks(video_state, click_state):
305
 
306
 
307
  @spaces.GPU(duration=120)
308
- def run_videomama_with_sam2_gpu(frames_list, points, labels):
309
  """
310
- GPU function: Run SAM2 propagation and VideoMaMa inference
311
-
312
- Args:
313
- frames_list: List of frames as lists
314
- points: List of [x, y] coordinates
315
- labels: List of labels (1=positive, 0=negative)
316
-
317
- Returns:
318
- Tuple of (masks_list, output_frames_list, greenscreen_frames_list)
319
  """
320
  # Lazy load models on first use
321
  initialize_models()
322
 
323
- # Convert frames back to numpy arrays
324
- frames = [np.array(f, dtype=np.uint8) for f in frames_list]
 
 
 
 
 
 
 
325
 
326
- # Step 1: Track through video with SAM2
327
- print(f"🎯 Tracking object through {len(frames)} frames with SAM2...")
328
  masks = sam2_tracker.track_video(
329
  frames=frames,
330
- points=points,
331
- labels=labels
332
  )
 
 
333
  print(f"✓ Generated {len(masks)} masks")
334
 
335
  # Step 2: Run VideoMaMa
336
  print(f"🎨 Running VideoMaMa on {len(frames)} frames...")
337
  output_frames = videomama(videomama_pipeline, frames, masks)
338
 
339
- # Create greenscreen composite
340
- greenscreen_frames = []
341
- for orig_frame, output_frame in zip(frames, output_frames):
342
- # Extract alpha matte from VideoMaMa output
343
- gray = cv2.cvtColor(output_frame, cv2.COLOR_RGB2GRAY)
344
- alpha = np.clip(gray.astype(np.float32) / 255.0, 0, 1)
345
- alpha_3ch = np.stack([alpha, alpha, alpha], axis=-1)
346
-
347
- # Create green background
348
- green_bg = np.zeros_like(orig_frame)
349
- green_bg[:, :] = [156, 251, 165] # Green screen color
350
-
351
- # Composite: original_RGB * alpha + green * (1 - alpha)
352
- composite = (orig_frame.astype(np.float32) * alpha_3ch +
353
- green_bg.astype(np.float32) * (1 - alpha_3ch)).astype(np.uint8)
354
- greenscreen_frames.append(composite)
355
-
356
- # Convert to lists for pickling
357
- masks_list = [m.tolist() if hasattr(m, 'tolist') else m for m in masks]
358
- output_frames_list = [f.tolist() for f in output_frames]
359
- greenscreen_frames_list = [f.tolist() for f in greenscreen_frames]
360
-
361
- return masks_list, output_frames_list, greenscreen_frames_list
362
-
363
-
364
- def run_videomama_with_sam2(video_state, click_state):
365
- """
366
- Run SAM2 propagation and VideoMaMa inference together
367
- """
368
- if video_state is None or "frames" not in video_state:
369
- return video_state, None, None, None, "⚠️ No video loaded"
370
-
371
- if len(click_state[0]) == 0:
372
- return video_state, None, None, None, "⚠️ Please add at least one point first"
373
-
374
- # Call GPU function with plain data (no Gradio State objects)
375
- masks_list, output_frames_list, greenscreen_frames_list = run_videomama_with_sam2_gpu(
376
- video_state["frames"],
377
- click_state[0],
378
- click_state[1]
379
- )
380
-
381
- # Store masks
382
- video_state["masks"] = masks_list
383
-
384
- # Convert back to numpy for video saving
385
- frames = [np.array(f, dtype=np.uint8) for f in video_state["frames"]]
386
- masks = [np.array(m, dtype=np.uint8) for m in masks_list]
387
- output_frames = [np.array(f, dtype=np.uint8) for f in output_frames_list]
388
- greenscreen_frames = [np.array(f, dtype=np.uint8) for f in greenscreen_frames_list]
389
-
390
  # Save output videos
391
  output_dir = Path("outputs")
392
  output_dir.mkdir(exist_ok=True)
@@ -403,7 +328,25 @@ def run_videomama_with_sam2(video_state, click_state):
403
  mask_frames_rgb = [np.stack([m, m, m], axis=-1) for m in masks]
404
  save_video(mask_frames_rgb, mask_video_path, video_state["fps"])
405
 
406
- # Save greenscreen composite
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
407
  save_video(greenscreen_frames, greenscreen_path, video_state["fps"])
408
 
409
  status_msg = f"✓ Complete! Generated {len(output_frames)} frames."
@@ -515,7 +458,7 @@ with gr.Blocks(title="VideoMaMa Demo") as demo:
515
  # Event handlers
516
  load_button.click(
517
  fn=load_video,
518
- inputs=[video_input],
519
  outputs=[video_state, first_frame_display,
520
  point_prompt, clear_button, run_button, status_text]
521
  )
 
123
  return click_state
124
 
125
 
126
+ def load_video(video_input, video_state):
127
  """
128
+ Load video, store path, and extract first frame for mask generation
129
  """
130
  if video_input is None:
131
+ return video_state, None, \
132
  gr.update(visible=False), gr.update(visible=False), \
133
  gr.update(visible=False), gr.update(visible=False)
134
 
135
+ # Extract ONLY the first frame for the UI to save memory/bandwidth
136
+ # We will load the full video inside the GPU function later
137
+ cap = cv2.VideoCapture(video_input)
138
+ ret, first_frame = cap.read()
139
+ cap.release()
140
 
141
+ if not ret:
142
+ return video_state, None, \
143
  gr.update(visible=False), gr.update(visible=False), \
144
  gr.update(visible=False), gr.update(visible=False)
145
+
146
+ first_frame_rgb = cv2.cvtColor(first_frame, cv2.COLOR_BGR2RGB)
147
 
148
+ # Initialize video state with PATH, not full frames
149
  video_state = {
150
+ "video_path": video_input, # <--- Store Path
151
+ "first_frame": first_frame_rgb, # <--- Store only one frame
152
  "first_frame_mask": None,
153
  "masks": None,
154
  }
155
 
156
+ first_frame_pil = Image.fromarray(first_frame_rgb)
157
 
158
  return video_state, first_frame_pil, \
159
  gr.update(visible=True), gr.update(visible=True), \
 
161
 
162
 
163
  @spaces.GPU
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
164
  def sam_refine(video_state, point_prompt, click_state, evt: gr.SelectData):
165
  """
166
  Add click and update mask on first frame
 
171
  click_state: [[points], [labels]]
172
  evt: Gradio SelectData event with click coordinates
173
  """
174
+ # Lazy load models on first use
175
+ initialize_models()
176
+
177
+ if video_state is None or "first_frame" not in video_state: # Check for first_frame
178
  return None, video_state, click_state
179
 
180
  # Add new click
 
186
 
187
  print(f"Added {point_prompt} click at ({x}, {y}). Total clicks: {len(click_state[0])}")
188
 
189
+ # Generate mask with SAM2
190
+ first_frame = video_state["first_frame"]
191
+ mask = sam2_tracker.get_first_frame_mask(
192
+ frame=first_frame,
193
+ points=click_state[0],
194
+ labels=click_state[1]
195
  )
196
 
197
+ # Store mask in video state
198
+ video_state["first_frame_mask"] = mask
199
 
200
  # Visualize mask and points
 
 
 
201
  painted_image = mask_painter(
202
  first_frame.copy(),
203
  mask,
 
245
  click_state = [[], []]
246
 
247
  if video_state is not None and "frames" in video_state:
248
+ first_frame = video_state["frames"][0]
249
  video_state["first_frame_mask"] = None
250
  return Image.fromarray(first_frame), video_state, click_state
251
 
 
262
  if len(click_state[0]) == 0:
263
  return video_state, "⚠️ Please add at least one point first", gr.update(visible=False)
264
 
265
+ frames = video_state["frames"]
 
266
 
267
  # Track through video
268
  print(f"Tracking object through {len(frames)} frames...")
 
272
  labels=click_state[1]
273
  )
274
 
275
+ video_state["masks"] = masks
 
276
 
277
  status_msg = f"✓ Generated {len(masks)} masks. Ready to run VideoMaMa!"
278
 
 
280
 
281
 
282
  @spaces.GPU(duration=120)
283
+ def run_videomama_with_sam2(video_state, click_state):
284
  """
285
+ Run SAM2 propagation and VideoMaMa inference together
 
 
 
 
 
 
 
 
286
  """
287
  # Lazy load models on first use
288
  initialize_models()
289
 
290
+ if video_state is None or "video_path" not in video_state:
291
+ return video_state, None, None, None, "⚠️ No video loaded"
292
+
293
+ if len(click_state[0]) == 0:
294
+ return video_state, None, None, None, "⚠️ Please add at least one point first"
295
+
296
+ # RELOAD FRAMES HERE inside the GPU worker
297
+ print(f"Loading frames from {video_state['video_path']}...")
298
+ frames, fps = extract_frames_from_video(video_state["video_path"], max_frames=50)
299
 
300
+ # Update state with FPS just in case (though we likely don't need to return it)
301
+ video_state["fps"] = fps
302
  masks = sam2_tracker.track_video(
303
  frames=frames,
304
+ points=click_state[0],
305
+ labels=click_state[1]
306
  )
307
+
308
+ video_state["masks"] = masks
309
  print(f"✓ Generated {len(masks)} masks")
310
 
311
  # Step 2: Run VideoMaMa
312
  print(f"🎨 Running VideoMaMa on {len(frames)} frames...")
313
  output_frames = videomama(videomama_pipeline, frames, masks)
314
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
315
  # Save output videos
316
  output_dir = Path("outputs")
317
  output_dir.mkdir(exist_ok=True)
 
328
  mask_frames_rgb = [np.stack([m, m, m], axis=-1) for m in masks]
329
  save_video(mask_frames_rgb, mask_video_path, video_state["fps"])
330
 
331
+ # Create greenscreen composite: RGB * VideoMaMa_alpha + green * (1 - VideoMaMa_alpha)
332
+ # VideoMaMa output_frames already contain the alpha matte result
333
+ greenscreen_frames = []
334
+ for orig_frame, output_frame in zip(frames, output_frames):
335
+ # Extract alpha matte from VideoMaMa output
336
+ # VideoMaMa outputs matted foreground, we use its intensity as alpha
337
+ gray = cv2.cvtColor(output_frame, cv2.COLOR_RGB2GRAY)
338
+ alpha = np.clip(gray.astype(np.float32) / 255.0, 0, 1)
339
+ alpha_3ch = np.stack([alpha, alpha, alpha], axis=-1)
340
+
341
+ # Create green background
342
+ green_bg = np.zeros_like(orig_frame)
343
+ green_bg[:, :] = [156, 251, 165] # Green screen color
344
+
345
+ # Composite: original_RGB * alpha + green * (1 - alpha)
346
+ composite = (orig_frame.astype(np.float32) * alpha_3ch +
347
+ green_bg.astype(np.float32) * (1 - alpha_3ch)).astype(np.uint8)
348
+ greenscreen_frames.append(composite)
349
+
350
  save_video(greenscreen_frames, greenscreen_path, video_state["fps"])
351
 
352
  status_msg = f"✓ Complete! Generated {len(output_frames)} frames."
 
458
  # Event handlers
459
  load_button.click(
460
  fn=load_video,
461
+ inputs=[video_input, video_state],
462
  outputs=[video_state, first_frame_display,
463
  point_prompt, clear_button, run_button, status_text]
464
  )