File size: 5,742 Bytes
74593d4
bc27759
 
0cc203f
74593d4
 
 
c7dac2c
 
74593d4
 
c7dac2c
 
 
 
 
 
 
 
 
 
 
 
bc27759
 
 
c7dac2c
bc27759
 
c7dac2c
bc27759
 
 
a3f9b81
c7dac2c
 
 
 
 
 
 
1fd4203
c7dac2c
 
 
 
28be05f
c7dac2c
 
 
28be05f
c7dac2c
28be05f
 
c7dac2c
 
 
 
 
 
 
 
74593d4
57943d6
c7dac2c
 
 
 
 
 
 
 
 
 
 
 
bc27759
c7dac2c
 
a3f9b81
c7dac2c
 
 
 
 
bc27759
c7dac2c
 
 
 
bc27759
c7dac2c
 
 
 
 
 
 
 
 
 
57943d6
c7dac2c
 
 
 
 
 
 
 
bc27759
c7dac2c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a3f9b81
c7dac2c
 
a3f9b81
c7dac2c
 
74593d4
 
c7dac2c
74593d4
c7dac2c
 
 
 
 
 
 
 
 
 
 
 
28be05f
c7dac2c
 
 
 
 
 
 
 
 
 
 
5a34082
74593d4
c7dac2c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
import os
import cv2
import tempfile
import spaces
import gradio as gr
import numpy as np
import torch
from PIL import Image
from transformers import Sam3VideoModel, Sam3VideoProcessor

# Select the inference device; fall back to CPU when no CUDA GPU is present.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

print("Loading SAM3 Video Model...")
# Weights are loaded in bfloat16 to cut GPU memory use.
VID_MODEL = Sam3VideoModel.from_pretrained("facebook/sam3").to(device, dtype=torch.bfloat16)
VID_PROCESSOR = Sam3VideoProcessor.from_pretrained("facebook/sam3")
print("Model loaded!")

# Frame rate of the rendered output video (input fps is ignored on write).
OUTPUT_FPS = 24


def apply_green_mask(base_image, mask_data, opacity=0.5):
    """Overlay a semi-transparent green mask on a frame.

    Args:
        base_image: PIL.Image or HxW(x3) numpy array frame.
        mask_data: mask(s) as a numpy array or torch tensor shaped
            (H, W), (N, H, W) or (1, N, H, W); any nonzero pixel is
            treated as masked. May be None/empty.
        opacity: overlay alpha in [0, 1].

    Returns:
        RGB PIL.Image with the overlay applied, or the unmodified frame
        when no mask is given.
    """
    if isinstance(base_image, np.ndarray):
        base_image = Image.fromarray(base_image)
    base_image = base_image.convert("RGBA")

    if mask_data is None or len(mask_data) == 0:
        return base_image.convert("RGB")

    if isinstance(mask_data, torch.Tensor):
        mask_data = mask_data.cpu().numpy()
    mask_data = np.asarray(mask_data)

    # Collapse leading batch/singleton dims down to (H, W).
    if mask_data.ndim == 4:
        mask_data = mask_data[0]
    if mask_data.ndim == 3 and mask_data.shape[0] == 1:
        mask_data = mask_data[0]
    if mask_data.ndim == 3:
        # Multiple masks — merge into one
        mask_data = np.any(mask_data > 0, axis=0)

    # Normalize to a clean 0/255 bitmap via thresholding. The previous
    # `mask * 255` wrapped around uint8 when a mask arrived already
    # scaled to 0/255 (255 * 255 -> 1) and truncated soft float masks
    # in (0, 1) to zero; `> 0` is safe for bool, int and float inputs.
    bitmap_arr = np.where(mask_data > 0, 255, 0).astype(np.uint8)
    mask_bitmap = Image.fromarray(bitmap_arr)
    if mask_bitmap.size != base_image.size:
        # Nearest-neighbour keeps mask edges hard when resizing.
        mask_bitmap = mask_bitmap.resize(base_image.size, resample=Image.NEAREST)

    green = (0, 255, 0)
    color_fill = Image.new("RGBA", base_image.size, green + (0,))
    # Scale the binary bitmap into the requested overlay opacity.
    mask_alpha = mask_bitmap.point(lambda v: int(v * opacity) if v > 0 else 0)
    color_fill.putalpha(mask_alpha)

    return Image.alpha_composite(base_image, color_fill).convert("RGB")


def get_video_info(video_path):
    """Probe a video file and return (total_frames, fps, duration_seconds)."""
    capture = cv2.VideoCapture(video_path)
    frame_total = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))
    # `or 24` guards against backends that report 0.0 fps.
    frames_per_sec = capture.get(cv2.CAP_PROP_FPS) or 24
    capture.release()
    return frame_total, frames_per_sec, frame_total / frames_per_sec


def calc_timeout(source_vid, text_query):
    """Estimate a GPU-allocation timeout (seconds) for a segmentation run.

    Budgets 3s of processing per second of video plus 30s of slack,
    clamped to the [60, 300] range. Falls back to 60s when no video
    has been provided. `text_query` is unused but required so the
    signature mirrors `run_video_segmentation` for @spaces.GPU.
    """
    if not source_vid:
        return 60
    _, _, duration = get_video_info(source_vid)
    budget = int(duration * 3) + 30
    return max(60, min(budget, 300))


@spaces.GPU(duration=calc_timeout)
def run_video_segmentation(source_vid, text_query):
    """Segment a video with SAM3 from a text prompt and render a green-mask overlay.

    Args:
        source_vid: path to the input video file.
        text_query: free-text description of the object(s) to segment.

    Returns:
        (output_path, status_message): path of the rendered mp4 (or None
        on failure) and a human-readable status string.

    Raises:
        gr.Error: when the model is unavailable or inputs are missing.
    """
    if VID_MODEL is None or VID_PROCESSOR is None:
        raise gr.Error("Video model failed to load.")
    if not source_vid or not text_query:
        raise gr.Error("Please provide both a video and a text prompt.")

    try:
        # Decode every frame up front (as RGB) so both the processor and
        # the overlay step can index frames by position.
        cap = cv2.VideoCapture(source_vid)
        try:
            src_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
            src_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
            src_fps = cap.get(cv2.CAP_PROP_FPS) or 24

            video_frames = []
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break
                video_frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        finally:
            # Release the capture even if decoding raises.
            cap.release()

        total_frames = len(video_frames)
        if total_frames == 0:
            raise gr.Error("Could not read any frames from the video.")
        duration = total_frames / src_fps
        status = f"Loaded {total_frames} frames ({duration:.1f}s @ {src_fps:.0f}fps). Processing..."
        print(status)

        session = VID_PROCESSOR.init_video_session(
            video=video_frames, inference_device=device, dtype=torch.bfloat16
        )
        session = VID_PROCESSOR.add_text_prompt(inference_session=session, text=text_query)

        # NamedTemporaryFile(delete=False) replaces the deprecated and
        # race-prone tempfile.mktemp: the file is created atomically and
        # only its path is handed to the writer.
        with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
            temp_out = tmp.name
        writer = cv2.VideoWriter(temp_out, cv2.VideoWriter_fourcc(*"mp4v"), OUTPUT_FPS, (src_w, src_h))
        try:
            for model_out in VID_MODEL.propagate_in_video_iterator(
                inference_session=session, max_frame_num_to_track=total_frames
            ):
                post = VID_PROCESSOR.postprocess_outputs(session, model_out)
                f_idx = model_out.frame_idx
                original = Image.fromarray(video_frames[f_idx])

                if "masks" in post:
                    masks = post["masks"]
                    if masks.ndim == 4:
                        masks = masks.squeeze(1)
                    frame_out = apply_green_mask(original, masks)
                else:
                    frame_out = original

                writer.write(cv2.cvtColor(np.array(frame_out), cv2.COLOR_RGB2BGR))
        finally:
            # Always release so the mp4 container is finalized (and the
            # handle freed) even when propagation raises mid-stream.
            writer.release()

        out_info = f"Done — {total_frames} frames, {duration:.1f}s input → output at {OUTPUT_FPS}fps"
        return temp_out, out_info

    except gr.Error:
        # Deliberate user-facing errors must reach the UI, not be
        # flattened into the status string by the handler below.
        raise
    except Exception as e:
        return None, f"Error: {str(e)}"


# Minimal page styling: center the layout and cap its width.
css = """
#col-container { margin: 0 auto; max-width: 1000px; }
"""

with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.Markdown("# SAM3 Video Segmentation — Green Mask")
        gr.Markdown(
            "Upload a video and describe what to segment. "
            "Output is rendered at **24fps** with a **green mask** overlay."
        )

        with gr.Row():
            # Left column: inputs and the trigger button.
            with gr.Column():
                src_video = gr.Video(label="Input Video", format="mp4")
                query_box = gr.Textbox(
                    label="Text Prompt",
                    placeholder="e.g., person, red car, dog",
                )
                segment_btn = gr.Button("Segment Video", variant="primary", size="lg")

            # Right column: rendered result and status readout.
            with gr.Column():
                result_video = gr.Video(label="Segmented Video", autoplay=True)
                status_text = gr.Textbox(label="Status", interactive=False)

        segment_btn.click(
            fn=run_video_segmentation,
            inputs=[src_video, query_box],
            outputs=[result_video, status_text],
        )

if __name__ == "__main__":
    demo.launch(show_error=True)