pizb commited on
Commit
d33e75e
·
1 Parent(s): dea893d

initial update

Browse files
.gitignore ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib/
14
+ lib64/
15
+ parts/
16
+ sdist/
17
+ var/
18
+ wheels/
19
+ *.egg-info/
20
+ .installed.cfg
21
+ *.egg
22
+
23
+ # Virtual environments
24
+ venv/
25
+ env/
26
+ ENV/
27
+ .venv
28
+
29
+ # IDE
30
+ .vscode/
31
+ .idea/
32
+ *.swp
33
+ *.swo
34
+ *~
35
+
36
+ # Gradio
37
+ flagged/
38
+
39
+ # Temporary files
40
+ *.tmp
41
+ temp/
42
+ temp_*/
43
+ *.log
44
+
45
+ # Model checkpoints (download separately)
46
+ checkpoints/*.pt
47
+ checkpoints/*.pth
48
+ checkpoints/*.safetensors
49
+ checkpoints/*.bin
50
+
51
+ # Videos
52
+ samples/*.mp4
53
+ samples/*.avi
54
+ samples/*.mov
55
+ *.mp4
56
+ *.avi
57
+ *.mov
58
+
59
+ # OS
60
+ .DS_Store
61
+ Thumbs.db
62
+ *.bak
63
+
64
+ # Jupyter
65
+ .ipynb_checkpoints/
.hf_gitignore ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ *.egg-info/
8
+ dist/
9
+ build/
10
+
11
+ # Virtual environments
12
+ venv/
13
+ env/
14
+ ENV/
15
+
16
+ # IDE
17
+ .vscode/
18
+ .idea/
19
+ *.swp
20
+ *.swo
21
+ *~
22
+
23
+ # OS
24
+ .DS_Store
25
+ Thumbs.db
26
+
27
+ # Model checkpoints (will be downloaded)
28
+ checkpoints/
29
+ *.pt
30
+ *.pth
31
+ *.safetensors
32
+ *.bin
33
+
34
+ # Outputs
35
+ outputs/
36
+ output_*.mp4
37
+ masks_*.mp4
38
+ greenscreen_*.mp4
39
+
40
+ # Temporary files
41
+ *.tmp
42
+ tmp/
43
+ temp/
44
+
45
+ # Logs
46
+ *.log
47
+ logs/
app.py ADDED
@@ -0,0 +1,485 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ VideoMaMa Gradio Demo
3
+ Interactive video matting with SAM2 mask tracking
4
+ """
5
+
6
+ import sys
7
+ sys.path.append("../")
8
+ sys.path.append("../../")
9
+
10
+ import os
11
+ import json
12
+ import time
13
+ import cv2
14
+ import torch
15
+ import numpy as np
16
+ import gradio as gr
17
+ from PIL import Image
18
+ from pathlib import Path
19
+
20
+ from sam2_wrapper import load_sam2_tracker
21
+ from videomama_wrapper import load_videomama_pipeline, videomama
22
+ from tools.painter import mask_painter, point_painter
23
+
24
+ import warnings
25
+ warnings.filterwarnings("ignore")
26
+
27
+ # Global models
28
+ sam2_tracker = None
29
+ videomama_pipeline = None
30
+
31
+ # Constants
32
+ MASK_COLOR = 3
33
+ MASK_ALPHA = 0.7
34
+ CONTOUR_COLOR = 1
35
+ CONTOUR_WIDTH = 5
36
+ POINT_COLOR_POS = 8 # Positive points - orange
37
+ POINT_COLOR_NEG = 1 # Negative points - red
38
+ POINT_ALPHA = 0.9
39
+ POINT_RADIUS = 15
40
+
41
def initialize_models():
    """Initialize SAM2 and VideoMaMa models.

    Loads both models onto CUDA when available (CPU otherwise) and stores
    them in the module-level globals (`sam2_tracker`, `videomama_pipeline`)
    that the Gradio callbacks read. Must be called once before the demo
    serves requests.
    """
    global sam2_tracker, videomama_pipeline

    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")

    # Load SAM2 (interactive segmentation + video mask propagation)
    sam2_tracker = load_sam2_tracker(device=device)

    # Load VideoMaMa (video matting pipeline)
    videomama_pipeline = load_videomama_pipeline(device=device)

    print("All models initialized successfully!")
55
+
56
+
57
def extract_frames_from_video(video_path, max_frames=50):
    """
    Extract up to `max_frames` RGB frames from a video file.

    Args:
        video_path: Path to video file.
        max_frames: Maximum number of frames to extract (demo cap).

    Returns:
        frames: List of numpy arrays (H, W, 3), uint8 RGB.
        fps: FPS reported by the container; falls back to 30.0 when the
             container reports 0/NaN (some codecs do), so downstream
             cv2.VideoWriter calls get a usable rate.
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        # Unreadable/missing file: return an empty frame list so callers'
        # existing `len(frames) == 0` checks handle it gracefully.
        print(f"Failed to open video: {video_path}")
        return [], 30.0

    fps = cap.get(cv2.CAP_PROP_FPS)
    # Guard against 0 or NaN FPS (fps != fps is the NaN test).
    if not fps or fps != fps or fps <= 0:
        fps = 30.0

    frames = []
    while len(frames) < max_frames:
        ret, frame = cap.read()
        if not ret:
            break
        # OpenCV decodes BGR; the rest of the app works in RGB.
        frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

    cap.release()
    print(f"Extracted {len(frames)} frames from video (FPS: {fps})")

    return frames, fps
85
+
86
+
87
def get_prompt(click_state, click_input):
    """
    Merge a JSON-encoded batch of clicks into the accumulated click state.

    Args:
        click_state: [[points], [labels]] accumulated so far.
        click_input: JSON string of the form "[[x, y, label], ...]".

    Returns:
        The same click_state object, updated in place.
    """
    for item in json.loads(click_input):
        click_state[0].append(item[:2])   # [x, y]
        click_state[1].append(item[2])    # 1 = positive, 0 = negative

    return click_state
110
+
111
+
112
def load_video(video_input, video_state):
    """
    Load an uploaded video, stash its frames in state, and show frame 0.

    Returns a 6-tuple matching the Gradio outputs: (video_state,
    first-frame image, point_prompt update, clear_button update,
    run_button update, status_text update).
    """
    empty_result = (
        video_state,
        None,
        gr.update(visible=False),
        gr.update(visible=False),
        gr.update(visible=False),
        gr.update(visible=False),
    )

    if video_input is None:
        return empty_result

    # Demo keeps at most 50 frames to bound memory/latency.
    frames, fps = extract_frames_from_video(video_input, max_frames=50)
    if not frames:
        return empty_result

    # Fresh state for the new clip.
    video_state = {
        "frames": frames,
        "fps": fps,
        "first_frame_mask": None,
        "masks": None,
    }

    return (
        video_state,
        Image.fromarray(frames[0]),
        gr.update(visible=True),   # point_prompt
        gr.update(visible=True),   # clear_button
        gr.update(visible=True),   # run_button
        gr.update(visible=False),  # status_text
    )
142
+
143
+
144
def sam_refine(video_state, point_prompt, click_state, evt: gr.SelectData):
    """
    Register a click on the first frame and refresh the SAM2 mask preview.

    Args:
        video_state: Dict holding the loaded video data.
        point_prompt: "Positive" (object) or "Negative" (background).
        click_state: [[points], [labels]] accumulated so far.
        evt: Gradio SelectData event carrying the click coordinates.

    Returns:
        (painted first-frame PIL image, video_state, click_state)
    """
    if video_state is None or "frames" not in video_state:
        return None, video_state, click_state

    # Record the new click.
    x, y = evt.index[0], evt.index[1]
    click_state[0].append([x, y])
    click_state[1].append(1 if point_prompt == "Positive" else 0)

    print(f"Added {point_prompt} click at ({x}, {y}). Total clicks: {len(click_state[0])}")

    # Re-run SAM2 on the first frame with the full click history.
    first_frame = video_state["frames"][0]
    mask = sam2_tracker.get_first_frame_mask(
        frame=first_frame,
        points=click_state[0],
        labels=click_state[1]
    )
    video_state["first_frame_mask"] = mask

    # Overlay the mask, then the click markers (positive and negative).
    painted = mask_painter(
        first_frame.copy(),
        mask,
        MASK_COLOR,
        MASK_ALPHA,
        CONTOUR_COLOR,
        CONTOUR_WIDTH
    )

    for wanted_label, color in ((1, POINT_COLOR_POS), (0, POINT_COLOR_NEG)):
        selected = np.array([pt for pt, lbl in zip(click_state[0], click_state[1])
                             if lbl == wanted_label])
        if len(selected) > 0:
            painted = point_painter(
                painted,
                selected,
                color,
                POINT_ALPHA,
                POINT_RADIUS,
                CONTOUR_COLOR,
                CONTOUR_WIDTH
            )

    return Image.fromarray(painted), video_state, click_state
218
+
219
+
220
def clear_clicks(video_state, click_state):
    """Drop all recorded clicks and restore the unannotated first frame."""
    click_state = [[], []]

    # No video loaded: nothing to display, just hand back the reset state.
    if video_state is None or "frames" not in video_state:
        return None, video_state, click_state

    video_state["first_frame_mask"] = None
    return Image.fromarray(video_state["frames"][0]), video_state, click_state
230
+
231
+
232
def propagate_masks(video_state, click_state):
    """
    Propagate the first-frame point prompts through the whole clip with SAM2.

    Returns:
        (video_state, status message, run-button visibility update)
    """
    if video_state is None or "frames" not in video_state:
        return video_state, "No video loaded", gr.update(visible=False)

    if not click_state[0]:
        return video_state, "⚠️ Please add at least one point first", gr.update(visible=False)

    frames = video_state["frames"]
    print(f"Tracking object through {len(frames)} frames...")

    # One mask per frame, stored for the matting step.
    masks = sam2_tracker.track_video(
        frames=frames,
        points=click_state[0],
        labels=click_state[1]
    )
    video_state["masks"] = masks

    status_msg = f"✓ Generated {len(masks)} masks. Ready to run VideoMaMa!"
    return video_state, status_msg, gr.update(visible=True)
257
+
258
+
259
def run_videomama_with_sam2(video_state, click_state):
    """
    Run SAM2 propagation and VideoMaMa inference together.

    Tracks the clicked object through every cached frame with SAM2, feeds
    frames + masks to VideoMaMa, then writes three mp4s under outputs/:
    the matting result, the mask track, and a greenscreen composite.

    Returns:
        (video_state, output_video_path, mask_video_path, greenscreen_path,
        status message) — paths as str, or Nones plus a warning on error.
    """
    if video_state is None or "frames" not in video_state:
        return video_state, None, None, None, "⚠️ No video loaded"

    if len(click_state[0]) == 0:
        return video_state, None, None, None, "⚠️ Please add at least one point first"

    frames = video_state["frames"]

    # Step 1: Track through video with SAM2
    print(f"🎯 Tracking object through {len(frames)} frames with SAM2...")
    masks = sam2_tracker.track_video(
        frames=frames,
        points=click_state[0],
        labels=click_state[1]
    )

    video_state["masks"] = masks
    print(f"✓ Generated {len(masks)} masks")

    # Step 2: Run VideoMaMa
    print(f"🎨 Running VideoMaMa on {len(frames)} frames...")
    output_frames = videomama(videomama_pipeline, frames, masks)

    # Save output videos (timestamped so repeated runs don't overwrite)
    output_dir = Path("outputs")
    output_dir.mkdir(exist_ok=True)

    timestamp = int(time.time())
    output_video_path = output_dir / f"output_{timestamp}.mp4"
    mask_video_path = output_dir / f"masks_{timestamp}.mp4"
    greenscreen_path = output_dir / f"greenscreen_{timestamp}.mp4"

    # Save matting result
    save_video(output_frames, output_video_path, video_state["fps"])

    # Save mask video (for visualization): replicate the single-channel
    # mask into 3 channels so save_video writes it as a normal RGB clip.
    mask_frames_rgb = [np.stack([m, m, m], axis=-1) for m in masks]
    save_video(mask_frames_rgb, mask_video_path, video_state["fps"])

    # Create greenscreen composite: RGB * VideoMaMa_alpha + green * (1 - VideoMaMa_alpha)
    # VideoMaMa output_frames already contain the alpha matte result
    greenscreen_frames = []
    for orig_frame, output_frame in zip(frames, output_frames):
        # Extract alpha matte from VideoMaMa output.
        # NOTE(review): this treats the grayscale intensity of the model
        # output as the alpha channel — assumes output_frame is (close to)
        # a matte; confirm against videomama_wrapper's output format.
        gray = cv2.cvtColor(output_frame, cv2.COLOR_RGB2GRAY)
        alpha = np.clip(gray.astype(np.float32) / 255.0, 0, 1)
        alpha_3ch = np.stack([alpha, alpha, alpha], axis=-1)

        # Create green background (RGB triple, not a chroma-key constant)
        green_bg = np.zeros_like(orig_frame)
        green_bg[:, :] = [156, 251, 165]  # Green screen color

        # Composite: original_RGB * alpha + green * (1 - alpha)
        composite = (orig_frame.astype(np.float32) * alpha_3ch +
                     green_bg.astype(np.float32) * (1 - alpha_3ch)).astype(np.uint8)
        greenscreen_frames.append(composite)

    save_video(greenscreen_frames, greenscreen_path, video_state["fps"])

    status_msg = f"✓ Complete! Generated {len(output_frames)} frames."

    return video_state, str(output_video_path), str(mask_video_path), str(greenscreen_path), status_msg
326
+
327
+
328
def save_video(frames, output_path, fps):
    """Write a list of RGB (or single-channel) frames to an mp4 file."""
    if len(frames) == 0:
        return

    height, width = frames[0].shape[:2]
    writer = cv2.VideoWriter(
        str(output_path),
        cv2.VideoWriter_fourcc(*'mp4v'),
        fps,
        (width, height)
    )

    for frame in frames:
        # cv2.VideoWriter expects 3-channel BGR input.
        code = cv2.COLOR_GRAY2BGR if len(frame.shape) == 2 else cv2.COLOR_RGB2BGR
        writer.write(cv2.cvtColor(frame, code))

    writer.release()
    print(f"Saved video to {output_path}")
346
+
347
+
348
def restart():
    """Reset every UI state and output back to its initial empty value."""
    return (
        None,                       # video_state
        [[], []],                   # click_state
        None,                       # first_frame_display
        gr.update(visible=False),   # point_prompt
        gr.update(visible=False),   # clear_button
        gr.update(visible=False),   # run_button
        None,                       # output_video
        None,                       # mask_video
        None,                       # greenscreen_video
        ""                          # status_text
    )
353
+
354
+
355
# CSS styling for the Gradio app (gradient title, highlighted run button)
custom_css = """
.gradio-container {width: 90% !important; margin: 0 auto;}
.title-text {text-align: center; font-size: 48px; font-weight: bold;
background: linear-gradient(to right, #8b5cf6, #10b981);
-webkit-background-clip: text; -webkit-text-fill-color: transparent;}
.description-text {text-align: center; font-size: 18px; margin: 20px 0;}
button {border-radius: 8px !important;}
.green_button {background-color: #10b981 !important; color: white !important;}
.red_button {background-color: #ef4444 !important; color: white !important;}
.run_matting_button {
background: linear-gradient(135deg, #667eea 0%, #764ba2 50%, #f093fb 100%) !important;
color: white !important;
font-weight: bold !important;
font-size: 18px !important;
padding: 20px !important;
box-shadow: 0 4px 15px 0 rgba(102, 126, 234, 0.75) !important;
border: none !important;
}
.run_matting_button:hover {
background: linear-gradient(135deg, #764ba2 0%, #667eea 50%, #f093fb 100%) !important;
box-shadow: 0 6px 20px 0 rgba(102, 126, 234, 0.9) !important;
transform: translateY(-2px) !important;
}
"""

# Build Gradio interface
with gr.Blocks(css=custom_css, title="VideoMaMa Demo") as demo:
    gr.HTML('<div class="title-text">VideoMaMa Interactive Demo</div>')
    gr.Markdown(
        '<div class="description-text">🎬 Upload a video → 🖱️ Click to mark object → ✅ Generate masks → 🎨 Run VideoMaMa</div>'
    )

    # Per-session state (not shared between users)
    video_state = gr.State(None)
    click_state = gr.State([[], []])  # [[points], [labels]]

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### Step 1: Upload Video")
            video_input = gr.Video(label="Input Video")
            load_button = gr.Button("📁 Load Video", variant="primary")

            gr.Markdown("### Step 2: Mark Object")
            # Hidden until a video is loaded (see load_video outputs)
            point_prompt = gr.Radio(
                choices=["Positive", "Negative"],
                value="Positive",
                label="Click Type",
                info="Positive: object, Negative: background",
                visible=False
            )
            clear_button = gr.Button("🗑️ Clear Clicks", visible=False)

        with gr.Column(scale=1):
            gr.Markdown("### First Frame (Click to Add Points)")
            first_frame_display = gr.Image(
                label="First Frame",
                type="pil",
                interactive=True
            )
            run_button = gr.Button("🚀 Run Matting", visible=False, elem_classes="run_matting_button", size="lg")

    status_text = gr.Textbox(label="Status", value="", interactive=False, visible=False)

    gr.Markdown("### Outputs")
    with gr.Row():
        with gr.Column():
            output_video = gr.Video(label="Matting Result", autoplay=True)
        with gr.Column():
            greenscreen_video = gr.Video(label="Greenscreen Composite", autoplay=True)
        with gr.Column():
            mask_video = gr.Video(label="Mask Track", autoplay=True)

    # Event handlers
    load_button.click(
        fn=load_video,
        inputs=[video_input, video_state],
        outputs=[video_state, first_frame_display,
                 point_prompt, clear_button, run_button, status_text]
    )

    # Each click on the first frame re-runs SAM2 and repaints the preview
    first_frame_display.select(
        fn=sam_refine,
        inputs=[video_state, point_prompt, click_state],
        outputs=[first_frame_display, video_state, click_state]
    )

    clear_button.click(
        fn=clear_clicks,
        inputs=[video_state, click_state],
        outputs=[first_frame_display, video_state, click_state]
    )

    # Runs SAM2 propagation + VideoMaMa in one go (propagate_masks is unused
    # by the UI; run_videomama_with_sam2 performs the tracking itself)
    run_button.click(
        fn=run_videomama_with_sam2,
        inputs=[video_state, click_state],
        outputs=[video_state, output_video, mask_video, greenscreen_video, status_text]
    )

    # Selecting a new input video resets everything
    video_input.change(
        fn=restart,
        inputs=[],
        outputs=[video_state, click_state, first_frame_display,
                 point_prompt, clear_button, run_button,
                 output_video, mask_video, greenscreen_video, status_text]
    )

    # Examples (only shown if a samples/ directory with mp4s exists)
    gr.Markdown("---\n### 📦 Example Videos")
    example_dir = Path("samples")
    if example_dir.exists():
        examples = [str(p) for p in sorted(example_dir.glob("*.mp4"))]
        if examples:
            gr.Examples(examples=examples, inputs=[video_input])
469
+
470
+
471
if __name__ == "__main__":
    print("=" * 60)
    print("VideoMaMa Interactive Demo")
    print("=" * 60)

    # Load SAM2 + VideoMaMa before serving any requests.
    initialize_models()

    # Launch demo with request queueing enabled (long-running inference).
    demo.queue()
    # NOTE(review): server_name="127.0.0.1" binds only to localhost; a
    # hosted deployment (e.g. HF Space) usually needs "0.0.0.0" — confirm
    # the deployment target. share=True additionally opens a public tunnel.
    demo.launch(
        server_name="127.0.0.1",
        server_port=7860,
        share=True
    )
download_checkpoints.sh ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Download model checkpoints for VideoMaMa demo
#
# Fetches the SAM 2.1 Hiera-Large checkpoint (the filename and config that
# sam2_wrapper.py actually loads) and checks the VideoMaMa UNet layout.

set -e

echo "🔽 Downloading model checkpoints for VideoMaMa demo..."
echo ""

# Create checkpoints directory
echo "Creating checkpoints directory..."
mkdir -p checkpoints
echo "✓ Directory created"
echo ""

# Download SAM2 checkpoint.
# NOTE: sam2_wrapper.py loads checkpoints/sam2.1_hiera_large.pt with the
# sam2.1 config, so the SAM 2.1 release checkpoint is fetched here (the
# previous URL pointed at the older SAM 2.0 file name).
SAM2_URL="https://dl.fbaipublicfiles.com/segment_anything_2/092824/sam2.1_hiera_large.pt"
SAM2_OUT="checkpoints/sam2.1_hiera_large.pt"

echo "Downloading SAM2 checkpoint..."
echo "URL: ${SAM2_URL}"
echo "This may take a few minutes (file size: ~900MB)..."

if command -v wget &> /dev/null; then
    wget "${SAM2_URL}" -O "${SAM2_OUT}"
elif command -v curl &> /dev/null; then
    curl -L "${SAM2_URL}" -o "${SAM2_OUT}"
else
    echo "❌ Error: Neither wget nor curl is available. Please install one of them."
    exit 1
fi

echo "✓ SAM2 checkpoint downloaded successfully"
echo ""

# Check if VideoMaMa checkpoint exists
echo "Checking VideoMaMa checkpoint..."
if [ -d "checkpoints/videomama_unet" ]; then
    if [ -f "checkpoints/videomama_unet/config.json" ] && \
       { [ -f "checkpoints/videomama_unet/diffusion_pytorch_model.safetensors" ] || \
         [ -f "checkpoints/videomama_unet/diffusion_pytorch_model.bin" ]; }; then
        echo "✓ VideoMaMa checkpoint already exists"
    else
        echo "⚠️ VideoMaMa checkpoint directory exists but is incomplete"
        echo "   Please add the following files to checkpoints/videomama_unet/:"
        echo "   - config.json"
        echo "   - diffusion_pytorch_model.safetensors (or .bin)"
    fi
else
    echo "⚠️ VideoMaMa checkpoint not found"
    echo ""
    echo "📝 Manual step required:"
    echo "   1. Create directory: checkpoints/videomama_unet/"
    echo "   2. Copy your trained VideoMaMa checkpoint files:"
    echo "      - config.json"
    echo "      - diffusion_pytorch_model.safetensors (or .bin)"
    echo ""
    echo "   Example:"
    echo "   mkdir -p checkpoints/videomama_unet"
    echo "   cp /path/to/your/checkpoint/* checkpoints/videomama_unet/"
fi

echo ""
# Fixed: 'echo "="*70' printed the literal '=*70' — shell has no string
# repetition operator; build a real separator line instead.
SEPARATOR=$(printf '=%.0s' {1..70})
echo "${SEPARATOR}"
echo "✨ Checkpoint download complete!"
echo "${SEPARATOR}"
echo ""
echo "Next steps:"
echo "1. Verify checkpoints are in place:"
echo "   python test_setup.py"
echo ""
echo "2. (Optional) Add sample videos:"
echo "   mkdir -p samples"
echo "   cp your_sample.mp4 samples/"
echo ""
echo "3. Test locally:"
echo "   python app.py"
echo ""
echo "4. Deploy to Hugging Face Space"
echo ""
+ echo ""
requirements.txt ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Hugging Face Space Requirements for VideoMaMa Demo
2
+
3
+ # Core frameworks
4
+ torch>=2.0.0
5
+ torchvision>=0.15.0
6
+ diffusers>=0.24.0
7
+ transformers>=4.30.0
8
+
9
+ # Gradio for UI
10
+ gradio==4.31.0
11
+
12
+ # Image and video processing
13
+ opencv-python>=4.8.0
14
+ opencv-contrib-python>=4.8.0
15
+ Pillow>=10.0.0
16
+ numpy>=1.24.0
17
+ scipy>=1.10.0
18
+
19
+ # SAM2 dependencies
20
+ segment-anything-2 @ git+https://github.com/facebookresearch/segment-anything-2.git
21
+
22
+ # Additional utilities
23
+ accelerate>=0.20.0
24
+ einops>=0.6.0
25
+ tqdm>=4.65.0
26
+ safetensors>=0.3.0
27
+
28
+ # For video export
29
+ imageio>=2.31.0
30
+ imageio-ffmpeg>=0.4.9
31
+ pydantic==2.10.6
sam2_hiera_l.yaml ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Model Configuration for SAM2
2
+ # This file should be placed alongside the SAM2 checkpoint
3
+
4
+ # SAM 2 Hiera Large Configuration
5
+ model:
6
+ _target_: sam2.modeling.sam2_base.SAM2Base
7
+ image_encoder:
8
+ _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
9
+ trunk:
10
+ _target_: sam2.modeling.backbones.hieradet.Hiera
11
+ embed_dim: 144
12
+ num_heads: 2
13
+ stages: [2, 6, 36, 4]
14
+ global_att_blocks: [23, 33, 43]
15
+ window_pos_embed_bkg_spatial_size: [7, 7]
16
+ window_spec: [8, 4, 16, 8]
17
+ neck:
18
+ _target_: sam2.modeling.backbones.image_encoder.FpnNeck
19
+ position_encoding:
20
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
21
+ num_pos_feats: 256
22
+ normalize: true
23
+ scale: null
24
+ temperature: 10000
25
+ d_model: 256
26
+ backbone_channel_list: [1152, 576, 288, 144]
27
+ fpn_top_down_levels: [2, 3]
28
+ fpn_interp_model: nearest
29
+
30
+ memory_attention:
31
+ _target_: sam2.modeling.memory_attention.MemoryAttention
32
+ d_model: 256
33
+ pos_enc_at_input: true
34
+ layer:
35
+ _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
36
+ activation: relu
37
+ dim_feedforward: 2048
38
+ dropout: 0.1
39
+ pos_enc_at_attn: false
40
+ self_attention:
41
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
42
+ rope_theta: 10000.0
43
+ feat_sizes: [32, 32]
44
+ embedding_dim: 256
45
+ num_heads: 1
46
+ downsample_rate: 1
47
+ dropout: 0.1
48
+ d_model: 256
49
+ pos_enc_at_cross_attn_keys: true
50
+ pos_enc_at_cross_attn_queries: false
51
+ cross_attention:
52
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
53
+ rope_theta: 10000.0
54
+ feat_sizes: [32, 32]
55
+ rope_k_repeat: True
56
+ embedding_dim: 256
57
+ num_heads: 1
58
+ downsample_rate: 1
59
+ dropout: 0.1
60
+ kv_in_dim: 64
61
+ num_layers: 4
62
+
63
+ memory_encoder:
64
+ _target_: sam2.modeling.memory_encoder.MemoryEncoder
65
+ out_dim: 64
66
+ position_encoding:
67
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
68
+ num_pos_feats: 64
69
+ normalize: true
70
+ scale: null
71
+ temperature: 10000
72
+ mask_downsampler:
73
+ _target_: sam2.modeling.memory_encoder.MaskDownSampler
74
+ kernel_size: 3
75
+ stride: 2
76
+ padding: 1
77
+ fuser:
78
+ _target_: sam2.modeling.memory_encoder.Fuser
79
+ layer:
80
+ _target_: sam2.modeling.memory_encoder.CXBlock
81
+ dim: 256
82
+ kernel_size: 7
83
+ padding: 3
84
+ layer_scale_init_value: 1e-6
85
+ use_dwconv: True
86
+ num_layers: 2
87
+
88
+ num_maskmem: 7
89
+ image_size: 1024
90
+ sigmoid_scale_for_mem_enc: 20.0
91
+ sigmoid_bias_for_mem_enc: -10.0
92
+ use_mask_input_as_output_without_sam: true
93
+ directly_add_no_mem_embed: true
94
+ use_high_res_features_in_sam: true
95
+ multimask_output_in_sam: true
96
+ multimask_min_pt_num: 0
97
+ multimask_max_pt_num: 1
98
+ multimask_output_for_tracking: true
99
+ use_multimask_token_for_obj_ptr: true
100
+ iou_prediction_use_sigmoid: True
101
+ memory_temporal_stride_for_eval: 1
102
+ non_overlap_masks_for_mem_enc: true
103
+ use_obj_ptrs_in_encoder: true
104
+ max_obj_ptrs_in_encoder: 16
105
+ add_tpos_enc_to_obj_ptrs: false
106
+ proj_tpos_enc_in_obj_ptrs: false
107
+ use_signed_tpos_enc_to_obj_ptrs: false
108
+ only_obj_ptrs_in_the_past_for_eval: true
109
+ pred_obj_scores: true
110
+ pred_obj_scores_mlp: true
111
+ fixed_no_obj_ptr: true
112
+ soft_no_obj_ptr: false
113
+ use_mlp_for_obj_ptr_proj: true
114
+ no_obj_embed_spatial: true
115
+
116
+ sam_mask_decoder_extra_args:
117
+ dynamic_multimask_via_stability: true
118
+ dynamic_multimask_stability_delta: 0.05
119
+ dynamic_multimask_stability_thresh: 0.98
120
+ pred_obj_scores: true
121
+ pred_obj_scores_mlp: true
122
+ use_multimask_token_for_obj_ptr: true
123
+
124
+ compile_image_encoder: False
sam2_wrapper.py ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ SAM2 Wrapper for Video Mask Tracking
3
+ Handles mask generation and propagation through video
4
+ """
5
+
6
+ import sys
7
+ sys.path.append("/home/cvlab19/project/samuel/CVPR/sam2")
8
+
9
+ import cv2
10
+ import numpy as np
11
+ import torch
12
+ from PIL import Image
13
+ from pathlib import Path
14
+ from typing import List, Tuple
15
+ import tempfile
16
+ import shutil
17
+
18
+ from sam2.build_sam import build_sam2_video_predictor
19
+
20
+
21
class SAM2VideoTracker:
    """Thin wrapper around SAM2's video predictor for point-prompted tracking."""

    def __init__(self, checkpoint_path, config_file, device="cuda"):
        """
        Initialize SAM2 video tracker.

        Args:
            checkpoint_path: Path to SAM2 checkpoint (.pt file)
            config_file: SAM2 config name passed to build_sam2_video_predictor
            device: Device to run on ("cuda" or "cpu")
        """
        self.device = device
        self.predictor = build_sam2_video_predictor(
            config_file=config_file,
            ckpt_path=checkpoint_path,
            device=device
        )
        print(f"SAM2 video tracker initialized on {device}")

    def track_video(self, frames: List[np.ndarray], points: List[List[int]],
                    labels: List[int]) -> List[np.ndarray]:
        """
        Track object through video using SAM2.

        Args:
            frames: List of numpy arrays, [(H,W,3)]*n, uint8 RGB frames
            points: List of [x, y] prompt coordinates (all on frame 0)
            labels: List of labels (1 for positive, 0 for negative)

        Returns:
            masks: List of numpy arrays, [(H,W)]*n, uint8 masks (0 or 255)
        """
        # SAM2's init_state consumes a directory of image frames, so the
        # in-memory frames are staged on disk in a temp dir first.
        temp_dir = Path(tempfile.mkdtemp())
        frames_dir = temp_dir / "frames"
        frames_dir.mkdir(exist_ok=True)

        try:
            # Save frames to temp directory (zero-padded names keep order)
            print(f"Saving {len(frames)} frames to temporary directory...")
            for i, frame in enumerate(frames):
                frame_path = frames_dir / f"{i:05d}.jpg"
                # NOTE(review): JPEG is lossy; masks can differ slightly
                # from lossless input — presumably acceptable for a demo.
                Image.fromarray(frame).save(frame_path, quality=95)

            # Initialize SAM2 video predictor
            print("Initializing SAM2 inference state...")
            inference_state = self.predictor.init_state(video_path=str(frames_dir))

            # Add prompts on first frame (SAM2 expects float32 points,
            # int32 labels)
            points_array = np.array(points, dtype=np.float32)
            labels_array = np.array(labels, dtype=np.int32)

            print(f"Adding {len(points)} point prompts on first frame...")
            # NOTE(review): newer sam2 releases rename this method to
            # add_new_points_or_box — confirm against the pinned version.
            _, out_obj_ids, out_mask_logits = self.predictor.add_new_points(
                inference_state=inference_state,
                frame_idx=0,
                obj_id=1,
                points=points_array,
                labels=labels_array,
            )

            # Propagate through video
            print("Propagating masks through video...")
            masks = []
            for frame_idx, object_ids, mask_logits in self.predictor.propagate_in_video(inference_state):
                # Get mask for object ID 1 (the single tracked object)
                # object_ids can be a tensor or a list
                obj_ids_list = object_ids.tolist() if hasattr(object_ids, 'tolist') else object_ids

                if 1 in obj_ids_list:
                    mask_idx = obj_ids_list.index(1)
                    # Threshold logits at 0 -> boolean, then scale to 0/255
                    mask = (mask_logits[mask_idx] > 0.0).cpu().numpy()
                    mask_uint8 = (mask.squeeze() * 255).astype(np.uint8)
                    masks.append(mask_uint8)
                else:
                    # No mask for this frame, use empty mask
                    h, w = frames[0].shape[:2]
                    masks.append(np.zeros((h, w), dtype=np.uint8))

            print(f"Generated {len(masks)} masks")
            return masks

        finally:
            # Clean up temporary directory
            shutil.rmtree(temp_dir, ignore_errors=True)

    def get_first_frame_mask(self, frame: np.ndarray, points: List[List[int]],
                             labels: List[int]) -> np.ndarray:
        """
        Get mask for first frame only (used for the interactive preview).

        Args:
            frame: np.ndarray, (H, W, 3), uint8 RGB frame
            points: List of [x, y] coordinates
            labels: List of labels (1 for positive, 0 for negative)

        Returns:
            mask: np.ndarray, (H, W), uint8 mask (0 or 255)
        """
        # Same disk-staging dance as track_video, but with a single frame.
        temp_dir = Path(tempfile.mkdtemp())
        frames_dir = temp_dir / "frames"
        frames_dir.mkdir(exist_ok=True)

        try:
            # Save single frame
            frame_path = frames_dir / "00000.jpg"
            Image.fromarray(frame).save(frame_path, quality=95)

            # Initialize SAM2
            inference_state = self.predictor.init_state(video_path=str(frames_dir))

            # Add prompts
            points_array = np.array(points, dtype=np.float32)
            labels_array = np.array(labels, dtype=np.int32)

            _, out_obj_ids, out_mask_logits = self.predictor.add_new_points(
                inference_state=inference_state,
                frame_idx=0,
                obj_id=1,
                points=points_array,
                labels=labels_array,
            )

            # Get mask (empty fallback when the predictor returns nothing)
            if len(out_mask_logits) > 0:
                mask = (out_mask_logits[0] > 0.0).cpu().numpy()
                mask_uint8 = (mask.squeeze() * 255).astype(np.uint8)
                return mask_uint8
            else:
                return np.zeros(frame.shape[:2], dtype=np.uint8)

        finally:
            shutil.rmtree(temp_dir, ignore_errors=True)
+ shutil.rmtree(temp_dir, ignore_errors=True)
154
+
155
+
156
+ def load_sam2_tracker(device="cuda"):
157
+ """
158
+ Load SAM2 video tracker with pretrained weights
159
+
160
+ Args:
161
+ device: Device to run on
162
+
163
+ Returns:
164
+ SAM2VideoTracker instance
165
+ """
166
+ checkpoint_path = "/home/cvlab19/project/samuel/CVPR/sam2/checkpoints/sam2.1_hiera_large.pt"
167
+ config_file = "configs/sam2.1/sam2.1_hiera_l.yaml"
168
+
169
+ print(f"Loading SAM2 from {checkpoint_path}...")
170
+ tracker = SAM2VideoTracker(checkpoint_path, config_file, device)
171
+
172
+ return tracker
sam2_wrapper_hf.py ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ SAM2 Wrapper for Video Mask Tracking - Hugging Face Space Version
3
+ Handles mask generation and propagation through video
4
+ """
5
+
6
+ import sys
7
+ import os
8
+ from pathlib import Path
9
+
10
+ # Add SAM2 to path if installed
11
+ try:
12
+ import sam2
13
+ except ImportError:
14
+ # Try to add from common locations
15
+ possible_paths = [
16
+ "/home/cvlab19/project/samuel/CVPR/sam2",
17
+ "./sam2"
18
+ ]
19
+ for path in possible_paths:
20
+ if os.path.exists(path):
21
+ sys.path.append(path)
22
+ break
23
+
24
+ import cv2
25
+ import numpy as np
26
+ import torch
27
+ from PIL import Image
28
+ from typing import List, Tuple
29
+ import tempfile
30
+ import shutil
31
+
32
+ from sam2.build_sam import build_sam2_video_predictor
33
+
34
+
35
class SAM2VideoTracker:
    """Thin wrapper around the SAM2 video predictor for point-prompted tracking."""

    def __init__(self, checkpoint_path, config_file, device="cuda"):
        """
        Initialize SAM2 video tracker.

        Args:
            checkpoint_path: Path to SAM2 checkpoint
            config_file: Path to SAM2 config file
            device: Device to run on
        """
        self.device = device
        self.predictor = build_sam2_video_predictor(
            config_file=config_file,
            ckpt_path=checkpoint_path,
            device=device
        )
        print(f"SAM2 video tracker initialized on {device}")

    def _prompt_first_frame(self, inference_state, points, labels):
        """Add all point prompts for object id 1 on frame 0.

        Returns:
            (out_obj_ids, out_mask_logits) as produced by the predictor.
        """
        points_array = np.array(points, dtype=np.float32)
        labels_array = np.array(labels, dtype=np.int32)
        _, out_obj_ids, out_mask_logits = self.predictor.add_new_points(
            inference_state=inference_state,
            frame_idx=0,
            obj_id=1,
            points=points_array,
            labels=labels_array,
        )
        return out_obj_ids, out_mask_logits

    @staticmethod
    def _logits_to_mask(logits):
        """Threshold mask logits at 0 and convert to a uint8 {0, 255} mask."""
        mask = (logits > 0.0).cpu().numpy()
        return (mask.squeeze() * 255).astype(np.uint8)

    def track_video(self, frames: List[np.ndarray], points: List[List[int]],
                    labels: List[int]) -> List[np.ndarray]:
        """
        Track object through video using SAM2.

        Args:
            frames: List of numpy arrays, [(H,W,3)]*n, uint8 RGB frames
            points: List of [x, y] coordinates for prompts
            labels: List of labels (1 for positive, 0 for negative)

        Returns:
            masks: List of numpy arrays, [(H,W)]*n, uint8 binary masks
        """
        # SAM2's init_state reads frames from disk, so stage them in a
        # throwaway directory that is always cleaned up afterwards.
        temp_dir = Path(tempfile.mkdtemp())
        frames_dir = temp_dir / "frames"
        frames_dir.mkdir(exist_ok=True)

        try:
            print(f"Saving {len(frames)} frames to temporary directory...")
            for i, frame in enumerate(frames):
                Image.fromarray(frame).save(frames_dir / f"{i:05d}.jpg", quality=95)

            print("Initializing SAM2 inference state...")
            inference_state = self.predictor.init_state(video_path=str(frames_dir))

            print(f"Adding {len(points)} point prompts on first frame...")
            self._prompt_first_frame(inference_state, points, labels)

            print("Propagating masks through video...")
            masks = []
            for frame_idx, object_ids, mask_logits in self.predictor.propagate_in_video(inference_state):
                # object_ids may be a tensor or a plain list depending on version.
                obj_ids_list = object_ids.tolist() if hasattr(object_ids, 'tolist') else object_ids

                if 1 in obj_ids_list:
                    masks.append(self._logits_to_mask(mask_logits[obj_ids_list.index(1)]))
                else:
                    # No mask for this frame, use empty mask.
                    h, w = frames[0].shape[:2]
                    masks.append(np.zeros((h, w), dtype=np.uint8))

            print(f"Generated {len(masks)} masks")
            return masks

        finally:
            shutil.rmtree(temp_dir, ignore_errors=True)

    def get_first_frame_mask(self, frame: np.ndarray, points: List[List[int]],
                             labels: List[int]) -> np.ndarray:
        """
        Get mask for first frame only (for preview).

        Args:
            frame: np.ndarray, (H, W, 3), uint8 RGB frame
            points: List of [x, y] coordinates
            labels: List of labels (1 for positive, 0 for negative)

        Returns:
            mask: np.ndarray, (H, W), uint8 binary mask
        """
        temp_dir = Path(tempfile.mkdtemp())
        frames_dir = temp_dir / "frames"
        frames_dir.mkdir(exist_ok=True)

        try:
            # Stage a single frame and prompt it exactly like track_video does.
            Image.fromarray(frame).save(frames_dir / "00000.jpg", quality=95)
            inference_state = self.predictor.init_state(video_path=str(frames_dir))

            _, out_mask_logits = self._prompt_first_frame(inference_state, points, labels)

            if len(out_mask_logits) > 0:
                return self._logits_to_mask(out_mask_logits[0])
            return np.zeros(frame.shape[:2], dtype=np.uint8)

        finally:
            shutil.rmtree(temp_dir, ignore_errors=True)
167
+
168
+
169
def load_sam2_tracker(checkpoint_path=None, device="cuda"):
    """
    Load SAM2 video tracker with pretrained weights.

    Args:
        checkpoint_path: Path to SAM2 checkpoint (if None, uses default location)
        device: Device to run on

    Returns:
        SAM2VideoTracker instance

    Raises:
        FileNotFoundError: if the checkpoint file does not exist.
    """
    # Use provided path or default
    if checkpoint_path is None:
        checkpoint_path = "checkpoints/sam2.1_hiera_large.pt"

    # Fail fast with an actionable message (same behavior as the VideoMaMa
    # loader) instead of an opaque error deep inside the SAM2 builder.
    if not os.path.exists(checkpoint_path):
        raise FileNotFoundError(
            f"SAM2 checkpoint not found at {checkpoint_path}. "
            f"Please ensure models are downloaded correctly."
        )

    # Config file should be in the SAM2 repo; fall back to a local yaml
    # when the packaged config is not present on disk.
    config_file = "configs/sam2.1/sam2.1_hiera_l.yaml"
    if not os.path.exists(config_file):
        config_file = "sam2_hiera_l.yaml"

    print(f"Loading SAM2 from {checkpoint_path}...")
    print(f"Using config: {config_file}")

    tracker = SAM2VideoTracker(checkpoint_path, config_file, device)

    return tracker
tools/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # Tools module
tools/base_segmenter.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ SAM2 Base Segmenter
3
+ Adapted from MatAnyone demo
4
+ """
5
+
6
+ import sys
7
+ sys.path.append("/home/cvlab19/project/samuel/CVPR/sam2")
8
+
9
+ import torch
10
+ import numpy as np
11
+ from sam2.build_sam import build_sam2_video_predictor
12
+
13
+
14
class BaseSegmenter:
    """SAM2-backed segmenter skeleton; predict() is currently a stub."""

    def __init__(self, SAM_checkpoint, model_type, device):
        """
        Initialize SAM2 segmenter.

        Args:
            SAM_checkpoint: Path to SAM2 checkpoint
            model_type: SAM2 model config file
            device: Device to run on
        """
        self.device = device
        self.model_type = model_type

        # Build SAM2 video predictor
        self.sam_predictor = build_sam2_video_predictor(
            config_file=model_type,
            ckpt_path=SAM_checkpoint,
            device=device,
        )

        # NOTE: attribute keeps the original "orignal" spelling so existing
        # callers that read it keep working.
        self.orignal_image = None
        self.inference_state = None

    def set_image(self, image: np.ndarray):
        """Set the current image for segmentation."""
        self.orignal_image = image

    def reset_image(self):
        """Reset the current image and any cached inference state."""
        self.orignal_image = None
        self.inference_state = None

    def predict(self, prompts, prompt_type, multimask=True):
        """
        Predict mask from prompts.

        Args:
            prompts: Dictionary with point_coords, point_labels, mask_input
            prompt_type: 'point' or 'both'
            multimask: Whether to return multiple masks

        Returns:
            masks, scores, logits

        NOTE(review): stub implementation — ignores the prompts and returns
        an all-False mask, a unit score and zero logits shaped like the
        current image. Real SAM2 prediction is still to be wired in.
        """
        height, width = self.orignal_image.shape[:2]

        empty_mask = np.zeros((height, width), dtype=bool)
        unit_score = np.array([1.0])
        zero_logit = np.zeros((height, width), dtype=np.float32)

        return np.array([empty_mask]), unit_score, np.array([zero_logit])
tools/interact_tools.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ SAM2 Interaction Tools
3
+ Handles SAM2 mask generation with user clicks
4
+ """
5
+
6
+ import sys
7
+ sys.path.append("/home/cvlab19/project/samuel/CVPR/sam2")
8
+
9
+ import numpy as np
10
+ from PIL import Image
11
+ from .base_segmenter import BaseSegmenter
12
+ from .painter import mask_painter, point_painter
13
+
14
+
15
# Default visualization settings. Color values are IDs into the palette
# defined in tools.painter (applied modulo the palette length).
mask_color = 3        # blue overlay for the object mask
mask_alpha = 0.7      # mask blend transparency
contour_color = 1     # red outline around the mask
contour_width = 5     # outline thickness in pixels
# NOTE(review): the _ne/_ps suffixes read like negative/positive, but usage
# in SamControler.first_frame_click paints POSITIVE clicks with
# point_color_ne and NEGATIVE clicks with point_color_ps — confirm naming.
point_color_ne = 8    # color ID used for positive points (orange in palette)
point_color_ps = 50   # color ID used for negative points (50 % 10 -> black)
point_alpha = 0.9     # point fill transparency
point_radius = 15     # point circle radius in pixels
23
+
24
+
25
class SamControler:
    """User-click front end over BaseSegmenter: predicts a mask from point
    prompts and renders an overlay with the clicked points."""

    def __init__(self, SAM_checkpoint, model_type, device):
        """
        Initialize SAM controller.

        Args:
            SAM_checkpoint: Path to SAM2 checkpoint
            model_type: SAM2 model config file
            device: Device to run on
        """
        # BaseSegmenter wraps the SAM2 predictor and exposes predict().
        self.sam_controler = BaseSegmenter(SAM_checkpoint, model_type, device)
        self.device = device

    def first_frame_click(self, image: np.ndarray, points: np.ndarray,
                          labels: np.ndarray, multimask=True, mask_color=3):
        """
        Generate mask from clicks on first frame.

        Args:
            image: np.ndarray, (H, W, 3), RGB image
            points: np.ndarray, (N, 2), [x, y] coordinates
            labels: np.ndarray, (N,), 1 for positive, 0 for negative
            multimask: bool, whether to generate multiple masks
            mask_color: int, color ID forwarded to mask_painter (shadows the
                module-level constant of the same name)

        Returns:
            mask: np.ndarray, (H, W), binary mask
            logit: np.ndarray, (H, W), mask logits
            painted_image: PIL.Image, visualization with mask and points
        """
        # Label of the MOST RECENT click (1 = positive, 0 = negative).
        # NOTE(review): only the last click decides whether the two-pass
        # refinement below runs — it does not check whether any positive
        # click exists anywhere in `labels`; confirm intended behavior.
        neg_flag = labels[-1]

        if neg_flag == 1:  # last click positive: predict, then refine
            # First pass with points only.
            prompts = {
                'point_coords': points,
                'point_labels': labels,
            }
            masks, scores, logits = self.sam_controler.predict(prompts, 'point', multimask)
            mask, logit = masks[np.argmax(scores)], logits[np.argmax(scores), :, :]

            # Second pass: feed the best logit back as a mask prompt.
            prompts = {
                'point_coords': points,
                'point_labels': labels,
                'mask_input': logit[None, :, :]
            }
            masks, scores, logits = self.sam_controler.predict(prompts, 'both', multimask)
            mask, logit = masks[np.argmax(scores)], logits[np.argmax(scores), :, :]
        else:  # last click negative: single point-only pass
            prompts = {
                'point_coords': points,
                'point_labels': labels,
            }
            masks, scores, logits = self.sam_controler.predict(prompts, 'point', multimask)
            mask, logit = masks[np.argmax(scores)], logits[np.argmax(scores), :, :]

        # Overlay the chosen mask on the input image.
        painted_image = mask_painter(
            image,
            mask.astype('uint8'),
            mask_color,
            mask_alpha,
            contour_color,
            contour_width
        )

        # Draw positive clicks (label > 0).
        positive_points = np.squeeze(points[np.argwhere(labels > 0)], axis=1)
        if len(positive_points) > 0:
            painted_image = point_painter(
                painted_image,
                positive_points,
                point_color_ne,
                point_alpha,
                point_radius,
                contour_color,
                contour_width
            )

        # Draw negative clicks (label < 1).
        negative_points = np.squeeze(points[np.argwhere(labels < 1)], axis=1)
        if len(negative_points) > 0:
            painted_image = point_painter(
                painted_image,
                negative_points,
                point_color_ps,
                point_alpha,
                point_radius,
                contour_color,
                contour_width
            )

        painted_image = Image.fromarray(painted_image)

        return mask, logit, painted_image
tools/painter.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Mask and point painting utilities
3
+ Adapted from MatAnyone demo
4
+ """
5
+
6
+ import cv2
7
+ import numpy as np
8
+ from PIL import Image
9
+
10
+
11
def mask_painter(input_image, input_mask, mask_color=5, mask_alpha=0.7,
                 contour_color=1, contour_width=5):
    """
    Alpha-blend a colored mask onto an image and optionally outline it.

    Args:
        input_image: np.ndarray, (H, W, 3)
        input_mask: np.ndarray, (H, W), binary mask
        mask_color: int, palette color ID for the mask fill
        mask_alpha: float, blend transparency for the fill
        contour_color: int, palette color ID for the outline
        contour_width: int, outline thickness (0 disables the outline)

    Returns:
        painted_image: np.ndarray, (H, W, 3)
    """
    assert input_image.shape[:2] == input_mask.shape, "Image and mask must have same dimensions"

    # Fixed 10-entry RGB palette; color IDs are taken modulo its length.
    palette = np.array([
        [0, 0, 0],        # 0: black
        [255, 0, 0],      # 1: red
        [0, 255, 0],      # 2: green
        [0, 0, 255],      # 3: blue
        [255, 255, 0],    # 4: yellow
        [255, 0, 255],    # 5: magenta
        [0, 255, 255],    # 6: cyan
        [128, 128, 128],  # 7: gray
        [255, 165, 0],    # 8: orange
        [128, 0, 128],    # 9: purple
    ])
    fill_rgb = palette[mask_color % len(palette)]
    edge_rgb = palette[contour_color % len(palette)]

    canvas = input_image.copy()
    region = input_mask > 0

    # Blend the fill color into the masked pixels only.
    blended = canvas[region] * (1 - mask_alpha) + fill_rgb * mask_alpha
    canvas[region] = blended.astype(np.uint8)

    # Trace the mask boundary when an outline is requested.
    if contour_width > 0:
        outlines, _ = cv2.findContours(
            input_mask.astype(np.uint8),
            cv2.RETR_EXTERNAL,
            cv2.CHAIN_APPROX_SIMPLE
        )
        cv2.drawContours(canvas, outlines, -1, edge_rgb.tolist(), contour_width)

    return canvas
74
+
75
+
76
def point_painter(input_image, input_points, point_color=8, point_alpha=0.9,
                  point_radius=15, contour_color=2, contour_width=3):
    """
    Draw alpha-blended circles at the given points, optionally outlined.

    Args:
        input_image: np.ndarray, (H, W, 3)
        input_points: np.ndarray, (N, 2), [x, y] coordinates
        point_color: int, palette color ID for the circle fill
        point_alpha: float, fill transparency
        point_radius: int, circle radius in pixels
        contour_color: int, palette color ID for the circle outline
        contour_width: int, outline thickness (0 disables the outline)

    Returns:
        painted_image: np.ndarray, (H, W, 3); the input array itself when
        there are no points to draw.
    """
    if len(input_points) == 0:
        return input_image

    # Fixed 10-entry RGB palette; color IDs are taken modulo its length.
    palette = np.array([
        [0, 0, 0],        # 0: black
        [255, 0, 0],      # 1: red
        [0, 255, 0],      # 2: green
        [0, 0, 255],      # 3: blue
        [255, 255, 0],    # 4: yellow
        [255, 0, 255],    # 5: magenta
        [0, 255, 255],    # 6: cyan
        [128, 128, 128],  # 7: gray
        [255, 165, 0],    # 8: orange
        [128, 0, 128],    # 9: purple
    ])
    fill = palette[point_color % len(palette)].tolist()
    edge = palette[contour_color % len(palette)].tolist()

    canvas = input_image.copy()

    for pt in input_points:
        cx, cy = int(pt[0]), int(pt[1])

        # Paint the filled circle on a scratch layer, then alpha-blend it
        # back into the canvas in place.
        layer = canvas.copy()
        cv2.circle(layer, (cx, cy), point_radius, fill, -1)
        cv2.addWeighted(layer, point_alpha, canvas, 1 - point_alpha, 0, canvas)

        if contour_width > 0:
            cv2.circle(canvas, (cx, cy), point_radius, edge, contour_width)

    return canvas
videomama_wrapper.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ VideoMaMa Inference Wrapper
3
+ Handles video matting with mask conditioning
4
+ """
5
+
6
+ import sys
7
+ sys.path.append("../")
8
+ sys.path.append("../../")
9
+
10
+ import torch
11
+ import numpy as np
12
+ from PIL import Image
13
+ from pathlib import Path
14
+ from typing import List
15
+ import tqdm
16
+
17
+ from pipeline_svd_mask import VideoInferencePipeline
18
+
19
+
20
def videomama(pipeline, frames_np, mask_frames_np):
    """
    Run VideoMaMa inference on video frames with mask conditioning.

    Args:
        pipeline: VideoInferencePipeline instance
        frames_np: List of numpy arrays, [(H,W,3)]*n, uint8 RGB frames
        mask_frames_np: List of numpy arrays, [(H,W)]*n, uint8 grayscale masks

    Returns:
        output_frames: List of numpy arrays, [(H,W,3)]*n, uint8 RGB outputs
    """
    # numpy -> PIL; masks become single-channel 'L' images.
    pil_frames = [Image.fromarray(arr) for arr in frames_np]
    pil_masks = [Image.fromarray(arr, mode='L') for arr in mask_frames_np]

    # The model expects a fixed 1024x576 input resolution.
    model_size = (1024, 576)
    frames_resized = [im.resize(model_size, Image.Resampling.BILINEAR)
                      for im in pil_frames]
    masks_resized = [im.resize(model_size, Image.Resampling.BILINEAR)
                     for im in pil_masks]

    print(f"Running VideoMaMa inference on {len(frames_resized)} frames...")
    predictions = pipeline.run(
        cond_frames=frames_resized,
        mask_frames=masks_resized,
        seed=42,
        mask_cond_mode="vae"
    )

    # Restore the caller's original resolution, then convert back to numpy.
    native_size = pil_frames[0].size
    restored = [im.resize(native_size, Image.Resampling.BILINEAR)
                for im in predictions]

    return [np.array(im) for im in restored]
61
+
62
+
63
def load_videomama_pipeline(device="cuda", base_model_path=None, unet_checkpoint_path=None):
    """
    Load VideoMaMa pipeline with pretrained weights.

    Args:
        device: Device to run on
        base_model_path: Path to the SVD base model; falls back to the
            local development default when None.
        unet_checkpoint_path: Path to the VideoMaMa UNet checkpoint; falls
            back to the local development default when None.

    Returns:
        VideoInferencePipeline instance

    Raises:
        FileNotFoundError: if either model directory is missing.
    """
    # Previously hard-coded local paths, now overridable for other machines.
    if base_model_path is None:
        base_model_path = "/home/cvlab19/project/samuel/data/CVPR/pretrained_models/stable-video-diffusion-img2vid-xt"
    if unet_checkpoint_path is None:
        unet_checkpoint_path = "/home/cvlab19/project/samuel/data/CVPR/pretrained_models/videomama"

    # Fail fast with clear messages instead of a deep loader traceback.
    if not Path(base_model_path).exists():
        raise FileNotFoundError(f"SVD base model not found at {base_model_path}")
    if not Path(unet_checkpoint_path).exists():
        raise FileNotFoundError(f"VideoMaMa checkpoint not found at {unet_checkpoint_path}")

    print(f"Loading VideoMaMa pipeline from {unet_checkpoint_path}...")

    pipeline = VideoInferencePipeline(
        base_model_path=base_model_path,
        unet_checkpoint_path=unet_checkpoint_path,
        weight_dtype=torch.float16,
        device=device
    )

    print("VideoMaMa pipeline loaded successfully!")

    return pipeline
videomama_wrapper_hf.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ VideoMaMa Inference Wrapper - Hugging Face Space Version
3
+ Handles video matting with mask conditioning
4
+ """
5
+
6
+ import sys
7
+ import os
8
+ from pathlib import Path
9
+
10
+ # Add parent directories to path for imports
11
+ sys.path.append(str(Path(__file__).parent))
12
+ sys.path.append(str(Path(__file__).parent.parent))
13
+
14
+ import torch
15
+ import numpy as np
16
+ from PIL import Image
17
+ from typing import List
18
+
19
+ from pipeline_svd_mask import VideoInferencePipeline
20
+
21
+
22
def videomama(pipeline, frames_np, mask_frames_np):
    """
    Run VideoMaMa inference on video frames with mask conditioning.

    Args:
        pipeline: VideoInferencePipeline instance
        frames_np: List of numpy arrays, [(H,W,3)]*n, uint8 RGB frames
        mask_frames_np: List of numpy arrays, [(H,W)]*n, uint8 grayscale masks

    Returns:
        output_frames: List of numpy arrays, [(H,W,3)]*n, uint8 RGB outputs
    """
    # numpy -> PIL; masks become single-channel 'L' images.
    rgb_images = [Image.fromarray(arr) for arr in frames_np]
    mask_images = [Image.fromarray(arr, mode='L') for arr in mask_frames_np]

    # The model operates at a fixed 1024x576 resolution.
    model_size = (1024, 576)
    frames_resized = [im.resize(model_size, Image.Resampling.BILINEAR)
                      for im in rgb_images]
    masks_resized = [im.resize(model_size, Image.Resampling.BILINEAR)
                     for im in mask_images]

    print(f"Running VideoMaMa inference on {len(frames_resized)} frames...")
    result_images = pipeline.run(
        cond_frames=frames_resized,
        mask_frames=masks_resized,
        seed=42,
        mask_cond_mode="vae"
    )

    # Scale the outputs back to the caller's resolution and return numpy.
    original_size = rgb_images[0].size
    outputs = [im.resize(original_size, Image.Resampling.BILINEAR)
               for im in result_images]

    return [np.array(im) for im in outputs]
63
+
64
+
65
def load_videomama_pipeline(base_model_path=None, unet_checkpoint_path=None, device="cuda"):
    """
    Load VideoMaMa pipeline with pretrained weights.

    Args:
        base_model_path: Path to SVD base model (if None, uses default)
        unet_checkpoint_path: Path to VideoMaMa UNet checkpoint (if None, uses default)
        device: Device to run on

    Returns:
        VideoInferencePipeline instance

    Raises:
        FileNotFoundError: if either model path is missing on disk.
    """
    # Fall back to the bundled checkpoint locations when not specified.
    if base_model_path is None:
        base_model_path = "checkpoints/stable-video-diffusion-img2vid-xt"
    if unet_checkpoint_path is None:
        unet_checkpoint_path = "checkpoints/videomama"

    # Validate both locations up front (base model first) so users get an
    # actionable error rather than a deep loader traceback.
    required = (
        (base_model_path,
         f"SVD base model not found at {base_model_path}. "
         f"Please ensure models are downloaded correctly."),
        (unet_checkpoint_path,
         f"VideoMaMa checkpoint not found at {unet_checkpoint_path}. "
         f"Please upload your VideoMaMa model to Hugging Face Hub and update the download logic."),
    )
    for model_path, error_message in required:
        if not os.path.exists(model_path):
            raise FileNotFoundError(error_message)

    print(f"Loading VideoMaMa pipeline...")
    print(f"  Base model: {base_model_path}")
    print(f"  UNet checkpoint: {unet_checkpoint_path}")

    pipeline = VideoInferencePipeline(
        base_model_path=base_model_path,
        unet_checkpoint_path=unet_checkpoint_path,
        weight_dtype=torch.float16,
        device=device
    )

    print("VideoMaMa pipeline loaded successfully!")

    return pipeline