MogensR commited on
Commit
7cb91e0
Β·
verified Β·
1 Parent(s): eee126e

Create video_pipeline.py

Browse files
Files changed (1) hide show
  1. video_pipeline.py +469 -0
video_pipeline.py ADDED
@@ -0,0 +1,469 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Video Processing Pipeline
4
+ Two-stage processing: SAM2+MatAnyone β†’ Transparent β†’ Composite
5
+ Includes temporal smoothing to eliminate jitter/shaking
6
+ """
7
+
8
+ import os
9
+ import time
10
+ import tempfile
11
+ import shutil
12
+ import gc
13
+ import logging
14
+ from pathlib import Path
15
+ import cv2
16
+ import numpy as np
17
+ from collections import deque
18
+ import torch
19
+ import streamlit as st
20
+
21
+ from models import (
22
+ load_sam2_predictor,
23
+ load_matanyone_processor,
24
+ torch_memory_manager,
25
+ get_memory_usage,
26
+ clear_model_cache
27
+ )
28
+
29
+ logger = logging.getLogger(__name__)
30
+
31
+ # Persistent temp dir
32
+ TMP_DIR = Path("tmp")
33
+ TMP_DIR.mkdir(parents=True, exist_ok=True)
34
+
35
+ # ============================================================================
36
+ # SAM2 Mask Generation
37
+ # ============================================================================
38
+
39
def generate_mask_from_video_first_frame(video_path, sam2_predictor):
    """
    Generate a segmentation mask for the first frame of *video_path* using SAM2.

    The mask seeds MatAnyone's temporal propagation, so it is returned at the
    ORIGINAL video resolution even when SAM2 inference runs on a downscaled
    copy of the frame.

    Args:
        video_path: Path to the input video file (any format OpenCV can read).
        sam2_predictor: Loaded SAM2 image predictor exposing ``set_image``
            and ``predict``.

    Returns:
        A uint8 numpy array (values 0/255) holding the best-scoring mask,
        or None if the frame could not be read or prediction failed.
    """
    try:
        with torch_memory_manager():
            cap = cv2.VideoCapture(video_path)
            ret, frame = cap.read()
            cap.release()

            if not ret:
                logger.error("Failed to read video frame")
                return None

            # Remember the native size so the mask can be restored to it.
            orig_h, orig_w = frame.shape[:2]

            # Downscale large frames before inference to save memory.
            max_size = 1080
            if max(orig_h, orig_w) > max_size:
                scale = max_size / max(orig_h, orig_w)
                new_w, new_h = int(orig_w * scale), int(orig_h * scale)
                frame = cv2.resize(frame, (new_w, new_h))
                logger.info(f"Resized frame from {orig_w}x{orig_h} to {new_w}x{new_h}")

            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

            # FIX: autocast was hard-coded to "cuda", which fails on CPU-only
            # hosts; select the device type that is actually available
            # (bfloat16 autocast is supported on CPU as well).
            device_type = "cuda" if torch.cuda.is_available() else "cpu"
            with torch.inference_mode(), torch.autocast(device_type, dtype=torch.bfloat16):
                sam2_predictor.set_image(frame_rgb)

                # Default prompt: a single positive click at the frame center.
                h, w = frame_rgb.shape[:2]
                center_point = np.array([[w // 2, h // 2]], dtype=np.float32)
                center_label = np.array([1], dtype=np.int32)

                masks, scores, logits = sam2_predictor.predict(
                    point_coords=center_point,
                    point_labels=center_label,
                    multimask_output=True
                )

            # Keep the highest-scoring candidate mask.
            best_mask = masks[np.argmax(scores)].astype(np.uint8) * 255

            # FIX: if inference ran on a downscaled frame, restore the mask to
            # the original resolution so it lines up with the full-size video
            # that MatAnyone will propagate over. Nearest-neighbour keeps the
            # mask binary.
            if best_mask.shape[:2] != (orig_h, orig_w):
                best_mask = cv2.resize(best_mask, (orig_w, orig_h),
                                       interpolation=cv2.INTER_NEAREST)
            return best_mask

    except Exception as e:
        logger.error(f"Failed to generate mask: {e}")
        return None
87
+
88
+ # ============================================================================
89
+ # TEMPORAL SMOOTHING - Fixes the shaking issue
90
+ # ============================================================================
91
+
92
def smooth_alpha_video(alpha_video_path, output_path, window_size=5):
    """
    Apply temporal smoothing to an alpha video to reduce jitter/shaking.

    Each output frame is the mean of the current frame and up to
    ``window_size - 1`` preceding frames (a trailing moving average), which
    suppresses the frame-to-frame instability that causes visible shaking.
    NOTE(review): a trailing window introduces a slight temporal lag of the
    matte relative to the subject — acceptable for small windows, but worth
    confirming if fast motion looks "late".

    Args:
        alpha_video_path: Path to MatAnyone's alpha output video.
        output_path: Path for the smoothed alpha video (written grayscale).
        window_size: Number of frames to average (default 5).
            - 3: Minimal smoothing, fastest
            - 5: Balanced (recommended)
            - 7: Maximum smoothing, may blur fast motion

    Returns:
        ``output_path`` on success; the ORIGINAL ``alpha_video_path`` if
        smoothing fails, so callers always get a usable video.
    """
    logger.info(f"🎬 Applying temporal smoothing to reduce jitter (window={window_size})")

    cap = None
    out = None
    try:
        cap = cv2.VideoCapture(alpha_video_path)
        # FIX: bail out early instead of writing a 0x0 video when the input
        # cannot be opened.
        if not cap.isOpened():
            logger.error(f"Temporal smoothing: cannot open {alpha_video_path}")
            return alpha_video_path

        fps = int(cap.get(cv2.CAP_PROP_FPS)) or 30
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(output_path, fourcc, fps, (width, height), isColor=False)

        # Rolling buffer for temporal averaging.
        frame_buffer = deque(maxlen=window_size)

        frame_count = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # Alpha should be single-channel; collapse BGR reads to gray.
            if len(frame.shape) == 3:
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

            # Accumulate in float32 to avoid uint8 rounding in the mean.
            frame_buffer.append(frame.astype(np.float32))

            # Average all frames currently in the window.
            smoothed = np.mean(frame_buffer, axis=0).astype(np.uint8)

            out.write(smoothed)
            frame_count += 1

            # Periodic memory cleanup for long videos.
            if frame_count % 30 == 0:
                gc.collect()

        logger.info(f"✅ Temporal smoothing complete: {frame_count} frames processed")
        return output_path

    except Exception as e:
        logger.error(f"Temporal smoothing failed: {e}")
        # Return original path if smoothing fails.
        return alpha_video_path
    finally:
        # FIX: release capture/writer even when an exception fires mid-loop
        # (the originals leaked on the error path).
        if cap is not None:
            cap.release()
        if out is not None:
            out.release()
158
+
159
+ # ============================================================================
160
+ # Transparent Video Creation
161
+ # ============================================================================
162
+
163
def create_transparent_mov(foreground_path, alpha_path, temp_dir):
    """
    Create a .mov file with an alpha channel from foreground + alpha videos.

    Uses the PNG fourcc so the writer keeps the 4th (alpha) plane.
    NOTE(review): OpenCV's ability to actually encode BGRA with 'png ' depends
    on the local build/codec — verify the output really carries alpha.

    Args:
        foreground_path: Path to the BGR foreground video.
        alpha_path: Path to the (grayscale) alpha video.
        temp_dir: ``Path`` of a working directory for the output file.

    Returns:
        Path to the created ``transparent.mov``, or None on failure.
    """
    fg_cap = None
    alpha_cap = None
    out = None
    try:
        output_path = str(temp_dir / "transparent.mov")

        # Read videos.
        fg_cap = cv2.VideoCapture(foreground_path)
        alpha_cap = cv2.VideoCapture(alpha_path)

        # Get video properties from the foreground stream.
        fps = int(fg_cap.get(cv2.CAP_PROP_FPS)) or 30
        width = int(fg_cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(fg_cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        # Use PNG codec for alpha channel support.
        fourcc = cv2.VideoWriter_fourcc(*'png ')
        out = cv2.VideoWriter(output_path, fourcc, fps, (width, height), True)

        frame_count = 0
        while True:
            ret_fg, fg_frame = fg_cap.read()
            ret_alpha, alpha_frame = alpha_cap.read()

            if not ret_fg or not ret_alpha:
                break

            # Collapse a 3-channel alpha read to a single plane.
            if len(alpha_frame.shape) == 3:
                alpha_frame = cv2.cvtColor(alpha_frame, cv2.COLOR_BGR2GRAY)

            # Guard against a resolution mismatch between the two streams.
            if alpha_frame.shape[:2] != fg_frame.shape[:2]:
                alpha_frame = cv2.resize(alpha_frame, (fg_frame.shape[1], fg_frame.shape[0]))

            # FIX: fg_frame is already BGR (OpenCV's native order), so stack
            # the alpha plane directly to get BGRA. The previous code placed
            # BGR data into slots labelled RGB and then ran RGBA2BGRA, which
            # swapped the red and blue channels in the output.
            bgra_frame = np.dstack((fg_frame, alpha_frame))
            out.write(bgra_frame)

            frame_count += 1
            if frame_count % 10 == 0:
                gc.collect()

        logger.info(f"Created transparent MOV: {frame_count} frames")
        return output_path if os.path.exists(output_path) else None

    except Exception as e:
        logger.error(f"Failed to create transparent MOV: {e}")
        return None
    finally:
        # FIX: release all handles even on the error path (originals leaked).
        if fg_cap is not None:
            fg_cap.release()
        if alpha_cap is not None:
            alpha_cap.release()
        if out is not None:
            out.release()
219
+
220
+ # ============================================================================
221
+ # STAGE 1: Create Transparent Video (with smoothing fix)
222
+ # ============================================================================
223
+
224
def stage1_create_transparent_video(input_file):
    """
    STAGE 1: Create a transparent video using SAM2 + MatAnyone.

    Pipeline:
        1. Generate a first-frame mask with SAM2.
        2. Process the video with MatAnyone (temporal propagation).
        3. Apply temporal smoothing to the alpha channel (fixes shaking).
        4. Create a transparent .mov file and copy it to a persistent path.

    Args:
        input_file: Uploaded video file object exposing ``getvalue()``
            (e.g. a Streamlit ``UploadedFile``).

    Returns:
        Path (str) to the persisted transparent .mov, or None on any failure.
        Errors are surfaced to the UI via ``st.error``/``st.warning``.
    """
    logger.info("🎬 Starting Stage 1: Create transparent video")

    # Pre-flight memory check — drop cached models if GPU memory is tight.
    memory_info = get_memory_usage()
    if memory_info.get('gpu_free', 0) < 2.0:
        st.warning("⚠️ Low GPU memory detected. Processing may be slower.")
        clear_model_cache()

    try:
        progress_bar = st.progress(0)
        status_text = st.empty()

        def update_progress(progress, message):
            # Clamp to [0, 1] — st.progress raises outside that range.
            progress = max(0, min(1, progress))
            progress_bar.progress(progress)
            gpu_mem = get_memory_usage().get('gpu_allocated', 0)
            status_text.text(f"Stage 1: {message} | GPU: {gpu_mem:.1f}GB")
            logger.info(f"Stage 1 [{progress:.0%}]: {message}")

        # Load models.
        update_progress(0.05, "Loading SAM2 model...")
        sam2_predictor = load_sam2_predictor()
        if sam2_predictor is None:
            st.error("❌ Failed to load SAM2 model")
            return None

        update_progress(0.1, "Loading MatAnyone model...")
        matanyone_processor = load_matanyone_processor()
        if matanyone_processor is None:
            st.error("❌ Failed to load MatAnyone model")
            return None

        # Process video inside a scratch directory that is auto-removed.
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_dir = Path(temp_dir)
            input_path = str(temp_dir / "input.mp4")

            # Save the uploaded bytes to disk for OpenCV/MatAnyone.
            with open(input_path, "wb") as f:
                f.write(input_file.getvalue())

            update_progress(0.2, "Generating first-frame segmentation mask...")

            # Generate seed mask using SAM2.
            with torch_memory_manager():
                mask = generate_mask_from_video_first_frame(input_path, sam2_predictor)

            if mask is None:
                st.error("❌ Failed to generate mask")
                return None

            mask_path = str(temp_dir / "mask.png")
            cv2.imwrite(mask_path, mask)
            logger.info(f"First-frame mask saved: {mask_path}")

            update_progress(0.4, "Running MatAnyone temporal propagation...")

            # Process with MatAnyone.
            try:
                with torch_memory_manager():
                    foreground_path, alpha_path = matanyone_processor.process_video(
                        input_path=input_path,
                        mask_path=mask_path,
                        output_path=str(temp_dir),
                        max_size=720  # Limit resolution for memory efficiency
                    )

                logger.info(f"MatAnyone complete - Foreground: {foreground_path}, Alpha: {alpha_path}")

                # Apply temporal smoothing to the alpha channel (anti-jitter).
                update_progress(0.6, "Applying temporal smoothing to eliminate jitter...")

                smoothed_alpha_path = str(temp_dir / "alpha_smoothed.mp4")
                alpha_path = smooth_alpha_video(alpha_path, smoothed_alpha_path, window_size=5)

                logger.info("✅ Temporal smoothing applied - shaking should be eliminated")

                update_progress(0.8, "Creating transparent .mov file...")

                # Create transparent video.
                transparent_path = create_transparent_mov(foreground_path, alpha_path, temp_dir)

                if transparent_path and os.path.exists(transparent_path):
                    # Copy out of the TemporaryDirectory before it is deleted.
                    persist_path = TMP_DIR / "transparent_video.mov"
                    shutil.copyfile(transparent_path, persist_path)

                    update_progress(1.0, "✅ Transparent video created successfully!")
                    time.sleep(0.5)
                    return str(persist_path)
                else:
                    st.error("❌ Failed to create transparent video")
                    return None

            except Exception as e:
                logger.error(f"MatAnyone processing failed: {e}", exc_info=True)
                st.error(f"❌ MatAnyone processing failed: {e}")
                return None

    except Exception as e:
        logger.error(f"Stage 1 error: {e}", exc_info=True)
        st.error(f"❌ Stage 1 failed: {e}")

        # Show memory info for debugging; best-effort only.
        # FIX: narrowed the bare `except:` (which also swallowed
        # KeyboardInterrupt/SystemExit) to `except Exception`.
        try:
            memory_info = get_memory_usage()
            st.info(f"Memory at failure - GPU: {memory_info.get('gpu_allocated', 0):.1f}GB, "
                    f"RAM: {memory_info.get('ram_used', 0):.1f}GB")
        except Exception:
            pass

        return None

    finally:
        logger.info("Stage 1 cleanup...")
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        gc.collect()
352
+
353
+ # ============================================================================
354
+ # STAGE 2: Composite with Background
355
+ # ============================================================================
356
+
357
def stage2_composite_background(transparent_video_path, background, bg_type):
    """
    STAGE 2: Composite the transparent video with a new background.

    Fast compositing that can be repeated with different backgrounds without
    re-running Stage 1.

    NOTE(review): OpenCV's VideoCapture commonly decodes to 3-channel BGR and
    drops alpha, in which case the full-opacity fallback below is taken —
    verify that 4-channel frames actually arrive from the .mov on this build.

    Args:
        transparent_video_path: Path to the Stage 1 transparent .mov.
        background: PIL image (when ``bg_type == "image"``) or None.
        bg_type: "image", "color" (hex in ``st.session_state.bg_color``),
            or anything else for the default green screen.

    Returns:
        Path (str) to the persisted composited mp4, or None on failure.
    """
    logger.info("🎬 Starting Stage 2: Composite with background")

    cap = None
    out = None
    try:
        progress_bar = st.progress(0)
        status_text = st.empty()

        def update_progress(progress, message):
            progress = max(0, min(1, progress))
            progress_bar.progress(progress)
            status_text.text(f"Stage 2: {message}")

        with tempfile.TemporaryDirectory() as temp_dir:
            temp_dir = Path(temp_dir)

            update_progress(0.2, "Loading transparent video...")

            # Read transparent video.
            cap = cv2.VideoCapture(transparent_video_path)
            fps = int(cap.get(cv2.CAP_PROP_FPS)) or 30
            width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
            height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

            update_progress(0.4, "Preparing background...")

            # Prepare a (height, width, 3) BGR background once, up front.
            if bg_type == "image" and background is not None:
                bg_array = np.array(background)
                # FIX: a grayscale background image previously reached the
                # 3-channel composite un-converted and broke broadcasting.
                if bg_array.ndim == 2:
                    bg_array = cv2.cvtColor(bg_array, cv2.COLOR_GRAY2BGR)
                elif bg_array.shape[2] == 3:
                    bg_array = cv2.cvtColor(bg_array, cv2.COLOR_RGB2BGR)
                elif bg_array.shape[2] == 4:
                    bg_array = cv2.cvtColor(bg_array, cv2.COLOR_RGBA2BGR)
                bg_resized = cv2.resize(bg_array, (width, height))
            elif bg_type == "color":
                # Parse hex color "#rrggbb" into a solid BGR fill.
                color_hex = st.session_state.bg_color.lstrip('#')
                r = int(color_hex[0:2], 16)
                g = int(color_hex[2:4], 16)
                b = int(color_hex[4:6], 16)
                bg_resized = np.full((height, width, 3), (b, g, r), dtype=np.uint8)
            else:
                # Default green screen.
                bg_resized = np.full((height, width, 3), (0, 255, 0), dtype=np.uint8)

            # Create output video.
            output_path = str(temp_dir / "final_output.mp4")
            fourcc = cv2.VideoWriter_fourcc(*'mp4v')
            out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

            update_progress(0.6, "Compositing frames...")

            frame_count = 0
            while True:
                ret, frame = cap.read()
                if not ret:
                    break

                # Extract the alpha channel (BGRA frames).
                # FIX: `frame.shape[2]` raised IndexError on 2-D grayscale
                # frames; check ndim first.
                if frame.ndim == 3 and frame.shape[2] == 4:
                    bgr_frame = frame[:, :, :3]
                    alpha_channel = frame[:, :, 3]
                else:
                    # Fallback: assume full opacity (also promotes grayscale
                    # frames to 3 channels so the blend below broadcasts).
                    if frame.ndim == 2:
                        frame = cv2.cvtColor(frame, cv2.COLOR_GRAY2BGR)
                    bgr_frame = frame
                    alpha_channel = np.full((height, width), 255, dtype=np.uint8)

                # Normalize alpha to 0-1 and give it a channel axis.
                alpha_norm = alpha_channel.astype(np.float32) / 255.0
                alpha_norm = np.expand_dims(alpha_norm, axis=2)

                # Composite: result = foreground * alpha + background * (1 - alpha)
                fg_float = bgr_frame.astype(np.float32)
                bg_float = bg_resized.astype(np.float32)

                result = fg_float * alpha_norm + bg_float * (1 - alpha_norm)
                result = result.astype(np.uint8)

                out.write(result)
                frame_count += 1

                # Update progress every few frames.
                if total_frames > 0 and frame_count % 5 == 0:
                    progress = 0.6 + 0.3 * (frame_count / total_frames)
                    update_progress(progress, f"Compositing frame {frame_count}/{total_frames}")

                if frame_count % 10 == 0:
                    gc.collect()

            cap.release()
            out.release()

            logger.info(f"Compositing complete: {frame_count} frames")

            if os.path.exists(output_path):
                # Copy out of the TemporaryDirectory before it is deleted.
                persist_path = TMP_DIR / "final_video.mp4"
                shutil.copyfile(output_path, persist_path)

                update_progress(1.0, "✅ Compositing complete!")
                time.sleep(0.5)
                return str(persist_path)
            else:
                return None

    except Exception as e:
        logger.error(f"Stage 2 error: {e}", exc_info=True)
        st.error(f"❌ Stage 2 failed: {e}")
        return None
    finally:
        # FIX: release handles even when an exception fires mid-loop
        # (release() on an already-released capture is a harmless no-op).
        if cap is not None:
            cap.release()
        if out is not None:
            out.release()