MogensR committed on
Commit
dd52427
·
verified ·
1 Parent(s): ec9ba45

Delete pipeline/integrated_pipeline.py

Browse files
Files changed (1) hide show
  1. pipeline/integrated_pipeline.py +0 -586
pipeline/integrated_pipeline.py DELETED
@@ -1,586 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- integrated_pipeline.py - Two-stage pipeline with proper GPU management and error handling
4
- """
5
- import os
6
- import sys
7
- import gc
8
- import json
9
- import subprocess
10
- import tempfile
11
- from pathlib import Path
12
- from typing import Dict, Any, Optional, Tuple
13
- import numpy as np
14
- import cv2
15
- import torch
16
- import logging
17
- import shutil
18
- import traceback
19
- from concurrent.futures import ThreadPoolExecutor, TimeoutError
20
-
21
- # --- Project Setup ---
22
- current_dir = Path(__file__).parent
23
- parent_dir = current_dir.parent
24
- sys.path.append(str(parent_dir))
25
-
26
- # --- Logging Configuration ---
27
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
28
- logger = logging.getLogger(__name__)
29
-
30
# --- TwoStageProcessor Class ---
class TwoStageProcessor:
    """Two-stage video matting pipeline: SAM2 mask generation (stage 1),
    then optional MatAnyone refinement and background compositing (stage 2)."""

    def __init__(self, temp_dir: Optional[str] = None):
        # Working directory for intermediate artifacts; a fresh system temp
        # dir is allocated when the caller does not supply one.
        if temp_dir:
            self.temp_dir = Path(temp_dir)
        else:
            self.temp_dir = Path(tempfile.mkdtemp())
        self.temp_dir.mkdir(exist_ok=True)
        logger.info(f"Initialized temp_dir: {self.temp_dir}")

        # Stage 1 outputs consumed by stage 2.
        self.masks_path = self.temp_dir / "masks.mkv"      # lossless FFV1 mask stream
        self.metadata_path = self.temp_dir / "meta.json"   # fps/size/frame-count info

        # Pin CUDA device 0 as the default when a GPU is present.
        if torch.cuda.is_available():
            torch.cuda.set_device(0)
            logger.info(f"GPU set as default device: {torch.cuda.get_device_name(0)}")
        else:
            logger.warning("CUDA not available, using CPU")
    def process_video(self, input_video: str, background_video: str,
                      click_points: list, output_path: str,
                      use_matanyone: bool = True, progress_callback=None) -> bool:
        """Run the full two-stage pipeline and write the composited video.

        Args:
            input_video: Path to the foreground source video.
            background_video: Path to a replacement background video or image
                (stage 2 falls back to a green screen when it is missing).
            click_points: Seed points for SAM2; presumably normalized (x, y)
                in [0, 1] since stage 1 multiplies them by width/height.
            output_path: Destination path for the final composite.
            use_matanyone: When True, stage 2 refines SAM2 masks with MatAnyone.
            progress_callback: Optional callable(progress: float, message: str).

        Returns:
            True on success, False on any failure (errors are logged, never raised).
        """
        try:
            logger.info("="*60)
            logger.info("STARTING TWO-STAGE PROCESSING")
            logger.info("="*60)

            # Refuse to start without headroom for the intermediate mask stream.
            free_gb = shutil.disk_usage(self.temp_dir).free/1e9
            logger.info(f"Disk free: {free_gb:.2f}GB")

            if free_gb < 5.0:
                logger.error("Insufficient disk space (need at least 5GB)")
                return False

            # Stage 1: Generate masks (progress range 0.05-0.45).
            logger.info("="*60)
            logger.info("STAGE 1: MASK GENERATION")
            logger.info("="*60)

            if progress_callback:
                progress_callback(0.05, "Stage 1: Starting SAM2...")

            stage1_success = self._stage1_generate_masks(input_video, click_points, progress_callback)

            if not stage1_success:
                logger.error("STAGE 1 FAILED - Aborting")
                return False

            logger.info("STAGE 1 COMPLETED SUCCESSFULLY")

            # Verify stage 1 outputs exist before committing to stage 2.
            if not self.masks_path.exists():
                logger.error(f"Masks file not found: {self.masks_path}")
                return False

            if not self.metadata_path.exists():
                logger.error(f"Metadata file not found: {self.metadata_path}")
                return False

            masks_size = self.masks_path.stat().st_size / 1e6
            logger.info(f"Masks file size: {masks_size:.2f}MB")

            # Force GPU cleanup so stage 2 starts with maximum free VRAM.
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
                torch.cuda.synchronize()
                logger.info(f"GPU memory after Stage 1: {torch.cuda.memory_allocated()/1e9:.2f}GB")

            gc.collect()

            # Stage 2: Process and composite (progress range 0.5-1.0).
            logger.info("="*60)
            logger.info("STAGE 2: MATTING & COMPOSITING")
            logger.info("="*60)

            if progress_callback:
                progress_callback(0.5, "Stage 2: Starting MatAnyone...")

            stage2_success = self._stage2_composite(
                input_video, background_video,
                output_path, use_matanyone, progress_callback
            )

            if not stage2_success:
                logger.error("STAGE 2 FAILED")
                return False

            logger.info("STAGE 2 COMPLETED SUCCESSFULLY")
            logger.info("="*60)
            logger.info("TWO-STAGE PROCESSING COMPLETE")
            logger.info("="*60)

            return True

        except Exception as e:
            # Top-level boundary: convert any exception to a logged False.
            logger.error(f"Two-stage processing exception: {str(e)}")
            logger.error(traceback.format_exc())
            return False
- def _stage1_generate_masks(self, input_video: str, click_points: list,
131
- progress_callback=None) -> bool:
132
- """Stage 1: SAM2 mask generation"""
133
- predictor = None
134
- inference_state = None
135
- ffmpeg_process = None
136
-
137
- try:
138
- logger.info("Loading SAM2...")
139
-
140
- # Use the SAM2Predictor wrapper
141
- from models.sam2_loader import SAM2Predictor
142
-
143
- # Force GPU device
144
- if torch.cuda.is_available():
145
- device = torch.device("cuda:0")
146
- torch.cuda.set_device(0)
147
- else:
148
- device = torch.device("cpu")
149
-
150
- predictor = SAM2Predictor(device=device, model_size="large")
151
-
152
- if torch.cuda.is_available():
153
- logger.info(f"SAM2 loaded on GPU: {torch.cuda.get_device_name(0)}")
154
- logger.info(f"GPU Memory: {torch.cuda.memory_allocated()/1e9:.2f}GB")
155
- else:
156
- logger.info("SAM2 loaded on CPU")
157
-
158
- # Get video info
159
- cap = cv2.VideoCapture(input_video)
160
- if not cap.isOpened():
161
- raise RuntimeError(f"Cannot open video: {input_video}")
162
-
163
- fps = cap.get(cv2.CAP_PROP_FPS)
164
- frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
165
- width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
166
- height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
167
- cap.release()
168
-
169
- logger.info(f"Video: {width}x{height}, {frame_count} frames @ {fps:.2f}fps")
170
-
171
- # Save metadata
172
- metadata = {
173
- "fps": fps,
174
- "frame_count": frame_count,
175
- "width": width,
176
- "height": height,
177
- "click_points": click_points
178
- }
179
- with open(self.metadata_path, 'w') as f:
180
- json.dump(metadata, f, indent=2)
181
- logger.info(f"Metadata saved")
182
-
183
- # Initialize inference
184
- logger.info("Initializing SAM2 inference state...")
185
- inference_state = predictor.init_state(video_path=input_video)
186
- logger.info("Inference state initialized")
187
-
188
- # Add prompts
189
- logger.info(f"Adding {len(click_points)} prompts...")
190
- for i, point in enumerate(click_points):
191
- x, y = point
192
- points = np.array([[x * width, y * height]], dtype=np.float32)
193
- labels = np.array([1], dtype=np.int32)
194
- predictor.add_new_points_or_box(
195
- inference_state=inference_state,
196
- frame_idx=0,
197
- obj_id=i,
198
- points=points,
199
- labels=labels,
200
- )
201
- logger.info("Prompts added")
202
-
203
- # Setup FFmpeg for lossless encoding
204
- ffmpeg_cmd = [
205
- 'ffmpeg', '-y', '-f', 'rawvideo',
206
- '-pix_fmt', 'gray', '-s', f'{width}x{height}',
207
- '-r', str(fps), '-i', '-',
208
- '-c:v', 'ffv1', '-level', '3', '-pix_fmt', 'gray',
209
- str(self.masks_path)
210
- ]
211
-
212
- logger.info("Starting FFmpeg...")
213
- ffmpeg_process = subprocess.Popen(
214
- ffmpeg_cmd,
215
- stdin=subprocess.PIPE,
216
- stderr=subprocess.PIPE,
217
- stdout=subprocess.PIPE
218
- )
219
- logger.info("FFmpeg started")
220
-
221
- # Generate and stream masks
222
- logger.info(f"Propagating masks through {frame_count} frames...")
223
- frame_idx = 0
224
-
225
- for out_frame_idx, out_obj_ids, out_mask_logits in predictor.propagate_in_video(inference_state):
226
- # Progress update
227
- if progress_callback and (out_frame_idx % 30 == 0 or out_frame_idx == frame_count - 1):
228
- progress = 0.05 + (out_frame_idx + 1) / frame_count * 0.4 # 5% to 45%
229
- progress_callback(progress, f"SAM2: Frame {out_frame_idx + 1}/{frame_count}")
230
-
231
- # Combine masks
232
- combined_mask = np.zeros((height, width), dtype=np.uint8)
233
-
234
- if isinstance(out_obj_ids, torch.Tensor):
235
- obj_ids = out_obj_ids.cpu().numpy()
236
- else:
237
- obj_ids = out_obj_ids
238
-
239
- for i, obj_id in enumerate(obj_ids):
240
- if i < len(out_mask_logits):
241
- mask = (out_mask_logits[i] > 0.0)
242
- if isinstance(mask, torch.Tensor):
243
- mask = mask.cpu().numpy()
244
- mask = mask.squeeze().astype(np.uint8) * 255
245
- combined_mask = np.maximum(combined_mask, mask)
246
-
247
- # Write to FFmpeg
248
- try:
249
- ffmpeg_process.stdin.write(combined_mask.tobytes())
250
- except BrokenPipeError:
251
- logger.error("FFmpeg pipe broken")
252
- return False
253
-
254
- frame_idx = out_frame_idx
255
-
256
- # Memory management every 50 frames
257
- if (out_frame_idx + 1) % 50 == 0:
258
- if torch.cuda.is_available():
259
- torch.cuda.empty_cache()
260
- gc.collect()
261
-
262
- logger.info(f"Processed {frame_idx + 1} frames")
263
-
264
- # Close FFmpeg
265
- logger.info("Finalizing FFmpeg...")
266
- ffmpeg_process.stdin.close()
267
-
268
- # Wait for FFmpeg to finish (increased timeout)
269
- try:
270
- ffmpeg_process.wait(timeout=300) # 5 minutes timeout
271
- except subprocess.TimeoutExpired:
272
- logger.error("FFmpeg timeout after 5 minutes")
273
- ffmpeg_process.kill()
274
- return False
275
-
276
- if ffmpeg_process.returncode != 0:
277
- error = ffmpeg_process.stderr.read().decode()
278
- logger.error(f"FFmpeg failed: {error}")
279
- return False
280
-
281
- logger.info("FFmpeg completed successfully")
282
-
283
- # Verify output
284
- if not self.masks_path.exists():
285
- logger.error("Masks file was not created")
286
- return False
287
-
288
- return True
289
-
290
- except Exception as e:
291
- logger.error(f"Stage 1 exception: {str(e)}")
292
- logger.error(traceback.format_exc())
293
- return False
294
-
295
- finally:
296
- # CRITICAL: Complete cleanup
297
- logger.info("Cleaning up Stage 1...")
298
-
299
- if ffmpeg_process is not None:
300
- try:
301
- ffmpeg_process.kill()
302
- except:
303
- pass
304
-
305
- if predictor is not None:
306
- del predictor
307
- if inference_state is not None:
308
- del inference_state
309
-
310
- if torch.cuda.is_available():
311
- torch.cuda.empty_cache()
312
- torch.cuda.synchronize()
313
- logger.info(f"GPU memory after cleanup: {torch.cuda.memory_allocated()/1e9:.2f}GB")
314
-
315
- gc.collect()
316
- logger.info("Stage 1 cleanup complete")
317
-
318
- def _stage2_composite(self, input_video: str, background_video: str,
319
- output_path: str, use_matanyone: bool, progress_callback=None) -> bool:
320
- """Stage 2: Read masks, refine, and composite"""
321
- try:
322
- # Load metadata
323
- with open(self.metadata_path, 'r') as f:
324
- metadata = json.load(f)
325
- logger.info(f"Metadata loaded")
326
-
327
- frame_count = metadata["frame_count"]
328
-
329
- # Read masks
330
- if progress_callback:
331
- progress_callback(0.5, "Reading masks...")
332
-
333
- logger.info("Reading mask stream...")
334
- masks = self._read_mask_stream()
335
-
336
- if masks is None or len(masks) == 0:
337
- logger.error("Failed to read masks")
338
- return False
339
-
340
- logger.info(f"Read {len(masks)} masks")
341
-
342
- # MatAnyone refinement
343
- if use_matanyone:
344
- if progress_callback:
345
- progress_callback(0.6, "Refining with MatAnyone...")
346
-
347
- logger.info("Starting MatAnyone refinement...")
348
- refined_masks = self._refine_with_matanyone(input_video, masks, progress_callback)
349
-
350
- if refined_masks is not None and len(refined_masks) > 0:
351
- masks = refined_masks
352
- logger.info(f"Using {len(refined_masks)} refined masks")
353
- else:
354
- logger.warning("MatAnyone failed, using SAM2 masks")
355
-
356
- # Final composition
357
- if progress_callback:
358
- progress_callback(0.8, "Compositing final video...")
359
-
360
- logger.info("Starting final composition...")
361
- return self._composite_final_video(
362
- input_video, background_video,
363
- masks, output_path, metadata, progress_callback
364
- )
365
-
366
- except Exception as e:
367
- logger.error(f"Stage 2 exception: {str(e)}")
368
- logger.error(traceback.format_exc())
369
- return False
370
-
371
- def _read_mask_stream(self) -> Optional[list]:
372
- """Read masks from FFV1 stream"""
373
- try:
374
- with open(self.metadata_path, 'r') as f:
375
- metadata = json.load(f)
376
-
377
- width = metadata["width"]
378
- height = metadata["height"]
379
- frame_count = metadata["frame_count"]
380
-
381
- logger.info(f"Reading {frame_count} masks ({width}x{height})...")
382
-
383
- # FFmpeg decode
384
- ffmpeg_cmd = [
385
- 'ffmpeg', '-i', str(self.masks_path),
386
- '-f', 'rawvideo', '-pix_fmt', 'gray', '-'
387
- ]
388
-
389
- process = subprocess.Popen(
390
- ffmpeg_cmd,
391
- stdout=subprocess.PIPE,
392
- stderr=subprocess.PIPE
393
- )
394
-
395
- masks = []
396
- frame_size = width * height
397
-
398
- for frame_idx in range(frame_count):
399
- frame_data = process.stdout.read(frame_size)
400
-
401
- if len(frame_data) != frame_size:
402
- logger.error(f"Unexpected frame size at {frame_idx}: {len(frame_data)} vs {frame_size}")
403
- break
404
-
405
- mask = np.frombuffer(frame_data, dtype=np.uint8).reshape((height, width))
406
- masks.append(mask)
407
-
408
- process.stdout.close()
409
- process.wait(timeout=60)
410
-
411
- if process.returncode != 0:
412
- error = process.stderr.read().decode()
413
- logger.error(f"FFmpeg decode error: {error}")
414
- return None
415
-
416
- logger.info(f"Successfully read {len(masks)} masks")
417
- return masks
418
-
419
- except Exception as e:
420
- logger.error(f"Mask reading exception: {str(e)}")
421
- logger.error(traceback.format_exc())
422
- return None
423
-
424
    def _refine_with_matanyone(self, input_video: str, masks: list, progress_callback=None) -> Optional[list]:
        """Refine SAM2 masks with MatAnyone, seeded by the first SAM2 mask.

        Only masks[0] is given to MatAnyone as the seed; MatAnyone then
        re-propagates the matte through the whole video itself.

        Returns:
            List of refined alpha masks (uint8, grayscale), or None on any
            failure so the caller can fall back to the SAM2 masks.
        """
        try:
            from models.matanyone_loader import MatAnyoneSession
            logger.info("Loading MatAnyone...")

            # Dedicated scratch directory under the pipeline temp dir.
            matanyone_temp = self.temp_dir / "matanyone"
            matanyone_temp.mkdir(exist_ok=True)

            # Persist the first SAM2 mask as the seed image.
            first_mask_path = matanyone_temp / "first_mask.png"
            cv2.imwrite(str(first_mask_path), masks[0])

            # Initialize on GPU when available.
            if torch.cuda.is_available():
                device = "cuda"
                torch.cuda.set_device(0)
            else:
                device = "cpu"

            session = MatAnyoneSession(device=device)

            if torch.cuda.is_available():
                logger.info(f"MatAnyone on GPU, Memory: {torch.cuda.memory_allocated()/1e9:.2f}GB")

            # Run MatAnyone; it writes an alpha video (and a foreground video,
            # unused here) into matanyone_temp.
            alpha_path, fg_path = session.process_stream(
                video_path=Path(input_video),
                seed_mask_path=first_mask_path,
                out_dir=matanyone_temp,
                progress_cb=progress_callback
            )

            if not alpha_path or not alpha_path.exists():
                logger.warning("MatAnyone produced no output")
                return None

            # Read the alpha video back as per-frame grayscale masks.
            # NOTE(review): frame count is not checked against len(masks) —
            # the compositor simply uses however many frames came back.
            refined_masks = []
            cap = cv2.VideoCapture(str(alpha_path))

            while True:
                ret, frame = cap.read()
                if not ret:
                    break
                alpha_mask = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
                refined_masks.append(alpha_mask)

            cap.release()

            # Release GPU memory held by the session before returning.
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            gc.collect()

            logger.info(f"MatAnyone produced {len(refined_masks)} refined masks")
            return refined_masks if len(refined_masks) > 0 else None

        except Exception as e:
            # Warning (not error): refinement is best-effort; caller falls back.
            logger.warning(f"MatAnyone exception: {str(e)}")
            logger.warning(traceback.format_exc())
            return None
- def _composite_final_video(self, input_video: str, background_video: str,
488
- masks: list, output_path: str, metadata: Dict[str, Any],
489
- progress_callback=None) -> bool:
490
- """Create final composite"""
491
- try:
492
- fg_cap = cv2.VideoCapture(input_video)
493
-
494
- fps = metadata["fps"]
495
- width = metadata["width"]
496
- height = metadata["height"]
497
-
498
- # Handle background
499
- if background_video and os.path.exists(background_video):
500
- if background_video.lower().endswith(('.png', '.jpg', '.jpeg')):
501
- bg_image = cv2.imread(background_video)
502
- bg_image = cv2.resize(bg_image, (width, height))
503
- bg_cap = None
504
- else:
505
- bg_cap = cv2.VideoCapture(background_video)
506
- else:
507
- bg_image = np.full((height, width, 3), (0, 255, 0), dtype=np.uint8)
508
- bg_cap = None
509
-
510
- # Output writer
511
- fourcc = cv2.VideoWriter_fourcc(*'mp4v')
512
- out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
513
-
514
- frame_idx = 0
515
- total_frames = len(masks)
516
-
517
- logger.info(f"Compositing {total_frames} frames...")
518
-
519
- while frame_idx < total_frames:
520
- ret_fg, fg_frame = fg_cap.read()
521
- if not ret_fg:
522
- logger.error(f"Failed to read foreground frame {frame_idx}")
523
- break
524
-
525
- # Background
526
- if bg_cap is not None:
527
- ret_bg, bg_frame = bg_cap.read()
528
- if not ret_bg:
529
- bg_cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
530
- ret_bg, bg_frame = bg_cap.read()
531
- if ret_bg:
532
- bg_frame = cv2.resize(bg_frame, (width, height))
533
- else:
534
- bg_frame = bg_image
535
- else:
536
- bg_frame = bg_image
537
-
538
- # Composite
539
- mask = masks[frame_idx]
540
- mask_norm = mask.astype(np.float32) / 255.0
541
- mask_3ch = np.stack([mask_norm, mask_norm, mask_norm], axis=-1)
542
-
543
- composite = (fg_frame * mask_3ch + bg_frame * (1 - mask_3ch)).astype(np.uint8)
544
- out.write(composite)
545
-
546
- frame_idx += 1
547
-
548
- if progress_callback and frame_idx % 30 == 0:
549
- progress = 0.8 + (frame_idx / total_frames) * 0.2
550
- progress_callback(progress, f"Compositing: {frame_idx}/{total_frames}")
551
-
552
- fg_cap.release()
553
- if bg_cap is not None:
554
- bg_cap.release()
555
- out.release()
556
-
557
- logger.info(f"Composite complete: {output_path}")
558
- return True
559
-
560
- except Exception as e:
561
- logger.error(f"Compositing exception: {str(e)}")
562
- logger.error(traceback.format_exc())
563
- return False
564
-
565
- def cleanup(self):
566
- """Clean up temp files"""
567
- try:
568
- if self.temp_dir.exists():
569
- shutil.rmtree(self.temp_dir)
570
- logger.info("Temp cleaned")
571
- except Exception as e:
572
- logger.error(f"Cleanup error: {str(e)}")
573
-
574
# --- Compatibility Wrapper ---
def process_video_two_stage(input_video: str, background_video: str,
                            click_points: list, output_path: str,
                            use_matanyone: bool = True, progress_callback=None) -> bool:
    """Drop-in replacement: run the two-stage pipeline with a throwaway
    TwoStageProcessor, always removing its temp files afterwards."""
    worker = TwoStageProcessor()
    try:
        result = worker.process_video(
            input_video,
            background_video,
            click_points,
            output_path,
            use_matanyone,
            progress_callback,
        )
        return result
    finally:
        worker.cleanup()