MogensR commited on
Commit
3723e02
Β·
1 Parent(s): 39f0b54
Files changed (1) hide show
  1. pipeline.py +415 -568
pipeline.py CHANGED
@@ -1,256 +1,245 @@
1
  #!/usr/bin/env python3
2
  """
3
- pipeline.py - Core video background replacement processing
4
- Direct SAM2 + MatAnyone implementation with zero abstraction layers
5
- Processes video frame-by-frame: person detection -> mask generation -> background replacement
 
 
 
 
 
 
 
 
 
 
6
  """
7
 
8
  import os
 
9
  import cv2
10
  import time
11
  import uuid
 
 
12
  import shutil
 
13
  import tempfile
14
  import subprocess
 
15
  import numpy as np
16
  from PIL import Image
17
- import logging
18
- import gc
19
  from pathlib import Path
20
  from typing import Optional, Tuple, Dict, Any, Callable
 
21
 
22
- logger = logging.getLogger(__name__)
 
 
 
 
 
 
 
 
23
 
24
- # ================================================================================================
25
- # VERSION VALIDATION AND LAZY IMPORTS
26
- # ================================================================================================
 
 
 
 
 
 
 
 
27
 
28
- def validate_pytorch_environment():
29
- """Validate PyTorch installation and versions before loading heavy models"""
30
  try:
31
- import torch
32
- torch_version = torch.__version__
33
- logger.info(f"PyTorch version: {torch_version}")
34
-
35
- if not torch.cuda.is_available():
36
- raise RuntimeError("CUDA not available - GPU processing required")
37
-
38
- cuda_version = torch.version.cuda
39
- cudnn_version = torch.backends.cudnn.version()
40
- logger.info(f"CUDA version: {cuda_version}")
41
- logger.info(f"cuDNN version: {cudnn_version}")
42
-
43
- # Test basic CUDA operations
44
  try:
45
- device = torch.device('cuda')
46
- test_tensor = torch.randn(100, 100).to(device)
47
- result = torch.mm(test_tensor, test_tensor.t())
48
- logger.info("CUDA basic operations test: PASSED")
49
- except Exception as cuda_test_error:
50
- logger.error(f"CUDA operations test FAILED: {cuda_test_error}")
51
- raise RuntimeError(f"CUDA incompatibility detected: {cuda_test_error}")
52
-
53
- # Version compatibility warnings
54
- torch_major = int(torch_version.split('.')[0])
55
- torch_minor = int(torch_version.split('.')[1])
56
-
57
- if torch_major == 2 and torch_minor >= 8:
58
- logger.warning(f"PyTorch {torch_version} is very new - may have compatibility issues")
59
-
60
- if torch_major < 2 or (torch_major == 2 and torch_minor < 3):
61
- raise RuntimeError(f"PyTorch {torch_version} too old for SAM2. Need >= 2.3.0")
62
-
63
- # GPU memory and capabilities
64
- total_memory = torch.cuda.get_device_properties(0).total_memory / (1024**3)
65
- gpu_name = torch.cuda.get_device_name(0)
66
- compute_capability = torch.cuda.get_device_capability(0)
67
-
68
- logger.info(f"GPU: {gpu_name}")
69
- logger.info(f"Compute capability: {compute_capability}")
70
- logger.info(f"Total GPU memory: {total_memory:.1f}GB")
71
-
72
- if total_memory < 8.0:
73
- logger.warning(f"Low GPU memory: {total_memory:.1f}GB. May fail on large videos.")
74
-
75
- return device, torch_version
76
-
77
- except Exception as e:
78
- logger.error(f"PyTorch environment validation failed: {e}")
79
- raise
80
 
81
- def lazy_import_sam2():
82
- """Lazy import SAM2 with error handling"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  try:
84
- logger.info("Lazy importing SAM2...")
85
-
86
- import torch
87
- if torch.cuda.is_available():
88
- torch.cuda.empty_cache()
89
- gc.collect()
90
-
91
- from sam2.build_sam import build_sam2_video_predictor
92
- logger.info("SAM2 imported successfully")
93
- return build_sam2_video_predictor
94
-
95
- except ImportError as e:
96
- logger.error(f"SAM2 import failed: {e}")
97
- raise RuntimeError(f"SAM2 not available: {e}")
98
  except Exception as e:
99
- logger.error(f"Unexpected error importing SAM2: {e}")
100
  raise
101
 
102
- def lazy_import_matanyone():
103
- """Lazy import MatAnyone with graceful fallback"""
 
 
104
  try:
105
- logger.info("Attempting MatAnyone import...")
106
- from models.matanyone_loader import MatAnyoneLoader
107
- logger.info("MatAnyone imported successfully")
108
- return MatAnyoneLoader
 
 
 
 
 
 
 
 
 
 
109
  except Exception as e:
110
- logger.warning(f"MatAnyone not available: {e}")
111
  return None
112
 
113
- # ================================================================================================
114
- # MEMORY MANAGEMENT UTILITIES
115
- # ================================================================================================
116
-
117
- def clear_gpu_memory():
118
- """Clear GPU memory cache"""
 
 
119
  try:
120
- import torch
121
- if torch.cuda.is_available():
122
- torch.cuda.empty_cache()
123
- torch.cuda.synchronize()
124
- gc.collect()
 
 
125
  except Exception as e:
126
- logger.warning(f"GPU memory clear failed: {e}")
127
-
128
- def log_memory_usage(stage: str):
129
- """Log current memory usage"""
130
- try:
131
- import torch
132
- if torch.cuda.is_available():
133
- allocated = torch.cuda.memory_allocated() / (1024**3)
134
- reserved = torch.cuda.memory_reserved() / (1024**3)
135
- logger.info(f"{stage} - GPU Memory: {allocated:.2f}GB allocated, {reserved:.2f}GB reserved")
136
- except Exception:
137
- pass
138
 
139
- # ================================================================================================
140
- # SAFE FILE OPERATIONS
141
- # ================================================================================================
 
 
 
 
 
142
 
143
- def safe_tmp_path(base_dir: str, extension: str) -> Path:
144
- """Generate safe temporary file path"""
145
- timestamp = int(time.time())
146
- random_id = uuid.uuid4().hex[:8]
147
- filename = f"tmp_{timestamp}_{random_id}{extension}"
148
- return Path(base_dir) / filename
 
 
 
 
 
 
 
 
 
149
 
150
- def safe_video_writer(output_path: Path, fourcc_str: str, fps: float, size: Tuple[int, int]):
151
- """Create video writer with error handling"""
 
 
 
 
 
 
 
 
 
 
152
  try:
153
- fourcc = cv2.VideoWriter_fourcc(*fourcc_str)
154
- writer = cv2.VideoWriter(str(output_path), fourcc, fps, size)
155
-
156
- if not writer.isOpened():
157
- raise RuntimeError(f"Failed to open video writer: {output_path}")
158
-
159
- return writer
160
- except Exception as e:
161
- logger.error(f"Video writer creation failed: {e}")
162
- raise
163
-
164
- # ================================================================================================
165
- # CHECKPOINT DOWNLOAD
166
- # ================================================================================================
167
-
168
- def download_sam2_checkpoint(checkpoint_path: str, work_dir: str = None, timeout_seconds: int = 600):
169
- """Download SAM2 checkpoint with timeout protection"""
170
- checkpoint_file = Path(checkpoint_path)
171
-
172
- if checkpoint_file.exists():
173
- logger.info(f"SAM2 checkpoint already exists: {checkpoint_file}")
174
  return True
175
-
176
- try:
177
- logger.info("SAM2 checkpoint not found, downloading...")
178
-
179
- checkpoint_file.parent.mkdir(parents=True, exist_ok=True)
180
-
181
- import requests
182
-
183
- checkpoint_url = "https://dl.fbaipublicfiles.com/segment_anything_2/072824/sam2_hiera_large.pt"
184
-
185
- logger.info(f"Downloading from: {checkpoint_url}")
186
- logger.info(f"Target: {checkpoint_file}")
187
-
188
- start_time = time.time()
189
- response = requests.get(checkpoint_url, stream=True, timeout=30)
190
- response.raise_for_status()
191
-
192
- total_size = int(response.headers.get('content-length', 0))
193
- logger.info(f"File size: {total_size / (1024**2):.1f}MB")
194
-
195
- # Download to temporary file first
196
- work_path = Path(work_dir) if work_dir else checkpoint_file.parent
197
- temp_download = safe_tmp_path(str(work_path), ".pt.download")
198
-
199
- downloaded = 0
200
- last_log_time = start_time
201
-
202
- try:
203
- with open(temp_download, 'wb') as f:
204
- for chunk in response.iter_content(chunk_size=1024*1024):
205
- if chunk:
206
- f.write(chunk)
207
- downloaded += len(chunk)
208
-
209
- current_time = time.time()
210
- elapsed = current_time - start_time
211
-
212
- # Timeout check
213
- if elapsed > timeout_seconds:
214
- raise TimeoutError(f"Download timeout after {elapsed:.1f}s")
215
-
216
- # Progress logging every 15 seconds
217
- if current_time - last_log_time > 15:
218
- progress = (downloaded / total_size * 100) if total_size > 0 else 0
219
- speed = downloaded / elapsed / (1024**2) # MB/s
220
- logger.info(f"Download: {progress:.1f}% ({speed:.1f}MB/s)")
221
- last_log_time = current_time
222
-
223
- # Verify download
224
- if total_size > 0 and downloaded != total_size:
225
- raise RuntimeError(f"Incomplete download: {downloaded}/{total_size} bytes")
226
-
227
- # Move to final location
228
- temp_download.replace(checkpoint_file)
229
-
230
- total_time = time.time() - start_time
231
- speed = downloaded / total_time / (1024**2)
232
- logger.info(f"Download complete: {downloaded / (1024**2):.1f}MB in {total_time:.1f}s ({speed:.1f}MB/s)")
233
-
234
- return True
235
-
236
- except Exception as download_error:
237
- if temp_download.exists():
238
- temp_download.unlink()
239
- raise download_error
240
-
241
  except Exception as e:
242
- logger.error(f"Download failed: {e}")
243
- if checkpoint_file.exists():
244
- try:
245
- checkpoint_file.unlink()
246
- except Exception:
247
- pass
248
  return False
249
 
250
- # ================================================================================================
251
- # MAIN PROCESSING FUNCTION
252
- # ================================================================================================
253
-
254
  def process(
255
  video_path: str,
256
  background_image: Optional[Image.Image] = None,
@@ -260,371 +249,229 @@ def process(
260
  progress_callback: Optional[Callable[[str, float], None]] = None
261
  ) -> str:
262
  """
263
- Process video with background replacement using SAM2 + MatAnyone
264
-
265
- Args:
266
- video_path: Path to input video
267
- background_image: PIL Image for background (if background_type is custom)
268
- background_type: Type of background ("custom", "gradient", "solid", etc.)
269
- background_prompt: Prompt for background generation
270
- job_directory: Directory for processing files
271
- progress_callback: Optional callback for progress updates
272
-
273
- Returns:
274
- Path to processed video file
275
  """
276
-
277
- def log_progress(step: str, progress: float = None):
278
- if progress is not None:
279
- logger.info(f"Progress {progress:.1%}: {step}")
 
 
 
 
 
 
 
280
  else:
281
- logger.info(f"Step: {step}")
282
  if progress_callback:
283
  try:
284
- progress_callback(step, progress)
285
  except Exception as e:
286
- logger.warning(f"Progress callback error: {e}")
287
-
288
- # Set up job directory
 
 
 
 
 
289
  if job_directory is None:
290
  job_directory = Path.cwd() / "tmp" / f"job_{uuid.uuid4().hex[:8]}"
291
-
292
  job_directory.mkdir(parents=True, exist_ok=True)
293
- logger.info(f"Processing in job directory: {job_directory}")
294
-
295
- start_time = time.time()
296
-
297
- try:
298
- # ============================================================================================
299
- # STAGE 1: ENVIRONMENT VALIDATION
300
- # ============================================================================================
301
-
302
- log_progress("Validating PyTorch environment", 0.02)
303
- device, torch_version = validate_pytorch_environment()
304
- log_memory_usage("Environment validated")
305
-
306
- # ============================================================================================
307
- # STAGE 2: VIDEO ANALYSIS
308
- # ============================================================================================
309
-
310
- log_progress("Analyzing input video", 0.05)
311
-
312
- video_file = Path(video_path)
313
- if not video_file.exists():
314
- raise FileNotFoundError(f"Video file not found: {video_path}")
315
-
316
- # Copy video to job directory for safe processing
317
- safe_video_path = job_directory / f"input{video_file.suffix}"
318
- if safe_video_path != video_file:
319
- logger.info(f"Copying video to job directory: {safe_video_path}")
320
- shutil.copy2(video_path, safe_video_path)
321
- video_path = str(safe_video_path)
322
-
323
- # Get video properties
324
- cap = cv2.VideoCapture(video_path)
325
- if not cap.isOpened():
326
- raise RuntimeError(f"Cannot open video: {video_path}")
327
-
328
- fps = cap.get(cv2.CAP_PROP_FPS)
329
- width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
330
- height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
331
- frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
332
- duration = frame_count / fps if fps > 0 else 0
333
- cap.release()
334
-
335
- logger.info(f"Video: {width}x{height} @ {fps:.1f}fps, {frame_count} frames ({duration:.1f}s)")
336
-
337
- # ============================================================================================
338
- # STAGE 3: BACKGROUND PREPARATION
339
- # ============================================================================================
340
-
341
- log_progress("Preparing background", 0.08)
342
-
343
- if background_image is None:
344
- raise ValueError("Background image is required")
345
-
346
- # Resize background to match video
347
- bg_image = background_image.resize((width, height), Image.LANCZOS)
348
- bg_array = np.array(bg_image)
349
- logger.info(f"Background prepared: {bg_image.size}")
350
-
351
- # ============================================================================================
352
- # STAGE 4: SAM2 MODEL LOADING
353
- # ============================================================================================
354
-
355
- log_progress("Loading SAM2 model", 0.1)
356
-
357
- # Download checkpoint
358
- sam2_checkpoint = "./checkpoints/sam2_hiera_large.pt"
359
- if not download_sam2_checkpoint(sam2_checkpoint, str(job_directory)):
360
- raise RuntimeError("Failed to download SAM2 checkpoint")
361
-
362
- # Import and load SAM2
363
- build_sam2_video_predictor = lazy_import_sam2()
364
- clear_gpu_memory()
365
-
366
- model_cfg = "sam2_hiera_l.yaml"
367
- predictor = build_sam2_video_predictor(model_cfg, sam2_checkpoint, device=device)
368
- logger.info("SAM2 model loaded successfully")
369
- log_memory_usage("SAM2 loaded")
370
-
371
- # ============================================================================================
372
- # STAGE 5: VIDEO PROCESSING INITIALIZATION
373
- # ============================================================================================
374
-
375
- log_progress("Initializing video processing", 0.2)
376
-
377
- inference_state = predictor.init_state(video_path=video_path)
378
-
379
- # Add prompt for person detection (center of frame)
380
- ann_frame_idx = 0
381
- ann_obj_id = 1
382
- points = np.array([[width//2, height//2]], dtype=np.float32)
383
- labels = np.array([1], np.int32)
384
-
385
- _, out_obj_ids, out_mask_logits = predictor.add_new_points(
386
- inference_state=inference_state,
387
- frame_idx=ann_frame_idx,
388
  obj_id=ann_obj_id,
389
- points=points,
390
  labels=labels,
391
  )
392
-
393
- logger.info("Video processing initialized with person detection prompt")
394
-
395
- # ============================================================================================
396
- # STAGE 6: CHUNKED MASK GENERATION WITH FULL CACHE CLEARANCE
397
- # ============================================================================================
398
-
399
- log_progress("Generating masks with chunked SAM2 processing", 0.3)
400
-
401
- # Calculate optimal chunk size based on available memory
402
- available_memory_gb = 12.0 # Conservative for T4
403
- estimated_memory_per_frame = 0.05 # ~50MB per frame for 720p
404
- max_chunk_size = min(200, int(available_memory_gb / estimated_memory_per_frame))
405
- chunk_size = max(50, max_chunk_size) # Minimum 50 frames, maximum based on memory
406
-
407
- logger.info(f"Using chunk size: {chunk_size} frames for {frame_count} total frames")
408
-
409
- video_segments = {}
410
- frames_processed = 0
411
-
412
- # Process video in chunks to prevent memory overflow
413
- for chunk_start in range(0, frame_count, chunk_size):
414
- chunk_end = min(chunk_start + chunk_size, frame_count)
415
- chunk_frames = chunk_end - chunk_start
416
-
417
- logger.info(f"Processing chunk: frames {chunk_start}-{chunk_end} ({chunk_frames} frames)")
418
-
419
- # Clear all GPU memory before each chunk
420
- clear_gpu_memory()
421
- log_memory_usage(f"Before chunk {chunk_start//chunk_size + 1}")
422
-
423
- try:
424
- # Create fresh inference state for this chunk
425
- chunk_inference_state = predictor.init_state(video_path=video_path)
426
-
427
- # Add prompt for this chunk (re-add for each chunk)
428
- _, out_obj_ids, out_mask_logits = predictor.add_new_points(
429
- inference_state=chunk_inference_state,
430
- frame_idx=chunk_start, # Use chunk start as reference frame
431
- obj_id=ann_obj_id,
432
- points=points,
433
- labels=labels,
434
- )
435
-
436
- # Process only frames in this chunk
437
- chunk_segments = {}
438
- for out_frame_idx, out_obj_ids, out_mask_logits in predictor.propagate_in_video(
439
- chunk_inference_state,
440
- start_frame_idx=chunk_start,
441
- max_frame_idx=chunk_end - 1
442
- ):
443
- if chunk_start <= out_frame_idx < chunk_end:
444
- # Immediately move masks to CPU and store
445
- frame_masks = {}
446
- for i, out_obj_id in enumerate(out_obj_ids):
447
- mask = (out_mask_logits[i] > 0.0).cpu().numpy()
448
- frame_masks[out_obj_id] = mask
449
-
450
- video_segments[out_frame_idx] = frame_masks
451
- chunk_segments[out_frame_idx] = frame_masks
452
- frames_processed += 1
453
-
454
- logger.info(f"Chunk {chunk_start//chunk_size + 1} complete: {len(chunk_segments)} masks generated")
455
-
456
- # Aggressive cleanup after each chunk
457
- del chunk_inference_state
458
- del chunk_segments
459
- clear_gpu_memory()
460
-
461
- # Progress update
462
- progress = 0.3 + (frames_processed / frame_count) * 0.4
463
- log_progress(f"Processed {frames_processed}/{frame_count} frames in chunks", progress)
464
-
465
- except Exception as e:
466
- logger.error(f"Chunk {chunk_start//chunk_size + 1} failed: {e}")
467
- # Try to continue with next chunk rather than failing completely
468
- clear_gpu_memory()
469
- continue
470
-
471
- logger.info(f"Chunked processing complete: {len(video_segments)} total masks generated")
472
- log_memory_usage("All chunks processed")
473
-
474
- # ============================================================================================
475
- # STAGE 7: COMPLETE SAM2 MODEL AND INFERENCE STATE CLEANUP
476
- # ============================================================================================
477
-
478
- log_progress("Complete SAM2 cleanup and memory reclaim", 0.72)
479
-
480
- try:
481
- # Delete all SAM2 references
482
- del predictor
483
- if 'inference_state' in locals():
484
- del inference_state
485
-
486
- # Remove SAM2 from Python modules
487
- import sys
488
- sam2_modules = [name for name in sys.modules.keys() if 'sam2' in name.lower()]
489
- logger.info(f"Removing {len(sam2_modules)} SAM2 modules from memory")
490
- for module_name in sam2_modules:
491
- try:
492
- del sys.modules[module_name]
493
- except Exception:
494
- pass
495
-
496
- # Force Python garbage collection
497
- import gc
498
- collected = gc.collect()
499
- logger.info(f"Garbage collected {collected} objects")
500
-
501
- # Final aggressive GPU cleanup
502
- import torch
503
- if torch.cuda.is_available():
504
- torch.cuda.empty_cache()
505
- torch.cuda.synchronize()
506
- # Reset memory stats
507
- torch.cuda.reset_peak_memory_stats()
508
-
509
- log_memory_usage("SAM2 completely removed")
510
-
511
- except Exception as e:
512
- logger.warning(f"SAM2 cleanup warning: {e}")
513
-
514
- # ============================================================================================
515
- # STAGE 8: MEMORY-EFFICIENT VIDEO COMPOSITION
516
- # ============================================================================================
517
-
518
- log_progress("Video composition with memory management", 0.8)
519
-
520
- output_path = job_directory / f"output_{int(time.time())}.mp4"
521
- out_writer = safe_video_writer(output_path, 'mp4v', fps, (width, height))
522
-
523
- cap = cv2.VideoCapture(video_path)
524
- frame_idx = 0
525
- composition_chunk_size = 50 # Smaller chunks for composition
526
-
527
- try:
528
- frames_batch = []
529
-
530
- while True:
531
- ret, frame = cap.read()
532
  if not ret:
533
  break
534
-
535
- # Process frame
536
- if frame_idx in video_segments and ann_obj_id in video_segments[frame_idx]:
537
- mask = video_segments[frame_idx][ann_obj_id]
538
- mask_3ch = np.stack([mask, mask, mask], axis=2)
539
-
540
- frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
541
- composite = frame_rgb * mask_3ch + bg_array * (1 - mask_3ch)
542
- composite_bgr = cv2.cvtColor(composite.astype(np.uint8), cv2.COLOR_RGB2BGR)
543
- out_writer.write(composite_bgr)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
544
  else:
545
- out_writer.write(frame)
546
-
547
- frame_idx += 1
548
-
549
- # Memory cleanup every batch
550
- if frame_idx % composition_chunk_size == 0:
551
- # Clear processed masks from memory to save RAM
552
- for i in range(max(0, frame_idx - composition_chunk_size), frame_idx):
553
- if i in video_segments:
554
- del video_segments[i]
555
-
556
- clear_gpu_memory()
557
- progress = 0.8 + (frame_idx / frame_count) * 0.15
558
- log_progress(f"Compositing {frame_idx}/{frame_count} (memory managed)", progress)
559
-
560
- finally:
561
- cap.release()
562
- out_writer.release()
563
-
564
- # Final cleanup of remaining masks
565
- video_segments.clear()
566
- clear_gpu_memory()
567
-
568
- # ============================================================================================
569
- # STAGE 9: AUDIO RESTORATION
570
- # ============================================================================================
571
-
572
- log_progress("Adding audio track", 0.95)
573
-
574
- final_output = job_directory / f"final_with_audio_{int(time.time())}.mp4"
575
-
576
- try:
577
- cmd = [
578
- 'ffmpeg', '-y', '-hide_banner', '-loglevel', 'error',
579
- '-i', str(output_path), # Video input
580
- '-i', video_path, # Audio source
581
- '-c:v', 'copy', # Copy video
582
- '-c:a', 'aac', # Encode audio
583
- '-shortest', # Match shortest stream
584
- str(final_output)
585
- ]
586
-
587
- result = subprocess.run(cmd, capture_output=True, text=True, timeout=60)
588
-
589
- if result.returncode == 0:
590
- logger.info("Audio successfully added")
591
- output_path.unlink() # Remove temp video
592
- final_path = str(final_output)
593
- else:
594
- logger.warning(f"Audio processing failed: {result.stderr}")
595
- final_path = str(output_path)
596
-
597
- except Exception as e:
598
- logger.warning(f"Audio processing error: {e}")
599
- final_path = str(output_path)
600
-
601
- # ============================================================================================
602
- # STAGE 10: COMPLETION
603
- # ============================================================================================
604
-
605
- total_time = time.time() - start_time
606
- log_memory_usage("Processing complete")
607
-
608
- try:
609
- import torch
610
- if torch.cuda.is_available():
611
- peak_memory = torch.cuda.max_memory_allocated() / (1024**3)
612
- logger.info(f"Peak GPU memory: {peak_memory:.2f}GB")
613
- except Exception:
614
- pass
615
-
616
- log_progress(f"Processing complete in {total_time:.1f}s", 1.0)
617
-
618
- logger.info(f"Output video: {final_path}")
619
- logger.info(f"Job directory: {job_directory}")
620
-
621
- return final_path
622
-
623
- except Exception as e:
624
- logger.error(f"Processing failed: {e}")
625
- logger.error(f"Job directory: {job_directory}")
626
- raise
627
-
628
- finally:
629
- # Final cleanup
630
- clear_gpu_memory()
 
1
  #!/usr/bin/env python3
2
  """
3
+ pipeline.py β€” Production SAM2 + MatAnyone (T4-optimized, single-pass streaming)
4
+
5
+ Key features
6
+ ------------
7
+ - One SAM2 inference state for the entire video (no per-chunk reinit).
8
+ - In-stream pipeline: Read β†’ SAM2 β†’ MatAnyone β†’ Compose β†’ Write (no big RAM dicts).
9
+ - Bounded memory everywhere (deque/window); optional CPU spill.
10
+ - fp16 + channels_last on SAM2; mixed precision blocks.
11
+ - VRAM-aware controller adjusts memory window/scale.
12
+ - Heartbeat logger to prevent HF watchdog restarts.
13
+ - Safer FFmpeg audio re-mux.
14
+
15
+ Compatible with Tesla T4 (β‰ˆ15–16 GB) and PyTorch 2.5.x + CUDA 12.4 wheels.
16
  """
17
 
18
  import os
19
+ import gc
20
  import cv2
21
  import time
22
  import uuid
23
+ import torch
24
+ import queue
25
  import shutil
26
+ import logging
27
  import tempfile
28
  import subprocess
29
+ import threading
30
  import numpy as np
31
  from PIL import Image
 
 
32
  from pathlib import Path
33
  from typing import Optional, Tuple, Dict, Any, Callable
34
+ from collections import deque
35
 
36
+ # ----------------------------------------------------------------------------------------------------------------------
37
+ # Logging
38
+ # ----------------------------------------------------------------------------------------------------------------------
39
+ logger = logging.getLogger("backgroundfx_pro")
40
+ if not logger.handlers:
41
+ h = logging.StreamHandler()
42
+ h.setFormatter(logging.Formatter("[%(asctime)s] %(levelname)s:%(name)s: %(message)s"))
43
+ logger.addHandler(h)
44
+ logger.setLevel(logging.INFO)
45
 
46
+ # ----------------------------------------------------------------------------------------------------------------------
47
+ # Environment & Torch tuning for T4
48
+ # ----------------------------------------------------------------------------------------------------------------------
49
+ def setup_t4_environment():
50
+ os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF",
51
+ "expandable_segments:True,max_split_size_mb:256,garbage_collection_threshold:0.7")
52
+ os.environ.setdefault("OMP_NUM_THREADS", "1")
53
+ os.environ.setdefault("OPENBLAS_NUM_THREADS", "1")
54
+ os.environ.setdefault("MKL_NUM_THREADS", "1")
55
+ os.environ.setdefault("OPENCV_OPENCL_RUNTIME", "disabled")
56
+ os.environ.setdefault("OPENCV_IO_ENABLE_OPENEXR", "0")
57
 
58
+ torch.set_grad_enabled(False)
 
59
  try:
60
+ torch.backends.cudnn.benchmark = True
61
+ torch.backends.cuda.matmul.allow_tf32 = True
62
+ torch.backends.cudnn.allow_tf32 = True
63
+ torch.set_float32_matmul_precision("high")
64
+ except Exception:
65
+ pass
66
+
67
+ if torch.cuda.is_available():
 
 
 
 
 
68
  try:
69
+ frac = float(os.getenv("CUDA_MEMORY_FRACTION", "0.88"))
70
+ torch.cuda.set_per_process_memory_fraction(frac)
71
+ logger.info(f"CUDA per-process memory fraction = {frac:.2f}")
72
+ except Exception as e:
73
+ logger.warning(f"Could not set CUDA memory fraction: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
 
75
+ def vram_gb() -> Tuple[float, float]:
76
+ if not torch.cuda.is_available():
77
+ return 0.0, 0.0
78
+ free, total = torch.cuda.mem_get_info()
79
+ return free / (1024 ** 3), total / (1024 ** 3)
80
+
81
+ # ----------------------------------------------------------------------------------------------------------------------
82
+ # Heartbeat (prevents Spaces watchdog killing the job)
83
+ # ----------------------------------------------------------------------------------------------------------------------
84
+ def heartbeat_monitor(running_flag: Dict[str, bool], interval: float = 8.0):
85
+ while running_flag.get("running", False):
86
+ print(f"[HB] t={int(time.time())}", flush=True)
87
+ time.sleep(interval)
88
+
89
+ # ----------------------------------------------------------------------------------------------------------------------
90
+ # Streaming video I/O
91
+ # ----------------------------------------------------------------------------------------------------------------------
92
+ class StreamingVideoIO:
93
+ def __init__(self, video_path: str, out_path: str, fps: float):
94
+ self.video_path = video_path
95
+ self.out_path = out_path
96
+ self.fps = fps
97
+ self.cap = None
98
+ self.writer = None
99
+ self.size = None
100
+
101
+ def __enter__(self):
102
+ self.cap = cv2.VideoCapture(self.video_path)
103
+ if not self.cap.isOpened():
104
+ raise RuntimeError(f"Cannot open video: {self.video_path}")
105
+ w = int(self.cap.get(cv2.CAP_PROP_FRAME_WIDTH))
106
+ h = int(self.cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
107
+ self.size = (w, h)
108
+ fourcc = cv2.VideoWriter_fourcc(*'mp4v')
109
+ self.writer = cv2.VideoWriter(self.out_path, fourcc, self.fps, (w, h))
110
+ return self
111
+
112
+ def __exit__(self, exc_type, exc_val, exc_tb):
113
+ if self.cap:
114
+ self.cap.release()
115
+ if self.writer:
116
+ self.writer.release()
117
+
118
+ def read_frame(self):
119
+ if not self.cap:
120
+ return False, None
121
+ return self.cap.read()
122
+
123
+ def write_frame(self, frame_bgr: np.ndarray):
124
+ if not self.writer:
125
+ return
126
+ self.writer.write(frame_bgr)
127
+
128
+ # ----------------------------------------------------------------------------------------------------------------------
129
+ # Models: loaders and safe optimizations
130
+ # ----------------------------------------------------------------------------------------------------------------------
131
+ def load_sam2_predictor(device: torch.device):
132
+ """
133
+ Prefer your local wrapper to keep interfaces stable.
134
+ """
135
  try:
136
+ from models.sam2_loader import SAM2Predictor # your wrapper
137
+ predictor = SAM2Predictor(device=device)
138
+ # Optional: try to access underlying model to set fp16 + channels_last
139
+ try:
140
+ if hasattr(predictor, "model") and predictor.model is not None:
141
+ predictor.model = predictor.model.half().to(device)
142
+ predictor.model = predictor.model.to(memory_format=torch.channels_last)
143
+ logger.info("SAM2: fp16 + channels_last applied (wrapper model).")
144
+ except Exception as e:
145
+ logger.warning(f"SAM2 fp16 optimization warning: {e}")
146
+ return predictor
 
 
 
147
  except Exception as e:
148
+ logger.error(f"Failed to import SAM2Predictor: {e}")
149
  raise
150
 
151
+ def load_matany_session(device: torch.device):
152
+ """
153
+ Supports either MatAnyoneSession or MatAnyoneLoader (your code has varied).
154
+ """
155
  try:
156
+ try:
157
+ from models.matanyone_loader import MatAnyoneSession as _MatAny
158
+ except Exception:
159
+ from models.matanyone_loader import MatAnyoneLoader as _MatAny
160
+ session = _MatAny(device=device)
161
+ # Try fp16 eval where safe
162
+ if hasattr(session, "model") and session.model is not None:
163
+ session.model.eval()
164
+ try:
165
+ session.model = session.model.half().to(device)
166
+ logger.info("MatAnyone: fp16 + eval applied.")
167
+ except Exception:
168
+ logger.info("MatAnyone: using fp32 (fp16 not supported for some layers).")
169
+ return session
170
  except Exception as e:
171
+ logger.warning(f"MatAnyone not available ({e}). Proceeding without refinement.")
172
  return None
173
 
174
+ # ----------------------------------------------------------------------------------------------------------------------
175
+ # SAM2 state pruning (adapter): we call predictor.prune_state if present, else best-effort
176
+ # ----------------------------------------------------------------------------------------------------------------------
177
def prune_sam2_state(predictor, state: Any, keep: int):
    """Best-effort trim of SAM2 temporal caches down to ``keep`` frames.

    Preference order:
      1. ``predictor.prune_state(state, keep=keep)`` when the adapter has it,
      2. ``state.prune(keep=keep)`` when the state object supports pruning,
      3. otherwise a no-op — rely on model internals and GC.

    Any failure is logged at debug level only; pruning is a memory
    optimization, never a correctness requirement.
    """
    try:
        if hasattr(predictor, "prune_state"):
            predictor.prune_state(state, keep=keep)
        else:
            pruner = getattr(state, "prune", None)
            if callable(pruner):
                pruner(keep=keep)
            # else: nothing to do for this predictor/state combination
    except Exception as e:
        logger.debug(f"SAM2 prune_state warning: {e}")
 
 
 
 
 
 
 
 
 
 
 
192
 
193
+ # ----------------------------------------------------------------------------------------------------------------------
194
+ # VRAM-aware controller
195
+ # ----------------------------------------------------------------------------------------------------------------------
196
class VRAMAdaptiveController:
    """Adapts SAM2 memory window / propagation scale to available VRAM.

    Settings are tightened when free VRAM drops below ~1.6 GB and relaxed
    again above ~3 GB, always within fixed safe bounds. Initial values are
    tunable via environment variables:

      SAM2_WINDOW      - frames kept in model state (default 96)
      SAM2_PROP_SCALE  - propagation downscale factor (default 0.90)

    Malformed env values fall back to the defaults instead of raising
    ``ValueError`` at construction time.
    """

    def __init__(self):
        # Frames to keep in the SAM2 temporal state.
        self.memory_window = self._env_int("SAM2_WINDOW", 96)
        # Downscale factor applied during mask propagation.
        self.propagation_scale = self._env_float("SAM2_PROP_SCALE", 0.90)
        # Run maintenance (prune + CUDA cache cleanup) every N frames.
        self.cleanup_every = 20

    @staticmethod
    def _env_int(name: str, default: int) -> int:
        """Read an int env var; silently fall back to default on bad input."""
        try:
            return int(os.getenv(name, str(default)))
        except (TypeError, ValueError):
            return default

    @staticmethod
    def _env_float(name: str, default: float) -> float:
        """Read a float env var; silently fall back to default on bad input."""
        try:
            return float(os.getenv(name, str(default)))
        except (TypeError, ValueError):
            return default

    def adapt(self):
        """Re-tune window/scale/cleanup cadence from current free VRAM.

        No-op when VRAM stats are unavailable (``vram_gb`` reports 0 free).
        """
        free, total = vram_gb()
        if free == 0.0:
            return
        # Tighten if we dip under ~1.6 GB free.
        if free < 1.6:
            self.memory_window = max(48, self.memory_window - 8)
            self.propagation_scale = max(0.75, self.propagation_scale - 0.03)
            self.cleanup_every = max(12, self.cleanup_every - 2)
            logger.warning(
                f"Low VRAM ({free:.2f} GB free) -> window={self.memory_window}, "
                f"scale={self.propagation_scale:.2f}"
            )
        # Relax if plenty free.
        elif free > 3.0:
            self.memory_window = min(128, self.memory_window + 4)
            self.propagation_scale = min(1.0, self.propagation_scale + 0.01)
            self.cleanup_every = min(40, self.cleanup_every + 2)
217
 
218
+ # ----------------------------------------------------------------------------------------------------------------------
219
+ # Audio mux helper (safer stream mapping)
220
+ # ----------------------------------------------------------------------------------------------------------------------
221
def mux_audio(video_path_no_audio: str, source_with_audio: str, out_path: str) -> bool:
    """Remux the original audio track onto the silent processed video.

    Explicit stream mapping: video comes from the rendered (silent) file,
    the first audio stream comes from the original source. Video is
    stream-copied, audio is re-encoded to AAC, and the output is trimmed
    to the shorter stream.

    Returns:
        True on success; False on any ffmpeg failure, timeout, or launch
        error (the caller then falls back to the silent render).
    """
    cmd = [
        "ffmpeg", "-y", "-hide_banner", "-loglevel", "error",
        "-i", video_path_no_audio,
        "-i", source_with_audio,
        "-map", "0:v:0", "-map", "1:a:0",
        "-c:v", "copy", "-c:a", "aac", "-shortest",
        out_path,
    ]
    try:
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=180)
        if result.returncode == 0:
            return True
        logger.warning(f"FFmpeg mux failed: {result.stderr.strip()}")
    except Exception as e:
        logger.warning(f"FFmpeg mux error: {e}")
    return False
239
 
240
+ # ----------------------------------------------------------------------------------------------------------------------
241
+ # Main processing
242
+ # ----------------------------------------------------------------------------------------------------------------------
 
243
  def process(
244
  video_path: str,
245
  background_image: Optional[Image.Image] = None,
 
249
  progress_callback: Optional[Callable[[str, float], None]] = None
250
  ) -> str:
251
  """
252
+ Production SAM2 + MatAnyone pipeline for T4.
253
+ - Single-pass streaming (no large mask dicts)
254
+ - Bounded memory windows
 
 
 
 
 
 
 
 
 
255
  """
256
+ setup_t4_environment()
257
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
258
+
259
+ # Heartbeat
260
+ hb_flag = {"running": True}
261
+ hb_thread = threading.Thread(target=heartbeat_monitor, args=(hb_flag, 8.0), daemon=True)
262
+ hb_thread.start()
263
+
264
+ def report(step: str, p: Optional[float] = None):
265
+ if p is None:
266
+ logger.info(step)
267
  else:
268
+ logger.info(f"{step} [{p:.1%}]")
269
  if progress_callback:
270
  try:
271
+ progress_callback(step, p)
272
  except Exception as e:
273
+ logger.debug(f"progress_callback error: {e}")
274
+
275
+ # Validate I/O
276
+ src = Path(video_path)
277
+ if not src.exists():
278
+ hb_flag["running"] = False
279
+ raise FileNotFoundError(f"Video not found: {video_path}")
280
+
281
  if job_directory is None:
282
  job_directory = Path.cwd() / "tmp" / f"job_{uuid.uuid4().hex[:8]}"
 
283
  job_directory.mkdir(parents=True, exist_ok=True)
284
+
285
+ # Probe video
286
+ cap_probe = cv2.VideoCapture(str(src))
287
+ if not cap_probe.isOpened():
288
+ hb_flag["running"] = False
289
+ raise RuntimeError(f"Cannot open video: {video_path}")
290
+ fps = cap_probe.get(cv2.CAP_PROP_FPS) or 25.0
291
+ width = int(cap_probe.get(cv2.CAP_PROP_FRAME_WIDTH))
292
+ height = int(cap_probe.get(cv2.CAP_PROP_FRAME_HEIGHT))
293
+ frame_count = int(cap_probe.get(cv2.CAP_PROP_FRAME_COUNT))
294
+ duration = frame_count / fps if fps > 0 else 0.0
295
+ cap_probe.release()
296
+ logger.info(f"Video: {width}x{height} @ {fps:.2f} fps | {frame_count} frames ({duration:.1f}s)")
297
+
298
+ # Prepare background
299
+ if background_image is None:
300
+ hb_flag["running"] = False
301
+ raise ValueError("background_image is required")
302
+ bg = background_image.resize((width, height), Image.LANCZOS)
303
+ bg_np = np.array(bg).astype(np.float32)
304
+
305
+ # Load models
306
+ report("Loading SAM2 + MatAnyone", 0.05)
307
+ predictor = load_sam2_predictor(device)
308
+ matany = load_matany_session(device)
309
+
310
+ # Init SAM2 state (single)
311
+ report("Initializing SAM2 video state", 0.08)
312
+ state = predictor.init_state(video_path=str(src))
313
+
314
+ # Minimal prompt: single positive point at center (replace with your prompt UI if needed)
315
+ center_pt = np.array([[width // 2, height // 2]], dtype=np.float32)
316
+ labels = np.array([1], dtype=np.int32)
317
+ ann_obj_id = 1
318
+ with torch.inference_mode():
319
+ _ = predictor.add_new_points(
320
+ inference_state=state,
321
+ frame_idx=0,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
322
  obj_id=ann_obj_id,
323
+ points=center_pt,
324
  labels=labels,
325
  )
326
+
327
+ # Controller
328
+ ctrl = VRAMAdaptiveController()
329
+
330
+ # Output paths
331
+ out_raw = str(job_directory / f"composite_{int(time.time())}.mp4")
332
+ out_final = str(job_directory / f"final_{int(time.time())}.mp4")
333
+
334
+ # Windows/buffers (bounded)
335
+ # For completeness we keep a tiny deque for any auxiliary temporal ops (e.g., matting history)
336
+ aux_window = deque(maxlen=max(32, min(96, ctrl.memory_window // 2)))
337
+
338
+ # Stream processing
339
+ start = time.time()
340
+ frames_done = 0
341
+ next_cleanup_at = ctrl.cleanup_every
342
+
343
+ report("Streaming: SAM2 β†’ MatAnyone β†’ Compose β†’ Write", 0.12)
344
+ with StreamingVideoIO(str(src), out_raw, fps) as vio:
345
+ # iterate SAM2 propagation alongside reading frames
346
+ with torch.inference_mode(), torch.autocast(device_type="cuda", dtype=torch.float16 if device.type == "cuda" else None):
347
+ for out_frame_idx, out_obj_ids, out_mask_logits in predictor.propagate_in_video(state, scale=ctrl.propagation_scale):
348
+ # Read the matching frame
349
+ ret, frame_bgr = vio.read_frame()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
350
  if not ret:
351
  break
352
+
353
+ # Get mask for ann_obj_id; keep on GPU as long as possible
354
+ mask_t = None
355
+ try:
356
+ if isinstance(out_obj_ids, torch.Tensor):
357
+ # find index where id == ann_obj_id
358
+ idxs = (out_obj_ids == ann_obj_id).nonzero(as_tuple=False)
359
+ if idxs.numel() > 0:
360
+ i = idxs[0].item()
361
+ logits = out_mask_logits[i]
362
+ else:
363
+ logits = None
364
+ else:
365
+ # list/array fallback
366
+ ids_list = list(out_obj_ids)
367
+ i = ids_list.index(ann_obj_id) if ann_obj_id in ids_list else -1
368
+ logits = out_mask_logits[i] if i >= 0 else None
369
+
370
+ if logits is not None:
371
+ # logits β†’ prob β†’ binary mask (threshold 0)
372
+ mask_t = (logits > 0).float() # HxW on CUDA fp16 β†’ fp32 float
373
+ except Exception as e:
374
+ logger.debug(f"Mask extraction warning @frame {out_frame_idx}: {e}")
375
+ mask_t = None
376
+
377
+ # Optional: MatAnyone refinement
378
+ if mask_t is not None and matany is not None:
379
+ try:
380
+ # MatAnyone APIs vary β€” try common forms
381
+ # Convert RGB because many mattors expect RGB
382
+ frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
383
+ # Move frame to GPU only if your matting backend supports it
384
+ refined = None
385
+ if hasattr(matany, "refine_mask"):
386
+ refined = matany.refine_mask(frame_rgb, mask_t) # allow handler to decide device
387
+ elif hasattr(matany, "process_frame"):
388
+ refined = matany.process_frame(frame_rgb, mask_t)
389
+ if refined is not None:
390
+ # ensure float mask 0..1 on CUDA or CPU
391
+ if isinstance(refined, torch.Tensor):
392
+ mask_t = refined.float()
393
+ else:
394
+ # numpy β†’ torch
395
+ mask_t = torch.from_numpy(refined.astype(np.float32))
396
+ if device.type == "cuda":
397
+ mask_t = mask_t.to(device)
398
+ except Exception as e:
399
+ logger.debug(f"MatAnyone refinement failed (frame {out_frame_idx}): {e}")
400
+
401
+ # Compose and write (convert once, keep math sane)
402
+ if mask_t is not None:
403
+ # bring mask to CPU for np composition; keep as float [0,1]
404
+ mask_np = mask_t.detach().clamp(0, 1).to("cpu", non_blocking=True).float().numpy()
405
+ m3 = mask_np[..., None] # HxWx1
406
+ frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB).astype(np.float32)
407
+ comp = frame_rgb * m3 + bg_np * (1.0 - m3)
408
+ comp_bgr = cv2.cvtColor(comp.astype(np.uint8), cv2.COLOR_RGB2BGR)
409
+ vio.write_frame(comp_bgr)
410
  else:
411
+ # No mask β€” write original frame
412
+ vio.write_frame(frame_bgr)
413
+
414
+ # Periodic maintenance
415
+ frames_done += 1
416
+ if frames_done >= next_cleanup_at:
417
+ ctrl.adapt()
418
+ prune_sam2_state(predictor, state, keep=ctrl.memory_window)
419
+ # Clear small aux buffers
420
+ aux_window.clear()
421
+ if device.type == "cuda":
422
+ torch.cuda.ipc_collect()
423
+ torch.cuda.empty_cache()
424
+ next_cleanup_at = frames_done + ctrl.cleanup_every
425
+
426
+ # Progress
427
+ if frames_done % 25 == 0 and frame_count > 0:
428
+ p = 0.12 + 0.75 * (frames_done / frame_count)
429
+ report(f"Processing frame {frames_done}/{frame_count} | win={ctrl.memory_window} scale={ctrl.propagation_scale:.2f}", p)
430
+
431
+ # Audio mux
432
+ report("Restoring audio", 0.93)
433
+ ok = mux_audio(out_raw, str(src), out_final)
434
+ final_path = out_final if ok else out_raw
435
+
436
+ # Cleanup models/state promptly
437
+ try:
438
+ del predictor
439
+ del state
440
+ if matany is not None:
441
+ del matany
442
+ except Exception:
443
+ pass
444
+
445
+ if device.type == "cuda":
446
+ torch.cuda.ipc_collect()
447
+ torch.cuda.empty_cache()
448
+ gc.collect()
449
+
450
+ hb_flag["running"] = False
451
+ elapsed = time.time() - start
452
+ try:
453
+ peak = torch.cuda.max_memory_allocated() / (1024 ** 3) if device.type == "cuda" else 0.0
454
+ logger.info(f"Peak GPU memory: {peak:.2f} GB")
455
+ except Exception:
456
+ pass
457
+ report(f"Done in {elapsed:.1f}s", 1.0)
458
+ logger.info(f"Output: {final_path}")
459
+ logger.info(f"Artifacts: {job_directory}")
460
+ return final_path
461
+
462
+
463
# -------------------------------------------------------------------------------------------------
# CLI entry (optional)
# -------------------------------------------------------------------------------------------------
if __name__ == "__main__":
    import argparse

    cli = argparse.ArgumentParser(description="BackgroundFX Pro pipeline")
    cli.add_argument("--video", required=True, help="Path to input video")
    cli.add_argument("--background", required=True, help="Path to background image")
    cli.add_argument("--outdir", default=None, help="Job directory (optional)")
    ns = cli.parse_args()

    # Load the replacement background and hand everything to the pipeline.
    background = Image.open(ns.background).convert("RGB")
    job_dir = Path(ns.outdir) if ns.outdir else None
    result = process(ns.video, background_image=background, job_directory=job_dir)
    print(result)