MogensR commited on
Commit
37f2d16
·
1 Parent(s): 96de0be
Files changed (1) hide show
  1. models/matanyone_loader.py +212 -59
models/matanyone_loader.py CHANGED
@@ -44,9 +44,98 @@ def _emit_progress(cb, pct: float, msg: str):
44
  except TypeError:
45
  pass # ignore if cb is incompatible
46
 
47
- class MatAnyError(Exception):
 
48
  pass
49
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  def _read_mask_hw(mask_path: Path, target_hw: Tuple[int, int]) -> np.ndarray:
51
  """Read mask image, convert to float32 [0,1], resize to target (H,W)."""
52
  if not Path(mask_path).exists():
@@ -60,6 +149,7 @@ def _read_mask_hw(mask_path: Path, target_hw: Tuple[int, int]) -> np.ndarray:
60
  maskf = (mask.astype(np.float32) / 255.0).clip(0.0, 1.0)
61
  return maskf
62
 
 
63
  def _to_chw01(img_bgr: np.ndarray) -> np.ndarray:
64
  """BGR [H,W,3] uint8 -> CHW float32 [0,1] RGB."""
65
  # OpenCV gives BGR; convert to RGB
@@ -68,13 +158,16 @@ def _to_chw01(img_bgr: np.ndarray) -> np.ndarray:
68
  chw = np.transpose(rgbf, (2, 0, 1)) # C,H,W
69
  return chw
70
 
 
71
  def _mask_to_1hw(mask_hw01: np.ndarray) -> np.ndarray:
72
  """HW float32 [0,1] -> 1HW float32 [0,1]."""
73
  return np.expand_dims(mask_hw01, axis=0)
74
 
 
75
  def _ensure_dir(p: Path) -> None:
76
  p.mkdir(parents=True, exist_ok=True)
77
 
 
78
  def _open_video_writers(out_dir: Path, fps: float, size: Tuple[int, int]) -> Tuple[cv2.VideoWriter, cv2.VideoWriter]:
79
  """Return (alpha_writer, fg_writer). size=(W,H)."""
80
  fourcc = cv2.VideoWriter_fourcc(*"mp4v")
@@ -88,10 +181,12 @@ def _open_video_writers(out_dir: Path, fps: float, size: Tuple[int, int]) -> Tup
88
  raise MatAnyError("Failed to open VideoWriter for alpha/fg outputs.")
89
  return alpha_writer, fg_writer
90
 
 
91
  def _validate_nonempty(file_path: Path) -> None:
92
  if not file_path.exists() or file_path.stat().st_size == 0:
93
  raise MatAnyError(f"Output file missing/empty: {file_path}")
94
 
 
95
  class MatAnyoneSession:
96
  """
97
  Unified, streaming wrapper over MatAnyone variants.
@@ -529,7 +624,7 @@ def process_stream(
529
  gpu_info = f" | GPU: {mem_alloc:.1f}/{mem_cached:.1f}MB"
530
 
531
  status = (f"Processing frame {idx+1}/{N} (ETA: {eta_str}, "
532
- f"{fps:.1f} FPS{gpu_info}")
533
  _emit_progress(progress_cb, min(0.99, current_progress), status)
534
  last_progress_update = time.time()
535
 
@@ -807,68 +902,126 @@ def process_stream(
807
  raise MatAnyError(error_msg) from e
808
 
809
  def _flush_chunk(self, frames_bgr, seed_1hw, alpha_writer, fg_writer):
810
- """Process a chunk of frames with MatAnyone.
 
 
 
 
 
811
 
812
- Args:
813
- frames_bgr: List of frames in BGR format
814
- seed_1hw: Seed mask in 1HW format or None
815
- alpha_writer: VideoWriter for alpha channel output
816
- fg_writer: VideoWriter for foreground output
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
817
 
818
- Raises:
819
- MatAnyError: If there's an error processing the frames
820
- """
821
- if not frames_bgr:
822
- return
 
 
 
 
 
 
 
 
823
 
824
- # Prepare inputs
825
- frames_chw = [_to_chw01(f) for f in frames_bgr] # list of CHW
826
- frames_t = torch.from_numpy(np.stack(frames_chw)).to(self.device) # T,C,H,W
827
- mask_t = None
828
- if seed_1hw is not None:
829
- mask_t = torch.from_numpy(seed_1hw).to(self.device)
830
 
831
- try:
832
- with torch.no_grad(), self._maybe_amp():
833
- # Process frames in batch
834
- if self._api_mode == "process_frame":
835
- alphas = []
836
- for i in range(len(frames_t)):
837
- # Only use mask on first frame if provided
838
- current_mask = mask_t if (i == 0 and mask_t is not None) else None
839
- alpha = self._core.process_frame(frames_t[i].unsqueeze(0), current_mask)
840
- alphas.append(alpha.squeeze(0))
841
- alphas = torch.stack(alphas)
842
- elif hasattr(self._core, '_process_tensor_video'):
843
- # Try direct tensor processing (newer versions)
844
- alphas = self._core._process_tensor_video(frames_t, mask_t)
845
- else: # step mode
846
- alphas = self._core.step(frames_t, mask_t)
847
-
848
- # Convert to numpy and write frames
849
- alphas_np = alphas.cpu().numpy()
850
- for i, alpha in enumerate(alphas_np):
851
- # Convert alpha to uint8 and write
852
- alpha_uint8 = (alpha * 255).astype(np.uint8)
853
- if len(alpha_uint8.shape) == 2: # If single channel, convert to 3 channels
854
- alpha_uint8 = cv2.cvtColor(alpha_uint8, cv2.COLOR_GRAY2BGR)
855
- alpha_writer.write(alpha_uint8)
856
-
857
- # Write foreground (frame * alpha)
858
- fg = frames_bgr[i] * (alpha[..., None] if alpha.ndim == 2 else alpha[0:1].permute(1, 2, 0))
859
- fg = fg.astype(np.uint8)
860
- fg_writer.write(fg)
861
 
862
- except RuntimeError as e:
863
- if "out of memory" in str(e).lower():
864
- # Clear CUDA cache and retry once
865
- torch.cuda.empty_cache()
866
- log.warning("CUDA out of memory, retrying after cache clear")
867
- return self._flush_chunk(frames_bgr, seed_1hw, alpha_writer, fg_writer)
868
- raise
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
869
 
870
  except Exception as e:
871
- error_msg = f"Error processing frame chunk: {str(e)}"
872
- log.error(error_msg, exc_info=True)
873
- raise MatAnyError(error_msg) from e
 
 
 
 
 
 
 
 
874
 
 
 
 
44
  except TypeError:
45
  pass # ignore if cb is incompatible
46
 
47
class MatAnyError(RuntimeError):
    """Custom exception for MatAnyone processing errors."""
50
 
51
+
52
+ def _to_device_batch(frames_bgr_np, device, dtype=torch.float16):
53
+ """
54
+ Convert a list/array of BGR uint8 frames [N,H,W,3] to a normalized
55
+ CHW tensor on device using pinned memory + non_blocking copies.
56
+ """
57
+ if isinstance(frames_bgr_np, list):
58
+ frames_bgr_np = np.stack(frames_bgr_np, axis=0) # [N,H,W,3]
59
+ # BGR -> RGB
60
+ frames_rgb = frames_bgr_np[..., ::-1].copy(order="C")
61
+ # to torch
62
+ pin = torch.from_numpy(frames_rgb).pin_memory() # uint8 [N,H,W,3]
63
+ # NCHW and normalize
64
+ t = pin.permute(0, 3, 1, 2).contiguous().to(device, non_blocking=True)
65
+ t = t.to(dtype=dtype) / 255.0
66
+ return t # [N,3,H,W]
67
+
68
+
69
+ def _select_matany_mode(core):
70
+ """
71
+ Pick the best-available MatAnyone API at runtime.
72
+ Priority: process_frame > _process_tensor_video > step
73
+ """
74
+ if hasattr(core, "process_frame"):
75
+ return "process_frame"
76
+ if hasattr(core, "_process_tensor_video"):
77
+ return "_process_tensor_video"
78
+ if hasattr(core, "step"):
79
+ return "step"
80
+ raise MatAnyError("No supported MatAnyone API on core (process_frame/_process_tensor_video/step).")
81
+
82
+
83
+ def _matany_run(core, mode, frames_04chw, seed_1hw=None):
84
+ """
85
+ Dispatch into the selected API. All tensors are on device.
86
+ Returns (alpha_1nhw, fg_n3hw) where alpha is [N,1,H,W], fg [N,3,H,W].
87
+ """
88
+ with torch.no_grad():
89
+ if mode == "process_frame":
90
+ alphas, fgs = [], []
91
+ # process_frame usually wants per-frame tensors in [1,3,H,W]
92
+ for i in range(frames_04chw.shape[0]):
93
+ f = frames_04chw[i:i+1] # [1,3,H,W]
94
+ if seed_1hw is not None and seed_1hw.ndim == 3:
95
+ a, fg = core.process_frame(f, seed_1hw.unsqueeze(0))
96
+ else:
97
+ a, fg = core.process_frame(f)
98
+ alphas.append(a) # [1,1,H,W]
99
+ fgs.append(fg) # [1,3,H,W]
100
+ alpha = torch.cat(alphas, dim=0)
101
+ fg = torch.cat(fgs, dim=0)
102
+ return alpha, fg
103
+
104
+ elif mode == "_process_tensor_video":
105
+ return core._process_tensor_video(frames_04chw.float(), seed_1hw)
106
+
107
+ elif mode == "step":
108
+ alphas, fgs = [], []
109
+ for i in range(frames_04chw.shape[0]):
110
+ f = frames_04chw[i:i+1]
111
+ if i == 0 and seed_1hw is not None:
112
+ a, fg = core.step(f, seed_1hw)
113
+ else:
114
+ a, fg = core.step(f)
115
+ alphas.append(a)
116
+ fgs.append(fg)
117
+ alpha = torch.cat(alphas, dim=0)
118
+ fg = torch.cat(fgs, dim=0)
119
+ return alpha, fg
120
+
121
+ raise MatAnyError(f"Unsupported mode: {mode}")
122
+
123
+
124
+ def _safe_empty_cache():
125
+ if torch.cuda.is_available():
126
+ torch.cuda.synchronize()
127
+ torch.cuda.empty_cache()
128
+
129
+
130
+ def _cuda_snapshot():
131
+ if not torch.cuda.is_available():
132
+ return "CUDA: N/A"
133
+ i = torch.cuda.current_device()
134
+ return (f"device={i}, name={torch.cuda.get_device_name(i)}, "
135
+ f"alloc={torch.cuda.memory_allocated(i)/1e9:.2f}GB, "
136
+ f"reserved={torch.cuda.memory_reserved(i)/1e9:.2f}GB")
137
+
138
+
139
  def _read_mask_hw(mask_path: Path, target_hw: Tuple[int, int]) -> np.ndarray:
140
  """Read mask image, convert to float32 [0,1], resize to target (H,W)."""
141
  if not Path(mask_path).exists():
 
149
  maskf = (mask.astype(np.float32) / 255.0).clip(0.0, 1.0)
150
  return maskf
151
 
152
+
153
  def _to_chw01(img_bgr: np.ndarray) -> np.ndarray:
154
  """BGR [H,W,3] uint8 -> CHW float32 [0,1] RGB."""
155
  # OpenCV gives BGR; convert to RGB
 
158
  chw = np.transpose(rgbf, (2, 0, 1)) # C,H,W
159
  return chw
160
 
161
+
162
  def _mask_to_1hw(mask_hw01: np.ndarray) -> np.ndarray:
163
  """HW float32 [0,1] -> 1HW float32 [0,1]."""
164
  return np.expand_dims(mask_hw01, axis=0)
165
 
166
+
167
  def _ensure_dir(p: Path) -> None:
168
  p.mkdir(parents=True, exist_ok=True)
169
 
170
+
171
  def _open_video_writers(out_dir: Path, fps: float, size: Tuple[int, int]) -> Tuple[cv2.VideoWriter, cv2.VideoWriter]:
172
  """Return (alpha_writer, fg_writer). size=(W,H)."""
173
  fourcc = cv2.VideoWriter_fourcc(*"mp4v")
 
181
  raise MatAnyError("Failed to open VideoWriter for alpha/fg outputs.")
182
  return alpha_writer, fg_writer
183
 
184
+
185
  def _validate_nonempty(file_path: Path) -> None:
186
  if not file_path.exists() or file_path.stat().st_size == 0:
187
  raise MatAnyError(f"Output file missing/empty: {file_path}")
188
 
189
+
190
  class MatAnyoneSession:
191
  """
192
  Unified, streaming wrapper over MatAnyone variants.
 
624
  gpu_info = f" | GPU: {mem_alloc:.1f}/{mem_cached:.1f}MB"
625
 
626
  status = (f"Processing frame {idx+1}/{N} (ETA: {eta_str}, "
627
+ f"{fps:.1f} FPS{gpu_info}")
628
  _emit_progress(progress_cb, min(0.99, current_progress), status)
629
  last_progress_update = time.time()
630
 
 
902
  raise MatAnyError(error_msg) from e
903
 
904
  def _flush_chunk(self, frames_bgr, seed_1hw, alpha_writer, fg_writer):
905
+ """
906
+ Take an in-memory batch of frames (list of np.uint8 BGR), run MatAnyone on GPU,
907
+ then write alpha/fg frames via the provided writers. Clears GPU memory on exit.
908
+ """
909
+ # Initialize variables for cleanup
910
+ alpha_n1hw, fg_n3hw, frames_04chw = None, None, None
911
 
912
+ try:
913
+ device = self.device
914
+ use_fp16 = (device.type == "cuda") and getattr(self, 'use_fp16', True)
915
+ mode = _select_matany_mode(self._core)
916
+
917
+ # Move input frames to device in a batched, pinned way
918
+ frames_04chw = _to_device_batch(frames_bgr, device,
919
+ dtype=torch.float16 if use_fp16 else torch.float32)
920
+
921
+ # Move seed mask to device if provided
922
+ seed_tensor = None
923
+ if seed_1hw is not None:
924
+ seed_tensor = torch.from_numpy(seed_1hw).to(device)
925
+
926
+ # Process with CUDA stream if available
927
+ if device.type == "cuda":
928
+ stream = torch.cuda.Stream()
929
+ with torch.cuda.stream(stream):
930
+ with torch.autocast(device_type="cuda", enabled=use_fp16):
931
+ alpha_n1hw, fg_n3hw = _matany_run(self._core, mode, frames_04chw, seed_tensor)
932
+ torch.cuda.synchronize()
933
+ else:
934
+ alpha_n1hw, fg_n3hw = _matany_run(self._core, mode, frames_04chw, seed_tensor)
935
+
936
+ # Write out results (convert back to CPU uint8)
937
+ alpha_cpu = (alpha_n1hw.clamp(0, 1) * 255.0).byte().squeeze(1).contiguous().cpu().numpy() # [N,H,W]
938
 
939
+ for i in range(alpha_cpu.shape[0]):
940
+ # Write alpha mask
941
+ alpha_uint8 = alpha_cpu[i]
942
+ if len(alpha_uint8.shape) == 2: # Ensure 3 channels for writer
943
+ alpha_uint8 = cv2.cvtColor(alpha_uint8, cv2.COLOR_GRAY2BGR)
944
+ alpha_writer.write(alpha_uint8)
945
+
946
+ # Write foreground (frame * alpha)
947
+ alpha_expanded = alpha_cpu[i] / 255.0
948
+ if alpha_expanded.ndim == 2:
949
+ alpha_expanded = alpha_expanded[..., None] # [H,W,1]
950
+ fg = (frames_bgr[i] * alpha_expanded).astype(np.uint8)
951
+ fg_writer.write(fg)
952
 
953
+ # Keep seed for temporal methods that need it
954
+ if hasattr(self._core, "last_mask"):
955
+ self._last_alpha_1hw = self._core.last_mask
 
 
 
956
 
957
+ except torch.cuda.OutOfMemoryError as e:
958
+ # Downshift strategy: smaller chunk or resolution; propagate with context
959
+ snap = _cuda_snapshot()
960
+ _safe_empty_cache()
961
+ raise MatAnyError(f"CUDA OOM in _flush_chunk (before retry). Snapshot: {snap}") from e
962
+
963
+ except Exception as e:
964
+ # Convert unexpected exceptions to MatAnyError with context
965
+ snap = _cuda_snapshot()
966
+ raise MatAnyError(f"MatAnyone failure in _flush_chunk: {e} | {snap}") from e
967
+
968
+ finally:
969
+ # Hard cleanup to avoid lingering allocations between chunks
970
+ for var in [alpha_n1hw, fg_n3hw, frames_04chw]:
971
+ try:
972
+ del var
973
+ except Exception:
974
+ pass
975
+ _safe_empty_cache()
 
 
 
 
 
 
 
 
 
 
 
976
 
977
+ def process_stream(self, frames_iterable, seed_1hw, alpha_writer, fg_writer, chunk_size=32):
978
+ """
979
+ Public entry that buffers frames from an iterator and processes them in chunks.
980
+ Ensures cleanup and graceful degradation on OOM.
981
+ """
982
+ frames_buf = []
983
+ last_error = None
984
+
985
+ try:
986
+ for f in frames_iterable:
987
+ frames_buf.append(f)
988
+ if len(frames_buf) >= chunk_size:
989
+ try:
990
+ self._flush_chunk(frames_buf, seed_1hw, alpha_writer, fg_writer)
991
+ frames_buf.clear()
992
+ except MatAnyError as e:
993
+ # Attempt one downshift: halve the chunk and retry once
994
+ if chunk_size > 4:
995
+ half = max(4, chunk_size // 2)
996
+ # Split and try smaller batches
997
+ for i in range(0, len(frames_buf), half):
998
+ sub = frames_buf[i:i+half]
999
+ self._flush_chunk(sub, seed_1hw, alpha_writer, fg_writer)
1000
+ frames_buf.clear()
1001
+ else:
1002
+ last_error = e
1003
+ break
1004
+
1005
+ # Flush remainder
1006
+ if frames_buf:
1007
+ self._flush_chunk(frames_buf, seed_1hw, alpha_writer, fg_writer)
1008
+ frames_buf.clear()
1009
+
1010
+ except torch.cuda.OutOfMemoryError as e:
1011
+ last_error = MatAnyError(f"CUDA OOM in process_stream: {_cuda_snapshot()}") from e
1012
 
1013
  except Exception as e:
1014
+ last_error = MatAnyError(f"Unexpected error in process_stream: {e}") from e
1015
+
1016
+ finally:
1017
+ frames_buf.clear()
1018
+ _safe_empty_cache()
1019
+ # Optional: if core has a reset method, call it
1020
+ if hasattr(getattr(self, '_core', None), 'reset'):
1021
+ try:
1022
+ self._core.reset()
1023
+ except Exception:
1024
+ pass
1025
 
1026
+ if last_error:
1027
+ raise last_error