Spaces:

MogensR
/

VideoBackgroundReplacer2

Paused

App Files Community

MogensR committed on Sep 15, 2025

Commit

db85143

1 Parent(s): 37f2d16

agent 2.1

Browse files

Files changed (1) hide show

models/matanyone_loader.py +130 -66

models/matanyone_loader.py CHANGED Viewed

@@ -49,6 +49,95 @@ class MatAnyError(RuntimeError):
     pass
 def _to_device_batch(frames_bgr_np, device, dtype=torch.float16):
     """
     Convert a list/array of BGR uint8 frames [N,H,W,3] to a normalized
@@ -903,85 +992,62 @@ def process_stream(
     def _flush_chunk(self, frames_bgr, seed_1hw, alpha_writer, fg_writer):
         """
-        Take an in-memory batch of frames (list of np.uint8 BGR), run MatAnyone on GPU,
-        then write alpha/fg frames via the provided writers. Clears GPU memory on exit.
         """
-        # Initialize variables for cleanup
-        alpha_n1hw, fg_n3hw, frames_04chw = None, None, None
-        try:
-            device = self.device
-            use_fp16 = (device.type == "cuda") and getattr(self, 'use_fp16', True)
-            mode = _select_matany_mode(self._core)
-            # Move input frames to device in a batched, pinned way
-            frames_04chw = _to_device_batch(frames_bgr, device,
-                                          dtype=torch.float16 if use_fp16 else torch.float32)
-            # Move seed mask to device if provided
-            seed_tensor = None
-            if seed_1hw is not None:
-                seed_tensor = torch.from_numpy(seed_1hw).to(device)
-            # Process with CUDA stream if available
             if device.type == "cuda":
                 stream = torch.cuda.Stream()
                 with torch.cuda.stream(stream):
                     with torch.autocast(device_type="cuda", enabled=use_fp16):
-                        alpha_n1hw, fg_n3hw = _matany_run(self._core, mode, frames_04chw, seed_tensor)
-                torch.cuda.synchronize()
             else:
-                alpha_n1hw, fg_n3hw = _matany_run(self._core, mode, frames_04chw, seed_tensor)
-            # Write out results (convert back to CPU uint8)
-            alpha_cpu = (alpha_n1hw.clamp(0, 1) * 255.0).byte().squeeze(1).contiguous().cpu().numpy()  # [N,H,W]
             for i in range(alpha_cpu.shape[0]):
-                # Write alpha mask
-                alpha_uint8 = alpha_cpu[i]
-                if len(alpha_uint8.shape) == 2:  # Ensure 3 channels for writer
-                    alpha_uint8 = cv2.cvtColor(alpha_uint8, cv2.COLOR_GRAY2BGR)
-                alpha_writer.write(alpha_uint8)
-                # Write foreground (frame * alpha)
-                alpha_expanded = alpha_cpu[i] / 255.0
-                if alpha_expanded.ndim == 2:
-                    alpha_expanded = alpha_expanded[..., None]  # [H,W,1]
-                fg = (frames_bgr[i] * alpha_expanded).astype(np.uint8)
-                fg_writer.write(fg)
-            # Keep seed for temporal methods that need it
             if hasattr(self._core, "last_mask"):
                 self._last_alpha_1hw = self._core.last_mask
         except torch.cuda.OutOfMemoryError as e:
-            # Downshift strategy: smaller chunk or resolution; propagate with context
             snap = _cuda_snapshot()
             _safe_empty_cache()
-            raise MatAnyError(f"CUDA OOM in _flush_chunk (before retry). Snapshot: {snap}") from e
         except Exception as e:
-            # Convert unexpected exceptions to MatAnyError with context
             snap = _cuda_snapshot()
             raise MatAnyError(f"MatAnyone failure in _flush_chunk: {e} | {snap}") from e
         finally:
-            # Hard cleanup to avoid lingering allocations between chunks
-            for var in [alpha_n1hw, fg_n3hw, frames_04chw]:
-                try:
-                    del var
-                except Exception:
-                    pass
             _safe_empty_cache()
     def process_stream(self, frames_iterable, seed_1hw, alpha_writer, fg_writer, chunk_size=32):
         """
-        Public entry that buffers frames from an iterator and processes them in chunks.
-        Ensures cleanup and graceful degradation on OOM.
         """
         frames_buf = []
-        last_error = None
         try:
             for f in frames_iterable:
                 frames_buf.append(f)
@@ -989,39 +1055,37 @@ def process_stream(self, frames_iterable, seed_1hw, alpha_writer, fg_writer, chu
                     try:
                         self._flush_chunk(frames_buf, seed_1hw, alpha_writer, fg_writer)
                         frames_buf.clear()
-                    except MatAnyError as e:
-                        # Attempt one downshift: halve the chunk and retry once
                         if chunk_size > 4:
                             half = max(4, chunk_size // 2)
-                            # Split and try smaller batches
                             for i in range(0, len(frames_buf), half):
                                 sub = frames_buf[i:i+half]
                                 self._flush_chunk(sub, seed_1hw, alpha_writer, fg_writer)
                             frames_buf.clear()
                         else:
-                            last_error = e
-                            break
-            # Flush remainder
             if frames_buf:
                 self._flush_chunk(frames_buf, seed_1hw, alpha_writer, fg_writer)
                 frames_buf.clear()
         except torch.cuda.OutOfMemoryError as e:
-            last_error = MatAnyError(f"CUDA OOM in process_stream: {_cuda_snapshot()}") from e
         except Exception as e:
-            last_error = MatAnyError(f"Unexpected error in process_stream: {e}") from e
         finally:
             frames_buf.clear()
             _safe_empty_cache()
-            # Optional: if core has a reset method, call it
-            if hasattr(getattr(self, '_core', None), 'reset'):
                 try:
                     self._core.reset()
                 except Exception:
                     pass
-        if last_error:
-            raise last_error

     pass
+def _to_device_batch(frames_bgr_np, device, dtype=torch.float16):
+    """
+    frames_bgr_np: list or np.ndarray of shape [N,H,W,3], dtype=uint8, BGR
+    Returns torch tensor [N,3,H,W] on device, normalized to 0..1
+    """
+    if isinstance(frames_bgr_np, list):
+        frames_bgr_np = np.stack(frames_bgr_np, axis=0)
+    frames_rgb = frames_bgr_np[..., ::-1].copy(order="C")               # BGR->RGB
+    pin = torch.from_numpy(frames_rgb).pin_memory()                     # [N,H,W,3]
+    t = pin.permute(0, 3, 1, 2).contiguous().to(device, non_blocking=True)
+    t = t.to(dtype=dtype) / 255.0
+    return t                                                            # [N,3,H,W]
+def _select_matany_mode(core):
+    """Pick best available API."""
+    if hasattr(core, "process_frame"):
+        return "process_frame"
+    if hasattr(core, "_process_tensor_video"):
+        return "_process_tensor_video"
+    if hasattr(core, "step"):
+        return "step"
+    raise MatAnyError("MatAnyone core has no supported API (process_frame/_process_tensor_video/step).")
+def _matany_run(core, mode, frames_04chw, seed_1hw=None, use_fp16=False):
+    """
+    Returns (alpha [N,1,H,W], fg [N,3,H,W]) on current device.
+    """
+    with torch.no_grad():
+        if mode == "process_frame":
+            alphas, fgs = [], []
+            for i in range(frames_04chw.shape[0]):
+                f = frames_04chw[i:i+1]  # [1,3,H,W]
+                if seed_1hw is not None and seed_1hw.ndim == 3:
+                    a, fg = core.process_frame(f, seed_1hw.unsqueeze(0))
+                else:
+                    a, fg = core.process_frame(f)
+                alphas.append(a)  # [1,1,H,W]
+                fgs.append(fg)    # [1,3,H,W]
+            alpha = torch.cat(alphas, dim=0)
+            fg = torch.cat(fgs, dim=0)
+            return alpha, fg
+        elif mode == "_process_tensor_video":
+            # Many repos expect float32 for this path
+            return core._process_tensor_video(frames_04chw.float(), seed_1hw)
+        elif mode == "step":
+            alphas, fgs = [], []
+            for i in range(frames_04chw.shape[0]):
+                f = frames_04chw[i:i+1]
+                if i == 0 and seed_1hw is not None:
+                    a, fg = core.step(f, seed_1hw)
+                else:
+                    a, fg = core.step(f)
+                alphas.append(a)
+                fgs.append(fg)
+            alpha = torch.cat(alphas, dim=0)
+            fg = torch.cat(fgs, dim=0)
+            return alpha, fg
+    raise MatAnyError(f"Unsupported MatAnyone mode: {mode}")
+def _cuda_snapshot():
+    if not torch.cuda.is_available():
+        return "CUDA: N/A"
+    i = torch.cuda.current_device()
+    return (f"device={i}, name={torch.cuda.get_device_name(i)}, "
+            f"alloc={torch.cuda.memory_allocated(i)/1e9:.2f}GB, "
+            f"reserved={torch.cuda.memory_reserved(i)/1e9:.2f}GB")
+def _safe_empty_cache():
+    if torch.cuda.is_available():
+        try:
+            torch.cuda.synchronize()
+        except Exception:
+            pass
+        torch.cuda.empty_cache()
+def _to_uint8_cpu(alpha_n1hw, fg_n3hw):
+    alpha_cpu = (alpha_n1hw.clamp(0, 1) * 255.0).byte().squeeze(1).contiguous().cpu().numpy()      # [N,H,W]
+    fg_cpu    = (fg_n3hw.clamp(0, 1) * 255.0).byte().permute(0, 2, 3, 1).contiguous().cpu().numpy() # [N,H,W,3] RGB
+    return alpha_cpu, fg_cpu
 def _to_device_batch(frames_bgr_np, device, dtype=torch.float16):
     """
     Convert a list/array of BGR uint8 frames [N,H,W,3] to a normalized
     def _flush_chunk(self, frames_bgr, seed_1hw, alpha_writer, fg_writer):
         """
+        Process an in-memory batch (list of uint8 BGR frames), write results via writers.
+        Strong CUDA guards + cleanup.
         """
+        device = self.device
+        use_fp16 = (device.type == "cuda") and getattr(self, "use_fp16", True)
+        mode = _select_matany_mode(self._core)
+        frames_04chw = None
+        alpha_n1hw = None
+        fg_n3hw = None
+        try:
+            frames_04chw = _to_device_batch(frames_bgr, device, dtype=torch.float16 if use_fp16 else torch.float32)
             if device.type == "cuda":
                 stream = torch.cuda.Stream()
                 with torch.cuda.stream(stream):
                     with torch.autocast(device_type="cuda", enabled=use_fp16):
+                        alpha_n1hw, fg_n3hw = _matany_run(self._core, mode, frames_04chw, seed_1hw, use_fp16)
+                stream.synchronize()
             else:
+                alpha_n1hw, fg_n3hw = _matany_run(self._core, mode, frames_04chw, seed_1hw, use_fp16)
+            alpha_cpu, fg_cpu = _to_uint8_cpu(alpha_n1hw, fg_n3hw)
             for i in range(alpha_cpu.shape[0]):
+                alpha_writer.write(alpha_cpu[i])                 # [H,W] uint8
+                fg_writer.write(fg_cpu[i][..., ::-1].copy())     # RGB->BGR
             if hasattr(self._core, "last_mask"):
                 self._last_alpha_1hw = self._core.last_mask
         except torch.cuda.OutOfMemoryError as e:
             snap = _cuda_snapshot()
             _safe_empty_cache()
+            # Re-raise with context for pipeline to catch
+            raise MatAnyError(f"CUDA OOM in _flush_chunk | {snap}") from e
         except Exception as e:
             snap = _cuda_snapshot()
             raise MatAnyError(f"MatAnyone failure in _flush_chunk: {e} | {snap}") from e
         finally:
+            # ensure we release heavy tensors
+            try:
+                del alpha_n1hw, fg_n3hw, frames_04chw
+            except Exception:
+                pass
             _safe_empty_cache()
     def process_stream(self, frames_iterable, seed_1hw, alpha_writer, fg_writer, chunk_size=32):
         """
+        Buffer frames from iterable and process in chunks.
+        On OOM, retry once with half chunk size; otherwise bubble up MatAnyError.
         """
         frames_buf = []
         try:
             for f in frames_iterable:
                 frames_buf.append(f)
                     try:
                         self._flush_chunk(frames_buf, seed_1hw, alpha_writer, fg_writer)
                         frames_buf.clear()
+                    except torch.cuda.OutOfMemoryError:
+                        # should be wrapped above, but double-guard
+                        raise
+                    except MatAnyError as inner:
+                        # one-time downshift
                         if chunk_size > 4:
                             half = max(4, chunk_size // 2)
                             for i in range(0, len(frames_buf), half):
                                 sub = frames_buf[i:i+half]
                                 self._flush_chunk(sub, seed_1hw, alpha_writer, fg_writer)
                             frames_buf.clear()
                         else:
+                            raise inner
             if frames_buf:
                 self._flush_chunk(frames_buf, seed_1hw, alpha_writer, fg_writer)
                 frames_buf.clear()
         except torch.cuda.OutOfMemoryError as e:
+            snap = _cuda_snapshot()
+            _safe_empty_cache()
+            raise MatAnyError(f"CUDA OOM in process_stream outer | {snap}") from e
         except Exception as e:
+            raise MatAnyError(f"Unexpected error in process_stream: {e}") from e
         finally:
             frames_buf.clear()
             _safe_empty_cache()
+            if hasattr(self._core, "reset"):
                 try:
                     self._core.reset()
                 except Exception:
                     pass