Spaces:

MogensR
/

VideoBackgroundReplacer2

Paused

App Files Files Community

MogensR committed on Sep 15, 2025

Commit

80ac736

1 Parent(s): b8796b9

agent 1.4

Browse files

Files changed (1) hide show

models/matanyone_loader.py +192 -105

models/matanyone_loader.py CHANGED Viewed

@@ -26,6 +26,7 @@
 import time
 import torch
 import logging
 import numpy as np
 from pathlib import Path
 from typing import Optional, Callable, Tuple, Union
@@ -229,114 +230,194 @@ def _run_frame(self, frame_bgr: np.ndarray, seed_1hw: Optional[np.ndarray], is_f
         return alpha_np
     def process_stream(
         self,
         video_path: Path,
-        seed_mask_path: Optional[Path],
-        out_dir: Path,
-        progress_cb: Optional[Callable[[float, str], None]] = None,
     ) -> Tuple[Path, Path]:
         """
-        Stream the video, write alpha.mp4 and fg.mp4, return their paths.
-        """
-        log.info(f"[MATANY] Starting process_video: {video_path}")
-        log.info(f"[MATANY] API mode: {self._api_mode}")
-        log.info(f"[MATANY] Device: {self.device}")
-        video_path = Path(video_path)
         out_dir = Path(out_dir)
-        _ensure_dir(out_dir)
         cap = cv2.VideoCapture(str(video_path))
         if not cap.isOpened():
             raise MatAnyError(f"Failed to open video: {video_path}")
-        fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
-        W   = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
-        H   = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
-        N   = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
-        log.info(f"[MATANY] Video info: {W}x{H}, {N} frames, {fps} fps")
-        alpha_writer, fg_writer = _open_video_writers(out_dir, fps, (W, H))
-        seed_1hw = None
-        if seed_mask_path is not None:
-            seed_hw = _read_mask_hw(seed_mask_path, (H, W))
-            seed_1hw = _mask_to_1hw(seed_hw)
-        # If only process_video is available, we'll chunk to avoid RAM blow-ups.
         if self._api_mode == "process_video":
-            log.info(f"[MATANY] Using chunked process_video mode")
-            frames_buf = []
-            idx = 0
-            chunk = max(1, min(64, int(2048 * 1024 * 1024 / (H * W * 3 * 4))))  # ~2GB budget heuristic
-            # SAFETY: never 0
-            if chunk <= 0:
-                chunk = 32
-            log.info(f"[MATANY] Chunk size: {chunk} frames")
-            while True:
-                ret, frame = cap.read()
-                if not ret:  # flush tail
-                    if frames_buf:
-                        log.info(f"[MATANY] Flushing final chunk of {len(frames_buf)} frames")
-                        self._flush_chunk(frames_buf, seed_1hw, alpha_writer, fg_writer)
-                    break
-                frames_buf.append(frame.copy())
-                if len(frames_buf) >= chunk:
-                    log.info(f"[MATANY] Processing chunk {idx//chunk + 1}: {len(frames_buf)} frames")
-                    self._flush_chunk(frames_buf, seed_1hw, alpha_writer, fg_writer)
-                    frames_buf.clear()
-                idx += 1
-                if N > 0:
-                    _emit_progress(progress_cb, idx / N, f"MatAnyone chunking… ({idx}/{N})")
         else:
             # Frame-by-frame (preferred)
             log.info(f"[MATANY] Using frame-by-frame mode: {self._api_mode}")
-            idx = 0
-            while True:
-                ret, frame = cap.read()
-                if not ret:
-                    break
-                if idx % 10 == 0:
-                    _emit_progress(progress_cb, min(0.999, (idx / N) if N > 0 else 0.0),
-                                 f"MatAnyone matting… ({idx}/{N})")
-                log.debug(f"[MATANY] Processing frame {idx+1}/{N}")
-                # Only pass seed mask on first frame
-                current_mask = seed_1hw if idx == 0 else None
-                alpha_hw = self._run_frame(frame, current_mask, is_first=(idx == 0))
-                # compose fg for immediate write
-                # alpha 0..1 -> 0..255 3-channel grayscale
-                alpha_u8 = (alpha_hw * 255.0 + 0.5).astype(np.uint8)
-                alpha_rgb = cv2.cvtColor(alpha_u8, cv2.COLOR_GRAY2BGR)
-                # Blend: fg = alpha*frame + (1-alpha)*black == alpha*frame
-                fg_bgr = (frame.astype(np.float32) * (alpha_hw[..., None])).clip(0, 255).astype(np.uint8)
-                alpha_writer.write(alpha_rgb)
-                fg_writer.write(fg_bgr)
-                idx += 1
-                if progress_cb and N > 0 and idx % 10 == 0:
-                    progress_cb(f"MatAnyone matting… ({idx}/{N})")
-                    log.info(f"[MATANY] Progress: {idx}/{N} frames processed")
-        cap.release()
-        alpha_writer.release()
-        fg_writer.release()
-        alpha_path = out_dir / "alpha.mp4"
-        fg_path    = out_dir / "fg.mp4"
-        _validate_nonempty(alpha_path)
-        _validate_nonempty(fg_path)
-        return alpha_path, fg_path
     def _flush_chunk(self, frames_bgr, seed_1hw, alpha_writer, fg_writer):
-        """Call core.process_video(frames, mask) safely, then write results."""
         # Prepare inputs
         frames_chw = [_to_chw01(f) for f in frames_bgr]                     # list of CHW
         frames_t   = torch.from_numpy(np.stack(frames_chw)).to(self.device) # T,C,H,W
@@ -344,24 +425,30 @@ def _flush_chunk(self, frames_bgr, seed_1hw, alpha_writer, fg_writer):
         with torch.no_grad(), self._maybe_amp():
             try:
-                # Preferred: T,C,H,W (+ 1,H,W mask)
-                alphas = self._core.process_video(frames_t, mask_t)
-            except RuntimeError as e:
-                # Some wheels require B,T,C,H,W (+ B,T,1,H,W)
-                msg = str(e)
-                if "number of dimensions" in msg or "Expected" in msg or "got" in msg:
-                    frames_btchw = frames_t.unsqueeze(0)  # 1,T,C,H,W
-                    mask_bt1hw = mask_t.unsqueeze(0) if mask_t is not None else None  # 1,1,H,W -> (maybe ok) ; some expect 1,T,1,H,W
-                    # If mask still mismatches, try broadcast across T:
-                    try:
-                        alphas = self._core.process_video(frames_btchw, mask_bt1hw)
-                    except RuntimeError:
-                        if mask_t is not None:
-                            T = frames_t.shape[0]
-                            mask_bt1hw = mask_t.unsqueeze(0).unsqueeze(0).expand(1, T, 1, *mask_t.shape[-2:])  # 1,T,1,H,W
-                        alphas = self._core.process_video(frames_btchw, mask_bt1hw)
                 else:
-                    raise
         # Normalize to numpy list of HW float32 [0,1]
         if isinstance(alphas, torch.Tensor):

 import time
 import torch
 import logging
+import tempfile
 import numpy as np
 from pathlib import Path
 from typing import Optional, Callable, Tuple, Union
         return alpha_np
+    def _harvest_process_video_output(self, res, out_dir: Path, base: str) -> Tuple[Path, Path]:
+        """
+        Accepts varied return types from MatAnyone.process_video and produces
+        (alpha.mp4, fg.mp4) inside out_dir. Strategies:
+          - If res is a sequence of alpha arrays/tensors → write our own videos.
+          - If res is dict/tuple of paths → copy/rename.
+          - Else: glob typical output dirs for files matching base.
+        """
+        # Case A: sequence of masks
+        import torch, numpy as np, cv2, glob, shutil
+        def _as_np(a):
+            if isinstance(a, torch.Tensor):
+                a = a.detach().float().cpu().numpy()
+            a = np.asarray(a)
+            if a.ndim == 3 and a.shape[0] in (1,3):   # (C,H,W) → prefer HW
+                a = np.squeeze(a) if a.shape[0] == 1 else np.mean(a, axis=0)
+            if a.max() > 1.0:
+                a = a / 255.0
+            return a.clip(0,1).astype(np.float32)
+        alpha_mp4 = out_dir / "alpha.mp4"
+        fg_mp4    = out_dir / "fg.mp4"
+        # If we got arrays/tensors: we can't reconstruct FG without original frames here,
+        # so prefer path-returning flows. If needed, you can extend this to re-read frames
+        # and blend. For now, try to detect paths first.
+        if isinstance(res, dict):
+            cand_alpha = res.get("alpha") or res.get("alpha_path") or res.get("matte") or res.get("matte_path")
+            cand_fg    = res.get("fg")    or res.get("fg_path")    or res.get("foreground") or res.get("foreground_path")
+            moved = 0
+            if cand_alpha and Path(cand_alpha).exists():
+                shutil.copy2(cand_alpha, alpha_mp4); moved += 1
+            if cand_fg and Path(cand_fg).exists():
+                shutil.copy2(cand_fg, fg_mp4); moved += 1
+            if moved == 2: return alpha_mp4, fg_mp4
+        if isinstance(res, (list, tuple)) and len(res) >= 1:
+            # Heuristic: assume list/tuple of file paths
+            paths = [Path(x) for x in res if isinstance(x, (str, Path))]
+            if paths:
+                # Pick best matches by name
+                alpha_candidates = [p for p in paths if p.exists() and ("alpha" in p.name or "matte" in p.name)]
+                fg_candidates    = [p for p in paths if p.exists() and ("fg" in p.name or "fore" in p.name)]
+                if alpha_candidates and fg_candidates:
+                    shutil.copy2(alpha_candidates[0], alpha_mp4)
+                    shutil.copy2(fg_candidates[0], fg_mp4)
+                    return alpha_mp4, fg_mp4
+        # As last resort, glob common dirs created by the lib
+        search_dirs = [Path.cwd(), out_dir, Path("results"), Path("result"), Path("output"), Path("outputs")]
+        hits = []
+        for d in search_dirs:
+            if d.exists():
+                hits.extend(list(d.rglob(f"*{base}*.*")))
+        # choose best alpha/fg
+        alpha_candidates = [p for p in hits if p.suffix.lower() in (".mp4",".mov",".mkv",".avi") and ("alpha" in p.name or "matte" in p.name)]
+        fg_candidates    = [p for p in hits if p.suffix.lower() in (".mp4",".mov",".mkv",".avi") and ("fg" in p.name or "fore" in p.name)]
+        if alpha_candidates and fg_candidates:
+            import shutil
+            shutil.copy2(alpha_candidates[0], alpha_mp4)
+            shutil.copy2(fg_candidates[0], fg_mp4)
+            return alpha_mp4, fg_mp4
+        raise MatAnyError("MatAnyone.process_video did not yield discoverable outputs.")
     def process_stream(
         self,
         video_path: Path,
         seed_mask_path: Optional[Path] = None,
         out_dir: Optional[Path] = None,
         progress_cb: Optional[Callable] = None,
     ) -> Tuple[Path, Path]:
         """Matte a video with MatAnyone and write ``alpha.mp4`` and ``fg.mp4``.

         Two code paths exist, selected by ``self._api_mode``:

         * ``"process_video"``: the core is handed the video *path* and its
           result is normalized by ``_harvest_process_video_output``.
         * anything else: frames are read one at a time, matted with
           ``_run_frame`` and written out immediately.

         Args:
             video_path: Input video file (``str`` or ``Path``).
             seed_mask_path: Optional seed mask image; only applied to the
                 first frame in frame-by-frame mode.
             out_dir: Output directory (default: ``video_path.parent``).
             progress_cb: Optional progress callback, forwarded to
                 ``_emit_progress``.

         Returns:
             Tuple ``(alpha_path, fg_path)`` of the written videos.

         Raises:
             MatAnyError: If the input video or the output writers cannot be
                 opened. Errors raised while matting propagate unchanged.
         """
         # Accept plain strings too: .parent / .stem below need a Path.
         video_path = Path(video_path)
         out_dir = video_path.parent if out_dir is None else Path(out_dir)
         out_dir.mkdir(parents=True, exist_ok=True)

         # Probe the container once; the capture is reopened for the frame loop.
         cap = cv2.VideoCapture(str(video_path))
         try:
             if not cap.isOpened():
                 raise MatAnyError(f"Failed to open video: {video_path}")
             N = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
             fps = cap.get(cv2.CAP_PROP_FPS)
             W = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
             H = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
         finally:
             cap.release()
         # Some containers report 0 (or NaN) fps, which a VideoWriter cannot use.
         if not (fps and fps > 0):
             fps = 25.0
         log.info(f"[MATANY] Processing {N} frames ({W}x{H} @ {fps:.1f}fps) from {video_path}")

         if self._api_mode == "process_video":
             # --- PATH-BASED CALL (this wheel expects a video path, not tensors) ---
             _emit_progress(progress_cb, 0.05, "MatAnyone (video mode)…")
             seed_arg = None if seed_mask_path is None else str(seed_mask_path)
             # Some builds accept (video_path, seed_mask_path), others just (video_path)
             try:
                 res = self._core.process_video(str(video_path), seed_arg)
             except TypeError:
                 if seed_arg is not None:
                     # Make the loss of the seed mask visible instead of silent.
                     log.warning("[MATANY] process_video(video, mask) raised TypeError; "
                                 "retrying with the video path only (seed mask ignored)")
                 res = self._core.process_video(str(video_path))
             # Normalize whatever we got back into alpha.mp4 + fg.mp4 in out_dir
             alpha_path, fg_path = self._harvest_process_video_output(res, out_dir, base=video_path.stem)
             _validate_nonempty(alpha_path)
             _validate_nonempty(fg_path)
             _emit_progress(progress_cb, 1.0, "MatAnyone complete")
             return alpha_path, fg_path

         # Frame-by-frame (preferred)
         log.info(f"[MATANY] Using frame-by-frame mode: {self._api_mode}")
         alpha_path = out_dir / "alpha.mp4"
         fg_path = out_dir / "fg.mp4"
         fourcc = cv2.VideoWriter_fourcc(*"mp4v")
         cap = alpha_writer = fg_writer = None
         try:
             cap = cv2.VideoCapture(str(video_path))
             if not cap.isOpened():
                 raise MatAnyError(f"Failed to open video: {video_path}")
             # Both writers take 3-channel BGR frames: the alpha matte is written
             # as a gray BGR image, so the writer must be opened in color mode.
             alpha_writer = cv2.VideoWriter(str(alpha_path), fourcc, fps, (W, H), isColor=True)
             fg_writer = cv2.VideoWriter(str(fg_path), fourcc, fps, (W, H), isColor=True)
             if not (alpha_writer.isOpened() and fg_writer.isOpened()):
                 raise MatAnyError(f"Failed to open video writers in: {out_dir}")

             # Load seed mask if provided.
             # NOTE(review): _run_frame takes ``seed_1hw``, so the mask read from
             # disk is passed through _mask_to_1hw (a module helper not visible
             # here) before use; confirm that helper is still defined.
             seed_1hw = None
             if seed_mask_path is not None:
                 seed_1hw = _mask_to_1hw(_read_mask_hw(seed_mask_path, (H, W)))

             idx = 0
             while True:
                 ret, frame = cap.read()
                 if not ret:
                     break
                 if idx % 10 == 0:
                     _emit_progress(progress_cb, min(0.999, (idx / N) if N > 0 else 0.0),
                                    f"MatAnyone matting… ({idx}/{N})")
                 log.debug(f"[MATANY] Processing frame {idx+1}/{N}")
                 # Only pass seed mask on first frame
                 current_mask = seed_1hw if idx == 0 else None
                 alpha_hw = self._run_frame(frame, current_mask, is_first=(idx == 0))
                 # Guard the uint8 conversion against values slightly outside 0..1.
                 alpha_hw = np.clip(alpha_hw, 0.0, 1.0)
                 # alpha 0..1 -> 0..255 3-channel grayscale
                 alpha_u8 = (alpha_hw * 255.0 + 0.5).astype(np.uint8)
                 alpha_rgb = cv2.cvtColor(alpha_u8, cv2.COLOR_GRAY2BGR)
                 # Blend: fg = alpha*frame + (1-alpha)*black == alpha*frame.
                 # alpha is already 0..1, so it must NOT be divided by 255 again.
                 fg_bgr = (frame.astype(np.float32) * alpha_hw[..., None]).clip(0, 255).astype(np.uint8)
                 # Write outputs
                 alpha_writer.write(alpha_rgb)
                 fg_writer.write(fg_bgr)
                 idx += 1
         finally:
             # Cleanup only: a ``return`` here would swallow any exception
             # raised above and report a failed run as a success.
             for handle in (cap, alpha_writer, fg_writer):
                 if handle is not None:
                     handle.release()

         _validate_nonempty(alpha_path)
         _validate_nonempty(fg_path)
         _emit_progress(progress_cb, 1.0, "MatAnyone complete")
         return alpha_path, fg_path
     def _flush_chunk(self, frames_bgr, seed_1hw, alpha_writer, fg_writer):
+        """Process a chunk of frames with MatAnyone."""
+        if not frames_bgr:
+            return
         # Prepare inputs
         frames_chw = [_to_chw01(f) for f in frames_bgr]                     # list of CHW
         frames_t   = torch.from_numpy(np.stack(frames_chw)).to(self.device) # T,C,H,W
         with torch.no_grad(), self._maybe_amp():
             try:
+                # Try direct tensor processing first (newer versions)
+                if hasattr(self._core, '_process_tensor_video'):
+                    alphas = self._core._process_tensor_video(frames_t, mask_t)
                 else:
+                    # Fall back to file-based processing if tensor API not available
+                    with tempfile.TemporaryDirectory() as tmpdir:
+                        # Save frames to temp directory
+                        frame_paths = []
+                        for i, frame in enumerate(frames_bgr):
+                            path = os.path.join(tmpdir, f'frame_{i:06d}.png')
+                            cv2.imwrite(path, frame)
+                            frame_paths.append(path)
+                        # Process video from frames
+                        alphas = self._core.process_video(tmpdir,
+                                                         mask_path=seed_1hw_path if seed_1hw is not None else None)
+                        # Ensure alphas is a tensor
+                        if not isinstance(alphas, torch.Tensor):
+                            alphas = torch.from_numpy(alphas).to(self.device)
+            except Exception as e:
+                log.error(f"Error in _flush_chunk: {str(e)}")
+                raise
         # Normalize to numpy list of HW float32 [0,1]
         if isinstance(alphas, torch.Tensor):