MogensR committed on
Commit
975ab1f
·
1 Parent(s): 9923851
Files changed (1) hide show
  1. models/matanyone_loader.py +263 -227
models/matanyone_loader.py CHANGED
@@ -1,12 +1,13 @@
1
  #!/usr/bin/env python3
 
 
 
2
  """
3
- MatAnyone Adapter (streaming, API-agnostic)
4
- -------------------------------------------
5
  - Supports multiple MatAnyone variants:
6
  * frame API: core.step(image[, mask]) or core.process_frame(image, mask)
7
  * video API: core.process_video(video_path[, mask_path]) [DISABLED BY DEFAULT]
8
  - Streams frames: no full-video-in-RAM.
9
- - Emits alpha.mp4 (grayscale-as-BGR for compatibility) and fg.mp4 (RGB-on-black) as it goes.
10
  - Validates outputs and raises MatAnyError on failure (so pipeline can fallback).
11
 
12
  I/O conventions:
@@ -17,31 +18,37 @@
17
  Requires: OpenCV, Torch, NumPy
18
  """
19
 
 
 
 
20
  from __future__ import annotations
 
21
  import os
22
  import cv2
23
  import time
24
  import shutil
25
- import torch
26
  import logging
27
  import numpy as np
 
 
28
  from pathlib import Path
29
  from typing import Optional, Callable, Tuple, List
30
 
31
  log = logging.getLogger(__name__)
32
 
33
 
34
- # -----------------------------
35
- # Small utilities
36
- # -----------------------------
37
  def _emit_progress(cb, pct: float, msg: str):
 
38
  if not cb:
39
  return
40
  try:
41
- cb(pct, msg) # preferred 2-args
42
  except TypeError:
43
  try:
44
- cb(msg) # legacy 1-arg
45
  except TypeError:
46
  pass
47
 
@@ -52,6 +59,7 @@ class MatAnyError(RuntimeError):
52
 
53
 
54
  def _cuda_snapshot(device: Optional[torch.device] = None) -> str:
 
55
  if not torch.cuda.is_available():
56
  return "CUDA: N/A"
57
  idx = 0
@@ -64,6 +72,7 @@ def _cuda_snapshot(device: Optional[torch.device] = None) -> str:
64
 
65
 
66
  def _safe_empty_cache():
 
67
  if torch.cuda.is_available():
68
  try:
69
  torch.cuda.synchronize()
@@ -73,7 +82,7 @@ def _safe_empty_cache():
73
 
74
 
75
  def _read_mask_hw(mask_path: Path, target_hw: Tuple[int, int]) -> np.ndarray:
76
- """Read mask image, convert to float32 [0,1], resize to target (H,W)."""
77
  if not Path(mask_path).exists():
78
  raise MatAnyError(f"Seed mask not found: {mask_path}")
79
  mask = cv2.imread(str(mask_path), cv2.IMREAD_GRAYSCALE)
@@ -83,34 +92,28 @@ def _read_mask_hw(mask_path: Path, target_hw: Tuple[int, int]) -> np.ndarray:
83
  if mask.shape[:2] != (H, W):
84
  mask = cv2.resize(mask, (W, H), interpolation=cv2.INTER_LINEAR)
85
  maskf = (mask.astype(np.float32) / 255.0).clip(0.0, 1.0)
86
- return maskf # (H, W)
87
-
88
-
89
- def _to_hwc01(img_bgr: np.ndarray) -> np.ndarray:
90
- """BGR [H,W,3] uint8 -> HWC float32 [0,1] RGB."""
91
- rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
92
- rgbf = rgb.astype(np.float32) / 255.0
93
- return rgbf # (H, W, 3)
94
 
95
 
96
  def _to_chw01(img_bgr: np.ndarray) -> np.ndarray:
97
  """BGR [H,W,3] uint8 -> CHW float32 [0,1] RGB."""
98
  rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
99
  rgbf = rgb.astype(np.float32) / 255.0
100
- chw = np.transpose(rgbf, (2, 0, 1)) # (3, H, W)
101
  return chw
102
 
103
 
104
  def _validate_nonempty(file_path: Path) -> None:
 
105
  if not file_path.exists() or file_path.stat().st_size == 0:
106
  raise MatAnyError(f"Output file missing/empty: {file_path}")
107
 
108
 
109
  def _select_matany_mode(core) -> str:
110
  """
111
- Pick the best-available MatAnyone API at runtime.
112
  Priority: process_video > process_frame > step
113
- (Note: we force frame mode in _lazy_init; this helper is used only in a chunk helper.)
114
  """
115
  if hasattr(core, "process_video") and callable(getattr(core, "process_video")):
116
  return "process_video"
@@ -121,9 +124,9 @@ def _select_matany_mode(core) -> str:
121
  raise MatAnyError("No supported MatAnyone API on core (process_video/process_frame/step).")
122
 
123
 
124
- # -----------------------------
125
- # Main session
126
- # -----------------------------
127
  class MatAnyoneSession:
128
  """
129
  Unified, streaming wrapper over MatAnyone variants.
@@ -133,24 +136,23 @@ class MatAnyoneSession:
133
  -> returns (alpha_path, fg_path)
134
  """
135
 
 
 
 
136
  def __init__(self, device: Optional[str] = None, precision: str = "auto"):
137
  """
138
  Args:
139
  device: 'cuda', 'cpu', 'cuda:0', etc. If None, auto-detects CUDA.
140
  precision: 'auto' | 'fp32' | 'fp16'
141
  """
142
- self.device = torch.device(device) if device else (torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu"))
 
 
143
  self.precision = precision.lower()
144
  self.use_fp16 = (self.precision == "fp16") or (self.precision == "auto" and self.device.type == "cuda")
145
  self._core = None
146
  self._api_mode = None
147
  self._initialized = False
148
-
149
- # chosen builders after first frame succeeds
150
- self._build_img = None # Callable[[np.ndarray], torch.Tensor]
151
- self._build_msk = None # Optional[Callable[[np.ndarray], Optional[torch.Tensor]]]
152
- self._layout_name = None
153
-
154
  self._lazy_init()
155
 
156
  log.info(f"Initialized MatAnyoneSession on {self.device} | precision={self.precision}, use_fp16={self.use_fp16}")
@@ -159,8 +161,8 @@ def __init__(self, device: Optional[str] = None, precision: str = "auto"):
159
  log.info(f"CUDA device: {torch.cuda.get_device_name(idx)}")
160
  self._log_gpu_memory()
161
 
162
- # ---- internals ----
163
  def _log_gpu_memory(self) -> Tuple[float, float]:
 
164
  if torch.cuda.is_available():
165
  idx = self.device.index if isinstance(self.device, torch.device) and self.device.index is not None else 0
166
  try:
@@ -172,8 +174,11 @@ def _log_gpu_memory(self) -> Tuple[float, float]:
172
  log.warning(f"Failed to read GPU memory: {e}")
173
  return 0.0, 0.0
174
 
 
 
 
175
  def _lazy_init(self) -> None:
176
- """Import and initialize the MatAnyone InferenceCore and choose API mode."""
177
  try:
178
  from matanyone.inference.inference_core import InferenceCore # type: ignore
179
  except ImportError as e:
@@ -187,20 +192,43 @@ def _lazy_init(self) -> None:
187
  except TypeError:
188
  self._core = InferenceCore("PeiqingYang/MatAnyone")
189
 
190
- # --- Force reliable frame-by-frame mode (avoid process_video) ---
191
  if hasattr(self._core, "process_frame"):
192
  self._api_mode = "process_frame"
193
  elif hasattr(self._core, "step"):
194
  self._api_mode = "step"
195
  else:
196
  raise MatAnyError(
197
- "MatAnyone build has no frame API (process_frame/step). "
198
- "Cannot proceed safely."
199
  )
200
 
201
  log.info(f"[MATANY] API mode forced to: {self._api_mode} (video-mode disabled)")
 
 
 
 
202
  self._initialized = True
203
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
204
  def _maybe_amp(self):
205
  enabled = (self.device.type == "cuda")
206
  if self.precision == "fp32":
@@ -210,6 +238,9 @@ def _maybe_amp(self):
210
  # auto
211
  return torch.amp.autocast(device_type="cuda", enabled=enabled and self.use_fp16)
212
 
 
 
 
213
  def _validate_input_frame(self, frame: np.ndarray) -> None:
214
  if not isinstance(frame, np.ndarray):
215
  raise MatAnyError(f"Frame must be numpy.ndarray, got {type(frame)}")
@@ -218,125 +249,118 @@ def _validate_input_frame(self, frame: np.ndarray) -> None:
218
  if frame.ndim != 3 or frame.shape[2] != 3:
219
  raise MatAnyError(f"Frame must be HWC with 3 channels, got {frame.shape}")
220
 
221
- def _core_call(self, img_t: torch.Tensor, mask_t: Optional[torch.Tensor]):
222
- if self._api_mode == "step":
223
- return self._core.step(img_t, mask_t) if mask_t is not None else self._core.step(img_t)
224
- elif self._api_mode == "process_frame":
225
- return self._core.process_frame(img_t, mask_t) if mask_t is not None else self._core.process_frame(img_t)
226
- raise MatAnyError("Internal error: unknown API mode")
227
-
228
- # ---- builders for probing ----
229
- def _mk_builder_bchw(self) -> Tuple[str, Callable[[np.ndarray], torch.Tensor], Callable[[np.ndarray], Optional[torch.Tensor]]]:
230
- def b_img(frame_bgr: np.ndarray) -> torch.Tensor:
231
- chw = _to_chw01(frame_bgr)
232
- return torch.from_numpy(chw).unsqueeze(0).contiguous().to(self.device, dtype=torch.float32, non_blocking=True) # [1,3,H,W]
233
- def b_msk(seed_hw: np.ndarray) -> torch.Tensor:
234
- return torch.from_numpy(seed_hw).unsqueeze(0).unsqueeze(0).contiguous().to(self.device, dtype=torch.float32, non_blocking=True) # [1,1,H,W]
235
- return "BCHW+B1HW", b_img, b_msk
236
-
237
- def _mk_builder_bchw_nomask(self) -> Tuple[str, Callable[[np.ndarray], torch.Tensor], Callable[[np.ndarray], Optional[torch.Tensor]]]:
238
- def b_img(frame_bgr: np.ndarray) -> torch.Tensor:
239
- chw = _to_chw01(frame_bgr)
240
- return torch.from_numpy(chw).unsqueeze(0).contiguous().to(self.device, dtype=torch.float32, non_blocking=True)
241
- def b_msk(_: np.ndarray) -> Optional[torch.Tensor]:
242
- return None
243
- return "BCHW+None", b_img, b_msk
244
-
245
- def _mk_builder_btchw(self) -> Tuple[str, Callable[[np.ndarray], torch.Tensor], Callable[[np.ndarray], Optional[torch.Tensor]]]:
246
- def b_img(frame_bgr: np.ndarray) -> torch.Tensor:
247
- chw = _to_chw01(frame_bgr)
248
- return torch.from_numpy(chw).unsqueeze(0).unsqueeze(1).contiguous().to(self.device, dtype=torch.float32, non_blocking=True) # [1,1,3,H,W]
249
- def b_msk(seed_hw: np.ndarray) -> torch.Tensor:
250
- return torch.from_numpy(seed_hw).unsqueeze(0).unsqueeze(0).unsqueeze(0).contiguous().to(self.device, dtype=torch.float32, non_blocking=True) # [1,1,1,H,W]
251
- return "BTCHW+B1THW", b_img, b_msk
252
-
253
- def _mk_builder_chw(self) -> Tuple[str, Callable[[np.ndarray], torch.Tensor], Callable[[np.ndarray], Optional[torch.Tensor]]]:
254
- def b_img(frame_bgr: np.ndarray) -> torch.Tensor:
255
- chw = _to_chw01(frame_bgr)
256
- return torch.from_numpy(chw).contiguous().to(self.device, dtype=torch.float32, non_blocking=True) # [3,H,W]
257
- def b_msk(seed_hw: np.ndarray) -> torch.Tensor:
258
- return torch.from_numpy(seed_hw).unsqueeze(0).contiguous().to(self.device, dtype=torch.float32, non_blocking=True) # [1,H,W]
259
- return "CHW+1HW", b_img, b_msk
260
-
261
- def _mk_builder_hwc(self) -> Tuple[str, Callable[[np.ndarray], torch.Tensor], Callable[[np.ndarray], Optional[torch.Tensor]]]:
262
- def b_img(frame_bgr: np.ndarray) -> torch.Tensor:
263
- hwc = _to_hwc01(frame_bgr)
264
- return torch.from_numpy(hwc).contiguous().to(self.device, dtype=torch.float32, non_blocking=True) # [H,W,3]
265
- def b_msk(seed_hw: np.ndarray) -> torch.Tensor:
266
- return torch.from_numpy(seed_hw).contiguous().to(self.device, dtype=torch.float32, non_blocking=True) # [H,W]
267
- return "HWC+HW", b_img, b_msk
268
-
269
- def _run_frame(self, frame_bgr: np.ndarray, seed_hw: Optional[np.ndarray], is_first: bool) -> np.ndarray:
270
  """
271
- Returns alpha matte as 2D np.float32 in [0,1].
272
- - On first frame, try several (image,mask) layout combos and remember the winner.
273
- - On later frames, use the recorded builders (mask is None).
274
  """
275
  self._validate_input_frame(frame_bgr)
276
 
277
- # Later frames: use the memorized builders
278
- if self._build_img is not None and not is_first:
279
- img_t = self._build_img(frame_bgr)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
280
  with torch.no_grad(), self._maybe_amp():
281
- out = self._core_call(img_t, None)
282
- alpha_np = out.detach().float().clamp(0, 1).squeeze().cpu().numpy() if isinstance(out, torch.Tensor) \
283
- else np.asarray(out, dtype=np.float32)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
284
  if alpha_np.max() > 1.0:
285
  alpha_np = alpha_np / 255.0
286
- alpha_np = np.squeeze(alpha_np)
287
- if alpha_np.ndim != 2:
288
- raise MatAnyError(f"Expected 2D alpha matte; got shape {alpha_np.shape}")
289
- return alpha_np.astype(np.float32)
290
-
291
- # First frame: probe combos
292
- attempts = [
293
- self._mk_builder_bchw(), # [1,3,H,W] + [1,1,H,W]
294
- self._mk_builder_bchw_nomask(), # [1,3,H,W] + None
295
- self._mk_builder_btchw(), # [1,1,3,H,W] + [1,1,1,H,W]
296
- self._mk_builder_chw(), # [3,H,W] + [1,H,W]
297
- self._mk_builder_hwc(), # [H,W,3] + [H,W]
298
- ]
299
-
300
- last_err = None
301
- for name, mk_img, mk_msk in attempts:
302
- try:
303
- img_t = mk_img(frame_bgr)
304
- mask_t = None
305
- if seed_hw is not None:
306
- mask_t = mk_msk(seed_hw)
307
-
308
- log.info(f"[MATANY] Trying layout: {name} | img.shape={tuple(img_t.shape)}"
309
- f"{'' if mask_t is None else ' mask.shape=' + str(tuple(mask_t.shape))}")
310
-
311
- with torch.no_grad(), self._maybe_amp():
312
- out = self._core_call(img_t, mask_t)
313
-
314
- # success → remember builders for subsequent frames
315
- self._build_img = mk_img
316
- # after first frame, we won't pass mask anymore
317
- self._build_msk = mk_msk
318
- self._layout_name = name
319
- log.info(f"[MATANY] Selected layout: {name}")
320
-
321
- alpha_np = out.detach().float().clamp(0, 1).squeeze().cpu().numpy() if isinstance(out, torch.Tensor) \
322
- else np.asarray(out, dtype=np.float32)
323
- if alpha_np.max() > 1.0:
324
- alpha_np = alpha_np / 255.0
325
- alpha_np = np.squeeze(alpha_np)
326
- if alpha_np.ndim != 2:
327
- raise MatAnyError(f"Expected 2D alpha matte; got shape {alpha_np.shape}")
328
- return alpha_np.astype(np.float32)
329
 
330
- except Exception as e:
331
- last_err = e
332
- log.warning(f"[MATANY] Layout attempt failed ({name}): {e}")
333
 
334
- snap = _cuda_snapshot(self.device)
335
- raise MatAnyError(f"MatAnyone first-frame probe failed for all layouts. Last error: {last_err} | {snap}")
336
 
337
- # -----------------------------
338
- # Public API
339
- # -----------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
340
  def process_stream(
341
  self,
342
  video_path: Path,
@@ -345,7 +369,7 @@ def process_stream(
345
  progress_cb: Optional[Callable] = None,
346
  ) -> Tuple[Path, Path]:
347
  """
348
- Process a video with MatAnyone.
349
 
350
  Returns:
351
  (alpha_path, fg_path)
@@ -376,74 +400,99 @@ def process_stream(
376
  log.info(f"[MATANY] {video_path.name}: {N} frames {W}x{H} @ {fps:.2f} fps")
377
  _emit_progress(progress_cb, 0.05, f"Video: {N} frames {W}x{H} @ {fps:.2f} fps")
378
 
379
- # Writers (alpha as BGR grayscale for broad mp4v compatibility)
380
  alpha_path = out_dir / "alpha.mp4"
381
  fg_path = out_dir / "fg.mp4"
382
- fourcc = cv2.VideoWriter_fourcc(*'mp4v')
383
-
384
- cap = cv2.VideoCapture(str(video_path))
385
- if not cap.isOpened():
386
- raise MatAnyError(f"Failed to open video for reading: {video_path}")
387
-
388
- alpha_writer = cv2.VideoWriter(str(alpha_path), fourcc, fps, (W, H), True) # isColor=True
389
- fg_writer = cv2.VideoWriter(str(fg_path), fourcc, fps, (W, H), True)
390
- if not alpha_writer.isOpened() or not fg_writer.isOpened():
391
- raise MatAnyError("Failed to initialize VideoWriter(s)")
392
-
393
- # Optional seed mask for first frame
394
- seed_hw = None
395
- if seed_mask_path is not None:
396
- seed_hw = _read_mask_hw(Path(seed_mask_path), (H, W))
397
-
398
- idx = 0
399
- last_tick = time.time()
400
- start = time.time()
401
 
402
  try:
403
- while True:
404
- ret, frame = cap.read()
405
- if not ret:
406
- break
407
-
408
- alpha_hw = self._run_frame(frame, seed_hw if idx == 0 else None, is_first=(idx == 0))
409
-
410
- # Compose outputs
411
- alpha_u8 = (alpha_hw * 255.0 + 0.5).astype(np.uint8)
412
- alpha_bgr = cv2.cvtColor(alpha_u8, cv2.COLOR_GRAY2BGR)
413
- fg_bgr = (frame.astype(np.float32) * alpha_hw[..., None]).clip(0, 255).astype(np.uint8)
414
-
415
- alpha_writer.write(alpha_bgr)
416
- fg_writer.write(fg_bgr)
417
-
418
- idx += 1
419
- # progress & ETA
420
- if N > 0 and (idx % max(5, N // 100) == 0 or (time.time() - last_tick) > 2.0):
421
- elapsed = time.time() - start
422
- prog = idx / max(1, N)
423
- eta_s = (elapsed / prog) * (1.0 - prog) if prog > 0 else 0.0
424
- if eta_s > 3600:
425
- eta = f"{eta_s/3600:.1f} h"
426
- elif eta_s > 60:
427
- eta = f"{eta_s/60:.1f} m"
428
- else:
429
- eta = f"{eta_s:.0f} s"
430
- fps_run = idx / elapsed if elapsed > 0 else 0.0
431
- gpu_tail = ""
432
- if torch.cuda.is_available():
433
- idx_dev = self.device.index if self.device.index is not None else 0
434
- mem_a = torch.cuda.memory_allocated(idx_dev) / 1024**2
435
- mem_r = torch.cuda.memory_reserved(idx_dev) / 1024**2
436
- gpu_tail = f" | GPU {mem_a:.0f}/{mem_r:.0f}MB"
437
- _emit_progress(progress_cb, min(0.99, prog), f"Frame {idx}/{N} • {fps_run:.1f} FPS • ETA {eta}{gpu_tail}")
438
- last_tick = time.time()
439
-
440
- # finalize
441
- _validate_nonempty(alpha_path)
442
- _validate_nonempty(fg_path)
443
- total = time.time() - start
444
- fps_run = idx / total if total > 0 else 0.0
445
- _emit_progress(progress_cb, 1.0, f"Complete! {idx} frames at {fps_run:.1f} FPS")
446
- return alpha_path, fg_path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
447
 
448
  except Exception as e:
449
  msg = f"Error during video processing: {e}"
@@ -452,20 +501,7 @@ def process_stream(
452
  msg += f" | {_cuda_snapshot(self.device)}"
453
  _emit_progress(progress_cb, -1, msg)
454
  raise MatAnyError(msg) from e
455
- finally:
456
- try:
457
- if cap and hasattr(cap, "isOpened") and cap.isOpened():
458
- cap.release()
459
- except Exception:
460
- pass
461
- try:
462
- if alpha_writer:
463
- alpha_writer.release()
464
- except Exception:
465
- pass
466
- try:
467
- if fg_writer:
468
- fg_writer.release()
469
- except Exception:
470
- pass
471
- _safe_empty_cache()
 
1
  #!/usr/bin/env python3
2
+ # =============================================================================
3
+ # MatAnyone Adapter (streaming, API-agnostic) — with chapter markers
4
+ # =============================================================================
5
  """
 
 
6
  - Supports multiple MatAnyone variants:
7
  * frame API: core.step(image[, mask]) or core.process_frame(image, mask)
8
  * video API: core.process_video(video_path[, mask_path]) [DISABLED BY DEFAULT]
9
  - Streams frames: no full-video-in-RAM.
10
+ - Emits alpha.mp4 (grayscale-as-BGR for compatibility) and fg.mp4 (RGB-on-black).
11
  - Validates outputs and raises MatAnyError on failure (so pipeline can fallback).
12
 
13
  I/O conventions:
 
18
  Requires: OpenCV, Torch, NumPy
19
  """
20
 
21
+ # =============================================================================
22
+ # CHAPTER 0 — Imports & logging
23
+ # =============================================================================
24
  from __future__ import annotations
25
+
26
  import os
27
  import cv2
28
  import time
29
  import shutil
 
30
  import logging
31
  import numpy as np
32
+ import torch
33
+
34
  from pathlib import Path
35
  from typing import Optional, Callable, Tuple, List
36
 
37
  log = logging.getLogger(__name__)
38
 
39
 
40
+ # =============================================================================
41
+ # CHAPTER 1 — Small utilities
42
+ # =============================================================================
43
  def _emit_progress(cb, pct: float, msg: str):
44
+ """Route progress to callback (supports new 2-arg and legacy 1-arg styles)."""
45
  if not cb:
46
  return
47
  try:
48
+ cb(pct, msg) # preferred 2-arg
49
  except TypeError:
50
  try:
51
+ cb(msg) # legacy 1-arg
52
  except TypeError:
53
  pass
54
 
 
59
 
60
 
61
  def _cuda_snapshot(device: Optional[torch.device] = None) -> str:
62
+ """Human-friendly GPU memory snapshot."""
63
  if not torch.cuda.is_available():
64
  return "CUDA: N/A"
65
  idx = 0
 
72
 
73
 
74
  def _safe_empty_cache():
75
+ """Synchronize and empty CUDA cache if present (best-effort)."""
76
  if torch.cuda.is_available():
77
  try:
78
  torch.cuda.synchronize()
 
82
 
83
 
84
  def _read_mask_hw(mask_path: Path, target_hw: Tuple[int, int]) -> np.ndarray:
85
+ """Read mask, convert to float32 [0,1], resize to target (H,W)."""
86
  if not Path(mask_path).exists():
87
  raise MatAnyError(f"Seed mask not found: {mask_path}")
88
  mask = cv2.imread(str(mask_path), cv2.IMREAD_GRAYSCALE)
 
92
  if mask.shape[:2] != (H, W):
93
  mask = cv2.resize(mask, (W, H), interpolation=cv2.INTER_LINEAR)
94
  maskf = (mask.astype(np.float32) / 255.0).clip(0.0, 1.0)
95
+ return maskf
 
 
 
 
 
 
 
96
 
97
 
98
def _to_chw01(img_bgr: np.ndarray) -> np.ndarray:
    """BGR [H,W,3] uint8 -> CHW float32 [0,1] RGB."""
    rgb01 = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB).astype(np.float32) / 255.0
    return np.transpose(rgb01, (2, 0, 1))  # (3, H, W)
104
 
105
 
106
  def _validate_nonempty(file_path: Path) -> None:
107
+ """Ensure output file exists and is non-empty."""
108
  if not file_path.exists() or file_path.stat().st_size == 0:
109
  raise MatAnyError(f"Output file missing/empty: {file_path}")
110
 
111
 
112
  def _select_matany_mode(core) -> str:
113
  """
114
+ Inspect available APIs.
115
  Priority: process_video > process_frame > step
116
+ (Note: we still force frame mode in _lazy_init; this helper is used by chunk helper.)
117
  """
118
  if hasattr(core, "process_video") and callable(getattr(core, "process_video")):
119
  return "process_video"
 
124
  raise MatAnyError("No supported MatAnyone API on core (process_video/process_frame/step).")
125
 
126
 
127
+ # =============================================================================
128
+ # CHAPTER 2 — Main session
129
+ # =============================================================================
130
  class MatAnyoneSession:
131
  """
132
  Unified, streaming wrapper over MatAnyone variants.
 
136
  -> returns (alpha_path, fg_path)
137
  """
138
 
139
+ # -------------------------------------------------------------------------
140
+ # 2.1 — Init & device
141
+ # -------------------------------------------------------------------------
142
  def __init__(self, device: Optional[str] = None, precision: str = "auto"):
143
  """
144
  Args:
145
  device: 'cuda', 'cpu', 'cuda:0', etc. If None, auto-detects CUDA.
146
  precision: 'auto' | 'fp32' | 'fp16'
147
  """
148
+ self.device = torch.device(device) if device else (
149
+ torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
150
+ )
151
  self.precision = precision.lower()
152
  self.use_fp16 = (self.precision == "fp16") or (self.precision == "auto" and self.device.type == "cuda")
153
  self._core = None
154
  self._api_mode = None
155
  self._initialized = False
 
 
 
 
 
 
156
  self._lazy_init()
157
 
158
  log.info(f"Initialized MatAnyoneSession on {self.device} | precision={self.precision}, use_fp16={self.use_fp16}")
 
161
  log.info(f"CUDA device: {torch.cuda.get_device_name(idx)}")
162
  self._log_gpu_memory()
163
 
 
164
  def _log_gpu_memory(self) -> Tuple[float, float]:
165
+ """Log current GPU memory usage (MB)."""
166
  if torch.cuda.is_available():
167
  idx = self.device.index if isinstance(self.device, torch.device) and self.device.index is not None else 0
168
  try:
 
174
  log.warning(f"Failed to read GPU memory: {e}")
175
  return 0.0, 0.0
176
 
177
+ # -------------------------------------------------------------------------
178
+ # 2.2 — Lazy init of MatAnyone core & API selection + API probe
179
+ # -------------------------------------------------------------------------
180
  def _lazy_init(self) -> None:
181
+ """Import and initialize the MatAnyone InferenceCore, choose API mode, and probe capabilities."""
182
  try:
183
  from matanyone.inference.inference_core import InferenceCore # type: ignore
184
  except ImportError as e:
 
192
  except TypeError:
193
  self._core = InferenceCore("PeiqingYang/MatAnyone")
194
 
195
+ # ---- Force reliable frame-by-frame mode (avoid process_video by default)
196
  if hasattr(self._core, "process_frame"):
197
  self._api_mode = "process_frame"
198
  elif hasattr(self._core, "step"):
199
  self._api_mode = "step"
200
  else:
201
  raise MatAnyError(
202
+ "MatAnyone build has no frame API (process_frame/step). Cannot proceed safely."
 
203
  )
204
 
205
  log.info(f"[MATANY] API mode forced to: {self._api_mode} (video-mode disabled)")
206
+
207
+ # Probe & log exactly what APIs exist (and process_video signature if available)
208
+ self._probe_api_support()
209
+
210
  self._initialized = True
211
 
212
+ def _probe_api_support(self) -> None:
213
+ """Log which APIs the installed MatAnyone exposes + best-effort signature for process_video."""
214
+ core = self._core
215
+ have = {
216
+ "process_video": hasattr(core, "process_video") and callable(getattr(core, "process_video", None)),
217
+ "process_frame": hasattr(core, "process_frame") and callable(getattr(core, "process_frame", None)),
218
+ "step": hasattr(core, "step") and callable(getattr(core, "step", None)),
219
+ }
220
+ log.info(f"[MATANY] API availability: {have}")
221
+ if have["process_video"]:
222
+ try:
223
+ import inspect
224
+ sig = inspect.signature(core.process_video) # type: ignore[attr-defined]
225
+ log.info(f"[MATANY] process_video signature: {sig}")
226
+ except Exception as e:
227
+ log.info(f"[MATANY] process_video signature probe failed: {e}")
228
+
229
+ # -------------------------------------------------------------------------
230
+ # 2.3 — Autocast policy
231
+ # -------------------------------------------------------------------------
232
  def _maybe_amp(self):
233
  enabled = (self.device.type == "cuda")
234
  if self.precision == "fp32":
 
238
  # auto
239
  return torch.amp.autocast(device_type="cuda", enabled=enabled and self.use_fp16)
240
 
241
+ # -------------------------------------------------------------------------
242
+ # 2.4 — Frame validation & core call
243
+ # -------------------------------------------------------------------------
244
  def _validate_input_frame(self, frame: np.ndarray) -> None:
245
  if not isinstance(frame, np.ndarray):
246
  raise MatAnyError(f"Frame must be numpy.ndarray, got {type(frame)}")
 
249
  if frame.ndim != 3 or frame.shape[2] != 3:
250
  raise MatAnyError(f"Frame must be HWC with 3 channels, got {frame.shape}")
251
 
252
+ def _run_frame(self, frame_bgr: np.ndarray, seed_1hw: Optional[np.ndarray], is_first: bool) -> np.ndarray:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
253
  """
254
+ Run a single frame through MatAnyone.
255
+ Returns: alpha matte as 2D np.float32 in [0,1].
 
256
  """
257
  self._validate_input_frame(frame_bgr)
258
 
259
+ # Image -> CHW float32 [0,1], then torch on device
260
+ img_chw = _to_chw01(frame_bgr) # (3,H,W) float32
261
+ img_t = torch.from_numpy(img_chw).to(self.device)
262
+
263
+ # Optional seed mask on first frame: expect HW float32 [0,1]
264
+ mask_t = None
265
+ if is_first and seed_1hw is not None:
266
+ if seed_1hw.ndim == 3 and seed_1hw.shape[0] == 1:
267
+ seed_hw = seed_1hw[0]
268
+ elif seed_1hw.ndim == 2:
269
+ seed_hw = seed_1hw
270
+ else:
271
+ raise MatAnyError(f"seed mask must be 1HW or HW; got {seed_1hw.shape}")
272
+ mask_t = torch.from_numpy(seed_hw).to(self.device)
273
+
274
+ # Dispatch into the selected frame API
275
+ try:
276
  with torch.no_grad(), self._maybe_amp():
277
+ if self._api_mode == "step":
278
+ out = self._core.step(img_t, mask_t) if mask_t is not None else self._core.step(img_t)
279
+ elif self._api_mode == "process_frame":
280
+ out = self._core.process_frame(img_t, mask_t)
281
+ else:
282
+ raise MatAnyError("Internal error: _run_frame used in non-frame mode")
283
+ except torch.cuda.OutOfMemoryError as e:
284
+ snap = _cuda_snapshot(self.device)
285
+ self._log_gpu_memory()
286
+ raise MatAnyError(f"CUDA OOM while processing frame | {snap}") from e
287
+ except RuntimeError as e:
288
+ # If it’s a CUDA-side runtime issue, annotate with snapshot
289
+ if "CUDA" in str(e):
290
+ snap = _cuda_snapshot(self.device)
291
+ self._log_gpu_memory()
292
+ raise MatAnyError(f"CUDA runtime error: {e} | {snap}") from e
293
+ raise MatAnyError(f"Runtime error: {e}") from e
294
+ except Exception as e:
295
+ raise MatAnyError(f"Processing failed: {e}") from e
296
+
297
+ # Normalize to pure 2D numpy [0,1]
298
+ if isinstance(out, torch.Tensor):
299
+ alpha_np = out.detach().float().clamp(0, 1).squeeze().cpu().numpy()
300
+ else:
301
+ alpha_np = np.asarray(out, dtype=np.float32)
302
  if alpha_np.max() > 1.0:
303
  alpha_np = alpha_np / 255.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
304
 
305
+ alpha_np = np.squeeze(alpha_np)
306
+ if alpha_np.ndim != 2:
307
+ raise MatAnyError(f"Expected 2D alpha matte; got shape {alpha_np.shape}")
308
 
309
+ return alpha_np.astype(np.float32)
 
310
 
311
+ # -------------------------------------------------------------------------
312
+ # 2.5 — process_video harvesting (kept for completeness; not used in forced frame mode)
313
+ # -------------------------------------------------------------------------
314
+ def _harvest_process_video_output(self, res, out_dir: Path, base: str) -> Tuple[Path, Path]:
315
+ """
316
+ Accepts varied return types from MatAnyone.process_video and produces
317
+ (alpha.mp4, fg.mp4) inside out_dir. Strategy: prefer path returns; fallback glob.
318
+ If backend returns arrays only, we raise (cannot reconstruct FG here).
319
+ """
320
+ alpha_mp4 = out_dir / "alpha.mp4"
321
+ fg_mp4 = out_dir / "fg.mp4"
322
+
323
+ # Dict style: look for common keys
324
+ if isinstance(res, dict):
325
+ cand_alpha = res.get("alpha") or res.get("alpha_path") or res.get("matte") or res.get("matte_path")
326
+ cand_fg = res.get("fg") or res.get("fg_path") or res.get("foreground") or res.get("foreground_path")
327
+ moved = 0
328
+ if cand_alpha and Path(cand_alpha).exists():
329
+ shutil.copy2(cand_alpha, alpha_mp4); moved += 1
330
+ if cand_fg and Path(cand_fg).exists():
331
+ shutil.copy2(cand_fg, fg_mp4); moved += 1
332
+ if moved == 2:
333
+ return alpha_mp4, fg_mp4
334
+
335
+ # Tuple/list of paths
336
+ if isinstance(res, (list, tuple)) and len(res) >= 1:
337
+ paths = [Path(x) for x in res if isinstance(x, (str, Path))]
338
+ if paths:
339
+ alpha_candidates = [p for p in paths if p.exists() and ("alpha" in p.name or "matte" in p.name)]
340
+ fg_candidates = [p for p in paths if p.exists() and ("fg" in p.name or "fore" in p.name)]
341
+ if alpha_candidates and fg_candidates:
342
+ shutil.copy2(alpha_candidates[0], alpha_mp4)
343
+ shutil.copy2(fg_candidates[0], fg_mp4)
344
+ return alpha_mp4, fg_mp4
345
+
346
+ # Fallback: glob common dirs
347
+ search_dirs = [Path.cwd(), out_dir, Path("results"), Path("result"), Path("output"), Path("outputs")]
348
+ hits: List[Path] = []
349
+ for d in search_dirs:
350
+ if d.exists():
351
+ hits.extend(list(d.rglob(f"*{base}*.*")))
352
+ alpha_candidates = [p for p in hits if p.suffix.lower() in (".mp4",".mov",".mkv",".avi") and ("alpha" in p.name or "matte" in p.name)]
353
+ fg_candidates = [p for p in hits if p.suffix.lower() in (".mp4",".mov",".mkv",".avi") and ("fg" in p.name or "fore" in p.name)]
354
+ if alpha_candidates and fg_candidates:
355
+ shutil.copy2(alpha_candidates[0], alpha_mp4)
356
+ shutil.copy2(fg_candidates[0], fg_mp4)
357
+ return alpha_mp4, fg_mp4
358
+
359
+ raise MatAnyError("MatAnyone.process_video did not yield discoverable output paths.")
360
+
361
+ # -------------------------------------------------------------------------
362
+ # 2.6 — Public API: process_stream
363
+ # -------------------------------------------------------------------------
364
  def process_stream(
365
  self,
366
  video_path: Path,
 
369
  progress_cb: Optional[Callable] = None,
370
  ) -> Tuple[Path, Path]:
371
  """
372
+ Process a video with MatAnyone (frame-by-frame path enforced by default).
373
 
374
  Returns:
375
  (alpha_path, fg_path)
 
400
  log.info(f"[MATANY] {video_path.name}: {N} frames {W}x{H} @ {fps:.2f} fps")
401
  _emit_progress(progress_cb, 0.05, f"Video: {N} frames {W}x{H} @ {fps:.2f} fps")
402
 
 
403
  alpha_path = out_dir / "alpha.mp4"
404
  fg_path = out_dir / "fg.mp4"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
405
 
406
  try:
407
+ # -----------------------------
408
+ # Frame-by-frame streaming path
409
+ # -----------------------------
410
+ _emit_progress(progress_cb, 0.10, f"Using {self._api_mode} (frame-by-frame)")
411
+ cap = cv2.VideoCapture(str(video_path))
412
+ if not cap.isOpened():
413
+ raise MatAnyError(f"Failed to open video for reading: {video_path}")
414
+
415
+ # Writers (alpha as BGR grayscale for broad mp4v compatibility)
416
+ fourcc = cv2.VideoWriter_fourcc(*'mp4v')
417
+ alpha_writer = cv2.VideoWriter(str(alpha_path), fourcc, fps, (W, H), True) # isColor=True
418
+ fg_writer = cv2.VideoWriter(str(fg_path), fourcc, fps, (W, H), True)
419
+ if not alpha_writer.isOpened() or not fg_writer.isOpened():
420
+ raise MatAnyError("Failed to initialize VideoWriter(s)")
421
+
422
+ # Optional seed mask (resized to video HxW, normalized to [0,1])
423
+ seed_1hw = None
424
+ if seed_mask_path is not None:
425
+ seed_1hw = _read_mask_hw(Path(seed_mask_path), (H, W))
426
+
427
+ idx = 0
428
+ last_tick = time.time()
429
+ start = time.time()
430
+
431
+ try:
432
+ while True:
433
+ ret, frame = cap.read()
434
+ if not ret:
435
+ break
436
+
437
+ current_mask = seed_1hw if idx == 0 else None
438
+ alpha_hw = self._run_frame(frame, current_mask, is_first=(idx == 0))
439
+
440
+ # Compose outputs
441
+ alpha_u8 = (alpha_hw * 255.0 + 0.5).astype(np.uint8)
442
+ alpha_bgr = cv2.cvtColor(alpha_u8, cv2.COLOR_GRAY2BGR)
443
+ # alpha_hw already [0,1]
444
+ fg_bgr = (frame.astype(np.float32) * alpha_hw[..., None]).clip(0, 255).astype(np.uint8)
445
+
446
+ alpha_writer.write(alpha_bgr)
447
+ fg_writer.write(fg_bgr)
448
+
449
+ idx += 1
450
+ # progress & ETA
451
+ if N > 0 and (idx % max(5, N // 100) == 0 or (time.time() - last_tick) > 2.0):
452
+ elapsed = time.time() - start
453
+ prog = idx / max(1, N)
454
+ eta_s = (elapsed / prog) * (1.0 - prog) if prog > 0 else 0.0
455
+ if eta_s > 3600:
456
+ eta = f"{eta_s/3600:.1f} h"
457
+ elif eta_s > 60:
458
+ eta = f"{eta_s/60:.1f} m"
459
+ else:
460
+ eta = f"{eta_s:.0f} s"
461
+ fps_run = idx / elapsed if elapsed > 0 else 0.0
462
+ gpu_tail = ""
463
+ if torch.cuda.is_available():
464
+ idx_dev = self.device.index if self.device.index is not None else 0
465
+ mem_a = torch.cuda.memory_allocated(idx_dev) / 1024**2
466
+ mem_r = torch.cuda.memory_reserved(idx_dev) / 1024**2
467
+ gpu_tail = f" | GPU {mem_a:.0f}/{mem_r:.0f}MB"
468
+ _emit_progress(progress_cb, min(0.99, prog), f"Frame {idx}/{N} • {fps_run:.1f} FPS • ETA {eta}{gpu_tail}")
469
+ last_tick = time.time()
470
+
471
+ # finalize
472
+ _validate_nonempty(alpha_path)
473
+ _validate_nonempty(fg_path)
474
+ total = time.time() - start
475
+ fps_run = idx / total if total > 0 else 0.0
476
+ _emit_progress(progress_cb, 1.0, f"Complete! {idx} frames at {fps_run:.1f} FPS")
477
+ return alpha_path, fg_path
478
+
479
+ finally:
480
+ try:
481
+ if cap and hasattr(cap, "isOpened") and cap.isOpened():
482
+ cap.release()
483
+ except Exception:
484
+ pass
485
+ try:
486
+ if alpha_writer:
487
+ alpha_writer.release()
488
+ except Exception:
489
+ pass
490
+ try:
491
+ if fg_writer:
492
+ fg_writer.release()
493
+ except Exception:
494
+ pass
495
+ _safe_empty_cache()
496
 
497
  except Exception as e:
498
  msg = f"Error during video processing: {e}"
 
501
  msg += f" | {_cuda_snapshot(self.device)}"
502
  _emit_progress(progress_cb, -1, msg)
503
  raise MatAnyError(msg) from e
504
+
505
+ # =============================================================================
506
+ # END OF FILE
507
+ # =============================================================================