Spaces:

MogensR
/

VideoBackgroundReplacer2

Paused

App Files Files Community

MogensR committed on Sep 16, 2025

Commit

386575c

1 Parent(s): 87688ee

vin

Browse files

Files changed (1) hide show

models/matanyone_loader.py +105 -135

models/matanyone_loader.py CHANGED Viewed

@@ -3,17 +3,10 @@
 """
 MatAnyone adapter — SAM2-seeded, streaming, build-agnostic.
-#1 Overview
-- SAM2 provides a seed mask on frame 0.
 - MatAnyone does frame-by-frame alpha matting.
-- Supports wheels that expect either 4D [B,C,H,W] or 5D [B,T,C,H,W].
-- Accepts HWC or CHW frames; converts to HWC RGB.
-- Writes alpha.mp4 (grayscale-as-BGR) and fg.mp4 (RGB on black).
-Public API used by pipeline:
-  MatAnyError (exception)
-  class MatAnyoneSession:
-     process_stream(video_path, seed_mask_path=None, out_dir=None, progress_cb=None) -> (alpha_path, fg_path)
 """
 from __future__ import annotations
@@ -28,9 +21,9 @@ class MatAnyoneSession:
 log = logging.getLogger(__name__)
-# ---------- Progress helper (safe & rate-limited) ----------
 def _env_flag(name: str, default: str = "0") -> bool:
-    return os.getenv(name, default).strip().lower() in {"1", "true", "yes", "on"}
 _PROGRESS_CB_ENABLED = _env_flag("MATANY_PROGRESS", "1")
 _PROGRESS_MIN_INTERVAL = float(os.getenv("MATANY_PROGRESS_MIN_SEC", "0.25"))
@@ -39,7 +32,6 @@ def _env_flag(name: str, default: str = "0") -> bool:
 _progress_disabled = False
 def _emit_progress(cb, pct: float, msg: str):
-    """#2 UI progress callback wrapper (tolerant of legacy 1-arg signatures)"""
     global _progress_last, _progress_last_msg, _progress_disabled
     if not cb or not _PROGRESS_CB_ENABLED or _progress_disabled:
         return
@@ -50,7 +42,7 @@ def _emit_progress(cb, pct: float, msg: str):
         try:
             cb(pct, msg)  # preferred (pct, msg)
         except TypeError:
-            cb(msg)       # legacy (msg-only)
         _progress_last = now
         _progress_last_msg = msg
     except Exception as e:
@@ -59,12 +51,10 @@ def _emit_progress(cb, pct: float, msg: str):
 # ---------- Errors ----------
 class MatAnyError(RuntimeError):
-    """#3 Adapter-level error (keeps upstream logs readable)"""
     pass
-# ---------- CUDA snapshots ----------
 def _cuda_snapshot(device: Optional[torch.device]) -> str:
-    """#4 Best-effort CUDA memory + device info (for error context)"""
     try:
         if not torch.cuda.is_available():
             return "CUDA: N/A"
@@ -79,7 +69,6 @@ def _cuda_snapshot(device: Optional[torch.device]) -> str:
         return f"CUDA snapshot error: {e!r}"
 def _safe_empty_cache():
-    """#5 Non-blocking VRAM cleanup (avoid synchronize() in Spaces)"""
     if not torch.cuda.is_available():
         return
     try:
@@ -90,9 +79,8 @@ def _safe_empty_cache():
 # ---------- SAM2 → seed mask prep ----------
 def _prepare_seed_mask(sam2_mask: np.ndarray, H: int, W: int) -> np.ndarray:
     """
-    #6 Normalize SAM2 mask to float32 [H,W] in {0,1}, white = foreground.
-    - Accepts 2D or 3-channel images; resizes with NEAREST to keep edges crisp.
-    - Auto-inverts if >60% of the image is ON (likely polarity swap).
     """
     if not isinstance(sam2_mask, np.ndarray):
         raise MatAnyError(f"SAM2 mask must be numpy array, got {type(sam2_mask)}")
@@ -109,47 +97,41 @@ def _prepare_seed_mask(sam2_mask: np.ndarray, H: int, W: int) -> np.ndarray:
         m /= 255.0
     m = np.clip(m, 0.0, 1.0)
-    cov = float((m > 0.5).mean())
-    if cov > 0.60:
         m = 1.0 - m
-    # hard binarize for a clean seed
-    m = (m > 0.5).astype(np.float32)
-    return m
 # ---------- Frame conversion ----------
 def _frame_bgr_to_hwc_rgb_numpy(frame) -> np.ndarray:
-    """
-    #7 Accepts OpenCV BGR uint8 HWC, or uint8 CHW; returns HWC RGB uint8.
-    """
     if not isinstance(frame, np.ndarray) or frame.ndim != 3:
         raise MatAnyError(f"Frame must be HWC/CHW numpy array, got {type(frame)}, shape={getattr(frame, 'shape', None)}")
     arr = frame
-    # Accept CHW and convert to HWC
-    if arr.shape[0] == 3 and arr.shape[2] != 3:
-        arr = np.transpose(arr, (1, 2, 0))  # CHW -> HWC
     if arr.dtype != np.uint8:
         raise MatAnyError(f"Frame must be uint8, got {arr.dtype}")
-    rgb = cv2.cvtColor(arr, cv2.COLOR_BGR2RGB)
-    return rgb
 # ============================================================================
 class MatAnyoneSession:
     """
-    #8 Streaming wrapper that seeds MatAnyone with a SAM2 mask on frame 0.
-    - Tries 4D first; if the wheel truly wants 5D, promotes both image AND mask.
-    - Has an override env: MATANY_FORCE_FORMAT=4D|5D (for debugging).
     """
     def __init__(self, device: Optional[str] = None, precision: str = "auto"):
         self.device = torch.device(device) if device else (torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu"))
         self.precision = precision.lower()
-        # Optional override: MATANY_FORCE_FORMAT=4D|5D
-        fmt = os.getenv("MATANY_FORCE_FORMAT", "").strip().lower()
-        self._force_4d = (fmt == "4d")
-        self._force_5d = (fmt == "5d")
-        self._use_5d = self._force_5d  # start in 5D only if forced
         try:
             from matanyone.inference.inference_core import InferenceCore
@@ -158,37 +140,36 @@ def __init__(self, device: Optional[str] = None, precision: str = "auto"):
         try:
             self.core = InferenceCore()
         except TypeError:
-            # HF wheel constructor that needs a repo string
             self.core = InferenceCore("PeiqingYang/MatAnyone")
-        self.api = "step" if hasattr(self.core, "step") else ("process_frame" if hasattr(self.core, "process_frame") else None)
-        if not self.api:
-            raise MatAnyError("MatAnyone core exposes neither 'step' nor 'process_frame'")
-        log.info(f"[MATANY] API: {self.api} | device={self.device} | force4d={self._force_4d} | force5d={self._force_5d}")
-    # ----- AMP policy -----
     def _amp(self):
-        """#9 Simple AMP gate (auto/fp16/fp32)"""
         if self.device.type != "cuda":
             return torch.amp.autocast(device_type="cuda", enabled=False)
         if self.precision == "fp32":
             return torch.amp.autocast(device_type="cuda", enabled=False)
         if self.precision == "fp16":
             return torch.amp.autocast(device_type="cuda", enabled=True, dtype=torch.float16)
-        # auto
         return torch.amp.autocast(device_type="cuda", enabled=True)
-    # ----- Tensor builders -----
     def _to_tensors(self, img_hwc_rgb: np.ndarray, mask_hw: Optional[np.ndarray]):
-        """
-        #10 Build both 4D and 5D tensors.
-        Returns: (img_4d, img_5d, mask_4d, mask_5d)
-        - img_4d:  [1, 3, H, W]
-        - img_5d:  [1, 1, 3, H, W]
-        - mask_4d: [1, 1, H, W]  or None
-        - mask_5d: [1, 1, 1, H, W] or None
-        """
         img = torch.from_numpy(img_hwc_rgb).to(self.device)
         if img.dtype != torch.float32:
             img = img.float()
@@ -204,107 +185,104 @@ def _to_tensors(self, img_hwc_rgb: np.ndarray, mask_hw: Optional[np.ndarray]):
             m = torch.from_numpy(mask_hw).to(self.device)
             if m.dtype != torch.float32:
                 m = m.float()
-            # robust binarize
             m = (m >= 0.5).float() if float(m.max().item()) <= 1.0 else (m >= 128).float()
-            mask_4d = m.unsqueeze(0).unsqueeze(0).contiguous()          # [1,1,H,W]
-            mask_5d = mask_4d.unsqueeze(1).contiguous()                 # [1,1,1,H,W]
         return img_4d, img_5d, mask_4d, mask_5d
-    # ----- Core call (4D first, 5D only if demanded) -----
-    def _core_call(self, img_4d, img_5d, mask_4d, mask_5d, is_first: bool):
-        """
-        #11 Dispatch into the wheel, trying 4D, then 5D if the error suggests it.
-        Also backs off from 5D → 4D when conv2d complains about 3D/4D.
-        """
-        def run(use_5d: bool):
-            img  = img_5d if use_5d else img_4d
-            msk  = mask_5d if use_5d else mask_4d  # <<< IMPORTANT: match ranks
-            if self.api == "step":
-                if is_first and msk is not None:
                     try:
-                        return self.core.step(img, msk, is_first=True)
-                    except TypeError:
-                        return self.core.step(img, msk)  # older signature
-                else:
-                    return self.core.step(img)
             else:
-                return self.core.process_frame(img, msk if is_first else None)
         with torch.no_grad(), self._amp():
-            # Forced modes for debugging
             if self._force_4d:
                 return run(False)
             if self._force_5d:
                 return run(True)
-            # If a previous frame decided on 5D, try 5D first but back off if needed
             if self._use_5d:
                 try:
                     return run(True)
                 except RuntimeError as e5:
-                    msg5 = str(e5)
-                    # If the wheel says conv2d needs 3D/4D, revert to 4D permanently
-                    if "Expected 3D" in msg5 and "4D" in msg5 and "conv2d" in msg5:
                         log.info("[MATANY] 5D rejected by wheel (conv2d wants 3D/4D). Falling back to 4D.")
                         self._use_5d = False
                         return run(False)
-                    raise MatAnyError(f"Runtime error (5D path): {msg5}") from e5
-            # Default: try 4D first
             try:
-                return run(False)
             except RuntimeError as e4:
-                msg4 = str(e4)
-                # Hints that the wheel actually expects 5D
-                wants_5d = any(kw in msg4 for kw in [
-                    "expected 5D",
-                    "expects 5D",
-                    "input.dim() == 5",
-                    "but got 4D",
-                    "got input of size: [1, 3,"  # some wheels report this pattern
-                ])
-                if wants_5d:
                     log.info("[MATANY] Wheel appears to expect 5D — retrying with [1,1,3,H,W] and [1,1,1,H,W].")
                     self._use_5d = True
                     try:
                         return run(True)
                     except RuntimeError as e5b:
-                        msg5b = str(e5b)
-                        # If retry says conv2d wants 3D/4D, undo and raise original
-                        if "Expected 3D" in msg5b and "4D" in msg5b and "conv2d" in msg5b:
                             self._use_5d = False
-                            raise MatAnyError(f"Wheel ultimately expects 4D (conv2d). Original 4D error: {msg4}") from e4
-                        raise MatAnyError(f"5D attempt failed: {msg5b}") from e5b
-                # Add CUDA context for GPU errors
-                if "CUDA" in msg4 or "cublas" in msg4.lower() or "cudnn" in msg4.lower():
                     snap = _cuda_snapshot(self.device)
-                    raise MatAnyError(f"CUDA runtime error: {msg4} | {snap}") from e4
-                # Generic wrap
-                raise MatAnyError(f"Runtime error (4D path): {msg4}") from e4
     # ----- Per-frame runner -----
     def _run_frame(self, frame_bgr: np.ndarray, sam2_mask_hw: Optional[np.ndarray], is_first: bool) -> np.ndarray:
-        """#12 Convert inputs, seed frame 0, call core, and normalize to [H,W] alpha."""
         rgb_hwc = _frame_bgr_to_hwc_rgb_numpy(frame_bgr)
         H, W = rgb_hwc.shape[:2]
-        seed_for_this_frame = None
-        if is_first and sam2_mask_hw is not None:
-            seed_for_this_frame = _prepare_seed_mask(sam2_mask_hw, H, W)
-        img_4d, img_5d, mask_4d, mask_5d = self._to_tensors(rgb_hwc, seed_for_this_frame)
-        try:
-            out = self._core_call(img_4d, img_5d, mask_4d, mask_5d, is_first)
-        except torch.cuda.OutOfMemoryError as e:
-            snap = _cuda_snapshot(self.device)
-            raise MatAnyError(f"CUDA OOM while processing frame | {snap}") from e
-        except Exception as e:
-            raise MatAnyError(f"Runtime error: {e}") from e
-        # Normalize output to [H,W] float32 in [0,1]
         if isinstance(out, torch.Tensor):
             alpha = out.detach().float().squeeze().cpu().numpy()
         else:
@@ -325,9 +303,6 @@ def process_stream(
         out_dir: Optional[Path] = None,
         progress_cb: Optional[Callable] = None,
     ) -> Tuple[Path, Path]:
-        """
-        #13 Stream the video one frame at a time (T=1), write alpha.mp4 & fg.mp4.
-        """
         video_path = Path(video_path)
         if not video_path.exists():
             raise MatAnyError(f"Video file not found: {video_path}")
@@ -335,7 +310,6 @@ def process_stream(
         out_dir = Path(out_dir) if out_dir else video_path.parent
         out_dir.mkdir(parents=True, exist_ok=True)
-        # Probe video
         cap_probe = cv2.VideoCapture(str(video_path))
         if not cap_probe.isOpened():
             raise MatAnyError(f"Failed to open video: {video_path}")
@@ -349,9 +323,8 @@ def process_stream(
         log.info(f"MatAnyone: {video_path.name} | {N} frames {W}x{H} @ {fps:.2f} fps")
         _emit_progress(progress_cb, 0.05, f"Video: {N} frames {W}x{H} @ {fps:.2f} fps")
-        _emit_progress(progress_cb, 0.08, "Using step (frame-by-frame)")
-        # Prepare writers
         alpha_path = out_dir / "alpha.mp4"
         fg_path    = out_dir / "fg.mp4"
         fourcc = cv2.VideoWriter_fourcc(*"mp4v")
@@ -360,7 +333,6 @@ def process_stream(
         if not alpha_writer.isOpened() or not fg_writer.isOpened():
             raise MatAnyError("Failed to initialize VideoWriter(s)")
-        # Load seed mask if provided (file path on disk)
         seed_mask_np = None
         if seed_mask_path is not None:
             p = Path(seed_mask_path)
@@ -369,7 +341,7 @@ def process_stream(
             m = cv2.imread(str(p), cv2.IMREAD_GRAYSCALE)
             if m is None:
                 raise MatAnyError(f"Failed to read seed mask: {p}")
-            seed_mask_np = m  # we resize/polarize/binarize inside _run_frame
         cap = cv2.VideoCapture(str(video_path))
         if not cap.isOpened():
@@ -384,9 +356,8 @@ def process_stream(
                 if not ret:
                     break
                 is_first = (idx == 0)
-                alpha = self._run_frame(frame, seed_mask_np if is_first else None, is_first)  # [H,W] in [0,1]
-                # Compose outputs (no double divide)
                 alpha_u8  = (alpha * 255.0 + 0.5).astype(np.uint8)
                 alpha_bgr = cv2.cvtColor(alpha_u8, cv2.COLOR_GRAY2BGR)
                 fg_bgr    = (frame.astype(np.float32) * alpha[..., None]).clip(0, 255).astype(np.uint8)
@@ -411,7 +382,6 @@ def process_stream(
             except: pass
             _safe_empty_cache()
-        # Verify outputs are non-empty
         if not alpha_path.exists() or alpha_path.stat().st_size == 0:
             raise MatAnyError(f"Output file missing/empty: {alpha_path}")
         if not fg_path.exists() or fg_path.stat().st_size == 0:

 """
 MatAnyone adapter — SAM2-seeded, streaming, build-agnostic.
+- SAM2 defines the subject (seed mask) on frame 0.
 - MatAnyone does frame-by-frame alpha matting.
+- Prefers process_frame (HWC numpy) and falls back to step.
+- For step(): supports 4D [B,C,H,W] and 5D [B,T,C,H,W] with matching mask rank.
 """
 from __future__ import annotations
 log = logging.getLogger(__name__)
+# ---------- Progress helper ----------
 def _env_flag(name: str, default: str = "0") -> bool:
+    return os.getenv(name, default).strip().lower() in {"1","true","yes","on"}
 _PROGRESS_CB_ENABLED = _env_flag("MATANY_PROGRESS", "1")
 _PROGRESS_MIN_INTERVAL = float(os.getenv("MATANY_PROGRESS_MIN_SEC", "0.25"))
 _progress_disabled = False
 def _emit_progress(cb, pct: float, msg: str):
     global _progress_last, _progress_last_msg, _progress_disabled
     if not cb or not _PROGRESS_CB_ENABLED or _progress_disabled:
         return
         try:
             cb(pct, msg)  # preferred (pct, msg)
         except TypeError:
+            cb(msg)       # legacy (msg)
         _progress_last = now
         _progress_last_msg = msg
     except Exception as e:
 # ---------- Errors ----------
 class MatAnyError(RuntimeError):
     pass
+# ---------- CUDA helpers ----------
 def _cuda_snapshot(device: Optional[torch.device]) -> str:
     try:
         if not torch.cuda.is_available():
             return "CUDA: N/A"
         return f"CUDA snapshot error: {e!r}"
 def _safe_empty_cache():
     if not torch.cuda.is_available():
         return
     try:
 # ---------- SAM2 → seed mask prep ----------
 def _prepare_seed_mask(sam2_mask: np.ndarray, H: int, W: int) -> np.ndarray:
     """
+    Normalize to float32 [H,W] in {0,1}, white=FG.
+    Auto-invert if >60% ON (likely wrong polarity).
     """
     if not isinstance(sam2_mask, np.ndarray):
         raise MatAnyError(f"SAM2 mask must be numpy array, got {type(sam2_mask)}")
         m /= 255.0
     m = np.clip(m, 0.0, 1.0)
+    if (m > 0.5).mean() > 0.60:
         m = 1.0 - m
+    return (m > 0.5).astype(np.float32)
 # ---------- Frame conversion ----------
 def _frame_bgr_to_hwc_rgb_numpy(frame) -> np.ndarray:
+    """Accept HWC/CHW BGR uint8 → return HWC RGB uint8."""
     if not isinstance(frame, np.ndarray) or frame.ndim != 3:
         raise MatAnyError(f"Frame must be HWC/CHW numpy array, got {type(frame)}, shape={getattr(frame, 'shape', None)}")
     arr = frame
+    if arr.shape[0] == 3 and arr.shape[2] != 3:   # CHW → HWC
+        arr = np.transpose(arr, (1, 2, 0))
     if arr.dtype != np.uint8:
         raise MatAnyError(f"Frame must be uint8, got {arr.dtype}")
+    return cv2.cvtColor(arr, cv2.COLOR_BGR2RGB)
 # ============================================================================
 class MatAnyoneSession:
     """
+    Streaming wrapper that seeds MatAnyone on frame 0.
+    Prefers core.process_frame (HWC numpy), falls back to core.step with 4D/5D.
     """
     def __init__(self, device: Optional[str] = None, precision: str = "auto"):
         self.device = torch.device(device) if device else (torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu"))
         self.precision = precision.lower()
+        # API/format overrides for debugging
+        api_force = os.getenv("MATANY_FORCE_API", "").strip().lower()  # "process" or "step"
+        fmt_force = os.getenv("MATANY_FORCE_FORMAT", "").strip().lower()  # "4d" or "5d"
+        self._force_api_process = (api_force == "process")
+        self._force_api_step    = (api_force == "step")
+        self._force_4d = (fmt_force == "4d")
+        self._force_5d = (fmt_force == "5d")
         try:
             from matanyone.inference.inference_core import InferenceCore
         try:
             self.core = InferenceCore()
         except TypeError:
             self.core = InferenceCore("PeiqingYang/MatAnyone")
+        self._has_process = hasattr(self.core, "process_frame")
+        self._has_step    = hasattr(self.core, "step")
+        if not (self._has_process or self._has_step):
+            raise MatAnyError("MatAnyone core exposes neither 'process_frame' nor 'step'")
+        # Prefer process_frame unless forced to step
+        if self._force_api_step and not self._has_step:
+            raise MatAnyError("MATANY_FORCE_API=step but core.step is missing")
+        if self._force_api_process and not self._has_process:
+            raise MatAnyError("MATANY_FORCE_API=process but core.process_frame is missing")
+        self._api = "process_frame" if (self._has_process and not self._force_api_step) or self._force_api_process else "step"
+        self._use_5d = bool(self._force_5d)  # only used in step mode
+        log.info(f"[MATANY] APIs: process_frame={self._has_process}, step={self._has_step} | active={self._api} | force4d={self._force_4d} force5d={self._force_5d}")
+    # AMP only affects step() path where we may use torch tensors
     def _amp(self):
         if self.device.type != "cuda":
             return torch.amp.autocast(device_type="cuda", enabled=False)
         if self.precision == "fp32":
             return torch.amp.autocast(device_type="cuda", enabled=False)
         if self.precision == "fp16":
             return torch.amp.autocast(device_type="cuda", enabled=True, dtype=torch.float16)
         return torch.amp.autocast(device_type="cuda", enabled=True)
+    # ----- Tensor builders for step() mode -----
     def _to_tensors(self, img_hwc_rgb: np.ndarray, mask_hw: Optional[np.ndarray]):
         img = torch.from_numpy(img_hwc_rgb).to(self.device)
         if img.dtype != torch.float32:
             img = img.float()
             m = torch.from_numpy(mask_hw).to(self.device)
             if m.dtype != torch.float32:
                 m = m.float()
             m = (m >= 0.5).float() if float(m.max().item()) <= 1.0 else (m >= 128).float()
+            mask_4d = m.unsqueeze(0).unsqueeze(0).contiguous()   # [1,1,H,W]
+            mask_5d = mask_4d.unsqueeze(1).contiguous()          # [1,1,1,H,W]
         return img_4d, img_5d, mask_4d, mask_5d
+    # ----- Core call: process_frame preferred, fallback to step -----
+    def _call_process_frame(self, rgb_hwc: np.ndarray, seed_mask_hw: Optional[np.ndarray], is_first: bool):
+        """Try numpy path first; fallback to torch path if the wheel requests tensors."""
+        seed = seed_mask_hw if is_first else None
+        # 1) Most wheels want numpy HWC + 2D mask (float 0..1 or uint8)
+        try:
+            return self.core.process_frame(rgb_hwc, seed)
+        except TypeError as e_np:
+            msg = str(e_np).lower()
+            # 2) Some wheels want torch [B,C,H,W] tensors even in process_frame
+            if "tensor" in msg or "expected" in msg or "conv2d" in msg:
+                img_4d, _, mask_4d, _ = self._to_tensors(rgb_hwc, seed)
+                with torch.no_grad(), self._amp():
                     try:
+                        return self.core.process_frame(img_4d, mask_4d)
+                    except Exception as e_t:
+                        raise MatAnyError(f"process_frame tensor path failed: {e_t}") from e_t
+            raise
+    def _call_step(self, rgb_hwc: np.ndarray, seed_mask_hw: Optional[np.ndarray], is_first: bool):
+        """4D first; if the wheel wants 5D, promote both image AND mask."""
+        img_4d, img_5d, mask_4d, mask_5d = self._to_tensors(rgb_hwc, seed_mask_hw if is_first else None)
+        def run(use_5d: bool):
+            img = img_5d if use_5d else img_4d
+            msk = mask_5d if use_5d else mask_4d
+            if is_first and msk is not None:
+                try:
+                    return self.core.step(img, msk, is_first=True)
+                except TypeError:
+                    return self.core.step(img, msk)
             else:
+                return self.core.step(img)
         with torch.no_grad(), self._amp():
             if self._force_4d:
                 return run(False)
             if self._force_5d:
                 return run(True)
             if self._use_5d:
                 try:
                     return run(True)
                 except RuntimeError as e5:
+                    m5 = str(e5)
+                    if "expected 3d" in m5.lower() and "4d" in m5 and "conv2d" in m5.lower():
                         log.info("[MATANY] 5D rejected by wheel (conv2d wants 3D/4D). Falling back to 4D.")
                         self._use_5d = False
                         return run(False)
+                    raise MatAnyError(f"Runtime error (step/5D): {m5}") from e5
             try:
+                return run(False)  # 4D
             except RuntimeError as e4:
+                m4 = str(e4)
+                needs_5d = any(kw in m4 for kw in ["expected 5D", "expects 5D", "input.dim() == 5", "but got 4D", "got input of size: [1, 3,"])
+                if needs_5d:
                     log.info("[MATANY] Wheel appears to expect 5D — retrying with [1,1,3,H,W] and [1,1,1,H,W].")
                     self._use_5d = True
                     try:
                         return run(True)
                     except RuntimeError as e5b:
+                        m5b = str(e5b)
+                        if "expected 3d" in m5b.lower() and "4d" in m5b and "conv2d" in m5b.lower():
                             self._use_5d = False
+                            raise MatAnyError(f"Wheel ultimately expects 4D (conv2d). Original 4D error: {m4}") from e4
+                        raise MatAnyError(f"step/5D attempt failed: {m5b}") from e5b
+                if "cuda" in m4.lower():
                     snap = _cuda_snapshot(self.device)
+                    raise MatAnyError(f"CUDA runtime error: {m4} | {snap}") from e4
+                raise MatAnyError(f"Runtime error (step/4D): {m4}") from e4
     # ----- Per-frame runner -----
     def _run_frame(self, frame_bgr: np.ndarray, sam2_mask_hw: Optional[np.ndarray], is_first: bool) -> np.ndarray:
         rgb_hwc = _frame_bgr_to_hwc_rgb_numpy(frame_bgr)
         H, W = rgb_hwc.shape[:2]
+        seed_for_this_frame = _prepare_seed_mask(sam2_mask_hw, H, W) if (is_first and sam2_mask_hw is not None) else None
+        # Primary: process_frame
+        if self._api == "process_frame":
+            try:
+                out = self._call_process_frame(rgb_hwc, seed_for_this_frame, is_first)
+            except Exception as e_proc:
+                log.warning(f"[MATANY] process_frame failed ({e_proc}); falling back to step().")
+                if not self._has_step:
+                    raise MatAnyError(f"process_frame failed and step() is unavailable: {e_proc}")
+                self._api = "step"
+                out = self._call_step(rgb_hwc, seed_for_this_frame, is_first)
+        else:
+            out = self._call_step(rgb_hwc, seed_for_this_frame, is_first)
+        # Normalize to 2D alpha [H,W] in [0,1]
         if isinstance(out, torch.Tensor):
             alpha = out.detach().float().squeeze().cpu().numpy()
         else:
         out_dir: Optional[Path] = None,
         progress_cb: Optional[Callable] = None,
     ) -> Tuple[Path, Path]:
         video_path = Path(video_path)
         if not video_path.exists():
             raise MatAnyError(f"Video file not found: {video_path}")
         out_dir = Path(out_dir) if out_dir else video_path.parent
         out_dir.mkdir(parents=True, exist_ok=True)
         cap_probe = cv2.VideoCapture(str(video_path))
         if not cap_probe.isOpened():
             raise MatAnyError(f"Failed to open video: {video_path}")
         log.info(f"MatAnyone: {video_path.name} | {N} frames {W}x{H} @ {fps:.2f} fps")
         _emit_progress(progress_cb, 0.05, f"Video: {N} frames {W}x{H} @ {fps:.2f} fps")
+        _emit_progress(progress_cb, 0.08, "Using per-frame processing")
         alpha_path = out_dir / "alpha.mp4"
         fg_path    = out_dir / "fg.mp4"
         fourcc = cv2.VideoWriter_fourcc(*"mp4v")
         if not alpha_writer.isOpened() or not fg_writer.isOpened():
             raise MatAnyError("Failed to initialize VideoWriter(s)")
         seed_mask_np = None
         if seed_mask_path is not None:
             p = Path(seed_mask_path)
             m = cv2.imread(str(p), cv2.IMREAD_GRAYSCALE)
             if m is None:
                 raise MatAnyError(f"Failed to read seed mask: {p}")
+            seed_mask_np = m
         cap = cv2.VideoCapture(str(video_path))
         if not cap.isOpened():
                 if not ret:
                     break
                 is_first = (idx == 0)
+                alpha = self._run_frame(frame, seed_mask_np if is_first else None, is_first)
                 alpha_u8  = (alpha * 255.0 + 0.5).astype(np.uint8)
                 alpha_bgr = cv2.cvtColor(alpha_u8, cv2.COLOR_GRAY2BGR)
                 fg_bgr    = (frame.astype(np.float32) * alpha[..., None]).clip(0, 255).astype(np.uint8)
             except: pass
             _safe_empty_cache()
         if not alpha_path.exists() or alpha_path.stat().st_size == 0:
             raise MatAnyError(f"Output file missing/empty: {alpha_path}")
         if not fg_path.exists() or fg_path.stat().st_size == 0: