Spaces:

MogensR
/

VideoBackgroundReplacer2

Paused

App Files Files Community

MogensR committed on Sep 16, 2025

Commit

87688ee

1 Parent(s): 2a63856

kkk

Browse files

Files changed (1) hide show

models/matanyone_loader.py +170 -137

models/matanyone_loader.py CHANGED Viewed

@@ -1,23 +1,19 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
-MatAnyone adapter — SAM2-seeded, streaming, build-agnostic (HF Spaces ready).
-GOAL (pipeline contract):
-- Use SAM2 only to define the person on frame 0 (seed mask).
-- Feed MatAnyone frames one-by-one to generate the alpha matte.
-- Always pass tensors in the shapes MatAnyone’s conv2d expects:
-    * image  : [3, H, W]  (float32, 0..1, RGB, CHW)
-    * mask   : [H, W]     (float32, 0..1, binary)
-  (No 5D tensors. No time dimension.)
-Outputs:
-- alpha.mp4  — grayscale stored as BGR for broad mp4v compatibility
-- fg.mp4     — original RGB multiplied by alpha (for later compositing)
-Works on HF Spaces:
-- Reads from /tmp/gradio/...
-- Writes to the same folder (or a provided out_dir, e.g. /data/outputs).
 """
 from __future__ import annotations
@@ -32,46 +28,43 @@
 log = logging.getLogger(__name__)
-# =============================================================================
-# [0] Progress helper (safe & rate-limited)
-# =============================================================================
 def _env_flag(name: str, default: str = "0") -> bool:
     return os.getenv(name, default).strip().lower() in {"1", "true", "yes", "on"}
 _PROGRESS_CB_ENABLED = _env_flag("MATANY_PROGRESS", "1")
 _PROGRESS_MIN_INTERVAL = float(os.getenv("MATANY_PROGRESS_MIN_SEC", "0.25"))
-_progress_last_t = 0.0
-_progress_last_msg: Optional[str] = None
 _progress_disabled = False
 def _emit_progress(cb, pct: float, msg: str):
-    """[0.1] Emit progress without ever crashing the caller."""
-    global _progress_last_t, _progress_last_msg, _progress_disabled
     if not cb or not _PROGRESS_CB_ENABLED or _progress_disabled:
         return
     now = time.time()
-    if (now - _progress_last_t) < _PROGRESS_MIN_INTERVAL and msg == _progress_last_msg:
         return
     try:
         try:
-            cb(pct, msg)  # preferred signature (pct, msg)
         except TypeError:
-            cb(msg)       # legacy signature (msg)
-        _progress_last_t = now
         _progress_last_msg = msg
     except Exception as e:
         _progress_disabled = True
         log.warning("[progress-cb] disabled due to exception: %s", e)
-# =============================================================================
-# [1] Errors & CUDA helpers
-# =============================================================================
 class MatAnyError(RuntimeError):
-    """Single error type the pipeline can catch & decide to fallback."""
     pass
 def _cuda_snapshot(device: Optional[torch.device]) -> str:
-    """[1.1] Short, safe description of CUDA memory state."""
     try:
         if not torch.cuda.is_available():
             return "CUDA: N/A"
@@ -86,30 +79,23 @@ def _cuda_snapshot(device: Optional[torch.device]) -> str:
         return f"CUDA snapshot error: {e!r}"
 def _safe_empty_cache():
-    """[1.2] Try to free CUDA cache; never raise."""
     if not torch.cuda.is_available():
         return
-    try:
-        torch.cuda.synchronize()
-    except Exception:
-        pass
     try:
         torch.cuda.empty_cache()
     except Exception:
         pass
-# =============================================================================
-# [2] Mask & frame preparation
-# =============================================================================
 def _prepare_seed_mask(sam2_mask: np.ndarray, H: int, W: int) -> np.ndarray:
     """
-    [2.1] Convert SAM2 mask (0/255 or 0..1) into a clean binary [H,W] float32 in {0,1}.
-          Auto-invert if coverage is > 60% (typical “background is white” case).
     """
     if not isinstance(sam2_mask, np.ndarray):
         raise MatAnyError(f"SAM2 mask must be numpy array, got {type(sam2_mask)}")
-    # Accept accidental 3-channel masks
     if sam2_mask.ndim == 3 and sam2_mask.shape[2] == 3:
         sam2_mask = cv2.cvtColor(sam2_mask, cv2.COLOR_BGR2GRAY)
     if sam2_mask.ndim != 2:
@@ -120,25 +106,26 @@ def _prepare_seed_mask(sam2_mask: np.ndarray, H: int, W: int) -> np.ndarray:
     m = sam2_mask.astype(np.float32)
     if m.max() > 1.0:
-        m *= (1.0 / 255.0)
     m = np.clip(m, 0.0, 1.0)
     cov = float((m > 0.5).mean())
     if cov > 0.60:
-        m = 1.0 - m  # Auto-polarity for “mask covers most of the frame”
-        cov = 1.0 - cov
-    # Binarize (MatAnyone seed likes a crisp mask)
     m = (m > 0.5).astype(np.float32)
     return m
-def _frame_bgr_to_rgb_hwc(frame: np.ndarray) -> np.ndarray:
     """
-    [2.2] Accept OpenCV BGR uint8 HWC (or CHW uint8), return RGB uint8 HWC.
     """
     if not isinstance(frame, np.ndarray) or frame.ndim != 3:
         raise MatAnyError(f"Frame must be HWC/CHW numpy array, got {type(frame)}, shape={getattr(frame, 'shape', None)}")
     arr = frame
-    # Allow CHW input (rare, but we support it)
     if arr.shape[0] == 3 and arr.shape[2] != 3:
         arr = np.transpose(arr, (1, 2, 0))  # CHW -> HWC
     if arr.dtype != np.uint8:
@@ -146,24 +133,24 @@ def _frame_bgr_to_rgb_hwc(frame: np.ndarray) -> np.ndarray:
     rgb = cv2.cvtColor(arr, cv2.COLOR_BGR2RGB)
     return rgb
-# =============================================================================
-# [3] Main session
-# =============================================================================
 class MatAnyoneSession:
     """
-    Streaming wrapper that seeds MatAnyone with a SAM2 mask on frame 0.
-    KEY DECISION: We always pass CHW (3,H,W) to core.step(), and HW (H,W) for the mask.
-                  Absolutely no [B,T,C,H,W] tensors.
     """
     def __init__(self, device: Optional[str] = None, precision: str = "auto"):
-        # [3.1] Device & AMP
-        self.device = torch.device(device) if device else (
-            torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
-        )
         self.precision = precision.lower()
-        # [3.2] Import & instantiate the MatAnyone core
         try:
             from matanyone.inference.inference_core import InferenceCore
         except ImportError as e:
@@ -171,122 +158,166 @@ def __init__(self, device: Optional[str] = None, precision: str = "auto"):
         try:
             self.core = InferenceCore()
         except TypeError:
-            # Some builds require the repo-id
             self.core = InferenceCore("PeiqingYang/MatAnyone")
-        # [3.3] Choose API (prefer step)
-        if hasattr(self.core, "step") and callable(getattr(self.core, "step")):
-            self.api = "step"
-        elif hasattr(self.core, "process_frame") and callable(getattr(self.core, "process_frame")):
-            self.api = "process_frame"
-        else:
             raise MatAnyError("MatAnyone core exposes neither 'step' nor 'process_frame'")
-        log.info(f"[MATANY] Using API: {self.api} | device={self.device}")
-    # [3.4] AMP context (enabled on CUDA unless precision=='fp32')
     def _amp(self):
         if self.device.type != "cuda":
             return torch.amp.autocast(device_type="cuda", enabled=False)
         if self.precision == "fp32":
             return torch.amp.autocast(device_type="cuda", enabled=False)
         if self.precision == "fp16":
             return torch.amp.autocast(device_type="cuda", enabled=True, dtype=torch.float16)
         return torch.amp.autocast(device_type="cuda", enabled=True)
-    # [3.5] Tensor builders — STRICT shapes
-    def _to_tensors_strict(self, rgb_hwc: np.ndarray, mask_hw: Optional[np.ndarray]):
         """
-        image_out: torch float32 [3,H,W] in 0..1 (RGB, CHW)
-        mask_out : torch float32 [H,W]   in {0,1}
         """
-        # image -> CHW
-        img = torch.from_numpy(rgb_hwc).to(self.device)
         if img.dtype != torch.float32:
             img = img.float()
         if float(img.max().item()) > 1.0:
             img = img / 255.0
         img_chw = img.permute(2, 0, 1).contiguous()  # [3,H,W]
-        # mask -> HW
-        mask_t = None
         if mask_hw is not None:
             m = torch.from_numpy(mask_hw).to(self.device)
             if m.dtype != torch.float32:
                 m = m.float()
-            # Robust binarization (accepts 0/1 or 0..1 or 0/255 upstream)
-            if float(m.max().item()) > 1.0:
-                m = (m >= 128).float()
-            else:
-                m = (m >= 0.5).float()
-            mask_t = m.contiguous()  # [H,W]
-        return img_chw, mask_t
-    # [3.6] Core call (NO 5D, ever)
-    def _core_call(self, img_chw: torch.Tensor, mask_hw: Optional[torch.Tensor], is_first: bool):
         """
-        Route strictly:
-          - step(image_chw, mask_hw) on frame 0 (if mask exists)
-          - step(image_chw)          on subsequent frames
-        Fallbacks only switch between step/process_frame, NOT shapes.
         """
-        with torch.no_grad(), self._amp():
             if self.api == "step":
-                try:
-                    if is_first and mask_hw is not None:
-                        return self.core.step(img_chw, mask_hw)  # <-- strict CHW/HW
-                    else:
-                        return self.core.step(img_chw)
-                except TypeError:
-                    # Some wheels might gate arguments differently; try process_frame
-                    if is_first and mask_hw is not None and hasattr(self.core, "process_frame"):
-                        return self.core.process_frame(img_chw, mask_hw)
-                    elif hasattr(self.core, "process_frame"):
-                        return self.core.process_frame(img_chw, None)
-                    raise
             else:
-                # process_frame fallback API
-                return self.core.process_frame(img_chw, mask_hw if (is_first and mask_hw is not None) else None)
-    # [3.7] Per-frame runner
     def _run_frame(self, frame_bgr: np.ndarray, sam2_mask_hw: Optional[np.ndarray], is_first: bool) -> np.ndarray:
-        rgb_hwc = _frame_bgr_to_rgb_hwc(frame_bgr)
         H, W = rgb_hwc.shape[:2]
-        seed = None
         if is_first and sam2_mask_hw is not None:
-            seed = _prepare_seed_mask(sam2_mask_hw, H, W)  # [H,W] float32 {0,1}
-        img_chw, mask_hw = self._to_tensors_strict(rgb_hwc, seed)
         try:
-            out = self._core_call(img_chw, mask_hw, is_first)
         except torch.cuda.OutOfMemoryError as e:
             snap = _cuda_snapshot(self.device)
             raise MatAnyError(f"CUDA OOM while processing frame | {snap}") from e
-        except RuntimeError as e:
-            # Add CUDA snapshot if relevant
-            if "CUDA" in str(e):
-                snap = _cuda_snapshot(self.device)
-                raise MatAnyError(f"CUDA runtime error: {e} | {snap}") from e
             raise MatAnyError(f"Runtime error: {e}") from e
-        # Normalize output -> [H,W] float32 0..1
         if isinstance(out, torch.Tensor):
             alpha = out.detach().float().squeeze().cpu().numpy()
         else:
             alpha = np.asarray(out)
         alpha = alpha.astype(np.float32)
         if float(alpha.max()) > 1.0:
-            alpha *= (1.0 / 255.0)
         alpha = np.squeeze(alpha)
         if alpha.ndim != 2:
             raise MatAnyError(f"Expected 2D alpha matte; got shape {alpha.shape}")
         return np.clip(alpha, 0.0, 1.0)
-    # =============================================================================
-    # [4] Public: stream the whole video
-    # =============================================================================
     def process_stream(
         self,
         video_path: Path,
@@ -294,7 +325,9 @@ def process_stream(
         out_dir: Optional[Path] = None,
         progress_cb: Optional[Callable] = None,
     ) -> Tuple[Path, Path]:
-        # [4.1] IO setup
         video_path = Path(video_path)
         if not video_path.exists():
             raise MatAnyError(f"Video file not found: {video_path}")
@@ -302,21 +335,23 @@ def process_stream(
         out_dir = Path(out_dir) if out_dir else video_path.parent
         out_dir.mkdir(parents=True, exist_ok=True)
-        # [4.2] Probe video
         cap_probe = cv2.VideoCapture(str(video_path))
         if not cap_probe.isOpened():
             raise MatAnyError(f"Failed to open video: {video_path}")
         N   = int(cap_probe.get(cv2.CAP_PROP_FRAME_COUNT))
-        fps = cap_probe.get(cv2.CAP_PROP_FPS) or 25.0
         W   = int(cap_probe.get(cv2.CAP_PROP_FRAME_WIDTH))
         H   = int(cap_probe.get(cv2.CAP_PROP_FRAME_HEIGHT))
         cap_probe.release()
         log.info(f"MatAnyone: {video_path.name} | {N} frames {W}x{H} @ {fps:.2f} fps")
         _emit_progress(progress_cb, 0.05, f"Video: {N} frames {W}x{H} @ {fps:.2f} fps")
         _emit_progress(progress_cb, 0.08, "Using step (frame-by-frame)")
-        # [4.3] Writers (alpha as BGR so mp4v is happy)
         alpha_path = out_dir / "alpha.mp4"
         fg_path    = out_dir / "fg.mp4"
         fourcc = cv2.VideoWriter_fourcc(*"mp4v")
@@ -325,7 +360,7 @@ def process_stream(
         if not alpha_writer.isOpened() or not fg_writer.isOpened():
             raise MatAnyError("Failed to initialize VideoWriter(s)")
-        # [4.4] Load seed mask file if provided
         seed_mask_np = None
         if seed_mask_path is not None:
             p = Path(seed_mask_path)
@@ -334,9 +369,8 @@ def process_stream(
             m = cv2.imread(str(p), cv2.IMREAD_GRAYSCALE)
             if m is None:
                 raise MatAnyError(f"Failed to read seed mask: {p}")
-            seed_mask_np = m  # Prepare per-frame to ensure correct (H,W)
-        # [4.5] Stream frames
         cap = cv2.VideoCapture(str(video_path))
         if not cap.isOpened():
             raise MatAnyError(f"Failed to open video for reading: {video_path}")
@@ -349,11 +383,10 @@ def process_stream(
                 ret, frame = cap.read()
                 if not ret:
                     break
                 is_first = (idx == 0)
-                alpha = self._run_frame(frame, seed_mask_np if is_first else None, is_first)  # [H,W] 0..1
-                # Compose outputs (note: alpha already 0..1 — no double scaling)
                 alpha_u8  = (alpha * 255.0 + 0.5).astype(np.uint8)
                 alpha_bgr = cv2.cvtColor(alpha_u8, cv2.COLOR_GRAY2BGR)
                 fg_bgr    = (frame.astype(np.float32) * alpha[..., None]).clip(0, 255).astype(np.uint8)
@@ -378,7 +411,7 @@ def process_stream(
             except: pass
             _safe_empty_cache()
-        # [4.6] Verify outputs
         if not alpha_path.exists() or alpha_path.stat().st_size == 0:
             raise MatAnyError(f"Output file missing/empty: {alpha_path}")
         if not fg_path.exists() or fg_path.stat().st_size == 0:

 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
+MatAnyone adapter — SAM2-seeded, streaming, build-agnostic.
+#1 Overview
+- SAM2 provides a seed mask on frame 0.
+- MatAnyone does frame-by-frame alpha matting.
+- Supports wheels that expect either 4D [B,C,H,W] or 5D [B,T,C,H,W].
+- Accepts HWC or CHW frames; converts to HWC RGB.
+- Writes alpha.mp4 (grayscale-as-BGR) and fg.mp4 (RGB on black).
+Public API used by pipeline:
+  MatAnyError (exception)
+  class MatAnyoneSession:
+     process_stream(video_path, seed_mask_path=None, out_dir=None, progress_cb=None) -> (alpha_path, fg_path)
 """
 from __future__ import annotations
 log = logging.getLogger(__name__)
+# ---------- Progress helper (safe & rate-limited) ----------
 def _env_flag(name: str, default: str = "0") -> bool:
     """Interpret the environment variable *name* as an on/off switch.

     The variable's value (or *default* when it is unset) is stripped and
     lower-cased; "1", "true", "yes" and "on" mean enabled, anything else
     means disabled.
     """
     raw_value = os.getenv(name, default)
     normalized = raw_value.strip().lower()
     enabled_words = {"1", "true", "yes", "on"}
     return normalized in enabled_words
 # Master switch for progress callbacks (MATANY_PROGRESS, on by default).
 _PROGRESS_CB_ENABLED = _env_flag("MATANY_PROGRESS", "1")
 # Minimum number of seconds between two identical progress messages.
 # A malformed MATANY_PROGRESS_MIN_SEC must not break importing the adapter,
 # so fall back to the documented default instead of raising ValueError.
 try:
     _PROGRESS_MIN_INTERVAL = float(os.getenv("MATANY_PROGRESS_MIN_SEC", "0.25"))
 except ValueError:
     log.warning("[progress-cb] invalid MATANY_PROGRESS_MIN_SEC; using 0.25")
     _PROGRESS_MIN_INTERVAL = 0.25
 # Throttle state shared by every _emit_progress() call in this process.
 _progress_last = 0.0        # timestamp of the last delivered message
 _progress_last_msg = None   # text of the last delivered message
 _progress_disabled = False  # set once a callback raised; silences further calls
 def _emit_progress(cb, pct: float, msg: str):
     """#2 UI progress callback wrapper (tolerant of legacy 1-arg signatures).

     Delivers ``(pct, msg)`` to *cb* and never raises into the caller.

     - Does nothing when *cb* is falsy, when progress is disabled through
       MATANY_PROGRESS, or after a previous callback failure.
     - A message identical to the last delivered one is dropped if it arrives
       within _PROGRESS_MIN_INTERVAL seconds; a different message always goes
       through.
     - If ``cb(pct, msg)`` raises TypeError, ``cb(msg)`` is tried instead.
     - Any other exception disables progress reporting for the rest of the
       process and is logged as a warning.
     """
     global _progress_last, _progress_last_msg, _progress_disabled
     if not cb or not _PROGRESS_CB_ENABLED or _progress_disabled:
         return
     # Use the monotonic clock for the throttle: the wall clock can jump
     # backwards (NTP, manual changes), which would make the elapsed time
     # negative and keep suppressing repeated messages.
     now = time.monotonic()
     if msg == _progress_last_msg and (now - _progress_last) < _PROGRESS_MIN_INTERVAL:
         return
     try:
         try:
             cb(pct, msg)  # preferred (pct, msg)
         except TypeError:
             cb(msg)       # legacy (msg-only)
         _progress_last = now
         _progress_last_msg = msg
     except Exception as e:
         # A broken UI callback must not abort the matting run.
         _progress_disabled = True
         log.warning("[progress-cb] disabled due to exception: %s", e)
+# ---------- Errors ----------
 class MatAnyError(RuntimeError):
     """#3 Adapter-level error (keeps upstream logs readable).

     Adds no behaviour of its own on top of RuntimeError; it gives failures
     raised by this adapter a distinct exception type.
     """
+# ---------- CUDA snapshots ----------
 def _cuda_snapshot(device: Optional[torch.device]) -> str:
+    """#4 Best-effort CUDA memory + device info (for error context)"""
     try:
         if not torch.cuda.is_available():
             return "CUDA: N/A"
         return f"CUDA snapshot error: {e!r}"
 def _safe_empty_cache():
     """#5 Best-effort release of cached CUDA memory.

     Does nothing when CUDA is unavailable and swallows any exception from
     ``torch.cuda.empty_cache()``. No ``torch.cuda.synchronize()`` is issued
     here.
     """
     if torch.cuda.is_available():
         try:
             torch.cuda.empty_cache()
         except Exception:
             # Cleanup is optional; never let it raise into the caller.
             pass
+# ---------- SAM2 → seed mask prep ----------
 def _prepare_seed_mask(sam2_mask: np.ndarray, H: int, W: int) -> np.ndarray:
     """
+    #6 Normalize SAM2 mask to float32 [H,W] in {0,1}, white = foreground.
+    - Accepts 2D or 3-channel images; resizes with NEAREST to keep edges crisp.
+    - Auto-inverts if >60% of the image is ON (likely polarity swap).
     """
     if not isinstance(sam2_mask, np.ndarray):
         raise MatAnyError(f"SAM2 mask must be numpy array, got {type(sam2_mask)}")
     if sam2_mask.ndim == 3 and sam2_mask.shape[2] == 3:
         sam2_mask = cv2.cvtColor(sam2_mask, cv2.COLOR_BGR2GRAY)
     if sam2_mask.ndim != 2:
     m = sam2_mask.astype(np.float32)
     if m.max() > 1.0:
+        m /= 255.0
     m = np.clip(m, 0.0, 1.0)
     cov = float((m > 0.5).mean())
     if cov > 0.60:
+        m = 1.0 - m
+    # hard binarize for a clean seed
     m = (m > 0.5).astype(np.float32)
     return m
+# ---------- Frame conversion ----------
+def _frame_bgr_to_hwc_rgb_numpy(frame) -> np.ndarray:
     """
+    #7 Accepts OpenCV BGR uint8 HWC, or uint8 CHW; returns HWC RGB uint8.
     """
     if not isinstance(frame, np.ndarray) or frame.ndim != 3:
         raise MatAnyError(f"Frame must be HWC/CHW numpy array, got {type(frame)}, shape={getattr(frame, 'shape', None)}")
     arr = frame
+    # Accept CHW and convert to HWC
     if arr.shape[0] == 3 and arr.shape[2] != 3:
         arr = np.transpose(arr, (1, 2, 0))  # CHW -> HWC
     if arr.dtype != np.uint8:
     rgb = cv2.cvtColor(arr, cv2.COLOR_BGR2RGB)
     return rgb
+# ============================================================================
 class MatAnyoneSession:
     """
+    #8 Streaming wrapper that seeds MatAnyone with a SAM2 mask on frame 0.
+    - Tries 4D first; if the wheel truly wants 5D, promotes both image AND mask.
+    - Has an override env: MATANY_FORCE_FORMAT=4D|5D (for debugging).
     """
     def __init__(self, device: Optional[str] = None, precision: str = "auto"):
+        self.device = torch.device(device) if device else (torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu"))
         self.precision = precision.lower()
+        # Optional override: MATANY_FORCE_FORMAT=4D|5D
+        fmt = os.getenv("MATANY_FORCE_FORMAT", "").strip().lower()
+        self._force_4d = (fmt == "4d")
+        self._force_5d = (fmt == "5d")
+        self._use_5d = self._force_5d  # start in 5D only if forced
         try:
             from matanyone.inference.inference_core import InferenceCore
         except ImportError as e:
         try:
             self.core = InferenceCore()
         except TypeError:
+            # HF wheel constructor that needs a repo string
             self.core = InferenceCore("PeiqingYang/MatAnyone")
+        self.api = "step" if hasattr(self.core, "step") else ("process_frame" if hasattr(self.core, "process_frame") else None)
+        if not self.api:
             raise MatAnyError("MatAnyone core exposes neither 'step' nor 'process_frame'")
+        log.info(f"[MATANY] API: {self.api} | device={self.device} | force4d={self._force_4d} | force5d={self._force_5d}")
+    # ----- AMP policy -----
     def _amp(self):
+        """#9 Simple AMP gate (auto/fp16/fp32)"""
         if self.device.type != "cuda":
             return torch.amp.autocast(device_type="cuda", enabled=False)
         if self.precision == "fp32":
             return torch.amp.autocast(device_type="cuda", enabled=False)
         if self.precision == "fp16":
             return torch.amp.autocast(device_type="cuda", enabled=True, dtype=torch.float16)
+        # auto
         return torch.amp.autocast(device_type="cuda", enabled=True)
+    # ----- Tensor builders -----
+    def _to_tensors(self, img_hwc_rgb: np.ndarray, mask_hw: Optional[np.ndarray]):
         """
+        #10 Build both 4D and 5D tensors.
+        Returns: (img_4d, img_5d, mask_4d, mask_5d)
+        - img_4d:  [1, 3, H, W]
+        - img_5d:  [1, 1, 3, H, W]
+        - mask_4d: [1, 1, H, W]  or None
+        - mask_5d: [1, 1, 1, H, W] or None
         """
+        img = torch.from_numpy(img_hwc_rgb).to(self.device)
         if img.dtype != torch.float32:
             img = img.float()
         if float(img.max().item()) > 1.0:
             img = img / 255.0
         img_chw = img.permute(2, 0, 1).contiguous()  # [3,H,W]
+        img_4d  = img_chw.unsqueeze(0)               # [1,3,H,W]
+        img_5d  = img_chw.unsqueeze(0).unsqueeze(0)  # [1,1,3,H,W]
+        mask_4d = mask_5d = None
         if mask_hw is not None:
             m = torch.from_numpy(mask_hw).to(self.device)
             if m.dtype != torch.float32:
                 m = m.float()
+            # robust binarize
+            m = (m >= 0.5).float() if float(m.max().item()) <= 1.0 else (m >= 128).float()
+            mask_4d = m.unsqueeze(0).unsqueeze(0).contiguous()          # [1,1,H,W]
+            mask_5d = mask_4d.unsqueeze(1).contiguous()                 # [1,1,1,H,W]
+        return img_4d, img_5d, mask_4d, mask_5d
+    # ----- Core call (4D first, 5D only if demanded) -----
+    def _core_call(self, img_4d, img_5d, mask_4d, mask_5d, is_first: bool):
         """
+        #11 Dispatch into the wheel, trying 4D, then 5D if the error suggests it.
+        Also backs off from 5D → 4D when conv2d complains about 3D/4D.
         """
+        def run(use_5d: bool):
+            img  = img_5d if use_5d else img_4d
+            msk  = mask_5d if use_5d else mask_4d  # <<< IMPORTANT: match ranks
             if self.api == "step":
+                if is_first and msk is not None:
+                    try:
+                        return self.core.step(img, msk, is_first=True)
+                    except TypeError:
+                        return self.core.step(img, msk)  # older signature
+                else:
+                    return self.core.step(img)
             else:
+                return self.core.process_frame(img, msk if is_first else None)
+        with torch.no_grad(), self._amp():
+            # Forced modes for debugging
+            if self._force_4d:
+                return run(False)
+            if self._force_5d:
+                return run(True)
+            # If a previous frame decided on 5D, try 5D first but back off if needed
+            if self._use_5d:
+                try:
+                    return run(True)
+                except RuntimeError as e5:
+                    msg5 = str(e5)
+                    # If the wheel says conv2d needs 3D/4D, revert to 4D permanently
+                    if "Expected 3D" in msg5 and "4D" in msg5 and "conv2d" in msg5:
+                        log.info("[MATANY] 5D rejected by wheel (conv2d wants 3D/4D). Falling back to 4D.")
+                        self._use_5d = False
+                        return run(False)
+                    raise MatAnyError(f"Runtime error (5D path): {msg5}") from e5
+            # Default: try 4D first
+            try:
+                return run(False)
+            except RuntimeError as e4:
+                msg4 = str(e4)
+                # Hints that the wheel actually expects 5D
+                wants_5d = any(kw in msg4 for kw in [
+                    "expected 5D",
+                    "expects 5D",
+                    "input.dim() == 5",
+                    "but got 4D",
+                    "got input of size: [1, 3,"  # some wheels report this pattern
+                ])
+                if wants_5d:
+                    log.info("[MATANY] Wheel appears to expect 5D — retrying with [1,1,3,H,W] and [1,1,1,H,W].")
+                    self._use_5d = True
+                    try:
+                        return run(True)
+                    except RuntimeError as e5b:
+                        msg5b = str(e5b)
+                        # If retry says conv2d wants 3D/4D, undo and raise original
+                        if "Expected 3D" in msg5b and "4D" in msg5b and "conv2d" in msg5b:
+                            self._use_5d = False
+                            raise MatAnyError(f"Wheel ultimately expects 4D (conv2d). Original 4D error: {msg4}") from e4
+                        raise MatAnyError(f"5D attempt failed: {msg5b}") from e5b
+                # Add CUDA context for GPU errors
+                if "CUDA" in msg4 or "cublas" in msg4.lower() or "cudnn" in msg4.lower():
+                    snap = _cuda_snapshot(self.device)
+                    raise MatAnyError(f"CUDA runtime error: {msg4} | {snap}") from e4
+                # Generic wrap
+                raise MatAnyError(f"Runtime error (4D path): {msg4}") from e4
+    # ----- Per-frame runner -----
     def _run_frame(self, frame_bgr: np.ndarray, sam2_mask_hw: Optional[np.ndarray], is_first: bool) -> np.ndarray:
         """#12 Convert inputs, seed frame 0, call core, and normalize to [H,W] alpha.

         Args:
             frame_bgr: one video frame, converted to RGB HWC by
                 _frame_bgr_to_hwc_rgb_numpy.
             sam2_mask_hw: seed mask; only used when *is_first* is true.
             is_first: whether this is the first frame of the stream.

         Returns:
             float32 array of shape [H, W], clipped to [0, 1].

         Raises:
             MatAnyError: on CUDA OOM (with a memory snapshot), on any other
                 failure of the core call, or when the core output does not
                 squeeze down to a 2D matte.
         """
         rgb_hwc = _frame_bgr_to_hwc_rgb_numpy(frame_bgr)
         H, W = rgb_hwc.shape[:2]

         seed_for_this_frame = None
         if is_first and sam2_mask_hw is not None:
             seed_for_this_frame = _prepare_seed_mask(sam2_mask_hw, H, W)

         img_4d, img_5d, mask_4d, mask_5d = self._to_tensors(rgb_hwc, seed_for_this_frame)

         try:
             out = self._core_call(img_4d, img_5d, mask_4d, mask_5d, is_first)
         except torch.cuda.OutOfMemoryError as e:
             snap = _cuda_snapshot(self.device)
             raise MatAnyError(f"CUDA OOM while processing frame | {snap}") from e
         except MatAnyError:
             # Already classified by _core_call; wrapping it again would only
             # stack a second "Runtime error:" prefix on the message.
             raise
         except Exception as e:
             raise MatAnyError(f"Runtime error: {e}") from e

         # Normalize output to [H,W] float32 in [0,1]
         if isinstance(out, torch.Tensor):
             alpha = out.detach().float().squeeze().cpu().numpy()
         else:
             alpha = np.asarray(out)

         alpha = alpha.astype(np.float32)
         # Values above 1.0 are taken to mean a 0..255 matte.
         if float(alpha.max()) > 1.0:
             alpha /= 255.0
         alpha = np.squeeze(alpha)
         if alpha.ndim != 2:
             raise MatAnyError(f"Expected 2D alpha matte; got shape {alpha.shape}")
         return np.clip(alpha, 0.0, 1.0)
+    # ----- Public: streaming processor -----
     def process_stream(
         self,
         video_path: Path,
         out_dir: Optional[Path] = None,
         progress_cb: Optional[Callable] = None,
     ) -> Tuple[Path, Path]:
+        """
+        #13 Stream the video one frame at a time (T=1), write alpha.mp4 & fg.mp4.
+        """
         video_path = Path(video_path)
         if not video_path.exists():
             raise MatAnyError(f"Video file not found: {video_path}")
         out_dir = Path(out_dir) if out_dir else video_path.parent
         out_dir.mkdir(parents=True, exist_ok=True)
+        # Probe video
         cap_probe = cv2.VideoCapture(str(video_path))
         if not cap_probe.isOpened():
             raise MatAnyError(f"Failed to open video: {video_path}")
         N   = int(cap_probe.get(cv2.CAP_PROP_FRAME_COUNT))
+        fps = cap_probe.get(cv2.CAP_PROP_FPS)
         W   = int(cap_probe.get(cv2.CAP_PROP_FRAME_WIDTH))
         H   = int(cap_probe.get(cv2.CAP_PROP_FRAME_HEIGHT))
         cap_probe.release()
+        if not fps or fps <= 0 or np.isnan(fps):
+            fps = 25.0
         log.info(f"MatAnyone: {video_path.name} | {N} frames {W}x{H} @ {fps:.2f} fps")
         _emit_progress(progress_cb, 0.05, f"Video: {N} frames {W}x{H} @ {fps:.2f} fps")
         _emit_progress(progress_cb, 0.08, "Using step (frame-by-frame)")
+        # Prepare writers
         alpha_path = out_dir / "alpha.mp4"
         fg_path    = out_dir / "fg.mp4"
         fourcc = cv2.VideoWriter_fourcc(*"mp4v")
         if not alpha_writer.isOpened() or not fg_writer.isOpened():
             raise MatAnyError("Failed to initialize VideoWriter(s)")
+        # Load seed mask if provided (file path on disk)
         seed_mask_np = None
         if seed_mask_path is not None:
             p = Path(seed_mask_path)
             m = cv2.imread(str(p), cv2.IMREAD_GRAYSCALE)
             if m is None:
                 raise MatAnyError(f"Failed to read seed mask: {p}")
+            seed_mask_np = m  # we resize/polarize/binarize inside _run_frame
         cap = cv2.VideoCapture(str(video_path))
         if not cap.isOpened():
             raise MatAnyError(f"Failed to open video for reading: {video_path}")
                 ret, frame = cap.read()
                 if not ret:
                     break
                 is_first = (idx == 0)
+                alpha = self._run_frame(frame, seed_mask_np if is_first else None, is_first)  # [H,W] in [0,1]
+                # Compose outputs (no double divide)
                 alpha_u8  = (alpha * 255.0 + 0.5).astype(np.uint8)
                 alpha_bgr = cv2.cvtColor(alpha_u8, cv2.COLOR_GRAY2BGR)
                 fg_bgr    = (frame.astype(np.float32) * alpha[..., None]).clip(0, 255).astype(np.uint8)
             except: pass
             _safe_empty_cache()
+        # Verify outputs are non-empty
         if not alpha_path.exists() or alpha_path.stat().st_size == 0:
             raise MatAnyError(f"Output file missing/empty: {alpha_path}")
         if not fg_path.exists() or fg_path.stat().st_size == 0: