agent 3.0
Browse files- models/matanyone_loader.py +210 -79
models/matanyone_loader.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
# =============================================================================
|
| 3 |
-
# MatAnyone Adapter (streaming, API-agnostic) β with chapter markers
|
| 4 |
# =============================================================================
|
| 5 |
"""
|
| 6 |
- Supports multiple MatAnyone variants:
|
|
@@ -40,90 +40,152 @@
|
|
| 40 |
# =============================================================================
|
| 41 |
# CHAPTER 1 β Small utilities
|
| 42 |
# =============================================================================
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
return
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
try:
|
| 48 |
-
cb(pct, msg) # preferred 2-arg
|
| 49 |
-
except TypeError:
|
| 50 |
try:
|
| 51 |
-
cb(msg)
|
| 52 |
except TypeError:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
pass
|
| 54 |
|
| 55 |
|
|
|
|
| 56 |
class MatAnyError(RuntimeError):
|
| 57 |
"""Custom exception for MatAnyone processing errors."""
|
| 58 |
pass
|
| 59 |
|
| 60 |
|
|
|
|
| 61 |
def _cuda_snapshot(device: Optional[torch.device] = None) -> str:
|
| 62 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
if not torch.cuda.is_available():
|
| 64 |
-
return
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
resv = torch.cuda.memory_reserved(idx) / 1e9
|
| 71 |
-
return f"device={idx}, name={name}, alloc={alloc:.2f}GB, reserved={resv:.2f}GB"
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
def _safe_empty_cache():
|
| 75 |
-
"""Synchronize and empty CUDA cache if present (best-effort)."""
|
| 76 |
-
if torch.cuda.is_available():
|
| 77 |
-
try:
|
| 78 |
-
torch.cuda.synchronize()
|
| 79 |
-
except Exception:
|
| 80 |
-
pass
|
| 81 |
torch.cuda.empty_cache()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
|
| 83 |
|
| 84 |
def _read_mask_hw(mask_path: Path, target_hw: Tuple[int, int]) -> np.ndarray:
|
| 85 |
-
"""
|
| 86 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
raise MatAnyError(f"Seed mask not found: {mask_path}")
|
|
|
|
| 88 |
mask = cv2.imread(str(mask_path), cv2.IMREAD_GRAYSCALE)
|
| 89 |
-
if mask is None:
|
| 90 |
-
raise MatAnyError(f"Failed to read seed mask: {mask_path}")
|
|
|
|
| 91 |
H, W = target_hw
|
|
|
|
|
|
|
| 92 |
if mask.shape[:2] != (H, W):
|
| 93 |
mask = cv2.resize(mask, (W, H), interpolation=cv2.INTER_LINEAR)
|
|
|
|
| 94 |
maskf = (mask.astype(np.float32) / 255.0).clip(0.0, 1.0)
|
| 95 |
return maskf
|
| 96 |
|
| 97 |
|
| 98 |
def _to_chw01(img_bgr: np.ndarray) -> np.ndarray:
|
| 99 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
|
| 101 |
rgbf = rgb.astype(np.float32) / 255.0
|
| 102 |
-
chw = np.transpose(rgbf, (2, 0, 1)) # C,H,W
|
| 103 |
return chw
|
| 104 |
|
| 105 |
|
| 106 |
-
def _validate_nonempty(file_path: Path) -> None:
|
| 107 |
-
"""Ensure output file exists and is non-empty."""
|
| 108 |
-
if not file_path.exists() or file_path.stat().st_size == 0:
|
| 109 |
-
raise MatAnyError(f"Output file missing/empty: {file_path}")
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
def _select_matany_mode(core) -> str:
|
| 113 |
-
"""
|
| 114 |
-
Inspect available APIs.
|
| 115 |
-
Priority: process_video > process_frame > step
|
| 116 |
-
(Note: we still force frame mode in _lazy_init; this helper is used by chunk helper.)
|
| 117 |
-
"""
|
| 118 |
-
if hasattr(core, "process_video") and callable(getattr(core, "process_video")):
|
| 119 |
-
return "process_video"
|
| 120 |
-
if hasattr(core, "process_frame") and callable(getattr(core, "process_frame")):
|
| 121 |
-
return "process_frame"
|
| 122 |
-
if hasattr(core, "step") and callable(getattr(core, "step")):
|
| 123 |
-
return "step"
|
| 124 |
-
raise MatAnyError("No supported MatAnyone API on core (process_video/process_frame/step).")
|
| 125 |
-
|
| 126 |
-
|
| 127 |
# =============================================================================
|
| 128 |
# CHAPTER 2 β Main session
|
| 129 |
# =============================================================================
|
|
@@ -153,6 +215,7 @@ def __init__(self, device: Optional[str] = None, precision: str = "auto"):
|
|
| 153 |
self._core = None
|
| 154 |
self._api_mode = None
|
| 155 |
self._initialized = False
|
|
|
|
| 156 |
self._lazy_init()
|
| 157 |
|
| 158 |
log.info(f"Initialized MatAnyoneSession on {self.device} | precision={self.precision}, use_fp16={self.use_fp16}")
|
|
@@ -239,7 +302,7 @@ def _maybe_amp(self):
|
|
| 239 |
return torch.amp.autocast(device_type="cuda", enabled=enabled and self.use_fp16)
|
| 240 |
|
| 241 |
# -------------------------------------------------------------------------
|
| 242 |
-
# 2.4 β Frame validation
|
| 243 |
# -------------------------------------------------------------------------
|
| 244 |
def _validate_input_frame(self, frame: np.ndarray) -> None:
|
| 245 |
if not isinstance(frame, np.ndarray):
|
|
@@ -249,43 +312,105 @@ def _validate_input_frame(self, frame: np.ndarray) -> None:
|
|
| 249 |
if frame.ndim != 3 or frame.shape[2] != 3:
|
| 250 |
raise MatAnyError(f"Frame must be HWC with 3 channels, got {frame.shape}")
|
| 251 |
|
| 252 |
-
|
|
|
|
|
|
|
|
|
|
| 253 |
"""
|
| 254 |
-
|
| 255 |
-
|
|
|
|
| 256 |
"""
|
| 257 |
-
self.
|
| 258 |
|
| 259 |
-
#
|
| 260 |
-
|
| 261 |
-
|
| 262 |
|
| 263 |
-
|
| 264 |
-
mask_t = None
|
| 265 |
if is_first and seed_1hw is not None:
|
|
|
|
| 266 |
if seed_1hw.ndim == 3 and seed_1hw.shape[0] == 1:
|
| 267 |
seed_hw = seed_1hw[0]
|
| 268 |
elif seed_1hw.ndim == 2:
|
| 269 |
seed_hw = seed_1hw
|
| 270 |
else:
|
| 271 |
raise MatAnyError(f"seed mask must be 1HW or HW; got {seed_1hw.shape}")
|
| 272 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 273 |
|
| 274 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 275 |
try:
|
| 276 |
with torch.no_grad(), self._maybe_amp():
|
| 277 |
-
|
| 278 |
-
out = self._core.step(img_t, mask_t) if mask_t is not None else self._core.step(img_t)
|
| 279 |
-
elif self._api_mode == "process_frame":
|
| 280 |
-
out = self._core.process_frame(img_t, mask_t)
|
| 281 |
-
else:
|
| 282 |
-
raise MatAnyError("Internal error: _run_frame used in non-frame mode")
|
| 283 |
except torch.cuda.OutOfMemoryError as e:
|
| 284 |
snap = _cuda_snapshot(self.device)
|
| 285 |
self._log_gpu_memory()
|
| 286 |
raise MatAnyError(f"CUDA OOM while processing frame | {snap}") from e
|
| 287 |
except RuntimeError as e:
|
| 288 |
-
# If itβs a CUDA-side runtime issue, annotate with snapshot
|
| 289 |
if "CUDA" in str(e):
|
| 290 |
snap = _cuda_snapshot(self.device)
|
| 291 |
self._log_gpu_memory()
|
|
@@ -296,20 +421,26 @@ def _run_frame(self, frame_bgr: np.ndarray, seed_1hw: Optional[np.ndarray], is_f
|
|
| 296 |
|
| 297 |
# Normalize to pure 2D numpy [0,1]
|
| 298 |
if isinstance(out, torch.Tensor):
|
| 299 |
-
alpha_np = out.detach().float().
|
| 300 |
else:
|
| 301 |
-
alpha_np = np.asarray(out
|
| 302 |
-
|
| 303 |
-
|
|
|
|
|
|
|
|
|
|
| 304 |
|
|
|
|
| 305 |
alpha_np = np.squeeze(alpha_np)
|
| 306 |
if alpha_np.ndim != 2:
|
| 307 |
raise MatAnyError(f"Expected 2D alpha matte; got shape {alpha_np.shape}")
|
| 308 |
|
| 309 |
-
|
|
|
|
|
|
|
| 310 |
|
| 311 |
# -------------------------------------------------------------------------
|
| 312 |
-
# 2.
|
| 313 |
# -------------------------------------------------------------------------
|
| 314 |
def _harvest_process_video_output(self, res, out_dir: Path, base: str) -> Tuple[Path, Path]:
|
| 315 |
"""
|
|
@@ -320,7 +451,7 @@ def _harvest_process_video_output(self, res, out_dir: Path, base: str) -> Tuple[
|
|
| 320 |
alpha_mp4 = out_dir / "alpha.mp4"
|
| 321 |
fg_mp4 = out_dir / "fg.mp4"
|
| 322 |
|
| 323 |
-
# Dict style
|
| 324 |
if isinstance(res, dict):
|
| 325 |
cand_alpha = res.get("alpha") or res.get("alpha_path") or res.get("matte") or res.get("matte_path")
|
| 326 |
cand_fg = res.get("fg") or res.get("fg_path") or res.get("foreground") or res.get("foreground_path")
|
|
@@ -359,7 +490,7 @@ def _harvest_process_video_output(self, res, out_dir: Path, base: str) -> Tuple[
|
|
| 359 |
raise MatAnyError("MatAnyone.process_video did not yield discoverable output paths.")
|
| 360 |
|
| 361 |
# -------------------------------------------------------------------------
|
| 362 |
-
# 2.
|
| 363 |
# -------------------------------------------------------------------------
|
| 364 |
def process_stream(
|
| 365 |
self,
|
|
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
# =============================================================================
|
| 3 |
+
# MatAnyone Adapter (streaming, API-agnostic) β with chapter markers + layout probe
|
| 4 |
# =============================================================================
|
| 5 |
"""
|
| 6 |
- Supports multiple MatAnyone variants:
|
|
|
|
| 40 |
# =============================================================================
|
| 41 |
# CHAPTER 1 β Small utilities
|
| 42 |
# =============================================================================
|
| 43 |
+
|
| 44 |
+
# --- Progress callback controls ---
|
| 45 |
+
def _env_flag(name: str, default: str = "0") -> bool:
|
| 46 |
+
return os.getenv(name, default).strip() in {"1", "true", "TRUE", "yes", "YES", "on", "ON"}
|
| 47 |
+
|
| 48 |
+
# Module-level progress-reporting configuration and throttle state.
# Master on/off switch for progress callbacks (env flag, default on).
_PROGRESS_CB_ENABLED = _env_flag("MATANY_PROGRESS", "1")
# Minimum seconds between repeated emissions of the same message.
_PROGRESS_MIN_INTERVAL = float(os.getenv("MATANY_PROGRESS_MIN_SEC", "0.25"))
# t: time of last successful emit; last: last message emitted;
# disabled: latched True forever once a callback raises (see _emit_progress).
_progress_state = {"t": 0.0, "last": None, "disabled": False}
|
| 51 |
+
|
| 52 |
+
def _emit_progress(cb, pct: float, msg: str, *, force: bool = False) -> None:
    """
    Safe progress emitter:
    - Respects MATANY_PROGRESS and rate-limits updates.
    - Never raises upstream; disables itself if the callback misbehaves.
    - Accepts either 2-arg (pct, msg) or legacy 1-arg (msg) callbacks.
    """
    # No callback, feature off, or previously latched off: nothing to do.
    if not cb or not _PROGRESS_CB_ENABLED or _progress_state["disabled"]:
        return
    now = time.time()
    # Throttle: drop a repeat of the same message arriving faster than the
    # configured minimum interval (unless the caller forces emission).
    if not force and (now - _progress_state["t"] < _PROGRESS_MIN_INTERVAL) and msg == _progress_state["last"]:
        return

    try:
        try:
            cb(pct, msg)  # preferred signature
        except TypeError:
            # NOTE(review): a TypeError raised *inside* a 2-arg callback also
            # lands here and triggers a second (1-arg) invocation — confirm
            # callbacks are cheap/idempotent before relying on this fallback.
            cb(msg)  # legacy signature
        # Record the successful emission for the throttle above.
        _progress_state["t"] = now
        _progress_state["last"] = msg
    except Exception as e:
        # Permanently disable to avoid log spam and user-facing crashes
        _progress_state["disabled"] = True
        try:
            log.warning(f"[progress-cb] disabled due to exception: {e}")
        except Exception:
            pass
|
| 79 |
|
| 80 |
|
| 81 |
+
# --- Errors ---
class MatAnyError(RuntimeError):
    """Custom exception for MatAnyone processing errors."""
|
| 85 |
|
| 86 |
|
| 87 |
+
# --- CUDA helpers ---
|
| 88 |
def _cuda_snapshot(device: Optional[torch.device] = None) -> str:
|
| 89 |
+
"""
|
| 90 |
+
Return a short, exception-safe string describing CUDA memory on a device.
|
| 91 |
+
"""
|
| 92 |
+
try:
|
| 93 |
+
if not torch.cuda.is_available():
|
| 94 |
+
return "CUDA: N/A"
|
| 95 |
+
idx = 0
|
| 96 |
+
if isinstance(device, torch.device) and device.type == "cuda" and device.index is not None:
|
| 97 |
+
idx = device.index
|
| 98 |
+
name = torch.cuda.get_device_name(idx)
|
| 99 |
+
alloc = torch.cuda.memory_allocated(idx) / (1024 ** 3)
|
| 100 |
+
resv = torch.cuda.memory_reserved(idx) / (1024 ** 3)
|
| 101 |
+
return f"device={idx}, name={name}, alloc={alloc:.2f}GB, reserved={resv:.2f}GB"
|
| 102 |
+
except Exception as e:
|
| 103 |
+
return f"CUDA: snapshot-error: {e!r}"
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
def _safe_empty_cache() -> None:
|
| 107 |
+
"""Try hard to release CUDA cache; never raises."""
|
| 108 |
if not torch.cuda.is_available():
|
| 109 |
+
return
|
| 110 |
+
try:
|
| 111 |
+
torch.cuda.synchronize()
|
| 112 |
+
except Exception:
|
| 113 |
+
pass
|
| 114 |
+
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
torch.cuda.empty_cache()
|
| 116 |
+
except Exception:
|
| 117 |
+
pass
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
def _supports_fp16(device: Optional[torch.device]) -> bool:
|
| 121 |
+
"""
|
| 122 |
+
Best-effort check whether the device can benefit from fp16.
|
| 123 |
+
Returns False for CPU; True for most modern NVIDIA GPUs.
|
| 124 |
+
"""
|
| 125 |
+
if not isinstance(device, torch.device) or device.type != "cuda" or not torch.cuda.is_available():
|
| 126 |
+
return False
|
| 127 |
+
try:
|
| 128 |
+
major, minor = torch.cuda.get_device_capability(device.index or 0)
|
| 129 |
+
# Volta (7.0)+ generally supports fast fp16 paths; T4 is 7.5.
|
| 130 |
+
return (major, minor) >= (7, 0)
|
| 131 |
+
except Exception:
|
| 132 |
+
return True # be optimistic if capability query fails
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
def _ensure_device_usable(device: torch.device) -> None:
|
| 136 |
+
"""
|
| 137 |
+
Validate that the chosen device is actually usable.
|
| 138 |
+
Raise MatAnyError early if CUDA is requested but unavailable.
|
| 139 |
+
"""
|
| 140 |
+
if device.type == "cuda" and not torch.cuda.is_available():
|
| 141 |
+
raise MatAnyError("CUDA device requested but torch.cuda.is_available() == False")
|
| 142 |
+
if device.type not in {"cuda", "cpu"}:
|
| 143 |
+
raise MatAnyError(f"Unsupported device type: {device.type!r}")
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
# --- File & image helpers ---
|
| 147 |
+
def _validate_nonempty(file_path: Path) -> None:
|
| 148 |
+
if (not isinstance(file_path, Path)) or (not file_path.exists()) or file_path.stat().st_size <= 0:
|
| 149 |
+
raise MatAnyError(f"Output file missing/empty: {file_path}")
|
| 150 |
|
| 151 |
|
| 152 |
def _read_mask_hw(mask_path: Path, target_hw: Tuple[int, int]) -> np.ndarray:
    """
    Load a grayscale seed mask as float32 in [0, 1] with shape (H, W).

    Validates the path and image content, then bilinearly resizes to the
    requested target (H, W) when the stored size differs.
    """
    if not isinstance(mask_path, (str, Path)):
        raise MatAnyError(f"Seed mask path must be str/Path, got {type(mask_path)}")
    mask_path = Path(mask_path)
    if not mask_path.exists():
        raise MatAnyError(f"Seed mask not found: {mask_path}")

    mask = cv2.imread(str(mask_path), cv2.IMREAD_GRAYSCALE)
    if mask is None or mask.size == 0:
        raise MatAnyError(f"Failed to read seed mask or empty file: {mask_path}")

    height, width = target_hw
    if mask.ndim != 2:
        raise MatAnyError(f"Seed mask must be single-channel; got shape {mask.shape}")
    if mask.shape[:2] != (height, width):
        mask = cv2.resize(mask, (width, height), interpolation=cv2.INTER_LINEAR)

    return (mask.astype(np.float32) / 255.0).clip(0.0, 1.0)
|
| 175 |
|
| 176 |
|
| 177 |
def _to_chw01(img_bgr: np.ndarray) -> np.ndarray:
    """Convert a BGR uint8 (H, W, 3) frame to RGB float32 (C, H, W) in [0, 1]."""
    valid = (
        isinstance(img_bgr, np.ndarray)
        and img_bgr.dtype == np.uint8
        and img_bgr.ndim == 3
        and img_bgr.shape[2] == 3
    )
    if not valid:
        raise MatAnyError(f"Frame must be uint8 HWC BGR; got {type(img_bgr)}, shape={getattr(img_bgr, 'shape', None)}")
    # BGR -> RGB, scale to unit range, then move channels first.
    scaled = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB).astype(np.float32) / 255.0
    return np.transpose(scaled, (2, 0, 1))  # C, H, W
|
| 187 |
|
| 188 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 189 |
# =============================================================================
|
| 190 |
# CHAPTER 2 β Main session
|
| 191 |
# =============================================================================
|
|
|
|
| 215 |
self._core = None
|
| 216 |
self._api_mode = None
|
| 217 |
self._initialized = False
|
| 218 |
+
self._layout_locked: Optional[str] = None # 'BCHW+B1HW', 'CHW+HW', etc.
|
| 219 |
self._lazy_init()
|
| 220 |
|
| 221 |
log.info(f"Initialized MatAnyoneSession on {self.device} | precision={self.precision}, use_fp16={self.use_fp16}")
|
|
|
|
| 302 |
return torch.amp.autocast(device_type="cuda", enabled=enabled and self.use_fp16)
|
| 303 |
|
| 304 |
# -------------------------------------------------------------------------
|
| 305 |
+
# 2.4 β Frame validation
|
| 306 |
# -------------------------------------------------------------------------
|
| 307 |
def _validate_input_frame(self, frame: np.ndarray) -> None:
|
| 308 |
if not isinstance(frame, np.ndarray):
|
|
|
|
| 312 |
if frame.ndim != 3 or frame.shape[2] != 3:
|
| 313 |
raise MatAnyError(f"Frame must be HWC with 3 channels, got {frame.shape}")
|
| 314 |
|
| 315 |
+
    # -------------------------------------------------------------------------
    # 2.5 — Core call helper with first-frame layout probe (locks after success)
    # -------------------------------------------------------------------------
    def _call_core_frame(self, img_chw: np.ndarray, seed_1hw: Optional[np.ndarray], is_first: bool):
        """
        Calls MatAnyone frame API trying a small set of plausible layouts.
        Locks layout after the first successful call to avoid repeated probing.
        Returns the raw output (torch.Tensor or numpy).
        """
        core = self._core

        # Build base tensors
        img_t_chw = torch.from_numpy(img_chw).to(self.device)  # (3,H,W)
        H, W = img_chw.shape[1], img_chw.shape[2]

        # Seed mask is only supplied on the very first frame of a stream.
        mask_t_hw = None
        if is_first and seed_1hw is not None:
            # Ensure pure HW float32 in [0,1]
            if seed_1hw.ndim == 3 and seed_1hw.shape[0] == 1:
                seed_hw = seed_1hw[0]
            elif seed_1hw.ndim == 2:
                seed_hw = seed_1hw
            else:
                raise MatAnyError(f"seed mask must be 1HW or HW; got {seed_1hw.shape}")
            mask_t_hw = torch.from_numpy(seed_hw.astype(np.float32)).to(self.device)

        def _do_call(layout: str):
            """Dispatch according to a named layout."""
            if layout == "BCHW+B1HW":  # Preferred for many PyTorch models
                img_in = img_t_chw.unsqueeze(0).contiguous()  # (1,3,H,W)
                mask_in = mask_t_hw.unsqueeze(0).unsqueeze(0).contiguous() if mask_t_hw is not None else None  # (1,1,H,W)
            elif layout == "CHW+HW":  # Some APIs accept unbatched tensors
                img_in = img_t_chw  # (3,H,W)
                mask_in = mask_t_hw if mask_t_hw is not None else None  # (H,W)
            elif layout == "BCHW+HW":
                img_in = img_t_chw.unsqueeze(0).contiguous()  # (1,3,H,W)
                mask_in = mask_t_hw if mask_t_hw is not None else None  # (H,W)
            elif layout == "CHW+1HW":
                img_in = img_t_chw  # (3,H,W)
                mask_in = mask_t_hw.unsqueeze(0).contiguous() if mask_t_hw is not None else None  # (1,H,W)
            else:
                raise MatAnyError(f"Unknown layout spec: {layout}")

            if self._api_mode == "step":
                return core.step(img_in, mask_in) if mask_in is not None else core.step(img_in)
            elif self._api_mode == "process_frame":
                return core.process_frame(img_in, mask_in)
            else:
                raise MatAnyError("Internal error: frame dispatch used in non-frame mode")

        # If layout was already found, use it directly
        # NOTE(review): once locked there is deliberately no re-probe; a layout
        # that worked on frame 1 but fails later is surfaced as a hard error.
        if self._layout_locked is not None:
            try:
                return _do_call(self._layout_locked)
            except Exception as e:
                # If a previously-working layout starts failing, surface clear error
                raise MatAnyError(f"MatAnyone call failed with locked layout {self._layout_locked}: {e}")

        # First-frame probe: try a few reasonable layouts in this order
        probe_order = ["BCHW+B1HW", "CHW+HW", "BCHW+HW", "CHW+1HW"]
        last_err: Optional[str] = None

        for layout in probe_order:
            try:
                out = _do_call(layout)
                # Success — lock layout for subsequent frames
                self._layout_locked = layout
                log.info(f"[MATANY] First-frame layout locked: {layout} (H={H}, W={W})")
                return out
            except Exception as e:
                last_err = str(e)
                log.warning(f"[MATANY] Layout attempt failed ({layout}): {last_err}")

        # If we reach here, all attempts failed
        snap = _cuda_snapshot(self.device)
        raise MatAnyError(f"MatAnyone first-frame probe failed for all layouts. Last error: {last_err} | {snap}")
|
| 391 |
+
|
| 392 |
+
# -------------------------------------------------------------------------
|
| 393 |
+
# 2.6 β Frame runner (normalizes output to 2D [0,1])
|
| 394 |
+
# -------------------------------------------------------------------------
|
| 395 |
+
def _run_frame(self, frame_bgr: np.ndarray, seed_1hw: Optional[np.ndarray], is_first: bool) -> np.ndarray:
|
| 396 |
+
"""
|
| 397 |
+
Run a single frame through MatAnyone.
|
| 398 |
+
Returns: alpha matte as 2D np.float32 in [0,1].
|
| 399 |
+
"""
|
| 400 |
+
self._validate_input_frame(frame_bgr)
|
| 401 |
+
|
| 402 |
+
# Image -> CHW float32 [0,1]
|
| 403 |
+
img_chw = _to_chw01(frame_bgr) # (3,H,W)
|
| 404 |
+
|
| 405 |
+
# Dispatch (with autocast + no_grad)
|
| 406 |
try:
|
| 407 |
with torch.no_grad(), self._maybe_amp():
|
| 408 |
+
out = self._call_core_frame(img_chw, seed_1hw, is_first=is_first)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 409 |
except torch.cuda.OutOfMemoryError as e:
|
| 410 |
snap = _cuda_snapshot(self.device)
|
| 411 |
self._log_gpu_memory()
|
| 412 |
raise MatAnyError(f"CUDA OOM while processing frame | {snap}") from e
|
| 413 |
except RuntimeError as e:
|
|
|
|
| 414 |
if "CUDA" in str(e):
|
| 415 |
snap = _cuda_snapshot(self.device)
|
| 416 |
self._log_gpu_memory()
|
|
|
|
| 421 |
|
| 422 |
# Normalize to pure 2D numpy [0,1]
|
| 423 |
if isinstance(out, torch.Tensor):
|
| 424 |
+
alpha_np = out.detach().float().squeeze().cpu().numpy()
|
| 425 |
else:
|
| 426 |
+
alpha_np = np.asarray(out)
|
| 427 |
+
|
| 428 |
+
# Scale if it looks like 0..255
|
| 429 |
+
alpha_np = alpha_np.astype(np.float32)
|
| 430 |
+
if alpha_np.max() > 1.0:
|
| 431 |
+
alpha_np = alpha_np / 255.0
|
| 432 |
|
| 433 |
+
# In case model returns shape like (1,H,W) or (1,1,H,W), squeeze to (H,W)
|
| 434 |
alpha_np = np.squeeze(alpha_np)
|
| 435 |
if alpha_np.ndim != 2:
|
| 436 |
raise MatAnyError(f"Expected 2D alpha matte; got shape {alpha_np.shape}")
|
| 437 |
|
| 438 |
+
# Clamp to [0,1]
|
| 439 |
+
alpha_np = np.clip(alpha_np, 0.0, 1.0).astype(np.float32)
|
| 440 |
+
return alpha_np
|
| 441 |
|
| 442 |
# -------------------------------------------------------------------------
|
| 443 |
+
# 2.7 β process_video harvesting (kept for completeness; not used in forced frame mode)
|
| 444 |
# -------------------------------------------------------------------------
|
| 445 |
def _harvest_process_video_output(self, res, out_dir: Path, base: str) -> Tuple[Path, Path]:
|
| 446 |
"""
|
|
|
|
| 451 |
alpha_mp4 = out_dir / "alpha.mp4"
|
| 452 |
fg_mp4 = out_dir / "fg.mp4"
|
| 453 |
|
| 454 |
+
# Dict style
|
| 455 |
if isinstance(res, dict):
|
| 456 |
cand_alpha = res.get("alpha") or res.get("alpha_path") or res.get("matte") or res.get("matte_path")
|
| 457 |
cand_fg = res.get("fg") or res.get("fg_path") or res.get("foreground") or res.get("foreground_path")
|
|
|
|
| 490 |
raise MatAnyError("MatAnyone.process_video did not yield discoverable output paths.")
|
| 491 |
|
| 492 |
# -------------------------------------------------------------------------
|
| 493 |
+
# 2.8 β Public API: process_stream
|
| 494 |
# -------------------------------------------------------------------------
|
| 495 |
def process_stream(
|
| 496 |
self,
|