Spaces:

MogensR
/

VideoBackgroundReplacer

Paused

App Files Community

MogensR committed on Aug 29, 2025

Commit

c29dcc4

1 Parent(s): c30a2cc

Update models/loaders/matanyone_loader.py

Browse files

Files changed (1) hide show

models/loaders/matanyone_loader.py +175 -281

models/loaders/matanyone_loader.py CHANGED Viewed

@@ -1,29 +1,49 @@
 #!/usr/bin/env python3
 """
 MatAnyone Loader + Stateful Adapter (OOM-resilient, spatially robust)
-- Canonical HF load (MatAnyone.from_pretrained -> InferenceCore(model, cfg))
 - Mixed precision (bf16/fp16) with safe fallback to fp32
-- Autocast + inference_mode around every call
-- Auto downscale with progressive retry on OOM, then upsample alpha back
-- Always aligns mask/image dimensions before inference to avoid all size errors
-- Returns 2-D float32 [H,W] alpha for OpenCV
 """
 import os
 import time
 import logging
 import traceback
-from typing import Optional, Dict, Any, Tuple
 import numpy as np
 import torch
 import torch.nn.functional as F
 import inspect
-logger = logging.getLogger(__name__)
-# ------------------------- Shape & dtype utilities ------------------------- #
 def _select_device(pref: str) -> str:
     pref = (pref or "").lower()
     if pref.startswith("cuda"):
@@ -32,130 +52,23 @@ def _select_device(pref: str) -> str:
         return "cpu"
     return "cuda" if torch.cuda.is_available() else "cpu"
-def _as_tensor_on_device(x, device: str) -> torch.Tensor:
-    if isinstance(x, torch.Tensor):
-        return x.to(device, non_blocking=True)
-    return torch.from_numpy(np.asarray(x)).to(device, non_blocking=True)
-def _to_bchw(x, device: str, is_mask: bool = False) -> torch.Tensor:
-    """
-    Normalize input to BCHW (image) or B1HW (mask).
-    Accepts: HWC, CHW, BCHW, BHWC, BTCHW/BTHWC, TCHW/THWC, HW.
-    """
-    x = _as_tensor_on_device(x, device)
-    if x.dtype == torch.uint8:
-        x = x.float().div_(255.0)
-    elif x.dtype in (torch.int16, torch.int32, torch.int64):
-        x = x.float()
-    if x.ndim == 5:
-        x = x[:, 0]  # -> 4D
-    if x.ndim == 4:
-        if x.shape[-1] in (1, 3, 4) and x.shape[1] not in (1, 3, 4):
-            x = x.permute(0, 3, 1, 2).contiguous()
-    elif x.ndim == 3:
-        if x.shape[-1] in (1, 3, 4):
-            x = x.permute(2, 0, 1).contiguous()
-        x = x.unsqueeze(0)
-    elif x.ndim == 2:
-        x = x.unsqueeze(0).unsqueeze(0)
-        if not is_mask:
-            x = x.repeat(1, 3, 1, 1)
-    else:
-        raise ValueError(f"Unsupported ndim={x.ndim}")
-    if is_mask:
-        if x.shape[1] > 1:
-            x = x[:, :1]
-        x = x.clamp_(0.0, 1.0).to(torch.float32)
-    else:
-        if x.shape[1] == 1:
-            x = x.repeat(1, 3, 1, 1)
-        x = x.clamp_(0.0, 1.0)
-    return x
-def _to_chw_image(img_bchw: torch.Tensor) -> torch.Tensor:
-    if img_bchw.ndim == 4 and img_bchw.shape[0] == 1:
-        return img_bchw[0]
-    return img_bchw
-def _to_1hw_mask(msk_b1hw: torch.Tensor) -> Optional[torch.Tensor]:
-    if msk_b1hw is None:
-        return None
-    if msk_b1hw.ndim == 4 and msk_b1hw.shape[1] == 1:
-        return msk_b1hw[0]  # -> [1,H,W]
-    if msk_b1hw.ndim == 3 and msk_b1hw.shape[0] == 1:
-        return msk_b1hw
-    raise ValueError(f"Expected B1HW or 1HW, got {tuple(msk_b1hw.shape)}")
-def _resize_bchw(x: Optional[torch.Tensor], size_hw: Tuple[int, int], is_mask=False) -> Optional[torch.Tensor]:
-    if x is None:
-        return None
-    if x.shape[-2:] == size_hw:
-        return x
-    mode = "nearest" if is_mask else "bilinear"
-    return F.interpolate(x, size=size_hw, mode=mode, align_corners=False if mode == "bilinear" else None)
-def _to_b1hw_alpha(alpha, device: str) -> torch.Tensor:
-    t = torch.as_tensor(alpha, device=device).float()
-    if t.ndim == 2:
-        t = t.unsqueeze(0).unsqueeze(0)            # -> [1,1,H,W]
-    elif t.ndim == 3:
-        if t.shape[0] in (1, 3, 4):
-            if t.shape[0] != 1:
-                t = t[:1]
-            t = t.unsqueeze(0)
-        elif t.shape[-1] in (1, 3, 4):
-            t = t[..., :1].permute(2, 0, 1).unsqueeze(0)
-        else:
-            t = t[:1].unsqueeze(0)
-    elif t.ndim == 4:
-        if t.shape[1] != 1:
-            t = t[:, :1]
-        if t.shape[0] != 1:
-            t = t[:1]
-    else:
-        while t.ndim > 4:
-            t = t.squeeze(0)
-        while t.ndim < 4:
-            t = t.unsqueeze(0)
-        if t.shape[1] != 1:
-            t = t[:, :1]
-    return t.clamp_(0.0, 1.0).contiguous()
-def _to_2d_alpha_numpy(x) -> np.ndarray:
-    t = torch.as_tensor(x).float()
-    while t.ndim > 2:
-        if t.ndim == 4 and t.shape[0] == 1 and t.shape[1] == 1:
-            t = t[0, 0]
-        elif t.ndim == 3 and t.shape[0] == 1:
-            t = t[0]
-        else:
-            t = t.squeeze(0)
-    t = t.clamp_(0.0, 1.0)
-    out = t.detach().cpu().numpy().astype(np.float32)
-    return np.ascontiguousarray(out)
-def debug_shapes(tag: str, image, mask) -> None:
-    def _info(name, v):
-        try:
-            tv = torch.as_tensor(v)
-            mn = float(tv.min()) if tv.numel() else float("nan")
-            mx = float(tv.max()) if tv.numel() else float("nan")
-            logger.info(f"[{tag}:{name}] shape={tuple(tv.shape)} dtype={tv.dtype} min={mn:.4f} max={mx:.4f}")
-        except Exception as e:
-            logger.info(f"[{tag}:{name}] type={type(v)} err={e}")
-    _info("image", image)
-    _info("mask", mask)
-# ------------------------------ Stateful Adapter --------------------------- #
 class _MatAnyoneSession:
     """
     Stateful controller around InferenceCore with OOM-resilient inference.
-    Usage:
-        # frame 0 (has mask):
-        alpha0 = session(frame0_rgb01, mask01)
-        # frames 1..N (no mask):
-        alpha  = session(frame_rgb01)
     """
     def __init__(
         self,
@@ -165,7 +78,7 @@ def __init__(
         use_autocast: bool,
         autocast_dtype: Optional[torch.dtype],
         max_edge: int = 768,
-        target_pixels: int = 600_000,   # ~775x775 cap by area
     ):
         self.core = core
         self.device = device
@@ -175,8 +88,9 @@ def __init__(
         self.max_edge = int(max_edge)
         self.target_pixels = int(target_pixels)
         self.started = False
-        # feature detection
         try:
             sig = inspect.signature(self.core.step)
             self._has_first_frame_pred = "first_frame_pred" in sig.parameters
@@ -185,22 +99,26 @@ def __init__(
         self._has_prob_to_mask = hasattr(self.core, "output_prob_to_mask")
     def reset(self):
-        try:
-            if hasattr(self.core, "clear_memory"):
-                self.core.clear_memory()
-        except Exception:
-            pass
-        self.started = False
-    def _compute_scaled_size(self, h: int, w: int) -> Tuple[int, int, float]:
-        if h <= 0 or w <= 0:
-            return h, w, 1.0
-        s1 = min(1.0, self.max_edge / max(h, w))
-        s2 = min(1.0, (self.target_pixels / (h * w)) ** 0.5) if self.target_pixels > 0 else 1.0
-        s = min(s1, s2)
-        nh = max(1, int(round(h * s)))
-        nw = max(1, int(round(w * s)))
-        return nh, nw, s
     def _to_alpha(self, out_prob):
         if self._has_prob_to_mask:
@@ -210,120 +128,99 @@ def _to_alpha(self, out_prob):
                 pass
         t = torch.as_tensor(out_prob).float()
         if t.ndim == 4:
-            c = 0 if t.shape[1] > 0 else None
-            b = 0 if t.shape[0] > 0 else None
-            if b is not None and c is not None:
-                return t[b, c]
         if t.ndim == 3:
             return t[0] if t.shape[0] >= 1 else t.mean(0)
         return t
     def __call__(self, image, mask=None, **kwargs) -> np.ndarray:
         """
-        Returns a 2-D float32 alpha [H,W]. On first call, provide a coarse mask.
-        Subsequent calls propagate without a mask.
         """
-        img_bchw = _to_bchw(image, self.device, is_mask=False)   # [1,C,H,W]
-        msk_b1hw = _to_bchw(mask,  self.device, is_mask=True) if mask is not None else None
-        H, W = img_bchw.shape[-2], img_bchw.shape[-1]
-        # --- Guarantee same shape for mask/image at input resolution ---
-        if msk_b1hw is not None and img_bchw.shape[-2:] != msk_b1hw.shape[-2:]:
-            logger.warning(f"Fixing mask shape: {msk_b1hw.shape[-2:]} → {img_bchw.shape[-2:]}")
-            msk_b1hw = _resize_bchw(msk_b1hw, img_bchw.shape[-2:], is_mask=True)
-        img_bchw = img_bchw.to(self.model_dtype, non_blocking=True)
-        nh, nw, s = self._compute_scaled_size(H, W)
-        scales = [(nh, nw)]
-        if s < 1.0:
-            f = 0.85
-            cur_h, cur_w = nh, nw
-            for _ in range(6):
-                cur_h = max(128, int(cur_h * f))
-                cur_w = max(128, int(cur_w * f))
-                if (cur_h, cur_w) != scales[-1]:
-                    scales.append((cur_h, cur_w))
-                if max(cur_h, cur_w) <= 192 or (cur_h * cur_w) <= 150_000:
-                    break
-        last_exc = None
-        for (th, tw) in scales:
-            try:
-                img_in = _resize_bchw(img_bchw, (th, tw), is_mask=False)
-                msk_in = _resize_bchw(msk_b1hw, (th, tw), is_mask=True) if msk_b1hw is not None else None
-                # --- Guarantee same shape for mask/image at each retry scale ---
-                if msk_in is not None and img_in.shape[-2:] != msk_in.shape[-2:]:
-                    logger.warning(f"Progressive retry: resizing mask from {msk_in.shape[-2:]} to {img_in.shape[-2:]}")
-                    msk_in = _resize_bchw(msk_in, img_in.shape[-2:], is_mask=True)
-                img_chw = _to_chw_image(img_in).contiguous()
-                m_1hw  = _to_1hw_mask(msk_in) if msk_in is not None else None
-                mask_2d = m_1hw[0].contiguous() if m_1hw is not None else None
-                with torch.inference_mode():
-                    if self.use_autocast:
-                        amp_ctx = torch.cuda.amp.autocast(dtype=self.autocast_dtype)
-                    else:
-                        class _NoOp:
-                            def __enter__(self): return None
-                            def __exit__(self, *args): return False
-                        amp_ctx = _NoOp()
-                    with amp_ctx:
-                        if not self.started:
-                            if mask_2d is None:
-                                logger.warning("First frame arrived without a mask; returning neutral alpha.")
-                                return np.full((H, W), 0.5, dtype=np.float32)
-                            _ = self.core.step(image=img_chw, mask=mask_2d)
-                            if self._has_first_frame_pred:
-                                out_prob = self.core.step(image=img_chw, first_frame_pred=True)
                             else:
                                 out_prob = self.core.step(image=img_chw)
-                            alpha = self._to_alpha(out_prob)
-                            self.started = True
-                        else:
-                            out_prob = self.core.step(image=img_chw)
-                            alpha = self._to_alpha(out_prob)
-                if (th, tw) != (H, W):
-                    a_b1hw = _to_b1hw_alpha(alpha, device=img_chw.device)
-                    a_b1hw = torch.nn.functional.interpolate(a_b1hw, size=(H, W), mode="bilinear", align_corners=False)
-                    alpha  = a_b1hw[0, 0]
-                return _to_2d_alpha_numpy(alpha)
-            except torch.cuda.OutOfMemoryError as e:
-                last_exc = e
-                logger.warning(f"MatAnyone OOM at {th}x{tw}; retrying smaller. {e}")
-                torch.cuda.empty_cache()
-                continue
-            except Exception as e:
-                last_exc = e
-                logger.debug(traceback.format_exc())
-                logger.warning(f"MatAnyone call failed at {th}x{tw}; retrying smaller. {e}")
-                torch.cuda.empty_cache()
-                continue
-        logger.warning(f"MatAnyone calls failed; returning input mask as fallback. {last_exc}")
-        if msk_b1hw is not None:
-            return _to_2d_alpha_numpy(msk_b1hw)
-        return np.full((H, W), 0.5, dtype=np.float32)
-# -------------------------------- Loader ---------------------------------- #
-def _choose_precision(device: str) -> Tuple[torch.dtype, bool, Optional[torch.dtype]]:
-    if device != "cuda":
-        return torch.float32, False, None
-    bf16_ok = hasattr(torch.cuda, "is_bf16_supported") and torch.cuda.is_bf16_supported()
-    cc = torch.cuda.get_device_capability() if torch.cuda.is_available() else (0, 0)
-    fp16_ok = cc[0] >= 7  # Volta+
-    if bf16_ok:
-        return torch.bfloat16, True, torch.bfloat16
-    if fp16_ok:
-        return torch.float16, True, torch.float16
-    return torch.float32, False, None
 class MatAnyoneLoader:
     """
-    Official MatAnyone loader with stateful, OOM-resilient adapter.
     """
     def __init__(self, device: str = "cuda", cache_dir: str = "./checkpoints/matanyone_cache"):
         self.device = _select_device(device)
@@ -335,6 +232,7 @@ def __init__(self, device: str = "cuda", cache_dir: str = "./checkpoints/matanyo
         self.model_id = "PeiqingYang/MatAnyone"
         self.load_time = 0.0
     def _import_model_and_core(self):
         model_cls = core_cls = None
         err_msgs = []
@@ -359,36 +257,40 @@ def _import_model_and_core(self):
             except Exception as e:
                 err_msgs.append(f"core  {mod}.{cls}: {e}")
         if model_cls is None or core_cls is None:
-            msg = " | ".join(err_msgs)
-            raise ImportError(f"Could not import MatAnyone/InferenceCore: {msg}")
         return model_cls, core_cls
     def load(self) -> Optional[Any]:
         logger.info(f"Loading MatAnyone from HF: {self.model_id} (device={self.device})")
-        start = time.time()
         try:
             model_cls, core_cls = self._import_model_and_core()
             model_dtype, use_autocast, autocast_dtype = _choose_precision(self.device)
             logger.info(f"MatAnyone precision: weights={model_dtype}, autocast={use_autocast and autocast_dtype}")
             self.model = model_cls.from_pretrained(self.model_id)
             try:
                 self.model = self.model.to(self.device).to(model_dtype)
             except Exception:
                 self.model = self.model.to(self.device)
             self.model.eval()
             try:
                 cfg = getattr(self.model, "cfg", None)
-                if cfg is not None:
-                    self.core = core_cls(self.model, cfg=cfg)
-                else:
-                    self.core = core_cls(self.model)
             except TypeError:
                 self.core = core_cls(self.model)
             try:
                 if hasattr(self.core, "to"):
                     self.core.to(self.device)
             except Exception:
                 pass
             max_edge = int(os.environ.get("MATANYONE_MAX_EDGE", "768"))
             target_pixels = int(os.environ.get("MATANYONE_TARGET_PIXELS", "600000"))
             self.adapter = _MatAnyoneSession(
@@ -400,9 +302,10 @@ def load(self) -> Optional[Any]:
                 max_edge=max_edge,
                 target_pixels=target_pixels,
             )
-            self.load_time = time.time() - start
             logger.info(f"MatAnyone loaded in {self.load_time:.2f}s")
             return self.adapter
         except Exception as e:
             logger.error(f"Failed to load MatAnyone: {e}")
             logger.debug(traceback.format_exc())
@@ -430,24 +333,14 @@ def get_info(self) -> Dict[str, Any]:
         }
     def debug_shapes(self, image, mask, tag: str = ""):
-        debug_shapes(tag, image, mask)
-# -------------------------- Optional: Module-level symbols --------------------------
-__all__ = [
-    "MatAnyoneLoader",
-    "_MatAnyoneSession",
-    "_to_bchw",
-    "_resize_bchw",
-    "_to_chw_image",
-    "_to_1hw_mask",
-    "_to_b1hw_alpha",
-    "_to_2d_alpha_numpy",
-    "debug_shapes"
-]
-# -------------------------- (Optional) Simple CLI for quick testing --------------------------
 if __name__ == "__main__":
     import sys
     import cv2
@@ -457,15 +350,16 @@ def debug_shapes(self, image, mask, tag: str = ""):
     if len(sys.argv) < 2:
         print(f"Usage: {sys.argv[0]} image.jpg [mask.png]")
-        sys.exit(1)
     image_path = sys.argv[1]
-    mask_path = sys.argv[2] if len(sys.argv) > 2 else None
-    img = cv2.imread(image_path, cv2.IMREAD_COLOR)
-    if img is None:
         print(f"Could not load image {image_path}")
-        sys.exit(2)
-    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
     mask = None
     if mask_path:
@@ -477,7 +371,7 @@ def debug_shapes(self, image, mask, tag: str = ""):
     session = loader.load()
     if not session:
         print("Failed to load MatAnyone")
-        sys.exit(3)
     alpha = session(img_rgb, mask)
     cv2.imwrite("alpha_out.png", (np.clip(alpha, 0, 1) * 255).astype(np.uint8))

+from matanyone_loader import MatAnyoneLoader
+import cv2, numpy as np, torch
+# Load session (stateful per video)
+loader = MatAnyoneLoader(device="cuda")
+session = loader.load()
+assert session, "MatAnyone failed to load"
+# Frame 0 (must supply a coarse mask, even a fallback like 0.5 or ones)
+bgr0 = cv2.imread("frame0001.jpg")
+rgb0 = cv2.cvtColor(bgr0, cv2.COLOR_BGR2RGB)
+coarse0 = np.ones((rgb0.shape[0], rgb0.shape[1]), dtype=np.float32)  # example fallback
+alpha0 = session(rgb0, coarse0)   # -> 2-D float32 [H,W]
+# Frames 1..N (mask=None, stateful propagation)
+for i in range(2, 6):
+    bgr = cv2.imread(f"frame000{i}.jpg")
+    rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
+    alpha = session(rgb, mask=None)  # -> 2-D float32 [H,W]
 #!/usr/bin/env python3
+# -*- coding: utf-8 -*-
 """
 MatAnyone Loader + Stateful Adapter (OOM-resilient, spatially robust)
+- Canonical HF load (MatAnyone.from_pretrained → InferenceCore(model, cfg))
 - Mixed precision (bf16/fp16) with safe fallback to fp32
+- torch.autocast(device_type="cuda", dtype=...) + torch.inference_mode()
+- Progressive downscale ladder with graceful fallback
+- Strict image↔mask alignment on every path/scale
+- Returns 2-D float32 [H,W] alpha (OpenCV-friendly)
 """
+from __future__ import annotations
 import os
 import time
 import logging
 import traceback
+from typing import Optional, Dict, Any, Tuple, List
 import numpy as np
 import torch
 import torch.nn.functional as F
 import inspect
+import threading
 def _select_device(pref: str) -> str:
     pref = (pref or "").lower()
     if pref.startswith("cuda"):
         return "cpu"
     return "cuda" if torch.cuda.is_available() else "cpu"
+def _choose_precision(device: str) -> Tuple[torch.dtype, bool, Optional[torch.dtype]]:
+    """Pick model weight dtype + autocast dtype (bf16>fp16>fp32)."""
+    if device != "cuda":
+        return torch.float32, False, None
+    bf16_ok = hasattr(torch.cuda, "is_bf16_supported") and torch.cuda.is_bf16_supported()
+    cc = torch.cuda.get_device_capability() if torch.cuda.is_available() else (0, 0)
+    fp16_ok = cc[0] >= 7  # Volta+
+    if bf16_ok:
+        return torch.bfloat16, True, torch.bfloat16
+    if fp16_ok:
+        return torch.float16, True, torch.float16
+    return torch.float32, False, None
 class _MatAnyoneSession:
     """
     Stateful controller around InferenceCore with OOM-resilient inference.
+    First call MUST supply a coarse mask (we enforce 1HW internally).
     """
     def __init__(
         self,
         use_autocast: bool,
         autocast_dtype: Optional[torch.dtype],
         max_edge: int = 768,
+        target_pixels: int = 600_000,   # ~775x775 by area
     ):
         self.core = core
         self.device = device
         self.max_edge = int(max_edge)
         self.target_pixels = int(target_pixels)
         self.started = False
+        self._lock = threading.Lock()
+        # Introspect optional args
         try:
             sig = inspect.signature(self.core.step)
             self._has_first_frame_pred = "first_frame_pred" in sig.parameters
         self._has_prob_to_mask = hasattr(self.core, "output_prob_to_mask")
     def reset(self):
+        with self._lock:
+            try:
+                if hasattr(self.core, "clear_memory"):
+                    self.core.clear_memory()
+            except Exception:
+                pass
+            self.started = False
+    def _scaled_ladder(self, H: int, W: int) -> List[Tuple[int, int]]:
+        nh, nw, s = _compute_scaled_size(H, W, self.max_edge, self.target_pixels)
+        sizes = [(nh, nw)]
+        if s < 1.0:
+            f_chain = (0.85, 0.70, 0.55, 0.40)
+            cur_h, cur_w = nh, nw
+            for f in f_chain:
+                cur_h = max(128, int(cur_h * f))
+                cur_w = max(128, int(cur_w * f))
+                if sizes[-1] != (cur_h, cur_w):
+                    sizes.append((cur_h, cur_w))
+        return sizes
     def _to_alpha(self, out_prob):
         if self._has_prob_to_mask:
                 pass
         t = torch.as_tensor(out_prob).float()
         if t.ndim == 4:
+            return t[0, 0] if t.shape[1] >= 1 else t[0].mean(0)
         if t.ndim == 3:
             return t[0] if t.shape[0] >= 1 else t.mean(0)
         return t
     def __call__(self, image, mask=None, **kwargs) -> np.ndarray:
         """
+        Returns a 2-D float32 alpha [H,W].
+        - frame 0: provide coarse mask → session initialized
+        - frames 1..N: pass mask=None (propagation)
         """
+        with self._lock:
+            img_bchw = _to_bchw(image, self.device, is_mask=False)   # [1,C,H,W]
+            H, W = img_bchw.shape[-2], img_bchw.shape[-1]
+            img_bchw = img_bchw.to(self.model_dtype, non_blocking=True)
+            # Normalize + align provided mask (if any) to **B1HW** at full res
+            msk_b1hw = _to_bchw(mask, self.device, is_mask=True) if mask is not None else None
+            if msk_b1hw is not None and msk_b1hw.shape[-2:] != (H, W):
+                msk_b1hw = _resize_bchw(msk_b1hw, (H, W), is_mask=True)
+            mask_1hw = _to_1hw_mask(msk_b1hw) if msk_b1hw is not None else None  # ← 1HW!
+            sizes = self._scaled_ladder(H, W)
+            last_exc = None
+            for (th, tw) in sizes:
+                try:
+                    img_in = img_bchw if (th, tw) == (H, W) else F.interpolate(
+                        img_bchw, size=(th, tw), mode="bilinear", align_corners=False
+                    )
+                    msk_in = None
+                    if mask_1hw is not None:
+                        if (th, tw) == (H, W):
+                            msk_in = mask_1hw
+                        else:
+                            msk_in = F.interpolate(mask_1hw.unsqueeze(0), size=(th, tw), mode="nearest")[0]
+                    img_chw = _to_chw_image(img_in).contiguous()  # [C,H,W]
+                    with torch.inference_mode():
+                        if self.use_autocast:
+                            amp_ctx = torch.autocast(device_type="cuda", dtype=self.autocast_dtype)
+                        else:
+                            class _NoOp:
+                                def __enter__(self): return None
+                                def __exit__(self, *a): return False
+                            amp_ctx = _NoOp()
+                        with amp_ctx:
+                            if not self.started:
+                                if msk_in is None:
+                                    # Should not happen when used correctly — still be defensive
+                                    logger.warning("First frame arrived without a mask; returning neutral alpha.")
+                                    return np.full((H, W), 0.5, dtype=np.float32)
+                                # CRITICAL: pass **1HW** to .step(mask=...)
+                                _ = self.core.step(image=img_chw, mask=msk_in)
+                                if self._has_first_frame_pred:
+                                    out_prob = self.core.step(image=img_chw, first_frame_pred=True)
+                                else:
+                                    out_prob = self.core.step(image=img_chw)
+                                self.started = True
                             else:
                                 out_prob = self.core.step(image=img_chw)
+                    alpha = self._to_alpha(out_prob)
+                    # Upsample alpha back if we ran at a smaller scale
+                    if (th, tw) != (H, W):
+                        a_b1hw = _to_b1hw_alpha(alpha, device=img_bchw.device)
+                        a_b1hw = F.interpolate(a_b1hw, size=(H, W), mode="bilinear", align_corners=False)
+                        alpha = a_b1hw[0, 0]
+                    return _to_2d_alpha_numpy(alpha)
+                except torch.cuda.OutOfMemoryError as e:
+                    last_exc = e
+                    torch.cuda.empty_cache()
+                    logger.warning(f"MatAnyone OOM at {th}x{tw}; retrying smaller. {e}")
+                    continue
+                except Exception as e:
+                    last_exc = e
+                    torch.cuda.empty_cache()
+                    logger.debug(traceback.format_exc())
+                    logger.warning(f"MatAnyone call failed at {th}x{tw}; retrying smaller. {e}")
+                    continue
+            logger.warning(f"MatAnyone calls failed; returning input mask or neutral alpha. {last_exc}")
+            if mask_1hw is not None:
+                return _to_2d_alpha_numpy(mask_1hw)
+            return np.full((H, W), 0.5, dtype=np.float32)
 class MatAnyoneLoader:
     """
+    Official MatAnyone loader with stateful, OOM-resilient session adapter.
     """
     def __init__(self, device: str = "cuda", cache_dir: str = "./checkpoints/matanyone_cache"):
         self.device = _select_device(device)
         self.model_id = "PeiqingYang/MatAnyone"
         self.load_time = 0.0
+    # --- Robust imports (works with different packaging layouts) ---
     def _import_model_and_core(self):
         model_cls = core_cls = None
         err_msgs = []
             except Exception as e:
                 err_msgs.append(f"core  {mod}.{cls}: {e}")
         if model_cls is None or core_cls is None:
+            raise ImportError("Could not import MatAnyone / InferenceCore: " + " | ".join(err_msgs))
         return model_cls, core_cls
     def load(self) -> Optional[Any]:
         logger.info(f"Loading MatAnyone from HF: {self.model_id} (device={self.device})")
+        t0 = time.time()
         try:
             model_cls, core_cls = self._import_model_and_core()
             model_dtype, use_autocast, autocast_dtype = _choose_precision(self.device)
             logger.info(f"MatAnyone precision: weights={model_dtype}, autocast={use_autocast and autocast_dtype}")
+            # HF weights (safetensors); keep trust defaults of library itself
             self.model = model_cls.from_pretrained(self.model_id)
             try:
                 self.model = self.model.to(self.device).to(model_dtype)
             except Exception:
                 self.model = self.model.to(self.device)
             self.model.eval()
+            # Inference core (cfg may or may not exist on the model)
             try:
                 cfg = getattr(self.model, "cfg", None)
+                self.core = core_cls(self.model, cfg=cfg) if cfg is not None else core_cls(self.model)
             except TypeError:
                 self.core = core_cls(self.model)
+            # Some versions expose .to(), some don’t — best effort
             try:
                 if hasattr(self.core, "to"):
                     self.core.to(self.device)
             except Exception:
                 pass
+            # Build stateful adapter
             max_edge = int(os.environ.get("MATANYONE_MAX_EDGE", "768"))
             target_pixels = int(os.environ.get("MATANYONE_TARGET_PIXELS", "600000"))
             self.adapter = _MatAnyoneSession(
                 max_edge=max_edge,
                 target_pixels=target_pixels,
             )
+            self.load_time = time.time() - t0
             logger.info(f"MatAnyone loaded in {self.load_time:.2f}s")
             return self.adapter
         except Exception as e:
             logger.error(f"Failed to load MatAnyone: {e}")
             logger.debug(traceback.format_exc())
         }
     def debug_shapes(self, image, mask, tag: str = ""):
+        try:
+            tv_img = torch.as_tensor(image)
+            tv_msk = torch.as_tensor(mask) if mask is not None else None
+            logger.info(f"[{tag}:image] shape={tuple(tv_img.shape)} dtype={tv_img.dtype}")
+            if tv_msk is not None:
+                logger.info(f"[{tag}:mask ] shape={tuple(tv_msk.shape)} dtype={tv_msk.dtype}")
+        except Exception as e:
+            logger.info(f"[{tag}] debug error: {e}")
 if __name__ == "__main__":
     import sys
     import cv2
     if len(sys.argv) < 2:
         print(f"Usage: {sys.argv[0]} image.jpg [mask.png]")
+        raise SystemExit(1)
     image_path = sys.argv[1]
+    mask_path  = sys.argv[2] if len(sys.argv) > 2 else None
+    img_bgr = cv2.imread(image_path, cv2.IMREAD_COLOR)
+    if img_bgr is None:
         print(f"Could not load image {image_path}")
+        raise SystemExit(2)
+    img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
     mask = None
     if mask_path:
     session = loader.load()
     if not session:
         print("Failed to load MatAnyone")
+        raise SystemExit(3)
     alpha = session(img_rgb, mask)
     cv2.imwrite("alpha_out.png", (np.clip(alpha, 0, 1) * 255).astype(np.uint8))