Spaces:

MogensR
/

VideoBackgroundReplacer

Paused

App Files Files Community

MogensR committed on Aug 28, 2025

Commit

874e937

1 Parent(s): 28e0f6c

Update models/loaders/matanyone_loader.py

Browse files

Files changed (1) hide show

models/loaders/matanyone_loader.py +75 -103

models/loaders/matanyone_loader.py CHANGED Viewed

@@ -1,10 +1,11 @@
 #!/usr/bin/env python3
 """
-MatAnyone Loader + Stateful Adapter (OOM-resilient)
 - Canonical HF load (MatAnyone.from_pretrained -> InferenceCore(model, cfg))
 - Mixed precision (bf16/fp16) with safe fallback to fp32
 - Autocast + inference_mode around every call
 - Auto downscale with progressive retry on OOM, then upsample alpha back
 - Returns 2-D float32 [H,W] alpha for OpenCV
 """
@@ -42,38 +43,25 @@ def _to_bchw(x, device: str, is_mask: bool = False) -> torch.Tensor:
     Accepts: HWC, CHW, BCHW, BHWC, BTCHW/BTHWC, TCHW/THWC, HW.
     """
     x = _as_tensor_on_device(x, device)
-    # dtype / range
     if x.dtype == torch.uint8:
         x = x.float().div_(255.0)
     elif x.dtype in (torch.int16, torch.int32, torch.int64):
         x = x.float()
-    # 5D -> take first time slice
     if x.ndim == 5:
         x = x[:, 0]  # -> 4D
-    # 4D: BHWC -> BCHW
     if x.ndim == 4:
         if x.shape[-1] in (1, 3, 4) and x.shape[1] not in (1, 3, 4):
             x = x.permute(0, 3, 1, 2).contiguous()
-    # 3D: HWC -> CHW; add batch
     elif x.ndim == 3:
         if x.shape[-1] in (1, 3, 4):
             x = x.permute(2, 0, 1).contiguous()
         x = x.unsqueeze(0)
-    # 2D: add channel & batch
     elif x.ndim == 2:
         x = x.unsqueeze(0).unsqueeze(0)
         if not is_mask:
             x = x.repeat(1, 3, 1, 1)
     else:
         raise ValueError(f"Unsupported ndim={x.ndim}")
-    # finalize channels
     if is_mask:
         if x.shape[1] > 1:
             x = x[:, :1]
@@ -82,7 +70,6 @@ def _to_bchw(x, device: str, is_mask: bool = False) -> torch.Tensor:
         if x.shape[1] == 1:
             x = x.repeat(1, 3, 1, 1)
         x = x.clamp_(0.0, 1.0)
     return x
 def _to_chw_image(img_bchw: torch.Tensor) -> torch.Tensor:
@@ -108,32 +95,24 @@ def _resize_bchw(x: Optional[torch.Tensor], size_hw: Tuple[int, int], is_mask=Fa
     return F.interpolate(x, size=size_hw, mode=mode, align_corners=False if mode == "bilinear" else None)
 def _to_b1hw_alpha(alpha, device: str) -> torch.Tensor:
-    """
-    Convert any plausible alpha/prob output into [1,1,H,W] float in [0,1].
-    Prevents 5D/6D mishaps when upsampling.
-    """
     t = torch.as_tensor(alpha, device=device).float()
     if t.ndim == 2:
         t = t.unsqueeze(0).unsqueeze(0)            # -> [1,1,H,W]
     elif t.ndim == 3:
-        # CHW or 1HW
         if t.shape[0] in (1, 3, 4):
             if t.shape[0] != 1:
-                t = t[:1]                           # keep first channel
-            t = t.unsqueeze(0)                      # -> [1,1,H,W]
-        elif t.shape[-1] in (1, 3, 4):              # HWC (unexpected, but handle)
             t = t[..., :1].permute(2, 0, 1).unsqueeze(0)
         else:
-            # assume [H,W,C?] incompatible → fallback to first dim semantics
             t = t[:1].unsqueeze(0)
     elif t.ndim == 4:
-        # [B,C,H,W] → ensure C=1 and B=1
         if t.shape[1] != 1:
             t = t[:, :1]
         if t.shape[0] != 1:
             t = t[:1]
     else:
-        # squeeze weird shapes down to [1,1,H,W] best-effort
         while t.ndim > 4:
             t = t.squeeze(0)
         while t.ndim < 4:
@@ -213,7 +192,6 @@ def reset(self):
             pass
         self.started = False
-    # ---- helpers ----
     def _compute_scaled_size(self, h: int, w: int) -> Tuple[int, int, float]:
         if h <= 0 or w <= 0:
             return h, w, 1.0
@@ -225,48 +203,41 @@ def _compute_scaled_size(self, h: int, w: int) -> Tuple[int, int, float]:
         return nh, nw, s
     def _to_alpha(self, out_prob):
-        # Prefer library conversion if available
         if self._has_prob_to_mask:
             try:
                 return self.core.output_prob_to_mask(out_prob, matting=True)
             except Exception:
                 pass
         t = torch.as_tensor(out_prob).float()
-        # Normalize common cases to 2-D alpha
-        if t.ndim == 4:                # [B,C,H,W]
             c = 0 if t.shape[1] > 0 else None
             b = 0 if t.shape[0] > 0 else None
             if b is not None and c is not None:
                 return t[b, c]
-        if t.ndim == 3:                # [C,H,W]
             return t[0] if t.shape[0] >= 1 else t.mean(0)
-        return t                        # already 2-D or degenerate -> let caller sanitize
-    # ---- main call ----
     def __call__(self, image, mask=None, **kwargs) -> np.ndarray:
         """
         Returns a 2-D float32 alpha [H,W]. On first call, provide a coarse mask.
         Subsequent calls propagate without a mask.
         """
-        # Boundary normalization
         img_bchw = _to_bchw(image, self.device, is_mask=False)   # [1,C,H,W]
         msk_b1hw = _to_bchw(mask,  self.device, is_mask=True) if mask is not None else None
         H, W = img_bchw.shape[-2], img_bchw.shape[-1]
-        if msk_b1hw is not None:
-            msk_b1hw = _resize_bchw(msk_b1hw, (H, W), is_mask=True)
-        # dtype alignment for activations
         img_bchw = img_bchw.to(self.model_dtype, non_blocking=True)
-        # build a deeper downscale ladder to survive tight VRAM
         nh, nw, s = self._compute_scaled_size(H, W)
         scales = [(nh, nw)]
-        # add progressive reductions until fairly small, but not tiny
         if s < 1.0:
             f = 0.85
             cur_h, cur_w = nh, nw
-            for _ in range(6):  # up to 8 attempts total
                 cur_h = max(128, int(cur_h * f))
                 cur_w = max(128, int(cur_w * f))
                 if (cur_h, cur_w) != scales[-1]:
@@ -278,15 +249,17 @@ def __call__(self, image, mask=None, **kwargs) -> np.ndarray:
         for (th, tw) in scales:
             try:
-                # downscale for inference if needed
                 img_in = _resize_bchw(img_bchw, (th, tw), is_mask=False)
                 msk_in = _resize_bchw(msk_b1hw, (th, tw), is_mask=True) if msk_b1hw is not None else None
-                img_chw = _to_chw_image(img_in).contiguous()                  # [C,H,W]
-                m_1hw  = _to_1hw_mask(msk_in) if msk_in is not None else None # [1,H,W] or None
-                mask_2d = m_1hw[0].contiguous() if m_1hw is not None else None# [H,W] or None
-                # inference with autocast + inference_mode
                 with torch.inference_mode():
                     if self.use_autocast:
                         amp_ctx = torch.cuda.amp.autocast(dtype=self.autocast_dtype)
@@ -295,17 +268,12 @@ class _NoOp:
                             def __enter__(self): return None
                             def __exit__(self, *args): return False
                         amp_ctx = _NoOp()
                     with amp_ctx:
                         if not self.started:
                             if mask_2d is None:
                                 logger.warning("First frame arrived without a mask; returning neutral alpha.")
                                 return np.full((H, W), 0.5, dtype=np.float32)
-                            # encode/memorize — pass 2-D mask (H,W)
                             _ = self.core.step(image=img_chw, mask=mask_2d)
-                            # warm-up predict
                             if self._has_first_frame_pred:
                                 out_prob = self.core.step(image=img_chw, first_frame_pred=True)
                             else:
@@ -316,13 +284,10 @@ def __exit__(self, *args): return False
                             out_prob = self.core.step(image=img_chw)
                             alpha = self._to_alpha(out_prob)
-                # ---- SAFE UPSAMPLE PATH (always 4D -> 2D) ----
                 if (th, tw) != (H, W):
-                    a_b1hw = _to_b1hw_alpha(alpha, device=img_chw.device)             # [1,1,th,tw]
-                    a_b1hw = F.interpolate(a_b1hw, size=(H, W), mode="bilinear", align_corners=False)  # [1,1,H,W]
-                    alpha  = a_b1hw[0, 0]                                             # -> [H,W]
-                # ------------------------------------------------
                 return _to_2d_alpha_numpy(alpha)
             except torch.cuda.OutOfMemoryError as e:
@@ -337,7 +302,6 @@ def __exit__(self, *args): return False
                 torch.cuda.empty_cache()
                 continue
-        # All attempts failed → return fallback
         logger.warning(f"MatAnyone calls failed; returning input mask as fallback. {last_exc}")
         if msk_b1hw is not None:
             return _to_2d_alpha_numpy(msk_b1hw)
@@ -346,51 +310,34 @@ def __exit__(self, *args): return False
 # -------------------------------- Loader ---------------------------------- #
 def _choose_precision(device: str) -> Tuple[torch.dtype, bool, Optional[torch.dtype]]:
-    """
-    Decide model+autocast dtypes.
-    Strategy:
-      - Prefer bf16 autocast if supported (Ampere+), keep weights bf16 if possible.
-      - Else use fp16 autocast, keep weights fp16 if safe.
-      - Else fp32 without autocast.
-    """
     if device != "cuda":
         return torch.float32, False, None
     bf16_ok = hasattr(torch.cuda, "is_bf16_supported") and torch.cuda.is_bf16_supported()
     cc = torch.cuda.get_device_capability() if torch.cuda.is_available() else (0, 0)
     fp16_ok = cc[0] >= 7  # Volta+
     if bf16_ok:
         return torch.bfloat16, True, torch.bfloat16
     if fp16_ok:
         return torch.float16, True, torch.float16
     return torch.float32, False, None
 class MatAnyoneLoader:
     """
     Official MatAnyone loader with stateful, OOM-resilient adapter.
     """
     def __init__(self, device: str = "cuda", cache_dir: str = "./checkpoints/matanyone_cache"):
         self.device = _select_device(device)
         self.cache_dir = cache_dir
         os.makedirs(self.cache_dir, exist_ok=True)
-        self.model = None        # torch.nn.Module (MatAnyone)
-        self.core = None         # InferenceCore
-        self.adapter = None      # _MatAnyoneSession
         self.model_id = "PeiqingYang/MatAnyone"
         self.load_time = 0.0
     def _import_model_and_core(self):
-        """
-        Import MatAnyone + InferenceCore with resilient fallbacks (different dist layouts).
-        """
         model_cls = core_cls = None
         err_msgs = []
-        # Candidates for model class
         for mod, cls in [
             ("matanyone.model.matanyone", "MatAnyone"),
             ("matanyone", "MatAnyone"),
@@ -401,8 +348,6 @@ def _import_model_and_core(self):
                 break
             except Exception as e:
                 err_msgs.append(f"model {mod}.{cls}: {e}")
-        # Candidates for InferenceCore
         for mod, cls in [
             ("matanyone.inference.inference_core", "InferenceCore"),
             ("matanyone", "InferenceCore"),
@@ -413,39 +358,24 @@ def _import_model_and_core(self):
                 break
             except Exception as e:
                 err_msgs.append(f"core  {mod}.{cls}: {e}")
         if model_cls is None or core_cls is None:
             msg = " | ".join(err_msgs)
             raise ImportError(f"Could not import MatAnyone/InferenceCore: {msg}")
         return model_cls, core_cls
     def load(self) -> Optional[Any]:
-        """
-        Load MatAnyone and return the stateful callable adapter.
-        """
         logger.info(f"Loading MatAnyone from HF: {self.model_id} (device={self.device})")
         start = time.time()
         try:
             model_cls, core_cls = self._import_model_and_core()
-            # pick precision strategy
             model_dtype, use_autocast, autocast_dtype = _choose_precision(self.device)
             logger.info(f"MatAnyone precision: weights={model_dtype}, autocast={use_autocast and autocast_dtype}")
-            # Official pattern: model -> eval -> core(model, cfg=model.cfg)
             self.model = model_cls.from_pretrained(self.model_id)
-            # Try to move weights to selected dtype (safe try)
             try:
                 self.model = self.model.to(self.device).to(model_dtype)
             except Exception:
                 self.model = self.model.to(self.device)
-                # keep weights fp32; still benefit from autocast
             self.model.eval()
-            # Some builds require cfg; fall back if not present
             try:
                 cfg = getattr(self.model, "cfg", None)
                 if cfg is not None:
@@ -454,17 +384,13 @@ def load(self) -> Optional[Any]:
                     self.core = core_cls(self.model)
             except TypeError:
                 self.core = core_cls(self.model)
             try:
                 if hasattr(self.core, "to"):
                     self.core.to(self.device)
             except Exception:
                 pass
-            # tune scaling from env (optional)
             max_edge = int(os.environ.get("MATANYONE_MAX_EDGE", "768"))
             target_pixels = int(os.environ.get("MATANYONE_TARGET_PIXELS", "600000"))
             self.adapter = _MatAnyoneSession(
                 self.core,
                 device=self.device,
@@ -474,11 +400,9 @@ def load(self) -> Optional[Any]:
                 max_edge=max_edge,
                 target_pixels=target_pixels,
             )
             self.load_time = time.time() - start
             logger.info(f"MatAnyone loaded in {self.load_time:.2f}s")
             return self.adapter
         except Exception as e:
             logger.error(f"Failed to load MatAnyone: {e}")
             logger.debug(traceback.format_exc())
@@ -505,6 +429,54 @@ def get_info(self) -> Dict[str, Any]:
             "model_type": type(self.model).__name__ if self.model else None,
         }
-    # Optional: instance-level shape debugging
     def debug_shapes(self, image, mask, tag: str = ""):
         debug_shapes(tag, image, mask)

 #!/usr/bin/env python3
 """
+MatAnyone Loader + Stateful Adapter (OOM-resilient, spatially robust)
 - Canonical HF load (MatAnyone.from_pretrained -> InferenceCore(model, cfg))
 - Mixed precision (bf16/fp16) with safe fallback to fp32
 - Autocast + inference_mode around every call
 - Auto downscale with progressive retry on OOM, then upsample alpha back
+- Always aligns mask/image dimensions before inference to avoid all size errors
 - Returns 2-D float32 [H,W] alpha for OpenCV
 """
     Accepts: HWC, CHW, BCHW, BHWC, BTCHW/BTHWC, TCHW/THWC, HW.
     """
     x = _as_tensor_on_device(x, device)
     if x.dtype == torch.uint8:
         x = x.float().div_(255.0)
     elif x.dtype in (torch.int16, torch.int32, torch.int64):
         x = x.float()
     if x.ndim == 5:
         x = x[:, 0]  # -> 4D
     if x.ndim == 4:
         if x.shape[-1] in (1, 3, 4) and x.shape[1] not in (1, 3, 4):
             x = x.permute(0, 3, 1, 2).contiguous()
     elif x.ndim == 3:
         if x.shape[-1] in (1, 3, 4):
             x = x.permute(2, 0, 1).contiguous()
         x = x.unsqueeze(0)
     elif x.ndim == 2:
         x = x.unsqueeze(0).unsqueeze(0)
         if not is_mask:
             x = x.repeat(1, 3, 1, 1)
     else:
         raise ValueError(f"Unsupported ndim={x.ndim}")
     if is_mask:
         if x.shape[1] > 1:
             x = x[:, :1]
         if x.shape[1] == 1:
             x = x.repeat(1, 3, 1, 1)
         x = x.clamp_(0.0, 1.0)
     return x
 def _to_chw_image(img_bchw: torch.Tensor) -> torch.Tensor:
     return F.interpolate(x, size=size_hw, mode=mode, align_corners=False if mode == "bilinear" else None)
 def _to_b1hw_alpha(alpha, device: str) -> torch.Tensor:
     t = torch.as_tensor(alpha, device=device).float()
     if t.ndim == 2:
         t = t.unsqueeze(0).unsqueeze(0)            # -> [1,1,H,W]
     elif t.ndim == 3:
         if t.shape[0] in (1, 3, 4):
             if t.shape[0] != 1:
+                t = t[:1]
+            t = t.unsqueeze(0)
+        elif t.shape[-1] in (1, 3, 4):
             t = t[..., :1].permute(2, 0, 1).unsqueeze(0)
         else:
             t = t[:1].unsqueeze(0)
     elif t.ndim == 4:
         if t.shape[1] != 1:
             t = t[:, :1]
         if t.shape[0] != 1:
             t = t[:1]
     else:
         while t.ndim > 4:
             t = t.squeeze(0)
         while t.ndim < 4:
             pass
         self.started = False
     def _compute_scaled_size(self, h: int, w: int) -> Tuple[int, int, float]:
         if h <= 0 or w <= 0:
             return h, w, 1.0
         return nh, nw, s
     def _to_alpha(self, out_prob):
         if self._has_prob_to_mask:
             try:
                 return self.core.output_prob_to_mask(out_prob, matting=True)
             except Exception:
                 pass
         t = torch.as_tensor(out_prob).float()
+        if t.ndim == 4:
             c = 0 if t.shape[1] > 0 else None
             b = 0 if t.shape[0] > 0 else None
             if b is not None and c is not None:
                 return t[b, c]
+        if t.ndim == 3:
             return t[0] if t.shape[0] >= 1 else t.mean(0)
+        return t
     def __call__(self, image, mask=None, **kwargs) -> np.ndarray:
         """
         Returns a 2-D float32 alpha [H,W]. On first call, provide a coarse mask.
         Subsequent calls propagate without a mask.
         """
         img_bchw = _to_bchw(image, self.device, is_mask=False)   # [1,C,H,W]
         msk_b1hw = _to_bchw(mask,  self.device, is_mask=True) if mask is not None else None
         H, W = img_bchw.shape[-2], img_bchw.shape[-1]
+        # --- Guarantee same shape for mask/image at input resolution ---
+        if msk_b1hw is not None and img_bchw.shape[-2:] != msk_b1hw.shape[-2:]:
+            logger.warning(f"Fixing mask shape: {msk_b1hw.shape[-2:]} → {img_bchw.shape[-2:]}")
+            msk_b1hw = _resize_bchw(msk_b1hw, img_bchw.shape[-2:], is_mask=True)
         img_bchw = img_bchw.to(self.model_dtype, non_blocking=True)
         nh, nw, s = self._compute_scaled_size(H, W)
         scales = [(nh, nw)]
         if s < 1.0:
             f = 0.85
             cur_h, cur_w = nh, nw
+            for _ in range(6):
                 cur_h = max(128, int(cur_h * f))
                 cur_w = max(128, int(cur_w * f))
                 if (cur_h, cur_w) != scales[-1]:
         for (th, tw) in scales:
             try:
                 img_in = _resize_bchw(img_bchw, (th, tw), is_mask=False)
                 msk_in = _resize_bchw(msk_b1hw, (th, tw), is_mask=True) if msk_b1hw is not None else None
+                # --- Guarantee same shape for mask/image at each retry scale ---
+                if msk_in is not None and img_in.shape[-2:] != msk_in.shape[-2:]:
+                    logger.warning(f"Progressive retry: resizing mask from {msk_in.shape[-2:]} to {img_in.shape[-2:]}")
+                    msk_in = _resize_bchw(msk_in, img_in.shape[-2:], is_mask=True)
+                img_chw = _to_chw_image(img_in).contiguous()
+                m_1hw  = _to_1hw_mask(msk_in) if msk_in is not None else None
+                mask_2d = m_1hw[0].contiguous() if m_1hw is not None else None
                 with torch.inference_mode():
                     if self.use_autocast:
                         amp_ctx = torch.cuda.amp.autocast(dtype=self.autocast_dtype)
                             def __enter__(self): return None
                             def __exit__(self, *args): return False
                         amp_ctx = _NoOp()
                     with amp_ctx:
                         if not self.started:
                             if mask_2d is None:
                                 logger.warning("First frame arrived without a mask; returning neutral alpha.")
                                 return np.full((H, W), 0.5, dtype=np.float32)
                             _ = self.core.step(image=img_chw, mask=mask_2d)
                             if self._has_first_frame_pred:
                                 out_prob = self.core.step(image=img_chw, first_frame_pred=True)
                             else:
                             out_prob = self.core.step(image=img_chw)
                             alpha = self._to_alpha(out_prob)
                 if (th, tw) != (H, W):
+                    a_b1hw = _to_b1hw_alpha(alpha, device=img_chw.device)
+                    a_b1hw = torch.nn.functional.interpolate(a_b1hw, size=(H, W), mode="bilinear", align_corners=False)
+                    alpha  = a_b1hw[0, 0]
                 return _to_2d_alpha_numpy(alpha)
             except torch.cuda.OutOfMemoryError as e:
                 torch.cuda.empty_cache()
                 continue
         logger.warning(f"MatAnyone calls failed; returning input mask as fallback. {last_exc}")
         if msk_b1hw is not None:
             return _to_2d_alpha_numpy(msk_b1hw)
 # -------------------------------- Loader ---------------------------------- #
 def _choose_precision(device: str) -> Tuple[torch.dtype, bool, Optional[torch.dtype]]:
     """
     Decide the model weight dtype and the autocast settings for `device`.

     Returns a 3-tuple ``(model_dtype, use_autocast, autocast_dtype)``:
       - device is not exactly "cuda":            (float32,  False, None)
       - CUDA reporting bf16 support:             (bfloat16, True,  bfloat16)
       - CUDA with capability major version >= 7: (float16,  True,  float16)
       - anything else:                           (float32,  False, None)
     """
     # NOTE(review): this is an exact string comparison, so an indexed device
     # such as "cuda:0" also takes the fp32 / no-autocast path — confirm that
     # callers always pass the bare "cuda" string.
     if device != "cuda":
         return torch.float32, False, None
     # hasattr guard: skip the bf16 query if this torch build does not expose it.
     bf16_ok = hasattr(torch.cuda, "is_bf16_supported") and torch.cuda.is_bf16_supported()
     # Falls back to capability (0, 0) when CUDA is not actually available,
     # which makes fp16_ok False below.
     cc = torch.cuda.get_device_capability() if torch.cuda.is_available() else (0, 0)
     fp16_ok = cc[0] >= 7  # Volta+
     # bf16 is checked first, so it wins when both bf16 and fp16 are usable.
     if bf16_ok:
         return torch.bfloat16, True, torch.bfloat16
     if fp16_ok:
         return torch.float16, True, torch.float16
     return torch.float32, False, None
 class MatAnyoneLoader:
     """
     Official MatAnyone loader with stateful, OOM-resilient adapter.
     """
     def __init__(self, device: str = "cuda", cache_dir: str = "./checkpoints/matanyone_cache"):
         self.device = _select_device(device)
         self.cache_dir = cache_dir
         os.makedirs(self.cache_dir, exist_ok=True)
+        self.model = None
+        self.core = None
+        self.adapter = None
         self.model_id = "PeiqingYang/MatAnyone"
         self.load_time = 0.0
     def _import_model_and_core(self):
         model_cls = core_cls = None
         err_msgs = []
         for mod, cls in [
             ("matanyone.model.matanyone", "MatAnyone"),
             ("matanyone", "MatAnyone"),
                 break
             except Exception as e:
                 err_msgs.append(f"model {mod}.{cls}: {e}")
         for mod, cls in [
             ("matanyone.inference.inference_core", "InferenceCore"),
             ("matanyone", "InferenceCore"),
                 break
             except Exception as e:
                 err_msgs.append(f"core  {mod}.{cls}: {e}")
         if model_cls is None or core_cls is None:
             msg = " | ".join(err_msgs)
             raise ImportError(f"Could not import MatAnyone/InferenceCore: {msg}")
         return model_cls, core_cls
     def load(self) -> Optional[Any]:
         logger.info(f"Loading MatAnyone from HF: {self.model_id} (device={self.device})")
         start = time.time()
         try:
             model_cls, core_cls = self._import_model_and_core()
             model_dtype, use_autocast, autocast_dtype = _choose_precision(self.device)
             logger.info(f"MatAnyone precision: weights={model_dtype}, autocast={use_autocast and autocast_dtype}")
             self.model = model_cls.from_pretrained(self.model_id)
             try:
                 self.model = self.model.to(self.device).to(model_dtype)
             except Exception:
                 self.model = self.model.to(self.device)
             self.model.eval()
             try:
                 cfg = getattr(self.model, "cfg", None)
                 if cfg is not None:
                     self.core = core_cls(self.model)
             except TypeError:
                 self.core = core_cls(self.model)
             try:
                 if hasattr(self.core, "to"):
                     self.core.to(self.device)
             except Exception:
                 pass
             max_edge = int(os.environ.get("MATANYONE_MAX_EDGE", "768"))
             target_pixels = int(os.environ.get("MATANYONE_TARGET_PIXELS", "600000"))
             self.adapter = _MatAnyoneSession(
                 self.core,
                 device=self.device,
                 max_edge=max_edge,
                 target_pixels=target_pixels,
             )
             self.load_time = time.time() - start
             logger.info(f"MatAnyone loaded in {self.load_time:.2f}s")
             return self.adapter
         except Exception as e:
             logger.error(f"Failed to load MatAnyone: {e}")
             logger.debug(traceback.format_exc())
             "model_type": type(self.model).__name__ if self.model else None,
         }
     def debug_shapes(self, image, mask, tag: str = ""):
         """
         Instance-level convenience wrapper around the module-level `debug_shapes`.

         Forwards the arguments with `tag` moved to the first position and
         returns None. The bare name below resolves to the module-level
         function, not to this method.
         """
         # presumably reports the shapes of `image` and `mask` — the helper is
         # defined outside this view; verify against its definition.
         debug_shapes(tag, image, mask)
+# -------------------------- Optional: Module-level symbols --------------------------
+__all__ = [
+    "MatAnyoneLoader",
+    "_MatAnyoneSession",
+    "_to_bchw",
+    "_resize_bchw",
+    "_to_chw_image",
+    "_to_1hw_mask",
+    "_to_b1hw_alpha",
+    "_to_2d_alpha_numpy",
+    "debug_shapes"
+]
+# -------------------------- (Optional) Simple CLI for quick testing --------------------------
+if __name__ == "__main__":
+    import sys
+    logging.basicConfig(level=logging.INFO)
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    if len(sys.argv) < 2:
+        print(f"Usage: {sys.argv[0]} image.jpg [mask.png]")
+        sys.exit(1)
+    image_path = sys.argv[1]
+    mask_path = sys.argv[2] if len(sys.argv) > 2 else None
+    img = cv2.imread(image_path, cv2.IMREAD_COLOR)
+    if img is None:
+        print(f"Could not load image {image_path}")
+        sys.exit(2)
+    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+    mask = None
+    if mask_path:
+        mask = cv2.imread(mask_path, cv2.IMREAD_GRAYSCALE)
+        if mask is not None and mask.max() > 1:
+            mask = (mask.astype(np.float32) / 255.0)
+    loader = MatAnyoneLoader(device=device)
+    session = loader.load()
+    if not session:
+        print("Failed to load MatAnyone")
+        sys.exit(3)
+    alpha = session(img_rgb, mask)
+    cv2.imwrite("alpha_out.png", (np.clip(alpha, 0, 1) * 255).astype(np.uint8))
+    print("Alpha matte written to alpha_out.png")