MogensR committed on
Commit
7280fe8
·
1 Parent(s): b20702e

Update models/loaders/matanyone_loader.py

Browse files
Files changed (1) hide show
  1. models/loaders/matanyone_loader.py +284 -412
models/loaders/matanyone_loader.py CHANGED
@@ -1,48 +1,258 @@
1
  #!/usr/bin/env python3
2
  """
3
- MatAnyone Model Loader
4
- Handles MatAnyone loading with proper device initialization
 
 
 
 
 
5
  """
6
 
7
  import os
8
  import time
9
  import logging
10
  import traceback
11
- from pathlib import Path
12
- from typing import Optional, Dict, Any
13
 
14
- import torch
15
  import numpy as np
 
16
 
17
  logger = logging.getLogger(__name__)
18
 
19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  class MatAnyoneLoader:
21
- """Dedicated loader for MatAnyone models"""
22
-
23
  def __init__(self, device: str = "cuda", cache_dir: str = "./checkpoints/matanyone_cache"):
24
- self.device = device
25
  self.cache_dir = cache_dir
26
  os.makedirs(self.cache_dir, exist_ok=True)
27
-
28
- self.model = None
29
  self.model_id = "PeiqingYang/MatAnyone"
30
  self.load_time = 0.0
31
-
32
  def load(self) -> Optional[Any]:
33
  """
34
- Load MatAnyone model
35
  Returns:
36
- Loaded model or None
37
  """
38
- logger.info(f"Loading MatAnyone model: {self.model_id}")
39
-
40
- # Try loading strategies in order
41
  strategies = [
42
  ("official", self._load_official),
43
- ("fallback", self._load_fallback)
44
  ]
45
-
46
  for strategy_name, strategy_func in strategies:
47
  try:
48
  logger.info(f"Trying MatAnyone loading strategy: {strategy_name}")
@@ -51,423 +261,85 @@ def load(self) -> Optional[Any]:
51
  if model:
52
  self.load_time = time.time() - start_time
53
  self.model = model
54
- logger.info(f"MatAnyone loaded successfully via {strategy_name} in {self.load_time:.2f}s")
55
  return model
56
  except Exception as e:
57
  logger.error(f"MatAnyone {strategy_name} strategy failed: {e}")
58
  logger.debug(traceback.format_exc())
59
  continue
60
-
61
  logger.error("All MatAnyone loading strategies failed")
62
  return None
63
-
64
  def _load_official(self) -> Optional[Any]:
65
- """Load using official MatAnyone API with comprehensive shape guard"""
66
- from matanyone import InferenceCore
67
-
68
- # Create processor - pass model ID as positional argument
69
- processor = InferenceCore(self.model_id)
70
-
71
- # Install the critical shape guard patch from original loader
72
- self._install_shape_guard(processor)
73
-
74
- return processor
75
-
76
- def _install_shape_guard(self, processor):
77
  """
78
- Install the comprehensive shape guard from the original loader.
79
- This is CRITICAL for preventing 5D tensor issues and ensuring compatibility.
80
  """
81
- import torch
82
- import numpy as np
83
-
84
- device = self.device
85
-
86
- # Helper functions for tensor manipulation
87
- def ensure_image_nchw(img: torch.Tensor, want_batched: bool = True) -> torch.Tensor:
88
- """Ensure image is in NCHW format"""
89
- if isinstance(img, np.ndarray):
90
- img = torch.from_numpy(img)
91
-
92
- img = img.to(device)
93
-
94
- # Handle 5D tensors (B,T,C,H,W) by squeezing time dimension
95
- while img.ndim == 5:
96
- if img.shape[0] == 1:
97
- img = img.squeeze(0)
98
- elif img.shape[1] == 1:
99
- img = img.squeeze(1)
100
- else:
101
- # Can't auto-squeeze, take first time frame
102
- img = img[:, 0]
103
-
104
- # Handle various input formats
105
- if img.ndim == 3:
106
- # CHW or HWC
107
- if img.shape[0] in (1, 3, 4): # Likely CHW
108
- chw = img
109
- elif img.shape[-1] in (1, 3, 4): # Likely HWC
110
- chw = img.permute(2, 0, 1)
111
- else:
112
- # Assume CHW
113
- chw = img
114
-
115
- # Ensure float and normalized
116
- if chw.dtype != torch.float32:
117
- chw = chw.float()
118
- if chw.max() > 1.0:
119
- chw = chw / 255.0
120
-
121
- return chw.unsqueeze(0) if want_batched else chw
122
-
123
- elif img.ndim == 4:
124
- # NCHW or NHWC
125
- N, A, B, C = img.shape
126
- if A in (1, 3, 4): # NCHW
127
- nchw = img
128
- elif C in (1, 3, 4): # NHWC
129
- nchw = img.permute(0, 3, 1, 2)
130
- else:
131
- # Assume NCHW
132
- nchw = img
133
-
134
- # Ensure float and normalized
135
- if nchw.dtype != torch.float32:
136
- nchw = nchw.float()
137
- if nchw.max() > 1.0:
138
- nchw = nchw / 255.0
139
-
140
- return nchw if want_batched else nchw.squeeze(0) if not want_batched and nchw.shape[0] == 1 else nchw[0]
141
-
142
- else:
143
- logger.error(f"Unexpected image dimensions: {img.shape}")
144
- # Return something safe
145
- return torch.zeros((3, 512, 512), device=device, dtype=torch.float32).unsqueeze(0) if want_batched else torch.zeros((3, 512, 512), device=device, dtype=torch.float32)
146
-
147
- def ensure_mask_for_matanyone(mask: torch.Tensor, idx_mask: bool = False,
148
- threshold: float = 0.5, keep_soft: bool = False) -> torch.Tensor:
149
- """Ensure mask is in correct format for MatAnyone"""
150
- if isinstance(mask, np.ndarray):
151
- mask = torch.from_numpy(mask)
152
-
153
- mask = mask.to(device)
154
-
155
- # Handle 5D tensors
156
- if mask.ndim == 5:
157
- if mask.shape[1] == 1:
158
- mask = mask.squeeze(1)
159
- if mask.shape[0] == 1 and mask.ndim == 5:
160
- mask = mask.squeeze(0)
161
-
162
- # Handle index masks
163
- if idx_mask:
164
- if mask.ndim == 3:
165
- if mask.shape[0] == 1:
166
- idx = (mask[0] >= threshold).to(torch.long)
167
- else:
168
- idx = torch.argmax(mask, dim=0).to(torch.long)
169
- idx = (idx > 0).to(torch.long)
170
- elif mask.ndim == 2:
171
- idx = (mask >= threshold).to(torch.long)
172
- else:
173
- logger.warning(f"Unexpected idx mask shape: {mask.shape}")
174
- idx = torch.zeros((512, 512), device=device, dtype=torch.long)
175
- return idx
176
-
177
- # Handle channel masks
178
- if mask.ndim == 2:
179
- out = mask.unsqueeze(0) # Add channel dimension
180
- elif mask.ndim == 3:
181
- if mask.shape[0] == 1:
182
- out = mask
183
  else:
184
- # Choose channel with largest area
185
- areas = mask.sum(dim=(-2, -1))
186
- best_idx = areas.argmax()
187
- out = mask[best_idx:best_idx+1]
188
- else:
189
- logger.warning(f"Unexpected mask shape: {mask.shape}")
190
- out = torch.ones((1, 512, 512), device=device, dtype=torch.float32)
191
-
192
- # Convert to float and normalize
193
- out = out.to(torch.float32)
194
- if not keep_soft:
195
- out = (out >= threshold).to(torch.float32)
196
-
197
- return out.clamp_(0.0, 1.0).contiguous()
198
-
199
- # Create the guarded wrapper
200
- def create_guarded_method(original_method):
201
- """Create a guarded version of a MatAnyone method"""
202
- def guarded_method(*args, **kwargs):
203
- # Extract image and mask
204
- image = kwargs.get("image", None)
205
- mask = kwargs.get("mask", None)
206
- idx_mask = kwargs.get("idx_mask", kwargs.get("index_mask", False))
207
-
208
- # Handle positional arguments
209
- if image is None and len(args) >= 1:
210
- image = args[0]
211
- if mask is None and len(args) >= 2:
212
- mask = args[1]
213
-
214
- if image is None or mask is None:
215
- logger.error(f"MatAnyone called without image/mask: args={len(args)}, kwargs={list(kwargs.keys())}")
216
- # Return something safe
217
- return torch.ones((1, 512, 512), dtype=torch.float32) * 0.5
218
-
219
  try:
220
- # Coerce shapes - ensure we REALLY squeeze out extra dimensions
221
- img_nchw = ensure_image_nchw(image, want_batched=True)
222
-
223
- # CRITICAL FIX: Force squeeze all unnecessary dimensions
224
- while img_nchw.ndim > 4:
225
- if img_nchw.shape[0] == 1:
226
- img_nchw = img_nchw.squeeze(0)
227
- elif img_nchw.shape[1] == 1:
228
- img_nchw = img_nchw.squeeze(1)
229
- else:
230
- break
231
-
232
- if idx_mask:
233
- m_fixed = ensure_mask_for_matanyone(mask, idx_mask=True)
234
- else:
235
- m_fixed = ensure_mask_for_matanyone(mask, idx_mask=False, threshold=0.5)
236
-
237
- # Log actual shapes being passed
238
- logger.debug(f"MatAnyone input - image: {img_nchw.shape}, mask: {m_fixed.shape}, idx: {idx_mask}")
239
-
240
- # For MatAnyone, we need CHW not NCHW for unbatched
241
- if img_nchw.ndim == 4 and img_nchw.shape[0] == 1:
242
- img_chw = img_nchw[0] # Remove batch dimension
243
- else:
244
- img_chw = img_nchw
245
-
246
- # Try unbatched first (most common)
247
- try:
248
- new_kwargs = dict(kwargs)
249
- new_kwargs["image"] = img_chw # CHW
250
- new_kwargs["mask"] = m_fixed.squeeze(0) if m_fixed.ndim > 2 and m_fixed.shape[0] == 1 else m_fixed
251
- new_kwargs["idx_mask"] = bool(idx_mask)
252
-
253
- result = original_method(**new_kwargs)
254
- return result
255
-
256
- except Exception as e1:
257
- logger.debug(f"Unbatched call failed, trying batched: {e1}")
258
- # Try with batch dimension
259
- new_kwargs = dict(kwargs)
260
- new_kwargs["image"] = img_nchw # NCHW
261
- new_kwargs["mask"] = m_fixed
262
- new_kwargs["idx_mask"] = bool(idx_mask)
263
-
264
- result = original_method(**new_kwargs)
265
- return result
266
-
267
- except Exception as e:
268
- logger.error(f"MatAnyone guarded call failed: {e}")
269
- import traceback
270
- logger.debug(traceback.format_exc())
271
- # Return input mask as fallback
272
- if isinstance(mask, torch.Tensor):
273
- return mask.cpu().numpy()
274
- elif isinstance(mask, np.ndarray):
275
- return mask
276
- else:
277
- return np.ones((512, 512), dtype=np.float32) * 0.5
278
-
279
- return guarded_method
280
-
281
- # Apply guard to both step and process methods
282
- if hasattr(processor, 'step'):
283
- original_step = processor.step
284
- processor.step = create_guarded_method(original_step)
285
- logger.info("Installed shape guard on MatAnyone.step")
286
-
287
- if hasattr(processor, 'process'):
288
- original_process = processor.process
289
- processor.process = create_guarded_method(original_process)
290
- logger.info("Installed shape guard on MatAnyone.process")
291
-
292
- def _patch_processor(self, processor):
293
- """
294
- Patch the MatAnyone processor to handle device placement and tensor formats correctly
295
- """
296
- original_step = getattr(processor, 'step', None)
297
- original_process = getattr(processor, 'process', None)
298
-
299
- device = self.device
300
-
301
- def safe_wrapper(*args, **kwargs):
302
- """Universal wrapper that handles both step and process calls"""
303
- try:
304
- # Handle different calling patterns
305
- # Pattern 1: step(image, mask, idx_mask=False)
306
- # Pattern 2: process(image, mask)
307
- # Pattern 3: Called with just args
308
- # Pattern 4: Called with kwargs
309
-
310
- image = None
311
- mask = None
312
- idx_mask = kwargs.get('idx_mask', False)
313
-
314
- # Extract image and mask
315
- if 'image' in kwargs and 'mask' in kwargs:
316
- image = kwargs['image']
317
- mask = kwargs['mask']
318
- elif len(args) >= 2:
319
- image = args[0]
320
- mask = args[1]
321
- if len(args) > 2:
322
- idx_mask = args[2]
323
- elif len(args) == 1:
324
- # Might be called with just mask for refinement
325
- mask = args[0]
326
- # Create dummy image if needed
327
- if isinstance(mask, np.ndarray):
328
- h, w = mask.shape[:2] if mask.ndim >= 2 else (512, 512)
329
- image = np.zeros((h, w, 3), dtype=np.uint8)
330
- elif isinstance(mask, torch.Tensor):
331
- h, w = mask.shape[-2:] if mask.dim() >= 2 else (512, 512)
332
- image = torch.zeros((h, w, 3), dtype=torch.uint8)
333
-
334
- if image is None or mask is None:
335
- logger.error(f"MatAnyone called with invalid args: {len(args)} args, kwargs: {kwargs.keys()}")
336
- # Return something safe
337
- if mask is not None:
338
- return mask
339
- return np.ones((512, 512), dtype=np.float32) * 0.5
340
-
341
- # Convert to tensors on correct device
342
- if isinstance(image, np.ndarray):
343
- image = torch.from_numpy(image).to(device)
344
- elif isinstance(image, torch.Tensor):
345
- image = image.to(device)
346
-
347
- if isinstance(mask, np.ndarray):
348
- mask = torch.from_numpy(mask).to(device)
349
- elif isinstance(mask, torch.Tensor):
350
- mask = mask.to(device)
351
-
352
- # Fix image format (ensure CHW or NCHW)
353
- if image.dim() == 2: # Grayscale HW
354
- image = image.unsqueeze(0) # CHW
355
- elif image.dim() == 3:
356
- # Check if HWC or CHW
357
- if image.shape[-1] in [1, 3, 4]: # HWC
358
- image = image.permute(2, 0, 1) # CHW
359
- # Add batch if needed
360
- if image.shape[0] in [1, 3, 4]: # CHW
361
- image = image.unsqueeze(0) # NCHW
362
- elif image.dim() == 4:
363
- # Already NCHW, ensure correct channel position
364
- if image.shape[-1] in [1, 3, 4]: # NHWC
365
- image = image.permute(0, 3, 1, 2) # NCHW
366
-
367
- # Fix mask format
368
- if mask.dim() == 2:
369
- mask = mask.unsqueeze(0) # Add channel: CHW
370
- elif mask.dim() == 3:
371
- if mask.shape[0] > 4: # Likely HWC
372
- mask = mask.permute(2, 0, 1) # CHW
373
-
374
- # Ensure float and normalized
375
- if image.dtype != torch.float32:
376
- image = image.float()
377
- if not idx_mask and mask.dtype != torch.float32:
378
- mask = mask.float()
379
-
380
- if image.max() > 1.0:
381
- image = image / 255.0
382
- if not idx_mask and mask.max() > 1.0:
383
- mask = mask / 255.0
384
-
385
- # Call original method if it exists
386
- if original_step:
387
- try:
388
- result = original_step(image, mask, idx_mask=idx_mask)
389
- # Convert result back to numpy if needed
390
- if isinstance(result, torch.Tensor):
391
- result = result.cpu().numpy()
392
- return result
393
- except Exception as e:
394
- logger.error(f"MatAnyone original step failed: {e}")
395
-
396
- # Fallback: return slightly processed mask
397
- if isinstance(mask, torch.Tensor):
398
- # Apply slight smoothing
399
- import torch.nn.functional as F
400
- mask = F.avg_pool2d(mask.unsqueeze(0), 3, stride=1, padding=1)
401
- mask = mask.squeeze(0).cpu().numpy()
402
-
403
- return mask
404
-
405
- except Exception as e:
406
- logger.error(f"MatAnyone safe_wrapper failed: {e}")
407
- import traceback
408
- logger.debug(traceback.format_exc())
409
- # Return safe fallback
410
- if 'mask' in locals() and mask is not None:
411
- if isinstance(mask, torch.Tensor):
412
- return mask.cpu().numpy()
413
- return mask
414
- return np.ones((512, 512), dtype=np.float32) * 0.5
415
-
416
- # Apply patches to both methods
417
- if hasattr(processor, 'step'):
418
- processor.step = safe_wrapper
419
- logger.info("Patched MatAnyone step method")
420
-
421
- if hasattr(processor, 'process'):
422
- processor.process = safe_wrapper
423
- logger.info("Patched MatAnyone process method")
424
-
425
- # Also add a direct call method
426
- processor.__call__ = safe_wrapper
427
-
428
- def _load_fallback(self) -> Optional[Any]:
429
- """Create fallback processor for testing"""
430
-
431
- class FallbackMatAnyone:
432
- def __init__(self, device):
433
- self.device = device
434
-
435
- def step(self, image, mask, idx_mask=False, **kwargs):
436
- """Pass through mask with minor smoothing"""
437
- if isinstance(mask, np.ndarray):
438
- # Apply slight Gaussian blur for edge smoothing
439
  import cv2
440
- if mask.ndim == 2:
441
- smoothed = cv2.GaussianBlur(mask, (5, 5), 1.0)
442
- return smoothed
443
- elif mask.ndim == 3:
444
- smoothed = np.zeros_like(mask)
445
- for i in range(mask.shape[0]):
446
- smoothed[i] = cv2.GaussianBlur(mask[i], (5, 5), 1.0)
447
- return smoothed
448
- return mask
449
-
 
 
 
450
  def process(self, image, mask, **kwargs):
451
- """Alias for step"""
452
  return self.step(image, mask, **kwargs)
453
-
454
- logger.warning("Using fallback MatAnyone (limited refinement)")
455
- return FallbackMatAnyone(self.device)
456
-
 
 
 
457
  def cleanup(self):
458
- """Clean up resources"""
459
  if self.model:
460
- del self.model
 
 
 
461
  self.model = None
462
  if torch.cuda.is_available():
463
  torch.cuda.empty_cache()
464
-
465
  def get_info(self) -> Dict[str, Any]:
466
- """Get loader information"""
467
  return {
468
  "loaded": self.model is not None,
469
  "model_id": self.model_id,
470
  "device": self.device,
471
  "load_time": self.load_time,
472
- "model_type": type(self.model).__name__ if self.model else None
473
- }
 
 
 
 
 
1
  #!/usr/bin/env python3
2
  """
3
+ MatAnyone Model Loader (Hardened)
4
+ - Prevents 5D (B,T,C,H,W) tensors from ever reaching conv2d.
5
+ - Normalizes images to BCHW [B,C,H,W] and masks to B1HW [B,1,H,W].
6
+ - If idx_mask=True, converts masks to integer labels (long) safely.
7
+ - Tries unbatched then batched calls for maximum compatibility.
8
+ - Resizes masks with 'nearest' to preserve label integrity.
9
+ - Includes a debug_shapes() helper for quick diagnostics.
10
  """
11
 
12
  import os
13
  import time
14
  import logging
15
  import traceback
16
+ from typing import Optional, Dict, Any, Tuple
 
17
 
 
18
  import numpy as np
19
+ import torch
20
 
21
  logger = logging.getLogger(__name__)
22
 
23
 
24
+ # ------------------------------- Utilities -------------------------------- #
25
+
26
def _select_device(pref: str) -> str:
    """
    Resolve a usable device string from a caller preference.

    "cpu" is honored verbatim; any other value (including "cuda*", None, or
    an unrecognized string) resolves to "cuda" when available, else "cpu".
    """
    normalized = (pref or "").lower()
    if normalized == "cpu":
        return "cpu"
    # Explicit CUDA preference and the default case share the same fallback.
    return "cuda" if torch.cuda.is_available() else "cpu"
36
+
37
+
38
def _as_tensor_on_device(x, device: str) -> torch.Tensor:
    """Return *x* as a torch.Tensor on *device*; array-likes are converted via numpy."""
    if not isinstance(x, torch.Tensor):
        x = torch.from_numpy(np.asarray(x))
    return x.to(device)
43
+
44
+
45
def _to_bchw(x, device: str, is_mask: bool = False) -> torch.Tensor:
    """
    Normalize an array/tensor to BCHW (image) or B1HW (mask) on *device*.

    Accepts HW, HWC, CHW, BCHW, BHWC and 5D clip layouts (BTCHW / BTHWC);
    a time/clip dimension is collapsed by taking the first frame.

    - Images: float32 in [0, 1], shape [B, C, H, W] (C=1 expanded to 3).
    - Masks:  float32 in [0, 1], shape [B, 1, H, W] (extra channels dropped).

    Raises:
        ValueError: for unsupported ranks (ndim < 2 or > 5) or a 5D input
            with an empty time dimension.
    """
    x = _as_tensor_on_device(x, device)

    # Promote to float; uint8 is assumed to be 0..255 and rescaled.
    if x.dtype == torch.uint8:
        x = x.float().div_(255.0)
    elif x.dtype in (torch.bool, torch.int16, torch.int32, torch.int64):
        # bool/integer labels become plain floats without rescaling
        x = x.float()

    # 5D: [B,T,C,H,W] or [B,T,H,W,C] -> drop the clip dimension (first frame).
    if x.ndim == 5:
        if x.shape[1] == 0:
            raise ValueError("Cannot normalize a clip with an empty time dimension")
        x = x[:, 0]  # -> [B,C,H,W] or [B,H,W,C]

    if x.ndim == 4:
        # BHWC -> BCHW, only when the channel position is unambiguous.
        if x.shape[-1] in (1, 3, 4) and x.shape[1] not in (1, 3, 4):
            x = x.permute(0, 3, 1, 2).contiguous()
    elif x.ndim == 3:
        # HWC -> CHW, then add the batch dimension.
        if x.shape[-1] in (1, 3, 4):
            x = x.permute(2, 0, 1).contiguous()
        x = x.unsqueeze(0)
    elif x.ndim == 2:
        x = x.unsqueeze(0).unsqueeze(0)  # -> [1,1,H,W]
        if not is_mask:
            x = x.repeat(1, 3, 1, 1)     # grayscale image -> 3 channels
    else:
        raise ValueError(f"Unsupported tensor ndim={x.ndim} for normalization")

    # x is now BCHW.
    if is_mask:
        if x.shape[1] > 1:
            x = x[:, :1]  # keep a single mask channel
        x = x.to(torch.float32).clamp_(0.0, 1.0)
    else:
        if x.shape[1] == 1:
            x = x.repeat(1, 3, 1, 1)  # expand grayscale to RGB
        x = x.to(torch.float32)
        # Guard numel(): min()/max() raise on empty tensors.
        if x.numel() and (x.min() < 0.0 or x.max() > 1.0):
            x = x.clamp_(0.0, 1.0)

    return x
107
+
108
+
109
def _resize_mask_to(img_bchw: torch.Tensor, mask_b1hw: torch.Tensor) -> torch.Tensor:
    """
    Match the mask's spatial size to the image.

    Nearest-neighbour interpolation is used so label values stay crisp.
    """
    target = img_bchw.shape[-2:]
    if mask_b1hw.shape[-2:] == target:
        return mask_b1hw
    import torch.nn.functional as F
    return F.interpolate(mask_b1hw, size=target, mode="nearest")
117
+
118
+
119
def debug_shapes(tag: str, image, mask) -> None:
    """
    Quick diagnostics: log shape/dtype/min/max for an image/mask pair.

    Any object that cannot be viewed as a tensor is logged by type instead.
    """
    for name, value in (("image", image), ("mask", mask)):
        try:
            t = torch.as_tensor(value)
            lo = float(t.min()) if t.numel() else float("nan")
            hi = float(t.max()) if t.numel() else float("nan")
            logger.info(f"[{tag}:{name}] shape={tuple(t.shape)} dtype={t.dtype} "
                        f"min={lo:.4f} max={hi:.4f}")
        except Exception as e:
            logger.info(f"[{tag}:{name}] type={type(value)} err={e}")
135
+
136
+
137
+ # --------------------------- Boundary Wrapper ------------------------------ #
138
+
139
class _MatAnyoneWrapper:
    """
    Defensive boundary around the MatAnyone InferenceCore.

    All inputs are normalized here so the core never sees tensors above 4D.
    """

    def __init__(self, core: Any, device: str):
        self.core = core
        self.device = device
        # Best effort: relocate the core onto the chosen device when supported.
        try:
            if hasattr(self.core, "to"):
                self.core.to(self.device)
        except Exception as e:
            logger.debug(f"MatAnyone core .to({self.device}) not applied: {e}")

    @staticmethod
    def _to_numpy(x):
        """Detach/copy *x* to a host-side numpy array."""
        return x.detach().cpu().numpy() if isinstance(x, torch.Tensor) else np.asarray(x)

    def _normalize_pair(
        self, image, mask, idx_mask: bool
    ) -> Tuple[torch.Tensor, torch.Tensor, bool]:
        """Return (image as BCHW, mask as B1HW resized to the image, idx_mask as bool)."""
        frame = _to_bchw(image, self.device, is_mask=False)   # [B,C,H,W]
        alpha = _resize_mask_to(frame, _to_bchw(mask, self.device, is_mask=True))  # [B,1,H,W]
        return frame, alpha, bool(idx_mask)

    def __call__(self, image, mask, idx_mask: bool = False, **kwargs):
        """
        Preferred entry point: normalize inputs, then try an unbatched step()
        first and batched step()/process() calls as fallbacks.
        """
        frame, alpha, idx_mask = self._normalize_pair(image, mask, idx_mask)
        single = frame.shape[0] == 1

        if idx_mask:
            # Integer label path: threshold to {0,1} and drop the channel axis.
            labels = (alpha > 0.5).long()[:, 0]  # [B,H,W]
            if single:
                try:
                    if hasattr(self.core, "step"):
                        result = self.core.step(image=frame[0], mask=labels[0],
                                                idx_mask=True, **kwargs)
                        return self._to_numpy(result)
                except Exception as e:
                    logger.debug(f"MatAnyone unbatched idx_mask step() failed: {e}")
            for name in ("step", "process"):
                if not hasattr(self.core, name):
                    continue
                try:
                    result = getattr(self.core, name)(image=frame, mask=labels,
                                                      idx_mask=True, **kwargs)
                    return self._to_numpy(result)
                except Exception as e:
                    logger.debug(f"MatAnyone {name} idx_mask batched call failed: {e}")

            logger.warning("MatAnyone idx_mask calls failed; returning integer mask as fallback.")
            return self._to_numpy(labels if labels.shape[0] > 1 else labels[0])

        # Soft/binary mask path: unbatched first (common CHW / 1HW case).
        try:
            if hasattr(self.core, "step") and single:
                result = self.core.step(image=frame[0], mask=alpha[0],
                                        idx_mask=False, **kwargs)
                return self._to_numpy(result)
        except Exception as e:
            logger.debug(f"MatAnyone unbatched step() failed: {e}")

        for name in ("step", "process"):
            if not hasattr(self.core, name):
                continue
            try:
                result = getattr(self.core, name)(image=frame, mask=alpha,
                                                  idx_mask=False, **kwargs)
                return self._to_numpy(result)
            except Exception as e:
                logger.debug(f"MatAnyone {name} batched call failed: {e}")

        logger.warning("MatAnyone calls failed; returning input mask as fallback.")
        return self._to_numpy(alpha.squeeze(1))  # [B,H,W]
227
+
228
+
229
+ # ------------------------------- Loader ----------------------------------- #
230
+
231
  class MatAnyoneLoader:
232
+ """Dedicated loader for MatAnyone models (with boundary normalization)."""
233
+
234
  def __init__(self, device: str = "cuda", cache_dir: str = "./checkpoints/matanyone_cache"):
235
+ self.device = _select_device(device)
236
  self.cache_dir = cache_dir
237
  os.makedirs(self.cache_dir, exist_ok=True)
238
+
239
+ self.model: Optional[Any] = None
240
  self.model_id = "PeiqingYang/MatAnyone"
241
  self.load_time = 0.0
242
+
243
  def load(self) -> Optional[Any]:
244
  """
245
+ Load MatAnyone model and return a callable wrapper.
246
  Returns:
247
+ _MatAnyoneWrapper or None
248
  """
249
+ logger.info(f"Loading MatAnyone model: {self.model_id} (device={self.device})")
250
+
 
251
  strategies = [
252
  ("official", self._load_official),
253
+ ("fallback", self._load_fallback),
254
  ]
255
+
256
  for strategy_name, strategy_func in strategies:
257
  try:
258
  logger.info(f"Trying MatAnyone loading strategy: {strategy_name}")
 
261
  if model:
262
  self.load_time = time.time() - start_time
263
  self.model = model
264
+ logger.info(f"MatAnyone loaded via {strategy_name} in {self.load_time:.2f}s")
265
  return model
266
  except Exception as e:
267
  logger.error(f"MatAnyone {strategy_name} strategy failed: {e}")
268
  logger.debug(traceback.format_exc())
269
  continue
270
+
271
  logger.error("All MatAnyone loading strategies failed")
272
  return None
273
+
274
  def _load_official(self) -> Optional[Any]:
 
 
 
 
 
 
 
 
 
 
 
 
275
  """
276
+ Load using the official MatAnyone API and wrap with boundary normalizer.
 
277
  """
278
+ try:
279
+ from matanyone import InferenceCore # type: ignore
280
+ except Exception as e:
281
+ logger.error(f"Failed to import official MatAnyone: {e}")
282
+ return None
283
+
284
+ core = InferenceCore(self.model_id)
285
+ wrapped = _MatAnyoneWrapper(core, device=self.device)
286
+ return wrapped
287
+
288
+ def _load_fallback(self) -> Optional[Any]:
289
+ """Create a minimal fallback that smooths/returns the mask."""
290
+
291
+ class _FallbackCore:
292
+ def step(self, image, mask, idx_mask: bool = False, **kwargs):
293
+ # Convert mask to numpy
294
+ if isinstance(mask, torch.Tensor):
295
+ mask_np = mask.detach().cpu().numpy()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
296
  else:
297
+ mask_np = np.asarray(mask)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
298
  try:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
299
  import cv2
300
+ if mask_np.ndim == 2:
301
+ return cv2.GaussianBlur(mask_np, (5, 5), 1.0)
302
+ if mask_np.ndim == 3:
303
+ # Handle CHW-style smoothing (per-channel)
304
+ if mask_np.shape[0] in (1, 3, 4):
305
+ sm = np.empty_like(mask_np)
306
+ for i in range(mask_np.shape[0]):
307
+ sm[i] = cv2.GaussianBlur(mask_np[i], (5, 5), 1.0)
308
+ return sm
309
+ return mask_np
310
+ except Exception:
311
+ return mask_np
312
+
313
  def process(self, image, mask, **kwargs):
 
314
  return self.step(image, mask, **kwargs)
315
+
316
+ logger.warning("Using fallback MatAnyone (limited refinement).")
317
+ core = _FallbackCore()
318
+ return _MatAnyoneWrapper(core, device=self.device)
319
+
320
+ # --------------------------- Housekeeping --------------------------- #
321
+
322
  def cleanup(self):
323
+ """Clean up resources."""
324
  if self.model:
325
+ try:
326
+ del self.model
327
+ except Exception:
328
+ pass
329
  self.model = None
330
  if torch.cuda.is_available():
331
  torch.cuda.empty_cache()
332
+
333
  def get_info(self) -> Dict[str, Any]:
334
+ """Get loader information."""
335
  return {
336
  "loaded": self.model is not None,
337
  "model_id": self.model_id,
338
  "device": self.device,
339
  "load_time": self.load_time,
340
+ "model_type": type(self.model).__name__ if self.model else None,
341
+ }
342
+
343
+ # Optional: instance-level shape debugging hook
344
+ def debug_shapes(self, image, mask, tag: str = ""):
345
+ debug_shapes(tag, image, mask)