Spaces:

MogensR
/

VideoBackgroundReplacer

Paused

App Files Files Community

MogensR committed on Aug 28, 2025

Commit

58a43ef

1 Parent(s): bb9d73e

Update models/loaders/matanyone_loader.py

Browse files

Files changed (1) hide show

models/loaders/matanyone_loader.py +239 -218

models/loaders/matanyone_loader.py CHANGED Viewed

@@ -1,12 +1,10 @@
 #!/usr/bin/env python3
 """
-MatAnyone Model Loader (Hardened v2)
-- Prevents 5D (B,T,C,H,W) tensors from reaching conv2d.
-- Normalizes images to BCHW [B,C,H,W] and masks to B1HW [B,1,H,W].
-- idx_mask=True -> integer label map, but final output still a 2-D [H,W] mask for OpenCV.
-- ALWAYS returns a 2-D, contiguous, float32 mask [H,W] to downstream code.
-- Tries unbatched then batched calls; resizes masks with NEAREST to preserve labels.
-- Includes debug_shapes() for quick diagnostics.
 """
 import os
@@ -17,14 +15,16 @@
 import numpy as np
 import torch
 logger = logging.getLogger(__name__)
-# ------------------------------- Utilities -------------------------------- #
 def _select_device(pref: str) -> str:
-    pref = (pref or "").lower()
     if pref.startswith("cuda"):
         return "cuda" if torch.cuda.is_available() else "cpu"
     if pref == "cpu":
@@ -41,296 +41,317 @@ def _as_tensor_on_device(x, device: str) -> torch.Tensor:
 def _to_bchw(x, device: str, is_mask: bool = False) -> torch.Tensor:
     """
     Normalize input to BCHW (image) or B1HW (mask).
-    Accepts: HWC, CHW, BCHW, BHWC, BTCHW, BTHWC, TCHW, THWC, HW.
     """
     x = _as_tensor_on_device(x, device)
-    # Promote to float and normalize if needed
     if x.dtype == torch.uint8:
         x = x.float().div_(255.0)
     elif x.dtype in (torch.int16, torch.int32, torch.int64):
         x = x.float()
-    # 5D: [B,T,C,H,W] or [B,T,H,W,C] -> take first frame
     if x.ndim == 5:
-        B, T = x.shape[0], x.shape[1]
-        x = x[:, 0] if T > 0 else x.squeeze(1)
-    # 4D
     if x.ndim == 4:
         if x.shape[-1] in (1, 3, 4) and x.shape[1] not in (1, 3, 4):
-            x = x.permute(0, 3, 1, 2).contiguous()  # BHWC -> BCHW
-    # 3D
     elif x.ndim == 3:
-        if x.shape[-1] in (1, 3, 4):               # HWC -> CHW
             x = x.permute(2, 0, 1).contiguous()
-        x = x.unsqueeze(0)                          # -> BCHW
-    # 2D
     elif x.ndim == 2:
-        if is_mask:
-            x = x.unsqueeze(0).unsqueeze(0)         # -> B1HW
-        else:
-            x = x.unsqueeze(0).unsqueeze(0)         # 1,1,H,W
-            x = x.repeat(1, 3, 1, 1)                # 1,3,H,W
     else:
-        raise ValueError(f"Unsupported tensor ndim={x.ndim} for normalization")
-    # Finalize channels / clamp
     if is_mask:
         if x.shape[1] > 1:
             x = x[:, :1]
         x = x.clamp_(0.0, 1.0).to(torch.float32)
     else:
-        C = x.shape[1]
-        if C == 1:
             x = x.repeat(1, 3, 1, 1)
-        if x.min() < 0.0 or x.max() > 1.0:
-            x = x.clamp_(0.0, 1.0)
-        x = x.to(torch.float32)
     return x
 def _resize_mask_to(img_bchw: torch.Tensor, mask_b1hw: torch.Tensor) -> torch.Tensor:
-    """Ensure mask spatial dims match image. Use NEAREST to keep labels crisp."""
     if img_bchw.shape[-2:] == mask_b1hw.shape[-2:]:
         return mask_b1hw
-    import torch.nn.functional as F
     return F.interpolate(mask_b1hw, size=img_bchw.shape[-2:], mode="nearest")
-def debug_shapes(tag: str, image, mask) -> None:
-    """Quick diagnostics: logs shape/dtype/min/max for image/mask."""
-    def _info(name, t):
-        try:
-            tt = torch.as_tensor(t)
-            mn = float(tt.min()) if tt.numel() else float("nan")
-            mx = float(tt.max()) if tt.numel() else float("nan")
-            logger.info(f"[{tag}:{name}] shape={tuple(tt.shape)} dtype={tt.dtype} "
-                        f"min={mn:.4f} max={mx:.4f}")
-        except Exception as e:
-            logger.info(f"[{tag}:{name}] type={type(t)} err={e}")
-    _info("image", image)
-    _info("mask",  mask)
-def _to_2d_numpy_mask(x) -> np.ndarray:
     """
-    Convert any tensor/ndarray mask to a 2-D contiguous float32 array [H,W] in [0,1].
-    Handles inputs like: B1HW, BCHW, 1HW, CHW, HWC, HW, etc.
     """
-    if isinstance(x, torch.Tensor):
-        t = x.detach()
-    else:
-        t = torch.as_tensor(x)
-    # Bring to float in [0,1] if likely 0..255
-    if t.dtype == torch.uint8:
-        t = t.float().div_(255.0)
-    elif t.dtype in (torch.int16, torch.int32, torch.int64):
-        t = t.float()
-    else:
-        t = t.float()
-    # Reduce dimensions to [H,W]
-    if t.ndim == 4:           # e.g., [B, C, H, W]
-        if t.shape[0] > 1:
-            t = t[0]
-        # now [C,H,W]
-        if t.shape[0] > 1:    # multiple channels -> take first (or could mean)
-            t = t[0]
-        else:
-            t = t[0]          # squeeze channel -> [H,W]
-    elif t.ndim == 3:
-        # Could be [1,H,W], [C,H,W], or [H,W,1]
-        if t.shape[0] in (1, 3, 4):     # CHW/1HW
-            t = t[0]                    # -> [H,W] (first channel)
-        elif t.shape[-1] == 1:          # HWC with single channel
-            t = t[..., 0]               # -> [H,W]
         else:
-            # Unknown 3D -> take first slice
-            t = t[0]
-    elif t.ndim == 2:
-        pass  # already [H,W]
-    else:
-        # Any other: try to squeeze to 2-D
-        t = t.squeeze()
-        if t.ndim != 2:
-            # fallback to a tiny neutral mask
-            h = int(t.shape[-2]) if t.ndim >= 2 else 512
-            w = int(t.shape[-1]) if t.ndim >= 2 else 512
-            t = torch.full((h, w), 0.5, dtype=torch.float32)
-    # Clamp and convert to contiguous numpy
     t = t.clamp_(0.0, 1.0)
-    m = t.cpu().numpy().astype(np.float32)
-    return np.ascontiguousarray(m)
-# --------------------------- Boundary Wrapper ------------------------------ #
-class _MatAnyoneWrapper:
-    """
-    Thin, defensive wrapper around the MatAnyone InferenceCore.
-    Normalizes inputs at the boundary and always outputs a 2-D mask for OpenCV.
     """
-    def __init__(self, core: Any, device: str):
         self.core = core
         self.device = device
-        # Try to move the core to device, if supported.
         try:
-            if hasattr(self.core, "to"):
-                self.core.to(self.device)
-        except Exception as e:
-            logger.debug(f"MatAnyone core .to({self.device}) not applied: {e}")
-    def _normalize_pair(
-        self, image, mask, idx_mask: bool
-    ) -> Tuple[torch.Tensor, torch.Tensor, bool]:
-        img_bchw = _to_bchw(image, self.device, is_mask=False)  # [B,C,H,W]
-        msk_b1hw = _to_bchw(mask,  self.device, is_mask=True)   # [B,1,H,W]
-        msk_b1hw = _resize_mask_to(img_bchw, msk_b1hw)
-        return img_bchw, msk_b1hw, bool(idx_mask)
-    def __call__(self, image, mask, idx_mask: bool = False, **kwargs):
         """
-        Entry point: returns a 2-D float32 mask [H,W] for downstream OpenCV.
         """
-        img_bchw, msk_b1hw, idx_mask = self._normalize_pair(image, mask, idx_mask)
-        # idx_mask path -> integer labels; still output 2-D for downstream
-        if idx_mask:
-            m_bhw = (msk_b1hw > 0.5).long()[:, 0]  # [B,H,W]
-            # Try unbatched if B==1
-            if img_bchw.shape[0] == 1:
-                img_chw = img_bchw[0]  # [C,H,W]
-                m_hw   = m_bhw[0]      # [H,W]
-                try:
-                    if hasattr(self.core, "step"):
-                        out = self.core.step(image=img_chw, mask=m_hw, idx_mask=True, **kwargs)
-                        return _to_2d_numpy_mask(out)
-                except Exception as e_unbatched_idx:
-                    logger.debug(f"MatAnyone unbatched idx_mask step() failed: {e_unbatched_idx}")
-            # Batched fallback
-            for method_name in ("step", "process"):
-                try:
-                    if hasattr(self.core, method_name):
-                        method = getattr(self.core, method_name)
-                        out = method(image=img_bchw, mask=m_bhw, idx_mask=True, **kwargs)
-                        return _to_2d_numpy_mask(out)
-                except Exception as e_batched_idx:
-                    logger.debug(f"MatAnyone {method_name} idx_mask batched call failed: {e_batched_idx}")
-            logger.warning("MatAnyone idx_mask calls failed; returning integer mask as fallback.")
-            return _to_2d_numpy_mask(m_bhw)
-        # Non-index mask path (soft/binary)
         try:
-            if hasattr(self.core, "step") and img_bchw.shape[0] == 1:
-                img_chw = img_bchw[0]        # [C,H,W]
-                m_1hw   = msk_b1hw[0]        # [1,H,W]
-                out = self.core.step(image=img_chw, mask=m_1hw, idx_mask=False, **kwargs)
-                return _to_2d_numpy_mask(out)
-        except Exception as e_unbatched:
-            logger.debug(f"MatAnyone unbatched step() failed: {e_unbatched}")
-        # Batched fallback
-        for method_name in ("step", "process"):
             try:
-                if hasattr(self.core, method_name):
-                    method = getattr(self.core, method_name)
-                    out = method(image=img_bchw, mask=msk_b1hw, idx_mask=False, **kwargs)
-                    return _to_2d_numpy_mask(out)
-            except Exception as e_batched:
-                logger.debug(f"MatAnyone {method_name} batched call failed: {e_batched}")
-        logger.warning("MatAnyone calls failed; returning input mask as fallback.")
-        # Return a valid 2-D mask even on total failure
-        return _to_2d_numpy_mask(msk_b1hw)
-# ------------------------------- Loader ----------------------------------- #
 class MatAnyoneLoader:
-    """Dedicated loader for MatAnyone models (with boundary normalization)."""
     def __init__(self, device: str = "cuda", cache_dir: str = "./checkpoints/matanyone_cache"):
         self.device = _select_device(device)
         self.cache_dir = cache_dir
         os.makedirs(self.cache_dir, exist_ok=True)
-        self.model: Optional[Any] = None
         self.model_id = "PeiqingYang/MatAnyone"
         self.load_time = 0.0
-    def load(self) -> Optional[Any]:
         """
-        Load MatAnyone model and return a callable wrapper.
-        Returns: _MatAnyoneWrapper or None
         """
-        logger.info(f"Loading MatAnyone model: {self.model_id} (device={self.device})")
-        strategies = [
-            ("official", self._load_official),
-            ("fallback", self._load_fallback),
         ]
-        for strategy_name, strategy_func in strategies:
             try:
-                logger.info(f"Trying MatAnyone loading strategy: {strategy_name}")
-                start_time = time.time()
-                model = strategy_func()
-                if model:
-                    self.load_time = time.time() - start_time
-                    self.model = model
-                    logger.info(f"MatAnyone loaded via {strategy_name} in {self.load_time:.2f}s")
-                    return model
             except Exception as e:
-                logger.error(f"MatAnyone {strategy_name} strategy failed: {e}")
-                logger.debug(traceback.format_exc())
-                continue
-        logger.error("All MatAnyone loading strategies failed")
-        return None
-    def _load_official(self) -> Optional[Any]:
-        """Load using the official MatAnyone API and wrap with boundary normalizer."""
-        try:
-            from matanyone import InferenceCore  # type: ignore
-        except Exception as e:
-            logger.error(f"Failed to import official MatAnyone: {e}")
-            return None
-        core = InferenceCore(self.model_id)
-        wrapped = _MatAnyoneWrapper(core, device=self.device)
-        return wrapped
-    def _load_fallback(self) -> Optional[Any]:
-        """Create a minimal fallback that smooths/returns the mask."""
-        class _FallbackCore:
-            def step(self, image, mask, idx_mask: bool = False, **kwargs):
-                # Convert to 2-D numpy mask as final step
-                m2d = _to_2d_numpy_mask(mask)
-                try:
-                    import cv2
-                    return cv2.GaussianBlur(m2d, (5, 5), 1.0)
-                except Exception:
-                    return m2d
-            def process(self, image, mask, **kwargs):
-                return self.step(image, mask, **kwargs)
-        logger.warning("Using fallback MatAnyone (limited refinement).")
-        core = _FallbackCore()
-        return _MatAnyoneWrapper(core, device=self.device)
-    # --------------------------- Housekeeping --------------------------- #
     def cleanup(self):
         if self.model:
             try:
                 del self.model
@@ -342,7 +363,7 @@ def cleanup(self):
     def get_info(self) -> Dict[str, Any]:
         return {
-            "loaded": self.model is not None,
             "model_id": self.model_id,
             "device": self.device,
             "load_time": self.load_time,

 #!/usr/bin/env python3
 """
+MatAnyone Loader + Stateful Adapter
+- Loads the official model from Hugging Face.
+- Drives InferenceCore as intended: first-frame encode + warm-up, then propagation.
+- Normalizes inputs so conv2d never sees 5-D tensors.
+- Always outputs a 2-D, contiguous float32 mask [H,W] for OpenCV.
 """
 import os
 import numpy as np
 import torch
+import torch.nn.functional as F
+import inspect
 logger = logging.getLogger(__name__)
+# ------------------------- Shape & dtype utilities ------------------------- #
 def _select_device(pref: str) -> str:
+    pref = (pref or "").lower() if pref else ""
     if pref.startswith("cuda"):
         return "cuda" if torch.cuda.is_available() else "cpu"
     if pref == "cpu":
 def _to_bchw(x, device: str, is_mask: bool = False) -> torch.Tensor:
     """
     Normalize input to BCHW (image) or B1HW (mask).
+    Accepts: HWC, CHW, BCHW, BHWC, BTCHW/BTHWC, TCHW/THWC, HW.
     """
     x = _as_tensor_on_device(x, device)
+    # dtype / range
     if x.dtype == torch.uint8:
         x = x.float().div_(255.0)
     elif x.dtype in (torch.int16, torch.int32, torch.int64):
         x = x.float()
+    # 5D [B,T,*,H,W] or [B,T,H,W,*] -> take first frame
     if x.ndim == 5:
+        x = x[:, 0]  # -> 4D
+    # 4D: BHWC -> BCHW
     if x.ndim == 4:
         if x.shape[-1] in (1, 3, 4) and x.shape[1] not in (1, 3, 4):
+            x = x.permute(0, 3, 1, 2).contiguous()
+    # 3D: HWC -> CHW; add batch
     elif x.ndim == 3:
+        if x.shape[-1] in (1, 3, 4):
             x = x.permute(2, 0, 1).contiguous()
+        x = x.unsqueeze(0)
+    # 2D: add channel & batch
     elif x.ndim == 2:
+        x = x.unsqueeze(0).unsqueeze(0)
+        if not is_mask:
+            x = x.repeat(1, 3, 1, 1)
     else:
+        raise ValueError(f"Unsupported ndim={x.ndim}")
+    # finalize channels
     if is_mask:
         if x.shape[1] > 1:
             x = x[:, :1]
         x = x.clamp_(0.0, 1.0).to(torch.float32)
     else:
+        if x.shape[1] == 1:
             x = x.repeat(1, 3, 1, 1)
+        x = x.clamp_(0.0, 1.0).to(torch.float32)
     return x
+def _to_chw_image(img_bchw: torch.Tensor) -> torch.Tensor:
+    """Prefer CHW for InferenceCore.step."""
+    if img_bchw.ndim == 4 and img_bchw.shape[0] == 1:
+        return img_bchw[0]
+    return img_bchw  # some builds may accept batched; we try CHW first
+def _to_1hw_mask(msk_b1hw: torch.Tensor) -> torch.Tensor:
+    """Non-idx path expects [1,H,W] for single target."""
+    if msk_b1hw is None:
+        return None
+    if msk_b1hw.ndim == 4 and msk_b1hw.shape[1] == 1:
+        return msk_b1hw[0]  # -> [1,H,W]
+    if msk_b1hw.ndim == 3 and msk_b1hw.shape[0] == 1:
+        return msk_b1hw
+    raise ValueError(f"Expected B1HW or 1HW, got {tuple(msk_b1hw.shape)}")
 def _resize_mask_to(img_bchw: torch.Tensor, mask_b1hw: torch.Tensor) -> torch.Tensor:
+    if mask_b1hw is None:
+        return None
     if img_bchw.shape[-2:] == mask_b1hw.shape[-2:]:
         return mask_b1hw
     return F.interpolate(mask_b1hw, size=img_bchw.shape[-2:], mode="nearest")
+def _to_2d_alpha_numpy(x) -> np.ndarray:
     """
+    Convert probabilities/mattes to 2-D float32 [H,W] contiguous.
     """
+    t = torch.as_tensor(x).float()
+    while t.ndim > 2:
+        if t.ndim == 3:
+            t = t[0] if t.shape[0] >= 1 else t.squeeze(0)
         else:
+            t = t.squeeze()
     t = t.clamp_(0.0, 1.0)
+    out = t.detach().cpu().numpy().astype(np.float32)
+    return np.ascontiguousarray(out)
def debug_shapes(tag: str, image, mask) -> None:
    """Log shape, dtype and min/max of an image/mask pair at INFO level.

    Values that cannot be converted with ``torch.as_tensor`` (or whose
    statistics cannot be computed) are logged with their type and the error
    instead; nothing is raised for them.
    """
    for name, value in (("image", image), ("mask", mask)):
        try:
            tv = torch.as_tensor(value)
            if tv.numel():
                mn = float(tv.min())
                mx = float(tv.max())
            else:
                # empty tensors have no min/max
                mn = mx = float("nan")
            logger.info(f"[{tag}:{name}] shape={tuple(tv.shape)} dtype={tv.dtype} min={mn:.4f} max={mx:.4f}")
        except Exception as e:
            logger.info(f"[{tag}:{name}] type={type(value)} err={e}")
+# ------------------------------ Stateful Adapter --------------------------- #
+class _MatAnyoneSession:
     """
+    Minimal stateful controller around InferenceCore.
+    Usage:
+        # frame 0 (has initial coarse mask):
+        alpha0 = session(frame0_rgb, mask0)      # encode + warm-up predict
+        # frames 1..N (no mask):
+        alpha  = session(frame_rgb)              # propagate/refine
+    """
+    def __init__(self, core, device: str):
         self.core = core
         self.device = device
+        self.started = False
+        # discover supported step() kwargs
         try:
+            self._step_sig = inspect.signature(self.core.step)
+            self._has_first_frame_pred = "first_frame_pred" in self._step_sig.parameters
+            self._has_idx_mask = "idx_mask" in self._step_sig.parameters
+        except Exception:
+            self._step_sig = None
+            self._has_first_frame_pred = True
+            self._has_idx_mask = True
+        # discover output conversion helper
+        self._has_prob_to_mask = hasattr(self.core, "output_prob_to_mask")
+    def reset(self):
+        try:
+            if hasattr(self.core, "clear_memory"):
+                self.core.clear_memory()
+        except Exception:
+            pass
+        self.started = False
+    def __call__(self, image, mask=None, **kwargs) -> np.ndarray:
         """
+        Returns a 2-D float32 alpha [H,W] suitable for OpenCV.
+        Expects RGB image in HWC or similar; mask as [H,W] or broadcastable.
         """
+        # Normalize inputs
+        img_bchw = _to_bchw(image, self.device, is_mask=False)   # [B,C,H,W]
+        msk_b1hw = _to_bchw(mask,  self.device, is_mask=True) if mask is not None else None
+        if msk_b1hw is not None:
+            msk_b1hw = _resize_mask_to(img_bchw, msk_b1hw)
+        img_chw = _to_chw_image(img_bchw)
+        m_1hw = _to_1hw_mask(msk_b1hw) if msk_b1hw is not None else None
         try:
+            if not self.started:
+                if m_1hw is None:
+                    logger.warning("First frame arrived without a mask; returning neutral alpha.")
+                    return np.full(img_chw.shape[-2:], 0.5, dtype=np.float32)
+                # 1) Encode target on first frame
+                kwargs1 = {}
+                if self._has_idx_mask:
+                    kwargs1["idx_mask"] = False
+                _ = self.core.step(image=img_chw, mask=m_1hw, **kwargs1)
+                # 2) First-frame warm-up prediction + memorize
+                kwargs2 = {}
+                if self._has_first_frame_pred:
+                    kwargs2["first_frame_pred"] = True
+                out_prob = self.core.step(image=img_chw, **kwargs2)
+                alpha = self._to_alpha(out_prob)
+                self.started = True
+                return _to_2d_alpha_numpy(alpha)
+            # Subsequent frames: propagate without mask
+            out_prob = self.core.step(image=img_chw)
+            alpha = self._to_alpha(out_prob)
+            return _to_2d_alpha_numpy(alpha)
+        except Exception as e:
+            logger.debug(traceback.format_exc())
+            logger.warning(f"MatAnyone call failed; returning input mask as fallback: {e}")
+            if m_1hw is not None:
+                return _to_2d_alpha_numpy(m_1hw)
+            return np.full(img_chw.shape[-2:], 0.5, dtype=np.float32)
+    def _to_alpha(self, out_prob):
+        """
+        Convert core output to alpha. Prefer core.output_prob_to_mask(matting=True) if available.
+        """
+        if self._has_prob_to_mask:
             try:
+                return self.core.output_prob_to_mask(out_prob, matting=True)
+            except Exception:
+                pass
+        # Fallback heuristics
+        t = torch.as_tensor(out_prob).float()
+        if t.ndim == 3 and t.shape[0] >= 1:
+            return t[0]
+        if t.ndim >= 2:
+            return t
+        return torch.full((1, 1), 0.5, dtype=torch.float32, device=t.device if t.is_cuda else "cpu")
+# -------------------------------- Loader ---------------------------------- #
 class MatAnyoneLoader:
+    """
+    Official MatAnyone loader with stateful adapter.
+    """
     def __init__(self, device: str = "cuda", cache_dir: str = "./checkpoints/matanyone_cache"):
         self.device = _select_device(device)
         self.cache_dir = cache_dir
         os.makedirs(self.cache_dir, exist_ok=True)
+        self.model = None        # torch.nn.Module (MatAnyone)
+        self.core = None         # InferenceCore
+        self.adapter = None      # _MatAnyoneSession
         self.model_id = "PeiqingYang/MatAnyone"
         self.load_time = 0.0
+    def _import_model_and_core(self):
         """
+        Import MatAnyone + InferenceCore with resilient fallbacks (different dist layouts).
         """
+        # Try several possible import paths to be robust
+        model_cls = core_cls = None
+        err_msgs = []
+        # Candidates for model class
+        model_paths = [
+            ("matanyone.model.matanyone", "MatAnyone"),
+            ("matanyone", "MatAnyone"),
         ]
+        for mod, cls in model_paths:
+            try:
+                m = __import__(mod, fromlist=[cls])
+                model_cls = getattr(m, cls)
+                break
+            except Exception as e:
+                err_msgs.append(f"model {mod}.{cls}: {e}")
+        # Candidates for InferenceCore
+        core_paths = [
+            ("matanyone.inference.inference_core", "InferenceCore"),
+            ("matanyone", "InferenceCore"),
+        ]
+        for mod, cls in core_paths:
             try:
+                m = __import__(mod, fromlist=[cls])
+                core_cls = getattr(m, cls)
+                break
             except Exception as e:
+                err_msgs.append(f"core  {mod}.{cls}: {e}")
+        if model_cls is None or core_cls is None:
+            msg = " | ".join(err_msgs)
+            raise ImportError(f"Could not import MatAnyone/InferenceCore: {msg}")
+        return model_cls, core_cls
+    def load(self) -> Optional[Any]:
+        """
+        Load MatAnyone and return the stateful callable adapter.
+        """
+        logger.info(f"Loading MatAnyone from HF: {self.model_id} (device={self.device})")
+        start = time.time()
+        try:
+            model_cls, core_cls = self._import_model_and_core()
+            # Official pattern: model -> eval -> core(model, cfg=model.cfg)
+            self.model = model_cls.from_pretrained(self.model_id)
+            self.model = self.model.to(self.device).eval()
+            # Some builds require cfg; fall back if not present
+            try:
+                cfg = getattr(self.model, "cfg", None)
+                if cfg is not None:
+                    self.core = core_cls(self.model, cfg=cfg)
+                else:
+                    self.core = core_cls(self.model)
+            except TypeError:
+                # signature without cfg
+                self.core = core_cls(self.model)
+            # Move core to device if it supports .to
+            try:
+                if hasattr(self.core, "to"):
+                    self.core.to(self.device)
+            except Exception:
+                pass
+            self.adapter = _MatAnyoneSession(self.core, self.device)
+            self.load_time = time.time() - start
+            logger.info(f"MatAnyone loaded in {self.load_time:.2f}s")
+            return self.adapter
+        except Exception as e:
+            logger.error(f"Failed to load MatAnyone: {e}")
+            logger.debug(traceback.format_exc())
+            return None
     def cleanup(self):
+        if self.adapter:
+            try:
+                self.adapter.reset()
+            except Exception:
+                pass
+        self.adapter = None
+        self.core = None
         if self.model:
             try:
                 del self.model
     def get_info(self) -> Dict[str, Any]:
         return {
+            "loaded": self.adapter is not None,
             "model_id": self.model_id,
             "device": self.device,
             "load_time": self.load_time,