Spaces:

MogensR
/

VideoBackgroundReplacer

Paused

App Files Files Community

MogensR committed on Aug 29, 2025

Commit

183c1c8

1 Parent(s): a41fc30

Update models/loaders/matanyone_loader.py

Browse files

Files changed (1) hide show

models/loaders/matanyone_loader.py +57 -79

models/loaders/matanyone_loader.py CHANGED Viewed

@@ -3,33 +3,30 @@
 """
 MatAnyone Loader + Stateful Adapter (OOM-resilient, spatially robust)
 - Canonical HF load (MatAnyone.from_pretrained → InferenceCore(model, cfg))
-- Mixed precision (bf16/fp16) with safe fallback to fp32
 - torch.autocast(device_type="cuda", dtype=...) + torch.inference_mode()
 - Progressive downscale ladder with graceful fallback
 - Strict image↔mask alignment on every path/scale
 - Returns 2-D float32 [H,W] alpha (OpenCV-friendly)
 """
 from __future__ import annotations
 import os
 import time
 import logging
 import traceback
 from typing import Optional, Dict, Any, Tuple, List
 import numpy as np
 import torch
 import torch.nn.functional as F
 import inspect
 import threading
 logger = logging.getLogger(__name__)
 # ---------------------------------------------------------------------------
 # Utilities (shapes, dtype, scaling)
 # ---------------------------------------------------------------------------
 def _select_device(pref: str) -> str:
     pref = (pref or "").lower()
     if pref.startswith("cuda"):
@@ -37,12 +34,10 @@ def _select_device(pref: str) -> str:
     if pref == "cpu":
         return "cpu"
     return "cuda" if torch.cuda.is_available() else "cpu"
 def _as_tensor_on_device(x, device: str) -> torch.Tensor:
     """Return ``x`` as a tensor placed on ``device``.

     An existing tensor is moved directly. Any other input is first converted
     with ``np.asarray`` and wrapped via ``torch.from_numpy``, so the dtype of
     the result follows the NumPy conversion.
     """
     tensor = x if isinstance(x, torch.Tensor) else torch.from_numpy(np.asarray(x))
     return tensor.to(device, non_blocking=True)
 def _to_bchw(x, device: str, is_mask: bool = False) -> torch.Tensor:
     """
     Normalize input to BCHW (image) or B1HW (mask).
@@ -54,7 +49,7 @@ def _to_bchw(x, device: str, is_mask: bool = False) -> torch.Tensor:
     elif x.dtype in (torch.int16, torch.int32, torch.int64):
         x = x.float()
     if x.ndim == 5:
-        x = x[:, 0]  # -> 4D
     if x.ndim == 4:
         if x.shape[-1] in (1, 3, 4) and x.shape[1] not in (1, 3, 4):
             x = x.permute(0, 3, 1, 2).contiguous()
@@ -77,21 +72,18 @@ def _to_bchw(x, device: str, is_mask: bool = False) -> torch.Tensor:
             x = x.repeat(1, 3, 1, 1)
         x = x.clamp_(0.0, 1.0)
     return x
 def _to_chw_image(img_bchw: torch.Tensor) -> torch.Tensor:
     """Drop the batch axis of a single-image 4-D tensor.

     A 4-D input whose leading dimension is exactly 1 yields its first (only)
     element; every other input is returned untouched.
     """
     is_single_batch = img_bchw.ndim == 4 and img_bchw.shape[0] == 1
     return img_bchw[0] if is_single_batch else img_bchw
 def _to_1hw_mask(msk_b1hw: torch.Tensor) -> Optional[torch.Tensor]:
     # Reduce a mask to 3-D [1, H, W] layout.
     # - None is passed through as None.
     # - A 4-D tensor with a single channel (shape[1] == 1) yields its first
     #   batch element; any further batch elements are dropped.
     # - A 3-D tensor whose leading dimension is 1 is returned unchanged.
     # Raises ValueError for every other shape.
     if msk_b1hw is None:
         return None
     if msk_b1hw.ndim == 4 and msk_b1hw.shape[1] == 1:
-        return msk_b1hw[0]  # -> [1,H,W]
     if msk_b1hw.ndim == 3 and msk_b1hw.shape[0] == 1:
         return msk_b1hw
     raise ValueError(f"Expected B1HW or 1HW, got {tuple(msk_b1hw.shape)}")
 def _resize_bchw(x: Optional[torch.Tensor], size_hw: Tuple[int, int], is_mask: bool = False) -> Optional[torch.Tensor]:
     if x is None:
         return None
@@ -99,11 +91,10 @@ def _resize_bchw(x: Optional[torch.Tensor], size_hw: Tuple[int, int], is_mask: b
         return x
     mode = "nearest" if is_mask else "bilinear"
     return F.interpolate(x, size_hw, mode=mode, align_corners=False if mode == "bilinear" else None)
 def _to_b1hw_alpha(alpha, device: str) -> torch.Tensor:
     t = torch.as_tensor(alpha, device=device).float()
     if t.ndim == 2:
-        t = t.unsqueeze(0).unsqueeze(0)            # -> [1,1,H,W]
     elif t.ndim == 3:
         if t.shape[0] in (1, 3, 4):
             if t.shape[0] != 1:
@@ -126,7 +117,6 @@ def _to_b1hw_alpha(alpha, device: str) -> torch.Tensor:
         if t.shape[1] != 1:
             t = t[:, :1]
     return t.clamp_(0.0, 1.0).contiguous()
 def _to_2d_alpha_numpy(x) -> np.ndarray:
     t = torch.as_tensor(x).float()
     while t.ndim > 2:
@@ -139,17 +129,32 @@ def _to_2d_alpha_numpy(x) -> np.ndarray:
     t = t.clamp_(0.0, 1.0)
     out = t.detach().cpu().numpy().astype(np.float32)
     return np.ascontiguousarray(out)
 def _compute_scaled_size(h: int, w: int, max_edge: int, target_pixels: int) -> Tuple[int, int, float]:
     # Compute a working resolution bounded by a maximum edge length and a
     # maximum pixel count. Returns (nh, nw, s) where s is the scale factor.
     # Non-positive dimensions are returned unchanged with a scale of 1.0.
     if h <= 0 or w <= 0:
         return h, w, 1.0
     # s1 limits the longer side to max_edge; s2 limits the area to target_pixels.
     # Both are capped at 1.0 (never upscale); a non-positive limit disables that cap.
     s1 = min(1.0, float(max_edge) / float(max(h, w))) if max_edge > 0 else 1.0
     s2 = min(1.0, (float(target_pixels) / float(h * w)) ** 0.5) if target_pixels > 0 else 1.0
     # The stricter of the two limits wins.
     s = min(s1, s2)
-    nh = max(1, int(round(h * s)))
-    nw = max(1, int(round(w * s)))
     return nh, nw, s
 def debug_shapes(tag: str, image, mask) -> None:
     def _info(name, v):
         try:
@@ -161,28 +166,24 @@ def _info(name, v):
             logger.info(f"[{tag}:{name}] type={type(v)} err={e}")
     _info("image", image)
     _info("mask", mask)
 # ---------------------------------------------------------------------------
 # Precision selection
 # ---------------------------------------------------------------------------
 def _choose_precision(device: str) -> Tuple[torch.dtype, bool, Optional[torch.dtype]]:
-    """Pick model weight dtype + autocast dtype (bf16>fp16>fp32)."""
     # Returns (weight dtype, whether to use autocast, autocast dtype or None).
     # Any device string other than exactly "cuda" gets fp32 without autocast.
     if device != "cuda":
         return torch.float32, False, None
-    bf16_ok = hasattr(torch.cuda, "is_bf16_supported") and torch.cuda.is_bf16_supported()
     # The capability query is guarded so a CUDA-less host reports (0, 0).
     cc = torch.cuda.get_device_capability() if torch.cuda.is_available() else (0, 0)
     fp16_ok = cc[0] >= 7  # Volta+
     if bf16_ok:
         return torch.bfloat16, True, torch.bfloat16
-    if fp16_ok:
-        return torch.float16, True, torch.float16
     # Neither reduced-precision mode applies: full precision, no autocast.
     return torch.float32, False, None
 # ---------------------------------------------------------------------------
 # Stateful Adapter around InferenceCore
 # ---------------------------------------------------------------------------
 class _MatAnyoneSession:
     """
     Stateful controller around InferenceCore with OOM-resilient inference.
@@ -196,7 +197,7 @@ def __init__(
         use_autocast: bool,
         autocast_dtype: Optional[torch.dtype],
         max_edge: int = 768,
-        target_pixels: int = 600_000,   # ~775x775 by area
     ):
         self.core = core
         self.device = device
@@ -207,7 +208,6 @@ def __init__(
         self.target_pixels = int(target_pixels)
         self.started = False
         self._lock = threading.Lock()
         # Introspect optional args
         try:
             sig = inspect.signature(self.core.step)
@@ -215,7 +215,6 @@ def __init__(
         except Exception:
             self._has_first_frame_pred = True
         self._has_prob_to_mask = hasattr(self.core, "output_prob_to_mask")
     def reset(self):
         with self._lock:
             try:
@@ -224,7 +223,6 @@ def reset(self):
             except Exception:
                 pass
             self.started = False
     def _scaled_ladder(self, H: int, W: int) -> List[Tuple[int, int]]:
         nh, nw, s = _compute_scaled_size(H, W, self.max_edge, self.target_pixels)
         sizes = [(nh, nw)]
@@ -237,7 +235,6 @@ def _scaled_ladder(self, H: int, W: int) -> List[Tuple[int, int]]:
                 if sizes[-1] != (cur_h, cur_w):
                     sizes.append((cur_h, cur_w))
         return sizes
     def _to_alpha(self, out_prob):
         if self._has_prob_to_mask:
             try:
@@ -250,7 +247,6 @@ def _to_alpha(self, out_prob):
         if t.ndim == 3:
             return t[0] if t.shape[0] >= 1 else t.mean(0)
         return t
     def __call__(self, image, mask=None, **kwargs) -> np.ndarray:
         """
         Returns a 2-D float32 alpha [H,W].
@@ -258,19 +254,16 @@ def __call__(self, image, mask=None, **kwargs) -> np.ndarray:
         - frames 1..N: pass mask=None (propagation)
         """
         with self._lock:
-            img_bchw = _to_bchw(image, self.device, is_mask=False)   # [1,C,H,W]
             H, W = img_bchw.shape[-2], img_bchw.shape[-1]
             img_bchw = img_bchw.to(self.model_dtype, non_blocking=True)
             # Normalize + align provided mask (if any) to **B1HW** at full res
             msk_b1hw = _to_bchw(mask, self.device, is_mask=True) if mask is not None else None
             if msk_b1hw is not None and msk_b1hw.shape[-2:] != (H, W):
                 msk_b1hw = _resize_bchw(msk_b1hw, (H, W), is_mask=True)
-            mask_1hw = _to_1hw_mask(msk_b1hw) if msk_b1hw is not None else None  # ← 1HW!
             sizes = self._scaled_ladder(H, W)
             last_exc = None
             for (th, tw) in sizes:
                 try:
                     img_in = img_bchw if (th, tw) == (H, W) else F.interpolate(
@@ -283,9 +276,12 @@ def __call__(self, image, mask=None, **kwargs) -> np.ndarray:
                         else:
                             # nearest to keep binary-like edges
                             msk_in = F.interpolate(mask_1hw.unsqueeze(0), size=(th, tw), mode="nearest")[0]
-                    img_chw = _to_chw_image(img_in).contiguous()  # [C,H,W]
                     with torch.inference_mode():
                         if self.use_autocast:
                             amp_ctx = torch.autocast(device_type="cuda", dtype=self.autocast_dtype)
@@ -294,7 +290,6 @@ class _NoOp:
                                 def __enter__(self): return None
                                 def __exit__(self, *a): return False
                             amp_ctx = _NoOp()
                         with amp_ctx:
                             if not self.started:
                                 if msk_in is None:
@@ -310,17 +305,15 @@ def __exit__(self, *a): return False
                                 self.started = True
                             else:
                                 out_prob = self.core.step(image=img_chw)
                     alpha = self._to_alpha(out_prob)
                     # Upsample alpha back if we ran at a smaller scale
                     if (th, tw) != (H, W):
                         a_b1hw = _to_b1hw_alpha(alpha, device=img_bchw.device)
                         a_b1hw = F.interpolate(a_b1hw, size=(H, W), mode="bilinear", align_corners=False)
                         alpha = a_b1hw[0, 0]
                     return _to_2d_alpha_numpy(alpha)
                 except torch.cuda.OutOfMemoryError as e:
                     last_exc = e
                     torch.cuda.empty_cache()
@@ -332,16 +325,13 @@ def __exit__(self, *a): return False
                     logger.debug(traceback.format_exc())
                     logger.warning(f"MatAnyone call failed at {th}x{tw}; retrying smaller. {e}")
                     continue
             logger.warning(f"MatAnyone calls failed; returning input mask or neutral alpha. {last_exc}")
             if mask_1hw is not None:
                 return _to_2d_alpha_numpy(mask_1hw)
             return np.full((H, W), 0.5, dtype=np.float32)
 # ---------------------------------------------------------------------------
 # Loader
 # ---------------------------------------------------------------------------
 class MatAnyoneLoader:
     """
     Official MatAnyone loader with stateful, OOM-resilient session adapter.
@@ -355,7 +345,6 @@ def __init__(self, device: str = "cuda", cache_dir: str = "./checkpoints/matanyo
         self.adapter = None
         self.model_id = "PeiqingYang/MatAnyone"
         self.load_time = 0.0
     # --- Robust imports (works with different packaging layouts) ---
     def _import_model_and_core(self):
         model_cls = core_cls = None
@@ -379,11 +368,10 @@ def _import_model_and_core(self):
                 core_cls = getattr(m, cls)
                 break
             except Exception as e:
-                err_msgs.append(f"core  {mod}.{cls}: {e}")
         if model_cls is None or core_cls is None:
             raise ImportError("Could not import MatAnyone / InferenceCore: " + " | ".join(err_msgs))
         return model_cls, core_cls
     def load(self) -> Optional[Any]:
         logger.info(f"Loading MatAnyone from HF: {self.model_id} (device={self.device})")
         t0 = time.time()
@@ -391,7 +379,6 @@ def load(self) -> Optional[Any]:
             model_cls, core_cls = self._import_model_and_core()
             model_dtype, use_autocast, autocast_dtype = _choose_precision(self.device)
             logger.info(f"MatAnyone precision: weights={model_dtype}, autocast={use_autocast and autocast_dtype}")
             # HF weights (safetensors)
             self.model = model_cls.from_pretrained(self.model_id)
             try:
@@ -399,21 +386,27 @@ def load(self) -> Optional[Any]:
             except Exception:
                 self.model = self.model.to(self.device)
             self.model.eval()
-            # Inference core (cfg may or may not exist on the model)
             try:
-                cfg = getattr(self.model, "cfg", None)
-                self.core = core_cls(self.model, cfg=cfg) if cfg is not None else core_cls(self.model)
             except TypeError:
                 self.core = core_cls(self.model)
             # Some versions expose .to(), some don’t — best effort
             try:
                 if hasattr(self.core, "to"):
                     self.core.to(self.device)
             except Exception:
                 pass
             # Build stateful adapter
             max_edge = int(os.environ.get("MATANYONE_MAX_EDGE", "768"))
             target_pixels = int(os.environ.get("MATANYONE_TARGET_PIXELS", "600000"))
@@ -429,12 +422,10 @@ def load(self) -> Optional[Any]:
             self.load_time = time.time() - t0
             logger.info(f"MatAnyone loaded in {self.load_time:.2f}s")
             return self.adapter
         except Exception as e:
             logger.error(f"Failed to load MatAnyone: {e}")
             logger.debug(traceback.format_exc())
             return None
     def cleanup(self):
         self.adapter = None
         self.core = None
@@ -446,7 +437,6 @@ def cleanup(self):
             self.model = None
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
     def get_info(self) -> Dict[str, Any]:
         return {
             "loaded": self.adapter is not None,
@@ -455,7 +445,6 @@ def get_info(self) -> Dict[str, Any]:
             "load_time": self.load_time,
             "model_type": type(self.model).__name__ if self.model else None,
         }
     def debug_shapes(self, image, mask, tag: str = ""):
         try:
             tv_img = torch.as_tensor(image)
@@ -465,11 +454,9 @@ def debug_shapes(self, image, mask, tag: str = ""):
                 logger.info(f"[{tag}:mask ] shape={tuple(tv_msk.shape)} dtype={tv_msk.dtype}")
         except Exception as e:
             logger.info(f"[{tag}] debug error: {e}")
 # ---------------------------------------------------------------------------
 # Public symbols
 # ---------------------------------------------------------------------------
 __all__ = [
     "MatAnyoneLoader",
     "_MatAnyoneSession",
@@ -482,43 +469,34 @@ def debug_shapes(self, image, mask, tag: str = ""):
     "_compute_scaled_size",
     "debug_shapes",
 ]
 # ---------------------------------------------------------------------------
 # Optional CLI for quick testing (no circular imports)
 # ---------------------------------------------------------------------------
 if __name__ == "__main__":
     # Demo CLI: compute an alpha matte for one image, optionally seeded with a
     # mask file, and write it to alpha_out.png in the working directory.
     import sys
-    import cv2  # only needed for this demo CLI
     logging.basicConfig(level=logging.INFO)
     device = "cuda" if torch.cuda.is_available() else "cpu"
     # Exit code 1: the required image argument is missing.
     if len(sys.argv) < 2:
         print(f"Usage: {sys.argv[0]} image.jpg [mask.png]")
         raise SystemExit(1)
     image_path = sys.argv[1]
-    mask_path  = sys.argv[2] if len(sys.argv) > 2 else None
     img_bgr = cv2.imread(image_path, cv2.IMREAD_COLOR)
     # Exit code 2: cv2.imread returned None for the image path.
     if img_bgr is None:
         print(f"Could not load image {image_path}")
         raise SystemExit(2)
     # The image is read as BGR and converted to RGB before being passed to the session.
     img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
     mask = None
     if mask_path:
         mask = cv2.imread(mask_path, cv2.IMREAD_GRAYSCALE)
         # Masks with values above 1 are rescaled by 1/255; masks whose
         # maximum is already <= 1 are kept exactly as loaded.
         if mask is not None and mask.max() > 1:
             mask = (mask.astype(np.float32) / 255.0)
     loader = MatAnyoneLoader(device=device)
     session = loader.load()
     # Exit code 3: the loader returned a falsy session.
     if not session:
         print("Failed to load MatAnyone")
         raise SystemExit(3)
     # With no usable mask (none given, or the mask file failed to load) an all-ones mask is used.
     alpha = session(img_rgb, mask if mask is not None else np.ones(img_rgb.shape[:2], np.float32))
     # The alpha is clipped to [0, 1] and written as 8-bit; imwrite's return value is not checked.
     cv2.imwrite("alpha_out.png", (np.clip(alpha, 0, 1) * 255).astype(np.uint8))
-    print("Alpha matte written to alpha_out.png")

 """
 MatAnyone Loader + Stateful Adapter (OOM-resilient, spatially robust)
 - Canonical HF load (MatAnyone.from_pretrained → InferenceCore(model, cfg))
+- Mixed precision (fp16 preferred over bf16) with safe fallback to fp32
 - torch.autocast(device_type="cuda", dtype=...) + torch.inference_mode()
 - Progressive downscale ladder with graceful fallback
 - Strict image↔mask alignment on every path/scale
 - Returns 2-D float32 [H,W] alpha (OpenCV-friendly)
+- Added: Force chunk_size=1, flip_aug=False in cfg to avoid dim mismatches
+- Added: Pad to multiple of 16 to avoid transformer patch issues
+- Added: Prefer fp16 over bf16 for Tesla T4 compatibility
 """
 from __future__ import annotations
 import os
 import time
 import logging
 import traceback
 from typing import Optional, Dict, Any, Tuple, List
 import numpy as np
 import torch
 import torch.nn.functional as F
 import inspect
 import threading
 logger = logging.getLogger(__name__)
 # ---------------------------------------------------------------------------
 # Utilities (shapes, dtype, scaling)
 # ---------------------------------------------------------------------------
 def _select_device(pref: str) -> str:
     pref = (pref or "").lower()
     if pref.startswith("cuda"):
     if pref == "cpu":
         return "cpu"
     return "cuda" if torch.cuda.is_available() else "cpu"
 def _as_tensor_on_device(x, device: str) -> torch.Tensor:
     """Place ``x`` on ``device``, converting it to a tensor when necessary.

     Tensors are transferred as they are. Non-tensor inputs go through
     ``np.asarray`` followed by ``torch.from_numpy`` before the transfer.
     """
     if not isinstance(x, torch.Tensor):
         x = torch.from_numpy(np.asarray(x))
     moved = x.to(device, non_blocking=True)
     return moved
 def _to_bchw(x, device: str, is_mask: bool = False) -> torch.Tensor:
     """
     Normalize input to BCHW (image) or B1HW (mask).
     elif x.dtype in (torch.int16, torch.int32, torch.int64):
         x = x.float()
     if x.ndim == 5:
+        x = x[:, 0] # -> 4D
     if x.ndim == 4:
         if x.shape[-1] in (1, 3, 4) and x.shape[1] not in (1, 3, 4):
             x = x.permute(0, 3, 1, 2).contiguous()
             x = x.repeat(1, 3, 1, 1)
         x = x.clamp_(0.0, 1.0)
     return x
 def _to_chw_image(img_bchw: torch.Tensor) -> torch.Tensor:
     """Return the single image of a batch-of-one 4-D tensor without its batch axis.

     Inputs that are not 4-D, or whose batch dimension is not 1, are returned
     unchanged.
     """
     if img_bchw.ndim != 4:
         return img_bchw
     if img_bchw.shape[0] != 1:
         return img_bchw
     return img_bchw[0]
 def _to_1hw_mask(msk_b1hw: torch.Tensor) -> Optional[torch.Tensor]:
     # Reduce a mask to 3-D [1, H, W] layout.
     # - None is passed through as None.
     # - A 4-D tensor with a single channel (shape[1] == 1) yields its first
     #   batch element; any further batch elements are dropped.
     # - A 3-D tensor whose leading dimension is 1 is returned unchanged.
     # Raises ValueError for every other shape.
     if msk_b1hw is None:
         return None
     if msk_b1hw.ndim == 4 and msk_b1hw.shape[1] == 1:
+        return msk_b1hw[0] # -> [1,H,W]
     if msk_b1hw.ndim == 3 and msk_b1hw.shape[0] == 1:
         return msk_b1hw
     raise ValueError(f"Expected B1HW or 1HW, got {tuple(msk_b1hw.shape)}")
 def _resize_bchw(x: Optional[torch.Tensor], size_hw: Tuple[int, int], is_mask: bool = False) -> Optional[torch.Tensor]:
     if x is None:
         return None
         return x
     mode = "nearest" if is_mask else "bilinear"
     return F.interpolate(x, size_hw, mode=mode, align_corners=False if mode == "bilinear" else None)
 def _to_b1hw_alpha(alpha, device: str) -> torch.Tensor:
     t = torch.as_tensor(alpha, device=device).float()
     if t.ndim == 2:
+        t = t.unsqueeze(0).unsqueeze(0) # -> [1,1,H,W]
     elif t.ndim == 3:
         if t.shape[0] in (1, 3, 4):
             if t.shape[0] != 1:
         if t.shape[1] != 1:
             t = t[:, :1]
     return t.clamp_(0.0, 1.0).contiguous()
 def _to_2d_alpha_numpy(x) -> np.ndarray:
     t = torch.as_tensor(x).float()
     while t.ndim > 2:
     t = t.clamp_(0.0, 1.0)
     out = t.detach().cpu().numpy().astype(np.float32)
     return np.ascontiguousarray(out)
 def _compute_scaled_size(h: int, w: int, max_edge: int, target_pixels: int) -> Tuple[int, int, float]:
     # Compute a working resolution bounded by a maximum edge length and a
     # maximum pixel count. Returns (nh, nw, s) where s is the scale factor.
     # Non-positive dimensions are returned unchanged with a scale of 1.0.
     if h <= 0 or w <= 0:
         return h, w, 1.0
     # s1 limits the longer side to max_edge; s2 limits the area to target_pixels.
     # Both are capped at 1.0 (never upscale); a non-positive limit disables that cap.
     s1 = min(1.0, float(max_edge) / float(max(h, w))) if max_edge > 0 else 1.0
     s2 = min(1.0, (float(target_pixels) / float(h * w)) ** 0.5) if target_pixels > 0 else 1.0
     # The stricter of the two limits wins.
     s = min(s1, s2)
     # NOTE(review): each axis is floored at 128 independently. When a scaled
     # side falls below 128 the result no longer matches the returned `s`, may
     # exceed max_edge/target_pixels (or the input size itself), and does not
     # keep the aspect ratio. Confirm callers tolerate this.
+    nh = max(128, int(round(h * s)))  # Force min 128 to avoid small-res bugs
+    nw = max(128, int(round(w * s)))
     return nh, nw, s
+def _pad_to_multiple(t: Optional[torch.Tensor], multiple: int = 16) -> Optional[torch.Tensor]:
+    if t is None:
+        return None
+    if t.ndim == 3:
+        c, h, w = t.shape
+    elif t.ndim == 2:
+        h, w = t.shape
+        t = t.unsqueeze(0)  # Temp to 3D for padding
+    else:
+        raise ValueError(f"Unsupported ndim for padding: {t.ndim}")
+    pad_h = (multiple - h % multiple) % multiple
+    pad_w = (multiple - w % multiple) % multiple
+    if pad_h or pad_w:
+        t = F.pad(t, (0, pad_w, 0, pad_h))
+    if t.ndim == 2:  # Shouldn't happen
+        t = t.squeeze(0)
+    return t
 def debug_shapes(tag: str, image, mask) -> None:
     def _info(name, v):
         try:
             logger.info(f"[{tag}:{name}] type={type(v)} err={e}")
     _info("image", image)
     _info("mask", mask)
 # ---------------------------------------------------------------------------
 # Precision selection
 # ---------------------------------------------------------------------------
 def _choose_precision(device: str) -> Tuple[torch.dtype, bool, Optional[torch.dtype]]:
     """Pick the model weight dtype and autocast settings (fp16 > fp32).

     Args:
         device: Device string. Only the exact value ``"cuda"`` enables reduced
             precision; anything else (including ``"cpu"``) gets fp32.

     Returns:
         ``(weight_dtype, use_autocast, autocast_dtype)``. ``autocast_dtype``
         is ``None`` whenever ``use_autocast`` is ``False``.
     """
     if device != "cuda":
         return torch.float32, False, None
     # Guard the capability query so a "cuda" request on a CUDA-less host
     # falls back to fp32 instead of raising.
     major = torch.cuda.get_device_capability()[0] if torch.cuda.is_available() else 0
     # fp16 is preferred for T4 compatibility and needs compute capability 7+ (Volta+).
     # bf16 is not offered as a separate tier: it needs capability 8+, and every
     # such GPU already satisfies the fp16 check, so a bf16 branch placed after
     # it could never run.
     if major >= 7:
         return torch.float16, True, torch.float16
     return torch.float32, False, None
 # ---------------------------------------------------------------------------
 # Stateful Adapter around InferenceCore
 # ---------------------------------------------------------------------------
 class _MatAnyoneSession:
     """
     Stateful controller around InferenceCore with OOM-resilient inference.
         use_autocast: bool,
         autocast_dtype: Optional[torch.dtype],
         max_edge: int = 768,
+        target_pixels: int = 600_000, # ~775x775 by area
     ):
         self.core = core
         self.device = device
         self.target_pixels = int(target_pixels)
         self.started = False
         self._lock = threading.Lock()
         # Introspect optional args
         try:
             sig = inspect.signature(self.core.step)
         except Exception:
             self._has_first_frame_pred = True
         self._has_prob_to_mask = hasattr(self.core, "output_prob_to_mask")
     def reset(self):
         with self._lock:
             try:
             except Exception:
                 pass
             self.started = False
     def _scaled_ladder(self, H: int, W: int) -> List[Tuple[int, int]]:
         nh, nw, s = _compute_scaled_size(H, W, self.max_edge, self.target_pixels)
         sizes = [(nh, nw)]
                 if sizes[-1] != (cur_h, cur_w):
                     sizes.append((cur_h, cur_w))
         return sizes
     def _to_alpha(self, out_prob):
         if self._has_prob_to_mask:
             try:
         if t.ndim == 3:
             return t[0] if t.shape[0] >= 1 else t.mean(0)
         return t
     def __call__(self, image, mask=None, **kwargs) -> np.ndarray:
         """
         Returns a 2-D float32 alpha [H,W].
         - frames 1..N: pass mask=None (propagation)
         """
         with self._lock:
+            img_bchw = _to_bchw(image, self.device, is_mask=False) # [1,C,H,W]
             H, W = img_bchw.shape[-2], img_bchw.shape[-1]
             img_bchw = img_bchw.to(self.model_dtype, non_blocking=True)
             # Normalize + align provided mask (if any) to **B1HW** at full res
             msk_b1hw = _to_bchw(mask, self.device, is_mask=True) if mask is not None else None
             if msk_b1hw is not None and msk_b1hw.shape[-2:] != (H, W):
                 msk_b1hw = _resize_bchw(msk_b1hw, (H, W), is_mask=True)
+            mask_1hw = _to_1hw_mask(msk_b1hw) if msk_b1hw is not None else None # ← 1HW!
             sizes = self._scaled_ladder(H, W)
             last_exc = None
             for (th, tw) in sizes:
                 try:
                     img_in = img_bchw if (th, tw) == (H, W) else F.interpolate(
                         else:
                             # nearest to keep binary-like edges
                             msk_in = F.interpolate(mask_1hw.unsqueeze(0), size=(th, tw), mode="nearest")[0]
+                    img_chw = _to_chw_image(img_in).contiguous() # [C,H,W]
+                    # Pad to multiple of 16
+                    img_chw = _pad_to_multiple(img_chw)
+                    if msk_in is not None:
+                        msk_in = _pad_to_multiple(msk_in)
+                    ph, pw = img_chw.shape[-2:]
                     with torch.inference_mode():
                         if self.use_autocast:
                             amp_ctx = torch.autocast(device_type="cuda", dtype=self.autocast_dtype)
                                 def __enter__(self): return None
                                 def __exit__(self, *a): return False
                             amp_ctx = _NoOp()
                         with amp_ctx:
                             if not self.started:
                                 if msk_in is None:
                                 self.started = True
                             else:
                                 out_prob = self.core.step(image=img_chw)
                     alpha = self._to_alpha(out_prob)
+                    # Unpad to scaled size, then upsample if needed
+                    alpha = alpha[:th, :tw]
                     # Upsample alpha back if we ran at a smaller scale
                     if (th, tw) != (H, W):
                         a_b1hw = _to_b1hw_alpha(alpha, device=img_bchw.device)
                         a_b1hw = F.interpolate(a_b1hw, size=(H, W), mode="bilinear", align_corners=False)
                         alpha = a_b1hw[0, 0]
                     return _to_2d_alpha_numpy(alpha)
                 except torch.cuda.OutOfMemoryError as e:
                     last_exc = e
                     torch.cuda.empty_cache()
                     logger.debug(traceback.format_exc())
                     logger.warning(f"MatAnyone call failed at {th}x{tw}; retrying smaller. {e}")
                     continue
             logger.warning(f"MatAnyone calls failed; returning input mask or neutral alpha. {last_exc}")
             if mask_1hw is not None:
                 return _to_2d_alpha_numpy(mask_1hw)
             return np.full((H, W), 0.5, dtype=np.float32)
 # ---------------------------------------------------------------------------
 # Loader
 # ---------------------------------------------------------------------------
 class MatAnyoneLoader:
     """
     Official MatAnyone loader with stateful, OOM-resilient session adapter.
         self.adapter = None
         self.model_id = "PeiqingYang/MatAnyone"
         self.load_time = 0.0
     # --- Robust imports (works with different packaging layouts) ---
     def _import_model_and_core(self):
         model_cls = core_cls = None
                 core_cls = getattr(m, cls)
                 break
             except Exception as e:
+                err_msgs.append(f"core {mod}.{cls}: {e}")
         if model_cls is None or core_cls is None:
             raise ImportError("Could not import MatAnyone / InferenceCore: " + " | ".join(err_msgs))
         return model_cls, core_cls
     def load(self) -> Optional[Any]:
         logger.info(f"Loading MatAnyone from HF: {self.model_id} (device={self.device})")
         t0 = time.time()
             model_cls, core_cls = self._import_model_and_core()
             model_dtype, use_autocast, autocast_dtype = _choose_precision(self.device)
             logger.info(f"MatAnyone precision: weights={model_dtype}, autocast={use_autocast and autocast_dtype}")
             # HF weights (safetensors)
             self.model = model_cls.from_pretrained(self.model_id)
             try:
             except Exception:
                 self.model = self.model.to(self.device)
             self.model.eval()
+            # Override cfg to disable features causing dim mismatches
+            default_cfg = {
+                'chunk_size': 1,
+                'flip_aug': False,
+            }
+            cfg = getattr(self.model, "cfg", default_cfg) or default_cfg
+            if isinstance(cfg, dict):
+                cfg.update(default_cfg)  # Override
+            else:
+                cfg = default_cfg
+            # Inference core
             try:
+                self.core = core_cls(self.model, cfg=cfg)
             except TypeError:
                 self.core = core_cls(self.model)
             # Some versions expose .to(), some don’t — best effort
             try:
                 if hasattr(self.core, "to"):
                     self.core.to(self.device)
             except Exception:
                 pass
             # Build stateful adapter
             max_edge = int(os.environ.get("MATANYONE_MAX_EDGE", "768"))
             target_pixels = int(os.environ.get("MATANYONE_TARGET_PIXELS", "600000"))
             self.load_time = time.time() - t0
             logger.info(f"MatAnyone loaded in {self.load_time:.2f}s")
             return self.adapter
         except Exception as e:
             logger.error(f"Failed to load MatAnyone: {e}")
             logger.debug(traceback.format_exc())
             return None
     def cleanup(self):
         self.adapter = None
         self.core = None
             self.model = None
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
     def get_info(self) -> Dict[str, Any]:
         return {
             "loaded": self.adapter is not None,
             "load_time": self.load_time,
             "model_type": type(self.model).__name__ if self.model else None,
         }
     def debug_shapes(self, image, mask, tag: str = ""):
         try:
             tv_img = torch.as_tensor(image)
                 logger.info(f"[{tag}:mask ] shape={tuple(tv_msk.shape)} dtype={tv_msk.dtype}")
         except Exception as e:
             logger.info(f"[{tag}] debug error: {e}")
 # ---------------------------------------------------------------------------
 # Public symbols
 # ---------------------------------------------------------------------------
 __all__ = [
     "MatAnyoneLoader",
     "_MatAnyoneSession",
     "_compute_scaled_size",
     "debug_shapes",
 ]
 # ---------------------------------------------------------------------------
 # Optional CLI for quick testing (no circular imports)
 # ---------------------------------------------------------------------------
 if __name__ == "__main__":
     # Demo CLI: compute an alpha matte for one image, optionally seeded with a
     # mask file, and write it to alpha_out.png in the working directory.
     import sys
+    import cv2 # only needed for this demo CLI
     logging.basicConfig(level=logging.INFO)
     device = "cuda" if torch.cuda.is_available() else "cpu"
     # Exit code 1: the required image argument is missing.
     if len(sys.argv) < 2:
         print(f"Usage: {sys.argv[0]} image.jpg [mask.png]")
         raise SystemExit(1)
     image_path = sys.argv[1]
+    mask_path = sys.argv[2] if len(sys.argv) > 2 else None
     img_bgr = cv2.imread(image_path, cv2.IMREAD_COLOR)
     # Exit code 2: cv2.imread returned None for the image path.
     if img_bgr is None:
         print(f"Could not load image {image_path}")
         raise SystemExit(2)
     # The image is read as BGR and converted to RGB before being passed to the session.
     img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
     mask = None
     if mask_path:
         mask = cv2.imread(mask_path, cv2.IMREAD_GRAYSCALE)
         # Masks with values above 1 are rescaled by 1/255; masks whose
         # maximum is already <= 1 are kept exactly as loaded.
         if mask is not None and mask.max() > 1:
             mask = (mask.astype(np.float32) / 255.0)
     loader = MatAnyoneLoader(device=device)
     session = loader.load()
     # Exit code 3: the loader returned a falsy session.
     if not session:
         print("Failed to load MatAnyone")
         raise SystemExit(3)
     # With no usable mask (none given, or the mask file failed to load) an all-ones mask is used.
     alpha = session(img_rgb, mask if mask is not None else np.ones(img_rgb.shape[:2], np.float32))
     # The alpha is clipped to [0, 1] and written as 8-bit; imwrite's return value is not checked.
     cv2.imwrite("alpha_out.png", (np.clip(alpha, 0, 1) * 255).astype(np.uint8))
+    print("Alpha matte written to alpha_out.png")