Spaces:

MogensR
/

VideoBackgroundReplacer

Paused

App Files Files Community

MogensR committed on Aug 30, 2025

Commit

8b8d050

1 Parent(s): 5711ea9

Update models/loaders/matanyone_loader.py

Browse files

Files changed (1) hide show

models/loaders/matanyone_loader.py +243 -197

models/loaders/matanyone_loader.py CHANGED Viewed

@@ -1,35 +1,54 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
-MatAnyone Loader + Stateful Adapter (OOM-resilient, spatially robust)
-- Canonical HF load (MatAnyone.from_pretrained → InferenceCore(model, cfg))
-- Mixed precision (fp16 preferred over bf16) with safe fallback to fp32
-- torch.autocast(device_type="cuda", dtype=...) + torch.inference_mode()
-- Progressive downscale ladder with graceful fallback
-- Strict image↔mask alignment on every path/scale
-- Returns 2-D float32 [H,W] alpha (OpenCV-friendly)
-- Added: Force chunk_size=1, flip_aug=False in cfg to avoid dim mismatches
-- Added: Pad to multiple of 16 to avoid transformer patch issues
-- Added: Prefer fp16 over bf16 for Tesla T4 compatibility
-- New: EasyDict polyfill and conversion for cfg to fix 'dict no attribute' errors
-- New: Full default cfg from official config.json to fix 'mem_every' issues
-- FIXED: Re-enabled memory features and added temporal dimension support
 """
 from __future__ import annotations
 import os
 import time
 import logging
 import traceback
 from typing import Optional, Dict, Any, Tuple, List
 import numpy as np
 import torch
 import torch.nn.functional as F
 import inspect
 import threading
 logger = logging.getLogger(__name__)
-# EasyDict polyfill (recursive dict with dot access)
 class EasyDict(dict):
     def __init__(self, d=None, **kwargs):
         if d is None:
             d = {}
@@ -43,21 +62,22 @@ def __init__(self, d=None, **kwargs):
             else:
                 self[k] = v
-    def __getattr__(self, name):
         try:
             return self[name]
         except KeyError:
             raise AttributeError(name)
-    def __setattr__(self, name, value):
         self[name] = value
-    def __delattr__(self, name):
         del self[name]
-# ---------------------------------------------------------------------------
-# Utilities (shapes, dtype, scaling)
-# ---------------------------------------------------------------------------
 def _select_device(pref: str) -> str:
     pref = (pref or "").lower()
     if pref.startswith("cuda"):
@@ -66,61 +86,89 @@ def _select_device(pref: str) -> str:
         return "cpu"
     return "cuda" if torch.cuda.is_available() else "cpu"
 def _as_tensor_on_device(x, device: str) -> torch.Tensor:
     if isinstance(x, torch.Tensor):
         return x.to(device, non_blocking=True)
     return torch.from_numpy(np.asarray(x)).to(device, non_blocking=True)
 def _to_bchw(x, device: str, is_mask: bool = False) -> torch.Tensor:
     """
     Normalize input to BCHW (image) or B1HW (mask).
-    Accepts: HWC, CHW, BCHW, BHWC, BTCHW/BTHWC, TCHW/THWC, HW.
     """
     x = _as_tensor_on_device(x, device)
     if x.dtype == torch.uint8:
         x = x.float().div_(255.0)
     elif x.dtype in (torch.int16, torch.int32, torch.int64):
         x = x.float()
     if x.ndim == 5:
-        x = x[:, 0] # -> 4D
     if x.ndim == 4:
         if x.shape[-1] in (1, 3, 4) and x.shape[1] not in (1, 3, 4):
             x = x.permute(0, 3, 1, 2).contiguous()
     elif x.ndim == 3:
         if x.shape[-1] in (1, 3, 4):
             x = x.permute(2, 0, 1).contiguous()
         x = x.unsqueeze(0)
     elif x.ndim == 2:
         x = x.unsqueeze(0).unsqueeze(0)
         if not is_mask:
             x = x.repeat(1, 3, 1, 1)
     else:
-        raise ValueError(f"Unsupported ndim={x.ndim}")
     if is_mask:
         if x.shape[1] > 1:
             x = x[:, :1]
         x = x.clamp_(0.0, 1.0).to(torch.float32)
     else:
-        if x.shape[1] == 1:
             x = x.repeat(1, 3, 1, 1)
         x = x.clamp_(0.0, 1.0)
-    return x
 def _to_chw_image(img_bchw: torch.Tensor) -> torch.Tensor:
     if img_bchw.ndim == 4 and img_bchw.shape[0] == 1:
         return img_bchw[0]
-    return img_bchw
-def _to_1hw_mask(msk_b1hw: torch.Tensor) -> Optional[torch.Tensor]:
     if msk_b1hw is None:
-        return None
     if msk_b1hw.ndim == 4 and msk_b1hw.shape[1] == 1:
-        return msk_b1hw[0] # -> [1,H,W]
     if msk_b1hw.ndim == 3 and msk_b1hw.shape[0] == 1:
         return msk_b1hw
-    raise ValueError(f"Expected B1HW or 1HW, got {tuple(msk_b1hw.shape)}")
 def _resize_bchw(x: Optional[torch.Tensor], size_hw: Tuple[int, int], is_mask: bool = False) -> Optional[torch.Tensor]:
     if x is None:
         return None
     if x.shape[-2:] == size_hw:
@@ -128,35 +176,40 @@ def _resize_bchw(x: Optional[torch.Tensor], size_hw: Tuple[int, int], is_mask: b
     mode = "nearest" if is_mask else "bilinear"
     return F.interpolate(x, size_hw, mode=mode, align_corners=False if mode == "bilinear" else None)
 def _to_b1hw_alpha(alpha, device: str) -> torch.Tensor:
     t = torch.as_tensor(alpha, device=device).float()
-    if t.ndim == 2:
-        t = t.unsqueeze(0).unsqueeze(0) # -> [1,1,H,W]
-    elif t.ndim == 3:
-        if t.shape[0] in (1, 3, 4):
-            if t.shape[0] != 1:
-                t = t[:1]
-            t = t.unsqueeze(0)
-        elif t.shape[-1] in (1, 3, 4):
-            t = t[..., :1].permute(2, 0, 1).unsqueeze(0)
-        else:
-            t = t[:1].unsqueeze(0)
-    elif t.ndim == 4:
-        if t.shape[1] != 1:
-            t = t[:, :1]
         if t.shape[0] != 1:
             t = t[:1]
-    else:
-        while t.ndim > 4:
-            t = t.squeeze(0)
-        while t.ndim < 4:
-            t = t.unsqueeze(0)
         if t.shape[1] != 1:
             t = t[:, :1]
-    return t.clamp_(0.0, 1.0).contiguous()
 def _to_2d_alpha_numpy(x) -> np.ndarray:
     t = torch.as_tensor(x).float()
     while t.ndim > 2:
         if t.ndim == 4 and t.shape[0] == 1 and t.shape[1] == 1:
             t = t[0, 0]
@@ -168,37 +221,36 @@ def _to_2d_alpha_numpy(x) -> np.ndarray:
     out = t.detach().cpu().numpy().astype(np.float32)
     return np.ascontiguousarray(out)
 def _compute_scaled_size(h: int, w: int, max_edge: int, target_pixels: int) -> Tuple[int, int, float]:
     if h <= 0 or w <= 0:
         return h, w, 1.0
     s1 = min(1.0, float(max_edge) / float(max(h, w))) if max_edge > 0 else 1.0
     s2 = min(1.0, (float(target_pixels) / float(h * w)) ** 0.5) if target_pixels > 0 else 1.0
     s = min(s1, s2)
-    nh = max(128, int(round(h * s)))  # Force min 128 to avoid small-res bugs
     nw = max(128, int(round(w * s)))
     return nh, nw, s
-def _pad_to_multiple(t: Optional[torch.Tensor], multiple: int = 16) -> Optional[torch.Tensor]:
-    if t is None:
-        return None
-    if t.ndim == 3:
-        c, h, w = t.shape
-    elif t.ndim == 2:
-        h, w = t.shape
-        t = t.unsqueeze(0)  # Temp to 3D for padding
-    elif t.ndim == 4:  # Handle [T, C, H, W] or similar
-        return t  # Skip padding for temporal tensors
-    else:
-        raise ValueError(f"Unsupported ndim for padding: {t.ndim}")
     pad_h = (multiple - h % multiple) % multiple
     pad_w = (multiple - w % multiple) % multiple
     if pad_h or pad_w:
-        t = F.pad(t, (0, pad_w, 0, pad_h))
-    if t.ndim == 2:  # Shouldn't happen
-        t = t.squeeze(0)
     return t
 def debug_shapes(tag: str, image, mask) -> None:
     def _info(name, v):
         try:
             tv = torch.as_tensor(v)
@@ -210,29 +262,35 @@ def _info(name, v):
     _info("image", image)
     _info("mask", mask)
-# ---------------------------------------------------------------------------
-# Precision selection
-# ---------------------------------------------------------------------------
 def _choose_precision(device: str) -> Tuple[torch.dtype, bool, Optional[torch.dtype]]:
-    """Pick model weight dtype + autocast dtype (fp16>bf16>fp32) for T4 compatibility."""
     if device != "cuda":
         return torch.float32, False, None
     cc = torch.cuda.get_device_capability() if torch.cuda.is_available() else (0, 0)
     fp16_ok = cc[0] >= 7  # Volta+
-    bf16_ok = cc[0] >= 8 and hasattr(torch.cuda, "is_bf16_supported") and torch.cuda.is_bf16_supported()  # Ampere+ strict
     if fp16_ok:
-        return torch.float16, True, torch.float16  # Prefer fp16 for T4
     if bf16_ok:
         return torch.bfloat16, True, torch.bfloat16
     return torch.float32, False, None
-# ---------------------------------------------------------------------------
-# Stateful Adapter around InferenceCore
-# ---------------------------------------------------------------------------
 class _MatAnyoneSession:
     """
     Stateful controller around InferenceCore with OOM-resilient inference.
     First call MUST supply a coarse mask (we enforce 1HW internally).
     """
     def __init__(
         self,
@@ -242,7 +300,7 @@ def __init__(
         use_autocast: bool,
         autocast_dtype: Optional[torch.dtype],
         max_edge: int = 768,
-        target_pixels: int = 600_000, # ~775x775 by area
     ):
         self.core = core
         self.device = device
@@ -253,7 +311,8 @@ def __init__(
         self.target_pixels = int(target_pixels)
         self.started = False
         self._lock = threading.Lock()
-        # Introspect optional args
         try:
             sig = inspect.signature(self.core.step)
             self._has_first_frame_pred = "first_frame_pred" in sig.parameters
@@ -271,6 +330,9 @@ def reset(self):
             self.started = False
     def _scaled_ladder(self, H: int, W: int) -> List[Tuple[int, int]]:
         nh, nw, s = _compute_scaled_size(H, W, self.max_edge, self.target_pixels)
         sizes = [(nh, nw)]
         if s < 1.0:
@@ -284,15 +346,16 @@ def _scaled_ladder(self, H: int, W: int) -> List[Tuple[int, int]]:
         return sizes
     def _to_alpha(self, out_prob):
         """Turn the core's raw output into an alpha tensor.

         When ``self._has_prob_to_mask`` is set, the result of
         ``self.core.output_prob_to_mask(out_prob, matting=True)`` is returned.
         If the flag is unset or that call raises, ``out_prob`` is converted to
         a float tensor and sliced by rank instead.
         """
         if self._has_prob_to_mask:
             try:
                 return self.core.output_prob_to_mask(out_prob, matting=True)
             except Exception:
                 # Best effort: any failure falls through to the manual slicing below.
                 pass
         t = torch.as_tensor(out_prob).float()
-        if t.ndim == 4:
             # First batch item, first channel; the mean is only taken when dim 1 is empty.
             return t[0, 0] if t.shape[1] >= 1 else t[0].mean(0)
-        if t.ndim == 3:
             # First plane; the mean is only taken when dim 0 is empty.
             return t[0] if t.shape[0] >= 1 else t.mean(0)
         # Any other rank is returned as-is.
         return t
@@ -303,76 +366,76 @@ def __call__(self, image, mask=None, **kwargs) -> np.ndarray:
         - frames 1..N: pass mask=None (propagation)
         """
         with self._lock:
-            img_bchw = _to_bchw(image, self.device, is_mask=False) # [1,C,H,W]
             H, W = img_bchw.shape[-2], img_bchw.shape[-1]
             img_bchw = img_bchw.to(self.model_dtype, non_blocking=True)
-            # Normalize + align provided mask (if any) to **B1HW** at full res
             msk_b1hw = _to_bchw(mask, self.device, is_mask=True) if mask is not None else None
             if msk_b1hw is not None and msk_b1hw.shape[-2:] != (H, W):
                 msk_b1hw = _resize_bchw(msk_b1hw, (H, W), is_mask=True)
-            mask_1hw = _to_1hw_mask(msk_b1hw) if msk_b1hw is not None else None # ← 1HW!
             sizes = self._scaled_ladder(H, W)
             last_exc = None
             for (th, tw) in sizes:
                 try:
-                    img_in = img_bchw if (th, tw) == (H, W) else F.interpolate(
-                        img_bchw, size=(th, tw), mode="bilinear", align_corners=False
-                    )
-                    msk_in = None
-                    if mask_1hw is not None:
-                        if (th, tw) == (H, W):
-                            msk_in = mask_1hw
-                        else:
-                            # nearest to keep binary-like edges
-                            msk_in = F.interpolate(mask_1hw.unsqueeze(0), size=(th, tw), mode="nearest")[0]
-                    img_chw = _to_chw_image(img_in).contiguous() # [C,H,W]
-                    # ADD TEMPORAL DIMENSION for video processing mode
-                    img_tchw = img_chw.unsqueeze(0)  # [C,H,W] -> [T=1,C,H,W]
-                    if msk_in is not None:
-                        msk_t1hw = msk_in.unsqueeze(0)  # [1,H,W] -> [T=1,1,H,W]
                     else:
-                        msk_t1hw = None
-                    # Pad to multiple of 16 (skip for temporal tensors)
-                    img_tchw = _pad_to_multiple(img_tchw)
-                    if msk_t1hw is not None:
-                        msk_t1hw = _pad_to_multiple(msk_t1hw)
-                    ph, pw = img_tchw.shape[-2:]
                     with torch.inference_mode():
-                        if self.use_autocast:
-                            amp_ctx = torch.autocast(device_type="cuda", dtype=self.autocast_dtype)
-                        else:
-                            class _NoOp:
-                                def __enter__(self): return None
-                                def __exit__(self, *a): return False
-                            amp_ctx = _NoOp()
                         with amp_ctx:
                             if not self.started:
-                                if msk_t1hw is None:
-                                    # Should not happen when used correctly — still be defensive
                                     logger.warning("First frame arrived without a mask; returning neutral alpha.")
                                     return np.full((H, W), 0.5, dtype=np.float32)
-                                # Pass temporal tensors to core
-                                _ = self.core.step(image=img_tchw, mask=msk_t1hw)
                                 if self._has_first_frame_pred:
-                                    out_prob = self.core.step(image=img_tchw, first_frame_pred=True)
                                 else:
-                                    out_prob = self.core.step(image=img_tchw)
                                 self.started = True
                             else:
-                                out_prob = self.core.step(image=img_tchw)
                     alpha = self._to_alpha(out_prob)
-                    # Unpad to scaled size, then upsample if needed
                     if alpha.ndim >= 2:
-                        alpha = alpha[..., :th, :tw]
-                    # Upsample alpha back if we ran at a smaller scale
                     if (th, tw) != (H, W):
                         a_b1hw = _to_b1hw_alpha(alpha, device=img_bchw.device)
                         a_b1hw = F.interpolate(a_b1hw, size=(H, W), mode="bilinear", align_corners=False)
                         alpha = a_b1hw[0, 0]
                     return _to_2d_alpha_numpy(alpha)
                 except torch.cuda.OutOfMemoryError as e:
                     last_exc = e
                     torch.cuda.empty_cache()
@@ -384,14 +447,17 @@ def __exit__(self, *a): return False
                     logger.debug(traceback.format_exc())
                     logger.warning(f"MatAnyone call failed at {th}x{tw}; retrying smaller. {e}")
                     continue
             logger.warning(f"MatAnyone calls failed; returning input mask or neutral alpha. {last_exc}")
             if mask_1hw is not None:
                 return _to_2d_alpha_numpy(mask_1hw)
             return np.full((H, W), 0.5, dtype=np.float32)
-# ---------------------------------------------------------------------------
-# Loader
-# ---------------------------------------------------------------------------
 class MatAnyoneLoader:
     """
     Official MatAnyone loader with stateful, OOM-resilient session adapter.
@@ -441,17 +507,21 @@ def load(self) -> Optional[Any]:
             model_cls, core_cls = self._import_model_and_core()
             model_dtype, use_autocast, autocast_dtype = _choose_precision(self.device)
             logger.info(f"MatAnyone precision: weights={model_dtype}, autocast={use_autocast and autocast_dtype}")
             # HF weights (safetensors)
             self.model = model_cls.from_pretrained(self.model_id)
             try:
                 self.model = self.model.to(self.device).to(model_dtype)
             except Exception:
                 self.model = self.model.to(self.device)
             self.model.eval()
-            # Full default cfg from official config.json
             default_cfg = {
                 "amp": False,
-                "chunk_size": 1,  # Keep at 1 for single frame processing
                 "flip_aug": False,
                 "long_term": {
                     "buffer_tokens": 2000,
@@ -465,63 +535,25 @@ def load(self) -> Optional[Any]:
                 "max_mem_frames": 5,
                 "mem_every": 5,
                 "model": {
-                    "aux_loss": {
-                        "query": {
-                            "enabled": True,
-                            "weight": 0.01
-                        },
-                        "sensory": {
-                            "enabled": True,
-                            "weight": 0.01
-                        }
-                    },
                     "embed_dim": 256,
                     "key_dim": 64,
-                    "mask_decoder": {
-                        "up_dims": [256, 128, 128, 64, 16]
-                    },
-                    "mask_encoder": {
-                        "final_dim": 256,
-                        "type": "resnet18"
-                    },
-                    "object_summarizer": {
-                        "add_pe": True,
-                        "embed_dim": 256,
-                        "num_summaries": 16
-                    },
                     "object_transformer": {
-                        "embed_dim": 256,
-                        "ff_dim": 2048,
-                        "num_blocks": 3,
-                        "num_heads": 8,
                         "num_queries": 16,
-                        "pixel_self_attention": {
-                            "add_pe_to_qkv": [True, True, False]
-                        },
-                        "query_self_attention": {
-                            "add_pe_to_qkv": [True, True, False]
-                        },
-                        "read_from_memory": {
-                            "add_pe_to_qkv": [True, True, False]
-                        },
-                        "read_from_past": {
-                            "add_pe_to_qkv": [True, True, False]
-                        },
-                        "read_from_pixel": {
-                            "add_pe_to_qkv": [True, True, False],
-                            "input_add_pe": False,
-                            "input_norm": False
-                        },
-                        "read_from_query": {
-                            "add_pe_to_qkv": [True, True, False],
-                            "output_norm": False
-                        }
                     },
                     "pixel_dim": 256,
-                    "pixel_encoder": {
-                        "ms_dims": [1024, 512, 256, 64, 3],
-                        "type": "resnet50"
-                    },
                     "pixel_mean": [0.485, 0.456, 0.406],
                     "pixel_pe_scale": 32,
                     "pixel_pe_temperature": 128,
@@ -537,34 +569,35 @@ def load(self) -> Optional[Any]:
                 "stagger_updates": 5,
                 "top_k": 30,
                 "use_all_masks": False,
-                "use_long_term": True,  # Enable long-term memory
                 "visualize": False,
                 "weights": "pretrained_models/matanyone.pth"
             }
-            # Get cfg from model if available, else default
             cfg = getattr(self.model, "cfg", default_cfg) or default_cfg
             if isinstance(cfg, dict):
-                cfg = dict(cfg)  # Copy to avoid modifying model.cfg
-            # Only override minimal settings for compatibility
             overrides = {
-                'chunk_size': 1,  # Process one frame at a time
-                'flip_aug': False,  # Disable augmentation
-                # Keep memory features enabled!
             }
             cfg.update(overrides)
-            # Convert to EasyDict for dot access
             cfg = EasyDict(cfg)
-            # Inference core
             try:
                 self.core = core_cls(self.model, cfg=cfg)
             except TypeError:
                 self.core = core_cls(self.model)
-            # Some versions expose .to(), some don't — best effort
             try:
                 if hasattr(self.core, "to"):
                     self.core.to(self.device)
             except Exception:
                 pass
             # Build stateful adapter
             max_edge = int(os.environ.get("MATANYONE_MAX_EDGE", "768"))
             target_pixels = int(os.environ.get("MATANYONE_TARGET_PIXELS", "600000"))
@@ -580,12 +613,14 @@ def load(self) -> Optional[Any]:
             self.load_time = time.time() - t0
             logger.info(f"MatAnyone loaded in {self.load_time:.2f}s")
             return self.adapter
         except Exception as e:
             logger.error(f"Failed to load MatAnyone: {e}")
             logger.debug(traceback.format_exc())
             return None
     def cleanup(self):
         self.adapter = None
         self.core = None
         if self.model:
@@ -598,6 +633,7 @@ def cleanup(self):
             torch.cuda.empty_cache()
     def get_info(self) -> Dict[str, Any]:
         return {
             "loaded": self.adapter is not None,
             "model_id": self.model_id,
@@ -607,6 +643,7 @@ def get_info(self) -> Dict[str, Any]:
         }
     def debug_shapes(self, image, mask, tag: str = ""):
         try:
             tv_img = torch.as_tensor(image)
             tv_msk = torch.as_tensor(mask) if mask is not None else None
@@ -616,9 +653,10 @@ def debug_shapes(self, image, mask, tag: str = ""):
         except Exception as e:
             logger.info(f"[{tag}] debug error: {e}")
-# ---------------------------------------------------------------------------
-# Public symbols
-# ---------------------------------------------------------------------------
 __all__ = [
     "MatAnyoneLoader",
     "_MatAnyoneSession",
@@ -632,34 +670,42 @@ def debug_shapes(self, image, mask, tag: str = ""):
     "debug_shapes",
 ]
-# ---------------------------------------------------------------------------
-# Optional CLI for quick testing (no circular imports)
-# ---------------------------------------------------------------------------
 # Demo CLI: load one image (and optionally a mask), run a single session
 # call and write the resulting alpha matte to alpha_out.png.
 if __name__ == "__main__":
     import sys
-    import cv2 # only needed for this demo CLI
     logging.basicConfig(level=logging.INFO)
     device = "cuda" if torch.cuda.is_available() else "cpu"
     # Exit code 1: no image path given.
     if len(sys.argv) < 2:
         print(f"Usage: {sys.argv[0]} image.jpg [mask.png]")
         raise SystemExit(1)
     image_path = sys.argv[1]
     mask_path = sys.argv[2] if len(sys.argv) > 2 else None
     img_bgr = cv2.imread(image_path, cv2.IMREAD_COLOR)
     # Exit code 2: the image could not be read.
     if img_bgr is None:
         print(f"Could not load image {image_path}")
         raise SystemExit(2)
     img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
     mask = None
     if mask_path:
         mask = cv2.imread(mask_path, cv2.IMREAD_GRAYSCALE)
         # Only masks with values above 1 are rescaled to [0, 1] here; a mask
         # that is already 0/1 is passed on with its original dtype.
         if mask is not None and mask.max() > 1:
             mask = (mask.astype(np.float32) / 255.0)
     loader = MatAnyoneLoader(device=device)
     session = loader.load()
     # Exit code 3: the loader returned nothing usable.
     if not session:
         print("Failed to load MatAnyone")
         raise SystemExit(3)
     # Without a usable mask, an all-ones mask of the image size is supplied.
     alpha = session(img_rgb, mask if mask is not None else np.ones(img_rgb.shape[:2], np.float32))
     cv2.imwrite("alpha_out.png", (np.clip(alpha, 0, 1) * 255).astype(np.uint8))
-    print("Alpha matte written to alpha_out.png")

 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
+MatAnyone Loader + Stateful Adapter (Fixed Tensor Shapes, OOM-resilient)
+=======================================================================
+CHAPTERS
+1) Overview & Rationale
+2) Imports & Logger
+3) EasyDict Polyfill
+4) Tensor Utilities (device, shape, resize, padding)
+5) Precision Selection (fp16/bf16/fp32)
+6) Stateful Session (_MatAnyoneSession)  ← FIX: CHW / 1HW only (no temporal axis)
+7) Loader (MatAnyoneLoader)
+8) Public Symbols
+9) CLI Demo (optional quick test)
+Key Fix vs. previous version
+----------------------------
+- Removed the extra “temporal” axis that produced 5D tensors like [1,1,3,H,W].
+- MatAnyone now receives:
+  • Image: CHW (float, in [0,1]) — or internally BCHW collapsed to CHW.
+  • Mask : 1HW (float, in [0,1]) on the first frame only; later frames mask=None.
+- Kept: downscale ladder, padding to multiple of 16, mixed precision, long-term memory config.
 """
+# ============================================================================
+# 2) IMPORTS & LOGGER
+# ============================================================================
 from __future__ import annotations
 import os
 import time
 import logging
 import traceback
 from typing import Optional, Dict, Any, Tuple, List
 import numpy as np
 import torch
 import torch.nn.functional as F
 import inspect
 import threading
+import contextlib
 logger = logging.getLogger(__name__)
+# ============================================================================
+# 3) EASYDICT POLYFILL
+# ============================================================================
 class EasyDict(dict):
+    """Recursive dict with dot access."""
     def __init__(self, d=None, **kwargs):
         if d is None:
             d = {}
             else:
                 self[k] = v
+    def __getattr__(self, name):  # dot-get
         try:
             return self[name]
         except KeyError:
             raise AttributeError(name)
+    def __setattr__(self, name, value):  # dot-set
         self[name] = value
+    def __delattr__(self, name):  # dot-del
         del self[name]
+# ============================================================================
+# 4) TENSOR UTILITIES (DEVICE, SHAPE, RESIZE, PADDING)
+# ============================================================================
 def _select_device(pref: str) -> str:
     pref = (pref or "").lower()
     if pref.startswith("cuda"):
         return "cpu"
     return "cuda" if torch.cuda.is_available() else "cpu"
 def _as_tensor_on_device(x, device: str) -> torch.Tensor:
     if isinstance(x, torch.Tensor):
         return x.to(device, non_blocking=True)
     return torch.from_numpy(np.asarray(x)).to(device, non_blocking=True)
 def _to_bchw(x, device: str, is_mask: bool = False) -> torch.Tensor:
     """
     Normalize input to BCHW (image) or B1HW (mask).
+    Accepts: HWC, CHW, BCHW, BHWC, (accidental) 5D, and HW.
+    Defensive against dtype/range; output is clamped to [0,1].
     """
     x = _as_tensor_on_device(x, device)
     if x.dtype == torch.uint8:
         x = x.float().div_(255.0)
     elif x.dtype in (torch.int16, torch.int32, torch.int64):
         x = x.float()
+    # If upstream passed a 5D tensor (e.g., (B,1,C,H,W) or (B,T,C,H,W)), squeeze a singleton middle axis.
     if x.ndim == 5:
+        # Prefer to squeeze the 2nd dim if it's 1; otherwise take the first slice.
+        if x.shape[1] == 1:
+            x = x.squeeze(1)  # -> BCHW
+        else:
+            x = x[:, 0, ...]  # -> BCHW
     if x.ndim == 4:
+        # Handle BHWC → BCHW
         if x.shape[-1] in (1, 3, 4) and x.shape[1] not in (1, 3, 4):
             x = x.permute(0, 3, 1, 2).contiguous()
     elif x.ndim == 3:
+        # HWC → CHW
         if x.shape[-1] in (1, 3, 4):
             x = x.permute(2, 0, 1).contiguous()
+        # CHW → BCHW
         x = x.unsqueeze(0)
     elif x.ndim == 2:
+        # HW → B1HW (mask) or B3HW (image)
         x = x.unsqueeze(0).unsqueeze(0)
         if not is_mask:
             x = x.repeat(1, 3, 1, 1)
     else:
+        raise ValueError(f"_to_bchw: unsupported ndim={x.ndim}")
     if is_mask:
+        # Ensure single-channel B1HW, clamped and float32
         if x.shape[1] > 1:
             x = x[:, :1]
         x = x.clamp_(0.0, 1.0).to(torch.float32)
     else:
+        # Ensure RGB
+        if x.shape[1] == 4:
+            x = x[:, :3, ...]
+        elif x.shape[1] == 1:
             x = x.repeat(1, 3, 1, 1)
         x = x.clamp_(0.0, 1.0)
+    return x.contiguous()
 def _to_chw_image(img_bchw: torch.Tensor) -> torch.Tensor:
+    """BCHW → CHW (take batch 0 if present)."""
     if img_bchw.ndim == 4 and img_bchw.shape[0] == 1:
         return img_bchw[0]
+    if img_bchw.ndim == 3:
+        return img_bchw
+    raise ValueError(f"_to_chw_image: expected BCHW or CHW, got {tuple(img_bchw.shape)}")
+def _to_1hw_mask(msk_b1hw: torch.Tensor) -> torch.Tensor:
+    """B1HW → 1HW (drop batch)."""
     if msk_b1hw is None:
+        raise ValueError("_to_1hw_mask: mask is None")
     if msk_b1hw.ndim == 4 and msk_b1hw.shape[1] == 1:
+        return msk_b1hw[0]  # 1HW
     if msk_b1hw.ndim == 3 and msk_b1hw.shape[0] == 1:
         return msk_b1hw
+    raise ValueError(f"_to_1hw_mask: expected B1HW or 1HW, got {tuple(msk_b1hw.shape)}")
 def _resize_bchw(x: Optional[torch.Tensor], size_hw: Tuple[int, int], is_mask: bool = False) -> Optional[torch.Tensor]:
+    """Resize BCHW or B1HW to (H, W) using bilinear (image) or nearest (mask)."""
     if x is None:
         return None
     if x.shape[-2:] == size_hw:
     mode = "nearest" if is_mask else "bilinear"
     return F.interpolate(x, size_hw, mode=mode, align_corners=False if mode == "bilinear" else None)
 def _to_b1hw_alpha(alpha, device: str) -> torch.Tensor:
+    """Convert arbitrary mask-like input to B1HW float32 [0,1]."""
     t = torch.as_tensor(alpha, device=device).float()
+    # Squeeze extra dims down to HW/1HW first
+    while t.ndim > 4:
+        t = t.squeeze(0)
+    if t.ndim == 4:
+        # Expecting BxCxHxW; force B=1, C=1
         if t.shape[0] != 1:
             t = t[:1]
         if t.shape[1] != 1:
             t = t[:, :1]
+    elif t.ndim == 3:
+        # Could be CxHxW or HxWx1
+        if t.shape[0] == 1:
+            t = t.unsqueeze(0)  # 1x1xHxW
+        elif t.shape[-1] == 1:
+            t = t.permute(2, 0, 1).unsqueeze(0)  # 1x1xHxW
+        else:
+            # If C>1, take first channel
+            t = t[:1, ...].unsqueeze(0)
+    elif t.ndim == 2:
+        t = t.unsqueeze(0).unsqueeze(0)
+    else:
+        raise ValueError(f"_to_b1hw_alpha: unsupported ndim={t.ndim}")
+    t = t.clamp_(0.0, 1.0).contiguous()
+    return t
 def _to_2d_alpha_numpy(x) -> np.ndarray:
+    """Convert any mask-like tensor to 2D float32 numpy [H,W] in [0,1]."""
     t = torch.as_tensor(x).float()
+    # Squeeze down to 2D
     while t.ndim > 2:
         if t.ndim == 4 and t.shape[0] == 1 and t.shape[1] == 1:
             t = t[0, 0]
     out = t.detach().cpu().numpy().astype(np.float32)
     return np.ascontiguousarray(out)
 def _compute_scaled_size(h: int, w: int, max_edge: int, target_pixels: int) -> Tuple[int, int, float]:
+    """Compute a safe scaled size that respects a max edge and total pixels."""
     if h <= 0 or w <= 0:
         return h, w, 1.0
     s1 = min(1.0, float(max_edge) / float(max(h, w))) if max_edge > 0 else 1.0
     s2 = min(1.0, (float(target_pixels) / float(h * w)) ** 0.5) if target_pixels > 0 else 1.0
     s = min(s1, s2)
+    nh = max(128, int(round(h * s)))  # minimum of 128 to avoid very small feature maps
     nw = max(128, int(round(w * s)))
     return nh, nw, s
+def _pad_to_multiple_3d(t: torch.Tensor, multiple: int = 16) -> torch.Tensor:
+    """
+    Pad a 3D tensor (C,H,W) to multiples of `multiple`. Works for CHW and 1HW.
+    Returns a tensor with same ndim.
+    """
+    if t.ndim != 3:
+        raise ValueError(f"_pad_to_multiple_3d: expected 3D, got {t.ndim}D")
+    c, h, w = t.shape
     pad_h = (multiple - h % multiple) % multiple
     pad_w = (multiple - w % multiple) % multiple
     if pad_h or pad_w:
+        t = F.pad(t, (0, pad_w, 0, pad_h))  # (left,right,top,bottom)
     return t
 def debug_shapes(tag: str, image, mask) -> None:
+    """Log shapes/dtypes/min/max for quick inspection."""
     def _info(name, v):
         try:
             tv = torch.as_tensor(v)
     _info("image", image)
     _info("mask", mask)
+# ============================================================================
+# 5) PRECISION SELECTION (fp16/bf16/fp32)
+# ============================================================================
 def _choose_precision(device: str) -> Tuple[torch.dtype, bool, Optional[torch.dtype]]:
+    """
+    Pick model weights dtype and autocast dtype (fp16>bf16>fp32), preferring fp16 for T4.
+    Returns: (model_dtype, use_autocast, autocast_dtype)
+    """
     if device != "cuda":
         return torch.float32, False, None
     cc = torch.cuda.get_device_capability() if torch.cuda.is_available() else (0, 0)
     fp16_ok = cc[0] >= 7  # Volta+
+    bf16_ok = (cc[0] >= 8) and hasattr(torch.cuda, "is_bf16_supported") and torch.cuda.is_bf16_supported()
     if fp16_ok:
+        return torch.float16, True, torch.float16  # T4 prefers fp16
     if bf16_ok:
         return torch.bfloat16, True, torch.bfloat16
     return torch.float32, False, None
+# ============================================================================
+# 6) STATEFUL SESSION (NO TEMPORAL AXIS; STRICT CHW/1HW)
+# ============================================================================
 class _MatAnyoneSession:
     """
     Stateful controller around InferenceCore with OOM-resilient inference.
     First call MUST supply a coarse mask (we enforce 1HW internally).
+    Subsequent calls should pass mask=None (temporal propagation handled by core).
     """
     def __init__(
         self,
         use_autocast: bool,
         autocast_dtype: Optional[torch.dtype],
         max_edge: int = 768,
+        target_pixels: int = 600_000,  # ~775x775 by area
     ):
         self.core = core
         self.device = device
         self.target_pixels = int(target_pixels)
         self.started = False
         self._lock = threading.Lock()
+        # Introspect optional API surfaces
         try:
             sig = inspect.signature(self.core.step)
             self._has_first_frame_pred = "first_frame_pred" in sig.parameters
             self.started = False
     def _scaled_ladder(self, H: int, W: int) -> List[Tuple[int, int]]:
+        """
+        Build a list of decreasing (H,W) resolutions to attempt to avoid OOM.
+        """
         nh, nw, s = _compute_scaled_size(H, W, self.max_edge, self.target_pixels)
         sizes = [(nh, nw)]
         if s < 1.0:
         return sizes
     def _to_alpha(self, out_prob):
+        """Convert model output probabilities to a matte."""
         if self._has_prob_to_mask:
             try:
                 return self.core.output_prob_to_mask(out_prob, matting=True)
             except Exception:
                 pass
         t = torch.as_tensor(out_prob).float()
+        if t.ndim == 4:   # BxCxHxW
             return t[0, 0] if t.shape[1] >= 1 else t[0].mean(0)
+        if t.ndim == 3:   # CxHxW
             return t[0] if t.shape[0] >= 1 else t.mean(0)
         return t
         - frames 1..N: pass mask=None (propagation)
         """
         with self._lock:
+            # ---- 1) Normalize inputs to BCHW (image) and B1HW (mask), then collapse to CHW / 1HW
+            img_bchw = _to_bchw(image, self.device, is_mask=False)  # BCHW
             H, W = img_bchw.shape[-2], img_bchw.shape[-1]
             img_bchw = img_bchw.to(self.model_dtype, non_blocking=True)
             msk_b1hw = _to_bchw(mask, self.device, is_mask=True) if mask is not None else None
             if msk_b1hw is not None and msk_b1hw.shape[-2:] != (H, W):
                 msk_b1hw = _resize_bchw(msk_b1hw, (H, W), is_mask=True)
+            img_chw = _to_chw_image(img_bchw)                      # CHW
+            mask_1hw = _to_1hw_mask(msk_b1hw) if msk_b1hw is not None else None  # 1HW or None
+            # ---- 2) Downscale ladder to avoid OOM
             sizes = self._scaled_ladder(H, W)
             last_exc = None
             for (th, tw) in sizes:
                 try:
+                    # 2a) Resize image (bilinear) and mask (nearest) to ladder size
+                    if (th, tw) == (H, W):
+                        img_in = img_chw
+                        msk_in = mask_1hw
                     else:
+                        img_in = F.interpolate(img_chw.unsqueeze(0), size=(th, tw),
+                                               mode="bilinear", align_corners=False)[0]  # CHW
+                        msk_in = None
+                        if mask_1hw is not None:
+                            msk_in = F.interpolate(mask_1hw.unsqueeze(0), size=(th, tw),
+                                                   mode="nearest")[0]  # 1HW
+                    # 2b) Pad to multiple of 16 (per-model stability)
+                    img_in = _pad_to_multiple_3d(img_in)  # CHW
+                    if msk_in is not None:
+                        msk_in = _pad_to_multiple_3d(msk_in)  # 1HW
+                    # ---- 3) Forward pass (STRICT CHW / 1HW; NO TEMPORAL AXIS)
                     with torch.inference_mode():
+                        amp_ctx = (
+                            torch.autocast(device_type="cuda", dtype=self.autocast_dtype)
+                            if self.use_autocast else
+                            contextlib.nullcontext()
+                        )
                         with amp_ctx:
                             if not self.started:
+                                if msk_in is None:
                                     logger.warning("First frame arrived without a mask; returning neutral alpha.")
                                     return np.full((H, W), 0.5, dtype=np.float32)
+                                # Initialize with first frame (explicit mask)
+                                _ = self.core.step(image=img_in, mask=msk_in)   # ← CHW + 1HW
                                 if self._has_first_frame_pred:
+                                    out_prob = self.core.step(image=img_in, first_frame_pred=True)
                                 else:
+                                    out_prob = self.core.step(image=img_in)
                                 self.started = True
                             else:
+                                # Subsequent frames; core uses memory internally
+                                out_prob = self.core.step(image=img_in)         # ← CHW
+                    # ---- 4) Convert to alpha + unpad/upsample back to full res if needed
                     alpha = self._to_alpha(out_prob)
                     if alpha.ndim >= 2:
+                        alpha = alpha[..., :th, :tw]  # remove pad
                     if (th, tw) != (H, W):
                         a_b1hw = _to_b1hw_alpha(alpha, device=img_bchw.device)
                         a_b1hw = F.interpolate(a_b1hw, size=(H, W), mode="bilinear", align_corners=False)
                         alpha = a_b1hw[0, 0]
                     return _to_2d_alpha_numpy(alpha)
                 except torch.cuda.OutOfMemoryError as e:
                     last_exc = e
                     torch.cuda.empty_cache()
                     logger.debug(traceback.format_exc())
                     logger.warning(f"MatAnyone call failed at {th}x{tw}; retrying smaller. {e}")
                     continue
+            # ---- 5) All attempts failed – return input mask or neutral alpha
             logger.warning(f"MatAnyone calls failed; returning input mask or neutral alpha. {last_exc}")
             if mask_1hw is not None:
                 return _to_2d_alpha_numpy(mask_1hw)
             return np.full((H, W), 0.5, dtype=np.float32)
+# ============================================================================
+# 7) LOADER (MatAnyoneLoader)
+# ============================================================================
 class MatAnyoneLoader:
     """
     Official MatAnyone loader with stateful, OOM-resilient session adapter.
             model_cls, core_cls = self._import_model_and_core()
             model_dtype, use_autocast, autocast_dtype = _choose_precision(self.device)
             logger.info(f"MatAnyone precision: weights={model_dtype}, autocast={use_autocast and autocast_dtype}")
             # HF weights (safetensors)
             self.model = model_cls.from_pretrained(self.model_id)
+            # Move to device + dtype when possible
             try:
                 self.model = self.model.to(self.device).to(model_dtype)
             except Exception:
                 self.model = self.model.to(self.device)
             self.model.eval()
+            # Full default cfg from official config.json (kept; enables memory features)
             default_cfg = {
                 "amp": False,
+                "chunk_size": 1,  # single-frame stepping
                 "flip_aug": False,
                 "long_term": {
                     "buffer_tokens": 2000,
                 "max_mem_frames": 5,
                 "mem_every": 5,
                 "model": {
+                    "aux_loss": {"query": {"enabled": True, "weight": 0.01},
+                                 "sensory": {"enabled": True, "weight": 0.01}},
                     "embed_dim": 256,
                     "key_dim": 64,
+                    "mask_decoder": {"up_dims": [256, 128, 128, 64, 16]},
+                    "mask_encoder": {"final_dim": 256, "type": "resnet18"},
+                    "object_summarizer": {"add_pe": True, "embed_dim": 256, "num_summaries": 16},
                     "object_transformer": {
+                        "embed_dim": 256, "ff_dim": 2048, "num_blocks": 3, "num_heads": 8,
                         "num_queries": 16,
+                        "pixel_self_attention": {"add_pe_to_qkv": [True, True, False]},
+                        "query_self_attention": {"add_pe_to_qkv": [True, True, False]},
+                        "read_from_memory": {"add_pe_to_qkv": [True, True, False]},
+                        "read_from_past": {"add_pe_to_qkv": [True, True, False]},
+                        "read_from_pixel": {"add_pe_to_qkv": [True, True, False], "input_add_pe": False, "input_norm": False},
+                        "read_from_query": {"add_pe_to_qkv": [True, True, False], "output_norm": False}
                     },
                     "pixel_dim": 256,
+                    "pixel_encoder": {"ms_dims": [1024, 512, 256, 64, 3], "type": "resnet50"},
                     "pixel_mean": [0.485, 0.456, 0.406],
                     "pixel_pe_scale": 32,
                     "pixel_pe_temperature": 128,
                 "stagger_updates": 5,
                 "top_k": 30,
                 "use_all_masks": False,
+                "use_long_term": True,
                 "visualize": False,
                 "weights": "pretrained_models/matanyone.pth"
             }
+            # Merge with model.cfg if present; apply minimal overrides
             cfg = getattr(self.model, "cfg", default_cfg) or default_cfg
             if isinstance(cfg, dict):
+                cfg = dict(cfg)
             overrides = {
+                "chunk_size": 1,
+                "flip_aug": False,
             }
             cfg.update(overrides)
             cfg = EasyDict(cfg)
+            # Build inference core
             try:
                 self.core = core_cls(self.model, cfg=cfg)
             except TypeError:
                 self.core = core_cls(self.model)
+            # Some versions expose .to()
             try:
                 if hasattr(self.core, "to"):
                     self.core.to(self.device)
             except Exception:
                 pass
             # Build stateful adapter
             max_edge = int(os.environ.get("MATANYONE_MAX_EDGE", "768"))
             target_pixels = int(os.environ.get("MATANYONE_TARGET_PIXELS", "600000"))
             self.load_time = time.time() - t0
             logger.info(f"MatAnyone loaded in {self.load_time:.2f}s")
             return self.adapter
         except Exception as e:
             logger.error(f"Failed to load MatAnyone: {e}")
             logger.debug(traceback.format_exc())
             return None
     def cleanup(self):
+        """Release model/core and clear CUDA cache."""
         self.adapter = None
         self.core = None
         if self.model:
             torch.cuda.empty_cache()
     def get_info(self) -> Dict[str, Any]:
+        """Lightweight status for UI/self-check."""
         return {
             "loaded": self.adapter is not None,
             "model_id": self.model_id,
         }
     def debug_shapes(self, image, mask, tag: str = ""):
+        """Quick shape/dtype logger."""
         try:
             tv_img = torch.as_tensor(image)
             tv_msk = torch.as_tensor(mask) if mask is not None else None
         except Exception as e:
             logger.info(f"[{tag}] debug error: {e}")
+# ============================================================================
+# 8) PUBLIC SYMBOLS
+# ============================================================================
 # Names picked up by a star-import of this module. `_MatAnyoneSession` is listed
 # explicitly, so it is exported despite its underscore-prefixed name.
 __all__ = [
     "MatAnyoneLoader",
     "_MatAnyoneSession",
     "debug_shapes",
 ]
+# ============================================================================
+# 9) CLI DEMO (OPTIONAL QUICK TEST)
+# ============================================================================
 if __name__ == "__main__":
     # Smoke test: matte one image, optionally seeded with a grayscale mask file.
     import sys
     import cv2  # only for demo
     logging.basicConfig(level=logging.INFO)
     device = "cuda" if torch.cuda.is_available() else "cpu"
     cli_args = sys.argv[1:]
     if not cli_args:
         print(f"Usage: {sys.argv[0]} image.jpg [mask.png]")
         raise SystemExit(1)
     image_path = cli_args[0]
     mask_path = cli_args[1] if len(cli_args) > 1 else None
     img_bgr = cv2.imread(image_path, cv2.IMREAD_COLOR)
     if img_bgr is None:
         print(f"Could not load image {image_path}")
         raise SystemExit(2)
     # OpenCV decodes to BGR; the session is fed RGB
     img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
     seed_mask = None
     if mask_path:
         seed_mask = cv2.imread(mask_path, cv2.IMREAD_GRAYSCALE)
         # Values above 1 are rescaled from 0..255 to 0..1
         if seed_mask is not None and seed_mask.max() > 1:
             seed_mask = seed_mask.astype(np.float32) / 255.0
     loader = MatAnyoneLoader(device=device)
     session = loader.load()
     if not session:
         print("Failed to load MatAnyone")
         raise SystemExit(3)
     # No usable mask: fall back to an all-ones mask covering the full frame
     if seed_mask is None:
         seed_mask = np.ones(img_rgb.shape[:2], np.float32)
     alpha = session(img_rgb, seed_mask)
     alpha_u8 = (np.clip(alpha, 0, 1) * 255).astype(np.uint8)
     cv2.imwrite("alpha_out.png", alpha_u8)
     print("Alpha matte written to alpha_out.png")