MogensR committed on
Commit
2fe1a8c
·
1 Parent(s): 183c1c8

Update models/loaders/matanyone_loader.py

Browse files
Files changed (1) hide show
  1. models/loaders/matanyone_loader.py +155 -6
models/loaders/matanyone_loader.py CHANGED
@@ -11,6 +11,8 @@
11
  - Added: Force chunk_size=1, flip_aug=False in cfg to avoid dim mismatches
12
  - Added: Pad to multiple of 16 to avoid transformer patch issues
13
  - Added: Prefer fp16 over bf16 for Tesla T4 compatibility
 
 
14
  """
15
  from __future__ import annotations
16
  import os
@@ -24,6 +26,34 @@
24
  import inspect
25
  import threading
26
  logger = logging.getLogger(__name__)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  # ---------------------------------------------------------------------------
28
  # Utilities (shapes, dtype, scaling)
29
  # ---------------------------------------------------------------------------
@@ -34,10 +64,12 @@ def _select_device(pref: str) -> str:
34
  if pref == "cpu":
35
  return "cpu"
36
  return "cuda" if torch.cuda.is_available() else "cpu"
 
37
  def _as_tensor_on_device(x, device: str) -> torch.Tensor:
38
  if isinstance(x, torch.Tensor):
39
  return x.to(device, non_blocking=True)
40
  return torch.from_numpy(np.asarray(x)).to(device, non_blocking=True)
 
41
  def _to_bchw(x, device: str, is_mask: bool = False) -> torch.Tensor:
42
  """
43
  Normalize input to BCHW (image) or B1HW (mask).
@@ -72,10 +104,12 @@ def _to_bchw(x, device: str, is_mask: bool = False) -> torch.Tensor:
72
  x = x.repeat(1, 3, 1, 1)
73
  x = x.clamp_(0.0, 1.0)
74
  return x
 
75
  def _to_chw_image(img_bchw: torch.Tensor) -> torch.Tensor:
76
  if img_bchw.ndim == 4 and img_bchw.shape[0] == 1:
77
  return img_bchw[0]
78
  return img_bchw
 
79
  def _to_1hw_mask(msk_b1hw: torch.Tensor) -> Optional[torch.Tensor]:
80
  if msk_b1hw is None:
81
  return None
@@ -84,6 +118,7 @@ def _to_1hw_mask(msk_b1hw: torch.Tensor) -> Optional[torch.Tensor]:
84
  if msk_b1hw.ndim == 3 and msk_b1hw.shape[0] == 1:
85
  return msk_b1hw
86
  raise ValueError(f"Expected B1HW or 1HW, got {tuple(msk_b1hw.shape)}")
 
87
  def _resize_bchw(x: Optional[torch.Tensor], size_hw: Tuple[int, int], is_mask: bool = False) -> Optional[torch.Tensor]:
88
  if x is None:
89
  return None
@@ -91,6 +126,7 @@ def _resize_bchw(x: Optional[torch.Tensor], size_hw: Tuple[int, int], is_mask: b
91
  return x
92
  mode = "nearest" if is_mask else "bilinear"
93
  return F.interpolate(x, size_hw, mode=mode, align_corners=False if mode == "bilinear" else None)
 
94
  def _to_b1hw_alpha(alpha, device: str) -> torch.Tensor:
95
  t = torch.as_tensor(alpha, device=device).float()
96
  if t.ndim == 2:
@@ -117,6 +153,7 @@ def _to_b1hw_alpha(alpha, device: str) -> torch.Tensor:
117
  if t.shape[1] != 1:
118
  t = t[:, :1]
119
  return t.clamp_(0.0, 1.0).contiguous()
 
120
  def _to_2d_alpha_numpy(x) -> np.ndarray:
121
  t = torch.as_tensor(x).float()
122
  while t.ndim > 2:
@@ -129,6 +166,7 @@ def _to_2d_alpha_numpy(x) -> np.ndarray:
129
  t = t.clamp_(0.0, 1.0)
130
  out = t.detach().cpu().numpy().astype(np.float32)
131
  return np.ascontiguousarray(out)
 
132
  def _compute_scaled_size(h: int, w: int, max_edge: int, target_pixels: int) -> Tuple[int, int, float]:
133
  if h <= 0 or w <= 0:
134
  return h, w, 1.0
@@ -138,6 +176,7 @@ def _compute_scaled_size(h: int, w: int, max_edge: int, target_pixels: int) -> T
138
  nh = max(128, int(round(h * s))) # Force min 128 to avoid small-res bugs
139
  nw = max(128, int(round(w * s)))
140
  return nh, nw, s
 
141
  def _pad_to_multiple(t: Optional[torch.Tensor], multiple: int = 16) -> Optional[torch.Tensor]:
142
  if t is None:
143
  return None
@@ -155,6 +194,7 @@ def _pad_to_multiple(t: Optional[torch.Tensor], multiple: int = 16) -> Optional[
155
  if t.ndim == 2: # Shouldn't happen
156
  t = t.squeeze(0)
157
  return t
 
158
  def debug_shapes(tag: str, image, mask) -> None:
159
  def _info(name, v):
160
  try:
@@ -166,6 +206,7 @@ def _info(name, v):
166
  logger.info(f"[{tag}:{name}] type={type(v)} err={e}")
167
  _info("image", image)
168
  _info("mask", mask)
 
169
  # ---------------------------------------------------------------------------
170
  # Precision selection
171
  # ---------------------------------------------------------------------------
@@ -181,6 +222,7 @@ def _choose_precision(device: str) -> Tuple[torch.dtype, bool, Optional[torch.dt
181
  if bf16_ok:
182
  return torch.bfloat16, True, torch.bfloat16
183
  return torch.float32, False, None
 
184
  # ---------------------------------------------------------------------------
185
  # Stateful Adapter around InferenceCore
186
  # ---------------------------------------------------------------------------
@@ -215,6 +257,7 @@ def __init__(
215
  except Exception:
216
  self._has_first_frame_pred = True
217
  self._has_prob_to_mask = hasattr(self.core, "output_prob_to_mask")
 
218
  def reset(self):
219
  with self._lock:
220
  try:
@@ -223,6 +266,7 @@ def reset(self):
223
  except Exception:
224
  pass
225
  self.started = False
 
226
  def _scaled_ladder(self, H: int, W: int) -> List[Tuple[int, int]]:
227
  nh, nw, s = _compute_scaled_size(H, W, self.max_edge, self.target_pixels)
228
  sizes = [(nh, nw)]
@@ -235,6 +279,7 @@ def _scaled_ladder(self, H: int, W: int) -> List[Tuple[int, int]]:
235
  if sizes[-1] != (cur_h, cur_w):
236
  sizes.append((cur_h, cur_w))
237
  return sizes
 
238
  def _to_alpha(self, out_prob):
239
  if self._has_prob_to_mask:
240
  try:
@@ -247,6 +292,7 @@ def _to_alpha(self, out_prob):
247
  if t.ndim == 3:
248
  return t[0] if t.shape[0] >= 1 else t.mean(0)
249
  return t
 
250
  def __call__(self, image, mask=None, **kwargs) -> np.ndarray:
251
  """
252
  Returns a 2-D float32 alpha [H,W].
@@ -329,6 +375,7 @@ def __exit__(self, *a): return False
329
  if mask_1hw is not None:
330
  return _to_2d_alpha_numpy(mask_1hw)
331
  return np.full((H, W), 0.5, dtype=np.float32)
 
332
  # ---------------------------------------------------------------------------
333
  # Loader
334
  # ---------------------------------------------------------------------------
@@ -345,6 +392,7 @@ def __init__(self, device: str = "cuda", cache_dir: str = "./checkpoints/matanyo
345
  self.adapter = None
346
  self.model_id = "PeiqingYang/MatAnyone"
347
  self.load_time = 0.0
 
348
  # --- Robust imports (works with different packaging layouts) ---
349
  def _import_model_and_core(self):
350
  model_cls = core_cls = None
@@ -372,6 +420,7 @@ def _import_model_and_core(self):
372
  if model_cls is None or core_cls is None:
373
  raise ImportError("Could not import MatAnyone / InferenceCore: " + " | ".join(err_msgs))
374
  return model_cls, core_cls
 
375
  def load(self) -> Optional[Any]:
376
  logger.info(f"Loading MatAnyone from HF: {self.model_id} (device={self.device})")
377
  t0 = time.time()
@@ -386,16 +435,111 @@ def load(self) -> Optional[Any]:
386
  except Exception:
387
  self.model = self.model.to(self.device)
388
  self.model.eval()
389
- # Override cfg to disable features causing dim mismatches
390
  default_cfg = {
391
- 'chunk_size': 1,
392
- 'flip_aug': False,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
393
  }
 
394
  cfg = getattr(self.model, "cfg", default_cfg) or default_cfg
395
  if isinstance(cfg, dict):
396
- cfg.update(default_cfg) # Override
397
- else:
398
- cfg = default_cfg
 
 
 
 
 
 
399
  # Inference core
400
  try:
401
  self.core = core_cls(self.model, cfg=cfg)
@@ -426,6 +570,7 @@ def load(self) -> Optional[Any]:
426
  logger.error(f"Failed to load MatAnyone: {e}")
427
  logger.debug(traceback.format_exc())
428
  return None
 
429
  def cleanup(self):
430
  self.adapter = None
431
  self.core = None
@@ -437,6 +582,7 @@ def cleanup(self):
437
  self.model = None
438
  if torch.cuda.is_available():
439
  torch.cuda.empty_cache()
 
440
  def get_info(self) -> Dict[str, Any]:
441
  return {
442
  "loaded": self.adapter is not None,
@@ -445,6 +591,7 @@ def get_info(self) -> Dict[str, Any]:
445
  "load_time": self.load_time,
446
  "model_type": type(self.model).__name__ if self.model else None,
447
  }
 
448
  def debug_shapes(self, image, mask, tag: str = ""):
449
  try:
450
  tv_img = torch.as_tensor(image)
@@ -454,6 +601,7 @@ def debug_shapes(self, image, mask, tag: str = ""):
454
  logger.info(f"[{tag}:mask ] shape={tuple(tv_msk.shape)} dtype={tv_msk.dtype}")
455
  except Exception as e:
456
  logger.info(f"[{tag}] debug error: {e}")
 
457
  # ---------------------------------------------------------------------------
458
  # Public symbols
459
  # ---------------------------------------------------------------------------
@@ -469,6 +617,7 @@ def debug_shapes(self, image, mask, tag: str = ""):
469
  "_compute_scaled_size",
470
  "debug_shapes",
471
  ]
 
472
  # ---------------------------------------------------------------------------
473
  # Optional CLI for quick testing (no circular imports)
474
  # ---------------------------------------------------------------------------
 
11
  - Added: Force chunk_size=1, flip_aug=False in cfg to avoid dim mismatches
12
  - Added: Pad to multiple of 16 to avoid transformer patch issues
13
  - Added: Prefer fp16 over bf16 for Tesla T4 compatibility
14
+ - New: EasyDict polyfill and conversion for cfg to fix 'dict no attribute' errors
15
+ - New: Full default cfg from official config.json to ensure keys like mem_every are present
16
  """
17
  from __future__ import annotations
18
  import os
 
26
  import inspect
27
  import threading
28
  logger = logging.getLogger(__name__)
29
+
30
+ # EasyDict polyfill (recursive dict with dot access)
31
+ class EasyDict(dict):
32
+ def __init__(self, d=None, **kwargs):
33
+ if d is None:
34
+ d = {}
35
+ if kwargs:
36
+ d.update(**kwargs)
37
+ for k, v in d.items():
38
+ if isinstance(v, dict):
39
+ self[k] = EasyDict(v)
40
+ elif isinstance(v, list):
41
+ self[k] = [EasyDict(i) if isinstance(i, dict) else i for i in v]
42
+ else:
43
+ self[k] = v
44
+
45
+ def __getattr__(self, name):
46
+ try:
47
+ return self[name]
48
+ except KeyError:
49
+ raise AttributeError(name)
50
+
51
+ def __setattr__(self, name, value):
52
+ self[name] = value
53
+
54
+ def __delattr__(self, name):
55
+ del self[name]
56
+
57
  # ---------------------------------------------------------------------------
58
  # Utilities (shapes, dtype, scaling)
59
  # ---------------------------------------------------------------------------
 
64
  if pref == "cpu":
65
  return "cpu"
66
  return "cuda" if torch.cuda.is_available() else "cpu"
67
+
68
  def _as_tensor_on_device(x, device: str) -> torch.Tensor:
69
  if isinstance(x, torch.Tensor):
70
  return x.to(device, non_blocking=True)
71
  return torch.from_numpy(np.asarray(x)).to(device, non_blocking=True)
72
+
73
  def _to_bchw(x, device: str, is_mask: bool = False) -> torch.Tensor:
74
  """
75
  Normalize input to BCHW (image) or B1HW (mask).
 
104
  x = x.repeat(1, 3, 1, 1)
105
  x = x.clamp_(0.0, 1.0)
106
  return x
107
+
108
  def _to_chw_image(img_bchw: torch.Tensor) -> torch.Tensor:
109
  if img_bchw.ndim == 4 and img_bchw.shape[0] == 1:
110
  return img_bchw[0]
111
  return img_bchw
112
+
113
  def _to_1hw_mask(msk_b1hw: torch.Tensor) -> Optional[torch.Tensor]:
114
  if msk_b1hw is None:
115
  return None
 
118
  if msk_b1hw.ndim == 3 and msk_b1hw.shape[0] == 1:
119
  return msk_b1hw
120
  raise ValueError(f"Expected B1HW or 1HW, got {tuple(msk_b1hw.shape)}")
121
+
122
  def _resize_bchw(x: Optional[torch.Tensor], size_hw: Tuple[int, int], is_mask: bool = False) -> Optional[torch.Tensor]:
123
  if x is None:
124
  return None
 
126
  return x
127
  mode = "nearest" if is_mask else "bilinear"
128
  return F.interpolate(x, size_hw, mode=mode, align_corners=False if mode == "bilinear" else None)
129
+
130
  def _to_b1hw_alpha(alpha, device: str) -> torch.Tensor:
131
  t = torch.as_tensor(alpha, device=device).float()
132
  if t.ndim == 2:
 
153
  if t.shape[1] != 1:
154
  t = t[:, :1]
155
  return t.clamp_(0.0, 1.0).contiguous()
156
+
157
  def _to_2d_alpha_numpy(x) -> np.ndarray:
158
  t = torch.as_tensor(x).float()
159
  while t.ndim > 2:
 
166
  t = t.clamp_(0.0, 1.0)
167
  out = t.detach().cpu().numpy().astype(np.float32)
168
  return np.ascontiguousarray(out)
169
+
170
  def _compute_scaled_size(h: int, w: int, max_edge: int, target_pixels: int) -> Tuple[int, int, float]:
171
  if h <= 0 or w <= 0:
172
  return h, w, 1.0
 
176
  nh = max(128, int(round(h * s))) # Force min 128 to avoid small-res bugs
177
  nw = max(128, int(round(w * s)))
178
  return nh, nw, s
179
+
180
  def _pad_to_multiple(t: Optional[torch.Tensor], multiple: int = 16) -> Optional[torch.Tensor]:
181
  if t is None:
182
  return None
 
194
  if t.ndim == 2: # Shouldn't happen
195
  t = t.squeeze(0)
196
  return t
197
+
198
  def debug_shapes(tag: str, image, mask) -> None:
199
  def _info(name, v):
200
  try:
 
206
  logger.info(f"[{tag}:{name}] type={type(v)} err={e}")
207
  _info("image", image)
208
  _info("mask", mask)
209
+
210
  # ---------------------------------------------------------------------------
211
  # Precision selection
212
  # ---------------------------------------------------------------------------
 
222
  if bf16_ok:
223
  return torch.bfloat16, True, torch.bfloat16
224
  return torch.float32, False, None
225
+
226
  # ---------------------------------------------------------------------------
227
  # Stateful Adapter around InferenceCore
228
  # ---------------------------------------------------------------------------
 
257
  except Exception:
258
  self._has_first_frame_pred = True
259
  self._has_prob_to_mask = hasattr(self.core, "output_prob_to_mask")
260
+
261
  def reset(self):
262
  with self._lock:
263
  try:
 
266
  except Exception:
267
  pass
268
  self.started = False
269
+
270
  def _scaled_ladder(self, H: int, W: int) -> List[Tuple[int, int]]:
271
  nh, nw, s = _compute_scaled_size(H, W, self.max_edge, self.target_pixels)
272
  sizes = [(nh, nw)]
 
279
  if sizes[-1] != (cur_h, cur_w):
280
  sizes.append((cur_h, cur_w))
281
  return sizes
282
+
283
  def _to_alpha(self, out_prob):
284
  if self._has_prob_to_mask:
285
  try:
 
292
  if t.ndim == 3:
293
  return t[0] if t.shape[0] >= 1 else t.mean(0)
294
  return t
295
+
296
  def __call__(self, image, mask=None, **kwargs) -> np.ndarray:
297
  """
298
  Returns a 2-D float32 alpha [H,W].
 
375
  if mask_1hw is not None:
376
  return _to_2d_alpha_numpy(mask_1hw)
377
  return np.full((H, W), 0.5, dtype=np.float32)
378
+
379
  # ---------------------------------------------------------------------------
380
  # Loader
381
  # ---------------------------------------------------------------------------
 
392
  self.adapter = None
393
  self.model_id = "PeiqingYang/MatAnyone"
394
  self.load_time = 0.0
395
+
396
  # --- Robust imports (works with different packaging layouts) ---
397
  def _import_model_and_core(self):
398
  model_cls = core_cls = None
 
420
  if model_cls is None or core_cls is None:
421
  raise ImportError("Could not import MatAnyone / InferenceCore: " + " | ".join(err_msgs))
422
  return model_cls, core_cls
423
+
424
  def load(self) -> Optional[Any]:
425
  logger.info(f"Loading MatAnyone from HF: {self.model_id} (device={self.device})")
426
  t0 = time.time()
 
435
  except Exception:
436
  self.model = self.model.to(self.device)
437
  self.model.eval()
438
+ # Full default cfg from official config.json
439
  default_cfg = {
440
+ "amp": False,
441
+ "chunk_size": -1,
442
+ "flip_aug": False,
443
+ "long_term": {
444
+ "buffer_tokens": 2000,
445
+ "count_usage": True,
446
+ "max_mem_frames": 10,
447
+ "max_num_tokens": 10000,
448
+ "min_mem_frames": 5,
449
+ "num_prototypes": 128
450
+ },
451
+ "max_internal_size": -1,
452
+ "max_mem_frames": 5,
453
+ "mem_every": 5,
454
+ "model": {
455
+ "aux_loss": {
456
+ "query": {
457
+ "enabled": True,
458
+ "weight": 0.01
459
+ },
460
+ "sensory": {
461
+ "enabled": True,
462
+ "weight": 0.01
463
+ }
464
+ },
465
+ "embed_dim": 256,
466
+ "key_dim": 64,
467
+ "mask_decoder": {
468
+ "up_dims": [256, 128, 128, 64, 16]
469
+ },
470
+ "mask_encoder": {
471
+ "final_dim": 256,
472
+ "type": "resnet18"
473
+ },
474
+ "object_summarizer": {
475
+ "add_pe": True,
476
+ "embed_dim": 256,
477
+ "num_summaries": 16
478
+ },
479
+ "object_transformer": {
480
+ "embed_dim": 256,
481
+ "ff_dim": 2048,
482
+ "num_blocks": 3,
483
+ "num_heads": 8,
484
+ "num_queries": 16,
485
+ "pixel_self_attention": {
486
+ "add_pe_to_qkv": [True, True, False]
487
+ },
488
+ "query_self_attention": {
489
+ "add_pe_to_qkv": [True, True, False]
490
+ },
491
+ "read_from_memory": {
492
+ "add_pe_to_qkv": [True, True, False]
493
+ },
494
+ "read_from_past": {
495
+ "add_pe_to_qkv": [True, True, False]
496
+ },
497
+ "read_from_pixel": {
498
+ "add_pe_to_qkv": [True, True, False],
499
+ "input_add_pe": False,
500
+ "input_norm": False
501
+ },
502
+ "read_from_query": {
503
+ "add_pe_to_qkv": [True, True, False],
504
+ "output_norm": False
505
+ }
506
+ },
507
+ "pixel_dim": 256,
508
+ "pixel_encoder": {
509
+ "ms_dims": [1024, 512, 256, 64, 3],
510
+ "type": "resnet50"
511
+ },
512
+ "pixel_mean": [0.485, 0.456, 0.406],
513
+ "pixel_pe_scale": 32,
514
+ "pixel_pe_temperature": 128,
515
+ "pixel_std": [0.229, 0.224, 0.225],
516
+ "pretrained_resnet": False,
517
+ "sensory_dim": 256,
518
+ "value_dim": 256
519
+ },
520
+ "output_dir": None,
521
+ "save_all": True,
522
+ "save_aux": False,
523
+ "save_scores": False,
524
+ "stagger_updates": 5,
525
+ "top_k": 30,
526
+ "use_all_masks": False,
527
+ "use_long_term": False,
528
+ "visualize": False,
529
+ "weights": "pretrained_models/matanyone.pth"
530
  }
531
+ # Get cfg from model if available, else default
532
  cfg = getattr(self.model, "cfg", default_cfg) or default_cfg
533
  if isinstance(cfg, dict):
534
+ cfg = dict(cfg) # Copy to avoid modifying model.cfg
535
+ # Override specific values
536
+ overrides = {
537
+ 'chunk_size': 1,
538
+ 'flip_aug': False,
539
+ }
540
+ cfg.update(overrides)
541
+ # Convert to EasyDict for dot access
542
+ cfg = EasyDict(cfg)
543
  # Inference core
544
  try:
545
  self.core = core_cls(self.model, cfg=cfg)
 
570
  logger.error(f"Failed to load MatAnyone: {e}")
571
  logger.debug(traceback.format_exc())
572
  return None
573
+
574
  def cleanup(self):
575
  self.adapter = None
576
  self.core = None
 
582
  self.model = None
583
  if torch.cuda.is_available():
584
  torch.cuda.empty_cache()
585
+
586
  def get_info(self) -> Dict[str, Any]:
587
  return {
588
  "loaded": self.adapter is not None,
 
591
  "load_time": self.load_time,
592
  "model_type": type(self.model).__name__ if self.model else None,
593
  }
594
+
595
  def debug_shapes(self, image, mask, tag: str = ""):
596
  try:
597
  tv_img = torch.as_tensor(image)
 
601
  logger.info(f"[{tag}:mask ] shape={tuple(tv_msk.shape)} dtype={tv_msk.dtype}")
602
  except Exception as e:
603
  logger.info(f"[{tag}] debug error: {e}")
604
+
605
  # ---------------------------------------------------------------------------
606
  # Public symbols
607
  # ---------------------------------------------------------------------------
 
617
  "_compute_scaled_size",
618
  "debug_shapes",
619
  ]
620
+
621
  # ---------------------------------------------------------------------------
622
  # Optional CLI for quick testing (no circular imports)
623
  # ---------------------------------------------------------------------------