MogensR committed on
Commit
5711ea9
·
1 Parent(s): 19f3f1c

Update models/loaders/matanyone_loader.py

Browse files
Files changed (1) hide show
  1. models/loaders/matanyone_loader.py +31 -36
models/loaders/matanyone_loader.py CHANGED
@@ -13,9 +13,7 @@
13
  - Added: Prefer fp16 over bf16 for Tesla T4 compatibility
14
  - New: EasyDict polyfill and conversion for cfg to fix 'dict no attribute' errors
15
  - New: Full default cfg from official config.json to fix 'mem_every' issues
16
- - Update: Disable memory propagation by setting mem_every=-1, max_mem_frames=0 to fix dim mismatch in fusion
17
- - Fix: Merge long_term overrides to preserve keys like count_usage
18
- - Fix: Syntax error in _to_bchw (== instead of =)
19
  """
20
  from __future__ import annotations
21
  import os
@@ -188,6 +186,8 @@ def _pad_to_multiple(t: Optional[torch.Tensor], multiple: int = 16) -> Optional[
188
  elif t.ndim == 2:
189
  h, w = t.shape
190
  t = t.unsqueeze(0) # Temp to 3D for padding
 
 
191
  else:
192
  raise ValueError(f"Unsupported ndim for padding: {t.ndim}")
193
  pad_h = (multiple - h % multiple) % multiple
@@ -326,11 +326,20 @@ def __call__(self, image, mask=None, **kwargs) -> np.ndarray:
326
  # nearest to keep binary-like edges
327
  msk_in = F.interpolate(mask_1hw.unsqueeze(0), size=(th, tw), mode="nearest")[0]
328
  img_chw = _to_chw_image(img_in).contiguous() # [C,H,W]
329
- # Pad to multiple of 16
330
- img_chw = _pad_to_multiple(img_chw)
 
331
  if msk_in is not None:
332
- msk_in = _pad_to_multiple(msk_in)
333
- ph, pw = img_chw.shape[-2:]
 
 
 
 
 
 
 
 
334
  with torch.inference_mode():
335
  if self.use_autocast:
336
  amp_ctx = torch.autocast(device_type="cuda", dtype=self.autocast_dtype)
@@ -341,22 +350,23 @@ def __exit__(self, *a): return False
341
  amp_ctx = _NoOp()
342
  with amp_ctx:
343
  if not self.started:
344
- if msk_in is None:
345
  # Should not happen when used correctly — still be defensive
346
  logger.warning("First frame arrived without a mask; returning neutral alpha.")
347
  return np.full((H, W), 0.5, dtype=np.float32)
348
- # CRITICAL: pass **1HW** to .step(mask=...)
349
- _ = self.core.step(image=img_chw, mask=msk_in)
350
  if self._has_first_frame_pred:
351
- out_prob = self.core.step(image=img_chw, first_frame_pred=True)
352
  else:
353
- out_prob = self.core.step(image=img_chw)
354
  self.started = True
355
  else:
356
- out_prob = self.core.step(image=img_chw)
357
  alpha = self._to_alpha(out_prob)
358
  # Unpad to scaled size, then upsample if needed
359
- alpha = alpha[:th, :tw]
 
360
  # Upsample alpha back if we ran at a smaller scale
361
  if (th, tw) != (H, W):
362
  a_b1hw = _to_b1hw_alpha(alpha, device=img_bchw.device)
@@ -441,7 +451,7 @@ def load(self) -> Optional[Any]:
441
  # Full default cfg from official config.json
442
  default_cfg = {
443
  "amp": False,
444
- "chunk_size": -1,
445
  "flip_aug": False,
446
  "long_term": {
447
  "buffer_tokens": 2000,
@@ -527,7 +537,7 @@ def load(self) -> Optional[Any]:
527
  "stagger_updates": 5,
528
  "top_k": 30,
529
  "use_all_masks": False,
530
- "use_long_term": False,
531
  "visualize": False,
532
  "weights": "pretrained_models/matanyone.pth"
533
  }
@@ -535,28 +545,13 @@ def load(self) -> Optional[Any]:
535
  cfg = getattr(self.model, "cfg", default_cfg) or default_cfg
536
  if isinstance(cfg, dict):
537
  cfg = dict(cfg) # Copy to avoid modifying model.cfg
538
- # Override specific values to disable memory and potential dim issues
539
  overrides = {
540
- 'chunk_size': 1,
541
- 'flip_aug': False,
542
- 'mem_every': -1,
543
- 'max_mem_frames': 0,
544
- 'use_long_term': False,
545
  }
546
  cfg.update(overrides)
547
- # Merge long_term overrides without removing keys
548
- long_term_defaults = {
549
- "buffer_tokens": 2000,
550
- "count_usage": True,
551
- "max_mem_frames": 0,
552
- "max_num_tokens": 10000,
553
- "min_mem_frames": 0,
554
- "num_prototypes": 128
555
- }
556
- if 'long_term' in cfg:
557
- cfg['long_term'].update({k: v for k, v in long_term_defaults.items() if k not in cfg['long_term'] or k in ['max_mem_frames', 'min_mem_frames']})
558
- else:
559
- cfg['long_term'] = long_term_defaults
560
  # Convert to EasyDict for dot access
561
  cfg = EasyDict(cfg)
562
  # Inference core
@@ -564,7 +559,7 @@ def load(self) -> Optional[Any]:
564
  self.core = core_cls(self.model, cfg=cfg)
565
  except TypeError:
566
  self.core = core_cls(self.model)
567
- # Some versions expose .to(), some dont — best effort
568
  try:
569
  if hasattr(self.core, "to"):
570
  self.core.to(self.device)
 
13
  - Added: Prefer fp16 over bf16 for Tesla T4 compatibility
14
  - New: EasyDict polyfill and conversion for cfg to fix 'dict no attribute' errors
15
  - New: Full default cfg from official config.json to fix 'mem_every' issues
16
+ - FIXED: Re-enabled memory features and added temporal dimension support
 
 
17
  """
18
  from __future__ import annotations
19
  import os
 
186
  elif t.ndim == 2:
187
  h, w = t.shape
188
  t = t.unsqueeze(0) # Temp to 3D for padding
189
+ elif t.ndim == 4: # Handle [T, C, H, W] or similar
190
+ return t # Skip padding for temporal tensors
191
  else:
192
  raise ValueError(f"Unsupported ndim for padding: {t.ndim}")
193
  pad_h = (multiple - h % multiple) % multiple
 
326
  # nearest to keep binary-like edges
327
  msk_in = F.interpolate(mask_1hw.unsqueeze(0), size=(th, tw), mode="nearest")[0]
328
  img_chw = _to_chw_image(img_in).contiguous() # [C,H,W]
329
+
330
+ # ADD TEMPORAL DIMENSION for video processing mode
331
+ img_tchw = img_chw.unsqueeze(0) # [C,H,W] -> [T=1,C,H,W]
332
  if msk_in is not None:
333
+ msk_t1hw = msk_in.unsqueeze(0) # [1,H,W] -> [T=1,1,H,W]
334
+ else:
335
+ msk_t1hw = None
336
+
337
+ # Pad to multiple of 16 (skip for temporal tensors)
338
+ img_tchw = _pad_to_multiple(img_tchw)
339
+ if msk_t1hw is not None:
340
+ msk_t1hw = _pad_to_multiple(msk_t1hw)
341
+
342
+ ph, pw = img_tchw.shape[-2:]
343
  with torch.inference_mode():
344
  if self.use_autocast:
345
  amp_ctx = torch.autocast(device_type="cuda", dtype=self.autocast_dtype)
 
350
  amp_ctx = _NoOp()
351
  with amp_ctx:
352
  if not self.started:
353
+ if msk_t1hw is None:
354
  # Should not happen when used correctly — still be defensive
355
  logger.warning("First frame arrived without a mask; returning neutral alpha.")
356
  return np.full((H, W), 0.5, dtype=np.float32)
357
+ # Pass temporal tensors to core
358
+ _ = self.core.step(image=img_tchw, mask=msk_t1hw)
359
  if self._has_first_frame_pred:
360
+ out_prob = self.core.step(image=img_tchw, first_frame_pred=True)
361
  else:
362
+ out_prob = self.core.step(image=img_tchw)
363
  self.started = True
364
  else:
365
+ out_prob = self.core.step(image=img_tchw)
366
  alpha = self._to_alpha(out_prob)
367
  # Unpad to scaled size, then upsample if needed
368
+ if alpha.ndim >= 2:
369
+ alpha = alpha[..., :th, :tw]
370
  # Upsample alpha back if we ran at a smaller scale
371
  if (th, tw) != (H, W):
372
  a_b1hw = _to_b1hw_alpha(alpha, device=img_bchw.device)
 
451
  # Full default cfg from official config.json
452
  default_cfg = {
453
  "amp": False,
454
+ "chunk_size": 1, # Keep at 1 for single frame processing
455
  "flip_aug": False,
456
  "long_term": {
457
  "buffer_tokens": 2000,
 
537
  "stagger_updates": 5,
538
  "top_k": 30,
539
  "use_all_masks": False,
540
+ "use_long_term": True, # Enable long-term memory
541
  "visualize": False,
542
  "weights": "pretrained_models/matanyone.pth"
543
  }
 
545
  cfg = getattr(self.model, "cfg", default_cfg) or default_cfg
546
  if isinstance(cfg, dict):
547
  cfg = dict(cfg) # Copy to avoid modifying model.cfg
548
+ # Only override minimal settings for compatibility
549
  overrides = {
550
+ 'chunk_size': 1, # Process one frame at a time
551
+ 'flip_aug': False, # Disable augmentation
552
+ # Keep memory features enabled!
 
 
553
  }
554
  cfg.update(overrides)
 
 
 
 
 
 
 
 
 
 
 
 
 
555
  # Convert to EasyDict for dot access
556
  cfg = EasyDict(cfg)
557
  # Inference core
 
559
  self.core = core_cls(self.model, cfg=cfg)
560
  except TypeError:
561
  self.core = core_cls(self.model)
562
+ # Some versions expose .to(), some don't — best effort
563
  try:
564
  if hasattr(self.core, "to"):
565
  self.core.to(self.device)