Spaces:

MogensR
/

VideoBackgroundReplacer

Paused

App Files Files Community

MogensR committed on Aug 28, 2025

Commit

e2ca8f7

1 Parent(s): b8dd531

Update models/loaders/matanyone_loader.py

Browse files

Files changed (1) hide show

models/loaders/matanyone_loader.py +67 -18

models/loaders/matanyone_loader.py CHANGED Viewed

@@ -107,13 +107,50 @@ def _resize_bchw(x: Optional[torch.Tensor], size_hw: Tuple[int, int], is_mask=Fa
     mode = "nearest" if is_mask else "bilinear"
     return F.interpolate(x, size=size_hw, mode=mode, align_corners=False if mode == "bilinear" else None)
 def _to_2d_alpha_numpy(x) -> np.ndarray:
     t = torch.as_tensor(x).float()
     while t.ndim > 2:
-        if t.ndim == 3:
-            t = t[0] if t.shape[0] >= 1 else t.squeeze(0)
         else:
-            t = t.squeeze()
     t = t.clamp_(0.0, 1.0)
     out = t.detach().cpu().numpy().astype(np.float32)
     return np.ascontiguousarray(out)
@@ -188,17 +225,22 @@ def _compute_scaled_size(self, h: int, w: int) -> Tuple[int, int, float]:
         return nh, nw, s
     def _to_alpha(self, out_prob):
         if self._has_prob_to_mask:
             try:
                 return self.core.output_prob_to_mask(out_prob, matting=True)
             except Exception:
                 pass
         t = torch.as_tensor(out_prob).float()
-        if t.ndim == 3 and t.shape[0] >= 1:
-            return t[0]
-        if t.ndim >= 2:
-            return t
-        return torch.full((1, 1), 0.5, dtype=torch.float32, device=t.device if t.is_cuda else "cpu")
     # ---- main call ----
     def __call__(self, image, mask=None, **kwargs) -> np.ndarray:
@@ -217,12 +259,20 @@ def __call__(self, image, mask=None, **kwargs) -> np.ndarray:
         # dtype alignment for activations
         img_bchw = img_bchw.to(self.model_dtype, non_blocking=True)
-        # initial scale + fallbacks
         nh, nw, s = self._compute_scaled_size(H, W)
         scales = [(nh, nw)]
         if s < 1.0:
-            scales.append((max(1, int(nh * 0.85)), max(1, int(nw * 0.85))))
-            scales.append((max(1, int(nh * 0.70)), max(1, int(nw * 0.70))))
         last_exc = None
@@ -232,11 +282,9 @@ def __call__(self, image, mask=None, **kwargs) -> np.ndarray:
                 img_in = _resize_bchw(img_bchw, (th, tw), is_mask=False)
                 msk_in = _resize_bchw(msk_b1hw, (th, tw), is_mask=True) if msk_b1hw is not None else None
-                # ---- IMPORTANT SHAPE CHANGES (only edit) ----
                 img_chw = _to_chw_image(img_in).contiguous()                  # [C,H,W]
                 m_1hw  = _to_1hw_mask(msk_in) if msk_in is not None else None # [1,H,W] or None
-                mask_2d = m_1hw[0].contiguous() if m_1hw is not None else None # [H,W] or None
-                # ------------------------------------------------
                 # inference with autocast + inference_mode
                 with torch.inference_mode():
@@ -268,11 +316,12 @@ def __exit__(self, *args): return False
                             out_prob = self.core.step(image=img_chw)
                             alpha = self._to_alpha(out_prob)
-                # upsample back to original resolution if scaled
                 if (th, tw) != (H, W):
-                    alpha = torch.as_tensor(alpha).unsqueeze(0).unsqueeze(0).float()
-                    alpha = F.interpolate(alpha, size=(H, W), mode="bilinear", align_corners=False)
-                    alpha = alpha.squeeze(0).squeeze(0)
                 return _to_2d_alpha_numpy(alpha)

     mode = "nearest" if is_mask else "bilinear"
     return F.interpolate(x, size=size_hw, mode=mode, align_corners=False if mode == "bilinear" else None)
def _to_b1hw_alpha(alpha, device: str) -> torch.Tensor:
    """
    Convert any plausible alpha/prob output into a [1,1,H,W] float tensor in [0,1].

    Gives the upsampling code a guaranteed 4-D input so it never has to deal
    with 5-D/6-D tensors.

    Args:
        alpha: Tensor or array-like accepted by ``torch.as_tensor``. Handled
            layouts: [H,W], [C,H,W] / [1,H,W], [H,W,C], [B,C,H,W], and
            higher-rank tensors (extra leading dims are dropped by taking
            index 0). Rank 0/1 inputs are padded with leading singleton dims.
        device: Device passed straight to ``torch.as_tensor``.

    Returns:
        A contiguous float32 tensor of shape [1,1,H,W], clamped to [0,1].
        Only the first batch item and first channel are kept.

    The input is never modified: clamping is done out-of-place.
    """
    t = torch.as_tensor(alpha, device=device).float()
    # Drop surplus leading dims by indexing. squeeze(0) is a no-op whenever the
    # leading dim is not 1, so looping on it would never terminate.
    while t.ndim > 4:
        t = t[0]
    if t.ndim == 3:
        looks_chw = t.shape[0] in (1, 3, 4)
        if not looks_chw and t.shape[-1] in (1, 3, 4):
            # HWC (unexpected, but handle): first channel, moved to the front.
            t = t[..., :1].permute(2, 0, 1)
        else:
            # CHW / 1HW, or an unrecognised layout: fall back to first-dim
            # semantics and keep the first slice.
            t = t[:1]
    # Pad [H,W] / [1,H,W] (and rank 0/1 inputs) up to four dims.
    while t.ndim < 4:
        t = t.unsqueeze(0)
    # [B,C,H,W] -> first sample, first channel, on every path.
    t = t[:1, :1]
    # Out-of-place clamp: as_tensor()/float() may alias the caller's tensor.
    # NOTE(review): in-place clamp_ may also be rejected for tensors created
    # under torch.inference_mode() once that context has exited - confirm.
    return t.clamp(0.0, 1.0).contiguous()
 def _to_2d_alpha_numpy(x) -> np.ndarray:
     """
     Reduce an alpha/probability output to a float32 NumPy array in [0,1].

     Args:
         x: Tensor or array-like accepted by ``torch.as_tensor``.

     Returns:
         A C-contiguous float32 array. Inputs with more than two dims are
         reduced to [H,W] by taking index 0 of each leading dim (first batch
         item / first channel). Inputs with two or fewer dims keep their rank.

     The input is never modified: clamping is done out-of-place.
     """
     t = torch.as_tensor(x).float()
     # Index instead of squeeze(0): squeeze leaves the tensor unchanged when the
     # leading dim is not 1, which would make this loop spin forever on inputs
     # such as [3,H,W] or [2,3,H,W]. For a leading dim of 1 the result is the same.
     while t.ndim > 2:
         t = t[0]
     # Out-of-place clamp so a tensor aliased by as_tensor()/float() is not
     # altered behind the caller's back.
     t = t.clamp(0.0, 1.0)
     out = t.detach().cpu().numpy()
     return np.ascontiguousarray(out, dtype=np.float32)
         return nh, nw, s
     def _to_alpha(self, out_prob):
         """
         Turn the core's raw output into an alpha tensor.

         When ``self._has_prob_to_mask`` is truthy, the result of
         ``self.core.output_prob_to_mask(out_prob, matting=True)`` is returned
         as-is. If that call raises, or the flag is falsy, the output is
         normalised by hand:

         * 4-D with non-empty first two dims -> element ``[0, 0]``
         * 3-D -> first slice (mean over dim 0 if that dim is empty)
         * anything else -> returned unchanged as a float tensor, for the
           caller to sanitise
         """
         if self._has_prob_to_mask:
             try:
                 return self.core.output_prob_to_mask(out_prob, matting=True)
             except Exception:
                 # Deliberate best-effort: fall back to the manual path below.
                 pass
         prob = torch.as_tensor(out_prob).float()
         rank = prob.ndim
         # [B,C,H,W] with at least one sample and one channel.
         if rank == 4 and prob.shape[0] > 0 and prob.shape[1] > 0:
             return prob[0, 0]
         # [C,H,W]
         if rank == 3:
             if prob.shape[0] >= 1:
                 return prob[0]
             return prob.mean(0)
         # Already 2-D, or degenerate.
         return prob
     # ---- main call ----
     def __call__(self, image, mask=None, **kwargs) -> np.ndarray:
         # dtype alignment for activations
         img_bchw = img_bchw.to(self.model_dtype, non_blocking=True)
+        # build a deeper downscale ladder to survive tight VRAM
         nh, nw, s = self._compute_scaled_size(H, W)
         scales = [(nh, nw)]
+        # add progressive reductions until fairly small, but not tiny
         if s < 1.0:
+            f = 0.85
+            cur_h, cur_w = nh, nw
+            for _ in range(6):  # up to 8 attempts total
+                cur_h = max(128, int(cur_h * f))
+                cur_w = max(128, int(cur_w * f))
+                if (cur_h, cur_w) != scales[-1]:
+                    scales.append((cur_h, cur_w))
+                if max(cur_h, cur_w) <= 192 or (cur_h * cur_w) <= 150_000:
+                    break
         last_exc = None
                 img_in = _resize_bchw(img_bchw, (th, tw), is_mask=False)
                 msk_in = _resize_bchw(msk_b1hw, (th, tw), is_mask=True) if msk_b1hw is not None else None
                 img_chw = _to_chw_image(img_in).contiguous()                  # [C,H,W]
                 m_1hw  = _to_1hw_mask(msk_in) if msk_in is not None else None # [1,H,W] or None
+                mask_2d = m_1hw[0].contiguous() if m_1hw is not None else None# [H,W] or None
                 # inference with autocast + inference_mode
                 with torch.inference_mode():
                             out_prob = self.core.step(image=img_chw)
                             alpha = self._to_alpha(out_prob)
+                # ---- SAFE UPSAMPLE PATH (always 4D -> 2D) ----
                 if (th, tw) != (H, W):
+                    a_b1hw = _to_b1hw_alpha(alpha, device=img_chw.device)             # [1,1,th,tw]
+                    a_b1hw = F.interpolate(a_b1hw, size=(H, W), mode="bilinear", align_corners=False)  # [1,1,H,W]
+                    alpha  = a_b1hw[0, 0]                                             # -> [H,W]
+                # ------------------------------------------------
                 return _to_2d_alpha_numpy(alpha)