Spaces:

MogensR
/

VideoBackgroundReplacer

Paused

App Files Files Community

MogensR committed on Aug 29, 2025

Commit

6aec771

1 Parent(s): 6d44f52

Update models/loaders/matanyone_loader.py

Browse files

Files changed (1) hide show

models/loaders/matanyone_loader.py +170 -24

models/loaders/matanyone_loader.py CHANGED Viewed

@@ -1,26 +1,5 @@
-from matanyone_loader import MatAnyoneLoader
-import cv2, numpy as np, torch
-# Load session (stateful per video)
-loader = MatAnyoneLoader(device="cuda")
-session = loader.load()
-assert session, "MatAnyone failed to load"
-# Frame 0 (must supply a coarse mask, even a fallback like 0.5 or ones)
-bgr0 = cv2.imread("frame0001.jpg")
-rgb0 = cv2.cvtColor(bgr0, cv2.COLOR_BGR2RGB)
-coarse0 = np.ones((rgb0.shape[0], rgb0.shape[1]), dtype=np.float32)  # example fallback
-alpha0 = session(rgb0, coarse0)   # -> 2-D float32 [H,W]
-# Frames 1..N (mask=None, stateful propagation)
-for i in range(2, 6):
-    bgr = cv2.imread(f"frame000{i}.jpg")
-    rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
-    alpha = session(rgb, mask=None)  # -> 2-D float32 [H,W]
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
 MatAnyone Loader + Stateful Adapter (OOM-resilient, spatially robust)
 - Canonical HF load (MatAnyone.from_pretrained → InferenceCore(model, cfg))
@@ -44,6 +23,13 @@
 import torch.nn.functional as F
 import inspect
 import threading
 def _select_device(pref: str) -> str:
     pref = (pref or "").lower()
     if pref.startswith("cuda"):
@@ -52,6 +38,133 @@ def _select_device(pref: str) -> str:
         return "cpu"
     return "cuda" if torch.cuda.is_available() else "cpu"
 def _choose_precision(device: str) -> Tuple[torch.dtype, bool, Optional[torch.dtype]]:
     """Pick model weight dtype + autocast dtype (bf16>fp16>fp32)."""
@@ -65,6 +178,11 @@ def _choose_precision(device: str) -> Tuple[torch.dtype, bool, Optional[torch.dt
     if fp16_ok:
         return torch.float16, True, torch.float16
     return torch.float32, False, None
 class _MatAnyoneSession:
     """
     Stateful controller around InferenceCore with OOM-resilient inference.
@@ -163,6 +281,7 @@ def __call__(self, image, mask=None, **kwargs) -> np.ndarray:
                         if (th, tw) == (H, W):
                             msk_in = mask_1hw
                         else:
                             msk_in = F.interpolate(mask_1hw.unsqueeze(0), size=(th, tw), mode="nearest")[0]
                     img_chw = _to_chw_image(img_in).contiguous()  # [C,H,W]
@@ -218,6 +337,11 @@ def __exit__(self, *a): return False
             if mask_1hw is not None:
                 return _to_2d_alpha_numpy(mask_1hw)
             return np.full((H, W), 0.5, dtype=np.float32)
 class MatAnyoneLoader:
     """
     Official MatAnyone loader with stateful, OOM-resilient session adapter.
@@ -268,7 +392,7 @@ def load(self) -> Optional[Any]:
             model_dtype, use_autocast, autocast_dtype = _choose_precision(self.device)
             logger.info(f"MatAnyone precision: weights={model_dtype}, autocast={use_autocast and autocast_dtype}")
-            # HF weights (safetensors); keep trust defaults of library itself
             self.model = model_cls.from_pretrained(self.model_id)
             try:
                 self.model = self.model.to(self.device).to(model_dtype)
@@ -341,9 +465,31 @@ def debug_shapes(self, image, mask, tag: str = ""):
                 logger.info(f"[{tag}:mask ] shape={tuple(tv_msk.shape)} dtype={tv_msk.dtype}")
         except Exception as e:
             logger.info(f"[{tag}] debug error: {e}")
 if __name__ == "__main__":
     import sys
-    import cv2
     logging.basicConfig(level=logging.INFO)
     device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -373,6 +519,6 @@ def debug_shapes(self, image, mask, tag: str = ""):
         print("Failed to load MatAnyone")
         raise SystemExit(3)
-    alpha = session(img_rgb, mask)
     cv2.imwrite("alpha_out.png", (np.clip(alpha, 0, 1) * 255).astype(np.uint8))
     print("Alpha matte written to alpha_out.png")

 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
 MatAnyone Loader + Stateful Adapter (OOM-resilient, spatially robust)
 - Canonical HF load (MatAnyone.from_pretrained → InferenceCore(model, cfg))
 import torch.nn.functional as F
 import inspect
 import threading
+logger = logging.getLogger(__name__)
+# ---------------------------------------------------------------------------
+# Utilities (shapes, dtype, scaling)
+# ---------------------------------------------------------------------------
 def _select_device(pref: str) -> str:
     pref = (pref or "").lower()
     if pref.startswith("cuda"):
         return "cpu"
     return "cuda" if torch.cuda.is_available() else "cpu"
def _as_tensor_on_device(x, device: str) -> torch.Tensor:
    """
    Return ``x`` as a ``torch.Tensor`` located on ``device``.

    Tensors are moved with ``Tensor.to``; every other input is converted with
    ``np.asarray`` followed by ``torch.from_numpy``.

    Args:
        x: Tensor, NumPy array, or array-like (nested lists, scalars).
        device: Target device string passed to ``Tensor.to`` (e.g. ``"cpu"``).

    Returns:
        The tensor on ``device``. No copy is forced: when the data already
        lives on the target device the result can alias the caller's tensor
        or NumPy buffer, so callers must not modify it in place.
    """
    if isinstance(x, torch.Tensor):
        return x.to(device, non_blocking=True)
    arr = np.asarray(x)
    # torch.from_numpy raises ValueError on negative strides, which is exactly
    # what reversed views such as ``bgr[..., ::-1]`` produce. Copy only then,
    # so ordinary arrays keep the zero-copy path.
    if any(step < 0 for step in arr.strides):
        arr = np.ascontiguousarray(arr)
    return torch.from_numpy(arr).to(device, non_blocking=True)
def _to_bchw(x, device: str, is_mask: bool = False) -> torch.Tensor:
    """
    Normalize input to BCHW (image) or B1HW (mask).
    Accepts: HWC, CHW, BCHW, BHWC, BTCHW/BTHWC, TCHW/THWC, HW.

    Processing steps:
      * ``uint8`` data is converted to float and divided by 255; bool and
        signed-integer data is converted to float without rescaling.
      * 5-D input keeps only index 0 along dim 1 (``x[:, 0]``).
      * 4-D / 3-D channels-last input (last dim in {1, 3, 4}) is permuted to
        channels-first; 3-D and 2-D inputs gain the missing leading axes.
      * Masks keep only their first channel and are returned as float32.
      * Single-channel images are repeated to 3 channels.
      * The result is clipped to [0, 1]. Float inputs are not rescaled, so
        values outside that range are simply clipped.

    The caller's tensor / array is never modified: clipping is done
    out-of-place because the converted tensor may alias the input buffer.

    Args:
        x: Image or mask as tensor, NumPy array, or array-like.
        device: Target device string.
        is_mask: Treat ``x`` as a mask (B1HW output) instead of an image.

    Returns:
        4-D tensor on ``device``.

    Raises:
        ValueError: If ``x`` has fewer than 2 or more than 5 dimensions.
    """
    x = _as_tensor_on_device(x, device)
    if x.dtype == torch.uint8:
        x = x.float().div_(255.0)  # .float() made a fresh copy, in-place is safe
    elif x.dtype in (torch.bool, torch.int8, torch.int16, torch.int32, torch.int64):
        # clamp with float bounds is not usable on bool / integer tensors
        x = x.float()

    if x.ndim == 5:
        x = x[:, 0]  # -> 4D

    if x.ndim == 4:
        if x.shape[-1] in (1, 3, 4) and x.shape[1] not in (1, 3, 4):
            x = x.permute(0, 3, 1, 2).contiguous()
    elif x.ndim == 3:
        if x.shape[-1] in (1, 3, 4):
            x = x.permute(2, 0, 1).contiguous()
        x = x.unsqueeze(0)
    elif x.ndim == 2:
        # -> [1,1,H,W]; images are widened to 3 channels further down
        x = x.unsqueeze(0).unsqueeze(0)
    else:
        raise ValueError(f"Unsupported ndim={x.ndim}")

    if is_mask:
        if x.shape[1] > 1:
            x = x[:, :1]
        # out-of-place clamp: x may still share memory with the caller's data
        x = x.to(torch.float32).clamp(0.0, 1.0)
    else:
        if x.shape[1] == 1:
            x = x.repeat(1, 3, 1, 1)
        x = x.clamp(0.0, 1.0)
    return x
def _to_chw_image(img_bchw: torch.Tensor) -> torch.Tensor:
    """
    Drop a singleton batch axis from an image tensor.

    A 4-D tensor whose first dimension has size 1 is returned as its single
    element (3-D); any other tensor is handed back unchanged.
    """
    has_unit_batch = img_bchw.ndim == 4 and img_bchw.shape[0] == 1
    return img_bchw[0] if has_unit_batch else img_bchw
def _to_1hw_mask(msk_b1hw: torch.Tensor) -> Optional[torch.Tensor]:
    """
    Reduce a mask tensor to shape [1,H,W].

    ``None`` is passed through. A 3-D tensor with a leading dimension of 1 is
    returned as is; a 4-D tensor whose channel dimension is 1 yields its first
    batch element. Every other shape is rejected.

    Raises:
        ValueError: If the mask is neither B1HW nor 1HW.
    """
    if msk_b1hw is None:
        return None
    rank = msk_b1hw.ndim
    if rank == 3 and msk_b1hw.shape[0] == 1:
        return msk_b1hw
    if rank == 4 and msk_b1hw.shape[1] == 1:
        return msk_b1hw[0]  # -> [1,H,W]
    raise ValueError(f"Expected B1HW or 1HW, got {tuple(msk_b1hw.shape)}")
def _resize_bchw(x: Optional[torch.Tensor], size_hw: Tuple[int, int], is_mask: bool = False) -> Optional[torch.Tensor]:
    """
    Resize the trailing two (H, W) dimensions of ``x`` to ``size_hw``.

    Args:
        x: Tensor to resize, or ``None`` (returned unchanged).
        size_hw: Target ``(height, width)``; any sequence of ints is accepted.
        is_mask: Use nearest-neighbour sampling instead of bilinear.

    Returns:
        ``x`` itself when it already has the requested size, otherwise the
        interpolated tensor; ``None`` when ``x`` is ``None``.
    """
    if x is None:
        return None
    # Normalize to a tuple of ints: a list never compares equal to the
    # torch.Size tuple, which would silently skip the fast path below.
    target = tuple(int(v) for v in size_hw)
    if tuple(x.shape[-2:]) == target:
        return x
    if is_mask:
        return F.interpolate(x, size=target, mode="nearest")
    return F.interpolate(x, size=target, mode="bilinear", align_corners=False)
def _to_b1hw_alpha(alpha, device: str) -> torch.Tensor:
    """
    Convert an alpha matte of arbitrary rank to a float [1,1,H,W] tensor.

    Layout handling:
      * 3-D: treated as channels-last (first channel of the last axis kept)
        when only the last dimension is in {1, 3, 4}; otherwise the first
        slice along dim 0 is kept.
      * Fewer than 3 dims: leading singleton axes are added.
      * 4-D and above: extra leading axes are removed by taking index 0, then
        only the first batch entry and first channel are kept.

    The result is clipped to [0, 1] out-of-place, so the caller's ``alpha``
    is left untouched even when no copy was made during conversion.

    Args:
        alpha: Tensor, NumPy array, or array-like alpha values.
        device: Target device string.

    Returns:
        Contiguous float32 tensor of shape [1,1,H,W].
    """
    t = torch.as_tensor(alpha, device=device).float()

    if t.ndim == 3:
        channels_last = t.shape[0] not in (1, 3, 4) and t.shape[-1] in (1, 3, 4)
        if channels_last:
            t = t[..., :1].permute(2, 0, 1)  # HWC -> 1HW
        else:
            t = t[:1]                        # CHW (or unknown) -> 1HW
        t = t.unsqueeze(0)                   # -> [1,1,H,W]
    else:
        # Index instead of squeeze(0): squeeze is a no-op on a non-singleton
        # axis and would never reduce the rank.
        while t.ndim > 4:
            t = t[0]
        while t.ndim < 4:
            t = t.unsqueeze(0)
        t = t[:1, :1]

    return t.clamp(0.0, 1.0).contiguous()
def _to_2d_alpha_numpy(x) -> np.ndarray:
    """
    Convert an alpha matte to a C-contiguous float32 NumPy array clipped to [0, 1].

    Inputs with more than two dimensions are reduced by repeatedly taking
    index 0 of the leading axis, so ``[1,1,H,W]`` and ``[1,H,W]`` both yield
    ``[H,W]``; for a non-singleton leading axis the first slice is used.
    Inputs with fewer than two dimensions are returned with their rank
    unchanged.

    The input is never modified: clipping is out-of-place and the returned
    array is a fresh copy.

    Args:
        x: Tensor, NumPy array, or array-like alpha values.

    Returns:
        float32 ``np.ndarray`` with values in [0, 1].
    """
    t = torch.as_tensor(x).detach().float()
    # Index instead of squeeze(0): squeeze leaves a non-singleton axis in
    # place, so the rank would never drop for shapes like [3,H,W].
    while t.ndim > 2:
        t = t[0]
    clipped = t.clamp(0.0, 1.0)
    return np.ascontiguousarray(clipped.cpu().numpy().astype(np.float32))
def _compute_scaled_size(h: int, w: int, max_edge: int, target_pixels: int) -> Tuple[int, int, float]:
    """
    Compute a downscaled (height, width) honouring an edge and a pixel budget.

    Two scale factors, each capped at 1.0, are derived: one so that the longer
    side does not exceed ``max_edge``, one so that the area does not exceed
    ``target_pixels``. A limit that is not positive is ignored. The smaller
    factor is applied to both sides, which are rounded and kept at least 1.

    Returns:
        ``(new_h, new_w, scale)``; non-positive ``h`` or ``w`` is returned
        unchanged with a scale of 1.0.
    """
    if h <= 0 or w <= 0:
        return h, w, 1.0

    edge_scale = 1.0
    if max_edge > 0:
        edge_scale = min(1.0, float(max_edge) / float(max(h, w)))

    area_scale = 1.0
    if target_pixels > 0:
        area_scale = min(1.0, (float(target_pixels) / float(h * w)) ** 0.5)

    scale = min(edge_scale, area_scale)
    new_h = max(1, int(round(h * scale)))
    new_w = max(1, int(round(w * scale)))
    return new_h, new_w, scale
def debug_shapes(tag: str, image, mask) -> None:
    """
    Log shape, dtype and value range of ``image`` and ``mask`` at INFO level.

    Each value is converted with ``torch.as_tensor``; min/max are reported as
    NaN for empty tensors. Any failure while inspecting a value is caught and
    logged together with the value's type, so this helper never raises.

    Args:
        tag: Prefix placed in every log line.
        image: Value logged under the name ``image``.
        mask: Value logged under the name ``mask``.
    """
    def _log_stats(name, v):
        try:
            tv = torch.as_tensor(v)
            if tv.numel():
                mn = float(tv.min())
                mx = float(tv.max())
            else:
                mn = mx = float("nan")
            logger.info(f"[{tag}:{name}] shape={tuple(tv.shape)} dtype={tv.dtype} min={mn:.4f} max={mx:.4f}")
        except Exception as e:
            logger.info(f"[{tag}:{name}] type={type(v)} err={e}")

    for label, value in (("image", image), ("mask", mask)):
        _log_stats(label, value)
+# ---------------------------------------------------------------------------
+# Precision selection
+# ---------------------------------------------------------------------------
 def _choose_precision(device: str) -> Tuple[torch.dtype, bool, Optional[torch.dtype]]:
     """Pick model weight dtype + autocast dtype (bf16>fp16>fp32)."""
     if fp16_ok:
         return torch.float16, True, torch.float16
     return torch.float32, False, None
+# ---------------------------------------------------------------------------
+# Stateful Adapter around InferenceCore
+# ---------------------------------------------------------------------------
 class _MatAnyoneSession:
     """
     Stateful controller around InferenceCore with OOM-resilient inference.
                         if (th, tw) == (H, W):
                             msk_in = mask_1hw
                         else:
+                            # nearest to keep binary-like edges
                             msk_in = F.interpolate(mask_1hw.unsqueeze(0), size=(th, tw), mode="nearest")[0]
                     img_chw = _to_chw_image(img_in).contiguous()  # [C,H,W]
             if mask_1hw is not None:
                 return _to_2d_alpha_numpy(mask_1hw)
             return np.full((H, W), 0.5, dtype=np.float32)
+# ---------------------------------------------------------------------------
+# Loader
+# ---------------------------------------------------------------------------
 class MatAnyoneLoader:
     """
     Official MatAnyone loader with stateful, OOM-resilient session adapter.
             model_dtype, use_autocast, autocast_dtype = _choose_precision(self.device)
             logger.info(f"MatAnyone precision: weights={model_dtype}, autocast={use_autocast and autocast_dtype}")
+            # HF weights (safetensors)
             self.model = model_cls.from_pretrained(self.model_id)
             try:
                 self.model = self.model.to(self.device).to(model_dtype)
                 logger.info(f"[{tag}:mask ] shape={tuple(tv_msk.shape)} dtype={tv_msk.dtype}")
         except Exception as e:
             logger.info(f"[{tag}] debug error: {e}")
+# ---------------------------------------------------------------------------
+# Public symbols
+# ---------------------------------------------------------------------------
# Names bound by ``from <module> import *``. The underscore-prefixed helpers
# are listed explicitly because a star import would otherwise skip them.
__all__ = [
    # loader and its session adapter
    "MatAnyoneLoader", "_MatAnyoneSession",
    # tensor layout helpers
    "_to_bchw", "_resize_bchw", "_to_chw_image", "_to_1hw_mask",
    # alpha conversion helpers
    "_to_b1hw_alpha", "_to_2d_alpha_numpy",
    # sizing and diagnostics
    "_compute_scaled_size", "debug_shapes",
]
+# ---------------------------------------------------------------------------
+# Optional CLI for quick testing (no circular imports)
+# ---------------------------------------------------------------------------
 if __name__ == "__main__":
     import sys
+    import cv2  # only needed for this demo CLI
     logging.basicConfig(level=logging.INFO)
     device = "cuda" if torch.cuda.is_available() else "cpu"
         print("Failed to load MatAnyone")
         raise SystemExit(3)
+    alpha = session(img_rgb, mask if mask is not None else np.ones(img_rgb.shape[:2], np.float32))
     cv2.imwrite("alpha_out.png", (np.clip(alpha, 0, 1) * 255).astype(np.uint8))
     print("Alpha matte written to alpha_out.png")