Spaces:

MogensR
/

VideoBackgroundReplacer

Paused

App Files Community

MogensR committed on Aug 30, 2025

Commit

2c9ad3e

1 Parent(s): 05c8f4b

Update models/loaders/matanyone_loader.py

Browse files

Files changed (1) hide show

models/loaders/matanyone_loader.py +246 -671

models/loaders/matanyone_loader.py CHANGED Viewed

@@ -1,711 +1,286 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
-MatAnyone Loader + Stateful Adapter (Fixed Tensor Shapes, OOM-resilient)
-=======================================================================
-CHAPTERS
-1) Overview & Rationale
-2) Imports & Logger
-3) EasyDict Polyfill
-4) Tensor Utilities (device, shape, resize, padding)
-5) Precision Selection (fp16/bf16/fp32)
-6) Stateful Session (_MatAnyoneSession)  ← FIX: CHW / 1HW only (no temporal axis)
-7) Loader (MatAnyoneLoader)
-8) Public Symbols
-9) CLI Demo (optional quick test)
-Key Fix vs. previous version
-----------------------------
-- Removed the extra “temporal” axis that produced 5D tensors like [1,1,3,H,W].
-- MatAnyone now receives:
-  • Image: CHW (float, in [0,1]) — or internally BCHW collapsed to CHW.
-  • Mask : 1HW (float, in [0,1]) on the first frame only; later frames mask=None.
-- Kept: downscale ladder, padding to multiple of 16, mixed precision, long-term memory config.
 """
-# ============================================================================
-# 2) IMPORTS & LOGGER
-# ============================================================================
-from __future__ import annotations
 import os
 import time
 import logging
 import traceback
-from typing import Optional, Dict, Any, Tuple, List
 import numpy as np
 import torch
-import torch.nn.functional as F
-import inspect
-import threading
-import contextlib
 logger = logging.getLogger(__name__)
-# ============================================================================
-# 3) EASYDICT POLYFILL
-# ============================================================================
-class EasyDict(dict):
-    """Recursive dict with dot access."""
-    def __init__(self, d=None, **kwargs):
-        if d is None:
-            d = {}
-        if kwargs:
-            d.update(**kwargs)
-        for k, v in d.items():
-            if isinstance(v, dict):
-                self[k] = EasyDict(v)
-            elif isinstance(v, list):
-                self[k] = [EasyDict(i) if isinstance(i, dict) else i for i in v]
-            else:
-                self[k] = v
-    def __getattr__(self, name):  # dot-get
-        try:
-            return self[name]
-        except KeyError:
-            raise AttributeError(name)
-    def __setattr__(self, name, value):  # dot-set
-        self[name] = value
-    def __delattr__(self, name):  # dot-del
-        del self[name]
-# ============================================================================
-# 4) TENSOR UTILITIES (DEVICE, SHAPE, RESIZE, PADDING)
-# ============================================================================
-def _select_device(pref: str) -> str:
-    pref = (pref or "").lower()
-    if pref.startswith("cuda"):
-        return "cuda" if torch.cuda.is_available() else "cpu"
-    if pref == "cpu":
-        return "cpu"
-    return "cuda" if torch.cuda.is_available() else "cpu"
-def _as_tensor_on_device(x, device: str) -> torch.Tensor:
-    if isinstance(x, torch.Tensor):
-        return x.to(device, non_blocking=True)
-    return torch.from_numpy(np.asarray(x)).to(device, non_blocking=True)
-def _to_bchw(x, device: str, is_mask: bool = False) -> torch.Tensor:
-    """
-    Normalize input to BCHW (image) or B1HW (mask).
-    Accepts: HWC, CHW, BCHW, BHWC, (accidental) 5D, and HW.
-    Defensive against dtype/range; output is clamped to [0,1].
-    """
-    x = _as_tensor_on_device(x, device)
-    if x.dtype == torch.uint8:
-        x = x.float().div_(255.0)
-    elif x.dtype in (torch.int16, torch.int32, torch.int64):
-        x = x.float()
-    # If upstream passed a 5D tensor (e.g., (B,1,C,H,W) or (B,T,C,H,W)), squeeze a singleton middle axis.
-    if x.ndim == 5:
-        # Prefer to squeeze the 2nd dim if it's 1; otherwise take the first slice.
-        if x.shape[1] == 1:
-            x = x.squeeze(1)  # -> BCHW
-        else:
-            x = x[:, 0, ...]  # -> BCHW
-    if x.ndim == 4:
-        # Handle BHWC → BCHW
-        if x.shape[-1] in (1, 3, 4) and x.shape[1] not in (1, 3, 4):
-            x = x.permute(0, 3, 1, 2).contiguous()
-    elif x.ndim == 3:
-        # HWC → CHW
-        if x.shape[-1] in (1, 3, 4):
-            x = x.permute(2, 0, 1).contiguous()
-        # CHW → BCHW
-        x = x.unsqueeze(0)
-    elif x.ndim == 2:
-        # HW → B1HW (mask) or B3HW (image)
-        x = x.unsqueeze(0).unsqueeze(0)
-        if not is_mask:
-            x = x.repeat(1, 3, 1, 1)
-    else:
-        raise ValueError(f"_to_bchw: unsupported ndim={x.ndim}")
-    if is_mask:
-        # Ensure single-channel B1HW, clamped and float32
-        if x.shape[1] > 1:
-            x = x[:, :1]
-        x = x.clamp_(0.0, 1.0).to(torch.float32)
-    else:
-        # Ensure RGB
-        if x.shape[1] == 4:
-            x = x[:, :3, ...]
-        elif x.shape[1] == 1:
-            x = x.repeat(1, 3, 1, 1)
-        x = x.clamp_(0.0, 1.0)
-    return x.contiguous()
-def _to_chw_image(img_bchw: torch.Tensor) -> torch.Tensor:
-    """BCHW → CHW (take batch 0 if present)."""
-    if img_bchw.ndim == 4 and img_bchw.shape[0] == 1:
-        return img_bchw[0]
-    if img_bchw.ndim == 3:
-        return img_bchw
-    raise ValueError(f"_to_chw_image: expected BCHW or CHW, got {tuple(img_bchw.shape)}")
-def _to_1hw_mask(msk_b1hw: torch.Tensor) -> torch.Tensor:
-    """B1HW → 1HW (drop batch)."""
-    if msk_b1hw is None:
-        raise ValueError("_to_1hw_mask: mask is None")
-    if msk_b1hw.ndim == 4 and msk_b1hw.shape[1] == 1:
-        return msk_b1hw[0]  # 1HW
-    if msk_b1hw.ndim == 3 and msk_b1hw.shape[0] == 1:
-        return msk_b1hw
-    raise ValueError(f"_to_1hw_mask: expected B1HW or 1HW, got {tuple(msk_b1hw.shape)}")
-def _resize_bchw(x: Optional[torch.Tensor], size_hw: Tuple[int, int], is_mask: bool = False) -> Optional[torch.Tensor]:
-    """Resize BCHW or B1HW to (H, W) using bilinear (image) or nearest (mask)."""
-    if x is None:
-        return None
-    if x.shape[-2:] == size_hw:
-        return x
-    mode = "nearest" if is_mask else "bilinear"
-    return F.interpolate(x, size_hw, mode=mode, align_corners=False if mode == "bilinear" else None)
-def _to_b1hw_alpha(alpha, device: str) -> torch.Tensor:
-    """Convert arbitrary mask-like input to B1HW float32 [0,1]."""
-    t = torch.as_tensor(alpha, device=device).float()
-    # Squeeze extra dims down to HW/1HW first
-    while t.ndim > 4:
-        t = t.squeeze(0)
-    if t.ndim == 4:
-        # Expecting BxCxHxW; force B=1, C=1
-        if t.shape[0] != 1:
-            t = t[:1]
-        if t.shape[1] != 1:
-            t = t[:, :1]
-    elif t.ndim == 3:
-        # Could be CxHxW or HxWx1
-        if t.shape[0] == 1:
-            t = t.unsqueeze(0)  # 1x1xHxW
-        elif t.shape[-1] == 1:
-            t = t.permute(2, 0, 1).unsqueeze(0)  # 1x1xHxW
-        else:
-            # If C>1, take first channel
-            t = t[:1, ...].unsqueeze(0)
-    elif t.ndim == 2:
-        t = t.unsqueeze(0).unsqueeze(0)
-    else:
-        raise ValueError(f"_to_b1hw_alpha: unsupported ndim={t.ndim}")
-    t = t.clamp_(0.0, 1.0).contiguous()
-    return t
-def _to_2d_alpha_numpy(x) -> np.ndarray:
-    """Convert any mask-like tensor to 2D float32 numpy [H,W] in [0,1]."""
-    t = torch.as_tensor(x).float()
-    # Squeeze down to 2D
-    while t.ndim > 2:
-        if t.ndim == 4 and t.shape[0] == 1 and t.shape[1] == 1:
-            t = t[0, 0]
-        elif t.ndim == 3 and t.shape[0] == 1:
-            t = t[0]
-        else:
-            t = t.squeeze(0)
-    t = t.clamp_(0.0, 1.0)
-    out = t.detach().cpu().numpy().astype(np.float32)
-    return np.ascontiguousarray(out)
-def _compute_scaled_size(h: int, w: int, max_edge: int, target_pixels: int) -> Tuple[int, int, float]:
-    """Compute a safe scaled size that respects a max edge and total pixels."""
-    if h <= 0 or w <= 0:
-        return h, w, 1.0
-    s1 = min(1.0, float(max_edge) / float(max(h, w))) if max_edge > 0 else 1.0
-    s2 = min(1.0, (float(target_pixels) / float(h * w)) ** 0.5) if target_pixels > 0 else 1.0
-    s = min(s1, s2)
-    nh = max(128, int(round(h * s)))  # minimum of 128 to avoid very small feature maps
-    nw = max(128, int(round(w * s)))
-    return nh, nw, s
-def _pad_to_multiple_3d(t: torch.Tensor, multiple: int = 16) -> torch.Tensor:
-    """
-    Pad a 3D tensor (C,H,W) to multiples of `multiple`. Works for CHW and 1HW.
-    Returns a tensor with same ndim.
-    """
-    if t.ndim != 3:
-        raise ValueError(f"_pad_to_multiple_3d: expected 3D, got {t.ndim}D")
-    c, h, w = t.shape
-    pad_h = (multiple - h % multiple) % multiple
-    pad_w = (multiple - w % multiple) % multiple
-    if pad_h or pad_w:
-        t = F.pad(t, (0, pad_w, 0, pad_h))  # (left,right,top,bottom)
-    return t
-def debug_shapes(tag: str, image, mask) -> None:
-    """Log shapes/dtypes/min/max for quick inspection."""
-    def _info(name, v):
-        try:
-            tv = torch.as_tensor(v)
-            mn = float(tv.min()) if tv.numel() else float("nan")
-            mx = float(tv.max()) if tv.numel() else float("nan")
-            logger.info(f"[{tag}:{name}] shape={tuple(tv.shape)} dtype={tv.dtype} min={mn:.4f} max={mx:.4f}")
-        except Exception as e:
-            logger.info(f"[{tag}:{name}] type={type(v)} err={e}")
-    _info("image", image)
-    _info("mask", mask)
-# ============================================================================
-# 5) PRECISION SELECTION (fp16/bf16/fp32)
-# ============================================================================
-def _choose_precision(device: str) -> Tuple[torch.dtype, bool, Optional[torch.dtype]]:
-    """
-    Pick model weights dtype and autocast dtype (fp16>bf16>fp32), preferring fp16 for T4.
-    Returns: (model_dtype, use_autocast, autocast_dtype)
-    """
-    if device != "cuda":
-        return torch.float32, False, None
-    cc = torch.cuda.get_device_capability() if torch.cuda.is_available() else (0, 0)
-    fp16_ok = cc[0] >= 7  # Volta+
-    bf16_ok = (cc[0] >= 8) and hasattr(torch.cuda, "is_bf16_supported") and torch.cuda.is_bf16_supported()
-    if fp16_ok:
-        return torch.float16, True, torch.float16  # T4 prefers fp16
-    if bf16_ok:
-        return torch.bfloat16, True, torch.bfloat16
-    return torch.float32, False, None
-# ============================================================================
-# 6) STATEFUL SESSION (NO TEMPORAL AXIS; STRICT CHW/1HW)
-# ============================================================================
-class _MatAnyoneSession:
-    """
-    Stateful controller around InferenceCore with OOM-resilient inference.
-    First call MUST supply a coarse mask (we enforce 1HW internally).
-    Subsequent calls should pass mask=None (temporal propagation handled by core).
-    """
-    def __init__(
-        self,
-        core,
-        device: str,
-        model_dtype: torch.dtype,
-        use_autocast: bool,
-        autocast_dtype: Optional[torch.dtype],
-        max_edge: int = 768,
-        target_pixels: int = 600_000,  # ~775x775 by area
-    ):
-        self.core = core
-        self.device = device
-        self.model_dtype = model_dtype
-        self.use_autocast = use_autocast and (device == "cuda")
-        self.autocast_dtype = autocast_dtype if self.use_autocast else None
-        self.max_edge = int(max_edge)
-        self.target_pixels = int(target_pixels)
-        self.started = False
-        self._lock = threading.Lock()
-        # Introspect optional API surfaces
-        try:
-            sig = inspect.signature(self.core.step)
-            self._has_first_frame_pred = "first_frame_pred" in sig.parameters
-        except Exception:
-            self._has_first_frame_pred = True
-        self._has_prob_to_mask = hasattr(self.core, "output_prob_to_mask")
-    def reset(self):
-        with self._lock:
-            try:
-                if hasattr(self.core, "clear_memory"):
-                    self.core.clear_memory()
-            except Exception:
-                pass
-            self.started = False
-    def _scaled_ladder(self, H: int, W: int) -> List[Tuple[int, int]]:
-        """
-        Build a list of decreasing (H,W) resolutions to attempt to avoid OOM.
-        """
-        nh, nw, s = _compute_scaled_size(H, W, self.max_edge, self.target_pixels)
-        sizes = [(nh, nw)]
-        if s < 1.0:
-            f_chain = (0.85, 0.70, 0.55, 0.40)
-            cur_h, cur_w = nh, nw
-            for f in f_chain:
-                cur_h = max(128, int(cur_h * f))
-                cur_w = max(128, int(cur_w * f))
-                if sizes[-1] != (cur_h, cur_w):
-                    sizes.append((cur_h, cur_w))
-        return sizes
-    def _to_alpha(self, out_prob):
-        """Convert model output probabilities to a matte."""
-        if self._has_prob_to_mask:
-            try:
-                return self.core.output_prob_to_mask(out_prob, matting=True)
-            except Exception:
-                pass
-        t = torch.as_tensor(out_prob).float()
-        if t.ndim == 4:   # BxCxHxW
-            return t[0, 0] if t.shape[1] >= 1 else t[0].mean(0)
-        if t.ndim == 3:   # CxHxW
-            return t[0] if t.shape[0] >= 1 else t.mean(0)
-        return t
-    def __call__(self, image, mask=None, **kwargs) -> np.ndarray:
-        """
-        Returns a 2-D float32 alpha [H,W].
-        - frame 0: provide coarse mask → session initialized
-        - frames 1..N: pass mask=None (propagation)
-        """
-        with self._lock:
-            # ---- 1) Normalize inputs to BCHW (image) and B1HW (mask), then collapse to CHW / 1HW
-            img_bchw = _to_bchw(image, self.device, is_mask=False)  # BCHW
-            H, W = img_bchw.shape[-2], img_bchw.shape[-1]
-            img_bchw = img_bchw.to(self.model_dtype, non_blocking=True)
-            msk_b1hw = _to_bchw(mask, self.device, is_mask=True) if mask is not None else None
-            if msk_b1hw is not None and msk_b1hw.shape[-2:] != (H, W):
-                msk_b1hw = _resize_bchw(msk_b1hw, (H, W), is_mask=True)
-            img_chw = _to_chw_image(img_bchw)                      # CHW
-            mask_1hw = _to_1hw_mask(msk_b1hw) if msk_b1hw is not None else None  # 1HW or None
-            # ---- 2) Downscale ladder to avoid OOM
-            sizes = self._scaled_ladder(H, W)
-            last_exc = None
-            for (th, tw) in sizes:
-                try:
-                    # 2a) Resize image (bilinear) and mask (nearest) to ladder size
-                    if (th, tw) == (H, W):
-                        img_in = img_chw
-                        msk_in = mask_1hw
-                    else:
-                        img_in = F.interpolate(img_chw.unsqueeze(0), size=(th, tw),
-                                               mode="bilinear", align_corners=False)[0]  # CHW
-                        msk_in = None
-                        if mask_1hw is not None:
-                            msk_in = F.interpolate(mask_1hw.unsqueeze(0), size=(th, tw),
-                                                   mode="nearest")[0]  # 1HW
-                    # 2b) Pad to multiple of 16 (per-model stability)
-                    img_in = _pad_to_multiple_3d(img_in)  # CHW
-                    if msk_in is not None:
-                        msk_in = _pad_to_multiple_3d(msk_in)  # 1HW
-                    # ---- 3) Forward pass (STRICT CHW / 1HW; NO TEMPORAL AXIS)
-                    with torch.inference_mode():
-                        amp_ctx = (
-                            torch.autocast(device_type="cuda", dtype=self.autocast_dtype)
-                            if self.use_autocast else
-                            contextlib.nullcontext()
-                        )
-                        with amp_ctx:
-                            if not self.started:
-                                if msk_in is None:
-                                    logger.warning("First frame arrived without a mask; returning neutral alpha.")
-                                    return np.full((H, W), 0.5, dtype=np.float32)
-                                # Initialize with first frame (explicit mask)
-                                _ = self.core.step(image=img_in, mask=msk_in)   # ← CHW + 1HW
-                                if self._has_first_frame_pred:
-                                    out_prob = self.core.step(image=img_in, first_frame_pred=True)
-                                else:
-                                    out_prob = self.core.step(image=img_in)
-                                self.started = True
-                            else:
-                                # Subsequent frames; core uses memory internally
-                                out_prob = self.core.step(image=img_in)         # ← CHW
-                    # ---- 4) Convert to alpha + unpad/upsample back to full res if needed
-                    alpha = self._to_alpha(out_prob)
-                    if alpha.ndim >= 2:
-                        alpha = alpha[..., :th, :tw]  # remove pad
-                    if (th, tw) != (H, W):
-                        a_b1hw = _to_b1hw_alpha(alpha, device=img_bchw.device)
-                        a_b1hw = F.interpolate(a_b1hw, size=(H, W), mode="bilinear", align_corners=False)
-                        alpha = a_b1hw[0, 0]
-                    return _to_2d_alpha_numpy(alpha)
-                except torch.cuda.OutOfMemoryError as e:
-                    last_exc = e
-                    torch.cuda.empty_cache()
-                    logger.warning(f"MatAnyone OOM at {th}x{tw}; retrying smaller. {e}")
-                    continue
-                except Exception as e:
-                    last_exc = e
-                    torch.cuda.empty_cache()
-                    logger.debug(traceback.format_exc())
-                    logger.warning(f"MatAnyone call failed at {th}x{tw}; retrying smaller. {e}")
-                    continue
-            # ---- 5) All attempts failed – return input mask or neutral alpha
-            logger.warning(f"MatAnyone calls failed; returning input mask or neutral alpha. {last_exc}")
-            if mask_1hw is not None:
-                return _to_2d_alpha_numpy(mask_1hw)
-            return np.full((H, W), 0.5, dtype=np.float32)
-# ============================================================================
-# 7) LOADER (MatAnyoneLoader)
-# ============================================================================
 class MatAnyoneLoader:
     """
-    Official MatAnyone loader with stateful, OOM-resilient session adapter.
     """
     def __init__(self, device: str = "cuda", cache_dir: str = "./checkpoints/matanyone_cache"):
-        self.device = _select_device(device)
         self.cache_dir = cache_dir
         os.makedirs(self.cache_dir, exist_ok=True)
-        self.model = None
-        self.core = None
-        self.adapter = None
         self.model_id = "PeiqingYang/MatAnyone"
         self.load_time = 0.0
-    # --- Robust imports (works with different packaging layouts) ---
-    def _import_model_and_core(self):
-        model_cls = core_cls = None
-        err_msgs = []
-        for mod, cls in [
-            ("matanyone.model.matanyone", "MatAnyone"),
-            ("matanyone", "MatAnyone"),
-        ]:
-            try:
-                m = __import__(mod, fromlist=[cls])
-                model_cls = getattr(m, cls)
-                break
-            except Exception as e:
-                err_msgs.append(f"model {mod}.{cls}: {e}")
-        for mod, cls in [
-            ("matanyone.inference.inference_core", "InferenceCore"),
-            ("matanyone", "InferenceCore"),
-        ]:
-            try:
-                m = __import__(mod, fromlist=[cls])
-                core_cls = getattr(m, cls)
-                break
-            except Exception as e:
-                err_msgs.append(f"core {mod}.{cls}: {e}")
-        if model_cls is None or core_cls is None:
-            raise ImportError("Could not import MatAnyone / InferenceCore: " + " | ".join(err_msgs))
-        return model_cls, core_cls
-    def load(self) -> Optional[Any]:
         logger.info(f"Loading MatAnyone from HF: {self.model_id} (device={self.device})")
         t0 = time.time()
         try:
-            model_cls, core_cls = self._import_model_and_core()
-            model_dtype, use_autocast, autocast_dtype = _choose_precision(self.device)
-            logger.info(f"MatAnyone precision: weights={model_dtype}, autocast={use_autocast and autocast_dtype}")
-            # HF weights (safetensors)
-            self.model = model_cls.from_pretrained(self.model_id)
-            # Move to device + dtype when possible
-            try:
-                self.model = self.model.to(self.device).to(model_dtype)
-            except Exception:
-                self.model = self.model.to(self.device)
-            self.model.eval()
-            # Full default cfg from official config.json (kept; enables memory features)
-            default_cfg = {
-                "amp": False,
-                "chunk_size": 1,  # single-frame stepping
-                "flip_aug": False,
-                "long_term": {
-                    "buffer_tokens": 2000,
-                    "count_usage": True,
-                    "max_mem_frames": 10,
-                    "max_num_tokens": 10000,
-                    "min_mem_frames": 5,
-                    "num_prototypes": 128
-                },
-                "max_internal_size": -1,
-                "max_mem_frames": 5,
-                "mem_every": 5,
-                "model": {
-                    "aux_loss": {"query": {"enabled": True, "weight": 0.01},
-                                 "sensory": {"enabled": True, "weight": 0.01}},
-                    "embed_dim": 256,
-                    "key_dim": 64,
-                    "mask_decoder": {"up_dims": [256, 128, 128, 64, 16]},
-                    "mask_encoder": {"final_dim": 256, "type": "resnet18"},
-                    "object_summarizer": {"add_pe": True, "embed_dim": 256, "num_summaries": 16},
-                    "object_transformer": {
-                        "embed_dim": 256, "ff_dim": 2048, "num_blocks": 3, "num_heads": 8,
-                        "num_queries": 16,
-                        "pixel_self_attention": {"add_pe_to_qkv": [True, True, False]},
-                        "query_self_attention": {"add_pe_to_qkv": [True, True, False]},
-                        "read_from_memory": {"add_pe_to_qkv": [True, True, False]},
-                        "read_from_past": {"add_pe_to_qkv": [True, True, False]},
-                        "read_from_pixel": {"add_pe_to_qkv": [True, True, False], "input_add_pe": False, "input_norm": False},
-                        "read_from_query": {"add_pe_to_qkv": [True, True, False], "output_norm": False}
-                    },
-                    "pixel_dim": 256,
-                    "pixel_encoder": {"ms_dims": [1024, 512, 256, 64, 3], "type": "resnet50"},
-                    "pixel_mean": [0.485, 0.456, 0.406],
-                    "pixel_pe_scale": 32,
-                    "pixel_pe_temperature": 128,
-                    "pixel_std": [0.229, 0.224, 0.225],
-                    "pretrained_resnet": False,
-                    "sensory_dim": 256,
-                    "value_dim": 256
-                },
-                "output_dir": None,
-                "save_all": True,
-                "save_aux": False,
-                "save_scores": False,
-                "stagger_updates": 5,
-                "top_k": 30,
-                "use_all_masks": False,
-                "use_long_term": True,
-                "visualize": False,
-                "weights": "pretrained_models/matanyone.pth"
-            }
-            # Merge with model.cfg if present; apply minimal overrides
-            cfg = getattr(self.model, "cfg", default_cfg) or default_cfg
-            if isinstance(cfg, dict):
-                cfg = dict(cfg)
-            overrides = {
-                "chunk_size": 1,
-                "flip_aug": False,
-            }
-            cfg.update(overrides)
-            cfg = EasyDict(cfg)
-            # Build inference core
-            try:
-                self.core = core_cls(self.model, cfg=cfg)
-            except TypeError:
-                self.core = core_cls(self.model)
-            # Some versions expose .to()
-            try:
-                if hasattr(self.core, "to"):
-                    self.core.to(self.device)
-            except Exception:
-                pass
-            # Build stateful adapter
-            max_edge = int(os.environ.get("MATANYONE_MAX_EDGE", "768"))
-            target_pixels = int(os.environ.get("MATANYONE_TARGET_PIXELS", "600000"))
-            self.adapter = _MatAnyoneSession(
-                self.core,
-                device=self.device,
-                model_dtype=model_dtype,
-                use_autocast=use_autocast,
-                autocast_dtype=autocast_dtype,
-                max_edge=max_edge,
-                target_pixels=target_pixels,
-            )
             self.load_time = time.time() - t0
-            logger.info(f"MatAnyone loaded in {self.load_time:.2f}s")
-            return self.adapter
         except Exception as e:
             logger.error(f"Failed to load MatAnyone: {e}")
             logger.debug(traceback.format_exc())
             return None
     def cleanup(self):
-        """Release model/core and clear CUDA cache."""
-        self.adapter = None
-        self.core = None
-        if self.model:
-            try:
-                del self.model
-            except Exception:
-                pass
-            self.model = None
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
     def get_info(self) -> Dict[str, Any]:
-        """Lightweight status for UI/self-check."""
         return {
-            "loaded": self.adapter is not None,
             "model_id": self.model_id,
-            "device": self.device,
             "load_time": self.load_time,
-            "model_type": type(self.model).__name__ if self.model else None,
         }
-    def debug_shapes(self, image, mask, tag: str = ""):
-        """Quick shape/dtype logger."""
-        try:
-            tv_img = torch.as_tensor(image)
-            tv_msk = torch.as_tensor(mask) if mask is not None else None
-            logger.info(f"[{tag}:image] shape={tuple(tv_img.shape)} dtype={tv_img.dtype}")
-            if tv_msk is not None:
-                logger.info(f"[{tag}:mask ] shape={tuple(tv_msk.shape)} dtype={tv_msk.dtype}")
-        except Exception as e:
-            logger.info(f"[{tag}] debug error: {e}")
-# ============================================================================
-# 8) PUBLIC SYMBOLS
-# ============================================================================
-__all__ = [
-    "MatAnyoneLoader",
-    "_MatAnyoneSession",
-    "_to_bchw",
-    "_resize_bchw",
-    "_to_chw_image",
-    "_to_1hw_mask",
-    "_to_b1hw_alpha",
-    "_to_2d_alpha_numpy",
-    "_compute_scaled_size",
-    "debug_shapes",
-]
-# ============================================================================
-# 9) CLI DEMO (OPTIONAL QUICK TEST)
-# ============================================================================
-if __name__ == "__main__":
-    import sys
-    import cv2  # only for demo
-    logging.basicConfig(level=logging.INFO)
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-    if len(sys.argv) < 2:
-        print(f"Usage: {sys.argv[0]} image.jpg [mask.png]")
-        raise SystemExit(1)
-    image_path = sys.argv[1]
-    mask_path = sys.argv[2] if len(sys.argv) > 2 else None
-    img_bgr = cv2.imread(image_path, cv2.IMREAD_COLOR)
-    if img_bgr is None:
-        print(f"Could not load image {image_path}")
-        raise SystemExit(2)
-    # OpenCV → RGB
-    img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
-    mask = None
-    if mask_path:
-        mask = cv2.imread(mask_path, cv2.IMREAD_GRAYSCALE)
-        if mask is not None and mask.max() > 1:
-            mask = (mask.astype(np.float32) / 255.0)
-    loader = MatAnyoneLoader(device=device)
-    session = loader.load()
-    if not session:
-        print("Failed to load MatAnyone")
-        raise SystemExit(3)
-    alpha = session(img_rgb, mask if mask is not None else np.ones(img_rgb.shape[:2], np.float32))
-    cv2.imwrite("alpha_out.png", (np.clip(alpha, 0, 1) * 255).astype(np.uint8))
-    print("Alpha matte written to alpha_out.png")

 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
+MatAnyone Loader - Official InferenceCore API Implementation
+============================================================
+Fixed to use official MatAnyone API to resolve tensor dimension issues.
+No manual tensor manipulation - let InferenceCore handle everything internally.
 """
 import os
 import time
 import logging
+import tempfile
 import traceback
+from pathlib import Path
+from typing import Optional, Dict, Any, Tuple
 import numpy as np
 import torch
+import cv2
 logger = logging.getLogger(__name__)
 class MatAnyoneLoader:
     """
+    Official MatAnyone loader using InferenceCore API.
+    This fixes the tensor dimension mismatch by using the official API
+    which handles all tensor dimensions internally.
     """
     def __init__(self, device: str = "cuda", cache_dir: str = "./checkpoints/matanyone_cache"):
+        self.device = self._select_device(device)
         self.cache_dir = cache_dir
         os.makedirs(self.cache_dir, exist_ok=True)
+        self.processor = None
         self.model_id = "PeiqingYang/MatAnyone"
         self.load_time = 0.0
+        self.loaded = False
+        self.load_error = None
+        self.temp_dir = Path(tempfile.mkdtemp())
+    def _select_device(self, pref: str) -> str:
+        """Select best available device."""
+        pref = (pref or "").lower()
+        if pref.startswith("cuda"):
+            return "cuda" if torch.cuda.is_available() else "cpu"
+        if pref == "cpu":
+            return "cpu"
+        return "cuda" if torch.cuda.is_available() else "cpu"
+    def load(self) -> bool:
+        """Load MatAnyone using official InferenceCore API."""
+        if self.loaded:
+            return True
         logger.info(f"Loading MatAnyone from HF: {self.model_id} (device={self.device})")
         t0 = time.time()
         try:
+            # Import the official API
+            from matanyone.inference.inference_core import InferenceCore
+            # Use official API - this handles ALL tensor dimensions internally
+            # No manual tensor reshaping needed!
+            self.processor = InferenceCore(self.model_id)
+            self.loaded = True
             self.load_time = time.time() - t0
+            logger.info(f"MatAnyone loaded successfully via InferenceCore API in {self.load_time:.2f}s")
+            return True
+        except ImportError as e:
+            self.load_error = f"MatAnyone not installed: {e}"
+            logger.error(f"Failed to import MatAnyone. Install with: pip install git+https://github.com/pq-yang/MatAnyone.git@main")
+            return False
         except Exception as e:
+            self.load_error = str(e)
             logger.error(f"Failed to load MatAnyone: {e}")
             logger.debug(traceback.format_exc())
+            return False
+    def process_video(self, video_path: str, mask_path: str, output_dir: Optional[str] = None,
+                     max_size: int = 720, save_frames: bool = False) -> Tuple[Optional[str], Optional[str]]:
+        """
+        Process video using official MatAnyone API.
+        Args:
+            video_path: Path to input video
+            mask_path: Path to first frame mask
+            output_dir: Output directory (uses temp if None)
+            max_size: Maximum resolution (-1 for original)
+            save_frames: Whether to save individual frames
+        Returns:
+            (foreground_path, alpha_path) or (None, None) on error
+        """
+        if not self.loaded:
+            if not self.load():
+                logger.error(f"MatAnyone not loaded: {self.load_error}")
+                return None, None
+        if output_dir is None:
+            output_dir = str(self.temp_dir)
+        try:
+            # Use official API - no tensor manipulation needed!
+            # The API handles all dimension requirements internally
+            foreground_path, alpha_path = self.processor.process_video(
+                input_path=str(video_path),
+                mask_path=str(mask_path),
+                output_path=str(output_dir),
+                max_size=max_size,
+                save_frames=save_frames
+            )
+            logger.info(f"MatAnyone processing complete: fg={foreground_path}, alpha={alpha_path}")
+            return foreground_path, alpha_path
+        except Exception as e:
+            logger.error(f"MatAnyone processing failed: {e}")
+            logger.debug(traceback.format_exc())
+            return None, None
+    def process_frames_to_alpha(self, frames: np.ndarray, initial_mask: np.ndarray,
+                                output_dir: Optional[str] = None) -> Optional[np.ndarray]:
+        """
+        Process video frames and return alpha masks.
+        This is a compatibility wrapper for frame-based processing.
+        Args:
+            frames: Video frames as numpy array (T, H, W, C) or list
+            initial_mask: First frame mask (H, W) with values 0-255
+            output_dir: Optional output directory
+        Returns:
+            Alpha masks array (T, H, W) or None on error
+        """
+        if not self.loaded:
+            if not self.load():
+                return None
+        if output_dir is None:
+            output_dir = str(self.temp_dir)
+        # Save frames as temporary video
+        temp_video_path = Path(output_dir) / "temp_input.mp4"
+        temp_mask_path = Path(output_dir) / "temp_mask.png"
+        try:
+            # Convert frames to video
+            if isinstance(frames, list):
+                frames = np.stack(frames)
+            # Ensure correct format
+            if frames.ndim == 5:  # (B, C, T, H, W) or similar
+                # Take first batch, rearrange to (T, H, W, C)
+                frames = frames[0]
+                if frames.shape[0] == 3:  # Channels first
+                    frames = frames.transpose(1, 2, 3, 0)
+            elif frames.ndim == 4 and frames.shape[1] == 3:  # (T, C, H, W)
+                frames = frames.transpose(0, 2, 3, 1)
+            # Write video
+            fps = 30
+            height, width = frames.shape[1:3]
+            fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+            out = cv2.VideoWriter(str(temp_video_path), fourcc, fps, (width, height))
+            for frame in frames:
+                if frame.dtype in (np.float32, np.float64):
+                    frame = (frame * 255).astype(np.uint8)
+                if frame.shape[-1] == 3:
+                    frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
+                out.write(frame)
+            out.release()
+            # Save mask
+            if initial_mask.dtype in (np.float32, np.float64):
+                initial_mask = (initial_mask * 255).astype(np.uint8)
+            cv2.imwrite(str(temp_mask_path), initial_mask)
+            # Process with official API
+            _, alpha_path = self.process_video(
+                str(temp_video_path),
+                str(temp_mask_path),
+                str(output_dir)
+            )
+            if alpha_path:
+                # Load alpha video and return as array
+                return self._load_alpha_video(alpha_path)
             return None
+        except Exception as e:
+            logger.error(f"Frame processing failed: {e}")
+            return None
+        finally:
+            # Cleanup temp files
+            if temp_video_path.exists():
+                temp_video_path.unlink()
+            if temp_mask_path.exists():
+                temp_mask_path.unlink()
+    def _load_alpha_video(self, alpha_video_path: str) -> Optional[np.ndarray]:
+        """Load alpha video and return as numpy array."""
+        try:
+            cap = cv2.VideoCapture(str(alpha_video_path))
+            frames = []
+            while True:
+                ret, frame = cap.read()
+                if not ret:
+                    break
+                # Convert to grayscale if needed
+                if len(frame.shape) == 3:
+                    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
+                frames.append(frame / 255.0)  # Normalize to 0-1
+            cap.release()
+            return np.array(frames) if frames else None
+        except Exception as e:
+            logger.error(f"Failed to load alpha video: {e}")
+            return None
     def cleanup(self):
         """
         Release the processor, delete the temp directory and free CUDA cache.

         ``self.loaded`` is reset together with ``self.processor`` so that the
         next processing call goes through ``load()`` again instead of calling
         into a ``None`` processor, and so ``get_info()`` reports the real state.
         Safe to call more than once.
         """
         self.processor = None
         self.loaded = False
         # Remove the temp directory and everything written into it.
         if self.temp_dir.exists():
             import shutil
             shutil.rmtree(self.temp_dir, ignore_errors=True)
         # Hand cached GPU memory back when CUDA is present.
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
     def get_info(self) -> Dict[str, Any]:
         """Return a snapshot of the loader's state as a plain dictionary.

         Keys: ``loaded``, ``model_id``, ``device`` (stringified),
         ``load_time``, ``error`` (the stored ``load_error``) and a fixed
         ``api`` label.
         """
         info: Dict[str, Any] = {}
         info["loaded"] = self.loaded
         info["model_id"] = self.model_id
         info["device"] = str(self.device)
         info["load_time"] = self.load_time
         info["error"] = self.load_error
         info["api"] = "InferenceCore (official)"
         return info
+    def reset(self):
+        """Reset the processor for a new video."""
+        # The official API handles session management internally
+        # Just log that reset was called
+        logger.info("MatAnyone session reset requested (handled by InferenceCore)")
+    # Compatibility method for existing code that might call this
+    def __call__(self, image, mask=None, **kwargs):
+        """
+        Direct call compatibility wrapper.
+        For single frame processing or backwards compatibility.
+        """
+        if isinstance(image, (list, np.ndarray)) and mask is not None:
+            # Process as frames
+            if not isinstance(image, np.ndarray):
+                image = np.array(image)
+            if image.ndim == 3:  # Single frame
+                image = image[np.newaxis, ...]
+            alphas = self.process_frames_to_alpha(image, mask)
+            if alphas is not None and len(alphas) > 0:
+                return alphas[0] if alphas.shape[0] == 1 else alphas
+        # Fallback
+        logger.warning("Direct call to MatAnyoneLoader not fully supported with official API")
+        return mask if mask is not None else np.zeros(image.shape[:2], dtype=np.float32)
+# For backwards compatibility - expose the session class name even though no
+# separate session class is defined here; the name is simply bound to the
+# loader class, so both names refer to the same object.
+_MatAnyoneSession = MatAnyoneLoader  # Alias for compatibility
+# Names exported by `from <module> import *`; the underscore-prefixed alias
+# is listed explicitly because star-imports would otherwise skip it.
+__all__ = ["MatAnyoneLoader", "_MatAnyoneSession"]