MogensR commited on
Commit
ee1b711
·
1 Parent(s): e295279
Files changed (1) hide show
  1. models/matanyone_loader.py +295 -832
models/matanyone_loader.py CHANGED
@@ -2,11 +2,11 @@
2
  """
3
  MatAnyone Adapter (streaming, API-agnostic)
4
  -------------------------------------------
5
- - Works with multiple MatAnyone variants:
6
- - frame API: core.step(image[, mask]) or session.process_frame(image, mask)
7
- - video API: process_video(frames, mask) (falls back to chunking)
8
  - Streams frames: no full-video-in-RAM.
9
- - Emits alpha.mp4 (grayscale) and fg.mp4 (RGB) as it goes.
10
  - Validates outputs and raises MatAnyError on failure (so pipeline can fallback).
11
 
12
  I/O conventions:
@@ -21,18 +21,21 @@
21
  import os
22
  import cv2
23
  import sys
24
- import json
25
- import math
26
  import time
 
 
27
  import torch
28
  import logging
29
- import tempfile
30
  import numpy as np
31
  from pathlib import Path
32
- from typing import Optional, Callable, Tuple, Union
33
 
34
  log = logging.getLogger(__name__)
35
 
 
 
 
 
36
  def _emit_progress(cb, pct: float, msg: str):
37
  if not cb:
38
  return
@@ -42,85 +45,24 @@ def _emit_progress(cb, pct: float, msg: str):
42
  try:
43
  cb(msg) # legacy 1-arg
44
  except TypeError:
45
- pass # ignore if cb is incompatible
 
46
 
47
  class MatAnyError(RuntimeError):
48
  """Custom exception for MatAnyone processing errors."""
49
  pass
50
 
51
 
52
- def _to_device_batch(frames_bgr_np, device, dtype=torch.float16):
53
- """
54
- frames_bgr_np: list or np.ndarray of shape [N,H,W,3], dtype=uint8, BGR
55
- Returns torch tensor [N,3,H,W] on device, normalized to 0..1
56
- """
57
- if isinstance(frames_bgr_np, list):
58
- frames_bgr_np = np.stack(frames_bgr_np, axis=0)
59
- frames_rgb = frames_bgr_np[..., ::-1].copy(order="C") # BGR->RGB
60
- pin = torch.from_numpy(frames_rgb).pin_memory() # [N,H,W,3]
61
- t = pin.permute(0, 3, 1, 2).contiguous().to(device, non_blocking=True)
62
- t = t.to(dtype=dtype) / 255.0
63
- return t # [N,3,H,W]
64
-
65
-
66
- def _select_matany_mode(core):
67
- """Pick best available API."""
68
- if hasattr(core, "process_frame"):
69
- return "process_frame"
70
- if hasattr(core, "_process_tensor_video"):
71
- return "_process_tensor_video"
72
- if hasattr(core, "step"):
73
- return "step"
74
- raise MatAnyError("MatAnyone core has no supported API (process_frame/_process_tensor_video/step).")
75
-
76
-
77
- def _matany_run(core, mode, frames_04chw, seed_1hw=None, use_fp16=False):
78
- """
79
- Returns (alpha [N,1,H,W], fg [N,3,H,W]) on current device.
80
- """
81
- with torch.no_grad():
82
- if mode == "process_frame":
83
- alphas, fgs = [], []
84
- for i in range(frames_04chw.shape[0]):
85
- f = frames_04chw[i:i+1] # [1,3,H,W]
86
- if seed_1hw is not None and seed_1hw.ndim == 3:
87
- a, fg = core.process_frame(f, seed_1hw.unsqueeze(0))
88
- else:
89
- a, fg = core.process_frame(f)
90
- alphas.append(a) # [1,1,H,W]
91
- fgs.append(fg) # [1,3,H,W]
92
- alpha = torch.cat(alphas, dim=0)
93
- fg = torch.cat(fgs, dim=0)
94
- return alpha, fg
95
-
96
- elif mode == "_process_tensor_video":
97
- # Many repos expect float32 for this path
98
- return core._process_tensor_video(frames_04chw.float(), seed_1hw)
99
-
100
- elif mode == "step":
101
- alphas, fgs = [], []
102
- for i in range(frames_04chw.shape[0]):
103
- f = frames_04chw[i:i+1]
104
- if i == 0 and seed_1hw is not None:
105
- a, fg = core.step(f, seed_1hw)
106
- else:
107
- a, fg = core.step(f)
108
- alphas.append(a)
109
- fgs.append(fg)
110
- alpha = torch.cat(alphas, dim=0)
111
- fg = torch.cat(fgs, dim=0)
112
- return alpha, fg
113
-
114
- raise MatAnyError(f"Unsupported MatAnyone mode: {mode}")
115
-
116
-
117
- def _cuda_snapshot():
118
  if not torch.cuda.is_available():
119
  return "CUDA: N/A"
120
- i = torch.cuda.current_device()
121
- return (f"device={i}, name={torch.cuda.get_device_name(i)}, "
122
- f"alloc={torch.cuda.memory_allocated(i)/1e9:.2f}GB, "
123
- f"reserved={torch.cuda.memory_reserved(i)/1e9:.2f}GB")
 
 
 
124
 
125
 
126
  def _safe_empty_cache():
@@ -132,99 +74,6 @@ def _safe_empty_cache():
132
  torch.cuda.empty_cache()
133
 
134
 
135
- def _to_uint8_cpu(alpha_n1hw, fg_n3hw):
136
- alpha_cpu = (alpha_n1hw.clamp(0, 1) * 255.0).byte().squeeze(1).contiguous().cpu().numpy() # [N,H,W]
137
- fg_cpu = (fg_n3hw.clamp(0, 1) * 255.0).byte().permute(0, 2, 3, 1).contiguous().cpu().numpy() # [N,H,W,3] RGB
138
- return alpha_cpu, fg_cpu
139
-
140
-
141
- def _to_device_batch(frames_bgr_np, device, dtype=torch.float16):
142
- """
143
- Convert a list/array of BGR uint8 frames [N,H,W,3] to a normalized
144
- CHW tensor on device using pinned memory + non_blocking copies.
145
- """
146
- if isinstance(frames_bgr_np, list):
147
- frames_bgr_np = np.stack(frames_bgr_np, axis=0) # [N,H,W,3]
148
- # BGR -> RGB
149
- frames_rgb = frames_bgr_np[..., ::-1].copy(order="C")
150
- # to torch
151
- pin = torch.from_numpy(frames_rgb).pin_memory() # uint8 [N,H,W,3]
152
- # NCHW and normalize
153
- t = pin.permute(0, 3, 1, 2).contiguous().to(device, non_blocking=True)
154
- t = t.to(dtype=dtype) / 255.0
155
- return t # [N,3,H,W]
156
-
157
-
158
- def _select_matany_mode(core):
159
- """
160
- Pick the best-available MatAnyone API at runtime.
161
- Priority: process_frame > _process_tensor_video > step
162
- """
163
- if hasattr(core, "process_frame"):
164
- return "process_frame"
165
- if hasattr(core, "_process_tensor_video"):
166
- return "_process_tensor_video"
167
- if hasattr(core, "step"):
168
- return "step"
169
- raise MatAnyError("No supported MatAnyone API on core (process_frame/_process_tensor_video/step).")
170
-
171
-
172
- def _matany_run(core, mode, frames_04chw, seed_1hw=None):
173
- """
174
- Dispatch into the selected API. All tensors are on device.
175
- Returns (alpha_1nhw, fg_n3hw) where alpha is [N,1,H,W], fg [N,3,H,W].
176
- """
177
- with torch.no_grad():
178
- if mode == "process_frame":
179
- alphas, fgs = [], []
180
- # process_frame usually wants per-frame tensors in [1,3,H,W]
181
- for i in range(frames_04chw.shape[0]):
182
- f = frames_04chw[i:i+1] # [1,3,H,W]
183
- if seed_1hw is not None and seed_1hw.ndim == 3:
184
- a, fg = core.process_frame(f, seed_1hw.unsqueeze(0))
185
- else:
186
- a, fg = core.process_frame(f)
187
- alphas.append(a) # [1,1,H,W]
188
- fgs.append(fg) # [1,3,H,W]
189
- alpha = torch.cat(alphas, dim=0)
190
- fg = torch.cat(fgs, dim=0)
191
- return alpha, fg
192
-
193
- elif mode == "_process_tensor_video":
194
- return core._process_tensor_video(frames_04chw.float(), seed_1hw)
195
-
196
- elif mode == "step":
197
- alphas, fgs = [], []
198
- for i in range(frames_04chw.shape[0]):
199
- f = frames_04chw[i:i+1]
200
- if i == 0 and seed_1hw is not None:
201
- a, fg = core.step(f, seed_1hw)
202
- else:
203
- a, fg = core.step(f)
204
- alphas.append(a)
205
- fgs.append(fg)
206
- alpha = torch.cat(alphas, dim=0)
207
- fg = torch.cat(fgs, dim=0)
208
- return alpha, fg
209
-
210
- raise MatAnyError(f"Unsupported mode: {mode}")
211
-
212
-
213
- def _safe_empty_cache():
214
- if torch.cuda.is_available():
215
- torch.cuda.synchronize()
216
- torch.cuda.empty_cache()
217
-
218
-
219
- def _cuda_snapshot():
220
- if not torch.cuda.is_available():
221
- return "CUDA: N/A"
222
- i = torch.cuda.current_device()
223
- return (f"device={i}, name={torch.cuda.get_device_name(i)}, "
224
- f"alloc={torch.cuda.memory_allocated(i)/1e9:.2f}GB, "
225
- f"reserved={torch.cuda.memory_reserved(i)/1e9:.2f}GB")
226
-
227
-
228
  def _read_mask_hw(mask_path: Path, target_hw: Tuple[int, int]) -> np.ndarray:
229
  """Read mask image, convert to float32 [0,1], resize to target (H,W)."""
230
  if not Path(mask_path).exists():
@@ -241,267 +90,197 @@ def _read_mask_hw(mask_path: Path, target_hw: Tuple[int, int]) -> np.ndarray:
241
 
242
  def _to_chw01(img_bgr: np.ndarray) -> np.ndarray:
243
  """BGR [H,W,3] uint8 -> CHW float32 [0,1] RGB."""
244
- # OpenCV gives BGR; convert to RGB
245
  rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
246
  rgbf = rgb.astype(np.float32) / 255.0
247
  chw = np.transpose(rgbf, (2, 0, 1)) # C,H,W
248
  return chw
249
 
250
 
251
- def _mask_to_1hw(mask_hw01: np.ndarray) -> np.ndarray:
252
- """HW float32 [0,1] -> 1HW float32 [0,1]."""
253
- return np.expand_dims(mask_hw01, axis=0)
254
-
255
-
256
- def _ensure_dir(p: Path) -> None:
257
- p.mkdir(parents=True, exist_ok=True)
258
-
259
-
260
- def _open_video_writers(out_dir: Path, fps: float, size: Tuple[int, int]) -> Tuple[cv2.VideoWriter, cv2.VideoWriter]:
261
- """Return (alpha_writer, fg_writer). size=(W,H)."""
262
- fourcc = cv2.VideoWriter_fourcc(*"mp4v")
263
- W, H = size
264
- alpha_path = str(out_dir / "alpha.mp4")
265
- fg_path = str(out_dir / "fg.mp4")
266
- # alpha: single channel => write as 3-channel grayscale for broad compatibility
267
- alpha_writer = cv2.VideoWriter(alpha_path, fourcc, fps, (W, H), True)
268
- fg_writer = cv2.VideoWriter(fg_path, fourcc, fps, (W, H), True)
269
- if not alpha_writer.isOpened() or not fg_writer.isOpened():
270
- raise MatAnyError("Failed to open VideoWriter for alpha/fg outputs.")
271
- return alpha_writer, fg_writer
272
-
273
-
274
  def _validate_nonempty(file_path: Path) -> None:
275
  if not file_path.exists() or file_path.stat().st_size == 0:
276
  raise MatAnyError(f"Output file missing/empty: {file_path}")
277
 
278
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
279
  class MatAnyoneSession:
280
  """
281
  Unified, streaming wrapper over MatAnyone variants.
282
 
283
  Public:
284
  - process_stream(video_path, seed_mask_path, out_dir, progress_cb)
 
285
 
286
- Detects API once at init:
287
- - prefers frame-wise: core.step(img[, mask]) OR session.process_frame(img, mask)
288
- - else uses video-wise: process_video(frames, mask) with chunk fallback
289
  """
290
 
291
  def __init__(self, device: Optional[str] = None, precision: str = "auto"):
292
- """Initialize MatAnyoneSession with optional device and precision settings.
293
-
294
  Args:
295
- device: Device to run on (e.g., 'cuda', 'cpu', 'cuda:0'). If None, auto-detects CUDA.
296
- precision: One of 'auto', 'fp32', or 'fp16'. 'auto' uses fp16 if CUDA is available.
297
  """
298
  self.device = torch.device(device) if device else (torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu"))
299
  self.precision = precision.lower()
 
300
  self._core = None
301
- self._api_mode = None # "step", "process_frame", or "process_video"
302
- self._frame_times = []
303
- self._start_time = 0.0
304
- self._gpu_mem_allocated = 0.0
305
- self._gpu_mem_cached = 0.0
306
  self._lazy_init()
307
-
308
- # Log initialization
309
- log.info(f"Initialized MatAnyoneSession on {self.device} with precision {self.precision}")
310
  if torch.cuda.is_available():
311
- log.info(f"CUDA device: {torch.cuda.get_device_name(self.device)}")
 
312
  self._log_gpu_memory()
313
 
314
- def _log_gpu_memory(self) -> None:
315
- """Log current GPU memory usage."""
316
  if torch.cuda.is_available():
 
317
  try:
318
- allocated = torch.cuda.memory_allocated(self.device) / 1024**2
319
- cached = torch.cuda.memory_reserved(self.device) / 1024**2
320
- log.info(f"GPU Memory - Allocated: {allocated:.1f}MB, Cached: {cached:.1f}MB")
321
- return allocated, cached
322
  except Exception as e:
323
- log.warning(f"Failed to get GPU memory info: {e}")
324
  return 0.0, 0.0
325
-
326
  def _lazy_init(self) -> None:
327
- """Lazy initialization of the MatAnyone inference core."""
328
  try:
329
  from matanyone.inference.inference_core import InferenceCore # type: ignore
330
  except ImportError as e:
331
- raise MatAnyError(f"Failed to import MatAnyone: {e}. Please ensure it's installed correctly.")
332
  except Exception as e:
333
  raise MatAnyError(f"Unexpected error during MatAnyone import: {e}")
334
 
335
- # Log GPU info
336
- if torch.cuda.is_available():
337
- log.info(f"[GPU] CUDA is available. Device: {torch.cuda.get_device_name(0)}")
338
- log.info(f"[GPU] Memory allocated: {torch.cuda.memory_allocated()/1024**2:.1f}MB")
339
- log.info(f"[GPU] Memory cached: {torch.cuda.memory_reserved()/1024**2:.1f}MB")
340
- else:
341
- log.warning("[GPU] CUDA is not available. Using CPU (this will be slow!)")
342
-
343
- # Try zero-arg first, then repo-id variant
344
  try:
345
  self._core = InferenceCore()
346
  except TypeError:
347
- try:
348
- self._core = InferenceCore("PeiqingYang/MatAnyone")
349
- except Exception as e:
350
- raise MatAnyError(f"MatAnyone InferenceCore init failed: {e}")
351
-
352
- core = self._core
353
 
354
- # MODE SELECTION (prefer video) can be forced by env flags
355
  force_video = os.getenv("MATANY_FORCE_VIDEO", "1") == "1"
356
  force_step = os.getenv("MATANY_FORCE_STEP", "0") == "1"
357
 
358
- if force_step and hasattr(core, "step") and callable(getattr(core, "step")):
359
- self._api_mode = "step"
360
- elif force_video and hasattr(core, "process_video") and callable(getattr(core, "process_video")):
361
- self._api_mode = "process_video"
362
- elif hasattr(core, "process_video") and callable(getattr(core, "process_video")):
363
- self._api_mode = "process_video"
364
- elif hasattr(core, "process_frame") and callable(getattr(core, "process_frame")):
365
- self._api_mode = "process_frame"
366
- elif hasattr(core, "step") and callable(getattr(core, "step")):
367
  self._api_mode = "step"
368
  else:
369
- raise MatAnyError("No supported MatAnyone API found (process_video/process_frame/step).")
 
 
 
 
370
 
371
- log.info(f"[MATANY] Initialized on {self.device} | API mode = {self._api_mode}")
372
  self._initialized = True
373
 
374
  def _maybe_amp(self):
375
- # Use new API to silence deprecation warning
376
  if self.precision == "fp32":
377
  return torch.amp.autocast(device_type="cuda", enabled=False)
378
  if self.precision == "fp16":
379
- return torch.amp.autocast(device_type="cuda", enabled=True, dtype=torch.float16)
380
- return torch.amp.autocast(device_type="cuda", enabled=torch.cuda.is_available())
 
381
 
382
  def _validate_input_frame(self, frame: np.ndarray) -> None:
383
- """Validate input frame dimensions and type."""
384
  if not isinstance(frame, np.ndarray):
385
- raise MatAnyError(f"Frame must be a numpy array, got {type(frame)}")
386
  if frame.dtype != np.uint8:
387
  raise MatAnyError(f"Frame must be uint8, got {frame.dtype}")
388
  if frame.ndim != 3 or frame.shape[2] != 3:
389
- raise MatAnyError(f"Frame must be HWC with 3 channels, got shape {frame.shape}")
390
 
391
- def _run_frame(self, frame_bgr: np.ndarray, seed_1hw: Optional[np.ndarray], is_first: bool = False) -> np.ndarray:
392
  """
393
- Process a single frame through MatAnyone to generate an alpha matte.
394
- Uses strict 3D image (CHW) and 2D mask (HW) formats to avoid dimension issues.
395
-
396
- Args:
397
- frame_bgr: Input frame in BGR format (H,W,3) uint8
398
- seed_1hw: Optional mask in 1HW or HW format (float32 [0,1])
399
- is_first: Whether this is the first frame in the sequence
400
-
401
- Returns:
402
- Alpha matte in HW format (float32 [0,1])
403
-
404
- Raises:
405
- MatAnyError: If processing fails or invalid input is provided
406
  """
407
- # --- Prepare image tensor (CHW float32 [0,1]) ---
408
- img_chw = _to_chw01(frame_bgr) # (3,H,W) float32
 
409
  img_t = torch.from_numpy(img_chw).to(self.device)
410
-
411
- # --- Prepare mask tensor (HW float32 [0,1]) ---
412
  mask_t = None
413
  if is_first and seed_1hw is not None:
414
  if seed_1hw.ndim == 3 and seed_1hw.shape[0] == 1:
415
- seed_hw = seed_1hw[0] # (H,W)
416
  elif seed_1hw.ndim == 2:
417
  seed_hw = seed_1hw
418
  else:
419
  raise MatAnyError(f"seed mask must be 1HW or HW; got {seed_1hw.shape}")
420
- mask_t = torch.from_numpy(seed_hw).to(self.device) # (H,W)
421
-
422
- # --- Validate shapes ---
423
- if img_t.ndim != 3 or img_t.shape[0] != 3:
424
- raise MatAnyError(f"img_t must be CHW; got {tuple(img_t.shape)}")
425
- if mask_t is not None and mask_t.ndim != 2:
426
- raise MatAnyError(f"mask_t must be HW; got {tuple(mask_t.shape)}")
427
 
428
- # --- Process with MatAnyone ---
429
- frame_start_time = time.time()
430
  try:
431
  with torch.no_grad(), self._maybe_amp():
432
  if self._api_mode == "step":
433
- alpha = self._core.step(img_t, mask_t) if mask_t is not None else self._core.step(img_t)
434
  elif self._api_mode == "process_frame":
435
- alpha = self._core.process_frame(img_t, mask_t)
436
  else:
437
- raise MatAnyError("Internal error: Invalid API mode")
438
-
439
- # Log performance metrics
440
- frame_time = time.time() - frame_start_time
441
- self._frame_times.append(frame_time)
442
- if len(self._frame_times) > 10: # Keep last 10 frame times
443
- self._frame_times.pop(0)
444
-
445
- # Log GPU memory every 10 frames
446
- if len(self._frame_times) % 10 == 0:
447
- self._log_gpu_memory()
448
-
449
- return alpha
450
-
451
- except torch.cuda.OutOfMemoryError:
452
  self._log_gpu_memory()
453
- raise MatAnyError("CUDA out of memory. Try reducing the input resolution or batch size.")
454
  except RuntimeError as e:
455
  if "CUDA" in str(e):
 
456
  self._log_gpu_memory()
457
- raise MatAnyError(f"CUDA error: {e}")
458
- raise MatAnyError(f"Runtime error: {e}")
459
  except Exception as e:
460
- raise MatAnyError(f"Processing failed: {e}")
 
 
 
461
 
462
- # --- Process output ---
463
- # Convert to numpy and ensure correct shape/range
464
- if isinstance(alpha, torch.Tensor):
465
- alpha_np = alpha.detach().float().clamp(0, 1).squeeze().cpu().numpy()
466
  else:
467
- alpha_np = np.asarray(alpha, dtype=np.float32)
468
  if alpha_np.max() > 1.0:
469
- alpha_np = (alpha_np / 255.0).clip(0, 1)
470
-
471
- # Ensure 2D output (H,W)
472
  alpha_np = np.squeeze(alpha_np)
473
  if alpha_np.ndim != 2:
474
  raise MatAnyError(f"Expected 2D alpha matte; got shape {alpha_np.shape}")
475
-
476
- return alpha_np
477
 
478
  def _harvest_process_video_output(self, res, out_dir: Path, base: str) -> Tuple[Path, Path]:
479
  """
480
  Accepts varied return types from MatAnyone.process_video and produces
481
- (alpha.mp4, fg.mp4) inside out_dir. Strategies:
482
- - If res is a sequence of alpha arrays/tensors write our own videos.
483
- - If res is dict/tuple of paths copy/rename.
484
- - Else: glob typical output dirs for files matching base.
485
  """
486
- # Case A: sequence of masks
487
- import torch, numpy as np, cv2, glob, shutil
488
-
489
- def _as_np(a):
490
- if isinstance(a, torch.Tensor):
491
- a = a.detach().float().cpu().numpy()
492
- a = np.asarray(a)
493
- if a.ndim == 3 and a.shape[0] in (1,3): # (C,H,W) → prefer HW
494
- a = np.squeeze(a) if a.shape[0] == 1 else np.mean(a, axis=0)
495
- if a.max() > 1.0:
496
- a = a / 255.0
497
- return a.clip(0,1).astype(np.float32)
498
-
499
  alpha_mp4 = out_dir / "alpha.mp4"
500
  fg_mp4 = out_dir / "fg.mp4"
501
 
502
- # If we got arrays/tensors: we can't reconstruct FG without original frames here,
503
- # so prefer path-returning flows. If needed, you can extend this to re-read frames
504
- # and blend. For now, try to detect paths first.
505
  if isinstance(res, dict):
506
  cand_alpha = res.get("alpha") or res.get("alpha_path") or res.get("matte") or res.get("matte_path")
507
  cand_fg = res.get("fg") or res.get("fg_path") or res.get("foreground") or res.get("foreground_path")
@@ -510,13 +289,13 @@ def _as_np(a):
510
  shutil.copy2(cand_alpha, alpha_mp4); moved += 1
511
  if cand_fg and Path(cand_fg).exists():
512
  shutil.copy2(cand_fg, fg_mp4); moved += 1
513
- if moved == 2: return alpha_mp4, fg_mp4
 
514
 
 
515
  if isinstance(res, (list, tuple)) and len(res) >= 1:
516
- # Heuristic: assume list/tuple of file paths
517
  paths = [Path(x) for x in res if isinstance(x, (str, Path))]
518
  if paths:
519
- # Pick best matches by name
520
  alpha_candidates = [p for p in paths if p.exists() and ("alpha" in p.name or "matte" in p.name)]
521
  fg_candidates = [p for p in paths if p.exists() and ("fg" in p.name or "fore" in p.name)]
522
  if alpha_candidates and fg_candidates:
@@ -524,23 +303,25 @@ def _as_np(a):
524
  shutil.copy2(fg_candidates[0], fg_mp4)
525
  return alpha_mp4, fg_mp4
526
 
527
- # As last resort, glob common dirs created by the lib
528
  search_dirs = [Path.cwd(), out_dir, Path("results"), Path("result"), Path("output"), Path("outputs")]
529
- hits = []
530
  for d in search_dirs:
531
  if d.exists():
532
  hits.extend(list(d.rglob(f"*{base}*.*")))
533
- # choose best alpha/fg
534
  alpha_candidates = [p for p in hits if p.suffix.lower() in (".mp4",".mov",".mkv",".avi") and ("alpha" in p.name or "matte" in p.name)]
535
  fg_candidates = [p for p in hits if p.suffix.lower() in (".mp4",".mov",".mkv",".avi") and ("fg" in p.name or "fore" in p.name)]
536
  if alpha_candidates and fg_candidates:
537
- import shutil
538
  shutil.copy2(alpha_candidates[0], alpha_mp4)
539
  shutil.copy2(fg_candidates[0], fg_mp4)
540
  return alpha_mp4, fg_mp4
541
 
542
- raise MatAnyError("MatAnyone.process_video did not yield discoverable outputs.")
 
543
 
 
 
 
544
  def process_stream(
545
  self,
546
  video_path: Path,
@@ -548,520 +329,209 @@ def process_stream(
548
  out_dir: Optional[Path] = None,
549
  progress_cb: Optional[Callable] = None,
550
  ) -> Tuple[Path, Path]:
551
- """Process video stream with MatAnyone.
552
-
553
- Args:
554
- video_path: Input video file path (must exist and be readable)
555
- seed_mask_path: Optional seed mask image (grayscale, same size as video)
556
- out_dir: Output directory (default: video_path.parent)
557
- progress_cb: Callback for progress updates (signature: (float, str) or (str,))
558
 
559
  Returns:
560
- Tuple of (alpha_path, fg_path) output video paths
561
 
562
  Raises:
563
- MatAnyError: If processing fails for any reason
564
- FileNotFoundError: If input files are not found
565
- ValueError: If input parameters are invalid
566
  """
567
- # Input validation
568
  if not video_path.exists():
569
  raise FileNotFoundError(f"Input video not found: {video_path}")
570
-
571
- if seed_mask_path is not None and not seed_mask_path.exists():
572
- raise FileNotFoundError(f"Seed mask not found: {seed_mask_path}")
573
-
574
  if out_dir is None:
575
  out_dir = video_path.parent
576
-
577
  out_dir = Path(out_dir)
578
  out_dir.mkdir(parents=True, exist_ok=True)
579
-
580
- # Initialize progress tracking
581
- self._frame_times = []
582
- self._start_time = time.time()
583
- _emit_progress(progress_cb, 0.0, "Initializing video processing...")
584
 
585
- # Log GPU status
586
- if torch.cuda.is_available():
587
- _emit_progress(progress_cb, 0.01, "GPU detected, initializing CUDA...")
588
- else:
589
- _emit_progress(progress_cb, 0.01, "No GPU detected, using CPU (slower)...")
590
 
591
- cap = cv2.VideoCapture(str(video_path))
592
- if not cap.isOpened():
 
593
  raise MatAnyError(f"Failed to open video: {video_path}")
 
 
 
 
 
594
 
595
- N = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
596
- fps = cap.get(cv2.CAP_PROP_FPS)
597
- W = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
598
- H = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
599
- cap.release()
600
 
601
- log.info(f"[MATANY] Processing {N} frames ({W}x{H} @ {fps:.1f}fps) from {video_path}")
602
- _emit_progress(progress_cb, 0.05, f"Processing {N} frames ({W}x{H} @ {fps:.1f}fps)")
 
 
603
 
604
  try:
605
  if self._api_mode == "process_video":
606
- # --- PATH-BASED CALL (this wheel expects a video path, not tensors) ---
607
- _emit_progress(progress_cb, 0.1, "Using MatAnyone video mode (GPU-accelerated)")
608
-
609
- # Log before starting video processing
610
  if torch.cuda.is_available():
611
- mem_alloc, _ = self._log_gpu_memory()
612
- _emit_progress(progress_cb, 0.12, f"GPU memory before processing: {mem_alloc:.1f}MB")
613
-
614
- # Some builds accept (video_path, seed_mask_path), others just (video_path)
615
- try:
616
- _emit_progress(progress_cb, 0.15, "Starting video processing with mask...")
617
- res = self._core.process_video(
618
- str(video_path),
619
- str(seed_mask_path) if seed_mask_path is not None else None
620
- )
621
- except TypeError as e:
622
- if "takes 2 positional arguments but 3 were given" in str(e):
623
- _emit_progress(progress_cb, 0.15, "Starting video processing without mask...")
624
- res = self._core.process_video(str(video_path))
625
- else:
626
- raise
627
-
628
- # Log after processing
629
- if torch.cuda.is_available():
630
- _emit_progress(progress_cb, 0.9, f"Processing complete. GPU memory used: {torch.cuda.memory_allocated()/1024**2:.1f}MB")
631
- else:
632
- _emit_progress(progress_cb, 0.9, "Processing complete.")
633
-
634
- # Normalize output files
635
- _emit_progress(progress_cb, 0.95, "Finalizing output files...")
636
- alpha_path, fg_path = self._harvest_process_video_output(res, out_dir, base=video_path.stem)
637
- _validate_nonempty(alpha_path)
638
- _validate_nonempty(fg_path)
639
-
640
- _emit_progress(progress_cb, 1.0, "Processing complete!")
641
- return alpha_path, fg_path
642
-
643
- else:
644
- # Frame-by-frame (preferred)
645
- log.info(f"[MATANY] Using frame-by-frame mode: {self._api_mode}")
646
- _emit_progress(progress_cb, 0.1, f"Using {self._api_mode} mode (frame-by-frame)")
647
-
648
- cap = cv2.VideoCapture(str(video_path))
649
- alpha_path = out_dir / "alpha.mp4"
650
- fg_path = out_dir / "fg.mp4"
651
-
652
- # Initialize video writers
653
- _emit_progress(progress_cb, 0.12, "Initializing video writers...")
654
- alpha_writer = cv2.VideoWriter(
655
- str(alpha_path),
656
- cv2.VideoWriter_fourcc(*'mp4v'),
657
- fps,
658
- (W, H),
659
- isColor=False
660
- )
661
- fg_writer = cv2.VideoWriter(
662
- str(fg_path),
663
- cv2.VideoWriter_fourcc(*'mp4v'),
664
- fps,
665
- (W, H),
666
- isColor=True
667
- )
668
-
669
- if not alpha_writer.isOpened() or not fg_writer.isOpened():
670
- raise MatAnyError("Failed to initialize video writers")
671
 
 
672
  try:
673
- # Load seed mask if provided
674
- seed_1hw = None
675
- if seed_mask_path is not None:
676
- seed_1hw = _read_mask_hw(seed_mask_path, (H, W))
677
-
678
- idx = 0
679
- last_progress_update = 0
680
- frame_times = []
681
- start_time = time.time()
682
-
683
- while True:
684
- ret, frame = cap.read()
685
- if not ret:
686
- break
687
-
688
- frame_start_time = time.time()
689
-
690
- # Update progress more frequently (every 1% or 5 frames, whichever is more frequent)
691
- current_progress = (idx / N) if N > 0 else 0.0
692
- if idx % max(5, N//100) == 0 or time.time() - last_progress_update > 2.0:
693
- # Calculate progress metrics
694
- elapsed = time.time() - start_time
695
- if idx > 0 and current_progress > 0:
696
- # Calculate ETA
697
- eta_seconds = (elapsed / current_progress) * (1 - current_progress)
698
- if eta_seconds > 3600:
699
- eta_str = f"{eta_seconds/3600:.1f} hours"
700
- elif eta_seconds > 60:
701
- eta_str = f"{eta_seconds/60:.1f} minutes"
702
- else:
703
- eta_str = f"{eta_seconds:.0f} seconds"
704
-
705
- # Calculate processing speed
706
- fps = idx / elapsed if elapsed > 0 else 0
707
-
708
- # Add GPU memory info if available
709
- gpu_info = ""
710
- if torch.cuda.is_available():
711
- mem_alloc = torch.cuda.memory_allocated() / 1024**2
712
- mem_cached = torch.cuda.memory_reserved() / 1024**2
713
- gpu_info = f" | GPU: {mem_alloc:.1f}/{mem_cached:.1f}MB"
714
-
715
- status = (f"Processing frame {idx+1}/{N} (ETA: {eta_str}, "
716
- f"{fps:.1f} FPS{gpu_info}")
717
- _emit_progress(progress_cb, min(0.99, current_progress), status)
718
- last_progress_update = time.time()
719
-
720
- # Process frame
721
- log.debug(f"[MATANY] Processing frame {idx+1}/{N}")
722
- # Only pass seed mask on first frame
723
- current_mask = seed_1hw if idx == 0 else None
724
- alpha_hw = self._run_frame(frame, current_mask, is_first=(idx == 0))
725
-
726
- # Calculate frame processing time
727
- frame_time = time.time() - frame_start_time
728
- frame_times.append(frame_time)
729
- if len(frame_times) > 10: # Keep last 10 frame times for average
730
- frame_times.pop(0)
731
-
732
- # Log GPU memory usage occasionally
733
- if idx % 50 == 0 and torch.cuda.is_available():
734
- log.info(f"[GPU] Memory allocated: {torch.cuda.memory_allocated()/1024**2:.1f}MB, "
735
- f"Cached: {torch.cuda.memory_reserved()/1024**2:.1f}MB, "
736
- f"Avg frame time: {sum(frame_times)/len(frame_times)*1000:.1f}ms")
737
-
738
- # Compose output frames
739
- alpha_u8 = (alpha_hw * 255.0 + 0.5).astype(np.uint8)
740
- alpha_rgb = cv2.cvtColor(alpha_u8, cv2.COLOR_GRAY2BGR)
741
- fg_bgr = (frame.astype(np.float32) * (alpha_hw[..., None] / 255.0)).astype(np.uint8)
742
-
743
- # Write outputs
744
- alpha_writer.write(alpha_rgb)
745
- fg_writer.write(fg_bgr)
746
- idx += 1
747
-
748
- except Exception as e:
749
- # Log detailed error information
750
- error_msg = f"Error processing frame {idx+1}/{N}: {str(e)}"
751
- log.error(error_msg, exc_info=True)
752
-
753
- # Add GPU memory info if available
754
- if torch.cuda.is_available():
755
- mem_alloc = torch.cuda.memory_allocated() / 1024**2
756
- mem_cached = torch.cuda.memory_reserved() / 1024**2
757
- error_msg += (f"\nGPU Memory - Allocated: {mem_alloc:.1f}MB, "
758
- f"Cached: {mem_cached:.1f}MB")
759
-
760
- # Add frame processing stats
761
- if frame_times:
762
- avg_time = sum(frame_times) / len(frame_times)
763
- error_msg += f"\nAvg frame time: {avg_time*1000:.1f}ms"
764
-
765
- _emit_progress(progress_cb, -1, f"ERROR: {error_msg}")
766
- raise MatAnyError(error_msg) from e
767
-
768
- finally:
769
- # Cleanup resources
770
- try:
771
- if 'cap' in locals() and hasattr(cap, 'isOpened') and cap.isOpened():
772
- cap.release()
773
- if 'alpha_writer' in locals() and alpha_writer is not None:
774
- if hasattr(alpha_writer, 'isOpened') and alpha_writer.isOpened():
775
- alpha_writer.release()
776
- if 'fg_writer' in locals() and fg_writer is not None:
777
- if hasattr(fg_writer, 'isOpened') and fg_writer.isOpened():
778
- fg_writer.release()
779
-
780
- # Log final stats
781
- total_time = time.time() - start_time
782
- fps = idx / total_time if total_time > 0 else 0
783
-
784
- # Log GPU memory info if available
785
- gpu_info = ""
786
- if torch.cuda.is_available():
787
- mem_alloc = torch.cuda.memory_allocated() / 1024**2
788
- mem_cached = torch.cuda.memory_reserved() / 1024**2
789
- gpu_info = f"\nGPU Memory - Allocated: {mem_alloc:.1f}MB, Cached: {mem_cached:.1f}MB"
790
-
791
- log.info(
792
- f"[MATANY] Processed {idx} frames in {total_time:.1f}s ({fps:.1f} FPS){gpu_info}"
793
- )
794
-
795
- # Validate outputs
796
- _validate_nonempty(alpha_path)
797
- _validate_nonempty(fg_path)
798
-
799
- # Final progress update
800
- _emit_progress(
801
- progress_cb,
802
- 1.0,
803
- f"Complete! Processed {idx} frames at {fps:.1f} FPS{gpu_info}"
804
- )
805
-
806
- return alpha_path, fg_path
807
-
808
- except Exception as e:
809
- error_msg = f"Error during cleanup: {str(e)}"
810
- log.error(error_msg, exc_info=True)
811
- _emit_progress(progress_cb, -1, f"CLEANUP ERROR: {error_msg}")
812
- raise MatAnyError(error_msg) from e
813
-
814
- except Exception as e:
815
- error_msg = f"Error during video processing: {str(e)}"
816
- log.error(error_msg, exc_info=True)
817
- if torch.cuda.is_available():
818
- error_msg += f"\nGPU Memory: {torch.cuda.memory_allocated()/1024**2:.1f}MB allocated"
819
- _emit_progress(progress_cb, -1, error_msg)
820
- raise MatAnyError(error_msg) from e
821
- else:
822
- # Frame-by-frame (preferred)
823
- log.info(f"[MATANY] Using frame-by-frame mode: {self._api_mode}")
824
- _emit_progress(progress_cb, 0.1, f"Using {self._api_mode} mode (frame-by-frame)")
825
-
826
  cap = cv2.VideoCapture(str(video_path))
827
- alpha_path = out_dir / "alpha.mp4"
828
- fg_path = out_dir / "fg.mp4"
829
-
830
- # Initialize video writers
831
- _emit_progress(progress_cb, 0.12, "Initializing video writers...")
832
- alpha_writer = cv2.VideoWriter(
833
- str(alpha_path),
834
- cv2.VideoWriter_fourcc(*'mp4v'),
835
- fps,
836
- (W, H),
837
- isColor=False
838
- )
839
- fg_writer = cv2.VideoWriter(
840
- str(fg_path),
841
- cv2.VideoWriter_fourcc(*'mp4v'),
842
- fps,
843
- (W, H),
844
- isColor=True
845
- )
846
-
847
  if not alpha_writer.isOpened() or not fg_writer.isOpened():
848
- raise MatAnyError("Failed to initialize video writers")
 
 
 
 
 
 
 
 
 
849
 
850
  try:
851
- # Load seed mask if provided
852
- seed_1hw = None
853
- if seed_mask_path is not None:
854
- seed_1hw = _read_mask_hw(seed_mask_path, (H, W))
855
-
856
- idx = 0
857
- last_progress_update = 0
858
- frame_times = []
859
- start_time = time.time()
860
-
861
- try:
862
- while True:
863
- ret, frame = cap.read()
864
- if not ret:
865
- break
866
-
867
- frame_start_time = time.time()
868
-
869
- # Update progress more frequently (every 1% or 5 frames, whichever is more frequent)
870
- current_progress = (idx / N) if N > 0 else 0.0
871
- if idx % max(5, N//100) == 0 or time.time() - last_progress_update > 2.0:
872
- # Calculate progress metrics
873
- elapsed = time.time() - start_time
874
- if idx > 0 and current_progress > 0:
875
- # Calculate ETA
876
- eta_seconds = (elapsed / current_progress) * (1 - current_progress)
877
- if eta_seconds > 3600:
878
- eta_str = f"{eta_seconds/3600:.1f} hours"
879
- elif eta_seconds > 60:
880
- eta_str = f"{eta_seconds/60:.1f} minutes"
881
- else:
882
- eta_str = f"{eta_seconds:.0f} seconds"
883
-
884
- # Calculate processing speed
885
- fps = idx / elapsed if elapsed > 0 else 0
886
-
887
- # Add GPU memory info if available
888
- gpu_info = ""
889
- if torch.cuda.is_available():
890
- mem_alloc = torch.cuda.memory_allocated() / 1024**2
891
- mem_cached = torch.cuda.memory_reserved() / 1024**2
892
- gpu_info = f" | GPU: {mem_alloc:.1f}/{mem_cached:.1f}MB"
893
-
894
- status = (f"Processing frame {idx+1}/{N} (ETA: {eta_str}, "
895
- f"{fps:.1f} FPS{gpu_info}")
896
- _emit_progress(progress_cb, min(0.99, current_progress), status)
897
- last_progress_update = time.time()
898
-
899
- # Process frame
900
- log.debug(f"[MATANY] Processing frame {idx+1}/{N}")
901
- # Only pass seed mask on first frame
902
- current_mask = seed_1hw if idx == 0 else None
903
- alpha_hw = self._run_frame(frame, current_mask, is_first=(idx == 0))
904
-
905
- # Calculate frame processing time
906
- frame_time = time.time() - frame_start_time
907
- frame_times.append(frame_time)
908
- if len(frame_times) > 10: # Keep last 10 frame times for average
909
- frame_times.pop(0)
910
-
911
- # Log GPU memory usage occasionally
912
- if idx % 50 == 0 and torch.cuda.is_available():
913
- log.info(f"[GPU] Memory allocated: {torch.cuda.memory_allocated()/1024**2:.1f}MB, "
914
- f"Cached: {torch.cuda.memory_reserved()/1024**2:.1f}MB, "
915
- f"Avg frame time: {sum(frame_times)/len(frame_times)*1000:.1f}ms")
916
-
917
- # Compose output frames
918
- alpha_u8 = (alpha_hw * 255.0 + 0.5).astype(np.uint8)
919
- alpha_rgb = cv2.cvtColor(alpha_u8, cv2.COLOR_GRAY2BGR)
920
- fg_bgr = (frame.astype(np.float32) * (alpha_hw[..., None] / 255.0)).astype(np.uint8)
921
-
922
- # Write outputs
923
- alpha_writer.write(alpha_rgb)
924
- fg_writer.write(fg_bgr)
925
- idx += 1
926
-
927
- except Exception as e:
928
- # Log detailed error information
929
- error_msg = f"Error processing frame {idx+1}/{N}: {str(e)}"
930
- log.error(error_msg, exc_info=True)
931
-
932
- # Add GPU memory info if available
933
- if torch.cuda.is_available():
934
- mem_alloc = torch.cuda.memory_allocated() / 1024**2
935
- mem_cached = torch.cuda.memory_reserved() / 1024**2
936
- error_msg += (f"\nGPU Memory - Allocated: {mem_alloc:.1f}MB, "
937
- f"Cached: {mem_cached:.1f}MB")
938
-
939
- # Add frame processing stats
940
- if self._frame_times:
941
- avg_time = sum(self._frame_times) / len(self._frame_times)
942
- error_msg += f"\nAvg frame time: {avg_time*1000:.1f}ms"
943
-
944
- _emit_progress(progress_cb, -1, f"ERROR: {error_msg}")
945
- raise MatAnyError(error_msg) from e
946
-
947
- finally:
948
- # Cleanup resources
949
- # Cleanup resources in a single finally block
950
- try:
951
- if 'cap' in locals() and cap is not None:
952
- if hasattr(cap, 'isOpened') and cap.isOpened():
953
- cap.release()
954
- if 'alpha_writer' in locals() and alpha_writer is not None:
955
- if hasattr(alpha_writer, 'isOpened') and alpha_writer.isOpened():
956
- alpha_writer.release()
957
- if 'fg_writer' in locals() and fg_writer is not None:
958
- if hasattr(fg_writer, 'isOpened') and fg_writer.isOpened():
959
- fg_writer.release()
960
-
961
- # Log final stats
962
- total_time = time.time() - start_time
963
- fps = idx / total_time if total_time > 0 else 0
964
-
965
- # Log GPU memory info if available
966
- gpu_info = ""
967
  if torch.cuda.is_available():
968
- mem_alloc = torch.cuda.memory_allocated() / 1024**2
969
- mem_cached = torch.cuda.memory_reserved() / 1024**2
970
- gpu_info = f"\nGPU Memory - Allocated: {mem_alloc:.1f}MB, Cached: {mem_cached:.1f}MB"
971
-
972
- log.info(
973
- f"[MATANY] Processed {idx} frames in {total_time:.1f}s ({fps:.1f} FPS){gpu_info}"
974
- )
975
-
976
- # Validate outputs
977
- _validate_nonempty(alpha_path)
978
- _validate_nonempty(fg_path)
979
-
980
- # Final progress update
981
- _emit_progress(
982
- progress_cb,
983
- 1.0,
984
- f"Complete! Processed {idx} frames at {fps:.1f} FPS{gpu_info}"
985
- )
986
-
987
- return alpha_path, fg_path
988
-
989
- except Exception as e:
990
- error_msg = f"Error during cleanup: {str(e)}"
991
- log.error(error_msg, exc_info=True)
992
- _emit_progress(progress_cb, -1, f"CLEANUP ERROR: {error_msg}")
993
- raise MatAnyError(error_msg) from e
994
- finally:
995
- # Ensure all resources are cleaned up
996
- if 'cap' in locals() and cap is not None:
997
- if hasattr(cap, 'release'):
998
- cap.release()
999
- if 'alpha_writer' in locals() and alpha_writer is not None:
1000
- if hasattr(alpha_writer, 'release'):
1001
- alpha_writer.release()
1002
- if 'fg_writer' in locals() and fg_writer is not None:
1003
- if hasattr(fg_writer, 'release'):
1004
- fg_writer.release()
1005
- _safe_empty_cache()
1006
 
1007
- def _flush_chunk(self, frames_bgr, seed_1hw, alpha_writer, fg_writer):
 
 
 
 
 
 
 
 
 
 
 
 
 
1008
  """
1009
- Process an in-memory batch (list of uint8 BGR frames), write results via writers.
1010
- Strong CUDA guards + cleanup.
1011
  """
1012
- device = self.device
1013
- use_fp16 = (device.type == "cuda") and getattr(self, "use_fp16", True)
1014
  mode = _select_matany_mode(self._core)
1015
-
1016
- frames_04chw = None
1017
- alpha_n1hw = None
1018
- fg_n3hw = None
1019
-
1020
- try:
1021
- frames_04chw = _to_device_batch(frames_bgr, device, dtype=torch.float16 if use_fp16 else torch.float32)
1022
-
1023
- if device.type == "cuda":
1024
- stream = torch.cuda.Stream()
1025
- with torch.cuda.stream(stream):
1026
- with torch.autocast(device_type="cuda", enabled=use_fp16):
1027
- alpha_n1hw, fg_n3hw = _matany_run(self._core, mode, frames_04chw, seed_1hw, use_fp16)
1028
- stream.synchronize()
1029
- else:
1030
- alpha_n1hw, fg_n3hw = _matany_run(self._core, mode, frames_04chw, seed_1hw, use_fp16)
1031
-
1032
- alpha_cpu, fg_cpu = _to_uint8_cpu(alpha_n1hw, fg_n3hw)
1033
-
1034
- for i in range(alpha_cpu.shape[0]):
1035
- alpha_writer.write(alpha_cpu[i]) # [H,W] uint8
1036
- fg_writer.write(fg_cpu[i][..., ::-1].copy()) # RGB->BGR
1037
-
1038
- if hasattr(self._core, "last_mask"):
1039
- self._last_alpha_1hw = self._core.last_mask
1040
-
1041
- except torch.cuda.OutOfMemoryError as e:
1042
- snap = _cuda_snapshot()
1043
- _safe_empty_cache()
1044
- # Re-raise with context for pipeline to catch
1045
- raise MatAnyError(f"CUDA OOM in _flush_chunk | {snap}") from e
1046
-
1047
- except Exception as e:
1048
- snap = _cuda_snapshot()
1049
- raise MatAnyError(f"MatAnyone failure in _flush_chunk: {e} | {snap}") from e
1050
-
1051
- finally:
1052
- # ensure we release heavy tensors
1053
- try:
1054
- del alpha_n1hw, fg_n3hw, frames_04chw
1055
- except Exception:
1056
- pass
1057
- _safe_empty_cache()
1058
-
1059
- def process_stream(self, frames_iterable, seed_1hw, alpha_writer, fg_writer, chunk_size=32):
1060
  """
1061
  Buffer frames from iterable and process in chunks.
1062
  On OOM, retry once with half chunk size; otherwise bubble up MatAnyError.
1063
  """
1064
- frames_buf = []
1065
  try:
1066
  for f in frames_iterable:
1067
  frames_buf.append(f)
@@ -1069,10 +539,8 @@ def process_stream(self, frames_iterable, seed_1hw, alpha_writer, fg_writer, chu
1069
  try:
1070
  self._flush_chunk(frames_buf, seed_1hw, alpha_writer, fg_writer)
1071
  frames_buf.clear()
1072
- except torch.cuda.OutOfMemoryError:
1073
- # should be wrapped above, but double-guard
1074
- raise
1075
- except MatAnyError as inner:
1076
  # one-time downshift
1077
  if chunk_size > 4:
1078
  half = max(4, chunk_size // 2)
@@ -1081,19 +549,14 @@ def process_stream(self, frames_iterable, seed_1hw, alpha_writer, fg_writer, chu
1081
  self._flush_chunk(sub, seed_1hw, alpha_writer, fg_writer)
1082
  frames_buf.clear()
1083
  else:
1084
- raise inner
1085
 
1086
  if frames_buf:
1087
  self._flush_chunk(frames_buf, seed_1hw, alpha_writer, fg_writer)
1088
  frames_buf.clear()
1089
 
1090
- except torch.cuda.OutOfMemoryError as e:
1091
- snap = _cuda_snapshot()
1092
- _safe_empty_cache()
1093
- raise MatAnyError(f"CUDA OOM in process_stream outer | {snap}") from e
1094
-
1095
  except Exception as e:
1096
- raise MatAnyError(f"Unexpected error in process_stream: {e}") from e
1097
 
1098
  finally:
1099
  frames_buf.clear()
 
2
  """
3
  MatAnyone Adapter (streaming, API-agnostic)
4
  -------------------------------------------
5
+ - Supports multiple MatAnyone variants:
6
+ * frame API: core.step(image[, mask]) or core.process_frame(image, mask)
7
+ * video API: core.process_video(video_path[, mask_path])
8
  - Streams frames: no full-video-in-RAM.
9
+ - Emits alpha.mp4 (grayscale-as-BGR for compatibility) and fg.mp4 (RGB-on-black) as it goes.
10
  - Validates outputs and raises MatAnyError on failure (so pipeline can fallback).
11
 
12
  I/O conventions:
 
21
  import os
22
  import cv2
23
  import sys
 
 
24
  import time
25
+ import glob
26
+ import shutil
27
  import torch
28
  import logging
 
29
  import numpy as np
30
  from pathlib import Path
31
+ from typing import Optional, Callable, Tuple, List, Union
32
 
33
  log = logging.getLogger(__name__)
34
 
35
+
36
+ # -----------------------------
37
+ # Small utilities
38
+ # -----------------------------
39
  def _emit_progress(cb, pct: float, msg: str):
40
  if not cb:
41
  return
 
45
  try:
46
  cb(msg) # legacy 1-arg
47
  except TypeError:
48
+ pass
49
+
50
 
51
  class MatAnyError(RuntimeError):
52
  """Custom exception for MatAnyone processing errors."""
53
  pass
54
 
55
 
56
+ def _cuda_snapshot(device: Optional[torch.device] = None) -> str:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  if not torch.cuda.is_available():
58
  return "CUDA: N/A"
59
+ idx = 0
60
+ if device is not None and isinstance(device, torch.device) and device.index is not None:
61
+ idx = device.index
62
+ name = torch.cuda.get_device_name(idx)
63
+ alloc = torch.cuda.memory_allocated(idx) / 1e9
64
+ resv = torch.cuda.memory_reserved(idx) / 1e9
65
+ return f"device={idx}, name={name}, alloc={alloc:.2f}GB, reserved={resv:.2f}GB"
66
 
67
 
68
  def _safe_empty_cache():
 
74
  torch.cuda.empty_cache()
75
 
76
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  def _read_mask_hw(mask_path: Path, target_hw: Tuple[int, int]) -> np.ndarray:
78
  """Read mask image, convert to float32 [0,1], resize to target (H,W)."""
79
  if not Path(mask_path).exists():
 
90
 
91
def _to_chw01(img_bgr: np.ndarray) -> np.ndarray:
    """Convert a BGR uint8 [H,W,3] frame to RGB float32 CHW in [0,1]."""
    as_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
    scaled = as_rgb.astype(np.float32) / 255.0
    return np.transpose(scaled, (2, 0, 1))  # HWC -> CHW
97
 
98
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
  def _validate_nonempty(file_path: Path) -> None:
100
  if not file_path.exists() or file_path.stat().st_size == 0:
101
  raise MatAnyError(f"Output file missing/empty: {file_path}")
102
 
103
 
104
+ def _select_matany_mode(core) -> str:
105
+ """
106
+ Pick the best-available MatAnyone API at runtime.
107
+ Priority: process_video > process_frame > step
108
+ """
109
+ if hasattr(core, "process_video") and callable(getattr(core, "process_video")):
110
+ return "process_video"
111
+ if hasattr(core, "process_frame") and callable(getattr(core, "process_frame")):
112
+ return "process_frame"
113
+ if hasattr(core, "step") and callable(getattr(core, "step")):
114
+ return "step"
115
+ raise MatAnyError("No supported MatAnyone API on core (process_video/process_frame/step).")
116
+
117
+
118
+ # -----------------------------
119
+ # Main session
120
+ # -----------------------------
121
  class MatAnyoneSession:
122
  """
123
  Unified, streaming wrapper over MatAnyone variants.
124
 
125
  Public:
126
  - process_stream(video_path, seed_mask_path, out_dir, progress_cb)
127
+ -> returns (alpha_path, fg_path)
128
 
129
+ Private helper:
130
+ - _process_stream_chunks(frames_iterable, seed_1hw, alpha_writer, fg_writer, chunk_size)
 
131
  """
132
 
133
    def __init__(self, device: Optional[str] = None, precision: str = "auto"):
        """
        Initialize the session and eagerly load the MatAnyone core.

        Args:
            device: 'cuda', 'cpu', 'cuda:0', etc. If None, auto-detects CUDA.
            precision: 'auto' | 'fp32' | 'fp16'

        Raises:
            MatAnyError: if MatAnyone cannot be imported or exposes no usable API
                (propagated from _lazy_init).
        """
        # Explicit device string wins; otherwise prefer CUDA when present.
        self.device = torch.device(device) if device else (torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu"))
        self.precision = precision.lower()
        # fp16 when explicitly requested, or in 'auto' mode on a CUDA device.
        self.use_fp16 = (self.precision == "fp16") or (self.precision == "auto" and self.device.type == "cuda")
        self._core = None  # MatAnyone InferenceCore, set by _lazy_init
        self._api_mode = None  # "process_video" | "process_frame" | "step"
        self._initialized = False
        self._lazy_init()

        log.info(f"Initialized MatAnyoneSession on {self.device} | precision={self.precision}, use_fp16={self.use_fp16}")
        if torch.cuda.is_available():
            # Un-indexed torch.device('cuda') has index None -> report device 0.
            idx = self.device.index if isinstance(self.device, torch.device) and self.device.index is not None else 0
            log.info(f"CUDA device: {torch.cuda.get_device_name(idx)}")
            self._log_gpu_memory()
152
 
153
+ # ---- internals ----
154
+ def _log_gpu_memory(self) -> Tuple[float, float]:
155
  if torch.cuda.is_available():
156
+ idx = self.device.index if isinstance(self.device, torch.device) and self.device.index is not None else 0
157
  try:
158
+ allocated = torch.cuda.memory_allocated(idx) / 1024**2
159
+ reserved = torch.cuda.memory_reserved(idx) / 1024**2
160
+ log.info(f"GPU Memory - Allocated: {allocated:.1f}MB, Reserved: {reserved:.1f}MB")
161
+ return allocated, reserved
162
  except Exception as e:
163
+ log.warning(f"Failed to read GPU memory: {e}")
164
  return 0.0, 0.0
165
+
166
    def _lazy_init(self) -> None:
        """Import and initialize the MatAnyone InferenceCore and choose API mode.

        Sets self._core, self._api_mode, self._initialized.
        Raises MatAnyError if import fails or no supported API is found.
        """
        try:
            from matanyone.inference.inference_core import InferenceCore  # type: ignore
        except ImportError as e:
            raise MatAnyError(f"Failed to import MatAnyone: {e}. Ensure it's installed and on PYTHONPATH.")
        except Exception as e:
            raise MatAnyError(f"Unexpected error during MatAnyone import: {e}")

        # Some wheels accept zero-arg, some require a repo-id; try both
        try:
            self._core = InferenceCore()
        except TypeError:
            self._core = InferenceCore("PeiqingYang/MatAnyone")

        # Mode selection (env flags can influence)
        force_video = os.getenv("MATANY_FORCE_VIDEO", "1") == "1"
        force_step = os.getenv("MATANY_FORCE_STEP", "0") == "1"

        if force_step and hasattr(self._core, "step"):
            self._api_mode = "step"
        else:
            mode = _select_matany_mode(self._core)
            # NOTE(review): _select_matany_mode already prefers "process_video"
            # when it is callable, so mode != "process_video" with a
            # process_video attribute implies a non-callable attr; this branch
            # looks effectively dead — confirm against the installed wheel.
            if force_video and mode != "process_video" and hasattr(self._core, "process_video"):
                self._api_mode = "process_video"
            else:
                self._api_mode = mode

        log.info(f"[MATANY] API mode selected: {self._api_mode}")
        self._initialized = True
196
 
197
  def _maybe_amp(self):
198
+ enabled = (self.device.type == "cuda")
199
  if self.precision == "fp32":
200
  return torch.amp.autocast(device_type="cuda", enabled=False)
201
  if self.precision == "fp16":
202
+ return torch.amp.autocast(device_type="cuda", enabled=enabled, dtype=torch.float16)
203
+ # auto
204
+ return torch.amp.autocast(device_type="cuda", enabled=enabled and self.use_fp16)
205
 
206
  def _validate_input_frame(self, frame: np.ndarray) -> None:
 
207
  if not isinstance(frame, np.ndarray):
208
+ raise MatAnyError(f"Frame must be numpy.ndarray, got {type(frame)}")
209
  if frame.dtype != np.uint8:
210
  raise MatAnyError(f"Frame must be uint8, got {frame.dtype}")
211
  if frame.ndim != 3 or frame.shape[2] != 3:
212
+ raise MatAnyError(f"Frame must be HWC with 3 channels, got {frame.shape}")
213
 
214
+ def _run_frame(self, frame_bgr: np.ndarray, seed_1hw: Optional[np.ndarray], is_first: bool) -> np.ndarray:
215
  """
216
+ Returns alpha matte as 2D np.float32 in [0,1].
 
 
 
 
 
 
 
 
 
 
 
 
217
  """
218
+ self._validate_input_frame(frame_bgr)
219
+
220
+ img_chw = _to_chw01(frame_bgr) # (3,H,W) float32 [0,1]
221
  img_t = torch.from_numpy(img_chw).to(self.device)
222
+
 
223
  mask_t = None
224
  if is_first and seed_1hw is not None:
225
  if seed_1hw.ndim == 3 and seed_1hw.shape[0] == 1:
226
+ seed_hw = seed_1hw[0]
227
  elif seed_1hw.ndim == 2:
228
  seed_hw = seed_1hw
229
  else:
230
  raise MatAnyError(f"seed mask must be 1HW or HW; got {seed_1hw.shape}")
231
+ mask_t = torch.from_numpy(seed_hw).to(self.device)
 
 
 
 
 
 
232
 
233
+ # dispatch
234
+ frame_start = time.time()
235
  try:
236
  with torch.no_grad(), self._maybe_amp():
237
  if self._api_mode == "step":
238
+ out = self._core.step(img_t, mask_t) if mask_t is not None else self._core.step(img_t)
239
  elif self._api_mode == "process_frame":
240
+ out = self._core.process_frame(img_t, mask_t)
241
  else:
242
+ raise MatAnyError("Internal error: _run_frame used in non-frame mode")
243
+
244
+ except torch.cuda.OutOfMemoryError as e:
245
+ snap = _cuda_snapshot(self.device)
 
 
 
 
 
 
 
 
 
 
 
246
  self._log_gpu_memory()
247
+ raise MatAnyError(f"CUDA OOM while processing frame | {snap}") from e
248
  except RuntimeError as e:
249
  if "CUDA" in str(e):
250
+ snap = _cuda_snapshot(self.device)
251
  self._log_gpu_memory()
252
+ raise MatAnyError(f"CUDA runtime error: {e} | {snap}") from e
253
+ raise MatAnyError(f"Runtime error: {e}") from e
254
  except Exception as e:
255
+ raise MatAnyError(f"Processing failed: {e}") from e
256
+ finally:
257
+ # optional: track times / stats (omitted to keep adapter slim)
258
+ pass
259
 
260
+ # Normalize to 2D numpy [0,1]
261
+ if isinstance(out, torch.Tensor):
262
+ alpha_np = out.detach().float().clamp(0, 1).squeeze().cpu().numpy()
 
263
  else:
264
+ alpha_np = np.asarray(out, dtype=np.float32)
265
  if alpha_np.max() > 1.0:
266
+ alpha_np = alpha_np / 255.0
 
 
267
  alpha_np = np.squeeze(alpha_np)
268
  if alpha_np.ndim != 2:
269
  raise MatAnyError(f"Expected 2D alpha matte; got shape {alpha_np.shape}")
270
+
271
+ return alpha_np.astype(np.float32)
272
 
273
  def _harvest_process_video_output(self, res, out_dir: Path, base: str) -> Tuple[Path, Path]:
274
  """
275
  Accepts varied return types from MatAnyone.process_video and produces
276
+ (alpha.mp4, fg.mp4) inside out_dir.
277
+ Strategy: prefer path returns; as a last resort, glob common output dirs.
278
+ NOTE: If backend returns arrays only, we raise (cannot reconstruct FG here).
 
279
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
280
  alpha_mp4 = out_dir / "alpha.mp4"
281
  fg_mp4 = out_dir / "fg.mp4"
282
 
283
+ # Dict style: look for common keys
 
 
284
  if isinstance(res, dict):
285
  cand_alpha = res.get("alpha") or res.get("alpha_path") or res.get("matte") or res.get("matte_path")
286
  cand_fg = res.get("fg") or res.get("fg_path") or res.get("foreground") or res.get("foreground_path")
 
289
  shutil.copy2(cand_alpha, alpha_mp4); moved += 1
290
  if cand_fg and Path(cand_fg).exists():
291
  shutil.copy2(cand_fg, fg_mp4); moved += 1
292
+ if moved == 2:
293
+ return alpha_mp4, fg_mp4
294
 
295
+ # Tuple/list of paths
296
  if isinstance(res, (list, tuple)) and len(res) >= 1:
 
297
  paths = [Path(x) for x in res if isinstance(x, (str, Path))]
298
  if paths:
 
299
  alpha_candidates = [p for p in paths if p.exists() and ("alpha" in p.name or "matte" in p.name)]
300
  fg_candidates = [p for p in paths if p.exists() and ("fg" in p.name or "fore" in p.name)]
301
  if alpha_candidates and fg_candidates:
 
303
  shutil.copy2(fg_candidates[0], fg_mp4)
304
  return alpha_mp4, fg_mp4
305
 
306
+ # Fallback: glob common dirs
307
  search_dirs = [Path.cwd(), out_dir, Path("results"), Path("result"), Path("output"), Path("outputs")]
308
+ hits: List[Path] = []
309
  for d in search_dirs:
310
  if d.exists():
311
  hits.extend(list(d.rglob(f"*{base}*.*")))
 
312
  alpha_candidates = [p for p in hits if p.suffix.lower() in (".mp4",".mov",".mkv",".avi") and ("alpha" in p.name or "matte" in p.name)]
313
  fg_candidates = [p for p in hits if p.suffix.lower() in (".mp4",".mov",".mkv",".avi") and ("fg" in p.name or "fore" in p.name)]
314
  if alpha_candidates and fg_candidates:
 
315
  shutil.copy2(alpha_candidates[0], alpha_mp4)
316
  shutil.copy2(fg_candidates[0], fg_mp4)
317
  return alpha_mp4, fg_mp4
318
 
319
+ # If we got arrays only, we cannot reconstruct FG here (we'd need to replay frames)
320
+ raise MatAnyError("MatAnyone.process_video did not yield discoverable output paths.")
321
 
322
+ # -----------------------------
323
+ # Public API
324
+ # -----------------------------
325
  def process_stream(
326
  self,
327
  video_path: Path,
 
329
  out_dir: Optional[Path] = None,
330
  progress_cb: Optional[Callable] = None,
331
  ) -> Tuple[Path, Path]:
332
+ """
333
+ Process a video with MatAnyone.
 
 
 
 
 
334
 
335
  Returns:
336
+ (alpha_path, fg_path)
337
 
338
  Raises:
339
+ MatAnyError / FileNotFoundError / ValueError
 
 
340
  """
341
+ video_path = Path(video_path)
342
  if not video_path.exists():
343
  raise FileNotFoundError(f"Input video not found: {video_path}")
 
 
 
 
344
  if out_dir is None:
345
  out_dir = video_path.parent
 
346
  out_dir = Path(out_dir)
347
  out_dir.mkdir(parents=True, exist_ok=True)
 
 
 
 
 
348
 
349
+ _emit_progress(progress_cb, 0.0, "Initializing video processing...")
 
 
 
 
350
 
351
+ # Inspect video
352
+ cap_probe = cv2.VideoCapture(str(video_path))
353
+ if not cap_probe.isOpened():
354
  raise MatAnyError(f"Failed to open video: {video_path}")
355
+ N = int(cap_probe.get(cv2.CAP_PROP_FRAME_COUNT))
356
+ fps = cap_probe.get(cv2.CAP_PROP_FPS) or 25.0
357
+ W = int(cap_probe.get(cv2.CAP_PROP_FRAME_WIDTH))
358
+ H = int(cap_probe.get(cv2.CAP_PROP_FRAME_HEIGHT))
359
+ cap_probe.release()
360
 
361
+ log.info(f"[MATANY] {video_path.name}: {N} frames {W}x{H} @ {fps:.2f} fps")
362
+ _emit_progress(progress_cb, 0.05, f"Video: {N} frames {W}x{H} @ {fps:.2f} fps")
 
 
 
363
 
364
+ # If full-video API exists, prefer it
365
+ alpha_path = out_dir / "alpha.mp4"
366
+ fg_path = out_dir / "fg.mp4"
367
+ t0 = time.time()
368
 
369
  try:
370
  if self._api_mode == "process_video":
371
+ _emit_progress(progress_cb, 0.10, "Using MatAnyone video mode")
 
 
 
372
  if torch.cuda.is_available():
373
+ self._log_gpu_memory()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
374
 
375
+ # Some builds accept (video, mask), some only (video)
376
  try:
377
+ res = self._core.process_video(
378
+ str(video_path),
379
+ str(seed_mask_path) if seed_mask_path is not None else None
380
+ )
381
+ except TypeError as e:
382
+ if "takes 2 positional arguments but 3 were given" in str(e):
383
+ res = self._core.process_video(str(video_path))
384
+ else:
385
+ raise
386
+
387
+ _emit_progress(progress_cb, 0.90, "Processing complete, collecting outputs…")
388
+ alpha_path, fg_path = self._harvest_process_video_output(res, out_dir, base=video_path.stem)
389
+ _validate_nonempty(alpha_path)
390
+ _validate_nonempty(fg_path)
391
+ _emit_progress(progress_cb, 1.0, "Done!")
392
+ return alpha_path, fg_path
393
+
394
+ # -----------------------------
395
+ # Frame-by-frame streaming path
396
+ # -----------------------------
397
+ _emit_progress(progress_cb, 0.10, f"Using {self._api_mode} (frame-by-frame)")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
398
  cap = cv2.VideoCapture(str(video_path))
399
+ if not cap.isOpened():
400
+ raise MatAnyError(f"Failed to open video for reading: {video_path}")
401
+
402
+ # Writers (alpha as BGR grayscale for broad mp4v compatibility)
403
+ fourcc = cv2.VideoWriter_fourcc(*'mp4v')
404
+ alpha_writer = cv2.VideoWriter(str(alpha_path), fourcc, fps, (W, H), True) # isColor=True
405
+ fg_writer = cv2.VideoWriter(str(fg_path), fourcc, fps, (W, H), True)
 
 
 
 
 
 
 
 
 
 
 
 
 
406
  if not alpha_writer.isOpened() or not fg_writer.isOpened():
407
+ raise MatAnyError("Failed to initialize VideoWriter(s)")
408
+
409
+ # Optional seed mask
410
+ seed_1hw = None
411
+ if seed_mask_path is not None:
412
+ seed_1hw = _read_mask_hw(Path(seed_mask_path), (H, W))
413
+
414
+ idx = 0
415
+ last_tick = time.time()
416
+ start = time.time()
417
 
418
  try:
419
+ while True:
420
+ ret, frame = cap.read()
421
+ if not ret:
422
+ break
423
+
424
+ current_mask = seed_1hw if idx == 0 else None
425
+ alpha_hw = self._run_frame(frame, current_mask, is_first=(idx == 0))
426
+
427
+ # Compose outputs
428
+ alpha_u8 = (alpha_hw * 255.0 + 0.5).astype(np.uint8)
429
+ alpha_bgr = cv2.cvtColor(alpha_u8, cv2.COLOR_GRAY2BGR)
430
+ # IMPORTANT: alpha_hw already [0,1]
431
+ fg_bgr = (frame.astype(np.float32) * alpha_hw[..., None]).clip(0, 255).astype(np.uint8)
432
+
433
+ alpha_writer.write(alpha_bgr)
434
+ fg_writer.write(fg_bgr)
435
+
436
+ idx += 1
437
+ # progress & ETA
438
+ if N > 0 and (idx % max(5, N // 100) == 0 or (time.time() - last_tick) > 2.0):
439
+ elapsed = time.time() - start
440
+ prog = idx / max(1, N)
441
+ eta_s = (elapsed / prog) * (1.0 - prog) if prog > 0 else 0.0
442
+ if eta_s > 3600:
443
+ eta = f"{eta_s/3600:.1f} h"
444
+ elif eta_s > 60:
445
+ eta = f"{eta_s/60:.1f} m"
446
+ else:
447
+ eta = f"{eta_s:.0f} s"
448
+ fps_run = idx / elapsed if elapsed > 0 else 0.0
449
+ gpu_tail = ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
450
  if torch.cuda.is_available():
451
+ idx_dev = self.device.index if self.device.index is not None else 0
452
+ mem_a = torch.cuda.memory_allocated(idx_dev) / 1024**2
453
+ mem_r = torch.cuda.memory_reserved(idx_dev) / 1024**2
454
+ gpu_tail = f" | GPU {mem_a:.0f}/{mem_r:.0f}MB"
455
+ _emit_progress(progress_cb, min(0.99, prog), f"Frame {idx}/{N} • {fps_run:.1f} FPS • ETA {eta}{gpu_tail}")
456
+ last_tick = time.time()
457
+
458
+ # finalize
459
+ _validate_nonempty(alpha_path)
460
+ _validate_nonempty(fg_path)
461
+ total = time.time() - start
462
+ fps_run = idx / total if total > 0 else 0.0
463
+ _emit_progress(progress_cb, 1.0, f"Complete! {idx} frames at {fps_run:.1f} FPS")
464
+ return alpha_path, fg_path
465
+
466
+ finally:
467
+ try:
468
+ if cap and hasattr(cap, "isOpened") and cap.isOpened():
469
+ cap.release()
470
+ except Exception:
471
+ pass
472
+ try:
473
+ if alpha_writer:
474
+ alpha_writer.release()
475
+ except Exception:
476
+ pass
477
+ try:
478
+ if fg_writer:
479
+ fg_writer.release()
480
+ except Exception:
481
+ pass
482
+ _safe_empty_cache()
 
 
 
 
 
 
483
 
484
+ except Exception as e:
485
+ msg = f"Error during video processing: {e}"
486
+ log.error(msg, exc_info=True)
487
+ if torch.cuda.is_available():
488
+ msg += f" | {_cuda_snapshot(self.device)}"
489
+ _emit_progress(progress_cb, -1, msg)
490
+ raise MatAnyError(msg) from e
491
+
492
+ # -----------------------------
493
+ # Private chunk helper (not used by public API in this file,
494
+ # but available if your pipeline wants to feed frames itself)
495
+ # -----------------------------
496
+ def _flush_chunk(self, frames_bgr: List[np.ndarray], seed_1hw: Optional[np.ndarray],
497
+ alpha_writer: cv2.VideoWriter, fg_writer: cv2.VideoWriter):
498
  """
499
+ Process an in-memory batch (list of uint8 BGR frames) and write results.
500
+ This path assumes a core that can process batches; if not, it falls back per-frame.
501
  """
 
 
502
  mode = _select_matany_mode(self._core)
503
+ # If the core doesn't support tensor-batch processing, go per-frame
504
+ if mode in ("process_frame", "step"):
505
+ for i, frame in enumerate(frames_bgr):
506
+ alpha_hw = self._run_frame(frame, seed_1hw if i == 0 else None, is_first=(i == 0))
507
+ alpha_u8 = (alpha_hw * 255.0 + 0.5).astype(np.uint8)
508
+ alpha_bgr = cv2.cvtColor(alpha_u8, cv2.COLOR_GRAY2BGR)
509
+ fg_bgr = (frame.astype(np.float32) * alpha_hw[..., None]).clip(0, 255).astype(np.uint8)
510
+ alpha_writer.write(alpha_bgr)
511
+ fg_writer.write(fg_bgr)
512
+ return
513
+
514
+ # If we reach here, assume a tensor-video code path exists (rare in released wheels).
515
+ # For safety we still fallback per-frame because API signatures vary wildly.
516
+ for i, frame in enumerate(frames_bgr):
517
+ alpha_hw = self._run_frame(frame, seed_1hw if i == 0 else None, is_first=(i == 0))
518
+ alpha_u8 = (alpha_hw * 255.0 + 0.5).astype(np.uint8)
519
+ alpha_bgr = cv2.cvtColor(alpha_u8, cv2.COLOR_GRAY2BGR)
520
+ fg_bgr = (frame.astype(np.float32) * alpha_hw[..., None]).clip(0, 255).astype(np.uint8)
521
+ alpha_writer.write(alpha_bgr)
522
+ fg_writer.write(fg_bgr)
523
+
524
+ def _process_stream_chunks(self,
525
+ frames_iterable,
526
+ seed_1hw: Optional[np.ndarray],
527
+ alpha_writer: cv2.VideoWriter,
528
+ fg_writer: cv2.VideoWriter,
529
+ chunk_size: int = 32):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
530
  """
531
  Buffer frames from iterable and process in chunks.
532
  On OOM, retry once with half chunk size; otherwise bubble up MatAnyError.
533
  """
534
+ frames_buf: List[np.ndarray] = []
535
  try:
536
  for f in frames_iterable:
537
  frames_buf.append(f)
 
539
  try:
540
  self._flush_chunk(frames_buf, seed_1hw, alpha_writer, fg_writer)
541
  frames_buf.clear()
542
+ except torch.cuda.OutOfMemoryError as e:
543
+ _safe_empty_cache()
 
 
544
  # one-time downshift
545
  if chunk_size > 4:
546
  half = max(4, chunk_size // 2)
 
549
  self._flush_chunk(sub, seed_1hw, alpha_writer, fg_writer)
550
  frames_buf.clear()
551
  else:
552
+ raise MatAnyError(f"CUDA OOM in _process_stream_chunks | {_cuda_snapshot(self.device)}") from e
553
 
554
  if frames_buf:
555
  self._flush_chunk(frames_buf, seed_1hw, alpha_writer, fg_writer)
556
  frames_buf.clear()
557
 
 
 
 
 
 
558
  except Exception as e:
559
+ raise MatAnyError(f"Unexpected error in _process_stream_chunks: {e}") from e
560
 
561
  finally:
562
  frames_buf.clear()