Spaces:

MogensR
/

VideoBackgroundReplacer

Paused

App Files Files Community

MogensR committed on Aug 30, 2025

Commit

6095d82

1 Parent(s): b4ba9ec

Update models/loaders/matanyone_loader.py

Browse files

Files changed (1) hide show

models/loaders/matanyone_loader.py +150 -188

models/loaders/matanyone_loader.py CHANGED Viewed

@@ -1,10 +1,9 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
-MatAnyone Loader - Official InferenceCore API Implementation
-============================================================
-Fixed to use official MatAnyone API to resolve tensor dimension issues.
-No manual tensor manipulation - let InferenceCore handle everything internally.
 """
 import os
@@ -22,11 +21,117 @@
 logger = logging.getLogger(__name__)
 class MatAnyoneLoader:
     """
-    Official MatAnyone loader using InferenceCore API.
-    This fixes the tensor dimension mismatch by using the official API
-    which handles all tensor dimensions internally.
     """
     def __init__(self, device: str = "cuda", cache_dir: str = "./checkpoints/matanyone_cache"):
@@ -35,6 +140,7 @@ def __init__(self, device: str = "cuda", cache_dir: str = "./checkpoints/matanyo
         os.makedirs(self.cache_dir, exist_ok=True)
         self.processor = None
         self.model_id = "PeiqingYang/MatAnyone"
         self.load_time = 0.0
         self.loaded = False
@@ -50,10 +156,10 @@ def _select_device(self, pref: str) -> str:
             return "cpu"
         return "cuda" if torch.cuda.is_available() else "cpu"
-    def load(self):  # <-- CHANGED: No return type hint, returns processor
-        """Load MatAnyone using official InferenceCore API."""
-        if self.loaded:
-            return self.processor  # <-- CHANGED: Return processor, not True
         logger.info(f"Loading MatAnyone from HF: {self.model_id} (device={self.device})")
         t0 = time.time()
@@ -62,174 +168,32 @@ def load(self):  # <-- CHANGED: No return type hint, returns processor
             # Import the official API
             from matanyone.inference.inference_core import InferenceCore
-            # Use official API - this handles ALL tensor dimensions internally
-            # No manual tensor reshaping needed!
             self.processor = InferenceCore(self.model_id)
             self.loaded = True
             self.load_time = time.time() - t0
-            logger.info(f"MatAnyone loaded successfully via InferenceCore API in {self.load_time:.2f}s")
-            return self.processor  # <-- CHANGED: Return processor, not True
         except ImportError as e:
             self.load_error = f"MatAnyone not installed: {e}"
             logger.error(f"Failed to import MatAnyone. Install with: pip install git+https://github.com/pq-yang/MatAnyone.git@main")
-            return None  # <-- CHANGED: Return None on failure
         except Exception as e:
             self.load_error = str(e)
             logger.error(f"Failed to load MatAnyone: {e}")
             logger.debug(traceback.format_exc())
-            return None  # <-- CHANGED: Return None on failure
-    def process_video(self, video_path: str, mask_path: str, output_dir: Optional[str] = None,
-                     max_size: int = 720, save_frames: bool = False) -> Tuple[Optional[str], Optional[str]]:
-        """
-        Process video using official MatAnyone API.
-        Args:
-            video_path: Path to input video
-            mask_path: Path to first frame mask
-            output_dir: Output directory (uses temp if None)
-            max_size: Maximum resolution (-1 for original)
-            save_frames: Whether to save individual frames
-        Returns:
-            (foreground_path, alpha_path) or (None, None) on error
-        """
-        if not self.loaded:
-            if not self.load():
-                logger.error(f"MatAnyone not loaded: {self.load_error}")
-                return None, None
-        if output_dir is None:
-            output_dir = str(self.temp_dir)
-        try:
-            # Use official API - no tensor manipulation needed!
-            # The API handles all dimension requirements internally
-            foreground_path, alpha_path = self.processor.process_video(
-                input_path=str(video_path),
-                mask_path=str(mask_path),
-                output_path=str(output_dir),
-                max_size=max_size,
-                save_frames=save_frames
-            )
-            logger.info(f"MatAnyone processing complete: fg={foreground_path}, alpha={alpha_path}")
-            return foreground_path, alpha_path
-        except Exception as e:
-            logger.error(f"MatAnyone processing failed: {e}")
-            logger.debug(traceback.format_exc())
-            return None, None
-    def process_frames_to_alpha(self, frames: np.ndarray, initial_mask: np.ndarray,
-                                output_dir: Optional[str] = None) -> Optional[np.ndarray]:
-        """
-        Process video frames and return alpha masks.
-        This is a compatibility wrapper for frame-based processing.
-        Args:
-            frames: Video frames as numpy array (T, H, W, C) or list
-            initial_mask: First frame mask (H, W) with values 0-255
-            output_dir: Optional output directory
-        Returns:
-            Alpha masks array (T, H, W) or None on error
-        """
-        if not self.loaded:
-            if not self.load():
-                return None
-        if output_dir is None:
-            output_dir = str(self.temp_dir)
-        # Save frames as temporary video
-        temp_video_path = Path(output_dir) / "temp_input.mp4"
-        temp_mask_path = Path(output_dir) / "temp_mask.png"
-        try:
-            # Convert frames to video
-            if isinstance(frames, list):
-                frames = np.stack(frames)
-            # Ensure correct format
-            if frames.ndim == 5:  # (B, C, T, H, W) or similar
-                # Take first batch, rearrange to (T, H, W, C)
-                frames = frames[0]
-                if frames.shape[0] == 3:  # Channels first
-                    frames = frames.transpose(1, 2, 3, 0)
-            elif frames.ndim == 4 and frames.shape[1] == 3:  # (T, C, H, W)
-                frames = frames.transpose(0, 2, 3, 1)
-            # Write video
-            fps = 30
-            height, width = frames.shape[1:3]
-            fourcc = cv2.VideoWriter_fourcc(*'mp4v')
-            out = cv2.VideoWriter(str(temp_video_path), fourcc, fps, (width, height))
-            for frame in frames:
-                if frame.dtype in (np.float32, np.float64):
-                    frame = (frame * 255).astype(np.uint8)
-                if frame.shape[-1] == 3:
-                    frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
-                out.write(frame)
-            out.release()
-            # Save mask
-            if initial_mask.dtype in (np.float32, np.float64):
-                initial_mask = (initial_mask * 255).astype(np.uint8)
-            cv2.imwrite(str(temp_mask_path), initial_mask)
-            # Process with official API
-            _, alpha_path = self.process_video(
-                str(temp_video_path),
-                str(temp_mask_path),
-                str(output_dir)
-            )
-            if alpha_path:
-                # Load alpha video and return as array
-                return self._load_alpha_video(alpha_path)
-            return None
-        except Exception as e:
-            logger.error(f"Frame processing failed: {e}")
-            return None
-        finally:
-            # Cleanup temp files
-            if temp_video_path.exists():
-                temp_video_path.unlink()
-            if temp_mask_path.exists():
-                temp_mask_path.unlink()
-    def _load_alpha_video(self, alpha_video_path: str) -> Optional[np.ndarray]:
-        """Load alpha video and return as numpy array."""
-        try:
-            cap = cv2.VideoCapture(str(alpha_video_path))
-            frames = []
-            while True:
-                ret, frame = cap.read()
-                if not ret:
-                    break
-                # Convert to grayscale if needed
-                if len(frame.shape) == 3:
-                    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
-                frames.append(frame / 255.0)  # Normalize to 0-1
-            cap.release()
-            return np.array(frames) if frames else None
-        except Exception as e:
-            logger.error(f"Failed to load alpha video: {e}")
             return None
     def cleanup(self):
         """Cleanup temporary files and release resources."""
         self.processor = None
         # Clean temp directory
         if self.temp_dir.exists():
@@ -242,45 +206,43 @@ def cleanup(self):
     def get_info(self) -> Dict[str, Any]:
         """Get model information."""
-        return {
             "loaded": self.loaded,
             "model_id": self.model_id,
             "device": str(self.device),
             "load_time": self.load_time,
             "error": self.load_error,
-            "api": "InferenceCore (official)"
         }
     def reset(self):
         """Reset the processor for a new video."""
-        # The official API handles session management internally
-        # Just log that reset was called
-        logger.info("MatAnyone session reset requested (handled by InferenceCore)")
-    # Compatibility method for existing code that might call this
     def __call__(self, image, mask=None, **kwargs):
-        """
-        Direct call compatibility wrapper.
-        For single frame processing or backwards compatibility.
-        """
-        if isinstance(image, (list, np.ndarray)) and mask is not None:
-            # Process as frames
-            if not isinstance(image, np.ndarray):
-                image = np.array(image)
-            if image.ndim == 3:  # Single frame
-                image = image[np.newaxis, ...]
-            alphas = self.process_frames_to_alpha(image, mask)
-            if alphas is not None and len(alphas) > 0:
-                return alphas[0] if alphas.shape[0] == 1 else alphas
-        # Fallback
-        logger.warning("Direct call to MatAnyoneLoader not fully supported with official API")
-        return mask if mask is not None else np.zeros(image.shape[:2], dtype=np.float32)
-# For backwards compatibility - expose session class name even though we don't use it
-_MatAnyoneSession = MatAnyoneLoader  # Alias for compatibility
-__all__ = ["MatAnyoneLoader", "_MatAnyoneSession"]

 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
+MatAnyone Loader - Wrapper for Official InferenceCore API
+=========================================================
+Creates a callable wrapper around InferenceCore to maintain compatibility.
 """
 import os
 logger = logging.getLogger(__name__)
+class MatAnyoneCallableWrapper:
+    """
+    Callable wrapper around InferenceCore to maintain API compatibility.
+    Makes the processor work like a callable session.
+    """
+    def __init__(self, inference_core):
+        self.core = inference_core
+        self.initialized = False
+    def __call__(self, image, mask=None, **kwargs):
+        """
+        Make this wrapper callable like the old session interface.
+        Args:
+            image: Input image as numpy array
+            mask: Optional mask for first frame
+        Returns:
+            Alpha mask as 2D numpy array
+        """
+        try:
+            # For MatAnyone, the first frame needs initialization with a mask
+            if not self.initialized:
+                if mask is None:
+                    # Return a default mask if no mask provided for first frame
+                    logger.warning("First frame called without mask, returning default")
+                    if isinstance(image, np.ndarray):
+                        h, w = image.shape[:2]
+                    else:
+                        h, w = 512, 512
+                    return np.ones((h, w), dtype=np.float32) * 0.5
+                # Initialize with first frame and mask
+                # The exact API call depends on the InferenceCore implementation
+                # This is a placeholder - adjust based on actual API
+                if hasattr(self.core, 'step'):
+                    result = self.core.step(image=image, mask=mask)
+                elif hasattr(self.core, 'process_frame'):
+                    result = self.core.process_frame(image, mask)
+                else:
+                    # Fallback
+                    logger.warning("InferenceCore API unclear, returning input mask")
+                    return mask if isinstance(mask, np.ndarray) else np.array(mask)
+                self.initialized = True
+                return self._extract_alpha(result)
+            else:
+                # Subsequent frames - no mask needed
+                if hasattr(self.core, 'step'):
+                    result = self.core.step(image=image)
+                elif hasattr(self.core, 'process_frame'):
+                    result = self.core.process_frame(image)
+                else:
+                    # Fallback - return neutral mask
+                    if isinstance(image, np.ndarray):
+                        h, w = image.shape[:2]
+                    else:
+                        h, w = 512, 512
+                    return np.ones((h, w), dtype=np.float32) * 0.5
+                return self._extract_alpha(result)
+        except Exception as e:
+            logger.error(f"MatAnyone wrapper call failed: {e}")
+            # Return a fallback mask
+            if mask is not None:
+                return mask if isinstance(mask, np.ndarray) else np.array(mask)
+            if isinstance(image, np.ndarray):
+                h, w = image.shape[:2]
+            else:
+                h, w = 512, 512
+            return np.ones((h, w), dtype=np.float32) * 0.5
+    def _extract_alpha(self, result):
+        """Extract alpha channel from result."""
+        if result is None:
+            return np.ones((512, 512), dtype=np.float32) * 0.5
+        if isinstance(result, np.ndarray):
+            if result.ndim == 2:
+                return result.astype(np.float32)
+            elif result.ndim == 3:
+                # Take first channel or average
+                return result[..., 0].astype(np.float32)
+            elif result.ndim == 4:
+                # Batch dimension - take first
+                return result[0, 0].astype(np.float32)
+        # Try to convert to numpy
+        try:
+            arr = np.array(result)
+            if arr.ndim >= 2:
+                return arr[..., 0] if arr.ndim > 2 else arr
+        except:
+            pass
+        return np.ones((512, 512), dtype=np.float32) * 0.5
+    def reset(self):
+        """Reset the session state."""
+        self.initialized = False
+        if hasattr(self.core, 'reset'):
+            self.core.reset()
+        elif hasattr(self.core, 'clear_memory'):
+            self.core.clear_memory()
 class MatAnyoneLoader:
     """
+    Official MatAnyone loader using InferenceCore API with callable wrapper.
     """
     def __init__(self, device: str = "cuda", cache_dir: str = "./checkpoints/matanyone_cache"):
         os.makedirs(self.cache_dir, exist_ok=True)
         self.processor = None
+        self.wrapper = None
         self.model_id = "PeiqingYang/MatAnyone"
         self.load_time = 0.0
         self.loaded = False
             return "cpu"
         return "cuda" if torch.cuda.is_available() else "cpu"
+    def load(self):
+        """Load MatAnyone using official InferenceCore API and wrap it."""
+        if self.loaded and self.wrapper:
+            return self.wrapper
         logger.info(f"Loading MatAnyone from HF: {self.model_id} (device={self.device})")
         t0 = time.time()
             # Import the official API
             from matanyone.inference.inference_core import InferenceCore
+            # Create the InferenceCore processor
             self.processor = InferenceCore(self.model_id)
+            # Wrap it to make it callable
+            self.wrapper = MatAnyoneCallableWrapper(self.processor)
             self.loaded = True
             self.load_time = time.time() - t0
+            logger.info(f"MatAnyone loaded and wrapped successfully in {self.load_time:.2f}s")
+            return self.wrapper
         except ImportError as e:
             self.load_error = f"MatAnyone not installed: {e}"
             logger.error(f"Failed to import MatAnyone. Install with: pip install git+https://github.com/pq-yang/MatAnyone.git@main")
+            return None
         except Exception as e:
             self.load_error = str(e)
             logger.error(f"Failed to load MatAnyone: {e}")
             logger.debug(traceback.format_exc())
             return None
     def cleanup(self):
         """Cleanup temporary files and release resources."""
         self.processor = None
+        self.wrapper = None
         # Clean temp directory
         if self.temp_dir.exists():
     def get_info(self) -> Dict[str, Any]:
         """Get model information."""
+        # Snapshot of the loader's state attributes plus a fixed label naming
+        # the API flavour this loader uses.
+        info = {
             "loaded": self.loaded,
             "model_id": self.model_id,
             "device": str(self.device),
             "load_time": self.load_time,
             "error": self.load_error,
+            "api": "InferenceCore (wrapped)"
         }
+        # Add interface info
+        # Only when a processor object exists: record which of the candidate
+        # entry points it exposes, so callers can see what the wrapper will use.
+        if self.processor:
+            info["has_step"] = hasattr(self.processor, 'step')
+            info["has_process_frame"] = hasattr(self.processor, 'process_frame')
+            info["has_process_video"] = hasattr(self.processor, 'process_video')
+        return info
     def reset(self):
         """Reset the processor for a new video."""
+        # Delegate to the wrapper when one exists; while self.wrapper is None
+        # there is no session state to clear.
+        if self.wrapper:
+            self.wrapper.reset()
+        # Logged unconditionally, including when no wrapper was present.
+        logger.info("MatAnyone session reset")
+    # Compatibility - make the loader itself callable
     def __call__(self, image, mask=None, **kwargs):
+        """Direct call compatibility."""
+        if not self.wrapper:
+            if not self.load():
+                # Fallback if loading fails
+                if mask is not None:
+                    return mask if isinstance(mask, np.ndarray) else np.array(mask)
+                return np.zeros(image.shape[:2], dtype=np.float32)
+        return self.wrapper(image, mask, **kwargs)
+# For backwards compatibility
+_MatAnyoneSession = MatAnyoneCallableWrapper
+__all__ = ["MatAnyoneLoader", "_MatAnyoneSession", "MatAnyoneCallableWrapper"]