Spaces:

MogensR
/

VideoBackgroundReplacer

Paused

App Files Files Community

MogensR committed on Aug 28, 2025

Commit

fd66920

1 Parent(s): 58a43ef

Update utils/cv_processing.py

Browse files

Files changed (1) hide show

utils/cv_processing.py +140 -129

utils/cv_processing.py CHANGED Viewed

@@ -1,13 +1,13 @@
 #!/usr/bin/env python3
 """
-cv_processing.py · FIXED VERSION with proper SAM2 handling
 """
 from __future__ import annotations
 import logging
 from pathlib import Path
-from typing import Any, Dict, Optional, Tuple
 import cv2
 import numpy as np
@@ -37,6 +37,24 @@ def _ensure_rgb(img: np.ndarray) -> np.ndarray:
         return cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
     return img
 def _to_mask01(m: np.ndarray) -> np.ndarray:
     if m is None:
         return None
@@ -47,6 +65,36 @@ def _to_mask01(m: np.ndarray) -> np.ndarray:
         m = m / 255.0
     return np.clip(m, 0.0, 1.0)
 def _feather(mask01: np.ndarray, k: int = 2) -> np.ndarray:
     if mask01.ndim == 3:
         mask01 = mask01[..., 0]
@@ -90,38 +138,29 @@ def create_professional_background(key_or_cfg: Any, width: int, height: int) ->
 def _simple_person_segmentation(frame_bgr: np.ndarray) -> np.ndarray:
     """Basic fallback segmentation using color detection"""
     h, w = frame_bgr.shape[:2]
-    # Convert to HSV for better color detection
     hsv = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2HSV)
-    # Detect skin tones (basic person detection)
     lower_skin = np.array([0, 20, 70], dtype=np.uint8)
     upper_skin = np.array([20, 255, 255], dtype=np.uint8)
     skin_mask = cv2.inRange(hsv, lower_skin, upper_skin)
-    # Also detect non-green/non-white areas as potential person
     lower_green = np.array([40, 40, 40], dtype=np.uint8)
     upper_green = np.array([80, 255, 255], dtype=np.uint8)
     green_mask = cv2.inRange(hsv, lower_green, upper_green)
-    # Assume person is NOT green screen
     person_mask = cv2.bitwise_not(green_mask)
-    # Combine with skin detection
     person_mask = cv2.bitwise_or(person_mask, skin_mask)
-    # Clean up the mask
     kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (5, 5))
     person_mask = cv2.morphologyEx(person_mask, cv2.MORPH_CLOSE, kernel, iterations=2)
     person_mask = cv2.morphologyEx(person_mask, cv2.MORPH_OPEN, kernel, iterations=1)
-    # Find largest contour (assume it's the person)
     contours, _ = cv2.findContours(person_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
     if contours:
         largest_contour = max(contours, key=cv2.contourArea)
         person_mask = np.zeros_like(person_mask)
         cv2.drawContours(person_mask, [largest_contour], -1, 255, -1)
     return (person_mask.astype(np.float32) / 255.0)
 def segment_person_hq(
@@ -135,50 +174,32 @@ def segment_person_hq(
     High-quality person segmentation with proper SAM2 handling
     """
     h, w = frame.shape[:2]
-    # Skip SAM2 if explicitly disabled
     if use_sam2 is False:
         return _simple_person_segmentation(frame)
-    # Try SAM2 if available
     if predictor is not None:
         try:
-            # Ensure we have the right methods
             if hasattr(predictor, "set_image") and hasattr(predictor, "predict"):
-                # Convert to RGB for SAM2
                 rgb = _ensure_rgb(frame)
-                # Set the image
                 predictor.set_image(rgb)
-                # Generate multiple prompt points for better coverage
                 points = []
                 labels = []
-                # Add center point
-                points.append([w // 2, h // 2])
-                labels.append(1)  # Foreground
-                # Add points for head area (upper center)
-                points.append([w // 2, h // 4])
-                labels.append(1)
-                # Add body points
-                points.append([w // 2, h // 2 + h // 8])
-                labels.append(1)
-                # Convert to numpy arrays
                 point_coords = np.array(points, dtype=np.float32)
                 point_labels = np.array(labels, dtype=np.int32)
-                # Predict with multiple masks
                 result = predictor.predict(
                     point_coords=point_coords,
                     point_labels=point_labels,
                     multimask_output=True
                 )
-                # Extract masks and scores
                 if isinstance(result, dict):
                     masks = result.get("masks", None)
                     scores = result.get("scores", None)
@@ -187,118 +208,121 @@ def segment_person_hq(
                 else:
                     masks = result
                     scores = None
-                # Validate and process masks
                 if masks is not None:
                     masks = np.array(masks)
-                    if masks.size > 0:  # Check if not empty
-                        # Handle different mask shapes
                         if masks.ndim == 3 and masks.shape[0] > 0:
-                            # Multiple masks - choose best one
                             if scores is not None and len(scores) > 0:
                                 best_idx = np.argmax(scores)
                                 mask = masks[best_idx]
                             else:
-                                # Use first mask if no scores
                                 mask = masks[0]
                         elif masks.ndim == 2:
-                            # Single mask
                             mask = masks
                         else:
                             logger.warning(f"Unexpected mask shape from SAM2: {masks.shape}")
                             mask = None
                         if mask is not None:
-                            # Convert to proper format
                             mask = _to_mask01(mask)
-                            # Validate mask has actual content
-                            if mask.max() > 0.1:  # At least 10% confidence somewhere
                                 return mask
                             else:
                                 logger.warning("SAM2 mask too weak, using fallback")
                 else:
                     logger.warning("SAM2 returned no masks")
         except Exception as e:
             logger.warning(f"SAM2 segmentation error: {e}")
-    # Fallback to simple segmentation
     if fallback_enabled:
         logger.debug("Using fallback segmentation")
         return _simple_person_segmentation(frame)
     else:
-        # Return full mask if no fallback
         return np.ones((h, w), dtype=np.float32)
 segment_person_hq_original = segment_person_hq
 # ----------------------------------------------------------------------------
-# MatAnyone Refinement (Fixed)
 # ----------------------------------------------------------------------------
 def refine_mask_hq(
     frame: np.ndarray,
     mask: np.ndarray,
-    matanyone: Optional[Any] = None,
     fallback_enabled: bool = True,
     use_matanyone: Optional[bool] = None,
     **_compat_kwargs,
 ) -> np.ndarray:
     """
-    Refine mask with MatAnyone - with proper handling
     """
-    # Convert mask to proper format
     mask01 = _to_mask01(mask)
-    # Skip MatAnyone if explicitly disabled
     if use_matanyone is False:
         return mask01
-    # Try MatAnyone if available
-    if matanyone is not None:
         try:
-            # Try different MatAnyone interfaces
             refined = None
-            # Method 1: Direct callable
-            if callable(matanyone):
-                try:
-                    refined = matanyone(frame, mask01)
-                    if refined is not None:
-                        refined = _to_mask01(np.array(refined))
-                except Exception as e:
-                    logger.debug(f"MatAnyone callable failed: {e}")
-            # Method 2: step method
             if refined is None and hasattr(matanyone, 'step'):
                 try:
-                    refined = matanyone.step(frame, mask01)
-                    if refined is not None:
-                        refined = _to_mask01(np.array(refined))
                 except Exception as e:
                     logger.debug(f"MatAnyone step failed: {e}")
-            # Method 3: process method
             if refined is None and hasattr(matanyone, 'process'):
                 try:
-                    refined = matanyone.process(frame, mask01)
-                    if refined is not None:
-                        refined = _to_mask01(np.array(refined))
                 except Exception as e:
                     logger.debug(f"MatAnyone process failed: {e}")
-            # Use refined mask if successful
             if refined is not None and refined.max() > 0.1:
-                # Apply post-processing
-                refined = _postprocess_mask(refined)
-                return refined
             else:
                 logger.warning("MatAnyone refinement failed or produced empty mask")
         except Exception as e:
             logger.warning(f"MatAnyone error: {e}")
     # Fallback refinement
     if fallback_enabled:
         return _fallback_refine(mask01)
@@ -307,39 +331,31 @@ def refine_mask_hq(
 def _postprocess_mask(mask01: np.ndarray) -> np.ndarray:
     """Post-process mask to clean edges and remove artifacts"""
-    # Convert to uint8
     mask_uint8 = (mask01 * 255).astype(np.uint8)
-    # Remove small holes
     kernel_close = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (5, 5))
     mask_uint8 = cv2.morphologyEx(mask_uint8, cv2.MORPH_CLOSE, kernel_close)
-    # Smooth edges
     mask_uint8 = cv2.GaussianBlur(mask_uint8, (3, 3), 0)
-    # Threshold to clean up
     _, mask_uint8 = cv2.threshold(mask_uint8, 127, 255, cv2.THRESH_BINARY)
-    # Final smooth
     mask_uint8 = cv2.GaussianBlur(mask_uint8, (5, 5), 1)
     return mask_uint8.astype(np.float32) / 255.0
 def _fallback_refine(mask01: np.ndarray) -> np.ndarray:
     """Simple fallback refinement"""
     mask_uint8 = (mask01 * 255).astype(np.uint8)
-    # Bilateral filter for edge-preserving smoothing
     mask_uint8 = cv2.bilateralFilter(mask_uint8, 9, 75, 75)
-    # Morphological operations
     kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))
     mask_uint8 = cv2.morphologyEx(mask_uint8, cv2.MORPH_CLOSE, kernel)
     mask_uint8 = cv2.morphologyEx(mask_uint8, cv2.MORPH_OPEN, kernel)
-    # Edge feathering
     mask_uint8 = cv2.GaussianBlur(mask_uint8, (5, 5), 1)
     return mask_uint8.astype(np.float32) / 255.0
 # ----------------------------------------------------------------------------
@@ -355,25 +371,20 @@ def replace_background_hq(
     """High-quality background replacement with alpha blending"""
     try:
         H, W = frame.shape[:2]
-        # Resize background if needed
         if background.shape[:2] != (H, W):
             background = cv2.resize(background, (W, H), interpolation=cv2.INTER_LANCZOS4)
-        # Ensure mask is properly formatted
-        m = _to_mask01(mask01)
-        # Apply slight feather for smooth edges
         m = _feather(m, k=1)
-        # Convert to 3-channel for multiplication
         m3 = np.repeat(m[:, :, None], 3, axis=2)
-        # Alpha blending
         comp = frame.astype(np.float32) * m3 + background.astype(np.float32) * (1.0 - m3)
         return np.clip(comp, 0, 255).astype(np.uint8)
     except Exception as e:
         if fallback_enabled:
             logger.warning(f"Compositing failed ({e}) – returning original frame")
@@ -432,4 +443,4 @@ def validate_video_file(video_path: str) -> Tuple[bool, str]:
     "create_professional_background",
     "validate_video_file",
     "PROFESSIONAL_BACKGROUNDS",
-]

 #!/usr/bin/env python3
 """
+cv_processing.py · FIXED VERSION with proper SAM2 handling + MatAnyone stateful integration
 """
 from __future__ import annotations
 import logging
 from pathlib import Path
+from typing import Any, Dict, Optional, Tuple, Callable
 import cv2
 import numpy as np
         return cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
     return img
def _ensure_rgb01(frame_bgr: np.ndarray) -> np.ndarray:
    """
    Convert a BGR image to RGB float32 in [0, 1], laid out as [H, W, 3].

    Accepted layouts are 2-D grayscale [H, W], channels-last [H, W, C] and
    channels-first [C, H, W] with C in (1, 3, 4). Single-channel input is
    replicated to three channels; an alpha channel is dropped.

    Args:
        frame_bgr: Image array. Non-uint8 data is clipped to 0..255 and cast
            to uint8 before normalisation.

    Returns:
        A new C-contiguous float32 array of shape [H, W, 3] in RGB order.

    Raises:
        ValueError: If ``frame_bgr`` is None or its layout is not supported.
    """
    if frame_bgr is None:
        raise ValueError("frame_bgr is None")
    x = np.asarray(frame_bgr)
    if x.ndim == 2:
        x = np.stack([x, x, x], axis=-1)  # gray -> 3ch
    # channels-first -> HWC (only when the last axis cannot itself be a
    # channel axis, otherwise the array is taken to be HWC already)
    if x.ndim == 3 and x.shape[0] in (1, 3, 4) and x.shape[-1] not in (1, 3, 4):
        x = np.transpose(x, (1, 2, 0))
    if x.ndim != 3 or x.shape[-1] not in (1, 3, 4):
        raise ValueError(f"unsupported frame shape {x.shape}")
    if x.shape[-1] == 1:
        # A lone channel has no B/G/R order to swap; replicate it so the
        # result still has three channels.
        x = np.repeat(x, 3, axis=-1)
    if x.dtype != np.uint8:
        # NOTE(review): float input is assumed to be on a 0..255 scale; a
        # [0, 1] float frame would be truncated to 0/1 here - confirm that
        # callers only pass 0..255 data.
        x = np.clip(x, 0, 255).astype(np.uint8)
    # Channels 2, 1, 0 -> R, G, B; a fourth (alpha) channel is left behind.
    rgb = x[..., 2::-1]
    return np.ascontiguousarray(rgb, dtype=np.float32) / 255.0
 def _to_mask01(m: np.ndarray) -> np.ndarray:
     if m is None:
         return None
         m = m / 255.0
     return np.clip(m, 0.0, 1.0)
def _mask_to_2d(mask: np.ndarray) -> np.ndarray:
    """
    Reduce any mask to 2-D float32 [H,W], contiguous, in [0,1].

    Handles HWC/CHW/B1HW/1HW/HW, etc. For multi-channel input the first
    channel is used. If no 2-D array can be derived, a warning is logged and
    a neutral mask filled with 0.5 is returned (512x512 when the input has
    fewer than two usable dimensions).

    Args:
        mask: Array-like mask. uint8 data is scaled by 1/255; every other
            dtype is cast to float32 and clipped to [0, 1].

    Returns:
        C-contiguous float32 array of shape [H, W] with values in [0, 1].
    """
    m = np.asarray(mask)
    # Remember the shape for the log message; ``mask`` itself may be a list
    # or None and then has no ``.shape`` attribute.
    input_shape = m.shape
    # channels-first 1xHxW
    if m.ndim == 3 and m.shape[0] == 1 and m.shape[1] > 1 and m.shape[2] > 1:
        m = m[0]
    # channels-last HxWx1
    if m.ndim == 3 and m.shape[-1] == 1:
        m = m[..., 0]
    # multi-channel -> take first channel
    if m.ndim == 3:
        m = m[..., 0] if m.shape[-1] in (1, 3, 4) else m[0]
    # Squeeze only when still not 2-D: squeezing a 2-D mask that has a
    # singleton dimension (e.g. Hx1) would destroy a valid mask.
    if m.ndim != 2:
        m = np.squeeze(m)
    if m.ndim != 2:
        h = int(m.shape[-2]) if m.ndim >= 2 else 512
        w = int(m.shape[-1]) if m.ndim >= 2 else 512
        logger.warning(f"_mask_to_2d: unexpected shape {input_shape}, creating neutral mask.")
        m = np.full((h, w), 0.5, dtype=np.float32)
    # dtype/range
    if m.dtype == np.uint8:
        m = m.astype(np.float32) / 255.0
    elif m.dtype != np.float32:
        m = m.astype(np.float32)
    m = np.clip(m, 0.0, 1.0)
    return np.ascontiguousarray(m)
 def _feather(mask01: np.ndarray, k: int = 2) -> np.ndarray:
     if mask01.ndim == 3:
         mask01 = mask01[..., 0]
 def _simple_person_segmentation(frame_bgr: np.ndarray) -> np.ndarray:
     """Basic fallback segmentation using color detection.

     The foreground is taken to be everything that is not inside the green
     HSV band, united with a skin-tone HSV band. The result is cleaned with a
     morphological close/open, and only the largest external contour is kept
     (filled). If no contour is found, the cleaned mask is returned as is.

     Args:
         frame_bgr: Frame handed to ``cv2.cvtColor(..., COLOR_BGR2HSV)``;
             presumably a 3-channel BGR uint8 image - confirm against callers.

     Returns:
         float32 mask with the frame's height and width, values 0.0 or 1.0.
     """
     hsv = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2HSV)

     # Skin-tone band (basic person cue).
     lower_skin = np.array([0, 20, 70], dtype=np.uint8)
     upper_skin = np.array([20, 255, 255], dtype=np.uint8)
     skin_mask = cv2.inRange(hsv, lower_skin, upper_skin)

     # Green band; the person is assumed NOT to be green.
     lower_green = np.array([40, 40, 40], dtype=np.uint8)
     upper_green = np.array([80, 255, 255], dtype=np.uint8)
     green_mask = cv2.inRange(hsv, lower_green, upper_green)

     person_mask = cv2.bitwise_or(cv2.bitwise_not(green_mask), skin_mask)

     # Close small holes first, then remove small specks.
     kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (5, 5))
     person_mask = cv2.morphologyEx(person_mask, cv2.MORPH_CLOSE, kernel, iterations=2)
     person_mask = cv2.morphologyEx(person_mask, cv2.MORPH_OPEN, kernel, iterations=1)

     # Keep only the largest outline and fill it.
     contours, _ = cv2.findContours(person_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
     if contours:
         largest_contour = max(contours, key=cv2.contourArea)
         person_mask = np.zeros_like(person_mask)
         cv2.drawContours(person_mask, [largest_contour], -1, 255, -1)

     return person_mask.astype(np.float32) / 255.0
 def segment_person_hq(
     High-quality person segmentation with proper SAM2 handling
     """
     h, w = frame.shape[:2]
     if use_sam2 is False:
         return _simple_person_segmentation(frame)
     if predictor is not None:
         try:
             if hasattr(predictor, "set_image") and hasattr(predictor, "predict"):
                 rgb = _ensure_rgb(frame)
                 predictor.set_image(rgb)
                 points = []
                 labels = []
+                points.append([w // 2, h // 2]); labels.append(1)
+                points.append([w // 2, h // 4]); labels.append(1)
+                points.append([w // 2, h // 2 + h // 8]); labels.append(1)
                 point_coords = np.array(points, dtype=np.float32)
                 point_labels = np.array(labels, dtype=np.int32)
                 result = predictor.predict(
                     point_coords=point_coords,
                     point_labels=point_labels,
                     multimask_output=True
                 )
                 if isinstance(result, dict):
                     masks = result.get("masks", None)
                     scores = result.get("scores", None)
                 else:
                     masks = result
                     scores = None
                 if masks is not None:
                     masks = np.array(masks)
+                    if masks.size > 0:
                         if masks.ndim == 3 and masks.shape[0] > 0:
                             if scores is not None and len(scores) > 0:
                                 best_idx = np.argmax(scores)
                                 mask = masks[best_idx]
                             else:
                                 mask = masks[0]
                         elif masks.ndim == 2:
                             mask = masks
                         else:
                             logger.warning(f"Unexpected mask shape from SAM2: {masks.shape}")
                             mask = None
                         if mask is not None:
                             mask = _to_mask01(mask)
+                            if mask.max() > 0.1:
                                 return mask
                             else:
                                 logger.warning("SAM2 mask too weak, using fallback")
                 else:
                     logger.warning("SAM2 returned no masks")
         except Exception as e:
             logger.warning(f"SAM2 segmentation error: {e}")
     if fallback_enabled:
         logger.debug("Using fallback segmentation")
         return _simple_person_segmentation(frame)
     else:
         return np.ones((h, w), dtype=np.float32)
 segment_person_hq_original = segment_person_hq
 # ----------------------------------------------------------------------------
+# MatAnyone Refinement (Stateful-capable)
 # ----------------------------------------------------------------------------
 def refine_mask_hq(
     frame: np.ndarray,
     mask: np.ndarray,
+    matanyone: Optional[Callable] = None,
+    *,
+    frame_idx: Optional[int] = None,
     fallback_enabled: bool = True,
     use_matanyone: Optional[bool] = None,
     **_compat_kwargs,
 ) -> np.ndarray:
     """
+    Refine mask with MatAnyone.
+    Modes:
+      • Stateful (preferred): provide `frame_idx`. On frame_idx==0, the session encodes with the mask.
+        On subsequent frames, the session propagates without a mask.
+      • Backward-compat (stateless): if `frame_idx` is None, we try callable/step/process with (frame, mask)
+        like before.
+    Returns:
+      2-D float32 alpha [H,W], contiguous, in [0,1] (OpenCV-safe).
     """
     mask01 = _to_mask01(mask)
     if use_matanyone is False:
         return mask01
+    if matanyone is not None and callable(matanyone):
         try:
+            rgb01 = _ensure_rgb01(frame)
+            # Stateful path (preferred)
+            if frame_idx is not None:
+                if frame_idx == 0:
+                    refined = matanyone(rgb01, mask01)        # encode + first-frame predict inside
+                else:
+                    refined = matanyone(rgb01)                 # propagate without mask
+                refined = _mask_to_2d(refined)
+                if refined.max() > 0.1:
+                    return _postprocess_mask(refined)
+                logger.warning("MatAnyone stateful refinement produced empty/weak mask; falling back.")
+            # Backward-compat (stateless) path
             refined = None
+            # Method 1: Direct callable with (frame, mask)
+            try:
+                refined = matanyone(rgb01, mask01)
+                refined = _mask_to_2d(refined)
+            except Exception as e:
+                logger.debug(f"MatAnyone callable failed: {e}")
+            # Method 2: step(image, mask)
             if refined is None and hasattr(matanyone, 'step'):
                 try:
+                    refined = matanyone.step(rgb01, mask01)
+                    refined = _mask_to_2d(refined)
                 except Exception as e:
                     logger.debug(f"MatAnyone step failed: {e}")
+            # Method 3: process(image, mask)
             if refined is None and hasattr(matanyone, 'process'):
                 try:
+                    refined = matanyone.process(rgb01, mask01)
+                    refined = _mask_to_2d(refined)
                 except Exception as e:
                     logger.debug(f"MatAnyone process failed: {e}")
             if refined is not None and refined.max() > 0.1:
+                return _postprocess_mask(refined)
             else:
                 logger.warning("MatAnyone refinement failed or produced empty mask")
         except Exception as e:
             logger.warning(f"MatAnyone error: {e}")
     # Fallback refinement
     if fallback_enabled:
         return _fallback_refine(mask01)
 def _postprocess_mask(mask01: np.ndarray) -> np.ndarray:
     """Clean up a [0, 1] mask: close holes, re-binarise, then soften edges.

     The mask is scaled to uint8, closed with a 5x5 elliptical kernel,
     blurred (3x3), thresholded at 127 and blurred once more (5x5, sigma 1)
     before being scaled back to float32.
     """
     ellipse_5x5 = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (5, 5))
     alpha8 = (mask01 * 255).astype(np.uint8)

     closed = cv2.morphologyEx(alpha8, cv2.MORPH_CLOSE, ellipse_5x5)
     softened = cv2.GaussianBlur(closed, (3, 3), 0)
     # threshold() returns (retval, image); only the image is needed.
     binary = cv2.threshold(softened, 127, 255, cv2.THRESH_BINARY)[1]
     feathered = cv2.GaussianBlur(binary, (5, 5), 1)

     return feathered.astype(np.float32) / 255.0
 def _fallback_refine(mask01: np.ndarray) -> np.ndarray:
     """Refine a [0, 1] mask with OpenCV filters only.

     Steps: bilateral filter (d=9, sigmas 75/75), morphological close then
     open with a 3x3 elliptical kernel, and a final 5x5 Gaussian blur
     (sigma 1). The result is returned as float32 scaled back by 1/255.
     """
     ellipse_3x3 = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))
     alpha8 = (mask01 * 255).astype(np.uint8)

     refined = cv2.bilateralFilter(alpha8, 9, 75, 75)
     # Order matters: close first, then open.
     for morph_op in (cv2.MORPH_CLOSE, cv2.MORPH_OPEN):
         refined = cv2.morphologyEx(refined, morph_op, ellipse_3x3)
     feathered = cv2.GaussianBlur(refined, (5, 5), 1)

     return feathered.astype(np.float32) / 255.0
 # ----------------------------------------------------------------------------
     """High-quality background replacement with alpha blending"""
     try:
         H, W = frame.shape[:2]
         if background.shape[:2] != (H, W):
             background = cv2.resize(background, (W, H), interpolation=cv2.INTER_LANCZOS4)
+        m = _mask_to_2d(_to_mask01(mask01))
         m = _feather(m, k=1)
         m3 = np.repeat(m[:, :, None], 3, axis=2)
         comp = frame.astype(np.float32) * m3 + background.astype(np.float32) * (1.0 - m3)
         return np.clip(comp, 0, 255).astype(np.uint8)
     except Exception as e:
         if fallback_enabled:
             logger.warning(f"Compositing failed ({e}) – returning original frame")
     "create_professional_background",
     "validate_video_file",
     "PROFESSIONAL_BACKGROUNDS",
+]