Update models/loaders/sam2_loader.py

models/loaders/sam2_loader.py (CHANGED: +175 -106)
Notable changes relative to the previous version of the file:

- _ensure_rgb_uint8() gains a uint16 branch (16-bit frames are scaled down to uint8).
- The retry ladder in _ladder() previously bottomed out at a single 0.35 scale step with a 1-pixel floor; it now walks the factors 0.85, 0.70, 0.55, 0.40, 0.30 with a 64-pixel floor.
- _SAM2Adapter.__init__() previously picked an autocast dtype up front (bfloat16 when torch.cuda.is_bf16_supported(), otherwise float16 on compute capability >= 7); the autocast context is now built inside predict() with torch.autocast(device_type="cuda", ...).
- _load_official() previously assigned predictor.device = self.device when that attribute existed; the new code only moves predictor.model to the target device.
- New at module level: a usage example in the docstring, a logging bootstrap, a guard against malformed OMP_NUM_THREADS values, and an "if __name__ == '__main__'" smoke test.

The updated file as shown by the diff (regions the diff leaves unchanged are elided with "# ..."):
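The adapter reads its downscaling knobs from the environment (SAM2_MAX_EDGE, SAM2_TARGET_PIXELS, SAM2_ASSUME_BGR; see _SAM2Adapter.__init__ below). A minimal sketch of overriding them before loading; the values are illustrative, not defaults required by this file:

    import os

    # Illustrative overrides; the in-file defaults are 1024 / 900000 / "0".
    os.environ["SAM2_MAX_EDGE"] = "768"          # cap on the longer working edge
    os.environ["SAM2_TARGET_PIXELS"] = "600000"  # cap on total working pixels
    os.environ["SAM2_ASSUME_BGR"] = "1"          # treat incoming frames as BGR (e.g. raw cv2.imread output)

    from sam2_loader import SAM2Loader
    adapter = SAM2Loader(device="cuda").load("small")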
"""
SAM2 Loader + Guarded Predictor Adapter (VRAM-friendly, shape-safe, thread-safe, PyTorch2-ready)

Usage:

    from sam2_loader import SAM2Loader
    import cv2, numpy as np

    # Load SAM2 (auto-selects size from VRAM; or pass "tiny|small|base|large")
    sam_adapter = SAM2Loader(device="cuda").load(model_size="auto")
    assert sam_adapter, "SAM2 failed to load"

    # 1) Provide the first frame (BGR or RGB ok; float [0..1] or uint8)
    bgr0 = cv2.imread("frame0001.jpg")
    sam_adapter.set_image(bgr0)  # internally converts if needed

    # 2) Predict a coarse person mask to “boot” MatAnyone
    out = sam_adapter.predict(point_coords=None, point_labels=None)  # or your prompt strategy
    masks = out["masks"]  # (N,H,W) float32 in [0,1], sized to original frame
    first_mask = masks[0] if masks is not None and len(masks) else np.ones_like(bgr0[..., 0], np.float32)
"""

import os
import time
import logging
import threading
import traceback
from typing import Optional, Dict, Any, Tuple, List

import cv2
import numpy as np
import torch

# Logging
logger = logging.getLogger(__name__)
if not logger.handlers:
    logging.basicConfig(level=logging.INFO)

# Silence bad OMP values that sometimes leak in Spaces
_val = os.environ.get("OMP_NUM_THREADS")
if _val is not None and not str(_val).strip().isdigit():
    try:
        del os.environ["OMP_NUM_THREADS"]
    except Exception:
        pass
def _select_device(pref: str) -> str:
    pref = (pref or "").lower()
    # ... (lines not shown in this diff)
        return "cpu"
    return "cuda" if torch.cuda.is_available() else "cpu"


def _ensure_rgb_uint8(img: np.ndarray, force_bgr_to_rgb: bool = False) -> np.ndarray:
    """
    Accepts: HxWxC where C>=3; dtype uint8/float/uint16; optional BGRA/RGBA.
    Returns: RGB uint8 HxWx3
    """
    if img is None:
        raise ValueError("set_image received None image")
    arr = np.asarray(img)
    if arr.ndim != 3 or arr.shape[2] < 3:
        raise ValueError(f"Expected HxWxC image with C>=3, got shape={arr.shape}")

    if np.issubdtype(arr.dtype, np.floating):
        arr = np.clip(arr, 0.0, 1.0)
        arr = (arr * 255.0 + 0.5).astype(np.uint8)
    elif arr.dtype == np.uint16:
        arr = (arr / 257).astype(np.uint8)  # 16→8 bit
    elif arr.dtype != np.uint8:
        arr = arr.astype(np.uint8)

    if arr.shape[2] == 4:  # drop alpha
        arr = arr[:, :, :3]

    if force_bgr_to_rgb:
        arr = cv2.cvtColor(arr, cv2.COLOR_BGR2RGB)

    return arr
def _compute_scaled_size(h: int, w: int, max_edge: int, target_pixels: int) -> Tuple[int, int, float]:
    """
    Scale so that:
      - max(h, w) <= max_edge
      - h*w <= target_pixels
    Returns: (nh, nw, scale) with nh, nw >= 1
    """
    if h <= 0 or w <= 0:
        return h, w, 1.0
    s1 = min(1.0, float(max_edge) / float(max(h, w))) if max_edge > 0 else 1.0
    # ... (lines not shown in this diff)
    nw = max(1, int(round(w * s)))
    return nh, nw, s
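As a worked illustration of the two constraints in the docstring above (hand-computed, not output captured from the code): a 1080x1920 frame with the defaults max_edge=1024 and target_pixels=900000 is limited by the edge cap, since 1024/1920 is about 0.533 while the area constraint would allow roughly 0.659, so the working size comes out near 576x1024 (589,824 pixels, under the 900,000 budget).

    nh, nw, s = _compute_scaled_size(1080, 1920, max_edge=1024, target_pixels=900_000)
    # expected: roughly (576, 1024, 0.533)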
def _ladder(nh: int, nw: int) -> List[Tuple[int, int]]:
    """Progressive smaller sizes to retry on OOM or other failures."""
    sizes = [(nh, nw)]
    for f in (0.85, 0.70, 0.55, 0.40, 0.30):
        sizes.append((max(64, int(nh * f)), max(64, int(nw * f))))
    uniq, seen = [], set()
    for s in sizes:
        if s not in seen:
            uniq.append(s); seen.add(s)
    return uniq
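For example, a 756x1024 working size produces the following retry ladder (values worked out by hand from the factors above):

    _ladder(756, 1024)
    # -> [(756, 1024), (642, 870), (529, 716), (415, 563), (302, 409), (226, 307)]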
def _upsample_stack(masks: np.ndarray, out_hw: Tuple[int, int]) -> np.ndarray:
    """
    Input masks may be (N,H,W) or (N,1,H,W) or (H,W).
    Output is always (N, H_out, W_out) float32 in [0,1].
    """
    masks = np.asarray(masks)
    if masks.ndim == 2:
        masks = masks[None, ...]
    elif masks.ndim == 4 and masks.shape[1] == 1:
        masks = masks[:, 0, :, :]
    if masks.ndim != 3:
        # try best-effort squeeze
        masks = np.squeeze(masks)
        if masks.ndim == 2:
            masks = masks[None, ...]
    n, h, w = masks.shape
    H, W = out_hw
    if (h, w) == (H, W):
    # ... (lines not shown in this diff)
        out[i] = cv2.resize(masks[i].astype(np.float32), (W, H), interpolation=cv2.INTER_LINEAR)
    return np.clip(out, 0.0, 1.0)
def _normalize_masks_dtype(x: np.ndarray) -> np.ndarray:
    x = np.asarray(x)
    if x.dtype == np.uint8:
        return (x.astype(np.float32) / 255.0)
    return x.astype(np.float32, copy=False)
class _SAM2Adapter:
    """
    Thin guard around SAM2ImagePredictor that:
      - remembers original H,W
      - VRAM-downscales on set_image(); retries smaller on failure
      - upsamples masks to original H,W
      - uses torch.autocast(device_type="cuda", ...) when available
      - is thread-safe (a single predictor instance can serve concurrent calls)
    """
    def __init__(self, predictor, device: str):
        self.pred = predictor
        self.device = device

        # Original and working sizes
        self.orig_hw: Tuple[int, int] = (0, 0)
        self._current_rgb: Optional[np.ndarray] = None
        self._current_hw: Tuple[int, int] = (0, 0)

        # Tuning knobs via env
        self.max_edge = int(os.environ.get("SAM2_MAX_EDGE", "1024"))
        self.target_pixels = int(os.environ.get("SAM2_TARGET_PIXELS", "900000"))
        self.force_bgr_to_rgb = os.environ.get("SAM2_ASSUME_BGR", "0") == "1"

        self._lock = threading.Lock()

    # ------------------ public API ------------------

    def set_image(self, image: np.ndarray):
        """
        image: RGB or BGR; float [0..1] or uint8; HxWx{3,4}
        """
        with self._lock:
            rgb = _ensure_rgb_uint8(image, force_bgr_to_rgb=self.force_bgr_to_rgb)
            H, W = rgb.shape[:2]
            self.orig_hw = (H, W)

            nh, nw, s = _compute_scaled_size(H, W, self.max_edge, self.target_pixels)
            if s < 1.0:
                work = cv2.resize(rgb, (nw, nh), interpolation=cv2.INTER_AREA)
                # ... (lines not shown in this diff)
            else:
                self._current_rgb = rgb
                self._current_hw = (H, W)

            self.pred.set_image(self._current_rgb)
    def predict(self, **kwargs) -> Dict[str, Any]:
        """
        Calls the SAM2 predictor with your prompt args (points/boxes/etc).
        Returns: {"masks": (N,H,W) float32, "scores": (N,) float32, "logits"?: ...}
        On any failure path, returns a full-ones mask as a safe fallback.
        """
        with self._lock:
            if self._current_rgb is None or self.orig_hw == (0, 0):
                raise RuntimeError("SAM2Adapter.predict called before set_image()")

            H, W = self.orig_hw
            nh, nw = self._current_hw
            sizes = _ladder(nh, nw)
            last_exc: Optional[BaseException] = None

            for (th, tw) in sizes:
                try:
                    # Optionally re-set a smaller image
                    if (th, tw) != (nh, nw):
                        small = cv2.resize(self._current_rgb, (tw, th), interpolation=cv2.INTER_AREA)
                        self.pred.set_image(small)

                    # PyTorch 2.x autocast
                    class _NoOp:
                        def __enter__(self): return None
                        def __exit__(self, *a): return False

                    use_amp = (self.device == "cuda")
                    if use_amp:
                        amp_ctx = torch.autocast(
                            device_type="cuda",
                            dtype=(torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16)
                        )
                    else:
                        amp_ctx = _NoOp()

                    with torch.inference_mode():
                        with amp_ctx:
                            out = self.pred.predict(**kwargs)

                    # Normalize outputs
                    masks = None; scores = None; logits = None
                    if isinstance(out, dict):
                        masks = out.get("masks"); scores = out.get("scores"); logits = out.get("logits")
                    elif isinstance(out, (tuple, list)):
                        if len(out) >= 1: masks = out[0]
                        if len(out) >= 2: scores = out[1]
                        if len(out) >= 3: logits = out[2]
                    else:
                        masks = out

                    if masks is None:
                        raise RuntimeError("SAM2 returned no masks")

                    masks = _normalize_masks_dtype(masks)
                    masks_up = _upsample_stack(masks, (H, W))

                    if scores is None:
                        scores = np.ones((masks_up.shape[0],), dtype=np.float32) * 0.5
                    else:
                        scores = np.asarray(scores).astype(np.float32, copy=False).reshape(-1)

                    out_dict = {"masks": masks_up, "scores": scores}
                    if logits is not None:
                        lg = np.asarray(logits)
                        # Best-effort upsample if spatial
                        if lg.ndim == 3:
                            lg = _upsample_stack(lg, (H, W))
                        elif lg.ndim == 4 and lg.shape[1] == 1:
                            lg = _upsample_stack(lg[:, 0, :, :], (H, W))
                        out_dict["logits"] = lg.astype(np.float32, copy=False)

                    return out_dict

                except torch.cuda.OutOfMemoryError as e:
                    last_exc = e
                    if torch.cuda.is_available():
                        # ... (lines not shown in this diff)
                    logger.debug(traceback.format_exc())
                    logger.warning(f"SAM2 predict failed at {th}x{tw}; retrying smaller. {e}")
                    continue

            logger.warning(f"SAM2 calls failed; returning fallback mask. {last_exc}")
            return {
                "masks": np.ones((1, H, W), dtype=np.float32),
                "scores": np.array([0.5], dtype=np.float32),
            }
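Because predict() forwards its keyword arguments to the underlying SAM2 image predictor, point and box prompts pass straight through. A minimal sketch, assuming "adapter" is the object returned by SAM2Loader.load() and frame_rgb is a frame you already hold; the click coordinates are made up, and multimask_output is the standard SAM2ImagePredictor flag rather than something this module defines:

    import numpy as np

    adapter.set_image(frame_rgb)                                   # HxWx3, uint8 or float [0..1]
    out = adapter.predict(
        point_coords=np.array([[640, 360]], dtype=np.float32),    # one foreground click (x, y)
        point_labels=np.array([1], dtype=np.int32),                # 1 = positive, 0 = negative
        multimask_output=True,                                      # let SAM2 propose several masks
    )
    best = out["masks"][int(np.argmax(out["scores"]))]             # (H, W) float32 in [0, 1]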
class SAM2Loader:
    """Dedicated loader for SAM2 models (PyTorch 2.x, Spaces-friendly)."""

    def __init__(self, device: str = "cuda", cache_dir: str = "./checkpoints/sam2_cache"):
        self.device = _select_device(device)
        self.cache_dir = cache_dir
        os.makedirs(self.cache_dir, exist_ok=True)

        # Hugging Face Hub knobs for Spaces
        os.environ.setdefault("HF_HUB_DISABLE_SYMLINKS", "1")
        os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "0")

        self.model = None    # underlying SAM2ImagePredictor
        self.adapter = None  # _SAM2Adapter
        self.model_id = None
        self.load_time = 0.0

    def _determine_optimal_size(self) -> str:
        """Choose model size based on VRAM."""
        try:
            if torch.cuda.is_available():
                props = torch.cuda.get_device_properties(0)
                vram_gb = props.total_memory / (1024**3)
                if vram_gb < 4: return "tiny"
                if vram_gb < 8: return "small"
                if vram_gb < 12: return "base"
                return "large"
        except Exception:
            pass
        return "tiny"

    def load(self, model_size: str = "auto") -> Optional[_SAM2Adapter]:
        """
        model_size: "tiny" | "small" | "base" | "large" | "auto"
        Returns: thread-safe adapter or None
        """
        if model_size == "auto":
            model_size = self._determine_optimal_size()

        model_map = {
            "tiny": "facebook/sam2.1-hiera-tiny",
            "small": "facebook/sam2.1-hiera-small",
            # ... (lines not shown in this diff)
        }
        self.model_id = model_map.get(model_size, model_map["tiny"])
        logger.info(f"Loading SAM2 model: {self.model_id} (device={self.device})")

        for name, fn in (("official", self._load_official), ("fallback", self._load_fallback)):
            try:
                t0 = time.time()
                pred = fn()
                # ... (lines not shown in this diff)
            except Exception as e:
                logger.error(f"SAM2 {name} strategy failed: {e}")
                logger.debug(traceback.format_exc())

        logger.error("All SAM2 loading strategies failed")
        return None
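A short sketch of driving the loader directly, mirroring the smoke test at the bottom of the file (get_info() and cleanup() are the housekeeping methods shown further below; any get_info() keys beyond load_time and model_type are not shown in this diff):

    loader = SAM2Loader(device="cuda")
    adapter = loader.load(model_size="base")   # pin a size instead of "auto"
    if adapter is None:
        raise RuntimeError("SAM2 did not load; check the log for the failing strategy")

    print(loader.get_info())                   # includes load_time and model_type
    # ... adapter.set_image(...) / adapter.predict(...) ...
    loader.cleanup()                           # drop the adapter and release the model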
    # -------------- strategies --------------

    def _load_official(self):
        """Load SAM2ImagePredictor via its official API and move weights to device."""
        from sam2.sam2_image_predictor import SAM2ImagePredictor
        predictor = SAM2ImagePredictor.from_pretrained(
            self.model_id,
            # ... (lines not shown in this diff)
            local_files_only=False,
            trust_remote_code=True,
        )
        # Move the **model** to the device; DO NOT set predictor.device (read-only → error)
        if hasattr(predictor, "model"):
            predictor.model = predictor.model.to(self.device)
            predictor.model.eval()
        return predictor

    def _load_fallback(self):
        """Tiny local fallback that returns a full-ones mask — keeps pipeline alive."""
        class FallbackSAM2:
            def __init__(self, device):
                self.device = device
                # ... (lines not shown in this diff)
            def set_image(self, image):
                self._img = np.asarray(image)
            def predict(self, **kwargs):
                h, w = (self._img.shape[:2] if self._img is not None else (512, 512))
                return {
                    "masks": np.ones((1, h, w), dtype=np.float32),
                    "scores": np.array([0.5], dtype=np.float32),
                }
        logger.warning("Using fallback SAM2 (no real segmentation)")
        return FallbackSAM2(self.device)
    # -------------- housekeeping --------------

    def cleanup(self):
        self.adapter = None
        if self.model is not None:
            # ... (lines not shown in this diff)

    def get_info(self) -> Dict[str, Any]:
        # ... (lines not shown in this diff)
            "load_time": self.load_time,
            "model_type": type(self.model).__name__ if self.model else None,
        }
if __name__ == "__main__":
    import sys

    logging.basicConfig(level=logging.INFO)
    dev = "cuda" if torch.cuda.is_available() else "cpu"

    if len(sys.argv) < 2:
        print(f"Usage: {sys.argv[0]} image.jpg")
        raise SystemExit(1)

    path = sys.argv[1]
    img = cv2.imread(path, cv2.IMREAD_COLOR)
    if img is None:
        print(f"Could not load image {path}")
        raise SystemExit(2)

    loader = SAM2Loader(device=dev)
    sam = loader.load("auto")
    if not sam:
        print("Failed to load SAM2")
        raise SystemExit(3)

    sam.set_image(img)
    out = sam.predict(point_coords=None, point_labels=None)
    m = out["masks"]
    print("Masks:", m.shape, m.dtype, m.min(), m.max())
    cv2.imwrite("sam2_mask0.png", (np.clip(m[0], 0, 1) * 255).astype(np.uint8))
    print("Wrote sam2_mask0.png")