Spaces:
Paused
Paused
Zhen Ye Claude Opus 4.6 committed on
Commit ·
63684e4
1
Parent(s): b17bd6d
fix: manual torch.compile with graceful fallback for SAM2 components
Browse files
Replace vos_optimized=True (caused 500 error) with manual per-component
torch.compile matching Facebook's official recipe. Compiles image_encoder,
memory_encoder, memory_attention (dynamic=True), sam_prompt_encoder, and
sam_mask_decoder with max-autotune. Falls back to eager mode silently if
PyTorch version or CUDA compiler is unavailable.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
models/segmenters/grounded_sam2.py
CHANGED
|
@@ -362,13 +362,17 @@ class GroundedSAM2Segmenter(Segmenter):
|
|
| 362 |
from sam2.sam2_image_predictor import SAM2ImagePredictor
|
| 363 |
|
| 364 |
# Video predictor (for process_video)
|
| 365 |
-
# vos_optimized=True enables SAM2VideoPredictorVOS which compiles
|
| 366 |
-
# image_encoder, memory_encoder, memory_attention, sam_prompt_encoder,
|
| 367 |
-
# and sam_mask_decoder with torch.compile(mode="max-autotune").
|
| 368 |
self._video_predictor = build_sam2_video_predictor_hf(
|
| 369 |
-
hf_id, device=self.device,
|
| 370 |
)
|
| 371 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 372 |
# Image predictor (for single-frame predict)
|
| 373 |
sam2_image_model = build_sam2_hf(hf_id, device=self.device)
|
| 374 |
self._image_predictor = SAM2ImagePredictor(sam2_image_model)
|
|
@@ -381,6 +385,35 @@ class GroundedSAM2Segmenter(Segmenter):
|
|
| 381 |
self._models_loaded = True
|
| 382 |
logging.info("Grounded-SAM-2 models loaded successfully.")
|
| 383 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 384 |
# -- Single-frame interface (Segmenter.predict) -------------------------
|
| 385 |
|
| 386 |
def predict(
|
|
|
|
| 362 |
from sam2.sam2_image_predictor import SAM2ImagePredictor
|
| 363 |
|
| 364 |
# Video predictor (for process_video)
|
|
|
|
|
|
|
|
|
|
| 365 |
self._video_predictor = build_sam2_video_predictor_hf(
|
| 366 |
+
hf_id, device=self.device,
|
| 367 |
)
|
| 368 |
|
| 369 |
+
# torch.compile individual components for fused kernels.
|
| 370 |
+
# memory_attention uses dynamic=True (variable memory token count).
|
| 371 |
+
# Wrapped in try/except: falls back to eager if PyTorch < 2.5 or
|
| 372 |
+
# CUDA compiler not available.
|
| 373 |
+
if self.device.startswith("cuda"):
|
| 374 |
+
self._apply_torch_compile()
|
| 375 |
+
|
| 376 |
# Image predictor (for single-frame predict)
|
| 377 |
sam2_image_model = build_sam2_hf(hf_id, device=self.device)
|
| 378 |
self._image_predictor = SAM2ImagePredictor(sam2_image_model)
|
|
|
|
| 385 |
self._models_loaded = True
|
| 386 |
logging.info("Grounded-SAM-2 models loaded successfully.")
|
| 387 |
|
| 388 |
+
def _apply_torch_compile(self):
|
| 389 |
+
"""Compile SAM2 sub-modules with torch.compile (max-autotune).
|
| 390 |
+
|
| 391 |
+
Compiles 5 components matching Facebook's official VOS recipe.
|
| 392 |
+
Falls back silently to eager mode on any compilation error.
|
| 393 |
+
"""
|
| 394 |
+
vp = self._video_predictor
|
| 395 |
+
components = [
|
| 396 |
+
("image_encoder", dict(mode="max-autotune", fullgraph=True, dynamic=False)),
|
| 397 |
+
("memory_encoder", dict(mode="max-autotune", fullgraph=True, dynamic=False)),
|
| 398 |
+
("memory_attention", dict(mode="max-autotune", fullgraph=True, dynamic=True)),
|
| 399 |
+
("sam_prompt_encoder", dict(mode="max-autotune", fullgraph=True, dynamic=False)),
|
| 400 |
+
("sam_mask_decoder", dict(mode="max-autotune", fullgraph=True, dynamic=False)),
|
| 401 |
+
]
|
| 402 |
+
compiled = []
|
| 403 |
+
for attr, kwargs in components:
|
| 404 |
+
module = getattr(vp, attr, None)
|
| 405 |
+
if module is None:
|
| 406 |
+
continue
|
| 407 |
+
try:
|
| 408 |
+
module.forward = torch.compile(module.forward, **kwargs)
|
| 409 |
+
compiled.append(attr)
|
| 410 |
+
except Exception as e:
|
| 411 |
+
logging.warning("torch.compile failed for %s: %s", attr, e)
|
| 412 |
+
if compiled:
|
| 413 |
+
logging.info("torch.compile applied to: %s", ", ".join(compiled))
|
| 414 |
+
else:
|
| 415 |
+
logging.info("torch.compile not available, using eager mode.")
|
| 416 |
+
|
| 417 |
# -- Single-frame interface (Segmenter.predict) -------------------------
|
| 418 |
|
| 419 |
def predict(
|