Spaces:
Paused
Paused
Zhen Ye Claude Opus 4.6 committed on
Commit ·
5749bd6
1
Parent(s): 10eb3c6
revert: remove torch.compile — runtime failures on HF Space
Browse files
Reverts all torch.compile changes (b17bd6d..10eb3c6). The HF Space
container lacks Triton/inductor support, causing 500 errors. The
GPU-resident tensor pipeline from 5aec47c is retained.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
models/segmenters/grounded_sam2.py
CHANGED
|
@@ -363,16 +363,9 @@ class GroundedSAM2Segmenter(Segmenter):
|
|
| 363 |
|
| 364 |
# Video predictor (for process_video)
|
| 365 |
self._video_predictor = build_sam2_video_predictor_hf(
|
| 366 |
-
hf_id, device=self.device
|
| 367 |
)
|
| 368 |
|
| 369 |
-
# torch.compile individual components for fused kernels.
|
| 370 |
-
# memory_attention uses dynamic=True (variable memory token count).
|
| 371 |
-
# Wrapped in try/except: falls back to eager if PyTorch < 2.5 or
|
| 372 |
-
# CUDA compiler not available.
|
| 373 |
-
if self.device.startswith("cuda"):
|
| 374 |
-
self._apply_torch_compile()
|
| 375 |
-
|
| 376 |
# Image predictor (for single-frame predict)
|
| 377 |
sam2_image_model = build_sam2_hf(hf_id, device=self.device)
|
| 378 |
self._image_predictor = SAM2ImagePredictor(sam2_image_model)
|
|
@@ -385,52 +378,6 @@ class GroundedSAM2Segmenter(Segmenter):
|
|
| 385 |
self._models_loaded = True
|
| 386 |
logging.info("Grounded-SAM-2 models loaded successfully.")
|
| 387 |
|
| 388 |
-
def _apply_torch_compile(self):
|
| 389 |
-
"""Compile SAM2 sub-modules with torch.compile (max-autotune).
|
| 390 |
-
|
| 391 |
-
Compiles 5 components matching Facebook's official VOS recipe.
|
| 392 |
-
torch.compile wraps succeed immediately; actual Triton/inductor
|
| 393 |
-
compilation happens lazily on first forward pass. We store
|
| 394 |
-
original forwards so propagate_segment can fall back on error.
|
| 395 |
-
"""
|
| 396 |
-
vp = self._video_predictor
|
| 397 |
-
components = [
|
| 398 |
-
("image_encoder", dict(mode="max-autotune", dynamic=False)),
|
| 399 |
-
("memory_encoder", dict(mode="max-autotune", dynamic=False)),
|
| 400 |
-
("memory_attention", dict(mode="max-autotune", dynamic=True)),
|
| 401 |
-
("sam_prompt_encoder", dict(mode="max-autotune", dynamic=False)),
|
| 402 |
-
("sam_mask_decoder", dict(mode="max-autotune", dynamic=False)),
|
| 403 |
-
]
|
| 404 |
-
self._original_forwards: Dict[str, Any] = {}
|
| 405 |
-
compiled = []
|
| 406 |
-
for attr, kwargs in components:
|
| 407 |
-
module = getattr(vp, attr, None)
|
| 408 |
-
if module is None:
|
| 409 |
-
continue
|
| 410 |
-
try:
|
| 411 |
-
self._original_forwards[attr] = module.forward
|
| 412 |
-
module.forward = torch.compile(module.forward, **kwargs)
|
| 413 |
-
compiled.append(attr)
|
| 414 |
-
except Exception as e:
|
| 415 |
-
logging.warning("torch.compile wrapping failed for %s: %s", attr, e)
|
| 416 |
-
if compiled:
|
| 417 |
-
logging.info("torch.compile applied to: %s", ", ".join(compiled))
|
| 418 |
-
self._torch_compiled = True
|
| 419 |
-
else:
|
| 420 |
-
logging.info("torch.compile not available, using eager mode.")
|
| 421 |
-
self._torch_compiled = False
|
| 422 |
-
|
| 423 |
-
def _revert_torch_compile(self):
|
| 424 |
-
"""Revert compiled forwards back to eager originals."""
|
| 425 |
-
vp = self._video_predictor
|
| 426 |
-
for attr, orig_fwd in self._original_forwards.items():
|
| 427 |
-
module = getattr(vp, attr, None)
|
| 428 |
-
if module is not None:
|
| 429 |
-
module.forward = orig_fwd
|
| 430 |
-
self._original_forwards.clear()
|
| 431 |
-
self._torch_compiled = False
|
| 432 |
-
logging.warning("Reverted torch.compile — falling back to eager mode.")
|
| 433 |
-
|
| 434 |
# -- Single-frame interface (Segmenter.predict) -------------------------
|
| 435 |
|
| 436 |
def predict(
|
|
@@ -613,32 +560,9 @@ class GroundedSAM2Segmenter(Segmenter):
|
|
| 613 |
class_names_list: List[str] = []
|
| 614 |
cursor = 0
|
| 615 |
|
| 616 |
-
|
| 617 |
-
# If inductor/triton fails, revert to eager and restart propagation.
|
| 618 |
-
_generator = self._video_predictor.propagate_in_video(
|
| 619 |
inference_state, max_frame_num_to_track=step, start_frame_idx=start_idx,
|
| 620 |
-
)
|
| 621 |
-
if getattr(self, '_torch_compiled', False) and not getattr(self, '_compile_verified', False):
|
| 622 |
-
try:
|
| 623 |
-
_first = next(_generator)
|
| 624 |
-
except Exception as e:
|
| 625 |
-
logging.warning("torch.compile runtime error, reverting to eager: %s", e)
|
| 626 |
-
self._revert_torch_compile()
|
| 627 |
-
# Re-init propagation with eager forwards
|
| 628 |
-
self._video_predictor.reset_state(inference_state)
|
| 629 |
-
for obj_id, obj_info in mask_dict.labels.items():
|
| 630 |
-
self._video_predictor.add_new_mask(
|
| 631 |
-
inference_state, start_idx, obj_id, obj_info.mask,
|
| 632 |
-
)
|
| 633 |
-
_generator = self._video_predictor.propagate_in_video(
|
| 634 |
-
inference_state, max_frame_num_to_track=step, start_frame_idx=start_idx,
|
| 635 |
-
)
|
| 636 |
-
_first = next(_generator)
|
| 637 |
-
import itertools
|
| 638 |
-
self._compile_verified = True
|
| 639 |
-
_generator = itertools.chain([_first], _generator)
|
| 640 |
-
|
| 641 |
-
for out_frame_idx, out_obj_ids, out_mask_logits in _generator:
|
| 642 |
bool_masks = (out_mask_logits[:, 0] > 0.0) # (N, H, W) GPU async
|
| 643 |
n = bool_masks.shape[0]
|
| 644 |
|
|
|
|
| 363 |
|
| 364 |
# Video predictor (for process_video)
|
| 365 |
self._video_predictor = build_sam2_video_predictor_hf(
|
| 366 |
+
hf_id, device=self.device
|
| 367 |
)
|
| 368 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 369 |
# Image predictor (for single-frame predict)
|
| 370 |
sam2_image_model = build_sam2_hf(hf_id, device=self.device)
|
| 371 |
self._image_predictor = SAM2ImagePredictor(sam2_image_model)
|
|
|
|
| 378 |
self._models_loaded = True
|
| 379 |
logging.info("Grounded-SAM-2 models loaded successfully.")
|
| 380 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 381 |
# -- Single-frame interface (Segmenter.predict) -------------------------
|
| 382 |
|
| 383 |
def predict(
|
|
|
|
| 560 |
class_names_list: List[str] = []
|
| 561 |
cursor = 0
|
| 562 |
|
| 563 |
+
for out_frame_idx, out_obj_ids, out_mask_logits in self._video_predictor.propagate_in_video(
|
|
|
|
|
|
|
| 564 |
inference_state, max_frame_num_to_track=step, start_frame_idx=start_idx,
|
| 565 |
+
):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 566 |
bool_masks = (out_mask_logits[:, 0] > 0.0) # (N, H, W) GPU async
|
| 567 |
n = bool_masks.shape[0]
|
| 568 |
|