Zhen Ye Claude Opus 4.6 committed on
Commit
5aec47c
·
1 Parent(s): 64f68de

perf: GPU-resident tensor pipeline for SAM2 video propagation

Browse files

Eliminate all CUDA synchronization from propagate_segment() by keeping
masks, bboxes, and validity flags on GPU in pre-allocated buffers.
CPU materialization is deferred to a single bulk transfer via
SegmentOutput.to_object_dicts() at the consumer.

- Add _bbox_gpu() for zero-sync bounding box computation on GPU
- Add SegmentOutput dataclass for GPU-resident segment results
- Rewrite propagate_segment() with inline bbox + pre-allocated tensors
- Refactor process_video() to reuse propagate_segment()
- Update Phase 4 reconciliation: 3 CUDA syncs per segment vs 100+

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Files changed (2) hide show
  1. inference.py +12 -8
  2. models/segmenters/grounded_sam2.py +149 -117
inference.py CHANGED
@@ -1643,7 +1643,7 @@ def run_grounded_sam2_tracking(
1643
  from PIL import Image as PILImage
1644
 
1645
  from utils.video import extract_frames_to_jpeg_dir
1646
- from models.segmenters.grounded_sam2 import MaskDictionary, ObjectInfo
1647
 
1648
  active_segmenter = segmenter_name or "gsam2_large"
1649
  logging.info(
@@ -1816,11 +1816,11 @@ def run_grounded_sam2_tracking(
1816
  label_list=labels,
1817
  )
1818
 
1819
- segment_results = seg.propagate_segment(
1820
  state, start_idx, mask_dict, step,
1821
  )
1822
  seg_queue_out.put(
1823
- (seg_idx, start_idx, mask_dict, segment_results)
1824
  )
1825
  except RuntimeError as e:
1826
  if "cancelled" in str(e).lower():
@@ -1853,8 +1853,8 @@ def run_grounded_sam2_tracking(
1853
  # Collect all segment outputs
1854
  segment_data: Dict[int, Tuple] = {}
1855
  while not seg_queue_out.empty():
1856
- seg_idx, start_idx, mask_dict, results = seg_queue_out.get()
1857
- segment_data[seg_idx] = (start_idx, mask_dict, results)
1858
 
1859
  # Phase 4: Sequential ID reconciliation
1860
  if _perf_metrics is not None:
@@ -1865,12 +1865,13 @@ def run_grounded_sam2_tracking(
1865
  tracking_results: Dict[int, Dict[int, ObjectInfo]] = {}
1866
 
1867
  def _mask_to_cpu(mask):
 
1868
  if torch.is_tensor(mask):
1869
  return mask.detach().cpu()
1870
  return mask
1871
 
1872
  for seg_idx in sorted(segment_data.keys()):
1873
- start_idx, mask_dict, segment_results = segment_data[seg_idx]
1874
 
1875
  if mask_dict is None or not mask_dict.labels:
1876
  # No detections — carry forward previous masks
@@ -1882,7 +1883,7 @@ def run_grounded_sam2_tracking(
1882
  {
1883
  k: ObjectInfo(
1884
  instance_id=v.instance_id,
1885
- mask=_mask_to_cpu(v.mask),
1886
  class_name=v.class_name,
1887
  x1=v.x1, y1=v.y1,
1888
  x2=v.x2, y2=v.y2,
@@ -1914,6 +1915,9 @@ def run_grounded_sam2_tracking(
1914
  tracking_results[fi] = {}
1915
  continue
1916
 
 
 
 
1917
  # Apply remapping to every frame in this segment
1918
  for frame_idx, frame_objects in segment_results.items():
1919
  remapped: Dict[int, ObjectInfo] = {}
@@ -1923,7 +1927,7 @@ def run_grounded_sam2_tracking(
1923
  continue
1924
  remapped[global_id] = ObjectInfo(
1925
  instance_id=global_id,
1926
- mask=_mask_to_cpu(obj_info.mask),
1927
  class_name=obj_info.class_name,
1928
  x1=obj_info.x1, y1=obj_info.y1,
1929
  x2=obj_info.x2, y2=obj_info.y2,
 
1643
  from PIL import Image as PILImage
1644
 
1645
  from utils.video import extract_frames_to_jpeg_dir
1646
+ from models.segmenters.grounded_sam2 import MaskDictionary, ObjectInfo, SegmentOutput
1647
 
1648
  active_segmenter = segmenter_name or "gsam2_large"
1649
  logging.info(
 
1816
  label_list=labels,
1817
  )
1818
 
1819
+ segment_output = seg.propagate_segment(
1820
  state, start_idx, mask_dict, step,
1821
  )
1822
  seg_queue_out.put(
1823
+ (seg_idx, start_idx, mask_dict, segment_output)
1824
  )
1825
  except RuntimeError as e:
1826
  if "cancelled" in str(e).lower():
 
1853
  # Collect all segment outputs
1854
  segment_data: Dict[int, Tuple] = {}
1855
  while not seg_queue_out.empty():
1856
+ seg_idx, start_idx, mask_dict, segment_output = seg_queue_out.get()
1857
+ segment_data[seg_idx] = (start_idx, mask_dict, segment_output)
1858
 
1859
  # Phase 4: Sequential ID reconciliation
1860
  if _perf_metrics is not None:
 
1865
  tracking_results: Dict[int, Dict[int, ObjectInfo]] = {}
1866
 
1867
  def _mask_to_cpu(mask):
1868
+ """Normalize mask to CPU tensor (still used for keyframe mask_dict)."""
1869
  if torch.is_tensor(mask):
1870
  return mask.detach().cpu()
1871
  return mask
1872
 
1873
  for seg_idx in sorted(segment_data.keys()):
1874
+ start_idx, mask_dict, segment_output = segment_data[seg_idx]
1875
 
1876
  if mask_dict is None or not mask_dict.labels:
1877
  # No detections — carry forward previous masks
 
1883
  {
1884
  k: ObjectInfo(
1885
  instance_id=v.instance_id,
1886
+ mask=v.mask,
1887
  class_name=v.class_name,
1888
  x1=v.x1, y1=v.y1,
1889
  x2=v.x2, y2=v.y2,
 
1915
  tracking_results[fi] = {}
1916
  continue
1917
 
1918
+ # Bulk CPU transfer: 3 CUDA syncs total (was 100+ per-mask syncs)
1919
+ segment_results = segment_output.to_object_dicts()
1920
+
1921
  # Apply remapping to every frame in this segment
1922
  for frame_idx, frame_objects in segment_results.items():
1923
  remapped: Dict[int, ObjectInfo] = {}
 
1927
  continue
1928
  remapped[global_id] = ObjectInfo(
1929
  instance_id=global_id,
1930
+ mask=obj_info.mask,
1931
  class_name=obj_info.class_name,
1932
  x1=obj_info.x1, y1=obj_info.y1,
1933
  x2=obj_info.x2, y2=obj_info.y2,
models/segmenters/grounded_sam2.py CHANGED
@@ -13,7 +13,7 @@ import logging
13
  import time
14
  from contextlib import nullcontext
15
  from dataclasses import dataclass, field
16
- from typing import Any, Dict, List, Optional, Sequence, Tuple
17
 
18
  import numpy as np
19
  import torch
@@ -220,6 +220,72 @@ class MaskDictionary:
220
  return float((inter / union).item())
221
 
222
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
223
  # ---------------------------------------------------------------------------
224
  # SAM2 HuggingFace model IDs per size
225
  # ---------------------------------------------------------------------------
@@ -466,21 +532,11 @@ class GroundedSAM2Segmenter(Segmenter):
466
  start_idx: int,
467
  mask_dict: "MaskDictionary",
468
  step: int,
469
- ) -> Dict[int, Dict[int, "ObjectInfo"]]:
470
  """Propagate masks for a single segment via SAM2 video predictor.
471
 
472
- Calls ``reset_state`` first, making this safe to call independently
473
- (and therefore parallelisable across GPUs).
474
-
475
- Args:
476
- inference_state: SAM2 video predictor state (from ``init_state``).
477
- start_idx: Starting frame index for this segment.
478
- mask_dict: MaskDictionary with object masks for the keyframe.
479
- step: Maximum number of frames to propagate.
480
-
481
- Returns:
482
- Dict mapping ``frame_idx`` → ``{obj_id: ObjectInfo}`` using the
483
- IDs from *mask_dict* (local, not yet reconciled).
484
  """
485
  _pm = getattr(self, '_perf_metrics', None)
486
  if _pm is not None:
@@ -490,53 +546,72 @@ class GroundedSAM2Segmenter(Segmenter):
490
 
491
  for obj_id, obj_info in mask_dict.labels.items():
492
  self._video_predictor.add_new_mask(
493
- inference_state,
494
- start_idx,
495
- obj_id,
496
- obj_info.mask,
497
  )
498
 
499
- segment_results: Dict[int, Dict[int, ObjectInfo]] = {}
 
 
 
 
 
 
 
 
 
500
 
501
- # Phase A: Drain generator — GPU ops only, zero CUDA syncs
502
- raw_frames: list = []
503
  for out_frame_idx, out_obj_ids, out_mask_logits in self._video_predictor.propagate_in_video(
504
- inference_state,
505
- max_frame_num_to_track=step,
506
- start_frame_idx=start_idx,
507
  ):
508
- bool_masks = (out_mask_logits[:, 0] > 0.0) # (N_obj, H, W) bool, GPU
509
- raw_frames.append((out_frame_idx, list(out_obj_ids), bool_masks))
510
-
511
- # Phase B: Batched bbox + ObjectInfo construction — 2 CUDA syncs total
512
- if raw_frames:
513
- entries: list = []
514
- all_masks: list = []
515
- for frame_idx, obj_ids, bool_masks in raw_frames:
516
- for i, obj_id in enumerate(obj_ids):
517
- entries.append((frame_idx, obj_id, mask_dict.get_target_class_name(obj_id)))
518
- all_masks.append(bool_masks[i])
519
-
520
- if all_masks:
521
- stacked = torch.stack(all_masks)
522
- bboxes_cpu, valid_cpu = ObjectInfo.batch_bbox(stacked)
523
- del stacked
524
-
525
- bboxes_list = bboxes_cpu.tolist()
526
- valid_list = valid_cpu.tolist()
527
-
528
- for idx, (frame_idx, obj_id, class_name) in enumerate(entries):
529
- if valid_list[idx]:
530
- x1, y1, x2, y2 = int(bboxes_list[idx][0]), int(bboxes_list[idx][1]), int(bboxes_list[idx][2]), int(bboxes_list[idx][3])
531
- else:
532
- x1 = y1 = x2 = y2 = 0
533
- info = ObjectInfo(
534
- instance_id=obj_id,
535
- mask=all_masks[idx],
536
- class_name=class_name,
537
- x1=x1, y1=y1, x2=x2, y2=y2,
538
- )
539
- segment_results.setdefault(frame_idx, {})[obj_id] = info
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
540
 
541
  if _pm is not None:
542
  _pl = getattr(self, '_perf_lock', None)
@@ -546,7 +621,7 @@ class GroundedSAM2Segmenter(Segmenter):
546
  else:
547
  _pm["sam_video_total_ms"] += _d
548
 
549
- return segment_results
550
 
551
  # -- Video-level tracking interface -------------------------------------
552
 
@@ -721,66 +796,23 @@ class GroundedSAM2Segmenter(Segmenter):
721
  if _pm is not None:
722
  _t_sv = time.perf_counter()
723
 
724
- self._video_predictor.reset_state(inference_state)
725
-
726
- for obj_id, obj_info in mask_dict.labels.items():
727
- self._video_predictor.add_new_mask(
728
- inference_state,
729
- start_idx,
730
- obj_id,
731
- obj_info.mask,
732
- )
733
-
734
- # Phase A: Drain generator — GPU ops only, zero CUDA syncs
735
- raw_frames: list = []
736
- for out_frame_idx, out_obj_ids, out_mask_logits in self._video_predictor.propagate_in_video(
737
- inference_state,
738
- max_frame_num_to_track=step,
739
- start_frame_idx=start_idx,
740
- ):
741
- bool_masks = (out_mask_logits[:, 0] > 0.0) # (N_obj, H, W) bool, GPU
742
- raw_frames.append((out_frame_idx, list(out_obj_ids), bool_masks))
743
-
744
- # Phase B: Batched bbox + ObjectInfo construction — 2 CUDA syncs total
745
- if raw_frames:
746
- entries: list = []
747
- all_masks: list = []
748
- for frame_idx, obj_ids, bool_masks in raw_frames:
749
- for i, obj_id in enumerate(obj_ids):
750
- entries.append((frame_idx, obj_id, mask_dict.get_target_class_name(obj_id)))
751
- all_masks.append(bool_masks[i])
752
-
753
- if all_masks:
754
- stacked = torch.stack(all_masks)
755
- bboxes_cpu, valid_cpu = ObjectInfo.batch_bbox(stacked)
756
- del stacked
757
-
758
- bboxes_list = bboxes_cpu.tolist()
759
- valid_list = valid_cpu.tolist()
760
-
761
- for idx, (frame_idx, obj_id, class_name) in enumerate(entries):
762
- if valid_list[idx]:
763
- x1, y1, x2, y2 = int(bboxes_list[idx][0]), int(bboxes_list[idx][1]), int(bboxes_list[idx][2]), int(bboxes_list[idx][3])
764
- else:
765
- x1 = y1 = x2 = y2 = 0
766
- info = ObjectInfo(
767
- instance_id=obj_id,
768
- mask=all_masks[idx],
769
- class_name=class_name,
770
- x1=x1, y1=y1, x2=x2, y2=y2,
771
- )
772
- all_results.setdefault(frame_idx, {})[obj_id] = info
773
-
774
- # deepcopy ONLY the last frame (was running every frame before)
775
- last_frame_idx = raw_frames[-1][0]
776
- last_frame_objects = all_results.get(last_frame_idx, {})
777
- sam2_masks = MaskDictionary()
778
- sam2_masks.labels = copy.deepcopy(last_frame_objects)
779
- if last_frame_objects:
780
- first_info = next(iter(last_frame_objects.values()))
781
- if first_info.mask is not None:
782
- sam2_masks.mask_height = first_info.mask.shape[-2] if first_info.mask.ndim >= 2 else 0
783
- sam2_masks.mask_width = first_info.mask.shape[-1] if first_info.mask.ndim >= 2 else 0
784
 
785
  if _pm is not None:
786
  _pl = getattr(self, '_perf_lock', None)
 
13
  import time
14
  from contextlib import nullcontext
15
  from dataclasses import dataclass, field
16
+ from typing import Any, Dict, List, Optional, Sequence, Tuple, TYPE_CHECKING
17
 
18
  import numpy as np
19
  import torch
 
220
  return float((inter / union).item())
221
 
222
 
223
+ # ---------------------------------------------------------------------------
224
+ # GPU-resident bounding-box helper (zero CUDA syncs)
225
+ # ---------------------------------------------------------------------------
226
+
227
+ def _bbox_gpu(bool_masks: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
228
+ """Compute bboxes from (N, H, W) bool GPU masks. Returns GPU tensors, zero sync.
229
+
230
+ Returns:
231
+ bboxes: (N, 4) int64 on same device as input [x_min, y_min, x_max, y_max]
232
+ valid: (N,) bool on same device as input
233
+ """
234
+ N, H, W = bool_masks.shape
235
+ rows = bool_masks.any(dim=2) # (N, H)
236
+ cols = bool_masks.any(dim=1) # (N, W)
237
+ valid = rows.any(dim=1) # (N,)
238
+ rows_f = rows.float()
239
+ cols_f = cols.float()
240
+ bboxes = torch.stack([
241
+ cols_f.argmax(dim=1), # x_min
242
+ rows_f.argmax(dim=1), # y_min
243
+ W - 1 - cols_f.flip(1).argmax(dim=1), # x_max
244
+ H - 1 - rows_f.flip(1).argmax(dim=1), # y_max
245
+ ], dim=1).to(torch.int64) # (N, 4) int64
246
+ return bboxes, valid
247
+
248
+
249
+ # ---------------------------------------------------------------------------
250
+ # GPU-resident segment output (deferred CPU materialization)
251
+ # ---------------------------------------------------------------------------
252
+
253
+ @dataclass
254
+ class SegmentOutput:
255
+ """GPU-resident segment propagation result. Zero CUDA syncs to construct."""
256
+ masks: torch.Tensor # (count, H, W) bool on GPU
257
+ bboxes: torch.Tensor # (count, 4) int64 on GPU
258
+ valid: torch.Tensor # (count,) bool on GPU
259
+ frame_indices: List[int] # len == count
260
+ obj_ids: List[int] # len == count
261
+ class_names: List[str] # len == count
262
+ device: str = "cpu"
263
+
264
+ def to_object_dicts(self) -> Dict[int, Dict[int, "ObjectInfo"]]:
265
+ """Bulk CPU transfer + ObjectInfo construction. 3 CUDA syncs total."""
266
+ if self.masks.numel() == 0:
267
+ return {}
268
+ masks_cpu = self.masks.cpu() # sync 1
269
+ bboxes_cpu = self.bboxes.cpu() # sync 2
270
+ valid_cpu = self.valid.cpu() # sync 3
271
+ result: Dict[int, Dict[int, ObjectInfo]] = {}
272
+ for i in range(masks_cpu.shape[0]):
273
+ fi, oid, cn = self.frame_indices[i], self.obj_ids[i], self.class_names[i]
274
+ if valid_cpu[i]:
275
+ x1, y1, x2, y2 = int(bboxes_cpu[i, 0]), int(bboxes_cpu[i, 1]), int(bboxes_cpu[i, 2]), int(bboxes_cpu[i, 3])
276
+ else:
277
+ x1 = y1 = x2 = y2 = 0
278
+ info = ObjectInfo(
279
+ instance_id=oid, mask=masks_cpu[i],
280
+ class_name=cn, x1=x1, y1=y1, x2=x2, y2=y2,
281
+ )
282
+ result.setdefault(fi, {})[oid] = info
283
+ return result
284
+
285
+ def last_frame_idx(self) -> Optional[int]:
286
+ return self.frame_indices[-1] if self.frame_indices else None
287
+
288
+
289
  # ---------------------------------------------------------------------------
290
  # SAM2 HuggingFace model IDs per size
291
  # ---------------------------------------------------------------------------
 
532
  start_idx: int,
533
  mask_dict: "MaskDictionary",
534
  step: int,
535
+ ) -> "SegmentOutput":
536
  """Propagate masks for a single segment via SAM2 video predictor.
537
 
538
+ Returns a GPU-resident ``SegmentOutput`` with zero CUDA syncs.
539
+ Call ``output.to_object_dicts()`` to materialize CPU ObjectInfo dicts.
 
 
 
 
 
 
 
 
 
 
540
  """
541
  _pm = getattr(self, '_perf_metrics', None)
542
  if _pm is not None:
 
546
 
547
  for obj_id, obj_info in mask_dict.labels.items():
548
  self._video_predictor.add_new_mask(
549
+ inference_state, start_idx, obj_id, obj_info.mask,
 
 
 
550
  )
551
 
552
+ # Pre-compute class name lookup (avoid repeated dict access in loop)
553
+ obj_id_to_class = {oid: mask_dict.get_target_class_name(oid) for oid in mask_dict.labels}
554
+ n_obj = len(mask_dict.labels)
555
+
556
+ # Pre-allocated GPU buffers (allocated on first yield when H, W known)
557
+ masks_buf = bboxes_buf = valid_buf = None
558
+ frame_indices: List[int] = []
559
+ obj_ids_list: List[int] = []
560
+ class_names_list: List[str] = []
561
+ cursor = 0
562
 
 
 
563
  for out_frame_idx, out_obj_ids, out_mask_logits in self._video_predictor.propagate_in_video(
564
+ inference_state, max_frame_num_to_track=step, start_frame_idx=start_idx,
 
 
565
  ):
566
+ bool_masks = (out_mask_logits[:, 0] > 0.0) # (N, H, W) GPU async
567
+ n = bool_masks.shape[0]
568
+
569
+ # Allocate on first yield
570
+ if masks_buf is None:
571
+ H, W = bool_masks.shape[1], bool_masks.shape[2]
572
+ max_entries = step * max(n_obj, n)
573
+ masks_buf = torch.empty(max_entries, H, W, dtype=torch.bool, device=self.device)
574
+ bboxes_buf = torch.empty(max_entries, 4, dtype=torch.int64, device=self.device)
575
+ valid_buf = torch.empty(max_entries, dtype=torch.bool, device=self.device)
576
+
577
+ # Grow buffers if needed (unlikely but safe)
578
+ if cursor + n > masks_buf.shape[0]:
579
+ grow = max(step * n_obj, cursor + n - masks_buf.shape[0])
580
+ H, W = masks_buf.shape[1], masks_buf.shape[2]
581
+ masks_buf = torch.cat([masks_buf, torch.empty(grow, H, W, dtype=torch.bool, device=self.device)])
582
+ bboxes_buf = torch.cat([bboxes_buf, torch.empty(grow, 4, dtype=torch.int64, device=self.device)])
583
+ valid_buf = torch.cat([valid_buf, torch.empty(grow, dtype=torch.bool, device=self.device)])
584
+
585
+ # Inline bbox — GPU async, zero sync
586
+ frame_bboxes, frame_valid = _bbox_gpu(bool_masks)
587
+
588
+ # Fill pre-allocated slices GPU async
589
+ masks_buf[cursor:cursor + n] = bool_masks
590
+ bboxes_buf[cursor:cursor + n] = frame_bboxes
591
+ valid_buf[cursor:cursor + n] = frame_valid
592
+
593
+ # Metadata (trivial Python, ~2μs GIL)
594
+ oid_list = list(out_obj_ids) if not isinstance(out_obj_ids, list) else out_obj_ids
595
+ for oid in oid_list:
596
+ frame_indices.append(out_frame_idx)
597
+ obj_ids_list.append(oid)
598
+ class_names_list.append(obj_id_to_class.get(oid, ""))
599
+ cursor += n
600
+
601
+ # Build output (zero-copy slice if under-filled, empty tensors if no frames)
602
+ if masks_buf is not None:
603
+ output = SegmentOutput(
604
+ masks=masks_buf[:cursor], bboxes=bboxes_buf[:cursor],
605
+ valid=valid_buf[:cursor], frame_indices=frame_indices,
606
+ obj_ids=obj_ids_list, class_names=class_names_list, device=self.device,
607
+ )
608
+ else:
609
+ output = SegmentOutput(
610
+ masks=torch.empty(0, 0, 0, dtype=torch.bool, device=self.device),
611
+ bboxes=torch.empty(0, 4, dtype=torch.int64, device=self.device),
612
+ valid=torch.empty(0, dtype=torch.bool, device=self.device),
613
+ frame_indices=[], obj_ids=[], class_names=[], device=self.device,
614
+ )
615
 
616
  if _pm is not None:
617
  _pl = getattr(self, '_perf_lock', None)
 
621
  else:
622
  _pm["sam_video_total_ms"] += _d
623
 
624
+ return output
625
 
626
  # -- Video-level tracking interface -------------------------------------
627
 
 
796
  if _pm is not None:
797
  _t_sv = time.perf_counter()
798
 
799
+ segment_output = self.propagate_segment(
800
+ inference_state, start_idx, mask_dict, step,
801
+ )
802
+ segment_results = segment_output.to_object_dicts()
803
+
804
+ if segment_results:
805
+ all_results.update(segment_results)
806
+ last_fi = segment_output.last_frame_idx()
807
+ if last_fi is not None:
808
+ last_frame_objects = all_results.get(last_fi, {})
809
+ sam2_masks = MaskDictionary()
810
+ sam2_masks.labels = copy.deepcopy(last_frame_objects)
811
+ if last_frame_objects:
812
+ first_info = next(iter(last_frame_objects.values()))
813
+ if first_info.mask is not None:
814
+ sam2_masks.mask_height = first_info.mask.shape[-2] if first_info.mask.ndim >= 2 else 0
815
+ sam2_masks.mask_width = first_info.mask.shape[-1] if first_info.mask.ndim >= 2 else 0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
816
 
817
  if _pm is not None:
818
  _pl = getattr(self, '_perf_lock', None)