Zhen Ye committed
Commit 032b60f · 1 Parent(s): 3fde4e4
Refine Grounded-SAM2 tracking behavior and video frame handling
Files changed:
- inference.py +4 -3
- models/segmenters/grounded_sam2.py +82 -78
- utils/video.py +1 -1
inference.py
CHANGED

@@ -1641,8 +1641,8 @@ def run_grounded_sam2_tracking(
         frame_path = _os.path.join(frame_dir, frame_names[frame_idx])
         frame = cv2.imread(frame_path)
         if frame is None:
-            logging.warning("Failed to read frame %d,
+            logging.warning("Failed to read frame %d, writing blank", frame_idx)
+            frame = np.zeros((height, width, 3), dtype=np.uint8)

         frame_objects = tracking_results.get(frame_idx, {})

@@ -1671,7 +1671,8 @@ def run_grounded_sam2_tracking(
             label = f"{obj_info.instance_id} {obj_info.class_name}"
             label_list.append(label)

-            boxes_list.append([obj_info.x1, obj_info.y1, obj_info.x2, obj_info.y2])
+            has_box = not (obj_info.x1 == 0 and obj_info.y1 == 0 and obj_info.x2 == 0 and obj_info.y2 == 0)
+            if has_box:
+                boxes_list.append([obj_info.x1, obj_info.y1, obj_info.x2, obj_info.y2])

             # Draw masks
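The box guard added above keys off an all-zeros sentinel. A minimal sketch of that logic, assuming (as the diff implies) that ObjectInfo zero-initializes its corner coordinates when SAM2 propagates a mask for which no detection box exists:

from dataclasses import dataclass

@dataclass
class ObjectInfo:  # coordinate fields as in the diff; other fields omitted
    x1: float = 0.0
    y1: float = 0.0
    x2: float = 0.0
    y2: float = 0.0

def has_box(obj: ObjectInfo) -> bool:
    # (0, 0, 0, 0) is the "no box" sentinel; any other corner combination counts
    return not (obj.x1 == 0 and obj.y1 == 0 and obj.x2 == 0 and obj.y2 == 0)

assert not has_box(ObjectInfo())                      # sentinel: skip drawing
assert has_box(ObjectInfo(10.0, 20.0, 110.0, 220.0))  # real box: draw it

This keeps degenerate rectangles out of the box overlay while the object's mask is still drawn.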
models/segmenters/grounded_sam2.py
CHANGED

@@ -10,6 +10,7 @@ Reference implementation:

 import copy
 import logging
+from contextlib import nullcontext
 from dataclasses import dataclass, field
 from typing import Any, Dict, List, Optional, Sequence, Tuple

@@ -84,7 +85,7 @@ class MaskDictionary:
     def update_masks(
         self,
         tracking_dict: "MaskDictionary",
-        iou_threshold: float = 0.
+        iou_threshold: float = 0.5,
         objects_count: int = 0,
     ) -> int:
         """Match current detections against tracked objects via IoU."""
@@ -156,7 +157,7 @@ class GroundedSAM2Segmenter(Segmenter):
         model_size: str = "large",
         device: Optional[str] = None,
         step: int = 20,
-        iou_threshold: float = 0.
+        iou_threshold: float = 0.5,
     ):
         self.model_size = model_size
         self.step = step
@@ -240,7 +241,9 @@ class GroundedSAM2Segmenter(Segmenter):
         import cv2 as _cv2
         frame_rgb = _cv2.cvtColor(frame, _cv2.COLOR_BGR2RGB)

+        device_type = self.device.split(":")[0]
+        autocast_ctx = torch.autocast(device_type=device_type, dtype=torch.bfloat16) if device_type == "cuda" else nullcontext()
+        with autocast_ctx:
             self._image_predictor.set_image(frame_rgb)
             input_boxes = torch.tensor(det.boxes, device=self.device, dtype=torch.float32)
             masks, scores, _ = self._image_predictor.predict(
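This hunk and the tracking-loop hunk below share one pattern: enter bfloat16 autocast only when the device is CUDA, and fall back to a no-op context elsewhere, since bfloat16 autocast is a CUDA-oriented optimization here. A self-contained sketch of the pattern (the helper name autocast_for is illustrative, not from the repo):

from contextlib import nullcontext

import torch

def autocast_for(device: str):
    device_type = device.split(":")[0]  # "cuda:0" -> "cuda"
    if device_type == "cuda":
        return torch.autocast(device_type="cuda", dtype=torch.bfloat16)
    return nullcontext()  # CPU/MPS: run in default precision

with autocast_for("cpu"):
    y = torch.randn(2, 3) @ torch.randn(3, 2)  # default precision, no autocast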
@@ -311,68 +314,70 @@ class GroundedSAM2Segmenter(Segmenter):
             total_frames, step, text_prompts,
         )

-        inference_state = self._video_predictor.init_state(
-            video_path=frame_dir,
-            offload_video_to_cpu=True,
-            async_loading_frames=True,
-        )
+        # Single global autocast context (matches reference implementation)
+        device_type = device.split(":")[0]
+        autocast_ctx = torch.autocast(device_type=device_type, dtype=torch.bfloat16) if device_type == "cuda" else nullcontext()

         sam2_masks = MaskDictionary()
         objects_count = 0
         all_results: Dict[int, Dict[int, ObjectInfo]] = {}

-        for start_idx in range(0, total_frames, step):
-            logging.info("Processing keyframe %d / %d", start_idx, total_frames)
-
-            img_path = os.path.join(frame_dir, frame_names[start_idx])
-            image = Image.open(img_path).convert("RGB")
-
-            mask_dict = MaskDictionary()
-
-            inputs = gdino_processor(
-                images=image, text=prompt, return_tensors="pt"
-            )
-            inputs = {k: v.to(device) for k, v in inputs.items()}
-
-            with torch.no_grad():
-                outputs = gdino_model(**inputs)
-
-            results = gdino_processor.post_process_grounded_object_detection(
-                outputs,
-                inputs["input_ids"],
-                threshold=0.25,
-                text_threshold=0.25,
-                target_sizes=[image.size[::-1]],
-            )
-
-            if input_boxes.shape[0] == 0:
-                logging.info("No detections on keyframe %d, propagating previous masks", start_idx)
-                # Fill empty results for this segment
-                for fi in range(start_idx, min(start_idx + step, total_frames)):
-                    if fi not in all_results:
-                        # Carry forward last known masks
-                        all_results[fi] = {
-                            k: ObjectInfo(
-                                instance_id=v.instance_id,
-                                mask=v.mask,
-                                class_name=v.class_name,
-                                x1=v.x1, y1=v.y1, x2=v.x2, y2=v.y2,
-                            )
-                            for k, v in sam2_masks.labels.items()
-                        } if sam2_masks.labels else {}
-                continue
-
+        with autocast_ctx:
+            # Init SAM2 video predictor state
+            inference_state = self._video_predictor.init_state(
+                video_path=frame_dir,
+                offload_video_to_cpu=True,
+                async_loading_frames=True,
+            )
+
+            for start_idx in range(0, total_frames, step):
+                logging.info("Processing keyframe %d / %d", start_idx, total_frames)
+
+                img_path = os.path.join(frame_dir, frame_names[start_idx])
+                image = Image.open(img_path).convert("RGB")
+
+                mask_dict = MaskDictionary()
+
+                # -- Grounding DINO detection on keyframe --
+                inputs = gdino_processor(
+                    images=image, text=prompt, return_tensors="pt"
+                )
+                inputs = {k: v.to(device) for k, v in inputs.items()}
+
+                with torch.no_grad():
+                    outputs = gdino_model(**inputs)
+
+                # Use GDINO detector's _post_process for transformers version compat
+                results = self._gdino_detector._post_process(
+                    outputs,
+                    inputs["input_ids"],
+                    target_sizes=[image.size[::-1]],
+                )
+
+                input_boxes = results[0]["boxes"]
+                det_labels = results[0].get("text_labels") or results[0].get("labels", [])
+                if torch.is_tensor(det_labels):
+                    det_labels = det_labels.detach().cpu().tolist()
+                det_labels = [str(l) for l in det_labels]
+
+                if input_boxes.shape[0] == 0:
+                    logging.info("No detections on keyframe %d, propagating previous masks", start_idx)
+                    # Fill empty results for this segment
+                    for fi in range(start_idx, min(start_idx + step, total_frames)):
+                        if fi not in all_results:
+                            # Carry forward last known masks
+                            all_results[fi] = {
+                                k: ObjectInfo(
+                                    instance_id=v.instance_id,
+                                    mask=v.mask,
+                                    class_name=v.class_name,
+                                    x1=v.x1, y1=v.y1, x2=v.x2, y2=v.y2,
+                                )
+                                for k, v in sam2_masks.labels.items()
+                            } if sam2_masks.labels else {}
+                    continue
+
+                # -- SAM2 image predictor on keyframe --
                 self._image_predictor.set_image(np.array(image))
                 masks, scores, logits = self._image_predictor.predict(
                     point_coords=None,
@@ -381,34 +386,33 @@ class GroundedSAM2Segmenter(Segmenter):
                     multimask_output=False,
                 )

-            with torch.autocast(device_type=device.split(":")[0], dtype=torch.bfloat16):
+                # Normalize mask dims
+                if masks.ndim == 2:
+                    masks = masks[None]
+                    scores = scores[None]
+                    logits = logits[None]
+                elif masks.ndim == 4:
+                    masks = masks.squeeze(1)
+
+                mask_dict.add_new_frame_annotation(
+                    mask_list=torch.tensor(masks).to(device),
+                    box_list=input_boxes.clone() if torch.is_tensor(input_boxes) else torch.tensor(input_boxes),
+                    label_list=det_labels,
+                )
+
+                # -- IoU matching to maintain persistent IDs --
+                objects_count = mask_dict.update_masks(
+                    tracking_dict=sam2_masks,
+                    iou_threshold=self.iou_threshold,
+                    objects_count=objects_count,
+                )
+
+                if len(mask_dict.labels) == 0:
+                    for fi in range(start_idx, min(start_idx + step, total_frames)):
+                        all_results[fi] = {}
+                    continue
+
+                # -- SAM2 video predictor: propagate masks --
                 self._video_predictor.reset_state(inference_state)

                 for obj_id, obj_info in mask_dict.labels.items():
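update_masks (its signature appears earlier in this file's diff) matches each keyframe detection to existing tracks by mask IoU so instance IDs stay stable across keyframes; its body is outside this diff. A hedged sketch of the usual matching rule, with hypothetical helper names (mask_iou, match_or_assign) and boolean torch masks:

import torch

def mask_iou(a: torch.Tensor, b: torch.Tensor) -> float:
    # a, b: HxW boolean masks
    union = (a | b).sum().item()
    return (a & b).sum().item() / union if union else 0.0

def match_or_assign(new_mask, tracks, iou_threshold=0.5, objects_count=0):
    # Reuse the ID of the best-overlapping track above the threshold;
    # otherwise mint a fresh instance ID and bump the counter.
    best_id, best_iou = None, 0.0
    for obj_id, old_mask in tracks.items():
        iou = mask_iou(new_mask, old_mask)
        if iou > best_iou:
            best_id, best_iou = obj_id, iou
    if best_id is not None and best_iou >= iou_threshold:
        return best_id, objects_count
    return objects_count + 1, objects_count + 1

m = torch.zeros(4, 4, dtype=torch.bool)
m[:2] = True
assert match_or_assign(m, {7: m.clone()}, 0.5, objects_count=7) == (7, 7)

In the diff this bookkeeping lives inside MaskDictionary, which is why update_masks both takes and returns objects_count.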
utils/video.py
CHANGED

@@ -43,7 +43,7 @@ def extract_frames_to_jpeg_dir(
         if not success:
             break
         fname = f"{idx:06d}.jpg"
-        cv2.imwrite(os.path.join(output_dir, fname), frame)
+        cv2.imwrite(os.path.join(output_dir, fname), frame, [cv2.IMWRITE_JPEG_QUALITY, 100])
         frame_names.append(fname)
         idx += 1
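The only change here is the explicit quality flag: cv2.imwrite accepts an optional flat [flag, value, ...] list, and IMWRITE_JPEG_QUALITY runs 0-100 with an OpenCV default of 95. Pinning it to 100 trades larger files for fewer compression artifacts in the extracted frames, which matters because SAM2's video predictor re-reads these JPEGs as its frame source. For example:

import cv2
import numpy as np

frame = np.zeros((48, 64, 3), dtype=np.uint8)  # stand-in BGR frame
cv2.imwrite("000000.jpg", frame, [cv2.IMWRITE_JPEG_QUALITY, 100])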