Spaces:

BiasLab2025
/

perception

Sleeping

App Files Files Community

Zhen Ye committed 4 days ago

Commit

3fde4e4

1 Parent(s): 04d4562

Remove SAM3 and standardize segmentation on Grounded-SAM2

Browse files

Files changed (13) hide show

Dockerfile +1 -1
app.py +5 -5
frontend/index.html +4 -2
frontend/js/LaserPerception_original.js +4 -2
frontend/js/main.js +22 -3
inference.py +129 -1
jobs/background.py +2 -2
models/segmenters/__init__.py +2 -2
models/segmenters/grounded_sam2.py +452 -0
models/segmenters/model_loader.py +7 -8
models/segmenters/sam3.py +0 -284
requirements.txt +3 -0
utils/video.py +45 -0

Dockerfile CHANGED Viewed

@@ -19,7 +19,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     && rm -rf /var/lib/apt/lists/* \
     && pip install --no-cache-dir --upgrade pip \
     && pip install --no-cache-dir -r requirements.txt \
-    && python -c "import transformers; print('transformers', transformers.__version__); print('has Sam3Model', hasattr(transformers, 'Sam3Model'))"
 COPY --chown=user . .

     && rm -rf /var/lib/apt/lists/* \
     && pip install --no-cache-dir --upgrade pip \
     && pip install --no-cache-dir -r requirements.txt \
+    && python -c "import transformers; print('transformers', transformers.__version__); print('has Sam2Model', hasattr(transformers, 'Sam2Model'))"
 COPY --chown=user . .

app.py CHANGED Viewed

@@ -41,7 +41,7 @@ from fastapi.responses import FileResponse, HTMLResponse, JSONResponse, Redirect
 from fastapi.staticfiles import StaticFiles
 import uvicorn
-from inference import process_first_frame, run_inference, run_segmentation
 from models.depth_estimators.model_loader import list_depth_estimators
 from jobs.background import process_video_async
 from jobs.models import JobInfo, JobStatus
@@ -268,7 +268,7 @@ async def detect_endpoint(
     mode: str = Form(...),
     queries: str = Form(""),
     detector: str = Form("hf_yolov8"),
-    segmenter: str = Form("sam3"),
     enable_depth: bool = Form(False),
     enable_gpt: bool = Form(True),
 ):
@@ -280,7 +280,7 @@ async def detect_endpoint(
         mode: Detection mode (object_detection, segmentation, drone_detection)
         queries: Comma-separated object classes for object_detection mode
         detector: Model to use (hf_yolov8, detr_resnet50, grounding_dino)
-        segmenter: Segmentation model to use (sam3)
         enable_depth: Whether to run legacy depth estimation (default: False)
         drone_detection uses the dedicated drone_yolo model.
@@ -317,7 +317,7 @@ async def detect_endpoint(
             query_list = ["object"]
         try:
-            output_path = run_segmentation(
                 input_path,
                 output_path,
                 query_list,
@@ -421,7 +421,7 @@ async def detect_async_endpoint(
     mode: str = Form(...),
     queries: str = Form(""),
     detector: str = Form("hf_yolov8"),
-    segmenter: str = Form("sam3"),
     depth_estimator: str = Form("depth"),
     depth_scale: float = Form(25.0),
     enable_depth: bool = Form(False),

 from fastapi.staticfiles import StaticFiles
 import uvicorn
+from inference import process_first_frame, run_inference, run_grounded_sam2_tracking
 from models.depth_estimators.model_loader import list_depth_estimators
 from jobs.background import process_video_async
 from jobs.models import JobInfo, JobStatus
     mode: str = Form(...),
     queries: str = Form(""),
     detector: str = Form("hf_yolov8"),
+    segmenter: str = Form("gsam2_large"),
     enable_depth: bool = Form(False),
     enable_gpt: bool = Form(True),
 ):
         mode: Detection mode (object_detection, segmentation, drone_detection)
         queries: Comma-separated object classes for object_detection mode
         detector: Model to use (hf_yolov8, detr_resnet50, grounding_dino)
+        segmenter: Segmentation model to use (gsam2_small, gsam2_base, gsam2_large)
         enable_depth: Whether to run legacy depth estimation (default: False)
         drone_detection uses the dedicated drone_yolo model.
             query_list = ["object"]
         try:
+            output_path = run_grounded_sam2_tracking(
                 input_path,
                 output_path,
                 query_list,
     mode: str = Form(...),
     queries: str = Form(""),
     detector: str = Form("hf_yolov8"),
+    segmenter: str = Form("gsam2_large"),
     depth_estimator: str = Form("depth"),
     depth_scale: float = Form(25.0),
     enable_depth: bool = Form(False),

frontend/index.html CHANGED Viewed

@@ -75,7 +75,9 @@
                   <option value="grounding_dino" data-kind="object">Large</option>
                 </optgroup>
                 <optgroup label="Segmentation Models">
-                  <option value="sam3" data-kind="segmentation">Segmentor</option>
                 </optgroup>
                 <optgroup label="Drone Detection Models">
                   <option value="drone_yolo" data-kind="drone">Drone</option>
@@ -293,4 +295,4 @@
 </body>
-</html>

                   <option value="grounding_dino" data-kind="object">Large</option>
                 </optgroup>
                 <optgroup label="Segmentation Models">
+                  <option value="gsam2_large" data-kind="segmentation">SAM2 Large</option>
+                  <option value="gsam2_base" data-kind="segmentation">SAM2 Base+</option>
+                  <option value="gsam2_small" data-kind="segmentation">SAM2 Small</option>
                 </optgroup>
                 <optgroup label="Drone Detection Models">
                   <option value="drone_yolo" data-kind="drone">Drone</option>
 </body>
+</html>

frontend/js/LaserPerception_original.js CHANGED Viewed

@@ -701,7 +701,9 @@
         "hf_yolov8",
         "detr_resnet50",
         "grounding_dino",
-        "sam3",
         "drone_yolo",
     ]);
@@ -900,7 +902,7 @@
             form.append("detector", detector);
         }
         if (mode === "segmentation") {
-            form.append("segmenter", "sam3");
         }
         // drone_detection uses drone_yolo automatically

         "hf_yolov8",
         "detr_resnet50",
         "grounding_dino",
+        "gsam2_small",
+        "gsam2_base",
+        "gsam2_large",
         "drone_yolo",
     ]);
             form.append("detector", detector);
         }
         if (mode === "segmentation") {
+            form.append("segmenter", detector || "gsam2_large");
         }
         // drone_detection uses drone_yolo automatically

frontend/js/main.js CHANGED Viewed

@@ -339,16 +339,35 @@ document.addEventListener("DOMContentLoaded", () => {
         }
         try {
-            const mode = detectorSelect ? detectorSelect.value : "hf_yolov8";
             const queries = missionText ? missionText.value.trim() : "";
             const enableGPT = $("#enableGPTToggle")?.checked || false;
             const enableDepth = false; // depth mode disabled
             const form = new FormData();
             form.append("video", state.videoFile);
-            form.append("mode", "object_detection");
             if (queries) form.append("queries", queries);
-            form.append("detector", mode);
             form.append("enable_gpt", enableGPT ? "true" : "false");
             form.append("enable_depth", enableDepth ? "true" : "false");

         }
         try {
+            const selectedOption = detectorSelect ? detectorSelect.options[detectorSelect.selectedIndex] : null;
+            const selectedValue = detectorSelect ? detectorSelect.value : "hf_yolov8";
+            const kind = selectedOption ? selectedOption.getAttribute("data-kind") : "object";
             const queries = missionText ? missionText.value.trim() : "";
             const enableGPT = $("#enableGPTToggle")?.checked || false;
             const enableDepth = false; // depth mode disabled
+            // Determine mode and model parameter from data-kind attribute
+            let mode, detectorParam, segmenterParam;
+            if (kind === "segmentation") {
+                mode = "segmentation";
+                segmenterParam = selectedValue;
+                detectorParam = "hf_yolov8"; // default, unused for segmentation
+            } else if (kind === "drone") {
+                mode = "drone_detection";
+                detectorParam = selectedValue;
+                segmenterParam = "gsam2_large";
+            } else {
+                mode = "object_detection";
+                detectorParam = selectedValue;
+                segmenterParam = "gsam2_large";
+            }
             const form = new FormData();
             form.append("video", state.videoFile);
+            form.append("mode", mode);
             if (queries) form.append("queries", queries);
+            form.append("detector", detectorParam);
+            form.append("segmenter", segmenterParam);
             form.append("enable_gpt", enableGPT ? "true" : "false");
             form.append("enable_depth", enableDepth ? "true" : "false");

inference.py CHANGED Viewed

@@ -1380,7 +1380,7 @@ def run_segmentation(
     if max_frames is not None:
         total_frames = min(total_frames, max_frames)
-    active_segmenter = segmenter_name or "sam3"
     logging.info("Using segmenter: %s with queries: %s", active_segmenter, queries)
     # 2. Load Segmenters (Parallel)
@@ -1586,6 +1586,134 @@ def run_segmentation(
 def run_depth_inference(
     input_video_path: str,
     output_video_path: str,

     if max_frames is not None:
         total_frames = min(total_frames, max_frames)
+    active_segmenter = segmenter_name or "gsam2_large"
     logging.info("Using segmenter: %s with queries: %s", active_segmenter, queries)
     # 2. Load Segmenters (Parallel)
+def run_grounded_sam2_tracking(
+    input_video_path: str,
+    output_video_path: str,
+    queries: List[str],
+    max_frames: Optional[int] = None,
+    segmenter_name: Optional[str] = None,
+    job_id: Optional[str] = None,
+    stream_queue: Optional[Queue] = None,
+    step: int = 20,
+) -> str:
+    """Run the Grounded-SAM-2 video tracking pipeline.
+    Unlike per-frame segmentation, this extracts the frames to a temporary
+    JPEG directory, asks the segmenter's ``process_video`` for per-frame
+    tracking results, then renders masks and boxes back into a video.
+    Args:
+        input_video_path: Path of the video to process.
+        output_video_path: Path the annotated video is written to.
+        queries: Text queries forwarded to the segmenter.
+        max_frames: Optional cap forwarded to frame extraction.
+        segmenter_name: Name passed to ``load_segmenter``; defaults to
+            "gsam2_large" when falsy.
+        job_id: Optional job id used for cancellation checks and streaming.
+        stream_queue: When truthy, every rendered frame is also streamed.
+        step: Keyframe interval. NOTE(review): this value is only logged
+            here; the loaded segmenter is not told about it -- confirm
+            whether it should be forwarded.
+    Returns:
+        ``output_video_path``.
+    """
+    import os as _os
+    import shutil
+    from utils.video import extract_frames_to_jpeg_dir
+    from models.segmenters.model_loader import load_segmenter as _load_seg
+    active_segmenter = segmenter_name or "gsam2_large"
+    logging.info(
+        "Grounded-SAM-2 tracking: segmenter=%s, queries=%s, step=%d",
+        active_segmenter, queries, step,
+    )
+    # 1. Extract frames to JPEG directory
+    frame_dir = tempfile.mkdtemp(prefix="gsam2_frames_")
+    try:
+        frame_names, fps, width, height = extract_frames_to_jpeg_dir(
+            input_video_path, frame_dir, max_frames=max_frames,
+        )
+        total_frames = len(frame_names)
+        logging.info("Extracted %d frames to %s", total_frames, frame_dir)
+        # 2. Load segmenter
+        segmenter = _load_seg(active_segmenter)
+        # 3. Run tracking pipeline
+        _check_cancellation(job_id)
+        tracking_results = segmenter.process_video(frame_dir, frame_names, queries)
+        # 4. Render results into output video
+        _check_cancellation(job_id)
+        with StreamingVideoWriter(output_video_path, fps, width, height) as writer:
+            for frame_idx, frame_name in enumerate(frame_names):
+                _check_cancellation(job_id)
+                # Read original frame
+                frame = cv2.imread(_os.path.join(frame_dir, frame_name))
+                if frame is None:
+                    logging.warning("Failed to read frame %d, skipping", frame_idx)
+                    continue
+                frame_objects = tracking_results.get(frame_idx, {})
+                # Labels are collected separately for masks and for boxes: an
+                # object may contribute only one of the two, and each label
+                # list must stay index-aligned with the array it annotates.
+                masks_list = []
+                mask_labels = []
+                boxes_list = []
+                box_labels = []
+                for obj_info in frame_objects.values():
+                    label = f"{obj_info.instance_id} {obj_info.class_name}"
+                    mask = obj_info.mask
+                    if mask is not None:
+                        if isinstance(mask, torch.Tensor):
+                            mask_np = mask.cpu().numpy().astype(bool)
+                        else:
+                            mask_np = np.asarray(mask).astype(bool)
+                        # Resize mask if it does not match the output frame size
+                        if mask_np.shape[:2] != (height, width):
+                            mask_np = cv2.resize(
+                                mask_np.astype(np.uint8),
+                                (width, height),
+                                interpolation=cv2.INTER_NEAREST,
+                            ).astype(bool)
+                        masks_list.append(mask_np)
+                        mask_labels.append(label)
+                    # An all-zero box means no box was derived for this object
+                    if obj_info.x1 or obj_info.y1 or obj_info.x2 or obj_info.y2:
+                        boxes_list.append([obj_info.x1, obj_info.y1, obj_info.x2, obj_info.y2])
+                        box_labels.append(label)
+                # Draw masks
+                if masks_list:
+                    frame = draw_masks(frame, np.stack(masks_list), labels=mask_labels)
+                # Draw boxes
+                if boxes_list:
+                    frame = draw_boxes(frame, np.array(boxes_list), label_names=box_labels)
+                writer.write(frame)
+                # Stream frame if requested
+                if stream_queue:
+                    try:
+                        from jobs.streaming import publish_frame as _pub
+                        if job_id:
+                            _pub(job_id, frame)
+                        else:
+                            stream_queue.put(frame, timeout=0.01)
+                    except Exception:
+                        # Streaming is best-effort: a dropped preview frame must
+                        # not abort rendering, but it should leave a trace.
+                        logging.debug(
+                            "Failed to stream frame %d", frame_idx, exc_info=True
+                        )
+                if frame_idx % 30 == 0:
+                    logging.info(
+                        "Rendered frame %d / %d", frame_idx, total_frames
+                    )
+        logging.info("Grounded-SAM-2 output written to: %s", output_video_path)
+        return output_video_path
+    finally:
+        # Cleanup temp frame directory, also on failure or cancellation
+        try:
+            shutil.rmtree(frame_dir)
+            logging.info("Cleaned up temp frame dir: %s", frame_dir)
+        except Exception:
+            logging.warning("Failed to clean up temp frame dir: %s", frame_dir)
 def run_depth_inference(
     input_video_path: str,
     output_video_path: str,

jobs/background.py CHANGED Viewed

@@ -7,7 +7,7 @@ import torch
 from jobs.models import JobStatus
 from jobs.storage import get_job_storage, get_depth_output_path, get_first_frame_depth_path
 from jobs.streaming import create_stream, remove_stream
-from inference import run_inference, run_segmentation, run_depth_inference
 async def process_video_async(job_id: str) -> None:
@@ -28,7 +28,7 @@ async def process_video_async(job_id: str) -> None:
         # Run detection or segmentation first
         if job.mode == "segmentation":
             detection_path = await asyncio.to_thread(
-                run_segmentation,
                 job.input_video_path,
                 job.output_video_path,
                 job.queries,

 from jobs.models import JobStatus
 from jobs.storage import get_job_storage, get_depth_output_path, get_first_frame_depth_path
 from jobs.streaming import create_stream, remove_stream
+from inference import run_inference, run_grounded_sam2_tracking, run_depth_inference
 async def process_video_async(job_id: str) -> None:
         # Run detection or segmentation first
         if job.mode == "segmentation":
             detection_path = await asyncio.to_thread(
+                run_grounded_sam2_tracking,
                 job.input_video_path,
                 job.output_video_path,
                 job.queries,

models/segmenters/__init__.py CHANGED Viewed

@@ -1,10 +1,10 @@
 from .base import Segmenter, SegmentationResult
 from .model_loader import load_segmenter
-from .sam3 import SAM3Segmenter
 __all__ = [
     "Segmenter",
     "SegmentationResult",
     "load_segmenter",
-    "SAM3Segmenter",
 ]

 from .base import Segmenter, SegmentationResult
 from .model_loader import load_segmenter
+from .grounded_sam2 import GroundedSAM2Segmenter
 __all__ = [
     "Segmenter",
     "SegmentationResult",
     "load_segmenter",
+    "GroundedSAM2Segmenter",
 ]

models/segmenters/grounded_sam2.py ADDED Viewed

	@@ -0,0 +1,452 @@

+"""Grounded-SAM-2 segmenter with continuous-ID video tracking.
+Combines Grounding DINO (open-vocabulary detection) with SAM2's video
+predictor to produce temporally consistent segmentation masks with
+persistent object IDs across an entire video.
+Reference implementation:
+    Grounded-SAM-2/grounded_sam2_tracking_demo_with_continuous_id.py
+"""
+import copy
+import logging
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional, Sequence, Tuple
+import numpy as np
+import torch
+from PIL import Image
+from .base import Segmenter, SegmentationResult
+# ---------------------------------------------------------------------------
+# Data structures (mirrors Grounded-SAM-2 reference utilities)
+# ---------------------------------------------------------------------------
+@dataclass
+class ObjectInfo:
+    """One tracked object in one frame: id, mask, class name and box."""
+    instance_id: int = 0
+    mask: Any = None  # torch.Tensor bool (H, W)
+    class_name: str = ""
+    x1: int = 0
+    y1: int = 0
+    x2: int = 0
+    y2: int = 0
+    def update_box(self):
+        """Set x1/y1/x2/y2 to the extent of the non-zero mask pixels.
+        The box is left untouched when there is no mask or the mask is empty.
+        """
+        current_mask = self.mask
+        if current_mask is None:
+            return
+        coords = torch.nonzero(current_mask)
+        if coords.size(0) == 0:
+            return
+        # Each row of ``coords`` is a (row, col) index, i.e. (y, x).
+        y_low, x_low = coords.min(dim=0).values
+        y_high, x_high = coords.max(dim=0).values
+        self.x1, self.y1 = x_low.item(), y_low.item()
+        self.x2, self.y2 = x_high.item(), y_high.item()
+@dataclass
+class MaskDictionary:
+    """Object masks for one frame, with IoU-based ID matching across frames.
+    ``labels`` maps an instance id to its ``ObjectInfo``; ``mask_height`` and
+    ``mask_width`` record the mask size seen by ``add_new_frame_annotation``.
+    """
+    mask_height: int = 0
+    mask_width: int = 0
+    labels: Dict[int, ObjectInfo] = field(default_factory=dict)
+    def add_new_frame_annotation(
+        self,
+        mask_list: torch.Tensor,
+        box_list: torch.Tensor,
+        label_list: list,
+    ):
+        """Replace ``labels`` with one entry per (mask, box, label) triple.
+        Entries get provisional ids 1..N in input order; ``zip`` stops at the
+        shortest of the three inputs.
+        """
+        # Only the mask size is needed here, so no per-pixel id image is
+        # built; that would cost a full pass per object and would require
+        # the masks and the image to live on the same device.
+        self.mask_height = int(mask_list.shape[-2])
+        self.mask_width = int(mask_list.shape[-1])
+        self.labels = {
+            final_index: ObjectInfo(
+                instance_id=final_index,
+                mask=mask,
+                class_name=str(label),
+                x1=int(box[0]),
+                y1=int(box[1]),
+                x2=int(box[2]),
+                y2=int(box[3]),
+            )
+            for final_index, (mask, box, label) in enumerate(
+                zip(mask_list, box_list, label_list), start=1
+            )
+        }
+    def update_masks(
+        self,
+        tracking_dict: "MaskDictionary",
+        iou_threshold: float = 0.8,
+        objects_count: int = 0,
+    ) -> int:
+        """Re-key ``labels`` so objects already being tracked keep their id.
+        Each entry with a non-empty mask takes the ``instance_id`` of the first
+        entry in ``tracking_dict`` whose mask IoU exceeds ``iou_threshold``;
+        otherwise it gets a fresh id (``objects_count`` is incremented).
+        Entries with no mask or an empty mask are dropped.
+        Returns:
+            The updated ``objects_count``.
+        """
+        updated = {}
+        for seg_info in self.labels.values():
+            if seg_info.mask is None or seg_info.mask.sum() == 0:
+                continue
+            matched_id = 0
+            for obj_info in tracking_dict.labels.values():
+                # A tracked entry without a mask cannot be compared.
+                if obj_info.mask is None:
+                    continue
+                if self._iou(seg_info.mask, obj_info.mask) > iou_threshold:
+                    matched_id = obj_info.instance_id
+                    break
+            if not matched_id:
+                objects_count += 1
+                matched_id = objects_count
+            # Keep the detection box alongside the mask instead of resetting it.
+            updated[matched_id] = ObjectInfo(
+                instance_id=matched_id,
+                mask=seg_info.mask,
+                class_name=seg_info.class_name,
+                x1=seg_info.x1,
+                y1=seg_info.y1,
+                x2=seg_info.x2,
+                y2=seg_info.y2,
+            )
+        self.labels = updated
+        return objects_count
+    def get_target_class_name(self, instance_id: int) -> str:
+        """Return the class name stored for ``instance_id``, or "" if unknown."""
+        info = self.labels.get(instance_id)
+        return info.class_name if info else ""
+    @staticmethod
+    def _iou(m1: torch.Tensor, m2: torch.Tensor) -> float:
+        """Intersection-over-union of two masks; 0.0 when both are empty."""
+        m1f = m1.to(torch.float32)
+        m2f = m2.to(torch.float32)
+        inter = (m1f * m2f).sum()
+        union = m1f.sum() + m2f.sum() - inter
+        if union == 0:
+            return 0.0
+        return float(inter / union)
+# ---------------------------------------------------------------------------
+# SAM2 HuggingFace model IDs per size
+# ---------------------------------------------------------------------------
+# Keys are the ``model_size`` values accepted by GroundedSAM2Segmenter; the
+# looked-up ID is handed to the sam2 ``*_hf`` builder functions.
+_SAM2_HF_MODELS = {
+    "small": "facebook/sam2.1-hiera-small",
+    "base": "facebook/sam2.1-hiera-base-plus",
+    "large": "facebook/sam2.1-hiera-large",
+}
+# ---------------------------------------------------------------------------
+# Grounded-SAM-2 Segmenter
+# ---------------------------------------------------------------------------
+class GroundedSAM2Segmenter(Segmenter):
+    """SAM2 video segmenter driven by Grounding DINO detections.
+    For single-frame mode (``predict``), uses GDINO + SAM2 image predictor.
+    For video mode (``process_video``), uses GDINO on keyframes + SAM2 video
+    predictor for temporal mask propagation with continuous object IDs.
+    """
+    # NOTE(review): presumably consulted by batching code elsewhere in the
+    # project -- confirm against the callers of Segmenter.
+    supports_batch = False
+    max_batch_size = 1
+    def __init__(
+        self,
+        model_size: str = "large",
+        device: Optional[str] = None,
+        step: int = 20,
+        iou_threshold: float = 0.8,
+    ):
+        """Store configuration; no model is loaded here.
+        Args:
+            model_size: Key into ``_SAM2_HF_MODELS`` ("small", "base", "large").
+            device: Torch device string; when falsy, "cuda" is used if
+                available, otherwise "cpu".
+            step: Keyframe interval, in frames, used by ``process_video``.
+            iou_threshold: IoU above which a keyframe detection reuses the ID
+                of an already tracked object.
+        """
+        self.model_size = model_size
+        self.step = step
+        self.iou_threshold = iou_threshold
+        self.name = f"gsam2_{model_size}"
+        if device:
+            self.device = device
+        else:
+            self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        # Lazy-loaded model handles
+        self._video_predictor = None
+        self._image_predictor = None
+        self._gdino_detector = None
+        self._models_loaded = False
+    # -- Lazy loading -------------------------------------------------------
+    def _ensure_models_loaded(self):
+        """Build the SAM2 predictors and the Grounding DINO detector once.
+        Later calls return immediately. An unknown ``model_size`` surfaces
+        here as a ``KeyError`` from the ``_SAM2_HF_MODELS`` lookup.
+        """
+        if self._models_loaded:
+            return
+        hf_id = _SAM2_HF_MODELS[self.model_size]
+        logging.info(
+            "Loading Grounded-SAM-2 (%s) on device %s ...", hf_id, self.device
+        )
+        # Enable TF32 on Ampere+ GPUs
+        if torch.cuda.is_available():
+            try:
+                # Device index comes from a "cuda:N" string; index 0 otherwise.
+                props = torch.cuda.get_device_properties(
+                    int(self.device.split(":")[-1]) if ":" in self.device else 0
+                )
+                if props.major >= 8:
+                    torch.backends.cuda.matmul.allow_tf32 = True
+                    torch.backends.cudnn.allow_tf32 = True
+            except Exception:
+                # TF32 is an optional speed-up; any failure here is ignored.
+                pass
+        # Imported lazily so the module can be imported without sam2 installed.
+        from sam2.build_sam import build_sam2_hf, build_sam2_video_predictor_hf
+        from sam2.sam2_image_predictor import SAM2ImagePredictor
+        # Video predictor (for process_video)
+        self._video_predictor = build_sam2_video_predictor_hf(
+            hf_id, device=self.device
+        )
+        # Image predictor (for single-frame predict)
+        sam2_image_model = build_sam2_hf(hf_id, device=self.device)
+        self._image_predictor = SAM2ImagePredictor(sam2_image_model)
+        # Reuse existing Grounding DINO detector from our codebase
+        from models.detectors.grounding_dino import GroundingDinoDetector
+        self._gdino_detector = GroundingDinoDetector(device=self.device)
+        self._models_loaded = True
+        logging.info("Grounded-SAM-2 models loaded successfully.")
+    # -- Single-frame interface (Segmenter.predict) -------------------------
+    def predict(
+        self, frame: np.ndarray, text_prompts: Optional[list] = None
+    ) -> SegmentationResult:
+        """Run GDINO + SAM2 image predictor on a single frame.
+        Args:
+            frame: Image array; assumed to be BGR since it is converted with
+                ``COLOR_BGR2RGB`` before segmentation -- TODO confirm.
+            text_prompts: Queries for the detector; ``["object"]`` when falsy.
+        Returns:
+            A ``SegmentationResult`` with boolean masks, flattened scores and
+            the detector boxes; an empty (0, H, W) mask array with ``None``
+            scores and boxes when nothing is detected.
+        """
+        self._ensure_models_loaded()
+        prompts = text_prompts or ["object"]
+        # Run Grounding DINO to get boxes
+        det = self._gdino_detector.predict(frame, prompts)
+        if det.boxes is None or len(det.boxes) == 0:
+            return SegmentationResult(
+                masks=np.zeros((0, frame.shape[0], frame.shape[1]), dtype=bool),
+                scores=None,
+                boxes=None,
+            )
+        # SAM2 image predictor expects RGB
+        import cv2 as _cv2
+        frame_rgb = _cv2.cvtColor(frame, _cv2.COLOR_BGR2RGB)
+        # Autocast wants the bare device type, so any ":N" suffix is stripped.
+        with torch.autocast(device_type=self.device.split(":")[0], dtype=torch.bfloat16):
+            self._image_predictor.set_image(frame_rgb)
+            input_boxes = torch.tensor(det.boxes, device=self.device, dtype=torch.float32)
+            masks, scores, _ = self._image_predictor.predict(
+                point_coords=None,
+                point_labels=None,
+                box=input_boxes,
+                multimask_output=False,
+            )
+        # Normalize mask shape to (N, H, W)
+        if masks.ndim == 2:
+            masks = masks[None]
+        elif masks.ndim == 4:
+            masks = masks.squeeze(1)
+        if isinstance(masks, torch.Tensor):
+            masks_np = masks.cpu().numpy().astype(bool)
+        else:
+            masks_np = np.asarray(masks).astype(bool)
+        scores_np = None
+        if scores is not None:
+            if isinstance(scores, torch.Tensor):
+                scores_np = scores.cpu().numpy().flatten()
+            else:
+                scores_np = np.asarray(scores).flatten()
+        return SegmentationResult(
+            masks=masks_np,
+            scores=scores_np,
+            boxes=det.boxes,
+        )
+    # -- Video-level tracking interface -------------------------------------
+    def process_video(
+        self,
+        frame_dir: str,
+        frame_names: List[str],
+        text_prompts: List[str],
+    ) -> Dict[int, Dict[int, ObjectInfo]]:
+        """Run full Grounded-SAM-2 tracking pipeline on extracted JPEG frames.
+        Every ``self.step`` frames a keyframe is detected with Grounding DINO,
+        segmented with the SAM2 image predictor, matched by IoU against the
+        most recently propagated masks, and then propagated with the SAM2
+        video predictor.
+        Args:
+            frame_dir: Directory containing JPEG frames.
+            frame_names: Sorted list of frame filenames.
+            text_prompts: Text queries for Grounding DINO.
+        Returns:
+            Dict mapping frame_idx -> {obj_id: ObjectInfo} with masks,
+            bboxes, and class names for every frame.
+        """
+        import os
+        self._ensure_models_loaded()
+        device = self.device
+        step = self.step
+        # NOTE(review): relies on a private helper of the detector.
+        prompt = self._gdino_detector._build_prompt(text_prompts)
+        # HF processor for Grounding DINO (reuse from our detector)
+        gdino_processor = self._gdino_detector.processor
+        gdino_model = self._gdino_detector.model
+        total_frames = len(frame_names)
+        logging.info(
+            "Grounded-SAM-2 tracking: %d frames, step=%d, queries=%s",
+            total_frames, step, text_prompts,
+        )
+        # Init SAM2 video predictor state
+        with torch.autocast(device_type=device.split(":")[0], dtype=torch.bfloat16):
+            inference_state = self._video_predictor.init_state(
+                video_path=frame_dir,
+                offload_video_to_cpu=True,
+                async_loading_frames=True,
+            )
+        # Masks of the most recently propagated frame, used for ID matching.
+        sam2_masks = MaskDictionary()
+        # Running count of distinct object IDs handed out so far.
+        objects_count = 0
+        all_results: Dict[int, Dict[int, ObjectInfo]] = {}
+        for start_idx in range(0, total_frames, step):
+            logging.info("Processing keyframe %d / %d", start_idx, total_frames)
+            img_path = os.path.join(frame_dir, frame_names[start_idx])
+            image = Image.open(img_path).convert("RGB")
+            mask_dict = MaskDictionary()
+            # -- Grounding DINO detection on keyframe --
+            inputs = gdino_processor(
+                images=image, text=prompt, return_tensors="pt"
+            )
+            inputs = {k: v.to(device) for k, v in inputs.items()}
+            with torch.no_grad():
+                outputs = gdino_model(**inputs)
+            results = gdino_processor.post_process_grounded_object_detection(
+                outputs,
+                inputs["input_ids"],
+                threshold=0.25,
+                text_threshold=0.25,
+                # PIL reports (width, height); reversed to (height, width).
+                target_sizes=[image.size[::-1]],
+            )
+            input_boxes = results[0]["boxes"]
+            # Prefer "text_labels"; fall back to "labels" when absent or empty.
+            det_labels = results[0].get("text_labels") or results[0].get("labels", [])
+            if torch.is_tensor(det_labels):
+                det_labels = det_labels.detach().cpu().tolist()
+            det_labels = [str(l) for l in det_labels]
+            if input_boxes.shape[0] == 0:
+                logging.info("No detections on keyframe %d, propagating previous masks", start_idx)
+                # Fill empty results for this segment
+                for fi in range(start_idx, min(start_idx + step, total_frames)):
+                    # Frames already filled by an earlier propagation are kept.
+                    if fi not in all_results:
+                        # Carry forward last known masks
+                        all_results[fi] = {
+                            k: ObjectInfo(
+                                instance_id=v.instance_id,
+                                mask=v.mask,
+                                class_name=v.class_name,
+                                x1=v.x1, y1=v.y1, x2=v.x2, y2=v.y2,
+                            )
+                            for k, v in sam2_masks.labels.items()
+                        } if sam2_masks.labels else {}
+                continue
+            # -- SAM2 image predictor on keyframe --
+            with torch.autocast(device_type=device.split(":")[0], dtype=torch.bfloat16):
+                self._image_predictor.set_image(np.array(image))
+                masks, scores, logits = self._image_predictor.predict(
+                    point_coords=None,
+                    point_labels=None,
+                    box=input_boxes,
+                    multimask_output=False,
+                )
+            # Normalize mask dims
+            if masks.ndim == 2:
+                masks = masks[None]
+                scores = scores[None]
+                logits = logits[None]
+            elif masks.ndim == 4:
+                masks = masks.squeeze(1)
+            mask_dict.add_new_frame_annotation(
+                mask_list=torch.tensor(masks).to(device),
+                box_list=torch.tensor(input_boxes.cpu().numpy() if torch.is_tensor(input_boxes) else input_boxes),
+                label_list=det_labels,
+            )
+            # -- IoU matching to maintain persistent IDs --
+            objects_count = mask_dict.update_masks(
+                tracking_dict=sam2_masks,
+                iou_threshold=self.iou_threshold,
+                objects_count=objects_count,
+            )
+            # update_masks drops empty masks, so nothing may be left to track.
+            if len(mask_dict.labels) == 0:
+                for fi in range(start_idx, min(start_idx + step, total_frames)):
+                    all_results[fi] = {}
+                continue
+            # -- SAM2 video predictor: propagate masks --
+            with torch.autocast(device_type=device.split(":")[0], dtype=torch.bfloat16):
+                # Each segment starts from a clean predictor state and is
+                # seeded with the keyframe masks under their persistent IDs.
+                self._video_predictor.reset_state(inference_state)
+                for obj_id, obj_info in mask_dict.labels.items():
+                    self._video_predictor.add_new_mask(
+                        inference_state,
+                        start_idx,
+                        obj_id,
+                        obj_info.mask,
+                    )
+                for out_frame_idx, out_obj_ids, out_mask_logits in self._video_predictor.propagate_in_video(
+                    inference_state,
+                    max_frame_num_to_track=step,
+                    start_frame_idx=start_idx,
+                ):
+                    frame_objects: Dict[int, ObjectInfo] = {}
+                    for i, out_obj_id in enumerate(out_obj_ids):
+                        # Positive logits count as foreground.
+                        out_mask = (out_mask_logits[i] > 0.0)
+                        info = ObjectInfo(
+                            instance_id=out_obj_id,
+                            # Index 0 drops the leading dimension of the
+                            # per-object mask -- assumed (1, H, W), TODO confirm.
+                            mask=out_mask[0],
+                            class_name=mask_dict.get_target_class_name(out_obj_id),
+                        )
+                        info.update_box()
+                        frame_objects[out_obj_id] = info
+                    # Overwrites any earlier entry for the same frame index.
+                    all_results[out_frame_idx] = frame_objects
+                    # Keep latest frame masks for next segment's IoU matching
+                    sam2_masks = MaskDictionary()
+                    sam2_masks.labels = copy.deepcopy(frame_objects)
+                    if frame_objects:
+                        first_info = next(iter(frame_objects.values()))
+                        if first_info.mask is not None:
+                            sam2_masks.mask_height = first_info.mask.shape[-2] if first_info.mask.ndim >= 2 else 0
+                            sam2_masks.mask_width = first_info.mask.shape[-1] if first_info.mask.ndim >= 2 else 0
+        logging.info(
+            "Grounded-SAM-2 tracking complete: %d frames, %d tracked objects",
+            len(all_results), objects_count,
+        )
+        return all_results

models/segmenters/model_loader.py CHANGED Viewed

@@ -3,12 +3,14 @@ from functools import lru_cache
 from typing import Callable, Dict, Optional
 from .base import Segmenter
-from .sam3 import SAM3Segmenter
-DEFAULT_SEGMENTER = "sam3"
-_REGISTRY: Dict[str, Callable[[], Segmenter]] = {
-    "sam3": SAM3Segmenter,
 }
@@ -35,7 +37,7 @@ def load_segmenter(name: Optional[str] = None) -> Segmenter:
     Load a segmenter by name.
     Args:
-        name: Segmenter name (default: sam3)
     Returns:
         Cached segmenter instance
@@ -46,7 +48,4 @@ def load_segmenter(name: Optional[str] = None) -> Segmenter:
 def load_segmenter_on_device(name: str, device: str) -> Segmenter:
     """Create a new segmenter instance on the specified device (no caching)."""
-    # bypass cache by calling private creator directly
-    # Note: _create_segmenter calls factory() which needs to accept device now.
-    # We need to update _create_segmenter to pass kwargs too.
     return _create_segmenter(name, device=device)

 from typing import Callable, Dict, Optional
 from .base import Segmenter
+from .grounded_sam2 import GroundedSAM2Segmenter
+DEFAULT_SEGMENTER = "gsam2_large"
+_REGISTRY: Dict[str, Callable[..., Segmenter]] = {
+    "gsam2_small": lambda **kw: GroundedSAM2Segmenter(model_size="small", **kw),
+    "gsam2_base": lambda **kw: GroundedSAM2Segmenter(model_size="base", **kw),
+    "gsam2_large": lambda **kw: GroundedSAM2Segmenter(model_size="large", **kw),
 }
     Load a segmenter by name.
     Args:
+        name: Segmenter name (default: gsam2_large)
     Returns:
         Cached segmenter instance
 def load_segmenter_on_device(name: str, device: str) -> Segmenter:
     """Create a new segmenter instance on the specified device (no caching)."""
     return _create_segmenter(name, device=device)

models/segmenters/sam3.py DELETED Viewed

@@ -1,284 +0,0 @@
-import logging
-from typing import Optional, Sequence
-import numpy as np
-import torch
-from PIL import Image
-from transformers import Sam3Model, Sam3Processor
-from .base import Segmenter, SegmentationResult
-class SAM3Segmenter(Segmenter):
-    """
-    SAM3 (Segment Anything Model 3) segmenter.
-    Performs automatic instance segmentation on images without prompts.
-    Uses facebook/sam3 model from HuggingFace.
-    """
-    name = "sam3"
-    def __init__(
-        self,
-        model_id: str = "facebook/sam3",
-        device: Optional[str] = None,
-        threshold: float = 0.5,
-        mask_threshold: float = 0.5,
-    ):
-        """
-        Initialize SAM3 segmenter.
-        Args:
-            model_id: HuggingFace model ID
-            device: Device to run on (cuda/cpu), auto-detected if None
-            threshold: Confidence threshold for filtering instances
-            mask_threshold: Threshold for binarizing masks
-        """
-        self.device = device or (
-            "cuda" if torch.cuda.is_available() else "cpu"
-        )
-        self.threshold = threshold
-        self.mask_threshold = mask_threshold
-        logging.info(
-            "Loading SAM3 model %s on device %s", model_id, self.device
-        )
-        try:
-            self.model = Sam3Model.from_pretrained(model_id).to(self.device)
-            self.processor = Sam3Processor.from_pretrained(model_id)
-            self.model.eval()
-        except Exception:
-            logging.exception("Failed to load SAM3 model")
-            raise
-        logging.info("SAM3 model loaded successfully")
-    supports_batch = True
-    max_batch_size = 8
-    def _parse_single_result(self, results, frame_shape) -> SegmentationResult:
-        # Extract results
-        masks = results.get("masks", [])
-        scores = results.get("scores", None)
-        boxes = results.get("boxes", None)
-        # Convert to numpy arrays
-        if len(masks) > 0:
-            # Stack masks: list of (H, W) -> (N, H, W)
-            masks_array = np.stack([m.cpu().numpy() for m in masks])
-        else:
-            # No objects detected
-            masks_array = np.zeros(
-                (0, frame_shape[0], frame_shape[1]), dtype=bool
-            )
-        scores_array = (
-            scores.cpu().numpy() if scores is not None else None
-        )
-        boxes_array = (
-            boxes.cpu().numpy() if boxes is not None else None
-        )
-        return SegmentationResult(
-            masks=masks_array,
-            scores=scores_array,
-            boxes=boxes_array,
-        )
-    def _expand_inputs_if_needed(self, inputs):
-        """
-        Helper to expand vision inputs (pixel_values or vision_embeds) to match text prompts.
-        Handles:
-        1. 1 image, N texts (Expand 1 -> N)
-        2. N images, N*M texts (Expand N -> N*M)
-        """
-        pixel_values = inputs.get("pixel_values")
-        input_ids = inputs.get("input_ids")
-        if (
-            pixel_values is not None
-            and input_ids is not None
-        ):
-            img_batch = pixel_values.shape[0]
-            text_batch = input_ids.shape[0]
-            should_expand = False
-            expansion_factor = 1
-            if img_batch == 1 and text_batch > 1:
-                should_expand = True
-                expansion_factor = text_batch
-            elif img_batch > 1 and text_batch > img_batch and text_batch % img_batch == 0:
-                should_expand = True
-                expansion_factor = text_batch // img_batch
-            if should_expand:
-                logging.debug(f"Expanding SAM3 vision inputs from {img_batch} to {text_batch} (factor {expansion_factor}) using embeddings reuse.")
-                # 1. Compute vision embeddings once for original images
-                with torch.no_grad():
-                    vision_outputs = self.model.get_vision_features(
-                        pixel_values=pixel_values
-                    )
-                # Iterate over keys to expand
-                keys_to_expand = list(vision_outputs.keys())
-                for key in keys_to_expand:
-                    value = getattr(vision_outputs, key, None)
-                    if value is None:
-                        # Try getItem
-                        try:
-                            value = vision_outputs[key]
-                        except:
-                            continue
-                    new_value = None
-                    if isinstance(value, torch.Tensor):
-                        # Ensure we only expand the batch dimension (dim 0)
-                        if value.shape[0] == img_batch:
-                             new_value = value.repeat_interleave(expansion_factor, dim=0)
-                    elif isinstance(value, (list, tuple)):
-                        new_list = []
-                        valid_expansion = False
-                        for i, v in enumerate(value):
-                            if isinstance(v, torch.Tensor) and v.shape[0] == img_batch:
-                                new_list.append(v.repeat_interleave(expansion_factor, dim=0))
-                                valid_expansion = True
-                            else:
-                                new_list.append(v)
-                        if valid_expansion:
-                            # Preserve type
-                            new_value = type(value)(new_list)
-                    if new_value is not None:
-                         # Update dict item if possible
-                         try:
-                            vision_outputs[key] = new_value
-                         except:
-                            pass
-                         # Update attribute explicitly if it exists
-                         if hasattr(vision_outputs, key):
-                             setattr(vision_outputs, key, new_value)
-                # 3. Update inputs for model call
-                inputs["vision_embeds"] = vision_outputs
-                del inputs["pixel_values"] # Mutually exclusive with vision_embeds
-                # 4. Expand other metadata
-                if "original_sizes" in inputs and inputs["original_sizes"].shape[0] == img_batch:
-                    inputs["original_sizes"] = inputs["original_sizes"].repeat_interleave(expansion_factor, dim=0)
-                if "reshape_input_sizes" in inputs and inputs["reshape_input_sizes"].shape[0] == img_batch:
-                    inputs["reshape_input_sizes"] = inputs["reshape_input_sizes"].repeat_interleave(expansion_factor, dim=0)
-    def predict(self, frame: np.ndarray, text_prompts: Optional[list] = None) -> SegmentationResult:
-        """
-        Run SAM3 segmentation on a frame.
-        Args:
-            frame: Input image (HxWx3 numpy array in RGB)
-            text_prompts: List of text prompts for segmentation
-        Returns:
-            SegmentationResult with instance masks
-        """
-        # Convert numpy array to PIL Image
-        if frame.dtype == np.uint8:
-            pil_image = Image.fromarray(frame)
-        else:
-            # Normalize to 0-255 if needed
-            frame_uint8 = (frame * 255).astype(np.uint8)
-            pil_image = Image.fromarray(frame_uint8)
-        # Use default prompts if none provided
-        if not text_prompts:
-            text_prompts = ["object"]
-        # Process image with text prompts
-        inputs = self.processor(
-            images=pil_image, text=text_prompts, return_tensors="pt"
-        ).to(self.device)
-        # Handle batch expansion
-        self._expand_inputs_if_needed(inputs)
-        # Run inference
-        try:
-            if "pixel_values" in inputs:
-                logging.debug(f"SAM3 Input pixel_values shape: {inputs['pixel_values'].shape}")
-            with torch.no_grad():
-                outputs = self.model(**inputs)
-        except RuntimeError as e:
-            logging.error(f"RuntimeError during SAM3 inference: {e}")
-            logging.error(f"Input keys: {inputs.keys()}")
-            if 'pixel_values' in inputs:
-                logging.error(f"Pixel values shape: {inputs['pixel_values'].shape}")
-            # Re-raise to let user know
-            raise
-        # Post-process to get instance masks
-        try:
-            results = self.processor.post_process_instance_segmentation(
-                outputs,
-                threshold=self.threshold,
-                mask_threshold=self.mask_threshold,
-                target_sizes=inputs.get("original_sizes").tolist(),
-            )[0]
-            return self._parse_single_result(results, frame.shape)
-        except Exception:
-            logging.exception("SAM3 post-processing failed")
-            # Return empty result
-            return SegmentationResult(
-                masks=np.zeros((0, frame.shape[0], frame.shape[1]), dtype=bool),
-                scores=None,
-                boxes=None,
-            )
-    def predict_batch(self, frames: Sequence[np.ndarray], text_prompts: Optional[list] = None) -> Sequence[SegmentationResult]:
-        pil_images = []
-        for f in frames:
-            if f.dtype == np.uint8:
-                pil_images.append(Image.fromarray(f))
-            else:
-                f_uint8 = (f * 255).astype(np.uint8)
-                pil_images.append(Image.fromarray(f_uint8))
-        prompts = text_prompts or ["object"]
-        # Flatten prompts for all images: [img1_p1, img1_p2, img2_p1, img2_p2, ...]
-        flattened_prompts = []
-        for _ in frames:
-            flattened_prompts.extend(prompts)
-        inputs = self.processor(images=pil_images, text=flattened_prompts, return_tensors="pt").to(self.device)
-        # Handle batch expansion
-        self._expand_inputs_if_needed(inputs)
-        with torch.no_grad():
-            outputs = self.model(**inputs)
-        try:
-             results_list = self.processor.post_process_instance_segmentation(
-                outputs,
-                threshold=self.threshold,
-                mask_threshold=self.mask_threshold,
-                target_sizes=inputs.get("original_sizes").tolist(),
-            )
-             return [self._parse_single_result(r, f.shape) for r, f in zip(results_list, frames)]
-        except Exception:
-            logging.exception("SAM3 batch post-processing failed")
-            return [
-                SegmentationResult(
-                    masks=np.zeros((0, f.shape[0], f.shape[1]), dtype=bool),
-                    scores=None,
-                    boxes=None
-                ) for f in frames
-            ]

requirements.txt CHANGED Viewed

@@ -10,3 +10,6 @@ ultralytics
 python-dotenv
 einops
 sentence-transformers

 python-dotenv
 einops
 sentence-transformers
+SAM-2 @ git+https://github.com/facebookresearch/sam2.git
+hydra-core>=1.3.2
+iopath>=0.1.10

utils/video.py CHANGED Viewed

@@ -9,6 +9,51 @@ import cv2
 import numpy as np
 def extract_frames(video_path: str) -> Tuple[List[np.ndarray], float, int, int]:
     cap = cv2.VideoCapture(video_path)
     if not cap.isOpened():

 import numpy as np
+def extract_frames_to_jpeg_dir(
+    video_path: str,
+    output_dir: str,
+    max_frames: int = None,
+) -> Tuple[List[str], float, int, int]:
+    """Extract video frames as numbered JPEG files for SAM2 video predictor.
+    Frames are written in decode order as ``000000.jpg``, ``000001.jpg``, ...
+    Files already present in *output_dir* are not removed, so callers should
+    pass a fresh directory for each video.
+    Args:
+        video_path: Path to input video.
+        output_dir: Directory to write JPEG files into (created if missing).
+        max_frames: Optional cap on number of frames to extract; ``None``
+            extracts every frame the decoder yields.
+    Returns:
+        (frame_names, fps, width, height) where *frame_names* is a sorted
+        list of filenames like ``000000.jpg``, ``000001.jpg``, etc. *fps*
+        falls back to 30.0 when the capture reports 0.
+    Raises:
+        ValueError: If the video cannot be opened or yields zero frames.
+        OSError: If a decoded frame cannot be written to *output_dir*.
+    """
+    os.makedirs(output_dir, exist_ok=True)
+    cap = cv2.VideoCapture(video_path)
+    frame_names: List[str] = []
+    # try/finally guarantees the capture handle is released even when
+    # opening, decoding, or writing fails part-way through the video.
+    try:
+        if not cap.isOpened():
+            raise ValueError(f"Unable to open video: {video_path}")
+        fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
+        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+        while max_frames is None or len(frame_names) < max_frames:
+            success, frame = cap.read()
+            if not success:
+                break
+            fname = f"{len(frame_names):06d}.jpg"
+            out_path = os.path.join(output_dir, fname)
+            # cv2.imwrite signals failure through its boolean return value;
+            # ignoring it would report a frame file that does not exist.
+            if not cv2.imwrite(out_path, frame):
+                raise OSError(f"Failed to write frame to {out_path}")
+            frame_names.append(fname)
+    finally:
+        cap.release()
+    if not frame_names:
+        raise ValueError("Video decode produced zero frames.")
+    return frame_names, fps, width, height
 def extract_frames(video_path: str) -> Tuple[List[np.ndarray], float, int, int]:
     cap = cv2.VideoCapture(video_path)
     if not cap.isOpened():