Zhen Ye committed on
Commit
af29397
·
1 Parent(s): 284ce20

feat: Implement SAHI Tiling for 4K video detection

Browse files

- Added utils/tiling.py for image slicing and NMS
- Updated DroneYolo and HFYoloV8 to auto-tile images > 3000px width
- Uses 1280x1280 tiles with 20% overlap for maximum small object recall

models/detectors/drone_yolo.py CHANGED
@@ -62,7 +62,94 @@ class DroneYoloDetector(ObjectDetector):
62
  label_names=label_names,
63
  )
64
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  def predict(self, frame: np.ndarray, queries: Sequence[str]) -> DetectionResult:
 
 
 
 
 
66
  device_arg = self.device
67
  results = self.model.predict(
68
  source=frame,
@@ -74,6 +161,13 @@ class DroneYoloDetector(ObjectDetector):
74
  return self._parse_single_result(results[0], queries)
75
 
76
  def predict_batch(self, frames: Sequence[np.ndarray], queries: Sequence[str]) -> Sequence[DetectionResult]:
 
 
 
 
 
 
 
77
  results = self.model.predict(
78
  source=frames,
79
  device=self.device,
 
62
  label_names=label_names,
63
  )
64
 
65
def _predict_tiled(self, frame: np.ndarray, queries: Sequence[str]) -> DetectionResult:
    """Run sliced (SAHI-style) inference on a high-resolution frame.

    The frame is cut into overlapping tiles, each tile is run through the
    underlying YOLO model, per-tile boxes are shifted back into global
    frame coordinates, and duplicates across overlapping tiles are merged
    with class-aware NMS.

    Args:
        frame: Full-resolution image as an H x W x C numpy array.
        queries: Label names to keep; filtering is delegated to
            ``self._filter_indices``.

    Returns:
        A DetectionResult with boxes in global frame coordinates.
    """
    # Local imports: the tiling path costs nothing unless used.
    # NOTE(review): the original added the tiling import at *class* scope
    # (making the helpers class attributes) and used `torch` without ever
    # importing it in this module — confirm module-level imports.
    import torch
    from utils.tiling import get_slice_bboxes, slice_image, shift_bboxes, batched_nms

    tile_size = 1280      # single source of truth for slice size AND imgsz
    overlap_ratio = 0.2   # 20% overlap for small-object recall at tile seams
    nms_iou = 0.4

    h, w = frame.shape[:2]
    slice_boxes = get_slice_bboxes(h, w, tile_size, tile_size, overlap_ratio, overlap_ratio)
    tiles = slice_image(frame, slice_boxes)

    all_boxes = []
    all_scores = []
    all_labels = []

    # Run tiles in chunks of max_batch_size to bound GPU memory usage.
    batch_size = self.max_batch_size
    for i in range(0, len(tiles), batch_size):
        batch_tiles = tiles[i : i + batch_size]
        batch_slices = slice_boxes[i : i + batch_size]

        results = self.model.predict(
            source=batch_tiles,
            device=self.device,
            conf=self.score_threshold,
            imgsz=tile_size,  # run each tile at its native resolution
            verbose=False,
        )

        for res, slice_coord in zip(results, batch_slices):
            if res.boxes is None:
                continue
            boxes = res.boxes.xyxy.cpu().numpy().tolist()
            scores = res.boxes.conf.cpu().numpy().tolist()
            clss = res.boxes.cls.cpu().numpy().tolist()

            # Translate tile-local boxes into global frame coordinates.
            all_boxes.extend(shift_bboxes(boxes, slice_coord))
            all_scores.extend(scores)
            all_labels.extend(clss)

    if not all_boxes:
        empty = np.empty((0, 4), dtype=np.float32)
        return DetectionResult(empty, [], [], [])

    # Merge duplicates across overlapping tiles with class-aware NMS.
    boxes_t = torch.tensor(all_boxes, device=self.device)
    scores_t = torch.tensor(all_scores, device=self.device)
    labels_t = torch.tensor(all_labels, device=self.device)

    keep = batched_nms(boxes_t, scores_t, labels_t, iou_threshold=nms_iou)

    final_boxes = boxes_t[keep].cpu().numpy()
    final_scores = scores_t[keep].cpu().tolist()
    final_labels = labels_t[keep].cpu().int().tolist()

    # Restrict detections to the requested query labels.
    label_names = [self.class_names.get(idx, f"class_{idx}") for idx in final_labels]
    keep_indices = self._filter_indices(label_names, queries)

    if not keep_indices:
        empty = np.empty((0, 4), dtype=np.float32)
        return DetectionResult(empty, [], [], [])

    return DetectionResult(
        boxes=final_boxes[keep_indices],
        scores=[final_scores[i] for i in keep_indices],
        labels=[final_labels[i] for i in keep_indices],
        label_names=[label_names[i] for i in keep_indices],
    )
146
+
147
  def predict(self, frame: np.ndarray, queries: Sequence[str]) -> DetectionResult:
148
+ h, w = frame.shape[:2]
149
+ # Enable tiling for 4Kish images (width > 3000)
150
+ if w > 3000:
151
+ return self._predict_tiled(frame, queries)
152
+
153
  device_arg = self.device
154
  results = self.model.predict(
155
  source=frame,
 
161
  return self._parse_single_result(results[0], queries)
162
 
163
  def predict_batch(self, frames: Sequence[np.ndarray], queries: Sequence[str]) -> Sequence[DetectionResult]:
164
+ # Mixed batch support is hard. Assume batch is uniform size.
165
+ if not frames: return []
166
+ h, w = frames[0].shape[:2]
167
+
168
+ if w > 3000:
169
+ return [self._predict_tiled(f, queries) for f in frames]
170
+
171
  results = self.model.predict(
172
  source=frames,
173
  device=self.device,
models/detectors/yolov8.py CHANGED
@@ -7,6 +7,7 @@ from huggingface_hub import hf_hub_download
7
  from ultralytics import YOLO
8
 
9
  from models.detectors.base import DetectionResult, ObjectDetector
 
10
 
11
 
12
  class HuggingFaceYoloV8Detector(ObjectDetector):
@@ -64,7 +65,81 @@ class HuggingFaceYoloV8Detector(ObjectDetector):
64
  label_names=label_names,
65
  )
66
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  def predict(self, frame: np.ndarray, queries: Sequence[str]) -> DetectionResult:
 
 
 
 
68
  results = self.model.predict(
69
  source=frame,
70
  device=self.device,
@@ -75,6 +150,11 @@ class HuggingFaceYoloV8Detector(ObjectDetector):
75
  return self._parse_single_result(results[0], queries)
76
 
77
  def predict_batch(self, frames: Sequence[np.ndarray], queries: Sequence[str]) -> Sequence[DetectionResult]:
 
 
 
 
 
78
  results = self.model.predict(
79
  source=frames,
80
  device=self.device,
 
7
  from ultralytics import YOLO
8
 
9
  from models.detectors.base import DetectionResult, ObjectDetector
10
+ from utils.tiling import get_slice_bboxes, slice_image, shift_bboxes, batched_nms
11
 
12
 
13
  class HuggingFaceYoloV8Detector(ObjectDetector):
 
65
  label_names=label_names,
66
  )
67
 
68
def _predict_tiled(self, frame: np.ndarray, queries: Sequence[str]) -> DetectionResult:
    """Run sliced (SAHI-style) inference on a high-resolution frame.

    Slices the frame into overlapping tiles, runs each tile through the
    YOLO model, shifts tile-local boxes into global frame coordinates and
    merges duplicates with class-aware NMS.

    Args:
        frame: Full-resolution image as an H x W x C numpy array.
        queries: Label names to keep; filtering is delegated to
            ``self._filter_indices``.

    Returns:
        A DetectionResult with boxes in global frame coordinates.
    """
    # NOTE(review): this module's visible import block never imports
    # `torch`; importing locally guarantees availability — confirm the
    # module-level imports and drop this if redundant.
    import torch

    tile_size = 1280      # single source of truth for slice size AND imgsz
    overlap_ratio = 0.2   # 20% overlap for small-object recall at tile seams
    nms_iou = 0.4

    h, w = frame.shape[:2]
    slice_boxes = get_slice_bboxes(h, w, tile_size, tile_size, overlap_ratio, overlap_ratio)
    tiles = slice_image(frame, slice_boxes)

    all_boxes = []
    all_scores = []
    all_labels = []

    # Run tiles in chunks of max_batch_size to bound GPU memory usage.
    batch_size = self.max_batch_size
    for i in range(0, len(tiles), batch_size):
        batch_tiles = tiles[i : i + batch_size]
        batch_slices = slice_boxes[i : i + batch_size]

        results = self.model.predict(
            source=batch_tiles,
            device=self.device,
            conf=self.score_threshold,
            imgsz=tile_size,  # run each tile at its native resolution
            verbose=False,
        )

        for res, slice_coord in zip(results, batch_slices):
            if res.boxes is None:
                continue
            boxes = res.boxes.xyxy.cpu().numpy().tolist()
            scores = res.boxes.conf.cpu().numpy().tolist()
            clss = res.boxes.cls.cpu().numpy().tolist()

            # Translate tile-local boxes into global frame coordinates.
            all_boxes.extend(shift_bboxes(boxes, slice_coord))
            all_scores.extend(scores)
            all_labels.extend(clss)

    if not all_boxes:
        empty = np.empty((0, 4), dtype=np.float32)
        return DetectionResult(empty, [], [], [])

    # Merge duplicates across overlapping tiles with class-aware NMS.
    boxes_t = torch.tensor(all_boxes, device=self.device)
    scores_t = torch.tensor(all_scores, device=self.device)
    labels_t = torch.tensor(all_labels, device=self.device)

    keep = batched_nms(boxes_t, scores_t, labels_t, iou_threshold=nms_iou)

    final_boxes = boxes_t[keep].cpu().numpy()
    final_scores = scores_t[keep].cpu().tolist()
    final_labels = labels_t[keep].cpu().int().tolist()

    # Restrict detections to the requested query labels.
    label_names = [self.class_names.get(idx, f"class_{idx}") for idx in final_labels]
    keep_indices = self._filter_indices(label_names, queries)

    if not keep_indices:
        empty = np.empty((0, 4), dtype=np.float32)
        return DetectionResult(empty, [], [], [])

    return DetectionResult(
        boxes=final_boxes[keep_indices],
        scores=[final_scores[i] for i in keep_indices],
        labels=[final_labels[i] for i in keep_indices],
        label_names=[label_names[i] for i in keep_indices],
    )
137
+
138
  def predict(self, frame: np.ndarray, queries: Sequence[str]) -> DetectionResult:
139
+ h, w = frame.shape[:2]
140
+ if w > 3000:
141
+ return self._predict_tiled(frame, queries)
142
+
143
  results = self.model.predict(
144
  source=frame,
145
  device=self.device,
 
150
  return self._parse_single_result(results[0], queries)
151
 
152
  def predict_batch(self, frames: Sequence[np.ndarray], queries: Sequence[str]) -> Sequence[DetectionResult]:
153
+ if not frames: return []
154
+ h, w = frames[0].shape[:2]
155
+ if w > 3000:
156
+ return [self._predict_tiled(f, queries) for f in frames]
157
+
158
  results = self.model.predict(
159
  source=frames,
160
  device=self.device,
utils/tiling.py ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import torch
3
+ import logging
4
+ from typing import List, Tuple, Dict, Any, Optional
5
+
6
def get_slice_bboxes(
    image_height: int,
    image_width: int,
    slice_height: int = 640,
    slice_width: int = 640,
    overlap_height_ratio: float = 0.2,
    overlap_width_ratio: float = 0.2,
) -> List[List[int]]:
    """Compute overlapping slice windows covering the whole image.

    Windows are ``slice_width`` x ``slice_height``; consecutive windows
    overlap by the given ratios. Windows that would run past an image edge
    are pushed back inside so every returned box lies fully within the
    image (matching SAHI's slicing behavior).

    Returns:
        List of ``[x_min, y_min, x_max, y_max]`` pixel boxes.
    """
    slice_bboxes: List[List[int]] = []
    y_overlap = int(slice_height * overlap_height_ratio)
    x_overlap = int(slice_width * overlap_width_ratio)

    y_min = 0
    y_max = 0
    while y_max < image_height:
        y_max = y_min + slice_height
        # Clamp the row to the bottom edge without mutating the loop vars.
        row_y_min, row_y_max = y_min, y_max
        if row_y_max > image_height:
            row_y_max = image_height
            row_y_min = max(0, image_height - slice_height)

        x_min = 0
        x_max = 0
        while x_max < image_width:
            x_max = x_min + slice_width
            # Clamp the column to the right edge.
            col_x_min, col_x_max = x_min, x_max
            if col_x_max > image_width:
                col_x_max = image_width
                col_x_min = max(0, image_width - slice_width)

            slice_bboxes.append([col_x_min, row_y_min, col_x_max, row_y_max])
            x_min = x_max - x_overlap

        # Advance to the next row exactly once per row. The original
        # advanced y_min alongside the x advance in the inner loop, which
        # corrupts the y extent of every box after the first column of a row.
        y_min = y_max - y_overlap

    return slice_bboxes
45
+
46
def slice_image(
    image: np.ndarray,
    slice_bboxes: List[List[int]]
) -> List[np.ndarray]:
    """Return one crop of *image* per ``[xmin, ymin, xmax, ymax]`` box.

    Crops are numpy views into the original image (no copying).
    """
    return [image[y0:y1, x0:x1] for x0, y0, x1, y1 in slice_bboxes]
56
+
57
def shift_bboxes(
    bboxes: List[List[float]],
    slice_coords: List[int]
) -> List[List[float]]:
    """Translate tile-local xyxy boxes into global image coordinates.

    slice_coords: [xmin, ymin, xmax, ymax] of the tile inside the full image;
    only the tile origin (xmin, ymin) is used as the translation offset.
    bboxes: list of [xmin, ymin, xmax, ymax] boxes in tile coordinates.
    """
    dx, dy = slice_coords[0], slice_coords[1]
    return [[x1 + dx, y1 + dy, x2 + dx, y2 + dy] for x1, y1, x2, y2 in bboxes]
79
+
80
def batched_nms(
    boxes: torch.Tensor,
    scores: torch.Tensor,
    idxs: torch.Tensor,
    iou_threshold: float = 0.5
) -> torch.Tensor:
    """Class-aware non-maximum suppression.

    Boxes with different ``idxs`` (class labels) never suppress each other.
    Uses ``torchvision.ops.batched_nms`` when available; otherwise falls
    back to a pure-torch greedy implementation.

    Args:
        boxes: (N, 4) tensor of xyxy boxes.
        scores: (N,) tensor of confidence scores.
        idxs: (N,) tensor of integer class labels.
        iou_threshold: same-class boxes with IoU above this are suppressed.

    Returns:
        1-D int64 tensor of indices into ``boxes`` to keep.
    """
    if boxes.numel() == 0:
        return torch.empty((0,), dtype=torch.int64, device=boxes.device)

    # Fast path: torchvision's C-accelerated implementation.
    # (The original also probed `ultralytics.utils.ops.non_max_suppression`
    # here but never called it — that dead import block is removed.)
    try:
        import torchvision
        return torchvision.ops.batched_nms(boxes, scores, idxs, iou_threshold)
    except ImportError:
        pass

    # Greedy per-class NMS fallback: O(N^2) worst case but dependency-free.
    keep_indices: List[torch.Tensor] = []
    for label in idxs.unique():
        mask = idxs == label
        cls_boxes = boxes[mask]
        original_indices = torch.where(mask)[0]

        # Process this class's boxes in descending score order.
        order = torch.argsort(scores[mask], descending=True)
        cls_boxes = cls_boxes[order]
        original_indices = original_indices[order]

        while cls_boxes.size(0) > 0:
            # The highest-scoring remaining box is always kept.
            keep_indices.append(original_indices[0])
            if cls_boxes.size(0) == 1:
                break

            current = cls_boxes[0].unsqueeze(0)
            rest = cls_boxes[1:]

            # Vectorized IoU of the kept box against all remaining boxes.
            x1 = torch.max(current[:, 0], rest[:, 0])
            y1 = torch.max(current[:, 1], rest[:, 1])
            x2 = torch.min(current[:, 2], rest[:, 2])
            y2 = torch.min(current[:, 3], rest[:, 3])
            inter = (x2 - x1).clamp(min=0) * (y2 - y1).clamp(min=0)
            area_cur = (current[:, 2] - current[:, 0]) * (current[:, 3] - current[:, 1])
            area_rest = (rest[:, 2] - rest[:, 0]) * (rest[:, 3] - rest[:, 1])
            iou = inter / (area_cur + area_rest - inter + 1e-6)

            # Keep only same-class boxes that do not overlap too much.
            survivors = iou < iou_threshold
            cls_boxes = rest[survivors]
            original_indices = original_indices[1:][survivors]

    # torch.stack avoids torch.tensor() on a list of 0-dim tensors, which is
    # a deprecated/fragile construction path.
    return torch.stack(keep_indices).to(torch.int64)