Zhen Ye Claude Opus 4.6 committed on
Commit
e1fbf50
·
1 Parent(s): 3c61b44

feat: replace drone_yolo with YOLOv8-VisDrone detector

Browse files

Remove broken drone_yolo detector (rujutashashikanjoshi repo not found)
and replace with Mahadih534/YoloV8-VisDrone. Use hf_hub_download for
reliable weight fetching. Allow drone_detection mode to use user-selected
detector instead of hardcoding.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

app.py CHANGED
@@ -264,7 +264,7 @@ async def detect_endpoint(
264
  detector: Model to use (yolo11, detr_resnet50, grounding_dino)
265
  segmenter: Segmentation model to use (GSAM2-S/B/L, YSAM2-S/B/L)
266
  enable_depth: Whether to run legacy depth estimation (default: False)
267
- drone_detection uses the dedicated drone_yolo model.
268
 
269
  Returns:
270
  - For object_detection: Processed video with bounding boxes
@@ -344,7 +344,7 @@ async def detect_endpoint(
344
  os.close(fd)
345
 
346
  # Parse queries with mission awareness
347
- detector_name = "drone_yolo" if mode == "drone_detection" else detector
348
  mission_spec = None
349
 
350
  if queries.strip():
@@ -447,8 +447,8 @@ async def detect_async_endpoint(
447
  detector_name = detector
448
  mission_detector = detector # detector key used for mission query parsing
449
  if mode == "drone_detection":
450
- detector_name = "drone_yolo"
451
- mission_detector = "drone_yolo"
452
  elif mode == "segmentation":
453
  # Segmenter registry owns detector selection (GSAM2→GDINO, YSAM2→YOLO).
454
  # detector_name=None so the job doesn't forward it (avoids duplicate kwarg).
 
264
  detector: Model to use (yolo11, detr_resnet50, grounding_dino)
265
  segmenter: Segmentation model to use (GSAM2-S/B/L, YSAM2-S/B/L)
266
  enable_depth: Whether to run legacy depth estimation (default: False)
267
+ drone_detection uses the dedicated yolov8_visdrone model.
268
 
269
  Returns:
270
  - For object_detection: Processed video with bounding boxes
 
344
  os.close(fd)
345
 
346
  # Parse queries with mission awareness
347
+ detector_name = (detector or "yolov8_visdrone") if mode == "drone_detection" else detector
348
  mission_spec = None
349
 
350
  if queries.strip():
 
447
  detector_name = detector
448
  mission_detector = detector # detector key used for mission query parsing
449
  if mode == "drone_detection":
450
+ detector_name = detector or "yolov8_visdrone"
451
+ mission_detector = detector_name
452
  elif mode == "segmentation":
453
  # Segmenter registry owns detector selection (GSAM2→GDINO, YSAM2→YOLO).
454
  # detector_name=None so the job doesn't forward it (avoids duplicate kwarg).
frontend/index.html CHANGED
@@ -83,7 +83,6 @@
83
  <option value="YSAM2-S" data-kind="segmentation">YSAM2-S (Fast)</option>
84
  </optgroup>
85
  <optgroup label="Drone Detection Models">
86
- <option value="drone_yolo" data-kind="drone">Drone</option>
87
  <option value="yolov8_visdrone" data-kind="drone">VisDrone (YOLOv8)</option>
88
  </optgroup>
89
 
 
83
  <option value="YSAM2-S" data-kind="segmentation">YSAM2-S (Fast)</option>
84
  </optgroup>
85
  <optgroup label="Drone Detection Models">
 
86
  <option value="yolov8_visdrone" data-kind="drone">VisDrone (YOLOv8)</option>
87
  </optgroup>
88
 
models/detectors/drone_yolo.py DELETED
@@ -1,182 +0,0 @@
1
- import logging
2
- from typing import List, Sequence
3
-
4
- import numpy as np
5
- import torch
6
- from ultralytics import YOLO
7
-
8
- from models.detectors.base import DetectionResult, ObjectDetector
9
- from utils.tiling import get_slice_bboxes, slice_image, shift_bboxes, batched_nms
10
-
11
-
12
- class DroneYoloDetector(ObjectDetector):
13
- """Drone detector backed by a YOLO model on the Hugging Face Hub."""
14
-
15
- REPO_ID = "rujutashashikanjoshi/yolo12-drone-detection-0205-100m"
16
- supports_batch = True
17
- max_batch_size = 32
18
-
19
- def __init__(self, score_threshold: float = 0.3, device: str = None) -> None:
20
- self.name = "drone_yolo"
21
- self.score_threshold = score_threshold
22
- # CRITICAL: Store device as torch.device, NOT a string.
23
- # Ultralytics' select_device() sets CUDA_VISIBLE_DEVICES when it
24
- # receives a string like "cuda:0", restricting the entire process to
25
- # one GPU. Passing a torch.device object causes select_device() to
26
- # return immediately without touching the environment.
27
- if device:
28
- self.device = torch.device(device)
29
- else:
30
- self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
31
- logging.info(
32
- "Loading drone YOLO from HuggingFace Hub: %s onto %s",
33
- self.REPO_ID,
34
- self.device,
35
- )
36
- # Load directly from HuggingFace Hub using ultralytics native support
37
- self.model = YOLO(f"hf://{self.REPO_ID}")
38
- self.model.to(self.device)
39
- self.class_names = self.model.names
40
-
41
- def _filter_indices(self, label_names: Sequence[str], queries: Sequence[str]) -> List[int]:
42
- if not queries:
43
- return list(range(len(label_names)))
44
- allowed = {query.lower().strip() for query in queries if query}
45
- keep = [idx for idx, name in enumerate(label_names) if name.lower() in allowed]
46
- return keep or list(range(len(label_names)))
47
-
48
- def _parse_single_result(self, result, queries: Sequence[str]) -> DetectionResult:
49
- boxes = result.boxes
50
- if boxes is None or boxes.xyxy is None:
51
- empty = np.empty((0, 4), dtype=np.float32)
52
- return DetectionResult(empty, [], [], [])
53
-
54
- xyxy = boxes.xyxy.cpu().numpy()
55
- scores = boxes.conf.cpu().numpy().tolist()
56
- label_ids = boxes.cls.cpu().numpy().astype(int).tolist()
57
- label_names = [self.class_names.get(idx, f"class_{idx}") for idx in label_ids]
58
- keep_indices = self._filter_indices(label_names, queries)
59
- xyxy = xyxy[keep_indices] if len(xyxy) else xyxy
60
- scores = [scores[i] for i in keep_indices]
61
- label_ids = [label_ids[i] for i in keep_indices]
62
- label_names = [label_names[i] for i in keep_indices]
63
- return DetectionResult(
64
- boxes=xyxy,
65
- scores=scores,
66
- labels=label_ids,
67
- label_names=label_names,
68
- )
69
-
70
-
71
- def _predict_tiled(self, frame: np.ndarray, queries: Sequence[str]) -> DetectionResult:
72
- """Run tiled inference for high-resolution frames."""
73
- # 1. Slice
74
- h, w = frame.shape[:2]
75
- # Heuristic: 1280x1280 tiles with 20% overlap
76
- slice_boxes = get_slice_bboxes(h, w, 1280, 1280, 0.2, 0.2)
77
- tiles = slice_image(frame, slice_boxes)
78
-
79
- # 2. Batch Inference
80
- # We can use our own model's batch prediction if we can trust it not to recurse strictly
81
- # But we need raw results to merge.
82
- # Actually proper way: run standard predict on tiles.
83
-
84
- all_boxes = []
85
- all_scores = []
86
- all_labels = []
87
-
88
- # Run in batches of max_batch_size to respect GPU memory
89
- batch_size = self.max_batch_size
90
- for i in range(0, len(tiles), batch_size):
91
- batch_tiles = tiles[i : i + batch_size]
92
- batch_slices = slice_boxes[i : i + batch_size]
93
-
94
- results = self.model.predict(
95
- source=batch_tiles,
96
- device=self.device,
97
- conf=self.score_threshold,
98
- imgsz=1280, # Run tiles at full res
99
- verbose=False,
100
- )
101
-
102
- for res, slice_coord in zip(results, batch_slices):
103
- if res.boxes is None: continue
104
- # Extract standard results
105
- boxes = res.boxes.xyxy.cpu().numpy().tolist()
106
- scores = res.boxes.conf.cpu().numpy().tolist()
107
- clss = res.boxes.cls.cpu().numpy().tolist()
108
-
109
- # Shift to global
110
- shifted = shift_bboxes(boxes, slice_coord)
111
-
112
- all_boxes.extend(shifted)
113
- all_scores.extend(scores)
114
- all_labels.extend(clss)
115
-
116
- if not all_boxes:
117
- empty = np.empty((0, 4), dtype=np.float32)
118
- return DetectionResult(empty, [], [], [])
119
-
120
- # 3. NMS Merge
121
- boxes_t = torch.tensor(all_boxes, device=self.device)
122
- scores_t = torch.tensor(all_scores, device=self.device)
123
- labels_t = torch.tensor(all_labels, device=self.device)
124
-
125
- keep = batched_nms(boxes_t, scores_t, labels_t, iou_threshold=0.4)
126
-
127
- final_boxes = boxes_t[keep].cpu().numpy()
128
- final_scores = scores_t[keep].cpu().tolist()
129
- final_labels = labels_t[keep].cpu().int().tolist()
130
-
131
- # 4. Filter & Format
132
- label_names = [self.class_names.get(idx, f"class_{idx}") for idx in final_labels]
133
- keep_indices = self._filter_indices(label_names, queries)
134
-
135
- if not keep_indices:
136
- empty = np.empty((0, 4), dtype=np.float32)
137
- return DetectionResult(empty, [], [], [])
138
-
139
- final_boxes = final_boxes[keep_indices]
140
- final_scores = [final_scores[i] for i in keep_indices]
141
- final_labels = [final_labels[i] for i in keep_indices]
142
- final_names = [label_names[i] for i in keep_indices]
143
-
144
- return DetectionResult(
145
- boxes=final_boxes,
146
- scores=final_scores,
147
- labels=final_labels,
148
- label_names=final_names
149
- )
150
-
151
- def predict(self, frame: np.ndarray, queries: Sequence[str]) -> DetectionResult:
152
- h, w = frame.shape[:2]
153
- # Enable tiling for 4Kish images (width > 3000)
154
- if w > 3000:
155
- return self._predict_tiled(frame, queries)
156
-
157
- device_arg = self.device
158
- results = self.model.predict(
159
- source=frame,
160
- device=device_arg,
161
- conf=self.score_threshold,
162
- imgsz=1280,
163
- verbose=False,
164
- )
165
- return self._parse_single_result(results[0], queries)
166
-
167
- def predict_batch(self, frames: Sequence[np.ndarray], queries: Sequence[str]) -> Sequence[DetectionResult]:
168
- # Mixed batch support is hard. Assume batch is uniform size.
169
- if not frames: return []
170
- h, w = frames[0].shape[:2]
171
-
172
- if w > 3000:
173
- return [self._predict_tiled(f, queries) for f in frames]
174
-
175
- results = self.model.predict(
176
- source=frames,
177
- device=self.device,
178
- conf=self.score_threshold,
179
- imgsz=1280,
180
- verbose=False,
181
- )
182
- return [self._parse_single_result(r, queries) for r in results]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
models/detectors/yolov8_visdrone.py CHANGED
@@ -1,13 +1,20 @@
1
  import logging
 
 
2
  from typing import List, Sequence
3
 
4
  import numpy as np
5
  import torch
 
6
  from ultralytics import YOLO
7
 
8
  from models.detectors.base import DetectionResult, ObjectDetector
9
  from utils.tiling import get_slice_bboxes, slice_image, shift_bboxes, batched_nms
10
 
 
 
 
 
11
 
12
  class YoloV8VisDroneDetector(ObjectDetector):
13
  """YOLOv8 detector fine-tuned on VisDrone dataset for aerial imagery."""
@@ -28,7 +35,14 @@ class YoloV8VisDroneDetector(ObjectDetector):
28
  self.REPO_ID,
29
  self.device,
30
  )
31
- self.model = YOLO(f"hf://{self.REPO_ID}")
 
 
 
 
 
 
 
32
  self.model.to(self.device)
33
  self.class_names = self.model.names
34
 
 
1
  import logging
2
+ import os
3
+ from pathlib import Path
4
  from typing import List, Sequence
5
 
6
  import numpy as np
7
  import torch
8
+ from huggingface_hub import hf_hub_download
9
  from ultralytics import YOLO
10
 
11
  from models.detectors.base import DetectionResult, ObjectDetector
12
  from utils.tiling import get_slice_bboxes, slice_image, shift_bboxes, batched_nms
13
 
14
+ _WEIGHTS_CACHE = Path(os.environ.get("YOLO_CACHE", "/tmp/yolo_weights"))
15
+ _WEIGHTS_CACHE.mkdir(parents=True, exist_ok=True)
16
+ _VISDRONE_PATH = _WEIGHTS_CACHE / "visDrone.pt"
17
+
18
 
19
  class YoloV8VisDroneDetector(ObjectDetector):
20
  """YOLOv8 detector fine-tuned on VisDrone dataset for aerial imagery."""
 
35
  self.REPO_ID,
36
  self.device,
37
  )
38
+ if not _VISDRONE_PATH.exists():
39
+ logging.info("Downloading visDrone.pt to %s ...", _VISDRONE_PATH)
40
+ hf_hub_download(
41
+ repo_id=self.REPO_ID,
42
+ filename="visDrone.pt",
43
+ local_dir=str(_WEIGHTS_CACHE),
44
+ )
45
+ self.model = YOLO(str(_VISDRONE_PATH))
46
  self.model.to(self.device)
47
  self.class_names = self.model.names
48
 
models/model_loader.py CHANGED
@@ -4,7 +4,6 @@ from typing import Callable, Dict, Optional
4
 
5
  from models.detectors.base import ObjectDetector
6
  from models.detectors.detr import DetrDetector
7
- from models.detectors.drone_yolo import DroneYoloDetector
8
  from models.detectors.grounding_dino import GroundingDinoDetector
9
  from models.detectors.yolov11 import Yolo11Detector
10
  from models.detectors.yolov8_visdrone import YoloV8VisDroneDetector
@@ -16,7 +15,6 @@ _REGISTRY: Dict[str, Callable[[], ObjectDetector]] = {
16
  "yolo11": Yolo11Detector,
17
  "detr_resnet50": DetrDetector,
18
  "grounding_dino": GroundingDinoDetector,
19
- "drone_yolo": DroneYoloDetector,
20
  "yolov8_visdrone": YoloV8VisDroneDetector,
21
  }
22
 
 
4
 
5
  from models.detectors.base import ObjectDetector
6
  from models.detectors.detr import DetrDetector
 
7
  from models.detectors.grounding_dino import GroundingDinoDetector
8
  from models.detectors.yolov11 import Yolo11Detector
9
  from models.detectors.yolov8_visdrone import YoloV8VisDroneDetector
 
15
  "yolo11": Yolo11Detector,
16
  "detr_resnet50": DetrDetector,
17
  "grounding_dino": GroundingDinoDetector,
 
18
  "yolov8_visdrone": YoloV8VisDroneDetector,
19
  }
20
 
utils/profiler.py CHANGED
@@ -20,7 +20,7 @@ logger = logging.getLogger(__name__)
20
  # Detectors whose predict() can be decomposed into processor -> model -> post_process
21
  _DECOMPOSABLE_DETECTORS = {"detr_resnet50", "grounding_dino"}
22
  # Detectors with opaque predict() calls (YOLO-based)
23
- _OPAQUE_DETECTORS = {"yolo11", "drone_yolo"}
24
 
25
 
26
  @dataclass
 
20
  # Detectors whose predict() can be decomposed into processor -> model -> post_process
21
  _DECOMPOSABLE_DETECTORS = {"detr_resnet50", "grounding_dino"}
22
  # Detectors with opaque predict() calls (YOLO-based)
23
+ _OPAQUE_DETECTORS = {"yolo11", "yolov8_visdrone"}
24
 
25
 
26
  @dataclass
utils/roofline.py CHANGED
@@ -18,7 +18,7 @@ _MODEL_FLOPS: Dict[str, float] = {
18
  "yolo11": 78.9, # YOLO11m ~79 GFLOPs at 640px
19
  "detr_resnet50": 86.0, # DETR-R50 ~86 GFLOPs at 800px
20
  "grounding_dino": 172.0, # Grounding DINO-B ~172 GFLOPs
21
- "drone_yolo": 78.9, # Same arch as YOLO11m-class model
22
 
23
  # Segmentation models (GFLOPs per keyframe)
24
  "GSAM2-S": 48.0, # SAM2 small encoder
@@ -37,7 +37,7 @@ _MODEL_BYTES: Dict[str, float] = {
37
  "yolo11": 52.0,
38
  "detr_resnet50": 166.0,
39
  "grounding_dino": 340.0,
40
- "drone_yolo": 52.0,
41
  "GSAM2-S": 92.0,
42
  "GSAM2-B": 180.0,
43
  "GSAM2-L": 400.0,
 
18
  "yolo11": 78.9, # YOLO11m ~79 GFLOPs at 640px
19
  "detr_resnet50": 86.0, # DETR-R50 ~86 GFLOPs at 800px
20
  "grounding_dino": 172.0, # Grounding DINO-B ~172 GFLOPs
21
+ "yolov8_visdrone": 78.9, # YOLOv8 VisDrone model
22
 
23
  # Segmentation models (GFLOPs per keyframe)
24
  "GSAM2-S": 48.0, # SAM2 small encoder
 
37
  "yolo11": 52.0,
38
  "detr_resnet50": 166.0,
39
  "grounding_dino": 340.0,
40
+ "yolov8_visdrone": 52.0,
41
  "GSAM2-S": 92.0,
42
  "GSAM2-B": 180.0,
43
  "GSAM2-L": 400.0,