Spaces:

Rishabh12j
/

DepthLens

Sleeping

File size: 2,536 Bytes

5412d82

"""
Object detection using YOLOv8n.

Wraps the ultralytics YOLO interface and returns detections in the format
expected by build_depth_context: (boxes, classes, confidences).
"""

import numpy as np
import torch
from ultralytics import YOLO

from ..config import CONF_THRESHOLD, YOLO_MODEL


class ObjectDetector:
    """Thin wrapper around an ultralytics YOLO detection model.

    The weights file is selected by ``YOLO_MODEL`` from the package config
    (the log message refers to it as YOLOv8n).  ultralytics fetches the
    weights on first use and reuses the local copy afterwards; where that
    copy is stored depends on the installed ultralytics version/settings.

    Attributes:
        model:  The loaded ``ultralytics.YOLO`` instance.
        device: ``"cuda"`` when a CUDA device is available, else ``"cpu"``.
    """

    def __init__(self) -> None:
        """Load the YOLO weights and move them to the best available device."""
        print("Loading YOLOv8n...")
        self.model = YOLO(YOLO_MODEL)

        # Query CUDA availability once so the device choice and the memory
        # report below cannot disagree.
        use_cuda = torch.cuda.is_available()
        self.device = "cuda" if use_cuda else "cpu"

        # Explicitly move the underlying PyTorch model to the chosen device.
        self.model.to(self.device)

        if use_cuda:
            print(
                f"  GPU memory allocated: "
                f"{torch.cuda.memory_allocated() / 1024**2:.0f} MB"
            )

    def detect(
        self, image: np.ndarray
    ) -> tuple[np.ndarray, list[str], list[float]]:
        """Run detection on an RGB image.

        Args:
            image: uint8 RGB numpy array of shape (H, W, 3).

        Returns:
            boxes:       float32 array of shape (N, 4) as [x1, y1, x2, y2]
                         in pixel coordinates.
            classes:     List of N class-name strings.
            confidences: List of N confidence floats in [0, 1].

            When nothing is detected, ``boxes`` has shape (0, 4) and both
            lists are empty.

        Raises:
            ValueError: If ``image`` is not a 3-D array with exactly three
                channels on the last axis.
        """
        frame = np.asarray(image)

        # Reject grayscale / RGBA / batched input up front.  Reversing the
        # last axis of such arrays would mirror the image or scramble the
        # channels instead of performing an RGB→BGR swap.
        if frame.ndim != 3 or frame.shape[-1] != 3:
            raise ValueError(
                "image must be an RGB array of shape (H, W, 3); "
                f"got shape {frame.shape}"
            )

        # ultralytics treats numpy input as BGR (the OpenCV convention) and
        # converts it to RGB internally, so hand it BGR data.  The channel
        # flip alone yields a negative-stride view; copy it into contiguous
        # memory so downstream OpenCV / torch code receives a plain array.
        bgr = np.ascontiguousarray(frame[..., ::-1])

        with torch.inference_mode():
            results = self.model(
                bgr,
                conf=CONF_THRESHOLD,
                verbose=False,
                device=self.device,
            )

        # A single image was passed in, so only the first result is used.
        result = results[0]
        det = result.boxes

        if det is None or len(det) == 0:
            return np.empty((0, 4), dtype=np.float32), [], []

        boxes = det.xyxy.cpu().numpy().astype(np.float32)          # (N, 4)
        confidences = det.conf.cpu().numpy().tolist()               # (N,)
        class_ids = det.cls.cpu().numpy().astype(int).tolist()      # (N,)

        # Map numeric class ids to their human-readable names.
        classes = [result.names[cid] for cid in class_ids]

        return boxes, classes, confidences