Zhen Ye committed on
Commit
5c36daa
·
1 Parent(s): 1eea4fe

Implement Batch Inference & Queue Backpressure Fixes

Browse files
inference.py CHANGED
@@ -22,6 +22,7 @@ from models.detectors.base import ObjectDetector
22
  from models.model_loader import load_detector, load_detector_on_device
23
  from models.segmenters.model_loader import load_segmenter, load_segmenter_on_device
24
  from models.depth_estimators.model_loader import load_depth_estimator, load_depth_estimator_on_device
 
25
  from utils.video import extract_frames, write_video, VideoReader, VideoWriter
26
  from utils.gpt_distance import estimate_distance_gpt
27
  import tempfile
@@ -352,6 +353,149 @@ def infer_frame(
352
  ), detections
353
 
354
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
355
  def infer_segmentation_frame(
356
  frame: np.ndarray,
357
  text_queries: Optional[List[str]] = None,
@@ -557,16 +701,45 @@ def run_inference(
557
  # queue_in: (frame_idx, frame_data)
558
  # queue_out: (frame_idx, processed_frame, detections)
559
  queue_in = Queue(maxsize=16)
560
- queue_out = Queue() # Unbounded, consumed by writer
 
 
 
561
 
562
  # 5. Worker Function
563
  def worker_task(gpu_idx: int):
564
  detector_instance = detectors[gpu_idx]
565
  depth_instance = depth_estimators[gpu_idx] if depth_estimators[gpu_idx] else None
566
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
567
  while True:
568
  item = queue_in.get()
569
  if item is None:
 
570
  queue_in.task_done()
571
  break
572
 
@@ -576,24 +749,22 @@ def run_inference(
576
  logging.debug("Processing frame %d on device %s", frame_idx, "cpu" if num_gpus==0 else f"cuda:{gpu_idx}")
577
 
578
  try:
579
- # Depth strategy: Run every 3 frames
580
- active_depth_name = depth_estimator_name if (frame_idx % 3 == 0) else None
581
- active_depth_instance = depth_instance if (frame_idx % 3 == 0) else None
582
-
583
- processed, frame_dets = infer_frame(
584
- frame_data,
585
- queries,
586
- detector_name=None,
587
- depth_estimator_name=active_depth_name,
588
- depth_scale=depth_scale,
589
- detector_instance=detector_instance,
590
- depth_estimator_instance=active_depth_instance
591
- )
592
- queue_out.put((frame_idx, processed, frame_dets))
593
  except Exception as e:
594
- logging.exception("Error processing frame %d", frame_idx)
595
- # Put placeholders to avoid hanging writer
596
- queue_out.put((frame_idx, frame_data, []))
 
 
 
 
 
 
 
 
 
597
 
598
  queue_in.task_done()
599
 
@@ -626,7 +797,18 @@ def run_inference(
626
  # Fetch from queue
627
  try:
628
  while next_idx not in buffer:
 
 
 
 
 
 
 
 
 
 
629
  item = queue_out.get(timeout=1.0) # wait
 
630
  idx, p_frame, dets = item
631
  buffer[idx] = (p_frame, dets)
632
 
@@ -763,33 +945,68 @@ def run_segmentation(
763
 
764
  # 3. Processing
765
  queue_in = Queue(maxsize=16)
766
- queue_out = Queue()
767
 
768
  def worker_seg(gpu_idx: int):
769
  seg = segmenters[gpu_idx]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
770
  while True:
771
  item = queue_in.get()
772
  if item is None:
 
773
  queue_in.task_done()
774
  break
775
 
776
  idx, frame = item
777
-
778
  if idx % 30 == 0:
779
- logging.debug("Segmenting frame %d (GPU %d)", idx, gpu_idx)
 
 
 
780
 
781
- try:
782
- processed, _ = infer_segmentation_frame(
783
- frame,
784
- text_queries=queries,
785
- segmenter_name=None,
786
- segmenter_instance=seg
787
- )
788
- queue_out.put((idx, processed))
789
- except Exception as e:
790
- logging.error("Segmentation failed frame %d: %s", idx, e)
791
- queue_out.put((idx, frame))
792
-
793
  queue_in.task_done()
794
 
795
  workers = []
@@ -811,6 +1028,10 @@ def run_segmentation(
811
  while next_idx < total_frames:
812
  try:
813
  while next_idx not in buffer:
 
 
 
 
814
  idx, frm = queue_out.get(timeout=1.0)
815
  buffer[idx] = frm
816
 
@@ -1014,49 +1235,82 @@ def run_depth_inference(
1014
  logging.info("Starting Phase 2: Streaming...")
1015
 
1016
  queue_in = Queue(maxsize=16)
1017
- queue_out = Queue()
 
1018
 
1019
  def worker_depth(gpu_idx: int):
1020
  est = estimators[gpu_idx]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1021
  while True:
1022
  item = queue_in.get()
1023
  if item is None:
 
1024
  queue_in.task_done()
1025
  break
1026
 
1027
  idx, frame = item
1028
- try:
1029
- if idx % 30 == 0:
1030
- logging.info("Depth frame %d (GPU %d)", idx, gpu_idx)
1031
-
1032
- with est.lock:
1033
- res = est.predict(frame)
1034
-
1035
- depth_map = res.depth_map
1036
- # Colorize
1037
- colored = colorize_depth_map(depth_map, global_min, global_max)
1038
-
1039
- # Overlay Detections
1040
- # Detections list is [ [det1, det2], [det1, det2] ... ]
1041
- if detections and idx < len(detections):
1042
- frame_dets = detections[idx]
1043
- if frame_dets:
1044
- import cv2
1045
- boxes = []
1046
- labels = []
1047
- for d in frame_dets:
1048
- boxes.append(d.get("bbox"))
1049
- lbl = d.get("label", "obj")
1050
- if d.get("depth_est_m"):
1051
- lbl = f"{lbl} {int(d['depth_est_m'])}m"
1052
- labels.append(lbl)
1053
- colored = draw_boxes(colored, boxes=boxes, label_names=labels)
1054
-
1055
- queue_out.put((idx, colored))
1056
- except Exception as e:
1057
- logging.error("Depth worker failed frame %d: %s", idx, e)
1058
- queue_out.put((idx, frame)) # Fallback to original?
1059
 
 
 
 
 
 
 
1060
  queue_in.task_done()
1061
 
1062
  # Workers
@@ -1081,6 +1335,8 @@ def run_depth_inference(
1081
  while next_idx < total_frames:
1082
  try:
1083
  while next_idx not in buffer:
 
 
1084
  idx, frm = queue_out.get(timeout=1.0)
1085
  buffer[idx] = frm
1086
 
 
22
  from models.model_loader import load_detector, load_detector_on_device
23
  from models.segmenters.model_loader import load_segmenter, load_segmenter_on_device
24
  from models.depth_estimators.model_loader import load_depth_estimator, load_depth_estimator_on_device
25
+ from models.depth_estimators.base import DepthEstimator
26
  from utils.video import extract_frames, write_video, VideoReader, VideoWriter
27
  from utils.gpt_distance import estimate_distance_gpt
28
  import tempfile
 
353
  ), detections
354
 
355
 
356
def infer_batch(
    frames: List[np.ndarray],
    frame_indices: List[int],
    queries: Sequence[str],
    detector_instance: ObjectDetector,
    depth_estimator_instance: Optional[DepthEstimator] = None,
    depth_scale: float = 1.0,
    depth_frame_stride: int = 3,
) -> List[Tuple[int, np.ndarray, List[Dict[str, Any]]]]:
    """Run detection (and optionally depth) over a batch of frames.

    Detection is issued as one batched call when the detector supports it,
    otherwise sequentially, in both cases under the detector's lock.  Depth
    is estimated only for frames whose index is a multiple of
    ``depth_frame_stride`` (same stride the per-frame path used).

    Args:
        frames: BGR frames, parallel to ``frame_indices``.
        frame_indices: absolute frame indices, used for stride selection
            and for ordering the results downstream.
        queries: text queries for the open-vocabulary detector; an empty
            sequence falls back to ``["object"]``.
        detector_instance: detector with ``lock``/``supports_batch``/
            ``predict``/``predict_batch``.
        depth_estimator_instance: optional depth model with the same
            batch interface; ``None`` disables depth entirely.
        depth_scale: scale used when converting raw depth to metres.
        depth_frame_stride: run depth every N-th frame index.

    Returns:
        ``[(frame_idx, annotated_frame, detections), ...]`` in input order.
        On detection failure the whole batch degrades to the raw frames
        with empty detection lists instead of raising.
    """
    text_queries = list(queries) or ["object"]

    # --- Batched detection -------------------------------------------------
    try:
        with detector_instance.lock:
            if detector_instance.supports_batch:
                det_results = detector_instance.predict_batch(frames, text_queries)
            else:
                # Sequential fallback for detectors without batch support.
                det_results = [detector_instance.predict(f, text_queries) for f in frames]
    except Exception:
        logging.exception("Batch detection failed")
        # Degrade gracefully: original frames, no detections.
        return [(idx, f, []) for idx, f in zip(frame_indices, frames)]

    # --- Batched depth (strided) -------------------------------------------
    depth_map_results: Dict[int, Any] = {}  # frame_idx -> depth result
    depth_inputs: List[np.ndarray] = []
    depth_indices: List[int] = []
    for idx, f in zip(frame_indices, frames):
        if idx % depth_frame_stride == 0:
            depth_inputs.append(f)
            depth_indices.append(idx)

    if depth_estimator_instance and depth_inputs:
        try:
            with depth_estimator_instance.lock:
                if depth_estimator_instance.supports_batch:
                    d_results = depth_estimator_instance.predict_batch(depth_inputs)
                else:
                    d_results = [depth_estimator_instance.predict(f) for f in depth_inputs]
            for idx, res in zip(depth_indices, d_results):
                depth_map_results[idx] = res
        except Exception:
            # Depth is best-effort: detections are still emitted without it.
            logging.exception("Batch depth estimation failed")

    # --- Merge, annotate, and emit ------------------------------------------
    outputs: List[Tuple[int, np.ndarray, List[Dict[str, Any]]]] = []
    for idx, frame, det_result in zip(frame_indices, frames, det_results):
        detections = _build_detection_records(
            det_result.boxes, det_result.scores, det_result.labels, text_queries, det_result.label_names
        )

        if idx in depth_map_results:
            try:
                # Depth was precomputed above, so attach metrics from the
                # stored result instead of re-running the estimator.
                _attach_depth_from_result(detections, depth_map_results[idx], depth_scale)
            except Exception:
                logging.warning("Failed to attach depth for frame %d", idx)

        display_labels = [_build_display_label(d) for d in detections]
        processed = draw_boxes(frame, det_result.boxes, label_names=display_labels)
        outputs.append((idx, processed, detections))

    return outputs
432
+
433
def _build_display_label(det):
    """Compose the label text drawn next to a detection box.

    Appends an integer-metre depth suffix (e.g. ``"car 12m"``) when the
    detection record carries a validated depth estimate; otherwise the
    bare label is returned unchanged.
    """
    text = det["label"]
    depth = det.get("depth_est_m")
    if det.get("depth_valid") and depth is not None:
        text = f"{text} {int(depth)}m"
    return text
439
+
440
def _attach_depth_from_result(detections, depth_result, depth_scale):
    """Attach depth metrics to detection records from a precomputed depth result.

    For each detection, samples the median depth over the central half of its
    bounding box and converts it to metres as ``depth_scale / raw_depth``
    (inverse-depth convention).  Sets ``depth_est_m``, ``depth_valid`` and,
    normalised against the frame's other valid detections, ``depth_rel`` in
    [0, 1].  Mutates ``detections`` in place; returns ``None``.

    Args:
        detections: list of detection dicts, each with a ``bbox`` of at
            least ``[x1, y1, x2, y2]``.
        depth_result: object exposing a 2-D ``depth_map`` ndarray
            (``None``/empty maps are a no-op).
        depth_scale: scalar converting inverse raw depth to metres.
    """
    depth_map = depth_result.depth_map
    if depth_map is None or depth_map.size == 0:
        return

    height, width = depth_map.shape[:2]
    valid_depths = []

    for det in detections:
        # Reset first so stale values from earlier passes never leak through.
        det["depth_est_m"] = None
        det["depth_rel"] = None
        det["depth_valid"] = False

        bbox = det.get("bbox")
        if not bbox or len(bbox) < 4:
            continue

        # Clamp to image bounds and guarantee at least a 1x1 patch.
        x1, y1, x2, y2 = (int(coord) for coord in bbox[:4])
        x1 = max(0, min(width - 1, x1))
        y1 = max(0, min(height - 1, y1))
        x2 = max(x1 + 1, min(width, x2))
        y2 = max(y1 + 1, min(height, y2))

        patch = depth_map[y1:y2, x1:x2]
        if patch.size == 0:
            continue

        # Sample the central half of the box to avoid background pixels at
        # the edges; fall back to the full patch when the box is tiny.
        h_p, w_p = patch.shape
        cy, cx = h_p // 2, w_p // 2
        dy, dx = h_p // 4, w_p // 4
        center_patch = patch[cy - dy : cy + dy, cx - dx : cx + dx]
        if center_patch.size == 0:
            center_patch = patch

        finite = center_patch[np.isfinite(center_patch)]
        if finite.size == 0:
            continue

        depth_raw = float(np.median(finite))
        if depth_raw <= 1e-6:
            # Near-zero raw depth would explode the inverse; leave invalid
            # (fields were already reset above).
            continue

        # depth_raw > 1e-6 is guaranteed here, so the division cannot raise.
        depth_est = depth_scale / depth_raw
        det["depth_est_m"] = depth_est
        det["depth_valid"] = True
        valid_depths.append(depth_est)

    if not valid_depths:
        return

    # Normalise valid depths to [0, 1] within this frame.
    min_depth = float(min(valid_depths))
    max_depth = float(max(valid_depths))
    denom = max(max_depth - min_depth, 1e-6)

    for det in detections:
        if det.get("depth_valid"):
            det["depth_rel"] = (float(det["depth_est_m"]) - min_depth) / denom
497
+
498
+
499
  def infer_segmentation_frame(
500
  frame: np.ndarray,
501
  text_queries: Optional[List[str]] = None,
 
701
  # queue_in: (frame_idx, frame_data)
702
  # queue_out: (frame_idx, processed_frame, detections)
703
  queue_in = Queue(maxsize=16)
704
+ # Bound queue_out to prevent OOM
705
+ # Maxsize should be enough to keep writer busy but not explode memory
706
+ queue_out_max = max(32, (len(detectors) if detectors else 1) * 4)
707
+ queue_out = Queue(maxsize=queue_out_max)
708
 
709
  # 5. Worker Function
710
  def worker_task(gpu_idx: int):
711
  detector_instance = detectors[gpu_idx]
712
  depth_instance = depth_estimators[gpu_idx] if depth_estimators[gpu_idx] else None
713
 
714
+ batch_size = detector_instance.max_batch_size if detector_instance.supports_batch else 1
715
+ batch_accum = [] # List[Tuple[idx, frame]]
716
+
717
+ def flush_batch():
718
+ if not batch_accum: return
719
+
720
+ indices = [item[0] for item in batch_accum]
721
+ frames = [item[1] for item in batch_accum]
722
+
723
+ batch_outputs = infer_batch(
724
+ frames, indices, queries, detector_instance,
725
+ depth_estimator_instance=depth_instance,
726
+ depth_scale=depth_scale
727
+ )
728
+
729
+ for out_item in batch_outputs:
730
+ while True:
731
+ try:
732
+ queue_out.put(out_item, timeout=1.0)
733
+ break
734
+ except Full:
735
+ if job_id: _check_cancellation(job_id)
736
+
737
+ batch_accum.clear()
738
+
739
  while True:
740
  item = queue_in.get()
741
  if item is None:
742
+ flush_batch()
743
  queue_in.task_done()
744
  break
745
 
 
749
  logging.debug("Processing frame %d on device %s", frame_idx, "cpu" if num_gpus==0 else f"cuda:{gpu_idx}")
750
 
751
  try:
752
+ batch_accum.append((frame_idx, frame_data))
753
+ if len(batch_accum) >= batch_size:
754
+ flush_batch()
 
 
 
 
 
 
 
 
 
 
 
755
  except Exception as e:
756
+ logging.exception("Error processing batch around frame %d", frame_idx)
757
+ # Fail strictly or soft?
758
+ # If batch fails, we probably lost a chunk.
759
+ # Put placeholders for what we have in accum
760
+ for idx, frm in batch_accum:
761
+ while True:
762
+ try:
763
+ queue_out.put((idx, frm, []), timeout=1.0)
764
+ break
765
+ except Full:
766
+ if job_id: _check_cancellation(job_id)
767
+ batch_accum.clear()
768
 
769
  queue_in.task_done()
770
 
 
797
  # Fetch from queue
798
  try:
799
  while next_idx not in buffer:
800
+ # Backpressure: If buffer gets too big due to out-of-order frames,
801
+ # we might want to warn or just hope for the best.
802
+ # But here we are just consuming.
803
+
804
+ # However, if 'buffer' grows too large (because we are missing next_idx),
805
+ # we are effectively unbounded again if queue_out fills up with future frames.
806
+ # So we should monitor buffer size.
807
+ if len(buffer) > 64:
808
+ logging.warning("Writer buffer large (%d items), waiting for frame %d...", len(buffer), next_idx)
809
+
810
  item = queue_out.get(timeout=1.0) # wait
811
+
812
  idx, p_frame, dets = item
813
  buffer[idx] = (p_frame, dets)
814
 
 
945
 
946
  # 3. Processing
947
  queue_in = Queue(maxsize=16)
948
+ queue_out = Queue(maxsize=max(32, len(segmenters)*4))
949
 
950
  def worker_seg(gpu_idx: int):
951
  seg = segmenters[gpu_idx]
952
+ batch_size = seg.max_batch_size if seg.supports_batch else 1
953
+ batch_accum = []
954
+
955
+ def flush_batch():
956
+ if not batch_accum: return
957
+ indices = [i for i, _ in batch_accum]
958
+ frames = [f for _, f in batch_accum]
959
+
960
+ try:
961
+ # 1. Inference
962
+ if seg.supports_batch:
963
+ with seg.lock:
964
+ results = seg.predict_batch(frames, queries)
965
+ else:
966
+ with seg.lock:
967
+ results = [seg.predict(f, queries) for f in frames]
968
+
969
+ # 2. Post-process loop
970
+ for idx, frm, res in zip(indices, frames, results):
971
+ labels = queries or []
972
+ if len(labels) == 1:
973
+ masks = res.masks if res.masks is not None else []
974
+ labels = [labels[0] for _ in range(len(masks))]
975
+ processed = draw_masks(frm, res.masks, labels=labels)
976
+
977
+ while True:
978
+ try:
979
+ queue_out.put((idx, processed), timeout=1.0)
980
+ break
981
+ except Full:
982
+ if job_id: _check_cancellation(job_id)
983
+
984
+ except Exception as e:
985
+ logging.error("Batch seg failed: %s", e)
986
+ for idx, frm in batch_accum:
987
+ while True:
988
+ try:
989
+ queue_out.put((idx, frm), timeout=1.0) # Fallback
990
+ break
991
+ except Full:
992
+ if job_id: _check_cancellation(job_id)
993
+ batch_accum.clear()
994
+
995
  while True:
996
  item = queue_in.get()
997
  if item is None:
998
+ flush_batch()
999
  queue_in.task_done()
1000
  break
1001
 
1002
  idx, frame = item
1003
+ batch_accum.append(item)
1004
  if idx % 30 == 0:
1005
+ logging.debug("Seg frame %d (GPU %d)", idx, gpu_idx)
1006
+
1007
+ if len(batch_accum) >= batch_size:
1008
+ flush_batch()
1009
 
 
 
 
 
 
 
 
 
 
 
 
 
1010
  queue_in.task_done()
1011
 
1012
  workers = []
 
1028
  while next_idx < total_frames:
1029
  try:
1030
  while next_idx not in buffer:
1031
+ # Check buffer size
1032
+ if len(buffer) > 64:
1033
+ logging.warning("Writer buffer large (%d), waiting for %d", len(buffer), next_idx)
1034
+
1035
  idx, frm = queue_out.get(timeout=1.0)
1036
  buffer[idx] = frm
1037
 
 
1235
  logging.info("Starting Phase 2: Streaming...")
1236
 
1237
  queue_in = Queue(maxsize=16)
1238
+ queue_out_max = max(32, (len(estimators) if estimators else 1) * 4)
1239
+ queue_out = Queue(maxsize=queue_out_max)
1240
 
1241
  def worker_depth(gpu_idx: int):
1242
  est = estimators[gpu_idx]
1243
+ batch_size = est.max_batch_size if est.supports_batch else 1
1244
+ batch_accum = []
1245
+
1246
+ def flush_batch():
1247
+ if not batch_accum: return
1248
+ indices = [i for i, _ in batch_accum]
1249
+ frames = [f for _, f in batch_accum]
1250
+
1251
+ try:
1252
+ # 1. Inference
1253
+ if est.supports_batch:
1254
+ with est.lock:
1255
+ results = est.predict_batch(frames)
1256
+ else:
1257
+ with est.lock:
1258
+ results = [est.predict(f) for f in frames]
1259
+
1260
+ # 2. Post-process loop
1261
+ for idx, frm, res in zip(indices, frames, results):
1262
+ depth_map = res.depth_map
1263
+ colored = colorize_depth_map(depth_map, global_min, global_max)
1264
+
1265
+ # Overlay Detections
1266
+ if detections and idx < len(detections):
1267
+ frame_dets = detections[idx]
1268
+ if frame_dets:
1269
+ import cv2
1270
+ boxes = []
1271
+ labels = []
1272
+ for d in frame_dets:
1273
+ boxes.append(d.get("bbox"))
1274
+ lbl = d.get("label", "obj")
1275
+ if d.get("depth_est_m"):
1276
+ lbl = f"{lbl} {int(d['depth_est_m'])}m"
1277
+ labels.append(lbl)
1278
+ colored = draw_boxes(colored, boxes=boxes, label_names=labels)
1279
+
1280
+ while True:
1281
+ try:
1282
+ queue_out.put((idx, colored), timeout=1.0)
1283
+ break
1284
+ except Full:
1285
+ if job_id: _check_cancellation(job_id)
1286
+
1287
+ except Exception as e:
1288
+ logging.error("Batch depth failed: %s", e)
1289
+ for idx, frm in batch_accum:
1290
+ while True:
1291
+ try:
1292
+ queue_out.put((idx, frm), timeout=1.0)
1293
+ break
1294
+ except Full:
1295
+ if job_id: _check_cancellation(job_id)
1296
+ batch_accum.clear()
1297
+
1298
  while True:
1299
  item = queue_in.get()
1300
  if item is None:
1301
+ flush_batch()
1302
  queue_in.task_done()
1303
  break
1304
 
1305
  idx, frame = item
1306
+ batch_accum.append(item)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1307
 
1308
+ if idx % 30 == 0:
1309
+ logging.info("Depth frame %d (GPU %d)", idx, gpu_idx)
1310
+
1311
+ if len(batch_accum) >= batch_size:
1312
+ flush_batch()
1313
+
1314
  queue_in.task_done()
1315
 
1316
  # Workers
 
1335
  while next_idx < total_frames:
1336
  try:
1337
  while next_idx not in buffer:
1338
+ if len(buffer) > 64:
1339
+ logging.warning("Writer buffer large (%d), waiting for %d", len(buffer), next_idx)
1340
  idx, frm = queue_out.get(timeout=1.0)
1341
  buffer[idx] = frm
1342
 
models/depth_estimators/base.py CHANGED
@@ -1,4 +1,4 @@
1
- from typing import NamedTuple
2
 
3
  import numpy as np
4
 
@@ -13,6 +13,8 @@ class DepthEstimator:
13
  """Base interface for depth estimation models."""
14
 
15
  name: str
 
 
16
 
17
  def predict(self, frame: np.ndarray) -> DepthResult:
18
  """
@@ -25,3 +27,6 @@ class DepthEstimator:
25
  DepthResult with depth_map and focal_length
26
  """
27
  raise NotImplementedError
 
 
 
 
1
+ from typing import NamedTuple, Sequence, List
2
 
3
  import numpy as np
4
 
 
13
  """Base interface for depth estimation models."""
14
 
15
  name: str
16
+ supports_batch: bool = False
17
+ max_batch_size: int = 1
18
 
19
  def predict(self, frame: np.ndarray) -> DepthResult:
20
  """
 
27
  DepthResult with depth_map and focal_length
28
  """
29
  raise NotImplementedError
30
+
31
+ def predict_batch(self, frames: Sequence[np.ndarray]) -> Sequence[DepthResult]:
32
+ return [self.predict(f) for f in frames]
models/depth_estimators/depth_anything_v2.py CHANGED
@@ -1,4 +1,5 @@
1
  import logging
 
2
 
3
  import numpy as np
4
  import torch
@@ -12,6 +13,24 @@ class DepthAnythingV2Estimator(DepthEstimator):
12
  """Depth-Anything depth estimator (Transformers-compatible)."""
13
 
14
  name = "depth"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
  def __init__(self, device: str = None) -> None:
17
  logging.info("Loading Depth-Anything model from Hugging Face (transformers)...")
@@ -50,25 +69,30 @@ class DepthAnythingV2Estimator(DepthEstimator):
50
  outputs = self.model(**inputs)
51
 
52
  raw_depth = outputs.predicted_depth
53
- if raw_depth.dim() == 2:
54
- raw_depth = raw_depth.unsqueeze(0).unsqueeze(0)
55
- elif raw_depth.dim() == 3:
56
- raw_depth = raw_depth.unsqueeze(1) if raw_depth.shape[0] == 1 else raw_depth.unsqueeze(0)
57
-
58
- if raw_depth.shape[-2:] != (height, width):
59
- import torch.nn.functional as F
60
-
61
- raw_depth = F.interpolate(
62
- raw_depth,
63
- size=(height, width),
64
- mode="bilinear",
65
- align_corners=False,
66
- )
67
-
68
- depth_map = raw_depth.squeeze().cpu().numpy().astype(np.float32, copy=False)
69
  except Exception as exc:
70
  logging.error("Depth-Anything inference failed: %s", exc)
71
  h, w = frame.shape[:2]
72
  return DepthResult(depth_map=np.zeros((h, w), dtype=np.float32), focal_length=1.0)
73
 
74
  return DepthResult(depth_map=depth_map, focal_length=1.0)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import logging
2
+ from typing import Sequence
3
 
4
  import numpy as np
5
  import torch
 
13
  """Depth-Anything depth estimator (Transformers-compatible)."""
14
 
15
  name = "depth"
16
+ supports_batch = True
17
+ max_batch_size = 4
18
+
19
+ def _resize_depth(self, raw_depth, height, width):
20
+ if raw_depth.dim() == 2:
21
+ raw_depth = raw_depth.unsqueeze(0).unsqueeze(0)
22
+ elif raw_depth.dim() == 3:
23
+ raw_depth = raw_depth.unsqueeze(1) if raw_depth.shape[0] == 1 else raw_depth.unsqueeze(0)
24
+
25
+ if raw_depth.shape[-2:] != (height, width):
26
+ import torch.nn.functional as F
27
+ raw_depth = F.interpolate(
28
+ raw_depth,
29
+ size=(height, width),
30
+ mode="bilinear",
31
+ align_corners=False,
32
+ )
33
+ return raw_depth.squeeze().cpu().numpy().astype(np.float32, copy=False)
34
 
35
  def __init__(self, device: str = None) -> None:
36
  logging.info("Loading Depth-Anything model from Hugging Face (transformers)...")
 
69
  outputs = self.model(**inputs)
70
 
71
  raw_depth = outputs.predicted_depth
72
+ depth_map = self._resize_depth(raw_depth, height, width)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  except Exception as exc:
74
  logging.error("Depth-Anything inference failed: %s", exc)
75
  h, w = frame.shape[:2]
76
  return DepthResult(depth_map=np.zeros((h, w), dtype=np.float32), focal_length=1.0)
77
 
78
  return DepthResult(depth_map=depth_map, focal_length=1.0)
79
+
80
+ def predict_batch(self, frames: Sequence[np.ndarray]) -> Sequence[DepthResult]:
81
+ # Convert frames to PIL images
82
+ pil_images = [Image.fromarray(f[:, :, ::-1]) for f in frames] # BGR->RGB
83
+ sizes = [(img.height, img.width) for img in pil_images]
84
+
85
+ inputs = self.image_processor(images=pil_images, return_tensors="pt").to(self.device)
86
+
87
+ with torch.no_grad():
88
+ outputs = self.model(**inputs)
89
+
90
+ # outputs.predicted_depth is (B, H, W)
91
+ depths = outputs.predicted_depth
92
+
93
+ results = []
94
+ for i, (h, w) in enumerate(sizes):
95
+ depth_map = self._resize_depth(depths[i], h, w)
96
+ results.append(DepthResult(depth_map=depth_map, focal_length=1.0))
97
+
98
+ return results
models/detectors/base.py CHANGED
@@ -14,6 +14,12 @@ class ObjectDetector:
14
  """Detector interface to keep inference agnostic to model details."""
15
 
16
  name: str
 
 
17
 
18
  def predict(self, frame: np.ndarray, queries: Sequence[str]) -> DetectionResult:
19
  raise NotImplementedError
 
 
 
 
 
14
  """Detector interface to keep inference agnostic to model details."""
15
 
16
  name: str
17
+ supports_batch: bool = False
18
+ max_batch_size: int = 1
19
 
20
  def predict(self, frame: np.ndarray, queries: Sequence[str]) -> DetectionResult:
21
  raise NotImplementedError
22
+
23
+ def predict_batch(self, frames: Sequence[np.ndarray], queries: Sequence[str]) -> Sequence[DetectionResult]:
24
+ """Default: sequential fallback"""
25
+ return [self.predict(f, queries) for f in frames]
models/detectors/detr.py CHANGED
@@ -26,17 +26,10 @@ class DetrDetector(ObjectDetector):
26
  self.model.to(self.device)
27
  self.model.eval()
28
 
29
- def predict(self, frame: np.ndarray, queries: Sequence[str]) -> DetectionResult:
30
- inputs = self.processor(images=frame, return_tensors="pt")
31
- inputs = {key: value.to(self.device) for key, value in inputs.items()}
32
- with torch.no_grad():
33
- outputs = self.model(**inputs)
34
- target_sizes = torch.tensor([frame.shape[:2]], device=self.device)
35
- processed = self.processor.post_process_object_detection(
36
- outputs,
37
- threshold=self.score_threshold,
38
- target_sizes=target_sizes,
39
- )[0]
40
  boxes = processed["boxes"].cpu().numpy()
41
  scores = processed["scores"].cpu().tolist()
42
  labels = processed["labels"].cpu().tolist()
@@ -49,3 +42,31 @@ class DetrDetector(ObjectDetector):
49
  labels=labels,
50
  label_names=label_names,
51
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  self.model.to(self.device)
27
  self.model.eval()
28
 
29
+ supports_batch = True
30
+ max_batch_size = 4
31
+
32
+ def _parse_single_result(self, processed) -> DetectionResult:
 
 
 
 
 
 
 
33
  boxes = processed["boxes"].cpu().numpy()
34
  scores = processed["scores"].cpu().tolist()
35
  labels = processed["labels"].cpu().tolist()
 
42
  labels=labels,
43
  label_names=label_names,
44
  )
45
+
46
+ def predict(self, frame: np.ndarray, queries: Sequence[str]) -> DetectionResult:
47
+ inputs = self.processor(images=frame, return_tensors="pt")
48
+ inputs = {key: value.to(self.device) for key, value in inputs.items()}
49
+ with torch.no_grad():
50
+ outputs = self.model(**inputs)
51
+ target_sizes = torch.tensor([frame.shape[:2]], device=self.device)
52
+ processed = self.processor.post_process_object_detection(
53
+ outputs,
54
+ threshold=self.score_threshold,
55
+ target_sizes=target_sizes,
56
+ )[0]
57
+ return self._parse_single_result(processed)
58
+
59
+ def predict_batch(self, frames: Sequence[np.ndarray], queries: Sequence[str]) -> Sequence[DetectionResult]:
60
+ inputs = self.processor(images=frames, return_tensors="pt", padding=True)
61
+ inputs = {key: value.to(self.device) for key, value in inputs.items()}
62
+
63
+ with torch.no_grad():
64
+ outputs = self.model(**inputs)
65
+
66
+ target_sizes = torch.tensor([f.shape[:2] for f in frames], device=self.device)
67
+ processed_list = self.processor.post_process_object_detection(
68
+ outputs,
69
+ threshold=self.score_threshold,
70
+ target_sizes=target_sizes,
71
+ )
72
+ return [self._parse_single_result(p) for p in processed_list]
models/detectors/drone_yolo.py CHANGED
@@ -15,6 +15,8 @@ class DroneYoloDetector(ObjectDetector):
15
 
16
  REPO_ID = "rujutashashikanjoshi/yolo12-drone-detection-0205-100m"
17
  DEFAULT_WEIGHT = "best.pt"
 
 
18
 
19
  def __init__(self, score_threshold: float = 0.3, device: str = None) -> None:
20
  self.name = "drone_yolo"
@@ -42,15 +44,7 @@ class DroneYoloDetector(ObjectDetector):
42
  keep = [idx for idx, name in enumerate(label_names) if name.lower() in allowed]
43
  return keep or list(range(len(label_names)))
44
 
45
- def predict(self, frame: np.ndarray, queries: Sequence[str]) -> DetectionResult:
46
- device_arg = self.device
47
- results = self.model.predict(
48
- source=frame,
49
- device=device_arg,
50
- conf=self.score_threshold,
51
- verbose=False,
52
- )
53
- result = results[0]
54
  boxes = result.boxes
55
  if boxes is None or boxes.xyxy is None:
56
  empty = np.empty((0, 4), dtype=np.float32)
@@ -71,3 +65,22 @@ class DroneYoloDetector(ObjectDetector):
71
  labels=label_ids,
72
  label_names=label_names,
73
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
  REPO_ID = "rujutashashikanjoshi/yolo12-drone-detection-0205-100m"
17
  DEFAULT_WEIGHT = "best.pt"
18
+ supports_batch = True
19
+ max_batch_size = 8
20
 
21
  def __init__(self, score_threshold: float = 0.3, device: str = None) -> None:
22
  self.name = "drone_yolo"
 
44
  keep = [idx for idx, name in enumerate(label_names) if name.lower() in allowed]
45
  return keep or list(range(len(label_names)))
46
 
47
+ def _parse_single_result(self, result, queries: Sequence[str]) -> DetectionResult:
 
 
 
 
 
 
 
 
48
  boxes = result.boxes
49
  if boxes is None or boxes.xyxy is None:
50
  empty = np.empty((0, 4), dtype=np.float32)
 
65
  labels=label_ids,
66
  label_names=label_names,
67
  )
68
+
69
+ def predict(self, frame: np.ndarray, queries: Sequence[str]) -> DetectionResult:
70
+ device_arg = self.device
71
+ results = self.model.predict(
72
+ source=frame,
73
+ device=device_arg,
74
+ conf=self.score_threshold,
75
+ verbose=False,
76
+ )
77
+ return self._parse_single_result(results[0], queries)
78
+
79
+ def predict_batch(self, frames: Sequence[np.ndarray], queries: Sequence[str]) -> Sequence[DetectionResult]:
80
+ results = self.model.predict(
81
+ source=frames,
82
+ device=self.device,
83
+ conf=self.score_threshold,
84
+ verbose=False,
85
+ )
86
+ return [self._parse_single_result(r, queries) for r in results]
models/detectors/grounding_dino.py CHANGED
@@ -33,36 +33,35 @@ class GroundingDinoDetector(ObjectDetector):
33
  return "object."
34
  return " ".join(f"{term}." for term in filtered)
35
 
36
- def predict(self, frame: np.ndarray, queries: Sequence[str]) -> DetectionResult:
37
- prompt = self._build_prompt(queries)
38
- inputs = self.processor(images=frame, text=prompt, return_tensors="pt")
39
- inputs = {key: value.to(self.device) for key, value in inputs.items()}
40
- with torch.no_grad():
41
- outputs = self.model(**inputs)
42
- target_sizes = torch.tensor([frame.shape[:2]], device=self.device)
43
  try:
44
- processed = self.processor.post_process_grounded_object_detection(
45
  outputs,
46
- inputs["input_ids"],
47
  box_threshold=self.box_threshold,
48
  text_threshold=self.text_threshold,
49
  target_sizes=target_sizes,
50
- )[0]
51
  except TypeError:
52
  try:
53
- processed = self.processor.post_process_grounded_object_detection(
54
  outputs,
55
- inputs["input_ids"],
56
  threshold=self.box_threshold,
57
  text_threshold=self.text_threshold,
58
  target_sizes=target_sizes,
59
- )[0]
60
  except TypeError:
61
- processed = self.processor.post_process_grounded_object_detection(
62
  outputs,
63
- inputs["input_ids"],
64
  target_sizes=target_sizes,
65
- )[0]
 
 
66
  boxes = processed["boxes"].cpu().numpy()
67
  scores = processed["scores"].cpu().tolist()
68
  label_names = list(processed.get("labels") or [])
@@ -73,3 +72,26 @@ class GroundingDinoDetector(ObjectDetector):
73
  labels=label_ids,
74
  label_names=label_names,
75
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  return "object."
34
  return " ".join(f"{term}." for term in filtered)
35
 
36
+ supports_batch = True
37
+ max_batch_size = 4
38
+
39
+ def _post_process(self, outputs, input_ids, target_sizes):
 
 
 
40
  try:
41
+ return self.processor.post_process_grounded_object_detection(
42
  outputs,
43
+ input_ids,
44
  box_threshold=self.box_threshold,
45
  text_threshold=self.text_threshold,
46
  target_sizes=target_sizes,
47
+ )
48
  except TypeError:
49
  try:
50
+ return self.processor.post_process_grounded_object_detection(
51
  outputs,
52
+ input_ids,
53
  threshold=self.box_threshold,
54
  text_threshold=self.text_threshold,
55
  target_sizes=target_sizes,
56
+ )
57
  except TypeError:
58
+ return self.processor.post_process_grounded_object_detection(
59
  outputs,
60
+ input_ids,
61
  target_sizes=target_sizes,
62
+ )
63
+
64
+ def _parse_single_result(self, processed) -> DetectionResult:
65
  boxes = processed["boxes"].cpu().numpy()
66
  scores = processed["scores"].cpu().tolist()
67
  label_names = list(processed.get("labels") or [])
 
72
  labels=label_ids,
73
  label_names=label_names,
74
  )
75
+
76
def predict(self, frame: np.ndarray, queries: Sequence[str]) -> DetectionResult:
    """Detect objects matching `queries` in a single frame.

    Builds one text prompt from the queries, runs the grounded detector,
    and parses the first post-processed result.
    """
    text = self._build_prompt(queries)
    encoded = self.processor(images=frame, text=text, return_tensors="pt")
    # Move every tensor in the encoding onto the model's device.
    encoded = {name: tensor.to(self.device) for name, tensor in encoded.items()}
    with torch.no_grad():
        model_out = self.model(**encoded)
    # Single-image batch: one (H, W) target size.
    sizes = torch.tensor([frame.shape[:2]], device=self.device)
    processed = self._post_process(model_out, encoded["input_ids"], sizes)
    return self._parse_single_result(processed[0])
85
+
86
def predict_batch(self, frames: Sequence[np.ndarray], queries: Sequence[str]) -> Sequence[DetectionResult]:
    """Detect objects matching `queries` across a batch of frames.

    Every frame shares the same text prompt; results come back in input
    order, one DetectionResult per frame.

    Returns:
        [] for an empty batch (the processor cannot build a zero-image
        tensor batch), otherwise one parsed result per frame.
    """
    # Guard against an empty batch before touching the processor/model.
    if not frames:
        return []
    prompt = self._build_prompt(queries)
    # Replicate the prompt per image; padding aligns token lengths.
    inputs = self.processor(
        images=list(frames),
        text=[prompt] * len(frames),
        return_tensors="pt",
        padding=True,
    )
    inputs = {key: value.to(self.device) for key, value in inputs.items()}

    with torch.no_grad():
        outputs = self.model(**inputs)

    target_sizes = torch.tensor([f.shape[:2] for f in frames], device=self.device)
    processed_list = self._post_process(outputs, inputs["input_ids"], target_sizes)
    return [self._parse_single_result(p) for p in processed_list]
models/detectors/yolov8.py CHANGED
@@ -14,6 +14,8 @@ class HuggingFaceYoloV8Detector(ObjectDetector):
14
 
15
  REPO_ID = "Ultralytics/YOLOv8"
16
  WEIGHT_FILE = "yolov8s.pt"
 
 
17
 
18
  def __init__(self, score_threshold: float = 0.3, device: str = None) -> None:
19
  self.name = "hf_yolov8"
@@ -40,14 +42,7 @@ class HuggingFaceYoloV8Detector(ObjectDetector):
40
  keep = [idx for idx, name in enumerate(label_names) if name.lower() in allowed]
41
  return keep or list(range(len(label_names)))
42
 
43
- def predict(self, frame: np.ndarray, queries: Sequence[str]) -> DetectionResult:
44
- results = self.model.predict(
45
- source=frame,
46
- device=self.device,
47
- conf=self.score_threshold,
48
- verbose=False,
49
- )
50
- result = results[0]
51
  boxes = result.boxes
52
  if boxes is None or boxes.xyxy is None:
53
  empty = np.empty((0, 4), dtype=np.float32)
@@ -69,3 +64,21 @@ class HuggingFaceYoloV8Detector(ObjectDetector):
69
  label_names=label_names,
70
  )
71
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
  REPO_ID = "Ultralytics/YOLOv8"
16
  WEIGHT_FILE = "yolov8s.pt"
17
+ supports_batch = True
18
+ max_batch_size = 8
19
 
20
  def __init__(self, score_threshold: float = 0.3, device: str = None) -> None:
21
  self.name = "hf_yolov8"
 
42
  keep = [idx for idx, name in enumerate(label_names) if name.lower() in allowed]
43
  return keep or list(range(len(label_names)))
44
 
45
+ def _parse_single_result(self, result, queries: Sequence[str]) -> DetectionResult:
 
 
 
 
 
 
 
46
  boxes = result.boxes
47
  if boxes is None or boxes.xyxy is None:
48
  empty = np.empty((0, 4), dtype=np.float32)
 
64
  label_names=label_names,
65
  )
66
 
67
def predict(self, frame: np.ndarray, queries: Sequence[str]) -> DetectionResult:
    """Run YOLOv8 detection on one frame and parse the single result."""
    # One model call; `conf` applies the score threshold inside Ultralytics.
    raw_results = self.model.predict(
        source=frame,
        device=self.device,
        conf=self.score_threshold,
        verbose=False,
    )
    first = raw_results[0]  # single source -> exactly one result
    return self._parse_single_result(first, queries)
75
+
76
def predict_batch(self, frames: Sequence[np.ndarray], queries: Sequence[str]) -> Sequence[DetectionResult]:
    """Run YOLOv8 on a batch of frames with a single model call.

    Args:
        frames: sequence of image arrays forwarded together as the source.
        queries: labels forwarded to the per-result parser.

    Returns:
        One DetectionResult per frame in input order; [] for an empty batch.
    """
    # Ultralytics raises on an empty source list — return early instead.
    if not frames:
        return []
    results = self.model.predict(
        source=list(frames),
        device=self.device,
        conf=self.score_threshold,
        verbose=False,
    )
    return [self._parse_single_result(r, queries) for r in results]
84
+
models/segmenters/base.py CHANGED
@@ -1,4 +1,4 @@
1
- from typing import NamedTuple, Optional
2
 
3
  import numpy as np
4
 
@@ -14,6 +14,8 @@ class Segmenter:
14
  """Base interface for segmentation models."""
15
 
16
  name: str
 
 
17
 
18
  def predict(self, frame: np.ndarray, text_prompts: Optional[list] = None) -> SegmentationResult:
19
  """
@@ -27,3 +29,6 @@ class Segmenter:
27
  SegmentationResult with masks and optional metadata
28
  """
29
  raise NotImplementedError
 
 
 
 
1
+ from typing import NamedTuple, Optional, Sequence, List
2
 
3
  import numpy as np
4
 
 
14
  """Base interface for segmentation models."""
15
 
16
  name: str
17
+ supports_batch: bool = False
18
+ max_batch_size: int = 1
19
 
20
  def predict(self, frame: np.ndarray, text_prompts: Optional[list] = None) -> SegmentationResult:
21
  """
 
29
  SegmentationResult with masks and optional metadata
30
  """
31
  raise NotImplementedError
32
+
33
+ def predict_batch(self, frames: Sequence[np.ndarray], text_prompts: Optional[list] = None) -> Sequence[SegmentationResult]:
34
+ return [self.predict(f, text_prompts) for f in frames]
models/segmenters/sam3.py CHANGED
@@ -1,5 +1,5 @@
1
  import logging
2
- from typing import Optional
3
 
4
  import numpy as np
5
  import torch
@@ -55,6 +55,38 @@ class SAM3Segmenter(Segmenter):
55
 
56
  logging.info("SAM3 model loaded successfully")
57
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  def predict(self, frame: np.ndarray, text_prompts: Optional[list] = None) -> SegmentationResult:
59
  """
60
  Run SAM3 segmentation on a frame.
@@ -95,34 +127,7 @@ class SAM3Segmenter(Segmenter):
95
  mask_threshold=self.mask_threshold,
96
  target_sizes=inputs.get("original_sizes").tolist(),
97
  )[0]
98
-
99
- # Extract results
100
- masks = results.get("masks", [])
101
- scores = results.get("scores", None)
102
- boxes = results.get("boxes", None)
103
-
104
- # Convert to numpy arrays
105
- if len(masks) > 0:
106
- # Stack masks: list of (H, W) -> (N, H, W)
107
- masks_array = np.stack([m.cpu().numpy() for m in masks])
108
- else:
109
- # No objects detected
110
- masks_array = np.zeros(
111
- (0, frame.shape[0], frame.shape[1]), dtype=bool
112
- )
113
-
114
- scores_array = (
115
- scores.cpu().numpy() if scores is not None else None
116
- )
117
- boxes_array = (
118
- boxes.cpu().numpy() if boxes is not None else None
119
- )
120
-
121
- return SegmentationResult(
122
- masks=masks_array,
123
- scores=scores_array,
124
- boxes=boxes_array,
125
- )
126
 
127
  except Exception:
128
  logging.exception("SAM3 post-processing failed")
@@ -132,3 +137,38 @@ class SAM3Segmenter(Segmenter):
132
  scores=None,
133
  boxes=None,
134
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import logging
2
+ from typing import Optional, Sequence
3
 
4
  import numpy as np
5
  import torch
 
55
 
56
  logging.info("SAM3 model loaded successfully")
57
 
58
+ supports_batch = True
59
+ max_batch_size = 4
60
+
61
def _parse_single_result(self, results, frame_shape) -> SegmentationResult:
    """Convert one post-processed SAM3 result dict into a SegmentationResult.

    Args:
        results: mapping with optional "masks"/"scores"/"boxes" tensor
            entries produced by the processor's post-processing step.
        frame_shape: source frame shape, used for the (0, H, W) placeholder
            when nothing was detected.
    """
    raw_masks = results.get("masks", [])
    raw_scores = results.get("scores", None)
    raw_boxes = results.get("boxes", None)

    if len(raw_masks) > 0:
        # Stack per-instance (H, W) masks into one (N, H, W) array.
        mask_stack = np.stack([m.cpu().numpy() for m in raw_masks])
    else:
        # No detections: keep a well-typed empty boolean array.
        mask_stack = np.zeros((0, frame_shape[0], frame_shape[1]), dtype=bool)

    score_array = None if raw_scores is None else raw_scores.cpu().numpy()
    box_array = None if raw_boxes is None else raw_boxes.cpu().numpy()

    return SegmentationResult(
        masks=mask_stack,
        scores=score_array,
        boxes=box_array,
    )
89
+
90
  def predict(self, frame: np.ndarray, text_prompts: Optional[list] = None) -> SegmentationResult:
91
  """
92
  Run SAM3 segmentation on a frame.
 
127
  mask_threshold=self.mask_threshold,
128
  target_sizes=inputs.get("original_sizes").tolist(),
129
  )[0]
130
+ return self._parse_single_result(results, frame.shape)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
 
132
  except Exception:
133
  logging.exception("SAM3 post-processing failed")
 
137
  scores=None,
138
  boxes=None,
139
  )
140
+
141
def predict_batch(self, frames: Sequence[np.ndarray], text_prompts: Optional[list] = None) -> Sequence[SegmentationResult]:
    """Run SAM3 text-prompted segmentation on a batch of frames.

    All frames share the same prompt list; returns one SegmentationResult
    per frame in input order. Mirrors the single-frame path's behavior of
    degrading to empty masks (never raising) on post-processing failure.

    Returns:
        [] for an empty batch; otherwise one result per frame.
    """
    # Guard: nothing to encode for an empty batch; avoid a zero-sized
    # processor/model call.
    if not frames:
        return []

    pil_images = []
    for f in frames:
        if f.dtype == np.uint8:
            pil_images.append(Image.fromarray(f))
        else:
            # NOTE(review): assumes non-uint8 frames are floats in [0, 1]
            # — confirm against the frame producers before relying on it.
            pil_images.append(Image.fromarray((f * 255).astype(np.uint8)))

    prompts = text_prompts or ["object"]

    # The same prompt list is replicated for every image in the batch.
    inputs = self.processor(
        images=pil_images,
        text=[prompts] * len(frames),
        return_tensors="pt",
    ).to(self.device)

    with torch.no_grad():
        outputs = self.model(**inputs)

    try:
        results_list = self.processor.post_process_instance_segmentation(
            outputs,
            threshold=self.threshold,
            mask_threshold=self.mask_threshold,
            target_sizes=inputs.get("original_sizes").tolist(),
        )
        return [
            self._parse_single_result(r, f.shape)
            for r, f in zip(results_list, frames)
        ]
    except Exception:
        # Best-effort: log and return empty masks per frame, matching the
        # single-frame predict() failure behavior.
        logging.exception("SAM3 batch post-processing failed")
        return [
            SegmentationResult(
                masks=np.zeros((0, f.shape[0], f.shape[1]), dtype=bool),
                scores=None,
                boxes=None,
            )
            for f in frames
        ]