Spaces:

BiasLab2025
/

perception

Paused

Zhen Ye committed 28 days ago

Commit

45eb65b

1 Parent(s): b2e7d79

feat(inference): enable full multi-GPU support for all models

- Update inference.py to parallelize detection, segmentation, and depth estimation across all available GPUs
- Update detectors (YOLOv8, DETR, GroundingDINO, DroneYolo) to accept device argument
- Update SAM3 and DepthAnythingV2 to accept device argument
- Add device-specific model loading to all model loaders
- Remove OwlV2 support

Files changed (12) hide show

app.py +5 -0
inference.py +292 -46
jobs/background.py +2 -0
jobs/models.py +1 -0
models/depth_estimators/depth_anything_v2.py +5 -2
models/depth_estimators/model_loader.py +7 -2
models/detectors/detr.py +5 -2
models/detectors/drone_yolo.py +5 -2
models/detectors/grounding_dino.py +5 -2
models/detectors/yolov8.py +5 -2
models/model_loader.py +7 -2
models/segmenters/model_loader.py +10 -2

app.py CHANGED Viewed

@@ -228,6 +228,8 @@ async def detect_endpoint(
             output_path,
             query_list,
             detector_name=detector_name,
         )
     except ValueError as exc:
         logging.exception("Video processing failed.")
@@ -261,6 +263,7 @@ async def detect_async_endpoint(
     detector: str = Form("hf_yolov8"),
     segmenter: str = Form("sam3"),
     depth_estimator: str = Form("depth"),
 ):
     if mode not in VALID_MODES:
         raise HTTPException(
@@ -313,6 +316,7 @@ async def detect_async_endpoint(
             detector_name=detector_name,
             segmenter_name=segmenter,
             depth_estimator_name=depth_estimator,
         )
         cv2.imwrite(str(first_frame_path), processed_frame)
     except Exception:
@@ -332,6 +336,7 @@ async def detect_async_endpoint(
         first_frame_path=str(first_frame_path),
         first_frame_detections=detections,
         depth_estimator_name=depth_estimator,
         depth_output_path=str(depth_output_path),
         first_frame_depth_path=str(first_frame_depth_path),
     )

             output_path,
             query_list,
             detector_name=detector_name,
+            depth_estimator_name="depth",  # Synch endpoint default
+            depth_scale=1.0,
         )
     except ValueError as exc:
         logging.exception("Video processing failed.")
     detector: str = Form("hf_yolov8"),
     segmenter: str = Form("sam3"),
     depth_estimator: str = Form("depth"),
+    depth_scale: float = Form(1.0),
 ):
     if mode not in VALID_MODES:
         raise HTTPException(
             detector_name=detector_name,
             segmenter_name=segmenter,
             depth_estimator_name=depth_estimator,
+            depth_scale=depth_scale,
         )
         cv2.imwrite(str(first_frame_path), processed_frame)
     except Exception:
         first_frame_path=str(first_frame_path),
         first_frame_detections=detections,
         depth_estimator_name=depth_estimator,
+        depth_scale=float(depth_scale),
         depth_output_path=str(depth_output_path),
         first_frame_depth_path=str(first_frame_depth_path),
     )

inference.py CHANGED Viewed

@@ -5,8 +5,13 @@ from typing import Any, Dict, List, Optional, Sequence, Tuple
 import cv2
 import numpy as np
-from models.model_loader import load_detector
-from models.segmenters.model_loader import load_segmenter
 from utils.video import extract_frames, write_video
@@ -186,14 +191,25 @@ def _attach_depth_metrics(
     detections: List[Dict[str, Any]],
     depth_estimator_name: Optional[str],
     depth_scale: float,
 ) -> None:
-    if not detections or not depth_estimator_name:
         return
     from models.depth_estimators.model_loader import load_depth_estimator
-    estimator = load_depth_estimator(depth_estimator_name)
-    lock = _get_model_lock("depth", estimator.name)
     with lock:
         depth_result = estimator.predict(frame)
@@ -246,25 +262,56 @@ def infer_frame(
     frame: np.ndarray,
     queries: Sequence[str],
     detector_name: Optional[str] = None,
 ) -> tuple[np.ndarray, List[Dict[str, Any]]]:
-    detector = load_detector(detector_name)
     text_queries = list(queries) or ["object"]
     try:
-        lock = _get_model_lock("detector", detector.name)
         with lock:
             result = detector.predict(frame, text_queries)
             detections = _build_detection_records(
                 result.boxes, result.scores, result.labels, text_queries, result.label_names
             )
     except Exception:
         logging.exception("Inference failed for queries %s", text_queries)
         raise
     return draw_boxes(
         frame,
         result.boxes,
-        labels=result.labels,
-        queries=text_queries,
-        label_names=result.label_names,
     ), detections
@@ -272,9 +319,19 @@ def infer_segmentation_frame(
     frame: np.ndarray,
     text_queries: Optional[List[str]] = None,
     segmenter_name: Optional[str] = None,
 ) -> tuple[np.ndarray, Any]:
-    segmenter = load_segmenter(segmenter_name)
-    lock = _get_model_lock("segmenter", segmenter.name)
     with lock:
         result = segmenter.predict(frame, text_prompts=text_queries)
     labels = text_queries or []
@@ -335,6 +392,8 @@ def run_inference(
     max_frames: Optional[int] = None,
     detector_name: Optional[str] = None,
     job_id: Optional[str] = None,
 ) -> str:
     """
     Run object detection inference on a video.
@@ -346,9 +405,8 @@ def run_inference(
         max_frames: Optional frame limit for testing
         detector_name: Detector to use (default: hf_yolov8)
         job_id: Optional job ID for cancellation support
-    Returns:
-        Path to processed output video
     """
     try:
         frames, fps, width, height = extract_frames(input_video_path)
@@ -367,17 +425,102 @@ def run_inference(
     active_detector = detector_name or "hf_yolov8"
     logging.info("Using detector: %s", active_detector)
     # Process frames
-    processed_frames: List[np.ndarray] = []
-    for idx, frame in enumerate(frames):
-        # Check for cancellation every frame
-        _check_cancellation(job_id)
-        if max_frames is not None and idx >= max_frames:
-            break
-        logging.debug("Processing frame %d", idx)
-        processed_frame, _ = infer_frame(frame, queries, detector_name=active_detector)
-        processed_frames.append(processed_frame)
     # Write output video
     write_video(processed_frames, output_video_path, fps=fps, width=width, height=height)
@@ -403,16 +546,64 @@ def run_segmentation(
     active_segmenter = segmenter_name or "sam3"
     logging.info("Using segmenter: %s with queries: %s", active_segmenter, queries)
-    processed_frames: List[np.ndarray] = []
-    for idx, frame in enumerate(frames):
-        # Check for cancellation every frame
-        _check_cancellation(job_id)
-        if max_frames is not None and idx >= max_frames:
-            break
-        logging.debug("Processing frame %d", idx)
-        processed_frame, _ = infer_segmentation_frame(frame, text_queries=queries, segmenter_name=active_segmenter)
-        processed_frames.append(processed_frame)
     write_video(processed_frames, output_video_path, fps=fps, width=width, height=height)
     logging.info("Segmented video written to: %s", output_video_path)
@@ -490,25 +681,80 @@ def process_frames_depth(
     Returns:
         List of depth visualization frames (HxWx3 RGB uint8)
     """
-    from models.depth_estimators.model_loader import load_depth_estimator
-    estimator = load_depth_estimator(depth_estimator_name)
     # First pass: Compute all depth maps and find global range
-    depth_maps = []
     all_values = []
-    for idx, frame in enumerate(frames):
-        _check_cancellation(job_id)
-        lock = _get_model_lock("depth", estimator.name)
-        with lock:
-            depth_result = estimator.predict(frame)
-        depth_maps.append(depth_result.depth_map)
-        all_values.append(depth_result.depth_map.ravel())
-        if idx % 10 == 0:
-            logging.debug("Computed depth for frame %d/%d", idx + 1, len(frames))
     # Compute global min/max (using percentiles to handle outliers)
     all_depths = np.concatenate(all_values).astype(np.float32, copy=False)

 import cv2
 import numpy as np
+import torch
+from concurrent.futures import ThreadPoolExecutor
+from threading import RLock
+from models.detectors.base import ObjectDetector
+from models.model_loader import load_detector, load_detector_on_device
+from models.segmenters.model_loader import load_segmenter, load_segmenter_on_device
+from models.depth_estimators.model_loader import load_depth_estimator_on_device
 from utils.video import extract_frames, write_video
     detections: List[Dict[str, Any]],
     depth_estimator_name: Optional[str],
     depth_scale: float,
+    estimator_instance: Optional[Any] = None,
 ) -> None:
+    if not detections or (not depth_estimator_name and not estimator_instance):
         return
     from models.depth_estimators.model_loader import load_depth_estimator
+    if estimator_instance:
+        estimator = estimator_instance
+        # Use instance lock if available, or create one
+        if hasattr(estimator, "lock"):
+            lock = estimator.lock
+        else:
+            # Fallback (shouldn't happen with our new setup but safe)
+            lock = _get_model_lock("depth", estimator.name)
+    else:
+        estimator = load_depth_estimator(depth_estimator_name)
+        lock = _get_model_lock("depth", estimator.name)
     with lock:
         depth_result = estimator.predict(frame)
     frame: np.ndarray,
     queries: Sequence[str],
     detector_name: Optional[str] = None,
+    depth_estimator_name: Optional[str] = None,
+    depth_scale: float = 1.0,
+    detector_instance: Optional[ObjectDetector] = None,
+    depth_estimator_instance: Optional[Any] = None,
 ) -> tuple[np.ndarray, List[Dict[str, Any]]]:
+    if detector_instance:
+        detector = detector_instance
+    else:
+        detector = load_detector(detector_name)
     text_queries = list(queries) or ["object"]
     try:
+        if hasattr(detector, "lock"):
+            lock = detector.lock
+        else:
+            lock = _get_model_lock("detector", detector.name)
         with lock:
             result = detector.predict(frame, text_queries)
             detections = _build_detection_records(
                 result.boxes, result.scores, result.labels, text_queries, result.label_names
             )
+        if depth_estimator_name or depth_estimator_instance:
+            try:
+                _attach_depth_metrics(
+                    frame, detections, depth_estimator_name, depth_scale, estimator_instance=depth_estimator_instance
+                )
+            except Exception:
+                logging.exception("Depth estimation failed for frame")
+        # Rebuild display labels to include depth if available
+        display_labels = []
+        for i, det in enumerate(detections):
+            label = det["label"]
+            if det.get("depth_valid") and det.get("depth_est_m") is not None:
+                # Add depth to label, e.g. "car 12m"
+                depth_str = f"{int(det['depth_est_m'])}m"
+                label = f"{label} {depth_str}"
+            display_labels.append(label)
     except Exception:
         logging.exception("Inference failed for queries %s", text_queries)
         raise
     return draw_boxes(
         frame,
         result.boxes,
+        labels=None, # Use custom labels
+        queries=None,
+        label_names=display_labels,
     ), detections
     frame: np.ndarray,
     text_queries: Optional[List[str]] = None,
     segmenter_name: Optional[str] = None,
+    segmenter_instance: Optional[Any] = None,
 ) -> tuple[np.ndarray, Any]:
+    if segmenter_instance:
+        segmenter = segmenter_instance
+        # Use instance lock if available
+        if hasattr(segmenter, "lock"):
+            lock = segmenter.lock
+        else:
+            lock = _get_model_lock("segmenter", segmenter.name)
+    else:
+        segmenter = load_segmenter(segmenter_name)
+        lock = _get_model_lock("segmenter", segmenter.name)
     with lock:
         result = segmenter.predict(frame, text_prompts=text_queries)
     labels = text_queries or []
     max_frames: Optional[int] = None,
     detector_name: Optional[str] = None,
     job_id: Optional[str] = None,
+    depth_estimator_name: Optional[str] = None,
+    depth_scale: float = 1.0,
 ) -> str:
     """
     Run object detection inference on a video.
         max_frames: Optional frame limit for testing
         detector_name: Detector to use (default: hf_yolov8)
         job_id: Optional job ID for cancellation support
+        depth_estimator_name: Optional depth estimator name
+        depth_scale: Scale factor for depth estimation
     """
     try:
         frames, fps, width, height = extract_frames(input_video_path)
     active_detector = detector_name or "hf_yolov8"
     logging.info("Using detector: %s", active_detector)
+    # Detect GPUs
+    num_gpus = torch.cuda.device_count()
+    detectors = None
+    depth_estimators = None
+    if num_gpus > 1:
+        logging.info("Detected %d GPUs. Enabling Multi-GPU inference.", num_gpus)
+        # Initialize one detector per GPU
+        detectors = []
+        depth_estimators = []
+        for i in range(num_gpus):
+            device_str = f"cuda:{i}"
+            logging.info("Loading detector/depth on %s", device_str)
+            # Detector
+            det = load_detector_on_device(active_detector, device_str)
+            det.lock = RLock()
+            detectors.append(det)
+            # Depth (if requested)
+            if depth_estimator_name:
+                depth = load_depth_estimator_on_device(depth_estimator_name, device_str)
+                depth.lock = RLock()
+                depth_estimators.append(depth)
+            else:
+                depth_estimators.append(None)
+    else:
+        logging.info("Single device detected. Using standard inference.")
+        detectors = None
+    processed_frames_map = {}
     # Process frames
+    if detectors:
+        # Multi-GPU Parallel Processing
+        def process_frame_task(frame_idx: int, frame_data: np.ndarray) -> tuple[int, np.ndarray]:
+            # Determine which GPU to use based on frame index (round-robin)
+            gpu_idx = frame_idx % len(detectors)
+            detector_instance = detectors[gpu_idx]
+            depth_instance = depth_estimators[gpu_idx] if depth_estimators else None
+            # Run depth estimation every 3 frames if configured
+            active_depth_name = depth_estimator_name if (frame_idx % 3 == 0) else None
+            active_depth_instance = depth_instance if (frame_idx % 3 == 0) else None
+            processed, _ = infer_frame(
+                frame_data,
+                queries,
+                detector_name=None, # Use instance
+                depth_estimator_name=active_depth_name,
+                depth_scale=depth_scale,
+                detector_instance=detector_instance,
+                depth_estimator_instance=active_depth_instance
+            )
+            return frame_idx, processed
+        # Thread pool with more workers than GPUs to keep them fed
+        max_workers = min(len(detectors) * 2, 8)
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+            futures = []
+            for idx, frame in enumerate(frames):
+                _check_cancellation(job_id)
+                if max_frames is not None and idx >= max_frames:
+                    break
+                futures.append(executor.submit(process_frame_task, idx, frame))
+            for future in futures:
+                idx, result_frame = future.result() # Wait for completion (in order or not, but we verify order)
+                processed_frames_map[idx] = result_frame
+        # Reassemble in order
+        processed_frames = [processed_frames_map[i] for i in range(len(processed_frames_map))]
+    else:
+        # Standard Single-Threaded Loop
+        processed_frames = []
+        for idx, frame in enumerate(frames):
+            # Check for cancellation every frame
+            _check_cancellation(job_id)
+            if max_frames is not None and idx >= max_frames:
+                break
+            logging.debug("Processing frame %d", idx)
+            # Run depth estimation every 3 frames if configured
+            active_depth = depth_estimator_name if (idx % 3 == 0) else None
+            processed_frame, _ = infer_frame(
+                frame,
+                queries,
+                detector_name=active_detector,
+                depth_estimator_name=active_depth,
+                depth_scale=depth_scale
+            )
+            processed_frames.append(processed_frame)
     # Write output video
     write_video(processed_frames, output_video_path, fps=fps, width=width, height=height)
     active_segmenter = segmenter_name or "sam3"
     logging.info("Using segmenter: %s with queries: %s", active_segmenter, queries)
+    # Detect GPUs
+    num_gpus = torch.cuda.device_count()
+    segmenters = None
+    if num_gpus > 1:
+        logging.info("Detected %d GPUs. Enabling Multi-GPU segmentation.", num_gpus)
+        segmenters = []
+        for i in range(num_gpus):
+            device_str = f"cuda:{i}"
+            logging.info("Loading segmenter on %s", device_str)
+            seg = load_segmenter_on_device(active_segmenter, device_str)
+            seg.lock = RLock()
+            segmenters.append(seg)
+    else:
+        logging.info("Single device detected. Using standard segmentation.")
+        segmenters = None
+    processed_frames_map = {}
+    if segmenters:
+         # Multi-GPU Parallel Processing
+        def process_segmentation_task(frame_idx: int, frame_data: np.ndarray) -> tuple[int, np.ndarray]:
+            gpu_idx = frame_idx % len(segmenters)
+            segmenter_instance = segmenters[gpu_idx]
+            processed, _ = infer_segmentation_frame(
+                frame_data,
+                text_queries=queries,
+                segmenter_name=None,
+                segmenter_instance=segmenter_instance
+            )
+            return frame_idx, processed
+        max_workers = min(len(segmenters) * 2, 8)
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+            futures = []
+            for idx, frame in enumerate(frames):
+                _check_cancellation(job_id)
+                if max_frames is not None and idx >= max_frames:
+                    break
+                futures.append(executor.submit(process_segmentation_task, idx, frame))
+            for future in futures:
+                idx, result_frame = future.result()
+                processed_frames_map[idx] = result_frame
+        processed_frames = [processed_frames_map[i] for i in range(len(processed_frames_map))]
+    else:
+        processed_frames: List[np.ndarray] = []
+        for idx, frame in enumerate(frames):
+            # Check for cancellation every frame
+            _check_cancellation(job_id)
+            if max_frames is not None and idx >= max_frames:
+                break
+            logging.debug("Processing frame %d", idx)
+            processed_frame, _ = infer_segmentation_frame(frame, text_queries=queries, segmenter_name=active_segmenter)
+            processed_frames.append(processed_frame)
     write_video(processed_frames, output_video_path, fps=fps, width=width, height=height)
     logging.info("Segmented video written to: %s", output_video_path)
     Returns:
         List of depth visualization frames (HxWx3 RGB uint8)
     """
+    from models.depth_estimators.model_loader import load_depth_estimator, load_depth_estimator_on_device
+    # Detect GPUs
+    num_gpus = torch.cuda.device_count()
+    estimators = None
+    if num_gpus > 1:
+        logging.info("Detected %d GPUs. Enabling Multi-GPU depth estimation.", num_gpus)
+        estimators = []
+        for i in range(num_gpus):
+            device_str = f"cuda:{i}"
+            logging.info("Loading depth estimator on %s", device_str)
+            est = load_depth_estimator_on_device(depth_estimator_name, device_str)
+            est.lock = RLock()
+            estimators.append(est)
+    else:
+        logging.info("Single device detected. Using standard depth estimation.")
+        estimators = None
+        # Fallback to single estimator
+        single_estimator = load_depth_estimator(depth_estimator_name)
     # First pass: Compute all depth maps and find global range
+    depth_maps_map = {}
     all_values = []
+    if estimators:
+        # Multi-GPU Parallel Processing
+        def compute_depth_task(frame_idx: int, frame_data: np.ndarray) -> tuple[int, Any]:
+            gpu_idx = frame_idx % len(estimators)
+            estimator_instance = estimators[gpu_idx]
+            # Use instance lock
+            if hasattr(estimator_instance, "lock"):
+                lock = estimator_instance.lock
+            else:
+                 # Should have been assigned above
+                lock = RLock()
+            with lock:
+                result = estimator_instance.predict(frame_data)
+            return frame_idx, result
+        max_workers = min(len(estimators) * 2, 8)
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+            futures = []
+            for idx, frame in enumerate(frames):
+                _check_cancellation(job_id)
+                futures.append(executor.submit(compute_depth_task, idx, frame))
+            for future in futures:
+                idx, res = future.result()
+                depth_maps_map[idx] = res.depth_map
+                # Values for the global min/max are not collected here;
+                # they are gathered from the reassembled depth maps below.
+        # Reassemble
+        depth_maps = [depth_maps_map[i] for i in range(len(depth_maps_map))]
+        all_values = [dm.ravel() for dm in depth_maps]
+    else:
+        # Single threaded
+        estimator = single_estimator
+        depth_maps = []
+        for idx, frame in enumerate(frames):
+            _check_cancellation(job_id)
+            lock = _get_model_lock("depth", estimator.name)
+            with lock:
+                depth_result = estimator.predict(frame)
+            depth_maps.append(depth_result.depth_map)
+            all_values.append(depth_result.depth_map.ravel())
+            if idx % 10 == 0:
+                logging.debug("Computed depth for frame %d/%d", idx + 1, len(frames))
     # Compute global min/max (using percentiles to handle outliers)
     all_depths = np.concatenate(all_values).astype(np.float32, copy=False)

jobs/background.py CHANGED Viewed

@@ -41,6 +41,8 @@ async def process_video_async(job_id: str) -> None:
                 None,
                 job.detector_name,
                 job_id,
             )
         # Try to run depth estimation

                 None,
                 job.detector_name,
                 job_id,
+                job.depth_estimator_name,
+                job.depth_scale,
             )
         # Try to run depth estimation

jobs/models.py CHANGED Viewed

@@ -28,6 +28,7 @@ class JobInfo:
     first_frame_detections: List[Dict[str, Any]] = field(default_factory=list)
     # Depth estimation fields
     depth_estimator_name: str = "depth"
     depth_output_path: Optional[str] = None
     first_frame_depth_path: Optional[str] = None
     partial_success: bool = False  # True if one component failed but job completed

     first_frame_detections: List[Dict[str, Any]] = field(default_factory=list)
     # Depth estimation fields
     depth_estimator_name: str = "depth"
+    depth_scale: float = 1.0
     depth_output_path: Optional[str] = None
     first_frame_depth_path: Optional[str] = None
     partial_success: bool = False  # True if one component failed but job completed

models/depth_estimators/depth_anything_v2.py CHANGED Viewed

@@ -13,10 +13,13 @@ class DepthAnythingV2Estimator(DepthEstimator):
     name = "depth"
-    def __init__(self) -> None:
         logging.info("Loading Depth-Anything model from Hugging Face (transformers)...")
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         model_id = "LiheYoung/depth-anything-large-hf"
         self.image_processor = AutoImageProcessor.from_pretrained(model_id)

     name = "depth"
+    def __init__(self, device: str = None) -> None:
         logging.info("Loading Depth-Anything model from Hugging Face (transformers)...")
+        if device:
+             self.device = torch.device(device)
+        else:
+             self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         model_id = "LiheYoung/depth-anything-large-hf"
         self.image_processor = AutoImageProcessor.from_pretrained(model_id)

models/depth_estimators/model_loader.py CHANGED Viewed

@@ -27,7 +27,7 @@ def _get_cached_depth_estimator(name: str) -> DepthEstimator:
     return _create_depth_estimator(name)
-def _create_depth_estimator(name: str) -> DepthEstimator:
     """
     Create depth estimator instance.
@@ -46,7 +46,7 @@ def _create_depth_estimator(name: str) -> DepthEstimator:
         )
     estimator_class = _REGISTRY[name]
-    return estimator_class()
 def load_depth_estimator(name: str = "depth") -> DepthEstimator:
@@ -62,6 +62,11 @@ def load_depth_estimator(name: str = "depth") -> DepthEstimator:
     return _get_cached_depth_estimator(name)
 def list_depth_estimators() -> list[str]:
     """Return list of available depth estimator names."""
     return list(_REGISTRY.keys())

     return _create_depth_estimator(name)
+def _create_depth_estimator(name: str, **kwargs) -> DepthEstimator:
     """
     Create depth estimator instance.
         )
     estimator_class = _REGISTRY[name]
+    return estimator_class(**kwargs)
 def load_depth_estimator(name: str = "depth") -> DepthEstimator:
     return _get_cached_depth_estimator(name)
+def load_depth_estimator_on_device(name: str, device: str) -> DepthEstimator:
+    """Create a new depth estimator instance on the specified device (no caching)."""
+    return _create_depth_estimator(name, device=device)
 def list_depth_estimators() -> list[str]:
     """Return list of available depth estimator names."""
     return list(_REGISTRY.keys())

models/detectors/detr.py CHANGED Viewed

@@ -13,10 +13,13 @@ class DetrDetector(ObjectDetector):
     MODEL_NAME = "facebook/detr-resnet-50"
-    def __init__(self, score_threshold: float = 0.3) -> None:
         self.name = "detr_resnet50"
         self.score_threshold = score_threshold
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         logging.info("Loading %s onto %s", self.MODEL_NAME, self.device)
         self.processor = DetrImageProcessor.from_pretrained(self.MODEL_NAME)
         self.model = DetrForObjectDetection.from_pretrained(self.MODEL_NAME)

     MODEL_NAME = "facebook/detr-resnet-50"
+    def __init__(self, score_threshold: float = 0.3, device: str = None) -> None:
         self.name = "detr_resnet50"
         self.score_threshold = score_threshold
+        if device:
+             self.device = torch.device(device)
+        else:
+             self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         logging.info("Loading %s onto %s", self.MODEL_NAME, self.device)
         self.processor = DetrImageProcessor.from_pretrained(self.MODEL_NAME)
         self.model = DetrForObjectDetection.from_pretrained(self.MODEL_NAME)

models/detectors/drone_yolo.py CHANGED Viewed

@@ -16,10 +16,13 @@ class DroneYoloDetector(ObjectDetector):
     REPO_ID = "rujutashashikanjoshi/yolo12-drone-detection-0205-100m"
     DEFAULT_WEIGHT = "best.pt"
-    def __init__(self, score_threshold: float = 0.3) -> None:
         self.name = "drone_yolo"
         self.score_threshold = score_threshold
-        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
         weight_file = os.getenv("DRONE_YOLO_WEIGHT", self.DEFAULT_WEIGHT)
         logging.info(
             "Loading drone YOLO weights %s/%s onto %s",

     REPO_ID = "rujutashashikanjoshi/yolo12-drone-detection-0205-100m"
     DEFAULT_WEIGHT = "best.pt"
+    def __init__(self, score_threshold: float = 0.3, device: str = None) -> None:
         self.name = "drone_yolo"
         self.score_threshold = score_threshold
+        if device:
+            self.device = device
+        else:
+            self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
         weight_file = os.getenv("DRONE_YOLO_WEIGHT", self.DEFAULT_WEIGHT)
         logging.info(
             "Loading drone YOLO weights %s/%s onto %s",

models/detectors/grounding_dino.py CHANGED Viewed

@@ -13,11 +13,14 @@ class GroundingDinoDetector(ObjectDetector):
     MODEL_NAME = "IDEA-Research/grounding-dino-base"
-    def __init__(self, box_threshold: float = 0.35, text_threshold: float = 0.25) -> None:
         self.name = "grounding_dino"
         self.box_threshold = box_threshold
         self.text_threshold = text_threshold
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         logging.info("Loading %s onto %s", self.MODEL_NAME, self.device)
         self.processor = GroundingDinoProcessor.from_pretrained(self.MODEL_NAME)
         self.model = GroundingDinoForObjectDetection.from_pretrained(self.MODEL_NAME)

     MODEL_NAME = "IDEA-Research/grounding-dino-base"
+    def __init__(self, box_threshold: float = 0.35, text_threshold: float = 0.25, device: str = None) -> None:
         self.name = "grounding_dino"
         self.box_threshold = box_threshold
         self.text_threshold = text_threshold
+        if device:
+            self.device = torch.device(device)
+        else:
+            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         logging.info("Loading %s onto %s", self.MODEL_NAME, self.device)
         self.processor = GroundingDinoProcessor.from_pretrained(self.MODEL_NAME)
         self.model = GroundingDinoForObjectDetection.from_pretrained(self.MODEL_NAME)

models/detectors/yolov8.py CHANGED Viewed

@@ -15,10 +15,13 @@ class HuggingFaceYoloV8Detector(ObjectDetector):
     REPO_ID = "Ultralytics/YOLOv8"
     WEIGHT_FILE = "yolov8s.pt"
-    def __init__(self, score_threshold: float = 0.3) -> None:
         self.name = "hf_yolov8"
         self.score_threshold = score_threshold
-        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
         logging.info(
             "Loading Hugging Face YOLOv8 weights %s/%s onto %s",
             self.REPO_ID,

     REPO_ID = "Ultralytics/YOLOv8"
     WEIGHT_FILE = "yolov8s.pt"
+    def __init__(self, score_threshold: float = 0.3, device: str = None) -> None:
         self.name = "hf_yolov8"
         self.score_threshold = score_threshold
+        if device:
+            self.device = device
+        else:
+            self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
         logging.info(
             "Loading Hugging Face YOLOv8 weights %s/%s onto %s",
             self.REPO_ID,

models/model_loader.py CHANGED Viewed

@@ -18,13 +18,13 @@ _REGISTRY: Dict[str, Callable[[], ObjectDetector]] = {
 }
-def _create_detector(name: str) -> ObjectDetector:
     try:
         factory = _REGISTRY[name]
     except KeyError as exc:
         available = ", ".join(sorted(_REGISTRY))
         raise ValueError(f"Unknown detector '{name}'. Available: {available}") from exc
-    return factory()
 @lru_cache(maxsize=None)
@@ -38,6 +38,11 @@ def load_detector(name: Optional[str] = None) -> ObjectDetector:
     return _get_cached_detector(detector_name)
 # Backwards compatibility for existing callers.
 def load_model():
     return load_detector()

 }
+def _create_detector(name: str, **kwargs) -> ObjectDetector:
     try:
         factory = _REGISTRY[name]
     except KeyError as exc:
         available = ", ".join(sorted(_REGISTRY))
         raise ValueError(f"Unknown detector '{name}'. Available: {available}") from exc
+    return factory(**kwargs)
 @lru_cache(maxsize=None)
     return _get_cached_detector(detector_name)
+def load_detector_on_device(name: str, device: str) -> ObjectDetector:
+    """Create a new detector instance on the specified device (no caching)."""
+    return _create_detector(name, device=device)
 # Backwards compatibility for existing callers.
 def load_model():
     return load_detector()

models/segmenters/model_loader.py CHANGED Viewed

@@ -12,7 +12,7 @@ _REGISTRY: Dict[str, Callable[[], Segmenter]] = {
 }
-def _create_segmenter(name: str) -> Segmenter:
     """Create a new segmenter instance."""
     try:
         factory = _REGISTRY[name]
@@ -21,7 +21,7 @@ def _create_segmenter(name: str) -> Segmenter:
         raise ValueError(
             f"Unknown segmenter '{name}'. Available: {available}"
         ) from exc
-    return factory()
 @lru_cache(maxsize=None)
@@ -42,3 +42,11 @@ def load_segmenter(name: Optional[str] = None) -> Segmenter:
     """
     segmenter_name = name or os.getenv("SEGMENTER", DEFAULT_SEGMENTER)
     return _get_cached_segmenter(segmenter_name)

 }
+def _create_segmenter(name: str, **kwargs) -> Segmenter:
     """Create a new segmenter instance."""
     try:
         factory = _REGISTRY[name]
         raise ValueError(
             f"Unknown segmenter '{name}'. Available: {available}"
         ) from exc
+    return factory(**kwargs)
 @lru_cache(maxsize=None)
     """
     segmenter_name = name or os.getenv("SEGMENTER", DEFAULT_SEGMENTER)
     return _get_cached_segmenter(segmenter_name)
+def load_segmenter_on_device(name: str, device: str) -> Segmenter:
+    """Create a new segmenter instance on the specified device (no caching)."""
+    # Bypass the cache by calling the private creator directly.
+    # Note: _create_segmenter forwards **kwargs to the registered factory,
+    # so the factory must accept a `device` keyword argument.
+    return _create_segmenter(name, device=device)