Zhen Ye committed on
Commit · 7f8fcb7
Parent(s): 43ec7b4
update:inference pipeline optimization
Browse files
- inference.py +542 -404
- utils/video.py +77 -0
inference.py
CHANGED
@@ -1,6 +1,8 @@
| 1 | import logging
| 2 | import os
| 3 | -
| 4 | from typing import Any, Dict, List, Optional, Sequence, Tuple
| 5 |
| 6 | import cv2

@@ -12,7 +14,7 @@ from models.detectors.base import ObjectDetector
| 12 | from models.model_loader import load_detector, load_detector_on_device
| 13 | from models.segmenters.model_loader import load_segmenter, load_segmenter_on_device
| 14 | from models.depth_estimators.model_loader import load_depth_estimator, load_depth_estimator_on_device
| 15 | - from utils.video import extract_frames, write_video
| 16 |
| 17 |
| 18 | def _check_cancellation(job_id: Optional[str]) -> None:

@@ -27,7 +29,7 @@ def _check_cancellation(job_id: Optional[str]) -> None:
| 27 | raise RuntimeError("Job cancelled by user")
| 28 |
| 29 |
| 30 | - def _color_for_label(label: str) ->
| 31 | # Deterministic BGR color from label text.
| 32 | value = abs(hash(label)) % 0xFFFFFF
| 33 | blue = value & 0xFF

@@ -277,7 +279,7 @@ def infer_frame(
| 277 | depth_scale: float = 1.0,
| 278 | detector_instance: Optional[ObjectDetector] = None,
| 279 | depth_estimator_instance: Optional[Any] = None,
| 280 | - ) ->
| 281 | if detector_instance:
| 282 | detector = detector_instance
| 283 | else:

@@ -332,7 +334,7 @@ def infer_segmentation_frame(
| 332 | text_queries: Optional[List[str]] = None,
| 333 | segmenter_name: Optional[str] = None,
| 334 | segmenter_instance: Optional[Any] = None,
| 335 | - ) ->
| 336 | if segmenter_instance:
| 337 | segmenter = segmenter_instance
| 338 | # Use instance lock if available
|
@@ -406,175 +408,219 @@ def run_inference(
| 406 | job_id: Optional[str] = None,
| 407 | depth_estimator_name: Optional[str] = None,
| 408 | depth_scale: float = 1.0,
| 409 | - ) ->
| 410-412 | -
| 413 | - Args:
| 414 | - input_video_path: Path to input video
| 415 | - output_video_path: Path to write processed video
| 416 | - queries: List of object classes to detect (e.g., ["person", "car"])
| 417 | - max_frames: Optional frame limit for testing
| 418 | - detector_name: Detector to use (default: hf_yolov8)
| 419 | - job_id: Optional job ID for cancellation support
| 420 | - depth_estimator_name: Optional depth estimator name
| 421 | - depth_scale: Scale factor for depth estimation
| 422 | - """
| 423 | try:
| 424 | -
| 425 | - except ValueError
| 426 | - logging.exception("Failed to
| 427 | raise
| 428 |
| 429 | -
| 430 | if not queries:
| 431 | queries = ["person", "car", "truck", "motorcycle", "bicycle", "bus", "train", "airplane"]
| 432 | logging.info("No queries provided, using defaults: %s", queries)
| 433 | -
| 434 | logging.info("Detection queries: %s", queries)
| 435 | -
| 436 | - # Select detector
| 437 | active_detector = detector_name or "hf_yolov8"
| 438-441 | -
| 442 | if "CUDA_VISIBLE_DEVICES" in os.environ:
| 443 | -
| 444 | - del os.environ["CUDA_VISIBLE_DEVICES"]
| 445 |
| 446 | - num_gpus
| 447-466 | -
| 467 | - device_str = f"cuda:{i}"
| 468 | - logging.info("Loading detector/depth on %s", device_str)
| 469 |
| 470 | - #
| 471 | -
| 472 | - det
| 473-474 | -
| 475 | - # Depth (if requested)
| 476 | - if depth_estimator_name:
| 477 | - depth = load_depth_estimator_on_device(depth_estimator_name, device_str)
| 478 | - depth.lock = RLock()
| 479 | depth_estimators.append(depth)
| 480 | - else:
| 481 | - depth_estimators.append(None)
| 482 | -
| 483 | else:
| 484 | - logging.info("
| 485 | -
| 486 |
| 487-488 | -
| 489 |
| 490 | - #
| 491-497 | -
| 498 |
| 499 | if frame_idx % 30 == 0:
| 500 | - logging.info("Processing frame %d on
| 501 | -
| 502 | - # Run depth estimation every 3 frames if configured
| 503 | - active_depth_name = depth_estimator_name if (frame_idx % 3 == 0) else None
| 504 | - active_depth_instance = depth_instance if (frame_idx % 3 == 0) else None
| 505 | -
| 506 | - processed, frame_dets = infer_frame(
| 507 | - frame_data,
| 508 | - queries,
| 509 | - detector_name=None,  # Use instance
| 510 | - depth_estimator_name=active_depth_name,
| 511 | - depth_scale=depth_scale,
| 512 | - detector_instance=detector_instance,
| 513 | - depth_estimator_instance=active_depth_instance
| 514 | - )
| 515 | - return frame_idx, processed, frame_dets
| 516 |
| 517-525 | -
| 526 |
| 527-540 | -
| 541 |
| 542-550 | -
| 551 | _check_cancellation(job_id)
| 552 | -
| 553 | - if max_frames is not None and idx >= max_frames:
| 554 | break
| 555 | - logging.debug("Processing frame %d", idx)
| 556 |
| 557-558 | -
| 559 | - active_depth_instance = depth_estimator_instance if (idx % 3 == 0) else None
| 560 |
| 561-571 | -
| 572 |
| 573 | - #
| 574-575 | -
| 576 |
| 577 | - return output_video_path, all_detections
| 578 |
| 579 |
| 580 | def run_segmentation(
|
@@ -585,83 +631,139 @@ def run_segmentation(
| 585 | segmenter_name: Optional[str] = None,
| 586 | job_id: Optional[str] = None,
| 587 | ) -> str:
| 588 | try:
| 589 | -
| 590 | - except ValueError
| 591 | - logging.exception("Failed to
| 592 | raise
| 593 |
| 594 | active_segmenter = segmenter_name or "sam3"
| 595 | logging.info("Using segmenter: %s with queries: %s", active_segmenter, queries)
| 596 |
| 597 | - #
| 598 | num_gpus = torch.cuda.device_count()
| 599 | - segmenters =
| 600-605 | -
| 606 | seg = load_segmenter_on_device(active_segmenter, device_str)
| 607 | seg.lock = RLock()
| 608 | -
| 609 | else:
| 610-611 | -
| 612 |
| 613-619 | -
| 620 |
| 621 | -
| 622 | - logging.info("Segmenting frame %d on GPU %d (cuda:%d)", frame_idx, gpu_idx, gpu_idx)
| 623 |
| 624-625 | -
| 626 | - text_queries=queries,
| 627 | - segmenter_name=None,
| 628 | - segmenter_instance=segmenter_instance
| 629 | - )
| 630 | - return frame_idx, processed
| 631 |
| 632-639 | -
| 640 |
| 641 | -
| 642 | - idx, result_frame = future.result()
| 643 | - processed_frames_map[idx] = result_frame
| 644 | -
| 645 | - processed_frames = [processed_frames_map[i] for i in range(len(processed_frames_map))]
| 646 |
| 647-653 | -
| 654 | break
| 655-659 | -
| 660 | logging.info("Segmented video written to: %s", output_video_path)
| 661 | -
| 662 | return output_video_path
| 663 |
| 664 |
| 665 | def run_depth_inference(
| 666 | input_video_path: str,
| 667 | output_video_path: str,
|
@@ -671,227 +773,263 @@ def run_depth_inference(
| 671 | first_frame_depth_path: Optional[str] = None,
| 672 | job_id: Optional[str] = None,
| 673 | ) -> str:
| 674 | -
| 675 | - Run depth estimation on a video.
| 676 | -
| 677 | - Args:
| 678 | - input_video_path: Path to input video
| 679 | - output_video_path: Path to write depth visualization video
| 680 | - max_frames: Optional frame limit for testing
| 681 | - depth_estimator_name: Depth estimator to use (default: depth)
| 682 | - first_frame_depth_path: Optional path to save the first depth visualization frame
| 683 | - job_id: Optional job ID for cancellation support
| 684 | -
| 685 | - Returns:
| 686 | - Path to depth visualization video
| 687 | - """
| 688 | try:
| 689 | -
| 690 | - except ValueError
| 691 | - logging.exception("Failed to
| 692 | raise
| 693 |
| 694-696 | -
| 697 | if max_frames is not None:
| 698-700 | -
| 701 | - processed_frames = process_frames_depth(frames, depth_estimator_name, detections=detections, job_id=job_id)
| 702 | -
| 703 | - # Write output video
| 704 | - write_video(processed_frames, output_video_path, fps=fps, width=width, height=height)
| 705 | - logging.info("Depth video written to: %s", output_video_path)
| 706 | -
| 707 | - if first_frame_depth_path and processed_frames:
| 708 | - import cv2
| 709 | -
| 710 | - if not cv2.imwrite(first_frame_depth_path, processed_frames[0]):
| 711 | - logging.warning("Failed to write first frame depth image to: %s", first_frame_depth_path)
| 712 | -
| 713 | - return output_video_path
| 714 | -
| 715 | -
| 716 | - def process_frames_depth(
| 717 | - frames: List[np.ndarray],
| 718 | - depth_estimator_name: str,
| 719 | - detections: Optional[List[List[Dict[str, Any]]]] = None,
| 720 | - job_id: Optional[str] = None,
| 721 | - ) -> List[np.ndarray]:
| 722 | - """
| 723 | - Process all frames through depth estimator with stable normalization.
| 724 | -
| 725 | - Two-pass approach:
| 726 | - 1. Compute depth for all frames and find global min/max
| 727 | - 2. Colorize using global range to avoid flicker
| 728 | -
| 729 | - Args:
| 730 | - frames: List of frames (HxWx3 BGR uint8)
| 731 | - depth_estimator_name: Name of depth estimator to use
| 732 | - job_id: Optional job ID for cancellation
| 733 | -
| 734 | - Returns:
| 735 | - List of depth visualization frames (HxWx3 RGB uint8)
| 736 | - """
| 737 | - from models.depth_estimators.model_loader import load_depth_estimator, load_depth_estimator_on_device
| 738 |
| 739 | - #
| 740 | num_gpus = torch.cuda.device_count()
| 741 | - estimators =
| 742-747 | -
| 748 | est = load_depth_estimator_on_device(depth_estimator_name, device_str)
| 749 | est.lock = RLock()
| 750 | -
| 751 | else:
| 752-756 | -
| 757 | - #
| 758-765 | -
| 766 |
| 767-768 | -
| 769 |
| 770 | - #
| 771 | - if
| 772-775 | -
| 776 |
| 777-786 | -
| 787 |
| 788-792 | -
| 793 |
| 794-797 | -
| 798 | - else:
| 799 | - # Single threaded
| 800 | - estimator = single_estimator
| 801 | - depth_maps = []
| 802 | - for idx, frame in enumerate(frames):
| 803 | - _check_cancellation(job_id)
| 804 | -
| 805 | - lock = _get_model_lock("depth", estimator.name)
| 806 | - with lock:
| 807 | - depth_result = estimator.predict(frame)
| 808 | -
| 809 | - depth_maps.append(depth_result.depth_map)
| 810 | - all_values.append(depth_result.depth_map.ravel())
| 811 | -
| 812 | - if idx % 10 == 0:
| 813 | - logging.debug("Computed depth for frame %d/%d", idx + 1, len(frames))
| 814 | -
| 815 | - # Compute global min/max (using percentiles to handle outliers)
| 816 | - all_depths = np.concatenate(all_values).astype(np.float32, copy=False)
| 817 | -
| 818 | - # Filter out NaN and inf values
| 819 | - valid_depths = all_depths[np.isfinite(all_depths)]
| 820 | -
| 821 | - if len(valid_depths) == 0:
| 822 | - logging.warning("All depth values are NaN/inf - using fallback range")
| 823 | - global_min = 0.0
| 824 | - global_max = 1.0
| 825 | - else:
| 826 | - valid_depths = valid_depths.astype(np.float64, copy=False)
| 827 | - global_min = float(np.percentile(valid_depths, 1))  # 1st percentile to clip outliers
| 828 | - global_max = float(np.percentile(valid_depths, 99))  # 99th percentile
| 829 | -
| 830 | - if not np.isfinite(global_min) or not np.isfinite(global_max):
| 831 | - logging.warning("Depth percentiles are non-finite - using min/max fallback")
| 832 | - global_min = float(valid_depths.min())
| 833 | - global_max = float(valid_depths.max())
| 834 | -
| 835 | - # Handle edge case where min == max
| 836 | - if abs(global_max - global_min) < 1e-6:
| 837 | - global_min = float(valid_depths.min())
| 838 | - global_max = float(valid_depths.max())
| 839 | - if abs(global_max - global_min) < 1e-6:
| 840 | - global_max = global_min + 1.0
| 841 | -
| 842 | - logging.info(
| 843 | - "Depth range: %.2f - %.2f meters (1st-99th percentile)",
| 844 | - global_min,
| 845 | - global_max,
| 846 | - )
| 847 | -
| 848 | - # Second pass: Apply colormap and overlay detections
| 849 | - visualization_frames = []
| 850 |
| 851-855 | -
| 856 | - _check_cancellation(job_id)
| 857 | -
| 858 | - # Norm: (val - min) / (max - min) -> 0..1
| 859 | - # Clip to ensure range
| 860 | - norm_map = np.clip(depth_map, global_min, global_max)
| 861 | - norm_map = (norm_map - global_min) / (global_max - global_min + 1e-6)
| 862 |
| 863-864 | -
| 865 |
| 866-868 | -
| 869 |
| 870-871 | -
| 872 | - frame_dets = detections[i]
| 873 | - # Convert list of dicts to format for draw_boxes
| 874 | - if frame_dets:
| 875 | - boxes = []
| 876 | - labels = []
| 877 | - display_labels = []
| 878 | -
| 879 | - for d in frame_dets:
| 880 | - boxes.append(d.get("bbox"))
| 881 | - # Create label "Class Dist"
| 882 | - lbl = d.get("label", "obj")
| 883 | - # If we have depth info that was calculated in inference:
| 884 | - if d.get("depth_est_m"):
| 885 | - lbl = f"{lbl} {int(d['depth_est_m'])}m"
| 886 | -
| 887 | - labels.append(lbl)  # used for color
| 888 | - display_labels.append(lbl)
| 889 | -
| 890 | - heatmap = draw_boxes(heatmap, boxes, label_names=display_labels)
| 891 |
| 892 | -
| 893 |
| 894 | - return visualization_frames
| 895 |
| 896 |
| 897 | def colorize_depth_map(
|
| 1 |
import logging
|
| 2 |
import os
|
| 3 |
+
import time
|
| 4 |
+
from threading import RLock, Thread
|
| 5 |
+
from queue import Queue, PriorityQueue
|
| 6 |
from typing import Any, Dict, List, Optional, Sequence, Tuple
|
| 7 |
|
| 8 |
import cv2
|
|
|
|
| 14 |
from models.model_loader import load_detector, load_detector_on_device
|
| 15 |
from models.segmenters.model_loader import load_segmenter, load_segmenter_on_device
|
| 16 |
from models.depth_estimators.model_loader import load_depth_estimator, load_depth_estimator_on_device
|
| 17 |
+
from utils.video import extract_frames, write_video, VideoReader, VideoWriter
|
| 18 |
|
| 19 |
|
| 20 |
def _check_cancellation(job_id: Optional[str]) -> None:
|
|
|
|
| 29 |
raise RuntimeError("Job cancelled by user")
|
| 30 |
|
| 31 |
|
| 32 |
+
def _color_for_label(label: str) -> Tuple[int, int, int]:
|
| 33 |
# Deterministic BGR color from label text.
|
| 34 |
value = abs(hash(label)) % 0xFFFFFF
|
| 35 |
blue = value & 0xFF
|
|
|
|
| 279 |
depth_scale: float = 1.0,
|
| 280 |
detector_instance: Optional[ObjectDetector] = None,
|
| 281 |
depth_estimator_instance: Optional[Any] = None,
|
| 282 |
+
) -> Tuple[np.ndarray, List[Dict[str, Any]]]:
|
| 283 |
if detector_instance:
|
| 284 |
detector = detector_instance
|
| 285 |
else:
|
|
|
|
| 334 |
text_queries: Optional[List[str]] = None,
|
| 335 |
segmenter_name: Optional[str] = None,
|
| 336 |
segmenter_instance: Optional[Any] = None,
|
| 337 |
+
) -> Tuple[np.ndarray, Any]:
|
| 338 |
if segmenter_instance:
|
| 339 |
segmenter = segmenter_instance
|
| 340 |
# Use instance lock if available
|
|
|
|
| 408 |
job_id: Optional[str] = None,
|
| 409 |
depth_estimator_name: Optional[str] = None,
|
| 410 |
depth_scale: float = 1.0,
|
| 411 |
+
) -> Tuple[str, List[List[Dict[str, Any]]]]:
|
| 412 |
+
|
| 413 |
+
# 1. Setup Video Reader
|
|
|
| 414 |
try:
|
| 415 |
+
reader = VideoReader(input_video_path)
|
| 416 |
+
except ValueError:
|
| 417 |
+
logging.exception("Failed to open video at %s", input_video_path)
|
| 418 |
raise
|
| 419 |
|
| 420 |
+
fps = reader.fps
|
| 421 |
+
width = reader.width
|
| 422 |
+
height = reader.height
|
| 423 |
+
total_frames = reader.total_frames
|
| 424 |
+
|
| 425 |
+
if max_frames is not None:
|
| 426 |
+
total_frames = min(total_frames, max_frames)
|
| 427 |
+
|
| 428 |
+
# 2. Defaults and Config
|
| 429 |
if not queries:
|
| 430 |
queries = ["person", "car", "truck", "motorcycle", "bicycle", "bus", "train", "airplane"]
|
| 431 |
logging.info("No queries provided, using defaults: %s", queries)
|
| 432 |
+
|
| 433 |
logging.info("Detection queries: %s", queries)
|
|
|
|
|
|
|
| 434 |
active_detector = detector_name or "hf_yolov8"
|
| 435 |
+
|
| 436 |
+
# 3. Parallel Model Loading
|
| 437 |
+
num_gpus = torch.cuda.device_count()
|
| 438 |
+
detectors = []
|
| 439 |
+
depth_estimators = []
|
| 440 |
+
|
| 441 |
+
# Clear CUDA_VISIBLE_DEVICES to ensure we see all GPUs if not already handled
|
| 442 |
if "CUDA_VISIBLE_DEVICES" in os.environ:
|
| 443 |
+
del os.environ["CUDA_VISIBLE_DEVICES"]
|
|
|
|
| 444 |
|
| 445 |
+
if num_gpus > 0:
|
| 446 |
+
logging.info("Detected %d GPUs. Loading models in parallel...", num_gpus)
|
| 447 |
+
|
| 448 |
+
def load_models_on_gpu(gpu_id: int):
|
| 449 |
+
device_str = f"cuda:{gpu_id}"
|
| 450 |
+
try:
|
| 451 |
+
det = load_detector_on_device(active_detector, device_str)
|
| 452 |
+
det.lock = RLock()
|
| 453 |
+
|
| 454 |
+
depth = None
|
| 455 |
+
if depth_estimator_name:
|
| 456 |
+
depth = load_depth_estimator_on_device(depth_estimator_name, device_str)
|
| 457 |
+
depth.lock = RLock()
|
| 458 |
+
return (gpu_id, det, depth)
|
| 459 |
+
except Exception as e:
|
| 460 |
+
logging.error(f"Failed to load models on GPU {gpu_id}: {e}")
|
| 461 |
+
raise
|
| 462 |
+
|
| 463 |
+
with ThreadPoolExecutor(max_workers=num_gpus) as loader_pool:
|
| 464 |
+
futures = [loader_pool.submit(load_models_on_gpu, i) for i in range(num_gpus)]
|
| 465 |
+
results = [f.result() for f in futures]
|
|
|
|
|
|
|
| 466 |
|
| 467 |
+
# Sort by GPU ID to ensure consistent indexing
|
| 468 |
+
results.sort(key=lambda x: x[0])
|
| 469 |
+
for _, det, depth in results:
|
| 470 |
+
detectors.append(det)
|
|
|
| 471 |
depth_estimators.append(depth)
|
|
|
| 472 |
else:
|
| 473 |
+
logging.info("No GPUs detected. Loading CPU models...")
|
| 474 |
+
det = load_detector(active_detector)
|
| 475 |
+
det.lock = RLock()
|
| 476 |
+
detectors.append(det)
|
| 477 |
+
if depth_estimator_name:
|
| 478 |
+
depth = load_depth_estimator(depth_estimator_name)
|
| 479 |
+
depth.lock = RLock()
|
| 480 |
+
depth_estimators.append(depth)
|
| 481 |
+
else:
|
| 482 |
+
depth_estimators.append(None)
|
| 483 |
|
| 484 |
+
# 4. Processing Queues
|
| 485 |
+
# queue_in: (frame_idx, frame_data)
|
| 486 |
+
# queue_out: (frame_idx, processed_frame, detections)
|
| 487 |
+
queue_in = Queue(maxsize=16)
|
| 488 |
+
queue_out = Queue() # Unbounded, consumed by writer
|
| 489 |
|
| 490 |
+
# 5. Worker Function
|
| 491 |
+
def worker_task(gpu_idx: int):
|
| 492 |
+
detector_instance = detectors[gpu_idx]
|
| 493 |
+
depth_instance = depth_estimators[gpu_idx] if depth_estimators[gpu_idx] else None
|
| 494 |
+
|
| 495 |
+
while True:
|
| 496 |
+
item = queue_in.get()
|
| 497 |
+
if item is None:
|
| 498 |
+
queue_in.task_done()
|
| 499 |
+
break
|
| 500 |
+
|
| 501 |
+
frame_idx, frame_data = item
|
| 502 |
|
| 503 |
if frame_idx % 30 == 0:
|
| 504 |
+
logging.info("Processing frame %d on device %s", frame_idx, "cpu" if num_gpus==0 else f"cuda:{gpu_idx}")
|
|
|
| 505 |
|
| 506 |
+
try:
|
| 507 |
+
# Depth strategy: Run every 3 frames
|
| 508 |
+
active_depth_name = depth_estimator_name if (frame_idx % 3 == 0) else None
|
| 509 |
+
active_depth_instance = depth_instance if (frame_idx % 3 == 0) else None
|
| 510 |
+
|
| 511 |
+
processed, frame_dets = infer_frame(
|
| 512 |
+
frame_data,
|
| 513 |
+
queries,
|
| 514 |
+
detector_name=None,
|
| 515 |
+
depth_estimator_name=active_depth_name,
|
| 516 |
+
depth_scale=depth_scale,
|
| 517 |
+
detector_instance=detector_instance,
|
| 518 |
+
depth_estimator_instance=active_depth_instance
|
| 519 |
+
)
|
| 520 |
+
queue_out.put((frame_idx, processed, frame_dets))
|
| 521 |
+
except Exception as e:
|
| 522 |
+
logging.exception("Error processing frame %d", frame_idx)
|
| 523 |
+
# Put placeholders to avoid hanging writer
|
| 524 |
+
queue_out.put((frame_idx, frame_data, []))
|
| 525 |
|
| 526 |
+
queue_in.task_done()
|
| 527 |
+
|
| 528 |
+
# 6. Start Workers
|
| 529 |
+
workers = []
|
| 530 |
+
num_workers = len(detectors)
|
| 531 |
+
# If using CPU, maybe use more threads? No, CPU models usually multithread internally.
|
| 532 |
+
# If using GPU, 1 thread per GPU is efficient.
|
| 533 |
+
for i in range(num_workers):
|
| 534 |
+
t = Thread(target=worker_task, args=(i,), daemon=True)
|
| 535 |
+
t.start()
|
| 536 |
+
workers.append(t)
|
| 537 |
+
|
| 538 |
+
# 7. Start Writer / Output Collection (Main Thread or separate)
|
| 539 |
+
# We will run writer logic in the main thread after feeding is done?
|
| 540 |
+
# No, we must write continuously.
|
| 541 |
+
|
| 542 |
+
all_detections_map = {}
|
| 543 |
+
|
| 544 |
+
writer_finished = False
|
| 545 |
+
|
| 546 |
+
def writer_loop():
|
| 547 |
+
nonlocal writer_finished
|
| 548 |
+
next_idx = 0
|
| 549 |
+
buffer = {}
|
| 550 |
|
| 551 |
+
try:
|
| 552 |
+
with VideoWriter(output_video_path, fps, width, height) as writer:
|
| 553 |
+
while next_idx < total_frames:
|
| 554 |
+
# Fetch from queue
|
| 555 |
+
try:
|
| 556 |
+
while next_idx not in buffer:
|
| 557 |
+
item = queue_out.get(timeout=1.0) # wait
|
| 558 |
+
idx, p_frame, dets = item
|
| 559 |
+
buffer[idx] = (p_frame, dets)
|
| 560 |
+
|
| 561 |
+
# Write next_idx
|
| 562 |
+
p_frame, dets = buffer.pop(next_idx)
|
| 563 |
+
writer.write(p_frame)
|
| 564 |
+
all_detections_map[next_idx] = dets
|
| 565 |
+
next_idx += 1
|
| 566 |
+
|
| 567 |
+
if next_idx % 30 == 0:
|
| 568 |
+
logging.debug("Wrote frame %d/%d", next_idx, total_frames)
|
| 569 |
+
|
| 570 |
+
except Exception as e:
|
| 571 |
+
# Check cancellation or timeout
|
| 572 |
+
if job_id and _check_cancellation(job_id): # This raises
|
| 573 |
+
pass
|
| 574 |
+
if not any(w.is_alive() for w in workers) and queue_out.empty():
|
| 575 |
+
# Workers dead, queue empty, but not finished? prevent infinite loop
|
| 576 |
+
logging.error("Workers stopped unexpectedly.")
|
| 577 |
+
break
|
| 578 |
+
continue
|
| 579 |
+
except Exception as e:
|
| 580 |
+
logging.exception("Writer loop failed")
|
| 581 |
+
finally:
|
| 582 |
+
writer_finished = True
|
| 583 |
+
|
| 584 |
+
writer_thread = Thread(target=writer_loop, daemon=True)
|
| 585 |
+
writer_thread.start()
|
| 586 |
+
|
| 587 |
+
# 8. Feed Frames (Main Thread)
|
| 588 |
+
try:
|
| 589 |
+
frames_fed = 0
|
| 590 |
+
for i, frame in enumerate(reader):
|
| 591 |
_check_cancellation(job_id)
|
| 592 |
+
if max_frames is not None and i >= max_frames:
|
|
|
|
| 593 |
break
|
|
|
|
| 594 |
|
| 595 |
+
queue_in.put((i, frame)) # Blocks if full
|
| 596 |
+
frames_fed += 1
|
|
|
|
| 597 |
|
| 598 |
+
# Signal workers to stop
|
| 599 |
+
for _ in range(num_workers):
|
| 600 |
+
queue_in.put(None)
|
| 601 |
+
|
| 602 |
+
# Wait for queue to process
|
| 603 |
+
queue_in.join()
|
| 604 |
+
|
| 605 |
+
except Exception as e:
|
| 606 |
+
logging.exception("Feeding frames failed")
|
| 607 |
+
raise
|
| 608 |
+
finally:
|
| 609 |
+
reader.close()
|
| 610 |
+
|
| 611 |
+
# Wait for writer
|
| 612 |
+
writer_thread.join()
|
| 613 |
|
| 614 |
+
# Sort detections
|
| 615 |
+
sorted_detections = []
|
| 616 |
+
# If we crashed early, we return what we have
|
| 617 |
+
max_key = max(all_detections_map.keys()) if all_detections_map else -1
|
| 618 |
+
for i in range(max_key + 1):
|
| 619 |
+
sorted_detections.append(all_detections_map.get(i, []))
|
| 620 |
+
|
| 621 |
+
logging.info("Inference complete. Output: %s", output_video_path)
|
| 622 |
+
return output_video_path, sorted_detections
|
| 623 |
|
|
|
|
| 624 |
|
| 625 |
|
| 626 |
def run_segmentation(
|
|
|
|
| 631 |
segmenter_name: Optional[str] = None,
|
| 632 |
job_id: Optional[str] = None,
|
| 633 |
) -> str:
|
| 634 |
+
# 1. Setup Reader
|
| 635 |
try:
|
| 636 |
+
reader = VideoReader(input_video_path)
|
| 637 |
+
except ValueError:
|
| 638 |
+
logging.exception("Failed to open video at %s", input_video_path)
|
| 639 |
raise
|
| 640 |
|
| 641 |
+
fps = reader.fps
|
| 642 |
+
width = reader.width
|
| 643 |
+
height = reader.height
|
| 644 |
+
total_frames = reader.total_frames
|
| 645 |
+
|
| 646 |
+
if max_frames is not None:
|
| 647 |
+
total_frames = min(total_frames, max_frames)
|
| 648 |
+
|
| 649 |
active_segmenter = segmenter_name or "sam3"
|
| 650 |
logging.info("Using segmenter: %s with queries: %s", active_segmenter, queries)
|
| 651 |
|
| 652 |
+
# 2. Load Segmenters (Parallel)
|
| 653 |
num_gpus = torch.cuda.device_count()
|
| 654 |
+
segmenters = []
|
| 655 |
+
|
| 656 |
+
if "CUDA_VISIBLE_DEVICES" in os.environ:
|
| 657 |
+
del os.environ["CUDA_VISIBLE_DEVICES"]
|
| 658 |
+
|
| 659 |
+
if num_gpus > 0:
|
| 660 |
+
logging.info("Detected %d GPUs. Loading segmenters...", num_gpus)
|
| 661 |
+
def load_seg(gpu_id: int):
|
| 662 |
+
device_str = f"cuda:{gpu_id}"
|
| 663 |
seg = load_segmenter_on_device(active_segmenter, device_str)
|
| 664 |
seg.lock = RLock()
|
| 665 |
+
return (gpu_id, seg)
|
| 666 |
+
|
| 667 |
+
with ThreadPoolExecutor(max_workers=num_gpus) as loader:
|
| 668 |
+
futures = [loader.submit(load_seg, i) for i in range(num_gpus)]
|
| 669 |
+
results = [f.result() for f in futures]
|
| 670 |
+
results.sort(key=lambda x: x[0])
|
| 671 |
+
segmenters = [r[1] for r in results]
|
| 672 |
else:
|
| 673 |
+
seg = load_segmenter(active_segmenter)
|
| 674 |
+
seg.lock = RLock()
|
| 675 |
+
segmenters.append(seg)
|
| 676 |
|
| 677 |
+
# 3. Processing
|
| 678 |
+
queue_in = Queue(maxsize=16)
|
| 679 |
+
queue_out = Queue()
|
| 680 |
+
|
| 681 |
+
def worker_seg(gpu_idx: int):
|
| 682 |
+
seg = segmenters[gpu_idx]
|
| 683 |
+
while True:
|
| 684 |
+
item = queue_in.get()
|
| 685 |
+
if item is None:
|
| 686 |
+
queue_in.task_done()
|
| 687 |
+
break
|
| 688 |
|
| 689 |
+
idx, frame = item
|
|
|
|
| 690 |
|
| 691 |
+
if idx % 30 == 0:
|
| 692 |
+
logging.info("Segmenting frame %d (GPU %d)", idx, gpu_idx)
|
|
|
| 693 |
|
| 694 |
+
try:
|
| 695 |
+
processed, _ = infer_segmentation_frame(
|
| 696 |
+
frame,
|
| 697 |
+
text_queries=queries,
|
| 698 |
+
segmenter_name=None,
|
| 699 |
+
segmenter_instance=seg
|
| 700 |
+
)
|
| 701 |
+
queue_out.put((idx, processed))
|
| 702 |
+
except Exception as e:
|
| 703 |
+
logging.error("Segmentation failed frame %d: %s", idx, e)
|
| 704 |
+
queue_out.put((idx, frame))
|
| 705 |
|
| 706 |
+
queue_in.task_done()
|
|
|
| 707 |
|
| 708 |
+
workers = []
|
| 709 |
+
for i in range(len(segmenters)):
|
| 710 |
+
t = Thread(target=worker_seg, args=(i,), daemon=True)
|
| 711 |
+
t.start()
|
| 712 |
+
workers.append(t)
|
| 713 |
|
| 714 |
+
# Writer
|
| 715 |
+
writer_finished = False
|
| 716 |
+
|
| 717 |
+
def writer_loop():
|
| 718 |
+
nonlocal writer_finished
|
| 719 |
+
next_idx = 0
|
| 720 |
+
buffer = {}
|
| 721 |
+
|
| 722 |
+
try:
|
| 723 |
+
with VideoWriter(output_video_path, fps, width, height) as writer:
|
| 724 |
+
while next_idx < total_frames:
|
| 725 |
+
try:
|
| 726 |
+
while next_idx not in buffer:
|
| 727 |
+
idx, frm = queue_out.get(timeout=1.0)
|
| 728 |
+
buffer[idx] = frm
|
| 729 |
+
|
| 730 |
+
frm = buffer.pop(next_idx)
|
| 731 |
+
writer.write(frm)
|
| 732 |
+
next_idx += 1
|
| 733 |
+
except Exception:
|
| 734 |
+
if job_id and _check_cancellation(job_id): pass
|
| 735 |
+
if not any(w.is_alive() for w in workers) and queue_out.empty():
|
| 736 |
+
break
|
| 737 |
+
continue
|
| 738 |
+
finally:
|
| 739 |
+
writer_finished = True
|
| 740 |
+
|
| 741 |
+
w_thread = Thread(target=writer_loop, daemon=True)
|
| 742 |
+
w_thread.start()
|
| 743 |
+
|
| 744 |
+
# Feeder
|
| 745 |
+
try:
|
| 746 |
+
reader = VideoReader(input_video_path)
|
| 747 |
+
for i, frame in enumerate(reader):
|
| 748 |
+
_check_cancellation(job_id)
|
| 749 |
+
if max_frames is not None and i >= max_frames:
|
| 750 |
break
|
| 751 |
+
queue_in.put((i, frame))
|
| 752 |
+
|
| 753 |
+
for _ in workers:
|
| 754 |
+
queue_in.put(None)
|
| 755 |
+
queue_in.join()
|
| 756 |
+
|
| 757 |
+
finally:
|
| 758 |
+
reader.close()
|
| 759 |
+
|
| 760 |
+
w_thread.join()
|
| 761 |
+
|
| 762 |
logging.info("Segmented video written to: %s", output_video_path)
|
|
|
|
| 763 |
return output_video_path
|
| 764 |
|
| 765 |
|
| 766 |
+
|
| 767 |
def run_depth_inference(
|
| 768 |
input_video_path: str,
|
| 769 |
output_video_path: str,
|
|
|
|
| 773 |
first_frame_depth_path: Optional[str] = None,
|
| 774 |
job_id: Optional[str] = None,
|
| 775 |
) -> str:
|
| 776 |
+
# 1. Setup Reader
|
|
|
| 777 |
try:
|
| 778 |
+
reader = VideoReader(input_video_path)
|
| 779 |
+
except ValueError:
|
| 780 |
+
logging.exception("Failed to open video at %s", input_video_path)
|
| 781 |
raise
|
| 782 |
|
| 783 |
+
fps = reader.fps
|
| 784 |
+
width = reader.width
|
| 785 |
+
height = reader.height
|
| 786 |
+
total_frames = reader.total_frames
|
| 787 |
+
|
| 788 |
if max_frames is not None:
|
| 789 |
+
total_frames = min(total_frames, max_frames)
|
| 790 |
+
|
| 791 |
+
logging.info("Using depth estimator: %s", depth_estimator_name)
|
|
|
| 792 |
|
| 793 |
+
# 2. Load Estimators (Parallel)
|
| 794 |
num_gpus = torch.cuda.device_count()
|
| 795 |
+
estimators = []
|
| 796 |
+
|
| 797 |
+
if "CUDA_VISIBLE_DEVICES" in os.environ:
|
| 798 |
+
del os.environ["CUDA_VISIBLE_DEVICES"]
|
| 799 |
+
|
| 800 |
+
if num_gpus > 0:
|
| 801 |
+
logging.info("Detected %d GPUs. Loading depth estimators...", num_gpus)
|
| 802 |
+
def load_est(gpu_id: int):
|
| 803 |
+
device_str = f"cuda:{gpu_id}"
|
| 804 |
est = load_depth_estimator_on_device(depth_estimator_name, device_str)
|
| 805 |
est.lock = RLock()
|
| 806 |
+
return (gpu_id, est)
|
| 807 |
+
|
| 808 |
+
with ThreadPoolExecutor(max_workers=num_gpus) as loader:
|
| 809 |
+
futures = [loader.submit(load_est, i) for i in range(num_gpus)]
|
| 810 |
+
results = [f.result() for f in futures]
|
| 811 |
+
results.sort(key=lambda x: x[0])
|
| 812 |
+
estimators = [r[1] for r in results]
|
| 813 |
else:
|
| 814 |
+
est = load_depth_estimator(depth_estimator_name)
|
| 815 |
+
est.lock = RLock()
|
| 816 |
+
estimators.append(est)
|
| 817 |
+
|
| 818 |
+
# 3. Phase 1: Pre-scan for Stats
|
| 819 |
+
# We sample ~5% of frames or at least 20 frames distributed evenly
|
| 820 |
+
stride = max(1, total_frames // 20)
|
| 821 |
+
logging.info("Starting Phase 1: Pre-scan (stride=%d)...", stride)
|
| 822 |
+
|
| 823 |
+
scan_values = []
|
| 824 |
+
|
| 825 |
+
def scan_task(gpu_idx: int, frame_data: np.ndarray):
|
| 826 |
+
est = estimators[gpu_idx]
|
| 827 |
+
with est.lock:
|
| 828 |
+
result = est.predict(frame_data)
|
| 829 |
+
return result.depth_map
|
| 830 |
+
|
| 831 |
+
# Run scan
|
| 832 |
+
# We can just run this sequentially or with pool? Pool is better.
|
| 833 |
+
# We need to construct a list of frames to scan.
|
| 834 |
+
scan_indices = list(range(0, total_frames, stride))
|
| 835 |
+
|
| 836 |
+
# We need to read specific frames. VideoReader is sequential.
|
| 837 |
+
# So we iterate and skip.
|
| 838 |
+
scan_frames = []
|
| 839 |
+
|
| 840 |
+
# Optimization: If total frames is huge, reading simply to skip might be slow?
|
| 841 |
+
# VideoReader uses cv2.read() which decodes.
|
| 842 |
+
# If we need random access, we should use set(cv2.CAP_PROP_POS_FRAMES).
|
| 843 |
+
# But for now, simple skip logic:
|
| 844 |
+
|
| 845 |
+
current_idx = 0
|
| 846 |
+
# To avoid re-opening multiple times or complex seeking, let's just use the Reader
|
| 847 |
+
# and skip if not in indices.
|
| 848 |
+
# BUT, if video is 1 hour, skipping 99% frames is wastage of decode.
|
| 849 |
+
# Re-opening with set POS is better for sparse sampling.
|
| 850 |
+
|
| 851 |
+
# Actually, for robustness, let's just stick to VideoReader sequential read but only process selective frames.
|
| 852 |
+
# If the video is truly huge, we might want to optimize this later.
|
| 853 |
+
# Given the constraints, let's just scan the first N frames + some middle ones?
|
| 854 |
+
# User agreed to "Small startup delay".
|
| 855 |
+
|
| 856 |
+
# Let's try to just grab the frames we want.
|
| 857 |
+
scan_frames_data = []
|
| 858 |
+
|
| 859 |
+
# Just grab first 50 frames? No, distribution is better.
|
| 860 |
+
# Let's use a temporary reader for scanning
|
| 861 |
+
|
| 862 |
+
try:
|
| 863 |
+
from concurrent.futures import as_completed
|
| 864 |
+
|
| 865 |
+
# Simple Approach: Process first 30 frames to get a baseline.
|
| 866 |
+
# This is usually enough for a "rough" estimation unless scenes change drastically.
|
| 867 |
+
# But for stability, spread is better.
|
| 868 |
+
|
| 869 |
+
# Let's read first 10, middle 10, last 10.
|
| 870 |
+
target_indices = set(list(range(0, 10)) +
|
| 871 |
+
list(range(total_frames//2, total_frames//2 + 10)) +
|
| 872 |
+
list(range(max(0, total_frames-10), total_frames)))
|
| 873 |
+
|
| 874 |
+
# Filter valid
|
| 875 |
+
target_indices = sorted([i for i in target_indices if i < total_frames])
|
| 876 |
+
|
| 877 |
+
# Manual read with seek is tricky with cv2 (unreliable keyframes).
|
| 878 |
+
# We will iterate and pick.
|
| 879 |
+
|
| 880 |
+
cnt = 0
|
| 881 |
+
reader_scan = VideoReader(input_video_path)
|
| 882 |
+
for i, frame in enumerate(reader_scan):
|
| 883 |
+
if i in target_indices:
|
| 884 |
+
scan_frames_data.append(frame)
|
| 885 |
+
if i > max(target_indices):
|
| 886 |
+
break
|
| 887 |
+
reader_scan.close()
|
| 888 |
+
|
| 889 |
+
# Run inference on these frames
|
| 890 |
+
with ThreadPoolExecutor(max_workers=min(len(estimators)*2, 8)) as pool:
|
| 891 |
+
futures = []
|
| 892 |
+
for i, frm in enumerate(scan_frames_data):
|
| 893 |
+
gpu = i % len(estimators)
|
| 894 |
+
futures.append(pool.submit(scan_task, gpu, frm))
|
| 895 |
|
| 896 |
+
for f in as_completed(futures):
|
| 897 |
+
dm = f.result()
|
| 898 |
+
scan_values.append(dm)
|
| 899 |
+
|
| 900 |
+
except Exception as e:
|
| 901 |
+
logging.warning("Pre-scan failed, falling back to default range: %s", e)
|
| 902 |
+
|
| 903 |
+
# Compute stats
|
| 904 |
+
global_min, global_max = 0.0, 1.0
|
| 905 |
+
if scan_values:
|
| 906 |
+
all_vals = np.concatenate([v.ravel() for v in scan_values])
|
| 907 |
+
valid = all_vals[np.isfinite(all_vals)]
|
| 908 |
+
if valid.size > 0:
|
| 909 |
+
global_min = float(np.percentile(valid, 1))
|
| 910 |
+
global_max = float(np.percentile(valid, 99))
|
| 911 |
|
| 912 |
+
# Safety
|
| 913 |
+
if abs(global_max - global_min) < 1e-6:
|
| 914 |
+
global_max = global_min + 1.0
|
| 915 |
+
|
| 916 |
+
logging.info("Global Depth Range: %.2f - %.2f", global_min, global_max)
|
| 917 |
+
|
| 918 |
+
# 4. Phase 2: Streaming Inference
|
| 919 |
+
logging.info("Starting Phase 2: Streaming...")
|
| 920 |
+
|
| 921 |
+
queue_in = Queue(maxsize=16)
|
| 922 |
+
queue_out = Queue()
|
| 923 |
+
|
| 924 |
+
def worker_depth(gpu_idx: int):
|
| 925 |
+
est = estimators[gpu_idx]
|
| 926 |
+
while True:
|
| 927 |
+
item = queue_in.get()
|
| 928 |
+
if item is None:
|
| 929 |
+
queue_in.task_done()
|
| 930 |
+
break
|
| 931 |
|
| 932 |
+
idx, frame = item
|
| 933 |
+
try:
|
| 934 |
+
if idx % 30 == 0:
|
| 935 |
+
logging.info("Depth frame %d (GPU %d)", idx, gpu_idx)
|
| 936 |
+
|
| 937 |
+
with est.lock:
|
| 938 |
+
res = est.predict(frame)
|
| 939 |
+
|
| 940 |
+
depth_map = res.depth_map
|
| 941 |
+
# Colorize
|
| 942 |
+
colored = colorize_depth_map(depth_map, global_min, global_max)
|
| 943 |
+
|
| 944 |
+
# Overlay Detections
|
| 945 |
+
# Detections list is [ [det1, det2], [det1, det2] ... ]
|
| 946 |
+
if detections and idx < len(detections):
|
| 947 |
+
frame_dets = detections[idx]
|
| 948 |
+
if frame_dets:
|
| 949 |
+
import cv2
|
| 950 |
+
boxes = []
|
| 951 |
+
labels = []
|
| 952 |
+
for d in frame_dets:
|
| 953 |
+
boxes.append(d.get("bbox"))
|
| 954 |
+
lbl = d.get("label", "obj")
|
| 955 |
+
if d.get("depth_est_m"):
|
| 956 |
+
lbl = f"{lbl} {int(d['depth_est_m'])}m"
|
| 957 |
+
labels.append(lbl)
|
| 958 |
+
colored = draw_boxes(colored, boxes=boxes, label_names=labels)
|
| 959 |
+
|
| 960 |
+
queue_out.put((idx, colored))
|
| 961 |
+
except Exception as e:
|
| 962 |
+
logging.error("Depth worker failed frame %d: %s", idx, e)
|
| 963 |
+
queue_out.put((idx, frame)) # Fallback to original?
|
| 964 |
+
|
| 965 |
+
queue_in.task_done()
|
| 966 |
|
| 967 |
+
# Workers
|
| 968 |
+
workers = []
|
| 969 |
+
for i in range(len(estimators)):
|
| 970 |
+
t = Thread(target=worker_depth, args=(i,), daemon=True)
|
| 971 |
+
t.start()
|
| 972 |
+
workers.append(t)
|
| 973 |
|
| 974 |
+
# Writer
|
| 975 |
+
writer_finished = False
|
| 976 |
+
first_frame_saved = False
|
|
|
| 977 |
|
| 978 |
+
def writer_loop():
|
| 979 |
+
nonlocal writer_finished, first_frame_saved
|
| 980 |
+
next_idx = 0
|
| 981 |
+
buffer = {}
|
| 982 |
+
processed_frames_subset = [] # Keep first frame for saving if needed
|
|
|
| 983 |
|
| 984 |
+
try:
|
| 985 |
+
with VideoWriter(output_video_path, fps, width, height) as writer:
|
| 986 |
+
while next_idx < total_frames:
|
| 987 |
+
try:
|
| 988 |
+
while next_idx not in buffer:
|
| 989 |
+
idx, frm = queue_out.get(timeout=1.0)
|
| 990 |
+
buffer[idx] = frm
|
| 991 |
+
|
| 992 |
+
frm = buffer.pop(next_idx)
|
| 993 |
+
writer.write(frm)
|
| 994 |
+
|
| 995 |
+
if first_frame_depth_path and not first_frame_saved and next_idx == 0:
|
| 996 |
+
cv2.imwrite(first_frame_depth_path, frm)
|
| 997 |
+
first_frame_saved = True
|
| 998 |
+
|
| 999 |
+
next_idx += 1
|
| 1000 |
+
if next_idx % 30 == 0:
|
| 1001 |
+
logging.debug("Wrote depth frame %d/%d", next_idx, total_frames)
|
| 1002 |
+
except Exception:
|
| 1003 |
+
if job_id and _check_cancellation(job_id): pass
|
| 1004 |
+
if not any(w.is_alive() for w in workers) and queue_out.empty():
|
| 1005 |
+
break
|
| 1006 |
+
continue
|
| 1007 |
+
finally:
|
| 1008 |
+
writer_finished = True
|
| 1009 |
+
|
| 1010 |
+
w_thread = Thread(target=writer_loop, daemon=True)
|
| 1011 |
+
w_thread.start()
|
| 1012 |
+
|
| 1013 |
+
# Feeder
|
| 1014 |
+
try:
|
| 1015 |
+
reader = VideoReader(input_video_path)
|
| 1016 |
+
for i, frame in enumerate(reader):
|
| 1017 |
+
_check_cancellation(job_id)
|
| 1018 |
+
if max_frames is not None and i >= max_frames:
|
| 1019 |
+
break
|
| 1020 |
+
queue_in.put((i, frame))
|
| 1021 |
|
| 1022 |
+
for _ in workers:
|
| 1023 |
+
queue_in.put(None)
|
| 1024 |
+
queue_in.join()
|
| 1025 |
|
| 1026 |
+
finally:
|
| 1027 |
+
reader.close()
|
|
|
| 1028 |
|
| 1029 |
+
w_thread.join()
|
| 1030 |
+
|
| 1031 |
+
return output_video_path
|
| 1032 |
|
|
|
|
| 1033 |
|
| 1034 |
|
| 1035 |
def colorize_depth_map(
|
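
The three rewritten entry points above (run_inference, run_segmentation, run_depth_inference) share one streaming pattern: a bounded input queue fed from the main thread, one worker thread per loaded model instance, and a writer that re-orders results by frame index before writing. The snippet below is a minimal, self-contained sketch of that pattern, not repo code: process_frame is a hypothetical stand-in for the per-frame inference call, and for brevity it drains results after the workers finish rather than writing concurrently as the real writer_loop does.

    from queue import Queue
    from threading import Thread

    def process_frame(idx, frame):
        return frame  # placeholder for per-frame inference

    def run_pipeline(frames, num_workers=2):
        queue_in, queue_out = Queue(maxsize=16), Queue()

        def worker():
            while True:
                item = queue_in.get()
                if item is None:          # sentinel: shut this worker down
                    queue_in.task_done()
                    break
                idx, frame = item
                queue_out.put((idx, process_frame(idx, frame)))
                queue_in.task_done()

        workers = [Thread(target=worker, daemon=True) for _ in range(num_workers)]
        for t in workers:
            t.start()

        for idx, frame in enumerate(frames):  # feeder: blocks when queue_in is full
            queue_in.put((idx, frame))
        for _ in workers:                      # one sentinel per worker
            queue_in.put(None)
        queue_in.join()

        # collect and re-order so output preserves frame order
        buffered = {}
        while not queue_out.empty():
            idx, result = queue_out.get()
            buffered[idx] = result
        return [buffered[i] for i in sorted(buffered)]

The bounded queue_in is what caps memory: the feeder stalls instead of decoding arbitrarily far ahead of the workers.
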
utils/video.py
CHANGED
|
@@ -77,3 +77,80 @@ def write_video(frames: List[np.ndarray], output_path: str, fps: float, width: i
| 77 | except RuntimeError as exc:
| 78 | logging.warning("ffmpeg transcode failed (%s); serving fallback MP4V output.", exc)
| 79 | shutil.move(temp_path, output_path)
|
| 77 |
except RuntimeError as exc:
|
| 78 |
logging.warning("ffmpeg transcode failed (%s); serving fallback MP4V output.", exc)
|
| 79 |
shutil.move(temp_path, output_path)
|
| 80 |
+
|
| 81 |
+
class VideoReader:
|
| 82 |
+
def __init__(self, video_path: str):
|
| 83 |
+
self.video_path = video_path
|
| 84 |
+
self.cap = cv2.VideoCapture(video_path)
|
| 85 |
+
if not self.cap.isOpened():
|
| 86 |
+
raise ValueError("Unable to open video.")
|
| 87 |
+
|
| 88 |
+
self.fps = self.cap.get(cv2.CAP_PROP_FPS) or 30.0
|
| 89 |
+
self.width = int(self.cap.get(cv2.CAP_PROP_FRAME_WIDTH))
|
| 90 |
+
self.height = int(self.cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
|
| 91 |
+
self.total_frames = int(self.cap.get(cv2.CAP_PROP_FRAME_COUNT))
|
| 92 |
+
|
| 93 |
+
def __iter__(self):
|
| 94 |
+
return self
|
| 95 |
+
|
| 96 |
+
def __next__(self) -> np.ndarray:
|
| 97 |
+
if not self.cap.isOpened():
|
| 98 |
+
raise StopIteration
|
| 99 |
+
|
| 100 |
+
success, frame = self.cap.read()
|
| 101 |
+
if not success:
|
| 102 |
+
self.cap.release()
|
| 103 |
+
raise StopIteration
|
| 104 |
+
return frame
|
| 105 |
+
|
| 106 |
+
def close(self):
|
| 107 |
+
if self.cap.isOpened():
|
| 108 |
+
self.cap.release()
|
| 109 |
+
|
| 110 |
+
def __enter__(self):
|
| 111 |
+
return self
|
| 112 |
+
|
| 113 |
+
def __exit__(self, exc_type, exc_val, exc_tb):
|
| 114 |
+
self.close()
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
class VideoWriter:
|
| 118 |
+
def __init__(self, output_path: str, fps: float, width: int, height: int):
|
| 119 |
+
self.output_path = output_path
|
| 120 |
+
self.fps = fps
|
| 121 |
+
self.width = width
|
| 122 |
+
self.height = height
|
| 123 |
+
|
| 124 |
+
self.temp_fd, self.temp_path = tempfile.mkstemp(prefix="raw_", suffix=".mp4")
|
| 125 |
+
os.close(self.temp_fd)
|
| 126 |
+
|
| 127 |
+
# Use mp4v for speed during writing, then transcode
|
| 128 |
+
self.writer = cv2.VideoWriter(self.temp_path, cv2.VideoWriter_fourcc(*"mp4v"), self.fps, (self.width, self.height))
|
| 129 |
+
if not self.writer.isOpened():
|
| 130 |
+
os.remove(self.temp_path)
|
| 131 |
+
raise ValueError("Failed to open VideoWriter.")
|
| 132 |
+
|
| 133 |
+
def write(self, frame: np.ndarray):
|
| 134 |
+
self.writer.write(frame)
|
| 135 |
+
|
| 136 |
+
def close(self):
|
| 137 |
+
if self.writer.isOpened():
|
| 138 |
+
self.writer.release()
|
| 139 |
+
|
| 140 |
+
# Transcode phase
|
| 141 |
+
try:
|
| 142 |
+
_transcode_with_ffmpeg(self.temp_path, self.output_path)
|
| 143 |
+
logging.debug("Transcoded video to H.264 for browser compatibility.")
|
| 144 |
+
os.remove(self.temp_path)
|
| 145 |
+
except FileNotFoundError:
|
| 146 |
+
logging.warning("ffmpeg not found; serving fallback MP4V output.")
|
| 147 |
+
shutil.move(self.temp_path, self.output_path)
|
| 148 |
+
except RuntimeError as exc:
|
| 149 |
+
logging.warning("ffmpeg transcode failed (%s); serving fallback MP4V output.", exc)
|
| 150 |
+
shutil.move(self.temp_path, self.output_path)
|
| 151 |
+
|
| 152 |
+
def __enter__(self):
|
| 153 |
+
return self
|
| 154 |
+
|
| 155 |
+
def __exit__(self, exc_type, exc_val, exc_tb):
|
| 156 |
+
self.close()
|