perception2

Sleeping

App Files Files Community

Zhen Ye committed on Jan 21

Commit

8d938e9

1 Parent(s): 64bbe44

Optimize streaming buffer and implement parallel depth pipeline

Browse files

Files changed (4) hide show

app.py +34 -13
inference.py +113 -15
jobs/background.py +19 -39
jobs/streaming.py +2 -1

app.py CHANGED Viewed

@@ -529,23 +529,40 @@ async def stream_video(job_id: str):
     async def stream_generator():
         loop = asyncio.get_running_loop()
         while True:
             q = get_stream(job_id)
             if not q:
                 break
             try:
-                # Get latest frame (skipping updated ones if laggy?)
-                # Actually, standard queue get is fine if we consume fast enough.
-                # To be super real-time, we could drain the queue?
-                frame = q.get_nowait()
-                while not q.empty():
-                    try:
-                        frame = q.get_nowait()
-                    except queue.Empty:
-                        break
                 # Resize if too big (e.g. > 640 width)
                 h, w = frame.shape[:2]
                 if w > 640:
                     scale = 640 / w
@@ -553,15 +570,19 @@ async def stream_video(job_id: str):
                     frame = cv2.resize(frame, (640, new_h), interpolation=cv2.INTER_NEAREST)
                 # Encode in thread
-                # JPEG Quality = 50 (Balance between speed/size)
-                encode_param = [int(cv2.IMWRITE_JPEG_QUALITY), 50]
                 success, buffer = await loop.run_in_executor(None, cv2.imencode, '.jpg', frame, encode_param)
                 if success:
                     yield (b'--frame\r\n'
                            b'Content-Type: image/jpeg\r\n\r\n' + buffer.tobytes() + b'\r\n')
-            except queue.Empty:
-                await asyncio.sleep(0.02)
             except Exception:
                 await asyncio.sleep(0.1)

     async def stream_generator():
         loop = asyncio.get_running_loop()
+        buffered = False
         while True:
             q = get_stream(job_id)
             if not q:
                 break
             try:
+                # Initial Buffer: Wait until we have enough frames or job is done
+                if not buffered:
+                    if q.qsize() < 30:
+                        # If queue is empty, wait a bit
+                        await asyncio.sleep(0.1)
+                        # Check if job is still running? For now just wait for buffer or stream close
+                        continue
+                    buffered = True
+                # Get ONE frame (no skipping)
+                # Use wait to allow generator to yield cleanly
+                try:
+                    # Blocking get in executor to avoid hanging async loop?
+                    # Actually standard queue.get() is blocking. get_nowait is not.
+                    # We can sleep-poll for async compatibility
+                    while q.empty():
+                        await asyncio.sleep(0.01)
+                        if not get_stream(job_id): # Stream closed
+                            return
+                    frame = q.get_nowait()
+                except queue.Empty:
+                    continue
                 # Resize if too big (e.g. > 640 width)
+                # Optimization: Only resize if needed
                 h, w = frame.shape[:2]
                 if w > 640:
                     scale = 640 / w
                     frame = cv2.resize(frame, (640, new_h), interpolation=cv2.INTER_NEAREST)
                 # Encode in thread
+                # JPEG Quality = 60 (Better quality for smooth video)
+                encode_param = [int(cv2.IMWRITE_JPEG_QUALITY), 60]
                 success, buffer = await loop.run_in_executor(None, cv2.imencode, '.jpg', frame, encode_param)
                 if success:
                     yield (b'--frame\r\n'
                            b'Content-Type: image/jpeg\r\n\r\n' + buffer.tobytes() + b'\r\n')
+                # Control playback speed?
+                # If we blast frames as fast as possible, it might play accelerated.
+                # Ideally we want to sync to ~30fps.
+                await asyncio.sleep(0.033) # Simple pacer (~30fps)
             except Exception:
                 await asyncio.sleep(0.1)

inference.py CHANGED Viewed

@@ -669,19 +669,72 @@ def run_inference(
         else:
             depth_estimators.append(None)
-    # 4. Processing Queues
     # queue_in: (frame_idx, frame_data)
     # queue_out: (frame_idx, processed_frame, detections)
     queue_in = Queue(maxsize=16)
-    # Bound queue_out to prevent OOM
-    # Maxsize should be enough to keep writer busy but not explode memory
     queue_out_max = max(32, (len(detectors) if detectors else 1) * 4)
     queue_out = Queue(maxsize=queue_out_max)
-    # 5. Worker Function
     def worker_task(gpu_idx: int):
         detector_instance = detectors[gpu_idx]
-        depth_instance = depth_estimators[gpu_idx] if depth_estimators[gpu_idx] else None
         batch_size = detector_instance.max_batch_size if detector_instance.supports_batch else 1
         batch_accum = [] # List[Tuple[idx, frame]]
@@ -692,20 +745,65 @@ def run_inference(
             indices = [item[0] for item in batch_accum]
             frames = [item[1] for item in batch_accum]
-            batch_outputs = infer_batch(
-                frames, indices, queries, detector_instance,
-                depth_estimator_instance=depth_instance,
-                depth_scale=depth_scale
-            )
-            for out_item in batch_outputs:
                 while True:
                     try:
-                        queue_out.put(out_item, timeout=1.0)
                         break
                     except Full:
                         if job_id: _check_cancellation(job_id)
             batch_accum.clear()
         while True:
@@ -790,7 +888,7 @@ def run_inference(
                         if stream_queue:
                             try:
-                                stream_queue.put_nowait(p_frame)
                             except:
                                 pass

         else:
             depth_estimators.append(None)
+    # 4. Phase 1: Pre-Scan (Depth Normalization Stats) - ONLY IF DEPTH ENABLED
+    global_min, global_max = 0.0, 1.0
+    if depth_estimator_name and depth_estimators[0]:
+        logging.info("Starting Phase 1: Pre-scan for depth stats...")
+        # We need a quick scan logic here.
+        # Since we have loaded models, we can use one of them to scan a few frames.
+        # Let's pick 0-th GPU model.
+        scan_est = depth_estimators[0]
+        scan_values = []
+        # Sample frames: First 10, Middle 10, Last 10
+        target_indices = set(list(range(0, 10)) +
+                             list(range(total_frames//2, total_frames//2 + 10)) +
+                             list(range(max(0, total_frames-10), total_frames)))
+        target_indices = sorted([i for i in target_indices if i < total_frames])
+        try:
+             # Quick reader scan
+             reader_scan = VideoReader(input_video_path)
+             scan_frames = []
+             for i, frame in enumerate(reader_scan):
+                 if i in target_indices:
+                     scan_frames.append(frame)
+                 if i > max(target_indices):
+                     break
+             reader_scan.close()
+             # Predict
+             with scan_est.lock:
+                # Batch if supported, else loop
+                if scan_est.supports_batch and scan_frames:
+                     scan_res = scan_est.predict_batch(scan_frames)
+                else:
+                     scan_res = [scan_est.predict(f) for f in scan_frames]
+             for r in scan_res:
+                 if r.depth_map is not None:
+                     scan_values.append(r.depth_map)
+             # Stats
+             if scan_values:
+                all_vals = np.concatenate([v.ravel() for v in scan_values])
+                valid = all_vals[np.isfinite(all_vals)]
+                if valid.size > 0:
+                    global_min = float(np.percentile(valid, 1))
+                    global_max = float(np.percentile(valid, 99))
+                    # Prevent zero range
+                    if abs(global_max - global_min) < 1e-6: global_max = global_min + 1.0
+             logging.info("Global Depth Range: %.2f - %.2f", global_min, global_max)
+        except Exception as e:
+             logging.warning("Pre-scan failed, using default range: %s", e)
+    # 5. Processing Queues
     # queue_in: (frame_idx, frame_data)
     # queue_out: (frame_idx, processed_frame, detections)
     queue_in = Queue(maxsize=16)
     queue_out_max = max(32, (len(detectors) if detectors else 1) * 4)
     queue_out = Queue(maxsize=queue_out_max)
+    # 6. Worker Function (Unified)
     def worker_task(gpu_idx: int):
         detector_instance = detectors[gpu_idx]
+        depth_instance = depth_estimators[gpu_idx] if gpu_idx < len(depth_estimators) else None # Handle mismatched lists safely
         batch_size = detector_instance.max_batch_size if detector_instance.supports_batch else 1
         batch_accum = [] # List[Tuple[idx, frame]]
             indices = [item[0] for item in batch_accum]
             frames = [item[1] for item in batch_accum]
+            # --- UNIFIED INFERENCE ---
+            # Run detection batch
+            try:
+                if detector_instance.supports_batch:
+                    with detector_instance.lock:
+                         det_results = detector_instance.predict_batch(frames, queries) # Assuming predict_batch takes queries
+                else:
+                    with detector_instance.lock:
+                         det_results = [detector_instance.predict(f, queries) for f in frames]
+            except Exception:
+                logging.exception("Batch detection failed")
+                det_results = [None] * len(frames)
+            # Run depth batch (if enabled)
+            depth_results = [None] * len(frames)
+            if depth_instance and depth_estimator_name:
+                try:
+                     with depth_instance.lock:
+                         if depth_instance.supports_batch:
+                             depth_results = depth_instance.predict_batch(frames)
+                         else:
+                             depth_results = [depth_instance.predict(f) for f in frames]
+                except Exception:
+                     logging.exception("Batch depth failed")
+            # --- POST PROCESSING ---
+            for i, (idx, frame, d_res, dep_res) in enumerate(zip(indices, frames, det_results, depth_results)):
+                # 1. Detections
+                detections = []
+                if d_res:
+                     detections = _build_detection_records(
+                        d_res.boxes, d_res.scores, d_res.labels, queries, d_res.label_names
+                     )
+                # 2. Frame Rendering
+                processed = frame.copy()
+                # A. Render Depth Heatmap (if enabled)
+                # Overwrites original frame visual
+                if dep_res and dep_res.depth_map is not None:
+                     processed = colorize_depth_map(dep_res.depth_map, global_min, global_max)
+                     # Also optionally attach 'depth_rel' to detections based on this map?
+                     try:
+                         _attach_depth_from_result(detections, dep_res, depth_scale)
+                     except: pass
+                # B. Render Boxes (on top of whatever visual we have)
+                display_labels = [_build_display_label(d) for d in detections]
+                if d_res:
+                    processed = draw_boxes(processed, d_res.boxes, label_names=display_labels)
+                # 3. Output
                 while True:
                     try:
+                        queue_out.put((idx, processed, detections), timeout=1.0)
                         break
                     except Full:
                         if job_id: _check_cancellation(job_id)
             batch_accum.clear()
         while True:
                         if stream_queue:
                             try:
+                                stream_queue.put(p_frame, timeout=0.01)
                             except:
                                 pass

jobs/background.py CHANGED Viewed

@@ -39,6 +39,7 @@ async def process_video_async(job_id: str) -> None:
             )
         else:
             detections_list = None
             result_pkg = await asyncio.to_thread(
                 run_inference,
                 job.input_video_path,
@@ -47,50 +48,29 @@ async def process_video_async(job_id: str) -> None:
                 None,
                 job.detector_name,
                 job_id,
-                job.depth_estimator_name,
                 job.depth_scale,
                 stream_queue,
             )
-            # run_inference now returns (path, detections)
             detection_path, detections_list = result_pkg
-        # Try to run depth estimation (only if requested)
-        if job.depth_estimator_name:
-            try:
-                depth_path = await asyncio.to_thread(
-                    run_depth_inference,
-                    job.input_video_path,
-                    str(get_depth_output_path(job_id)),
-                    detections_list,  # Pass detections for overlay
-                    None,  # max_frames
-                    job.depth_estimator_name,
-                    str(get_first_frame_depth_path(job_id)),
-                    job_id,
-                    stream_queue,
-                )
-                logging.info("Depth estimation completed for job %s", job_id)
-            except (ImportError, ModuleNotFoundError) as exc:
-                logging.exception("Depth model not available for job %s", job_id)
-                depth_error = f"Depth model import failed: {exc}"
-                partial_success = True
-            except torch.cuda.OutOfMemoryError:
-                logging.exception("Depth estimation failed due to GPU OOM for job %s", job_id)
-                depth_error = "Depth estimation failed due to GPU memory limits"
-                partial_success = True
-            except RuntimeError as exc:
-                # Handle cancellation specifically for depth
-                if "cancelled" in str(exc).lower():
-                    logging.info("Depth processing cancelled for job %s", job_id)
-                    depth_error = "Depth processing cancelled"
-                    partial_success = True
-                else:
-                    logging.exception("Depth estimation failed for job %s", job_id)
-                    depth_error = f"Depth processing error: {str(exc)}"
-                    partial_success = True
-            except Exception as exc:
-                logging.exception("Depth estimation failed for job %s", job_id)
-                depth_error = f"Depth processing error: {str(exc)}"
-                partial_success = True
         # Mark as completed (with or without depth)
         storage.update(

             )
         else:
             detections_list = None
+            # Unified inference pipeline (handles depth internally if enabled)
             result_pkg = await asyncio.to_thread(
                 run_inference,
                 job.input_video_path,
                 None,
                 job.detector_name,
                 job_id,
+                job.depth_estimator_name, # Pass depth estimator to trigger unified loop
                 job.depth_scale,
                 stream_queue,
             )
             detection_path, detections_list = result_pkg
+            # If depth was requested, checking if output path exists for depth
+            # The unified pipeline creates 'output_video_path'.
+            # If depth enabled, it might have written depth there?
+            # Actually run_inference returns (video_path, detections).
+            # If depth was ON, the video at video_path *has* depth overlays.
+            # But the 'Depth Video' (heatmap only) is usually separate.
+            # Our Plan says: "Unified loop... Write Frame to Disk".
+            # If we want separate depth video, we need to instruct run_inference to write TWO videos?
+            # Or just update 'depth_path' to be the same main video if it's merged?
+            # Let's keep it simple: If depth enabled, the main video IS the depth view (overlay).
+            # Or if we want separate `depth_output_path`, we need `run_inference` to handle it.
+            # Let's assume for now `run_inference` writes the main visualization path.
+            if job.depth_estimator_name:
+                 # In unified mode, the main video contains the depth viz
+                 depth_path = detection_path
+                 logging.info("Depth estimation included in main video for job %s", job_id)
         # Mark as completed (with or without depth)
         storage.update(

jobs/streaming.py CHANGED Viewed

@@ -12,7 +12,8 @@ def create_stream(job_id: str) -> queue.Queue:
     with _LOCK:
         # standard Queue, thread-safe
         # maxsize to prevent memory explosion if consumer is slow
-        q = queue.Queue(maxsize=10)
         _STREAMS[job_id] = q
         return q

     with _LOCK:
         # standard Queue, thread-safe
         # maxsize to prevent memory explosion if consumer is slow
+        # Buffer increased to 300 (approx 10s at 30fps) for smooth streaming
+        q = queue.Queue(maxsize=300)
         _STREAMS[job_id] = q
         return q