Zhen Ye committed on
Commit
4977275
·
1 Parent(s): 55e372a

Add StreamingVideoWriter (ffmpeg pipe), reuse first-frame GPT results in inference pipeline

Browse files
Files changed (2) hide show
  1. inference.py +69 -61
  2. utils/video.py +78 -3
inference.py CHANGED
@@ -21,8 +21,8 @@ from models.model_loader import load_detector, load_detector_on_device
21
  from models.segmenters.model_loader import load_segmenter, load_segmenter_on_device
22
  from models.depth_estimators.model_loader import load_depth_estimator, load_depth_estimator_on_device
23
  from models.depth_estimators.base import DepthEstimator
24
- from utils.video import extract_frames, write_video, VideoReader, VideoWriter, AsyncVideoReader
25
- from utils.gpt_reasoning import estimate_threat_gpt
26
  from utils.relevance import evaluate_relevance, evaluate_relevance_llm
27
  from jobs.storage import set_track_data
28
  import tempfile
@@ -718,13 +718,13 @@ def process_first_frame(
718
  enable_depth_estimator: bool = False,
719
  enable_gpt: bool = True, # ENABLED BY DEFAULT
720
  mission_spec=None, # Optional[MissionSpecification]
721
- ) -> Tuple[np.ndarray, List[Dict[str, Any]], Optional[np.ndarray]]:
722
  frame, _, _, _ = extract_first_frame(video_path)
723
  if mode == "segmentation":
724
  processed, _ = infer_segmentation_frame(
725
  frame, text_queries=queries, segmenter_name=segmenter_name
726
  )
727
- return processed, [], None
728
 
729
  processed, detections = infer_frame(
730
  frame, queries, detector_name=detector_name
@@ -822,15 +822,15 @@ def process_first_frame(
822
 
823
  # 2. GPT-based Distance/Direction Estimation (Explicitly enabled)
824
  # Only assess mission-relevant detections
 
825
  if enable_gpt and gpt_input_dets:
826
  try:
827
- with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_img:
828
- cv2.imwrite(tmp_img.name, frame)
829
- gpt_results = estimate_threat_gpt(
830
- tmp_img.name, gpt_input_dets, mission_spec=mission_spec
831
- )
832
- logging.info(f"GPT Output for First Frame:\n{gpt_results}")
833
- os.remove(tmp_img.name)
834
 
835
  # Merge GPT results into detections (polyfilled keys from gpt_reasoning)
836
  for i, det in enumerate(gpt_input_dets):
@@ -851,7 +851,7 @@ def process_first_frame(
851
  if "assessment_status" not in det:
852
  det["assessment_status"] = "UNASSESSED"
853
 
854
- return processed, detections, depth_map
855
 
856
 
857
  def run_inference(
@@ -866,6 +866,7 @@ def run_inference(
866
  enable_gpt: bool = True,
867
  stream_queue: Optional[Queue] = None,
868
  mission_spec=None, # Optional[MissionSpecification]
 
869
  ) -> Tuple[str, List[List[Dict[str, Any]]]]:
870
 
871
  # 1. Setup Video Reader
@@ -995,7 +996,7 @@ def run_inference(
995
  queue_in = Queue(maxsize=16)
996
  # Tuning for A10: buffer at least 32 frames per GPU (batch size)
997
  # GPT Latency Buffer: GPT takes ~3s. At 30fps, that's 90 frames. We need to absorb this burst.
998
- queue_out_max = max(512, (len(detectors) if detectors else 1) * 64)
999
  queue_out = Queue(maxsize=queue_out_max)
1000
 
1001
 
@@ -1147,20 +1148,15 @@ def run_inference(
1147
  llm_filtered = False # LLM post-filter runs once on frame 0
1148
 
1149
  try:
1150
- with VideoWriter(output_video_path, fps, width, height) as writer:
1151
  while next_idx < total_frames:
1152
  # Fetch from queue
1153
  try:
1154
  while next_idx not in buffer:
1155
- # Backpressure: If buffer gets too big due to out-of-order frames,
1156
- # we might want to warn or just hope for the best.
1157
- # But here we are just consuming.
1158
-
1159
- # However, if 'buffer' grows too large (because we are missing next_idx),
1160
- # we are effectively unbounded again if queue_out fills up with future frames.
1161
- # So we should monitor buffer size.
1162
- if len(buffer) > 200 and len(buffer) % 50 == 0:
1163
- logging.warning("Writer buffer large (%d items), waiting for frame %d (GPT Latency?)...", len(buffer), next_idx)
1164
 
1165
  item = queue_out.get(timeout=1.0) # wait
1166
 
@@ -1180,20 +1176,27 @@ def run_inference(
1180
  and mission_spec.parse_mode == "LLM_EXTRACTED"
1181
  and not llm_filtered
1182
  and next_idx == 0):
1183
- unique_labels = list({
1184
- d.get("label", "").lower()
1185
- for d in dets if d.get("label")
1186
- })
1187
- relevant_labels = evaluate_relevance_llm(
1188
- unique_labels, mission_spec.operator_text
1189
- )
1190
- # Cache into relevance_criteria for all subsequent frames
1191
- mission_spec.relevance_criteria.required_classes = list(relevant_labels)
 
 
 
 
 
 
 
 
 
 
 
1192
  llm_filtered = True
1193
- logging.info(
1194
- "LLM post-filter applied on frame 0: relevant=%s",
1195
- relevant_labels,
1196
- )
1197
 
1198
  # --- RELEVANCE GATE (deterministic, uses updated criteria) ---
1199
  if mission_spec:
@@ -1222,22 +1225,26 @@ def run_inference(
1222
  # --- GPT ESTIMATION (Frame 0 Only) ---
1223
  if next_idx == 0 and enable_gpt and gpt_dets:
1224
  try:
1225
- logging.info("Running GPT estimation for video start (Frame 0)...")
1226
- with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp:
1227
- cv2.imwrite(tmp.name, p_frame)
 
 
 
 
1228
  gpt_res = estimate_threat_gpt(
1229
- tmp.name, gpt_dets, mission_spec=mission_spec
 
1230
  )
1231
- os.remove(tmp.name)
1232
 
1233
- # Merge using real track_id assigned by ByteTracker
1234
- for d in gpt_dets:
1235
- oid = d.get('track_id')
1236
- if oid and oid in gpt_res:
1237
- d.update(gpt_res[oid])
1238
- d["gpt_raw"] = gpt_res[oid]
1239
- d["assessment_frame_index"] = 0
1240
- d["assessment_status"] = "ASSESSED"
1241
 
1242
  # Push GPT data back into tracker's internal STrack objects
1243
  tracker.inject_metadata(gpt_dets)
@@ -1563,17 +1570,17 @@ def run_segmentation(
1563
  buffer = {}
1564
 
1565
  try:
1566
- with VideoWriter(output_video_path, fps, width, height) as writer:
1567
  while next_idx < total_frames:
1568
  try:
1569
  while next_idx not in buffer:
1570
- # Check buffer size
1571
- if len(buffer) > 64:
1572
- logging.warning("Writer buffer large (%d), waiting for %d", len(buffer), next_idx)
1573
 
1574
  idx, frm = queue_out.get(timeout=1.0)
1575
  buffer[idx] = frm
1576
-
1577
  frm = buffer.pop(next_idx)
1578
  writer.write(frm)
1579
 
@@ -1898,15 +1905,16 @@ def run_depth_inference(
1898
  processed_frames_subset = [] # Keep first frame for saving if needed
1899
 
1900
  try:
1901
- with VideoWriter(output_video_path, fps, width, height) as writer:
1902
  while next_idx < total_frames:
1903
  try:
1904
  while next_idx not in buffer:
1905
- if len(buffer) > 64:
1906
- logging.warning("Writer buffer large (%d), waiting for %d", len(buffer), next_idx)
 
1907
  idx, frm = queue_out.get(timeout=1.0)
1908
  buffer[idx] = frm
1909
-
1910
  frm = buffer.pop(next_idx)
1911
  writer.write(frm)
1912
 
@@ -1916,11 +1924,11 @@ def run_depth_inference(
1916
  except:
1917
  pass
1918
 
1919
-
1920
  if first_frame_depth_path and not first_frame_saved and next_idx == 0:
1921
  cv2.imwrite(first_frame_depth_path, frm)
1922
  first_frame_saved = True
1923
-
1924
  next_idx += 1
1925
  if next_idx % 30 == 0:
1926
  logging.debug("Wrote depth frame %d/%d", next_idx, total_frames)
@@ -1934,7 +1942,7 @@ def run_depth_inference(
1934
 
1935
  w_thread = Thread(target=writer_loop, daemon=True)
1936
  w_thread.start()
1937
-
1938
  # Feeder
1939
  try:
1940
  reader_iter = iter(reader)
 
21
  from models.segmenters.model_loader import load_segmenter, load_segmenter_on_device
22
  from models.depth_estimators.model_loader import load_depth_estimator, load_depth_estimator_on_device
23
  from models.depth_estimators.base import DepthEstimator
24
+ from utils.video import extract_frames, write_video, VideoReader, VideoWriter, AsyncVideoReader, StreamingVideoWriter
25
+ from utils.gpt_reasoning import estimate_threat_gpt, encode_frame_to_b64
26
  from utils.relevance import evaluate_relevance, evaluate_relevance_llm
27
  from jobs.storage import set_track_data
28
  import tempfile
 
718
  enable_depth_estimator: bool = False,
719
  enable_gpt: bool = True, # ENABLED BY DEFAULT
720
  mission_spec=None, # Optional[MissionSpecification]
721
+ ) -> Tuple[np.ndarray, List[Dict[str, Any]], Optional[np.ndarray], Optional[Dict[str, Any]]]:
722
  frame, _, _, _ = extract_first_frame(video_path)
723
  if mode == "segmentation":
724
  processed, _ = infer_segmentation_frame(
725
  frame, text_queries=queries, segmenter_name=segmenter_name
726
  )
727
+ return processed, [], None, None
728
 
729
  processed, detections = infer_frame(
730
  frame, queries, detector_name=detector_name
 
822
 
823
  # 2. GPT-based Distance/Direction Estimation (Explicitly enabled)
824
  # Only assess mission-relevant detections
825
+ gpt_results = None
826
  if enable_gpt and gpt_input_dets:
827
  try:
828
+ frame_b64 = encode_frame_to_b64(frame)
829
+ gpt_results = estimate_threat_gpt(
830
+ detections=gpt_input_dets, mission_spec=mission_spec,
831
+ image_b64=frame_b64,
832
+ )
833
+ logging.info(f"GPT Output for First Frame:\n{gpt_results}")
 
834
 
835
  # Merge GPT results into detections (polyfilled keys from gpt_reasoning)
836
  for i, det in enumerate(gpt_input_dets):
 
851
  if "assessment_status" not in det:
852
  det["assessment_status"] = "UNASSESSED"
853
 
854
+ return processed, detections, depth_map, gpt_results
855
 
856
 
857
  def run_inference(
 
866
  enable_gpt: bool = True,
867
  stream_queue: Optional[Queue] = None,
868
  mission_spec=None, # Optional[MissionSpecification]
869
+ first_frame_gpt_results: Optional[Dict[str, Any]] = None,
870
  ) -> Tuple[str, List[List[Dict[str, Any]]]]:
871
 
872
  # 1. Setup Video Reader
 
996
  queue_in = Queue(maxsize=16)
997
  # Tuning for A10: buffer at least 32 frames per GPU (batch size)
998
  # GPT Latency Buffer: GPT takes ~3s. At 30fps, that's 90 frames. We need to absorb this burst.
999
+ queue_out_max = max(128, (len(detectors) if detectors else 1) * 32)
1000
  queue_out = Queue(maxsize=queue_out_max)
1001
 
1002
 
 
1148
  llm_filtered = False # LLM post-filter runs once on frame 0
1149
 
1150
  try:
1151
+ with StreamingVideoWriter(output_video_path, fps, width, height) as writer:
1152
  while next_idx < total_frames:
1153
  # Fetch from queue
1154
  try:
1155
  while next_idx not in buffer:
1156
+ # Backpressure: bound the reorder buffer to prevent memory blowup
1157
+ if len(buffer) > 128:
1158
+ logging.warning("Writer reorder buffer too large (%d items), applying backpressure (waiting for frame %d)...", len(buffer), next_idx)
1159
+ time.sleep(0.05)
 
 
 
 
 
1160
 
1161
  item = queue_out.get(timeout=1.0) # wait
1162
 
 
1176
  and mission_spec.parse_mode == "LLM_EXTRACTED"
1177
  and not llm_filtered
1178
  and next_idx == 0):
1179
+ # Skip if process_first_frame already populated required_classes
1180
+ if mission_spec.relevance_criteria.required_classes:
1181
+ logging.info(
1182
+ "LLM post-filter already applied by process_first_frame: classes=%s",
1183
+ mission_spec.relevance_criteria.required_classes,
1184
+ )
1185
+ else:
1186
+ unique_labels = list({
1187
+ d.get("label", "").lower()
1188
+ for d in dets if d.get("label")
1189
+ })
1190
+ relevant_labels = evaluate_relevance_llm(
1191
+ unique_labels, mission_spec.operator_text
1192
+ )
1193
+ # Cache into relevance_criteria for all subsequent frames
1194
+ mission_spec.relevance_criteria.required_classes = list(relevant_labels)
1195
+ logging.info(
1196
+ "LLM post-filter applied on frame 0: relevant=%s",
1197
+ relevant_labels,
1198
+ )
1199
  llm_filtered = True
 
 
 
 
1200
 
1201
  # --- RELEVANCE GATE (deterministic, uses updated criteria) ---
1202
  if mission_spec:
 
1225
  # --- GPT ESTIMATION (Frame 0 Only) ---
1226
  if next_idx == 0 and enable_gpt and gpt_dets:
1227
  try:
1228
+ if first_frame_gpt_results:
1229
+ # Re-use GPT results from process_first_frame (avoid duplicate call)
1230
+ logging.info("Re-using GPT results from first-frame processing (skipping duplicate call)")
1231
+ gpt_res = first_frame_gpt_results
1232
+ else:
1233
+ logging.info("Running GPT estimation for video start (Frame 0)...")
1234
+ frame_b64 = encode_frame_to_b64(p_frame)
1235
  gpt_res = estimate_threat_gpt(
1236
+ detections=gpt_dets, mission_spec=mission_spec,
1237
+ image_b64=frame_b64,
1238
  )
 
1239
 
1240
+ # Merge using real track_id assigned by ByteTracker
1241
+ for d in gpt_dets:
1242
+ oid = d.get('track_id')
1243
+ if oid and oid in gpt_res:
1244
+ d.update(gpt_res[oid])
1245
+ d["gpt_raw"] = gpt_res[oid]
1246
+ d["assessment_frame_index"] = 0
1247
+ d["assessment_status"] = "ASSESSED"
1248
 
1249
  # Push GPT data back into tracker's internal STrack objects
1250
  tracker.inject_metadata(gpt_dets)
 
1570
  buffer = {}
1571
 
1572
  try:
1573
+ with StreamingVideoWriter(output_video_path, fps, width, height) as writer:
1574
  while next_idx < total_frames:
1575
  try:
1576
  while next_idx not in buffer:
1577
+ if len(buffer) > 128:
1578
+ logging.warning("Writer reorder buffer too large (%d), applying backpressure (waiting for frame %d)...", len(buffer), next_idx)
1579
+ time.sleep(0.05)
1580
 
1581
  idx, frm = queue_out.get(timeout=1.0)
1582
  buffer[idx] = frm
1583
+
1584
  frm = buffer.pop(next_idx)
1585
  writer.write(frm)
1586
 
 
1905
  processed_frames_subset = [] # Keep first frame for saving if needed
1906
 
1907
  try:
1908
+ with StreamingVideoWriter(output_video_path, fps, width, height) as writer:
1909
  while next_idx < total_frames:
1910
  try:
1911
  while next_idx not in buffer:
1912
+ if len(buffer) > 128:
1913
+ logging.warning("Writer reorder buffer too large (%d), applying backpressure (waiting for frame %d)...", len(buffer), next_idx)
1914
+ time.sleep(0.05)
1915
  idx, frm = queue_out.get(timeout=1.0)
1916
  buffer[idx] = frm
1917
+
1918
  frm = buffer.pop(next_idx)
1919
  writer.write(frm)
1920
 
 
1924
  except:
1925
  pass
1926
 
1927
+
1928
  if first_frame_depth_path and not first_frame_saved and next_idx == 0:
1929
  cv2.imwrite(first_frame_depth_path, frm)
1930
  first_frame_saved = True
1931
+
1932
  next_idx += 1
1933
  if next_idx % 30 == 0:
1934
  logging.debug("Wrote depth frame %d/%d", next_idx, total_frames)
 
1942
 
1943
  w_thread = Thread(target=writer_loop, daemon=True)
1944
  w_thread.start()
1945
+
1946
  # Feeder
1947
  try:
1948
  reader_iter = iter(reader)
utils/video.py CHANGED
@@ -212,10 +212,10 @@ class VideoWriter:
212
  self.fps = fps
213
  self.width = width
214
  self.height = height
215
-
216
  self.temp_fd, self.temp_path = tempfile.mkstemp(prefix="raw_", suffix=".mp4")
217
  os.close(self.temp_fd)
218
-
219
  # Use mp4v for speed during writing, then transcode
220
  self.writer = cv2.VideoWriter(self.temp_path, cv2.VideoWriter_fourcc(*"mp4v"), self.fps, (self.width, self.height))
221
  if not self.writer.isOpened():
@@ -228,7 +228,7 @@ class VideoWriter:
228
  def close(self):
229
  if self.writer.isOpened():
230
  self.writer.release()
231
-
232
  # Transcode phase
233
  try:
234
  _transcode_with_ffmpeg(self.temp_path, self.output_path)
@@ -246,3 +246,78 @@ class VideoWriter:
246
 
247
  def __exit__(self, exc_type, exc_val, exc_tb):
248
  self.close()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
212
  self.fps = fps
213
  self.width = width
214
  self.height = height
215
+
216
  self.temp_fd, self.temp_path = tempfile.mkstemp(prefix="raw_", suffix=".mp4")
217
  os.close(self.temp_fd)
218
+
219
  # Use mp4v for speed during writing, then transcode
220
  self.writer = cv2.VideoWriter(self.temp_path, cv2.VideoWriter_fourcc(*"mp4v"), self.fps, (self.width, self.height))
221
  if not self.writer.isOpened():
 
228
  def close(self):
229
  if self.writer.isOpened():
230
  self.writer.release()
231
+
232
  # Transcode phase
233
  try:
234
  _transcode_with_ffmpeg(self.temp_path, self.output_path)
 
246
 
247
  def __exit__(self, exc_type, exc_val, exc_tb):
248
  self.close()
249
+
250
+
251
+ def _ffmpeg_available() -> bool:
252
+ """Check if ffmpeg is available on the system PATH."""
253
+ return shutil.which("ffmpeg") is not None
254
+
255
+
256
class StreamingVideoWriter:
    """
    Pipes raw BGR frames directly to an ffmpeg subprocess for H.264 encoding.
    Eliminates the cv2.VideoWriter + post-transcode round-trip.
    Falls back to VideoWriter if ffmpeg is unavailable or fails to start.

    Fix vs. previous revision: ffmpeg's stderr was attached to a PIPE that
    nothing drained while streaming. ffmpeg emits log text on stderr; once the
    OS pipe buffer filled, ffmpeg blocked writing stderr, stopped reading
    stdin, and ``write()`` deadlocked. stderr now goes to a temp log file
    (read back only when ffmpeg exits non-zero) and stdout to DEVNULL.
    """

    def __init__(self, output_path: str, fps: float, width: int, height: int):
        """Start an ffmpeg encoder expecting raw bgr24 frames of (width, height)."""
        self.output_path = output_path
        self._fallback = None
        self.proc = None
        self._stderr_path = None

        if not _ffmpeg_available():
            logging.warning("ffmpeg not found; StreamingVideoWriter falling back to VideoWriter.")
            self._fallback = VideoWriter(output_path, fps, width, height)
            return

        cmd = [
            "ffmpeg", "-y",
            "-loglevel", "error",  # suppress progress chatter; only real errors reach the log
            "-f", "rawvideo",
            "-pix_fmt", "bgr24",
            "-s", f"{width}x{height}",
            "-r", str(fps),
            "-i", "pipe:",
            "-c:v", "libx264",
            "-preset", "veryfast",
            "-pix_fmt", "yuv420p",
            "-movflags", "+faststart",
            output_path,
        ]
        # stderr -> temp file, NOT a pipe: an undrained stderr pipe fills up and
        # deadlocks ffmpeg mid-stream. The file is read back only if ffmpeg fails.
        stderr_fd, self._stderr_path = tempfile.mkstemp(prefix="ffmpeg_err_", suffix=".log")
        try:
            self.proc = subprocess.Popen(
                cmd,
                stdin=subprocess.PIPE,
                stdout=subprocess.DEVNULL,
                stderr=stderr_fd,
            )
        except OSError as e:
            logging.warning("Failed to start ffmpeg (%s); falling back to VideoWriter.", e)
            self._fallback = VideoWriter(output_path, fps, width, height)
            self._remove_stderr_log()
        finally:
            # Popen dup'd the descriptor (or we fell back); our copy can go.
            os.close(stderr_fd)

    def _remove_stderr_log(self):
        """Best-effort removal of the temporary ffmpeg stderr log file."""
        if self._stderr_path:
            try:
                os.remove(self._stderr_path)
            except OSError:
                pass
            self._stderr_path = None

    def write(self, frame: np.ndarray):
        """Feed one BGR uint8 frame (height x width x 3) to the encoder.

        Frames are dropped (with an error log) if the ffmpeg pipe has broken.
        """
        if self._fallback is not None:
            self._fallback.write(frame)
            return
        try:
            self.proc.stdin.write(frame.tobytes())
        except (BrokenPipeError, OSError):
            # ffmpeg died; surface the condition but keep the pipeline alive.
            logging.error("ffmpeg pipe broken; frames may be lost.")

    def close(self):
        """Flush the encoder, wait for ffmpeg to exit, and report any failure."""
        if self._fallback is not None:
            self._fallback.close()
            return
        if self.proc is None:
            return
        try:
            self.proc.stdin.close()  # signals EOF so ffmpeg finalizes the mp4
        except OSError:
            pass
        self.proc.wait()
        if self.proc.returncode != 0:
            stderr_text = ""
            try:
                with open(self._stderr_path, "r", encoding="utf-8", errors="ignore") as fh:
                    stderr_text = fh.read()
            except (OSError, TypeError):
                pass
            logging.error("StreamingVideoWriter ffmpeg exited with code %d: %s",
                          self.proc.returncode, stderr_text)
        self._remove_stderr_log()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()