Zhen Ye committed on
Commit
301e154
·
1 Parent(s): a4e3c2b

feat(ttfs): instrument segmentation time-to-first-stream

Browse files
app.py CHANGED
@@ -28,6 +28,7 @@ except Exception as e:
28
  import asyncio
29
  import shutil
30
  import tempfile
 
31
  import uuid
32
  from contextlib import asynccontextmanager
33
  from datetime import timedelta
@@ -409,6 +410,8 @@ async def detect_async_endpoint(
409
  enable_depth: bool = Form(False),
410
  enable_gpt: bool = Form(True),
411
  ):
 
 
412
  if mode not in VALID_MODES:
413
  raise HTTPException(
414
  status_code=400,
@@ -434,6 +437,8 @@ async def detect_async_endpoint(
434
  finally:
435
  await video.close()
436
 
 
 
437
  # --- Mission-Driven Query Parsing ---
438
  mission_spec = None
439
  mission_mode = "LEGACY"
@@ -476,6 +481,8 @@ async def detect_async_endpoint(
476
  "LEGACY mode: no mission text, defaults=%s, GPT disabled", query_list
477
  )
478
 
 
 
479
  available_depth_estimators = set(list_depth_estimators())
480
  if depth_estimator not in available_depth_estimators:
481
  raise HTTPException(
@@ -490,6 +497,7 @@ async def detect_async_endpoint(
490
  active_depth = depth_estimator if enable_depth else None
491
 
492
  try:
 
493
  processed_frame, detections = process_first_frame(
494
  str(input_path),
495
  query_list,
@@ -498,6 +506,7 @@ async def detect_async_endpoint(
498
  segmenter_name=segmenter,
499
  )
500
  cv2.imwrite(str(first_frame_path), processed_frame)
 
501
  # GPT and depth are now handled in the async pipeline (enrichment thread)
502
  first_frame_gpt_results = None
503
  except Exception:
@@ -524,6 +533,7 @@ async def detect_async_endpoint(
524
  mission_spec=mission_spec,
525
  mission_mode=mission_mode,
526
  first_frame_gpt_results=first_frame_gpt_results,
 
527
  )
528
  get_job_storage().create(job)
529
  asyncio.create_task(process_video_async(job_id))
@@ -771,6 +781,15 @@ async def stream_video(job_id: str):
771
  loop = asyncio.get_running_loop()
772
  buffered = False
773
 
 
 
 
 
 
 
 
 
 
774
  # Get or create the asyncio.Event for this stream (must be in async context)
775
  event = get_stream_event(job_id)
776
 
@@ -782,10 +801,15 @@ async def stream_video(job_id: str):
782
  try:
783
  # Initial Buffer: Wait until we have enough frames or job is done
784
  if not buffered:
 
 
 
785
  if q.qsize() < 5:
786
  await asyncio.sleep(0.1)
787
  continue
788
  buffered = True
 
 
789
 
790
  # Event-driven wait — replaces busy-wait polling
791
  if event is not None:
@@ -811,6 +835,10 @@ async def stream_video(job_id: str):
811
  success, buffer = await loop.run_in_executor(None, cv2.imencode, '.jpg', frame, encode_param)
812
 
813
  if success:
 
 
 
 
814
  yield (b'--frame\r\n'
815
  b'Content-Type: image/jpeg\r\n\r\n' + buffer.tobytes() + b'\r\n')
816
 
 
28
  import asyncio
29
  import shutil
30
  import tempfile
31
+ import time
32
  import uuid
33
  from contextlib import asynccontextmanager
34
  from datetime import timedelta
 
410
  enable_depth: bool = Form(False),
411
  enable_gpt: bool = Form(True),
412
  ):
413
+ _ttfs_t0 = time.perf_counter()
414
+
415
  if mode not in VALID_MODES:
416
  raise HTTPException(
417
  status_code=400,
 
437
  finally:
438
  await video.close()
439
 
440
+ logging.info("[TTFS:%s] +%.1fs upload_saved", job_id, time.perf_counter() - _ttfs_t0)
441
+
442
  # --- Mission-Driven Query Parsing ---
443
  mission_spec = None
444
  mission_mode = "LEGACY"
 
481
  "LEGACY mode: no mission text, defaults=%s, GPT disabled", query_list
482
  )
483
 
484
+ logging.info("[TTFS:%s] +%.1fs mission_parsed", job_id, time.perf_counter() - _ttfs_t0)
485
+
486
  available_depth_estimators = set(list_depth_estimators())
487
  if depth_estimator not in available_depth_estimators:
488
  raise HTTPException(
 
497
  active_depth = depth_estimator if enable_depth else None
498
 
499
  try:
500
+ logging.info("[TTFS:%s] +%.1fs process_first_frame start", job_id, time.perf_counter() - _ttfs_t0)
501
  processed_frame, detections = process_first_frame(
502
  str(input_path),
503
  query_list,
 
506
  segmenter_name=segmenter,
507
  )
508
  cv2.imwrite(str(first_frame_path), processed_frame)
509
+ logging.info("[TTFS:%s] +%.1fs process_first_frame done", job_id, time.perf_counter() - _ttfs_t0)
510
  # GPT and depth are now handled in the async pipeline (enrichment thread)
511
  first_frame_gpt_results = None
512
  except Exception:
 
533
  mission_spec=mission_spec,
534
  mission_mode=mission_mode,
535
  first_frame_gpt_results=first_frame_gpt_results,
536
+ ttfs_t0=_ttfs_t0,
537
  )
538
  get_job_storage().create(job)
539
  asyncio.create_task(process_video_async(job_id))
 
781
  loop = asyncio.get_running_loop()
782
  buffered = False
783
 
784
+ # TTFS instrumentation
785
+ _first_yielded = False
786
+ _buffer_wait_logged = False
787
+ _job = get_job_storage().get(job_id)
788
+ _stream_t0 = _job.ttfs_t0 if _job else None
789
+
790
+ if _stream_t0:
791
+ logging.info("[TTFS:%s] +%.1fs stream_subscribed", job_id, time.perf_counter() - _stream_t0)
792
+
793
  # Get or create the asyncio.Event for this stream (must be in async context)
794
  event = get_stream_event(job_id)
795
 
 
801
  try:
802
  # Initial Buffer: Wait until we have enough frames or job is done
803
  if not buffered:
804
+ if not _buffer_wait_logged and _stream_t0:
805
+ logging.info("[TTFS:%s] +%.1fs stream_buffer_wait (qsize=%d)", job_id, time.perf_counter() - _stream_t0, q.qsize())
806
+ _buffer_wait_logged = True
807
  if q.qsize() < 5:
808
  await asyncio.sleep(0.1)
809
  continue
810
  buffered = True
811
+ if _stream_t0:
812
+ logging.info("[TTFS:%s] +%.1fs stream_buffer_ready", job_id, time.perf_counter() - _stream_t0)
813
 
814
  # Event-driven wait — replaces busy-wait polling
815
  if event is not None:
 
835
  success, buffer = await loop.run_in_executor(None, cv2.imencode, '.jpg', frame, encode_param)
836
 
837
  if success:
838
+ if not _first_yielded:
839
+ _first_yielded = True
840
+ if _stream_t0:
841
+ logging.info("[TTFS:%s] +%.1fs first_yield_to_client", job_id, time.perf_counter() - _stream_t0)
842
  yield (b'--frame\r\n'
843
  b'Content-Type: image/jpeg\r\n\r\n' + buffer.tobytes() + b'\r\n')
844
 
inference.py CHANGED
@@ -1277,6 +1277,7 @@ def run_grounded_sam2_tracking(
1277
  _perf_lock=None,
1278
  num_maskmem: Optional[int] = None,
1279
  detector_name: Optional[str] = None,
 
1280
  ) -> str:
1281
  """Run Grounded-SAM-2 video tracking pipeline.
1282
 
@@ -1292,6 +1293,13 @@ def run_grounded_sam2_tracking(
1292
  from models.segmenters.grounded_sam2 import MaskDictionary, ObjectInfo, LazyFrameObjects
1293
 
1294
  active_segmenter = segmenter_name or "GSAM2-L"
 
 
 
 
 
 
 
1295
  logging.info(
1296
  "Grounded-SAM-2 tracking: segmenter=%s, queries=%s, step=%d",
1297
  active_segmenter, queries, step,
@@ -1315,6 +1323,7 @@ def run_grounded_sam2_tracking(
1315
  if _perf_metrics is not None:
1316
  _perf_metrics["frame_extraction_ms"] = (time.perf_counter() - _t_ext) * 1000.0
1317
  total_frames = len(frame_names)
 
1318
  logging.info("Extracted %d frames to %s", total_frames, frame_dir)
1319
 
1320
  num_gpus = torch.cuda.device_count()
@@ -1474,6 +1483,7 @@ def run_grounded_sam2_tracking(
1474
 
1475
  def _writer_loop():
1476
  nonlocal render_done
 
1477
  next_idx = 0
1478
  buf: Dict[int, Tuple] = {}
1479
 
@@ -1615,6 +1625,9 @@ def run_grounded_sam2_tracking(
1615
  if stream_queue or job_id:
1616
  with _stream_lock:
1617
  _stream_deque.append(frm)
 
 
 
1618
 
1619
  next_idx += 1
1620
  if next_idx % 30 == 0:
@@ -1679,6 +1692,7 @@ def run_grounded_sam2_tracking(
1679
  "accumulated=%d frames in %.1fs",
1680
  r_prod, r_stream, accumulated, elapsed,
1681
  )
 
1682
 
1683
  # --- Phase 2: adaptive streaming ---
1684
  last_adjust = time.perf_counter()
@@ -1702,6 +1716,8 @@ def run_grounded_sam2_tracking(
1702
  stream_queue.put(frame, timeout=0.01)
1703
  except Exception:
1704
  pass
 
 
1705
  published += 1
1706
  last_publish_time = time.perf_counter()
1707
  time.sleep(frame_interval)
@@ -1757,6 +1773,8 @@ def run_grounded_sam2_tracking(
1757
  _publisher_thread = Thread(target=_stream_publisher_thread, daemon=True)
1758
  _publisher_thread.start()
1759
 
 
 
1760
  # ==================================================================
1761
  # Phase 1-4: Tracking (single-GPU fallback vs multi-GPU pipeline)
1762
  # Segments are fed incrementally to render_in as they complete.
@@ -1780,6 +1798,8 @@ def run_grounded_sam2_tracking(
1780
  segmenter._perf_metrics = _perf_metrics
1781
  segmenter._perf_lock = None
1782
 
 
 
1783
  if _perf_metrics is not None:
1784
  _t_track = time.perf_counter()
1785
 
@@ -1797,10 +1817,13 @@ def run_grounded_sam2_tracking(
1797
  seen.add(fi)
1798
  render_in.put((fi, LazyFrameObjects(segment_output, fi)))
1799
 
 
1800
  tracking_results = segmenter.process_video(
1801
  frame_dir, frame_names, queries,
1802
  on_segment=_feed_segment,
1803
  on_segment_output=_feed_segment_gpu,
 
 
1804
  )
1805
 
1806
  if _perf_metrics is not None:
@@ -1846,6 +1869,8 @@ def run_grounded_sam2_tracking(
1846
  seg._perf_metrics = _perf_metrics
1847
  seg._perf_lock = _actual_lock
1848
 
 
 
1849
  # Phase 2: Init SAM2 models/state per GPU (parallel)
1850
  if _perf_metrics is not None:
1851
  _t_init = time.perf_counter()
@@ -1866,6 +1891,8 @@ def run_grounded_sam2_tracking(
1866
  _perf_metrics["init_state_ms"] = (time.perf_counter() - _t_init) * 1000.0
1867
  _t_track = time.perf_counter()
1868
 
 
 
1869
  # Phase 3: Parallel segment processing (queue-based workers)
1870
  segments = list(range(0, total_frames, step))
1871
  num_total_segments = len(segments)
@@ -2028,6 +2055,8 @@ def run_grounded_sam2_tracking(
2028
  else {}
2029
  )
2030
  render_in.put((fi, tracking_results.get(fi, {})))
 
 
2031
  next_seg_idx += 1
2032
  continue
2033
 
@@ -2055,6 +2084,8 @@ def run_grounded_sam2_tracking(
2055
  ):
2056
  tracking_results[fi] = {}
2057
  render_in.put((fi, {}))
 
 
2058
  next_seg_idx += 1
2059
  continue
2060
 
@@ -2091,6 +2122,8 @@ def run_grounded_sam2_tracking(
2091
  LazyFrameObjects(segment_output, fi, remapping),
2092
  ))
2093
 
 
 
2094
  next_seg_idx += 1
2095
 
2096
  for t in seg_workers:
 
1277
  _perf_lock=None,
1278
  num_maskmem: Optional[int] = None,
1279
  detector_name: Optional[str] = None,
1280
+ _ttfs_t0: Optional[float] = None,
1281
  ) -> str:
1282
  """Run Grounded-SAM-2 video tracking pipeline.
1283
 
 
1293
  from models.segmenters.grounded_sam2 import MaskDictionary, ObjectInfo, LazyFrameObjects
1294
 
1295
  active_segmenter = segmenter_name or "GSAM2-L"
1296
+
1297
+ def _ttfs(msg):
1298
+ if _ttfs_t0 is not None:
1299
+ logging.info("[TTFS:%s] +%.1fs %s", job_id, time.perf_counter() - _ttfs_t0, msg)
1300
+
1301
+ _ttfs("enter run_grounded_sam2_tracking")
1302
+
1303
  logging.info(
1304
  "Grounded-SAM-2 tracking: segmenter=%s, queries=%s, step=%d",
1305
  active_segmenter, queries, step,
 
1323
  if _perf_metrics is not None:
1324
  _perf_metrics["frame_extraction_ms"] = (time.perf_counter() - _t_ext) * 1000.0
1325
  total_frames = len(frame_names)
1326
+ _ttfs(f"frame_extraction done ({total_frames} frames)")
1327
  logging.info("Extracted %d frames to %s", total_frames, frame_dir)
1328
 
1329
  num_gpus = torch.cuda.device_count()
 
1483
 
1484
  def _writer_loop():
1485
  nonlocal render_done
1486
+ _first_deposit = False
1487
  next_idx = 0
1488
  buf: Dict[int, Tuple] = {}
1489
 
 
1625
  if stream_queue or job_id:
1626
  with _stream_lock:
1627
  _stream_deque.append(frm)
1628
+ if not _first_deposit:
1629
+ _first_deposit = True
1630
+ _ttfs("first_frame_deposited_to_deque")
1631
 
1632
  next_idx += 1
1633
  if next_idx % 30 == 0:
 
1692
  "accumulated=%d frames in %.1fs",
1693
  r_prod, r_stream, accumulated, elapsed,
1694
  )
1695
+ _ttfs(f"publisher: startup_wait done ({accumulated} frames in {elapsed:.1f}s)")
1696
 
1697
  # --- Phase 2: adaptive streaming ---
1698
  last_adjust = time.perf_counter()
 
1716
  stream_queue.put(frame, timeout=0.01)
1717
  except Exception:
1718
  pass
1719
+ if published == 0:
1720
+ _ttfs("first_publish_frame")
1721
  published += 1
1722
  last_publish_time = time.perf_counter()
1723
  time.sleep(frame_interval)
 
1773
  _publisher_thread = Thread(target=_stream_publisher_thread, daemon=True)
1774
  _publisher_thread.start()
1775
 
1776
+ _ttfs("writer+publisher threads started")
1777
+
1778
  # ==================================================================
1779
  # Phase 1-4: Tracking (single-GPU fallback vs multi-GPU pipeline)
1780
  # Segments are fed incrementally to render_in as they complete.
 
1798
  segmenter._perf_metrics = _perf_metrics
1799
  segmenter._perf_lock = None
1800
 
1801
+ _ttfs(f"model loaded ({active_segmenter})")
1802
+
1803
  if _perf_metrics is not None:
1804
  _t_track = time.perf_counter()
1805
 
 
1817
  seen.add(fi)
1818
  render_in.put((fi, LazyFrameObjects(segment_output, fi)))
1819
 
1820
+ _ttfs("process_video started")
1821
  tracking_results = segmenter.process_video(
1822
  frame_dir, frame_names, queries,
1823
  on_segment=_feed_segment,
1824
  on_segment_output=_feed_segment_gpu,
1825
+ _ttfs_t0=_ttfs_t0,
1826
+ _ttfs_job_id=job_id,
1827
  )
1828
 
1829
  if _perf_metrics is not None:
 
1869
  seg._perf_metrics = _perf_metrics
1870
  seg._perf_lock = _actual_lock
1871
 
1872
+ _ttfs(f"model loaded ({active_segmenter}, {num_gpus} GPUs)")
1873
+
1874
  # Phase 2: Init SAM2 models/state per GPU (parallel)
1875
  if _perf_metrics is not None:
1876
  _t_init = time.perf_counter()
 
1891
  _perf_metrics["init_state_ms"] = (time.perf_counter() - _t_init) * 1000.0
1892
  _t_track = time.perf_counter()
1893
 
1894
+ _ttfs("multi-GPU tracking started")
1895
+
1896
  # Phase 3: Parallel segment processing (queue-based workers)
1897
  segments = list(range(0, total_frames, step))
1898
  num_total_segments = len(segments)
 
2055
  else {}
2056
  )
2057
  render_in.put((fi, tracking_results.get(fi, {})))
2058
+ if next_seg_idx == 0:
2059
+ _ttfs("first_segment_reconciled (multi-GPU, no detections)")
2060
  next_seg_idx += 1
2061
  continue
2062
 
 
2084
  ):
2085
  tracking_results[fi] = {}
2086
  render_in.put((fi, {}))
2087
+ if next_seg_idx == 0:
2088
+ _ttfs("first_segment_reconciled (multi-GPU, empty masks)")
2089
  next_seg_idx += 1
2090
  continue
2091
 
 
2122
  LazyFrameObjects(segment_output, fi, remapping),
2123
  ))
2124
 
2125
+ if next_seg_idx == 0:
2126
+ _ttfs("first_segment_reconciled (multi-GPU)")
2127
  next_seg_idx += 1
2128
 
2129
  for t in seg_workers:
jobs/background.py CHANGED
@@ -40,6 +40,7 @@ async def process_video_async(job_id: str) -> None:
40
  first_frame_gpt_results=job.first_frame_gpt_results,
41
  num_maskmem=7,
42
  detector_name=job.detector_name,
 
43
  )
44
  else:
45
  detections_list = None
 
40
  first_frame_gpt_results=job.first_frame_gpt_results,
41
  num_maskmem=7,
42
  detector_name=job.detector_name,
43
+ _ttfs_t0=job.ttfs_t0,
44
  )
45
  else:
46
  detections_list = None
jobs/models.py CHANGED
@@ -38,3 +38,4 @@ class JobInfo:
38
  mission_spec: Optional[Any] = None # utils.schemas.MissionSpecification
39
  mission_mode: str = "LEGACY" # "MISSION" or "LEGACY"
40
  first_frame_gpt_results: Optional[Dict[str, Any]] = None # Cached GPT results from process_first_frame
 
 
38
  mission_spec: Optional[Any] = None # utils.schemas.MissionSpecification
39
  mission_mode: str = "LEGACY" # "MISSION" or "LEGACY"
40
  first_frame_gpt_results: Optional[Dict[str, Any]] = None # Cached GPT results from process_first_frame
41
+ ttfs_t0: Optional[float] = None # TTFS anchor: time.perf_counter() at endpoint entry
models/segmenters/grounded_sam2.py CHANGED
@@ -715,6 +715,8 @@ class GroundedSAM2Segmenter(Segmenter):
715
  text_prompts: List[str],
716
  on_segment: Optional[Callable[[Dict[int, Dict[int, "ObjectInfo"]]], None]] = None,
717
  on_segment_output: Optional[Callable[["SegmentOutput"], None]] = None,
 
 
718
  ) -> Dict[int, Dict[int, ObjectInfo]]:
719
  """Run full Grounded-SAM-2 tracking pipeline on extracted JPEG frames.
720
 
@@ -819,6 +821,9 @@ class GroundedSAM2Segmenter(Segmenter):
819
  seg_results[fi] = all_results[fi]
820
  if on_segment and seg_results:
821
  on_segment(seg_results)
 
 
 
822
  continue
823
 
824
  # -- SAM2 image predictor on keyframe --
@@ -867,6 +872,9 @@ class GroundedSAM2Segmenter(Segmenter):
867
  seg_results_empty[fi] = {}
868
  if on_segment:
869
  on_segment(seg_results_empty)
 
 
 
870
  continue
871
 
872
  # -- SAM2 video predictor: propagate masks --
@@ -892,6 +900,9 @@ class GroundedSAM2Segmenter(Segmenter):
892
  sam2_masks.mask_width = first_info.mask.shape[-1] if first_info.mask.ndim >= 2 else 0
893
  if on_segment_output is not None:
894
  on_segment_output(segment_output)
 
 
 
895
 
896
  logging.info(
897
  "Grounded-SAM-2 tracking complete: %d frames, %d tracked objects",
 
715
  text_prompts: List[str],
716
  on_segment: Optional[Callable[[Dict[int, Dict[int, "ObjectInfo"]]], None]] = None,
717
  on_segment_output: Optional[Callable[["SegmentOutput"], None]] = None,
718
+ _ttfs_t0: Optional[float] = None,
719
+ _ttfs_job_id: Optional[str] = None,
720
  ) -> Dict[int, Dict[int, ObjectInfo]]:
721
  """Run full Grounded-SAM-2 tracking pipeline on extracted JPEG frames.
722
 
 
821
  seg_results[fi] = all_results[fi]
822
  if on_segment and seg_results:
823
  on_segment(seg_results)
824
+ if start_idx == 0 and _ttfs_t0 is not None:
825
+ logging.info("[TTFS:%s] +%.1fs first_segment_complete (no detections, step=%d)",
826
+ _ttfs_job_id, time.perf_counter() - _ttfs_t0, step)
827
  continue
828
 
829
  # -- SAM2 image predictor on keyframe --
 
872
  seg_results_empty[fi] = {}
873
  if on_segment:
874
  on_segment(seg_results_empty)
875
+ if start_idx == 0 and _ttfs_t0 is not None:
876
+ logging.info("[TTFS:%s] +%.1fs first_segment_complete (empty masks, step=%d)",
877
+ _ttfs_job_id, time.perf_counter() - _ttfs_t0, step)
878
  continue
879
 
880
  # -- SAM2 video predictor: propagate masks --
 
900
  sam2_masks.mask_width = first_info.mask.shape[-1] if first_info.mask.ndim >= 2 else 0
901
  if on_segment_output is not None:
902
  on_segment_output(segment_output)
903
+ if start_idx == 0 and _ttfs_t0 is not None:
904
+ logging.info("[TTFS:%s] +%.1fs first_segment_complete (step=%d)",
905
+ _ttfs_job_id, time.perf_counter() - _ttfs_t0, step)
906
 
907
  logging.info(
908
  "Grounded-SAM-2 tracking complete: %d frames, %d tracked objects",