Spaces:

BiasLab2025
/

detection_base

Paused

App Files Files Community

Zhen Ye committed on Feb 19

Commit

74ae00c

1 Parent(s): 517108e

feat: add detailed timing metrics to GroundedSAM2 segmenter

Browse files

Files changed (6) hide show

app.py +75 -20
frontend/js/api/client.js +0 -12
inference.py +64 -4
jobs/background.py +5 -28
jobs/models.py +0 -6
models/segmenters/grounded_sam2.py +81 -0

app.py CHANGED Viewed

@@ -28,7 +28,6 @@ except Exception as e:
 import asyncio
 import shutil
 import tempfile
-import time
 import uuid
 from contextlib import asynccontextmanager
 from datetime import timedelta
@@ -89,7 +88,6 @@ async def _enrich_first_frame_gpt(
     """
     if not enable_gpt or not detections:
         return
-    t_gpt_start = time.monotonic()
     try:
         # Non-LLM_EXTRACTED relevance filter runs BEFORE run_enrichment (FAST_PATH case)
         if mission_spec and mission_spec.parse_mode != "LLM_EXTRACTED":
@@ -101,12 +99,9 @@ async def _enrich_first_frame_gpt(
             if not filtered:
                 for det in detections:
                     det["assessment_status"] = AssessmentStatus.ASSESSED
-                gpt_elapsed = time.monotonic() - t_gpt_start
-                logging.info("TIMING gpt_first_frame=%.3fs for job %s (all non-relevant)", gpt_elapsed, job_id)
                 get_job_storage().update(
                     job_id,
                     first_frame_detections=detections,
-                    timing_gpt_first_frame_s=gpt_elapsed,
                 )
                 logging.info("All detections non-relevant for job %s; marked ASSESSED", job_id)
                 return
@@ -115,8 +110,6 @@ async def _enrich_first_frame_gpt(
             run_enrichment, 0, frame, detections, mission_spec,
             job_id=job_id,
         )
-        gpt_elapsed = time.monotonic() - t_gpt_start
-        logging.info("TIMING gpt_first_frame=%.3fs for job %s", gpt_elapsed, job_id)
         logging.info("Background GPT enrichment complete for job %s", job_id)
         if not gpt_results:
@@ -126,7 +119,6 @@ async def _enrich_first_frame_gpt(
             get_job_storage().update(
                 job_id,
                 first_frame_detections=detections,
-                timing_gpt_first_frame_s=gpt_elapsed,
             )
             logging.info("All detections non-relevant for job %s; marked ASSESSED", job_id)
             return
@@ -141,7 +133,6 @@ async def _enrich_first_frame_gpt(
             job_id,
             first_frame_detections=detections,
             first_frame_gpt_results=gpt_results,
-            timing_gpt_first_frame_s=gpt_elapsed,
         )
         logging.info("Updated first_frame_detections with GPT results for job %s", job_id)
@@ -425,7 +416,6 @@ async def detect_async_endpoint(
         raise HTTPException(status_code=400, detail="Video file is required.")
     job_id = uuid.uuid4().hex
-    t_job_start = time.monotonic()
     job_dir = get_job_directory(job_id)
     input_path = get_input_video_path(job_id)
     output_path = get_output_video_path(job_id)
@@ -498,8 +488,6 @@ async def detect_async_endpoint(
             segmenter_name=segmenter,
         )
         cv2.imwrite(str(first_frame_path), processed_frame)
-        first_frame_elapsed = time.monotonic() - t_job_start
-        logging.info("TIMING first_frame=%.3fs for job %s", first_frame_elapsed, job_id)
         # GPT and depth are now handled in the async pipeline (enrichment thread)
         depth_map = None
         first_frame_gpt_results = None
@@ -527,7 +515,6 @@ async def detect_async_endpoint(
         mission_spec=mission_spec,
         mission_mode=mission_mode,
         first_frame_gpt_results=first_frame_gpt_results,
-        timing_first_frame_s=first_frame_elapsed,
     )
     get_job_storage().create(job)
     asyncio.create_task(process_video_async(job_id))
@@ -582,13 +569,6 @@ async def detect_status(job_id: str):
         "completed_at": job.completed_at.isoformat() if job.completed_at else None,
         "error": job.error,
         "first_frame_detections": job.first_frame_detections,
-        "timing": {
-            "first_frame_s": job.timing_first_frame_s,
-            "video_processing_s": job.timing_video_processing_s,
-            "gpt_first_frame_s": job.timing_gpt_first_frame_s,
-            "gpt_enrichment_s": job.timing_gpt_enrichment_s,
-            "total_s": job.timing_total_s,
-        },
     }
@@ -871,5 +851,80 @@ async def chat_threat_endpoint(
         raise HTTPException(status_code=500, detail=str(e))
 if __name__ == "__main__":
     uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=False)

 import asyncio
 import shutil
 import tempfile
 import uuid
 from contextlib import asynccontextmanager
 from datetime import timedelta
     """
     if not enable_gpt or not detections:
         return
     try:
         # Non-LLM_EXTRACTED relevance filter runs BEFORE run_enrichment (FAST_PATH case)
         if mission_spec and mission_spec.parse_mode != "LLM_EXTRACTED":
             if not filtered:
                 for det in detections:
                     det["assessment_status"] = AssessmentStatus.ASSESSED
                 get_job_storage().update(
                     job_id,
                     first_frame_detections=detections,
                 )
                 logging.info("All detections non-relevant for job %s; marked ASSESSED", job_id)
                 return
             run_enrichment, 0, frame, detections, mission_spec,
             job_id=job_id,
         )
         logging.info("Background GPT enrichment complete for job %s", job_id)
         if not gpt_results:
             get_job_storage().update(
                 job_id,
                 first_frame_detections=detections,
             )
             logging.info("All detections non-relevant for job %s; marked ASSESSED", job_id)
             return
             job_id,
             first_frame_detections=detections,
             first_frame_gpt_results=gpt_results,
         )
         logging.info("Updated first_frame_detections with GPT results for job %s", job_id)
         raise HTTPException(status_code=400, detail="Video file is required.")
     job_id = uuid.uuid4().hex
     job_dir = get_job_directory(job_id)
     input_path = get_input_video_path(job_id)
     output_path = get_output_video_path(job_id)
             segmenter_name=segmenter,
         )
         cv2.imwrite(str(first_frame_path), processed_frame)
         # GPT and depth are now handled in the async pipeline (enrichment thread)
         depth_map = None
         first_frame_gpt_results = None
         mission_spec=mission_spec,
         mission_mode=mission_mode,
         first_frame_gpt_results=first_frame_gpt_results,
     )
     get_job_storage().create(job)
     asyncio.create_task(process_video_async(job_id))
         "completed_at": job.completed_at.isoformat() if job.completed_at else None,
         "error": job.error,
         "first_frame_detections": job.first_frame_detections,
     }
         raise HTTPException(status_code=500, detail=str(e))
+@app.post("/benchmark")
+async def benchmark_endpoint(
+    video: UploadFile = File(...),
+    queries: str = Form("person,car,truck"),
+    segmenter: str = Form("gsam2_large"),
+    step: int = Form(20),
+):
+    """Run instrumented GSAM2 pipeline and return latency breakdown JSON.
+    This is a long-running synchronous request (may take minutes).
+    Callers should set an appropriate HTTP timeout.
+    """
+    import threading
+    # Save uploaded video to temp path
+    input_path = tempfile.mktemp(suffix=".mp4", prefix="bench_in_")
+    output_path = tempfile.mktemp(suffix=".mp4", prefix="bench_out_")
+    try:
+        with open(input_path, "wb") as f:
+            shutil.copyfileobj(video.file, f)
+        query_list = [q.strip() for q in queries.split(",") if q.strip()]
+        metrics = {
+            "end_to_end_ms": 0.0,
+            "frame_extraction_ms": 0.0,
+            "tracking_total_ms": 0.0,
+            "gdino_total_ms": 0.0,
+            "sam_image_total_ms": 0.0,
+            "sam_video_total_ms": 0.0,
+            "id_reconciliation_ms": 0.0,
+            "render_total_ms": 0.0,
+            "writer_total_ms": 0.0,
+            "gpu_peak_mem_mb": 0.0,
+        }
+        lock = threading.Lock()
+        await asyncio.to_thread(
+            run_grounded_sam2_tracking,
+            input_path,
+            output_path,
+            query_list,
+            segmenter_name=segmenter,
+            step=step,
+            enable_gpt=False,
+            _perf_metrics=metrics,
+            _perf_lock=lock,
+        )
+        # Read frame count and fps from output video
+        total_frames = 0
+        fps = 0.0
+        cap = cv2.VideoCapture(output_path)
+        if cap.isOpened():
+            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+            fps = cap.get(cv2.CAP_PROP_FPS) or 0.0
+            cap.release()
+        num_gpus = torch.cuda.device_count()
+        return JSONResponse({
+            "total_frames": total_frames,
+            "fps": fps,
+            "num_gpus": num_gpus,
+            "metrics": metrics,
+        })
+    finally:
+        for p in (input_path, output_path):
+            try:
+                os.remove(p)
+            except OSError:
+                pass
 if __name__ == "__main__":
     uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=False)

frontend/js/api/client.js CHANGED Viewed

@@ -192,18 +192,6 @@ APP.api.client.pollAsyncJob = async function () {
                         syncGpt(status.first_frame_detections, "final sync");
                     }
-                    // Display timing summary
-                    if (status.timing) {
-                        const t = status.timing;
-                        const parts = [];
-                        if (t.first_frame_s != null) parts.push(`1st frame: ${t.first_frame_s.toFixed(2)}s`);
-                        if (t.video_processing_s != null) parts.push(`video: ${t.video_processing_s.toFixed(2)}s`);
-                        if (t.gpt_first_frame_s != null) parts.push(`GPT: ${t.gpt_first_frame_s.toFixed(2)}s`);
-                        if (t.gpt_enrichment_s != null) parts.push(`GPT enrich: ${t.gpt_enrichment_s.toFixed(2)}s`);
-                        if (t.total_s != null) parts.push(`total: ${t.total_s.toFixed(2)}s`);
-                        if (parts.length) log(`Timing: ${parts.join(" | ")}`, "t");
-                    }
                     try {
                         await fetchProcessedVideo();
                         await fetchDepthVideo();

                         syncGpt(status.first_frame_detections, "final sync");
                     }
                     try {
                         await fetchProcessedVideo();
                         await fetchDepthVideo();

inference.py CHANGED Viewed

@@ -1629,6 +1629,8 @@ def run_grounded_sam2_tracking(
     enable_gpt: bool = False,
     mission_spec=None,  # Optional[MissionSpecification]
     first_frame_gpt_results: Optional[Dict[str, Any]] = None,
 ) -> str:
     """Run Grounded-SAM-2 video tracking pipeline.
@@ -1652,9 +1654,20 @@ def run_grounded_sam2_tracking(
     # 1. Extract frames to JPEG directory
     frame_dir = tempfile.mkdtemp(prefix="gsam2_frames_")
     try:
         frame_names, fps, width, height = extract_frames_to_jpeg_dir(
             input_video_path, frame_dir, max_frames=max_frames,
         )
         total_frames = len(frame_names)
         logging.info("Extracted %d frames to %s", total_frames, frame_dir)
@@ -1668,9 +1681,21 @@ def run_grounded_sam2_tracking(
             device_str = "cuda:0" if torch.cuda.is_available() else "cpu"
             segmenter = load_segmenter_on_device(active_segmenter, device_str)
             _check_cancellation(job_id)
             tracking_results = segmenter.process_video(
                 frame_dir, frame_names, queries,
             )
             logging.info(
                 "Single-GPU tracking complete: %d frames",
                 len(tracking_results),
@@ -1696,6 +1721,13 @@ def run_grounded_sam2_tracking(
                 segmenters = [f.result() for f in futs]
             logging.info("Loaded %d segmenters", len(segmenters))
             # Phase 2: Init SAM2 models/state per GPU (parallel)
             def _init_seg_state(seg):
                 seg._ensure_models_loaded()
@@ -1709,6 +1741,9 @@ def run_grounded_sam2_tracking(
                 futs = [pool.submit(_init_seg_state, seg) for seg in segmenters]
                 inference_states = [f.result() for f in futs]
             # Phase 3: Parallel segment processing (queue-based workers)
             segments = list(range(0, total_frames, step))
             seg_queue_in: Queue = Queue()
@@ -1822,6 +1857,9 @@ def run_grounded_sam2_tracking(
                 segment_data[seg_idx] = (start_idx, mask_dict, results)
             # Phase 4: Sequential ID reconciliation
             global_id_counter = 0
             sam2_masks = MaskDictionary()
             tracking_results: Dict[int, Dict[int, ObjectInfo]] = {}
@@ -1909,6 +1947,10 @@ def run_grounded_sam2_tracking(
                                 m.shape[-1] if m.ndim >= 2 else 0
                             )
             logging.info(
                 "Multi-GPU reconciliation complete: %d frames, %d objects",
                 len(tracking_results), global_id_counter,
@@ -1931,11 +1973,22 @@ def run_grounded_sam2_tracking(
                     break
                 fidx, fobjs = item
                 try:
                     frm = _gsam2_render_frame(
                         frame_dir, frame_names, fidx, fobjs,
                         height, width,
                         masks_only=enable_gpt,
                     )
                     payload = (fidx, frm, fobjs) if enable_gpt else (fidx, frm, {})
                     while True:
                         try:
@@ -1985,15 +2038,12 @@ def run_grounded_sam2_tracking(
                     break
                 frame_idx, frame_data, gpt_dets, ms = item
                 try:
-                    t_enrich_start = time.monotonic()
                     gpt_res = run_enrichment(
                         frame_idx, frame_data, gpt_dets, ms,
                         first_frame_gpt_results=first_frame_gpt_results,
                         job_id=job_id,
                         relevance_refined_event=_relevance_refined,
                     )
-                    gpt_enrichment_elapsed = time.monotonic() - t_enrich_start
-                    logging.info("TIMING gpt_enrichment=%.3fs for job %s", gpt_enrichment_elapsed, job_id)
                     # GSAM2-specific: store results in per-track dict and persist to job storage
                     if gpt_res:
@@ -2035,7 +2085,6 @@ def run_grounded_sam2_tracking(
                                         job_id,
                                         first_frame_detections=_st.first_frame_detections,
                                         first_frame_gpt_results=gpt_res,
-                                        timing_gpt_enrichment_s=gpt_enrichment_elapsed,
                                     )
                                     logging.info(
                                         "GSAM2 enrichment: updated first_frame_detections in job storage for %s",
@@ -2180,8 +2229,14 @@ def run_grounded_sam2_tracking(
                                 if job_id:
                                     set_track_data(job_id, next_idx, [])
                             writer.write(frm)
                             if stream_queue:
                                 try:
                                     from jobs.streaming import (
@@ -2237,6 +2292,11 @@ def run_grounded_sam2_tracking(
             t.join()
         writer_thread.join()
         logging.info("Grounded-SAM-2 output written to: %s", output_video_path)
         return output_video_path

     enable_gpt: bool = False,
     mission_spec=None,  # Optional[MissionSpecification]
     first_frame_gpt_results: Optional[Dict[str, Any]] = None,
+    _perf_metrics: Optional[Dict[str, float]] = None,
+    _perf_lock=None,
 ) -> str:
     """Run Grounded-SAM-2 video tracking pipeline.
     # 1. Extract frames to JPEG directory
     frame_dir = tempfile.mkdtemp(prefix="gsam2_frames_")
     try:
+        if _perf_metrics is not None:
+            _t_e2e = time.perf_counter()
+            if torch.cuda.is_available():
+                torch.cuda.reset_peak_memory_stats()
+        if _perf_metrics is not None:
+            _t_ext = time.perf_counter()
         frame_names, fps, width, height = extract_frames_to_jpeg_dir(
             input_video_path, frame_dir, max_frames=max_frames,
         )
+        if _perf_metrics is not None:
+            _perf_metrics["frame_extraction_ms"] = (time.perf_counter() - _t_ext) * 1000.0
         total_frames = len(frame_names)
         logging.info("Extracted %d frames to %s", total_frames, frame_dir)
             device_str = "cuda:0" if torch.cuda.is_available() else "cpu"
             segmenter = load_segmenter_on_device(active_segmenter, device_str)
             _check_cancellation(job_id)
+            if _perf_metrics is not None:
+                segmenter._perf_metrics = _perf_metrics
+                segmenter._perf_lock = None
+            if _perf_metrics is not None:
+                _t_track = time.perf_counter()
             tracking_results = segmenter.process_video(
                 frame_dir, frame_names, queries,
             )
+            if _perf_metrics is not None:
+                _perf_metrics["tracking_total_ms"] = (time.perf_counter() - _t_track) * 1000.0
             logging.info(
                 "Single-GPU tracking complete: %d frames",
                 len(tracking_results),
                 segmenters = [f.result() for f in futs]
             logging.info("Loaded %d segmenters", len(segmenters))
+            if _perf_metrics is not None:
+                import threading as _th
+                _actual_lock = _perf_lock or _th.Lock()
+                for seg in segmenters:
+                    seg._perf_metrics = _perf_metrics
+                    seg._perf_lock = _actual_lock
             # Phase 2: Init SAM2 models/state per GPU (parallel)
             def _init_seg_state(seg):
                 seg._ensure_models_loaded()
                 futs = [pool.submit(_init_seg_state, seg) for seg in segmenters]
                 inference_states = [f.result() for f in futs]
+            if _perf_metrics is not None:
+                _t_track = time.perf_counter()
             # Phase 3: Parallel segment processing (queue-based workers)
             segments = list(range(0, total_frames, step))
             seg_queue_in: Queue = Queue()
                 segment_data[seg_idx] = (start_idx, mask_dict, results)
             # Phase 4: Sequential ID reconciliation
+            if _perf_metrics is not None:
+                _t_recon = time.perf_counter()
             global_id_counter = 0
             sam2_masks = MaskDictionary()
             tracking_results: Dict[int, Dict[int, ObjectInfo]] = {}
                                 m.shape[-1] if m.ndim >= 2 else 0
                             )
+            if _perf_metrics is not None:
+                _perf_metrics["id_reconciliation_ms"] = (time.perf_counter() - _t_recon) * 1000.0
+                _perf_metrics["tracking_total_ms"] = (time.perf_counter() - _t_track) * 1000.0
             logging.info(
                 "Multi-GPU reconciliation complete: %d frames, %d objects",
                 len(tracking_results), global_id_counter,
                     break
                 fidx, fobjs = item
                 try:
+                    if _perf_metrics is not None:
+                        _t_r = time.perf_counter()
                     frm = _gsam2_render_frame(
                         frame_dir, frame_names, fidx, fobjs,
                         height, width,
                         masks_only=enable_gpt,
                     )
+                    if _perf_metrics is not None:
+                        _r_ms = (time.perf_counter() - _t_r) * 1000.0
+                        if _perf_lock:
+                            with _perf_lock: _perf_metrics["render_total_ms"] += _r_ms
+                        else:
+                            _perf_metrics["render_total_ms"] += _r_ms
                     payload = (fidx, frm, fobjs) if enable_gpt else (fidx, frm, {})
                     while True:
                         try:
                     break
                 frame_idx, frame_data, gpt_dets, ms = item
                 try:
                     gpt_res = run_enrichment(
                         frame_idx, frame_data, gpt_dets, ms,
                         first_frame_gpt_results=first_frame_gpt_results,
                         job_id=job_id,
                         relevance_refined_event=_relevance_refined,
                     )
                     # GSAM2-specific: store results in per-track dict and persist to job storage
                     if gpt_res:
                                         job_id,
                                         first_frame_detections=_st.first_frame_detections,
                                         first_frame_gpt_results=gpt_res,
                                     )
                                     logging.info(
                                         "GSAM2 enrichment: updated first_frame_detections in job storage for %s",
                                 if job_id:
                                     set_track_data(job_id, next_idx, [])
+                            if _perf_metrics is not None:
+                                _t_w = time.perf_counter()
                             writer.write(frm)
+                            if _perf_metrics is not None:
+                                _perf_metrics["writer_total_ms"] += (time.perf_counter() - _t_w) * 1000.0
                             if stream_queue:
                                 try:
                                     from jobs.streaming import (
             t.join()
         writer_thread.join()
+        if _perf_metrics is not None:
+            _perf_metrics["end_to_end_ms"] = (time.perf_counter() - _t_e2e) * 1000.0
+            if torch.cuda.is_available():
+                _perf_metrics["gpu_peak_mem_mb"] = torch.cuda.max_memory_allocated() / (1024 * 1024)
         logging.info("Grounded-SAM-2 output written to: %s", output_video_path)
         return output_video_path

jobs/background.py CHANGED Viewed

@@ -1,6 +1,5 @@
 import asyncio
 import logging
-import time
 from datetime import datetime
 import torch
@@ -25,8 +24,6 @@ async def process_video_async(job_id: str) -> None:
     # Create stream for live view
     stream_queue = create_stream(job_id)
-    t_video_start = time.monotonic()
     try:
         # Run detection or segmentation first
         if job.mode == "segmentation":
@@ -72,7 +69,7 @@ async def process_video_async(job_id: str) -> None:
             # If depth was ON, the video at video_path *has* depth overlays.
             # But the 'Depth Video' (heatmap only) is usually separate.
             # Our Plan says: "Unified loop... Write Frame to Disk".
-            # If we want separate depth video, we need to instruct run_inference to write TWO videos?
             # Or just update 'depth_path' to be the same main video if it's merged?
             # Let's keep it simple: If depth enabled, the main video IS the depth view (overlay).
             # Or if we want separate `depth_output_path`, we need `run_inference` to handle it.
@@ -83,62 +80,42 @@ async def process_video_async(job_id: str) -> None:
                  depth_path = detection_path
                  logging.info("Depth estimation included in main video for job %s", job_id)
-        video_elapsed = time.monotonic() - t_video_start
-        completed_at = datetime.utcnow()
-        total_elapsed = (completed_at - job.created_at).total_seconds()
-        logging.info("TIMING video_processing=%.3fs for job %s", video_elapsed, job_id)
-        logging.info("TIMING total=%.3fs for job %s", total_elapsed, job_id)
         # Mark as completed (with or without depth)
         storage.update(
             job_id,
             status=JobStatus.COMPLETED,
-            completed_at=completed_at,
             output_video_path=detection_path,
             depth_output_path=depth_path,
             partial_success=partial_success,
             depth_error=depth_error,
-            timing_video_processing_s=video_elapsed,
-            timing_total_s=total_elapsed,
         )
     except RuntimeError as exc:
-        video_elapsed = time.monotonic() - t_video_start
-        completed_at = datetime.utcnow()
-        total_elapsed = (completed_at - job.created_at).total_seconds()
         # Handle cancellation specifically
         if "cancelled" in str(exc).lower():
             logging.info("Job %s was cancelled", job_id)
             storage.update(
                 job_id,
                 status=JobStatus.CANCELLED,
-                completed_at=completed_at,
                 error="Cancelled by user",
-                timing_video_processing_s=video_elapsed,
-                timing_total_s=total_elapsed,
             )
         else:
             logging.exception("Background processing failed for job %s", job_id)
             storage.update(
                 job_id,
                 status=JobStatus.FAILED,
-                completed_at=completed_at,
                 error=str(exc),
-                timing_video_processing_s=video_elapsed,
-                timing_total_s=total_elapsed,
             )
     except Exception as exc:
-        video_elapsed = time.monotonic() - t_video_start
-        completed_at = datetime.utcnow()
-        total_elapsed = (completed_at - job.created_at).total_seconds()
         logging.exception("Background processing failed for job %s", job_id)
         storage.update(
             job_id,
             status=JobStatus.FAILED,
-            completed_at=completed_at,
             error=str(exc),
-            timing_video_processing_s=video_elapsed,
-            timing_total_s=total_elapsed,
         )
     finally:
         remove_stream(job_id)

 import asyncio
 import logging
 from datetime import datetime
 import torch
     # Create stream for live view
     stream_queue = create_stream(job_id)
     try:
         # Run detection or segmentation first
         if job.mode == "segmentation":
             # If depth was ON, the video at video_path *has* depth overlays.
             # But the 'Depth Video' (heatmap only) is usually separate.
             # Our Plan says: "Unified loop... Write Frame to Disk".
+            # If we want separate depth video, we need `run_inference` to handle it.
             # Or just update 'depth_path' to be the same main video if it's merged?
             # Let's keep it simple: If depth enabled, the main video IS the depth view (overlay).
             # Or if we want separate `depth_output_path`, we need `run_inference` to handle it.
                  depth_path = detection_path
                  logging.info("Depth estimation included in main video for job %s", job_id)
         # Mark as completed (with or without depth)
         storage.update(
             job_id,
             status=JobStatus.COMPLETED,
+            completed_at=datetime.utcnow(),
             output_video_path=detection_path,
             depth_output_path=depth_path,
             partial_success=partial_success,
             depth_error=depth_error,
         )
     except RuntimeError as exc:
         # Handle cancellation specifically
         if "cancelled" in str(exc).lower():
             logging.info("Job %s was cancelled", job_id)
             storage.update(
                 job_id,
                 status=JobStatus.CANCELLED,
+                completed_at=datetime.utcnow(),
                 error="Cancelled by user",
             )
         else:
             logging.exception("Background processing failed for job %s", job_id)
             storage.update(
                 job_id,
                 status=JobStatus.FAILED,
+                completed_at=datetime.utcnow(),
                 error=str(exc),
             )
     except Exception as exc:
         logging.exception("Background processing failed for job %s", job_id)
         storage.update(
             job_id,
             status=JobStatus.FAILED,
+            completed_at=datetime.utcnow(),
             error=str(exc),
         )
     finally:
         remove_stream(job_id)

jobs/models.py CHANGED Viewed

@@ -38,9 +38,3 @@ class JobInfo:
     mission_spec: Optional[Any] = None  # utils.schemas.MissionSpecification
     mission_mode: str = "LEGACY"  # "MISSION" or "LEGACY"
     first_frame_gpt_results: Optional[Dict[str, Any]] = None  # Cached GPT results from process_first_frame
-    # Latency measurements (seconds)
-    timing_first_frame_s: Optional[float] = None
-    timing_video_processing_s: Optional[float] = None
-    timing_gpt_first_frame_s: Optional[float] = None
-    timing_gpt_enrichment_s: Optional[float] = None
-    timing_total_s: Optional[float] = None

     mission_spec: Optional[Any] = None  # utils.schemas.MissionSpecification
     mission_mode: str = "LEGACY"  # "MISSION" or "LEGACY"
     first_frame_gpt_results: Optional[Dict[str, Any]] = None  # Cached GPT results from process_first_frame

models/segmenters/grounded_sam2.py CHANGED Viewed

@@ -10,6 +10,7 @@ Reference implementation:
 import copy
 import logging
 from contextlib import nullcontext
 from dataclasses import dataclass, field
 from typing import Any, Dict, List, Optional, Sequence, Tuple
@@ -357,11 +358,15 @@ class GroundedSAM2Segmenter(Segmenter):
             when no objects are detected.
         """
         self._ensure_models_loaded()
         prompt = self._gdino_detector._build_prompt(text_prompts)
         gdino_processor = self._gdino_detector.processor
         gdino_model = self._gdino_detector.model
         inputs = gdino_processor(
             images=image, text=prompt, return_tensors="pt"
         )
@@ -376,6 +381,14 @@ class GroundedSAM2Segmenter(Segmenter):
             target_sizes=[image.size[::-1]],
         )
         input_boxes = results[0]["boxes"]
         det_labels = results[0].get("text_labels") or results[0].get("labels", [])
         if torch.is_tensor(det_labels):
@@ -386,6 +399,9 @@ class GroundedSAM2Segmenter(Segmenter):
             return None, None, []
         # SAM2 image predictor
         self._image_predictor.set_image(np.array(image))
         masks, scores, logits = self._image_predictor.predict(
             point_coords=None,
@@ -394,6 +410,14 @@ class GroundedSAM2Segmenter(Segmenter):
             multimask_output=False,
         )
         if masks.ndim == 2:
             masks = masks[None]
         elif masks.ndim == 4:
@@ -423,6 +447,10 @@ class GroundedSAM2Segmenter(Segmenter):
             Dict mapping ``frame_idx`` → ``{obj_id: ObjectInfo}`` using the
             IDs from *mask_dict* (local, not yet reconciled).
         """
         self._video_predictor.reset_state(inference_state)
         for obj_id, obj_info in mask_dict.labels.items():
@@ -451,6 +479,14 @@ class GroundedSAM2Segmenter(Segmenter):
                 frame_objects[out_obj_id] = info
             segment_results[out_frame_idx] = frame_objects
         return segment_results
     # -- Video-level tracking interface -------------------------------------
@@ -494,6 +530,7 @@ class GroundedSAM2Segmenter(Segmenter):
         device_type = device.split(":")[0]
         autocast_ctx = torch.autocast(device_type=device_type, dtype=torch.bfloat16) if device_type == "cuda" else nullcontext()
         sam2_masks = MaskDictionary()
         objects_count = 0
         all_results: Dict[int, Dict[int, ObjectInfo]] = {}
@@ -515,6 +552,9 @@ class GroundedSAM2Segmenter(Segmenter):
                 mask_dict = MaskDictionary()
                 # -- Grounding DINO detection on keyframe --
                 inputs = gdino_processor(
                     images=image, text=prompt, return_tensors="pt"
                 )
@@ -530,6 +570,14 @@ class GroundedSAM2Segmenter(Segmenter):
                     target_sizes=[image.size[::-1]],
                 )
                 input_boxes = results[0]["boxes"]
                 det_labels = results[0].get("text_labels") or results[0].get("labels", [])
                 if torch.is_tensor(det_labels):
@@ -554,6 +602,9 @@ class GroundedSAM2Segmenter(Segmenter):
                     continue
                 # -- SAM2 image predictor on keyframe --
                 self._image_predictor.set_image(np.array(image))
                 masks, scores, logits = self._image_predictor.predict(
                     point_coords=None,
@@ -562,6 +613,14 @@ class GroundedSAM2Segmenter(Segmenter):
                     multimask_output=False,
                 )
                 # Normalize mask dims
                 if masks.ndim == 2:
                     masks = masks[None]
@@ -577,18 +636,32 @@ class GroundedSAM2Segmenter(Segmenter):
                 )
                 # -- IoU matching to maintain persistent IDs --
                 objects_count = mask_dict.update_masks(
                     tracking_dict=sam2_masks,
                     iou_threshold=self.iou_threshold,
                     objects_count=objects_count,
                 )
                 if len(mask_dict.labels) == 0:
                     for fi in range(start_idx, min(start_idx + step, total_frames)):
                         all_results[fi] = {}
                     continue
                 # -- SAM2 video predictor: propagate masks --
                 self._video_predictor.reset_state(inference_state)
                 for obj_id, obj_info in mask_dict.labels.items():
@@ -625,6 +698,14 @@ class GroundedSAM2Segmenter(Segmenter):
                             sam2_masks.mask_height = first_info.mask.shape[-2] if first_info.mask.ndim >= 2 else 0
                             sam2_masks.mask_width = first_info.mask.shape[-1] if first_info.mask.ndim >= 2 else 0
         logging.info(
             "Grounded-SAM-2 tracking complete: %d frames, %d tracked objects",
             len(all_results), objects_count,

 import copy
 import logging
+import time
 from contextlib import nullcontext
 from dataclasses import dataclass, field
 from typing import Any, Dict, List, Optional, Sequence, Tuple
             when no objects are detected.
         """
         self._ensure_models_loaded()
+        _pm = getattr(self, '_perf_metrics', None)
         prompt = self._gdino_detector._build_prompt(text_prompts)
         gdino_processor = self._gdino_detector.processor
         gdino_model = self._gdino_detector.model
+        if _pm is not None:
+            _t0 = time.perf_counter()
         inputs = gdino_processor(
             images=image, text=prompt, return_tensors="pt"
         )
             target_sizes=[image.size[::-1]],
         )
+        if _pm is not None:
+            _pl = getattr(self, '_perf_lock', None)
+            _d = (time.perf_counter() - _t0) * 1000.0
+            if _pl:
+                with _pl: _pm["gdino_total_ms"] += _d
+            else:
+                _pm["gdino_total_ms"] += _d
         input_boxes = results[0]["boxes"]
         det_labels = results[0].get("text_labels") or results[0].get("labels", [])
         if torch.is_tensor(det_labels):
             return None, None, []
         # SAM2 image predictor
+        if _pm is not None:
+            _t1 = time.perf_counter()
         self._image_predictor.set_image(np.array(image))
         masks, scores, logits = self._image_predictor.predict(
             point_coords=None,
             multimask_output=False,
         )
+        if _pm is not None:
+            _pl = getattr(self, '_perf_lock', None)
+            _d = (time.perf_counter() - _t1) * 1000.0
+            if _pl:
+                with _pl: _pm["sam_image_total_ms"] += _d
+            else:
+                _pm["sam_image_total_ms"] += _d
         if masks.ndim == 2:
             masks = masks[None]
         elif masks.ndim == 4:
             Dict mapping ``frame_idx`` → ``{obj_id: ObjectInfo}`` using the
             IDs from *mask_dict* (local, not yet reconciled).
         """
+        _pm = getattr(self, '_perf_metrics', None)
+        if _pm is not None:
+            _t0 = time.perf_counter()
         self._video_predictor.reset_state(inference_state)
         for obj_id, obj_info in mask_dict.labels.items():
                 frame_objects[out_obj_id] = info
             segment_results[out_frame_idx] = frame_objects
+        if _pm is not None:
+            _pl = getattr(self, '_perf_lock', None)
+            _d = (time.perf_counter() - _t0) * 1000.0
+            if _pl:
+                with _pl: _pm["sam_video_total_ms"] += _d
+            else:
+                _pm["sam_video_total_ms"] += _d
         return segment_results
     # -- Video-level tracking interface -------------------------------------
         device_type = device.split(":")[0]
         autocast_ctx = torch.autocast(device_type=device_type, dtype=torch.bfloat16) if device_type == "cuda" else nullcontext()
+        _pm = getattr(self, '_perf_metrics', None)
         sam2_masks = MaskDictionary()
         objects_count = 0
         all_results: Dict[int, Dict[int, ObjectInfo]] = {}
                 mask_dict = MaskDictionary()
                 # -- Grounding DINO detection on keyframe --
+                if _pm is not None:
+                    _t_gd = time.perf_counter()
                 inputs = gdino_processor(
                     images=image, text=prompt, return_tensors="pt"
                 )
                     target_sizes=[image.size[::-1]],
                 )
+                if _pm is not None:
+                    _pl = getattr(self, '_perf_lock', None)
+                    _d = (time.perf_counter() - _t_gd) * 1000.0
+                    if _pl:
+                        with _pl: _pm["gdino_total_ms"] += _d
+                    else:
+                        _pm["gdino_total_ms"] += _d
                 input_boxes = results[0]["boxes"]
                 det_labels = results[0].get("text_labels") or results[0].get("labels", [])
                 if torch.is_tensor(det_labels):
                     continue
                 # -- SAM2 image predictor on keyframe --
+                if _pm is not None:
+                    _t_si = time.perf_counter()
                 self._image_predictor.set_image(np.array(image))
                 masks, scores, logits = self._image_predictor.predict(
                     point_coords=None,
                     multimask_output=False,
                 )
+                if _pm is not None:
+                    _pl = getattr(self, '_perf_lock', None)
+                    _d = (time.perf_counter() - _t_si) * 1000.0
+                    if _pl:
+                        with _pl: _pm["sam_image_total_ms"] += _d
+                    else:
+                        _pm["sam_image_total_ms"] += _d
                 # Normalize mask dims
                 if masks.ndim == 2:
                     masks = masks[None]
                 )
                 # -- IoU matching to maintain persistent IDs --
+                if _pm is not None:
+                    _t_id = time.perf_counter()
                 objects_count = mask_dict.update_masks(
                     tracking_dict=sam2_masks,
                     iou_threshold=self.iou_threshold,
                     objects_count=objects_count,
                 )
+                if _pm is not None:
+                    _pl = getattr(self, '_perf_lock', None)
+                    _d = (time.perf_counter() - _t_id) * 1000.0
+                    if _pl:
+                        with _pl: _pm["id_reconciliation_ms"] += _d
+                    else:
+                        _pm["id_reconciliation_ms"] += _d
                 if len(mask_dict.labels) == 0:
                     for fi in range(start_idx, min(start_idx + step, total_frames)):
                         all_results[fi] = {}
                     continue
                 # -- SAM2 video predictor: propagate masks --
+                if _pm is not None:
+                    _t_sv = time.perf_counter()
                 self._video_predictor.reset_state(inference_state)
                 for obj_id, obj_info in mask_dict.labels.items():
                             sam2_masks.mask_height = first_info.mask.shape[-2] if first_info.mask.ndim >= 2 else 0
                             sam2_masks.mask_width = first_info.mask.shape[-1] if first_info.mask.ndim >= 2 else 0
+                if _pm is not None:
+                    _pl = getattr(self, '_perf_lock', None)
+                    _d = (time.perf_counter() - _t_sv) * 1000.0
+                    if _pl:
+                        with _pl: _pm["sam_video_total_ms"] += _d
+                    else:
+                        _pm["sam_video_total_ms"] += _d
         logging.info(
             "Grounded-SAM-2 tracking complete: %d frames, %d tracked objects",
             len(all_results), objects_count,