Zhen Ye Claude Opus 4.6 committed on
Commit
2f284f5
·
1 Parent(s): 3015ea3

fix: correct benchmark metrics — remove double-counting and add missing timers

Browse files

- Fix sam_video_total_ms double-counting in single-GPU process_video() path
(propagate_segment already accumulates, outer timer was adding it twice)
- Fix id_reconciliation_ms in multi-GPU path to measure actual IoU work
instead of wall-clock queue waits
- Add model_load_ms and init_state_ms metrics for both single/multi-GPU paths

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Files changed (3) hide show
  1. app.py +2 -0
  2. inference.py +23 -3
  3. models/segmenters/grounded_sam2.py +13 -11
app.py CHANGED
@@ -880,6 +880,8 @@ async def benchmark_endpoint(
880
  metrics = {
881
  "end_to_end_ms": 0.0,
882
  "frame_extraction_ms": 0.0,
 
 
883
  "tracking_total_ms": 0.0,
884
  "gdino_total_ms": 0.0,
885
  "sam_image_total_ms": 0.0,
 
880
  metrics = {
881
  "end_to_end_ms": 0.0,
882
  "frame_extraction_ms": 0.0,
883
+ "model_load_ms": 0.0,
884
+ "init_state_ms": 0.0,
885
  "tracking_total_ms": 0.0,
886
  "gdino_total_ms": 0.0,
887
  "sam_image_total_ms": 0.0,
inference.py CHANGED
@@ -2048,10 +2048,15 @@ def run_grounded_sam2_tracking(
2048
  # ---------- Single-GPU fallback ----------
2049
  device_str = "cuda:0" if torch.cuda.is_available() else "cpu"
2050
  _seg_kw = {"num_maskmem": num_maskmem} if num_maskmem is not None else {}
 
 
 
 
2051
  segmenter = load_segmenter_on_device(active_segmenter, device_str, **_seg_kw)
2052
  _check_cancellation(job_id)
2053
 
2054
  if _perf_metrics is not None:
 
2055
  segmenter._perf_metrics = _perf_metrics
2056
  segmenter._perf_lock = None
2057
 
@@ -2082,6 +2087,9 @@ def run_grounded_sam2_tracking(
2082
  )
2083
 
2084
  # Phase 1: Load one segmenter per GPU (parallel)
 
 
 
2085
  segmenters = []
2086
  with ThreadPoolExecutor(max_workers=num_gpus) as pool:
2087
  _seg_kw_multi = {"num_maskmem": num_maskmem} if num_maskmem is not None else {}
@@ -2098,6 +2106,7 @@ def run_grounded_sam2_tracking(
2098
  logging.info("Loaded %d segmenters", len(segmenters))
2099
 
2100
  if _perf_metrics is not None:
 
2101
  import threading as _th
2102
  _actual_lock = _perf_lock or _th.Lock()
2103
  for seg in segmenters:
@@ -2105,6 +2114,9 @@ def run_grounded_sam2_tracking(
2105
  seg._perf_lock = _actual_lock
2106
 
2107
  # Phase 2: Init SAM2 models/state per GPU (parallel)
 
 
 
2108
  def _init_seg_state(seg):
2109
  seg._ensure_models_loaded()
2110
  return seg._video_predictor.init_state(
@@ -2118,6 +2130,7 @@ def run_grounded_sam2_tracking(
2118
  inference_states = [f.result() for f in futs]
2119
 
2120
  if _perf_metrics is not None:
 
2121
  _t_track = time.perf_counter()
2122
 
2123
  # Phase 3: Parallel segment processing (queue-based workers)
@@ -2226,8 +2239,7 @@ def run_grounded_sam2_tracking(
2226
 
2227
  # Phase 4: Streaming reconciliation — process segments in order
2228
  # as they arrive, feeding render_in incrementally.
2229
- if _perf_metrics is not None:
2230
- _t_recon = time.perf_counter()
2231
 
2232
  global_id_counter = 0
2233
  sam2_masks = MaskDictionary()
@@ -2287,6 +2299,9 @@ def run_grounded_sam2_tracking(
2287
  continue
2288
 
2289
  # Normalize keyframe masks to CPU before cross-GPU IoU matching.
 
 
 
2290
  for info in mask_dict.labels.values():
2291
  info.mask = _mask_to_cpu(info.mask)
2292
 
@@ -2300,6 +2315,8 @@ def run_grounded_sam2_tracking(
2300
  )
2301
 
2302
  if not mask_dict.labels:
 
 
2303
  for fi in range(
2304
  start_idx, min(start_idx + step, total_frames)
2305
  ):
@@ -2327,6 +2344,9 @@ def run_grounded_sam2_tracking(
2327
  )
2328
  tracking_results[frame_idx] = remapped
2329
 
 
 
 
2330
  # Update running tracker with last frame of this segment
2331
  if segment_results:
2332
  last_fi = max(segment_results.keys())
@@ -2354,7 +2374,7 @@ def run_grounded_sam2_tracking(
2354
  t.join()
2355
 
2356
  if _perf_metrics is not None:
2357
- _perf_metrics["id_reconciliation_ms"] = (time.perf_counter() - _t_recon) * 1000.0
2358
  _perf_metrics["tracking_total_ms"] = (time.perf_counter() - _t_track) * 1000.0
2359
 
2360
  logging.info(
 
2048
  # ---------- Single-GPU fallback ----------
2049
  device_str = "cuda:0" if torch.cuda.is_available() else "cpu"
2050
  _seg_kw = {"num_maskmem": num_maskmem} if num_maskmem is not None else {}
2051
+
2052
+ if _perf_metrics is not None:
2053
+ _t_load = time.perf_counter()
2054
+
2055
  segmenter = load_segmenter_on_device(active_segmenter, device_str, **_seg_kw)
2056
  _check_cancellation(job_id)
2057
 
2058
  if _perf_metrics is not None:
2059
+ _perf_metrics["model_load_ms"] = (time.perf_counter() - _t_load) * 1000.0
2060
  segmenter._perf_metrics = _perf_metrics
2061
  segmenter._perf_lock = None
2062
 
 
2087
  )
2088
 
2089
  # Phase 1: Load one segmenter per GPU (parallel)
2090
+ if _perf_metrics is not None:
2091
+ _t_load = time.perf_counter()
2092
+
2093
  segmenters = []
2094
  with ThreadPoolExecutor(max_workers=num_gpus) as pool:
2095
  _seg_kw_multi = {"num_maskmem": num_maskmem} if num_maskmem is not None else {}
 
2106
  logging.info("Loaded %d segmenters", len(segmenters))
2107
 
2108
  if _perf_metrics is not None:
2109
+ _perf_metrics["model_load_ms"] = (time.perf_counter() - _t_load) * 1000.0
2110
  import threading as _th
2111
  _actual_lock = _perf_lock or _th.Lock()
2112
  for seg in segmenters:
 
2114
  seg._perf_lock = _actual_lock
2115
 
2116
  # Phase 2: Init SAM2 models/state per GPU (parallel)
2117
+ if _perf_metrics is not None:
2118
+ _t_init = time.perf_counter()
2119
+
2120
  def _init_seg_state(seg):
2121
  seg._ensure_models_loaded()
2122
  return seg._video_predictor.init_state(
 
2130
  inference_states = [f.result() for f in futs]
2131
 
2132
  if _perf_metrics is not None:
2133
+ _perf_metrics["init_state_ms"] = (time.perf_counter() - _t_init) * 1000.0
2134
  _t_track = time.perf_counter()
2135
 
2136
  # Phase 3: Parallel segment processing (queue-based workers)
 
2239
 
2240
  # Phase 4: Streaming reconciliation — process segments in order
2241
  # as they arrive, feeding render_in incrementally.
2242
+ _recon_accum_ms = 0.0
 
2243
 
2244
  global_id_counter = 0
2245
  sam2_masks = MaskDictionary()
 
2299
  continue
2300
 
2301
  # Normalize keyframe masks to CPU before cross-GPU IoU matching.
2302
+ if _perf_metrics is not None:
2303
+ _t_rc = time.perf_counter()
2304
+
2305
  for info in mask_dict.labels.values():
2306
  info.mask = _mask_to_cpu(info.mask)
2307
 
 
2315
  )
2316
 
2317
  if not mask_dict.labels:
2318
+ if _perf_metrics is not None:
2319
+ _recon_accum_ms += (time.perf_counter() - _t_rc) * 1000.0
2320
  for fi in range(
2321
  start_idx, min(start_idx + step, total_frames)
2322
  ):
 
2344
  )
2345
  tracking_results[frame_idx] = remapped
2346
 
2347
+ if _perf_metrics is not None:
2348
+ _recon_accum_ms += (time.perf_counter() - _t_rc) * 1000.0
2349
+
2350
  # Update running tracker with last frame of this segment
2351
  if segment_results:
2352
  last_fi = max(segment_results.keys())
 
2374
  t.join()
2375
 
2376
  if _perf_metrics is not None:
2377
+ _perf_metrics["id_reconciliation_ms"] = _recon_accum_ms
2378
  _perf_metrics["tracking_total_ms"] = (time.perf_counter() - _t_track) * 1000.0
2379
 
2380
  logging.info(
models/segmenters/grounded_sam2.py CHANGED
@@ -717,12 +717,23 @@ class GroundedSAM2Segmenter(Segmenter):
717
 
718
  with autocast_ctx:
719
  # Init SAM2 video predictor state
 
 
 
720
  inference_state = self._video_predictor.init_state(
721
  video_path=frame_dir,
722
  offload_video_to_cpu=True,
723
  async_loading_frames=True,
724
  )
725
 
 
 
 
 
 
 
 
 
726
  for start_idx in range(0, total_frames, step):
727
  logging.info("Processing keyframe %d / %d", start_idx, total_frames)
728
 
@@ -847,9 +858,8 @@ class GroundedSAM2Segmenter(Segmenter):
847
  continue
848
 
849
  # -- SAM2 video predictor: propagate masks --
850
- if _pm is not None:
851
- _t_sv = time.perf_counter()
852
-
853
  segment_output = self.propagate_segment(
854
  inference_state, start_idx, mask_dict, step,
855
  )
@@ -870,14 +880,6 @@ class GroundedSAM2Segmenter(Segmenter):
870
  sam2_masks.mask_height = first_info.mask.shape[-2] if first_info.mask.ndim >= 2 else 0
871
  sam2_masks.mask_width = first_info.mask.shape[-1] if first_info.mask.ndim >= 2 else 0
872
 
873
- if _pm is not None:
874
- _pl = getattr(self, '_perf_lock', None)
875
- _d = (time.perf_counter() - _t_sv) * 1000.0
876
- if _pl:
877
- with _pl: _pm["sam_video_total_ms"] += _d
878
- else:
879
- _pm["sam_video_total_ms"] += _d
880
-
881
  logging.info(
882
  "Grounded-SAM-2 tracking complete: %d frames, %d tracked objects",
883
  len(all_results), objects_count,
 
717
 
718
  with autocast_ctx:
719
  # Init SAM2 video predictor state
720
+ if _pm is not None:
721
+ _t_init = time.perf_counter()
722
+
723
  inference_state = self._video_predictor.init_state(
724
  video_path=frame_dir,
725
  offload_video_to_cpu=True,
726
  async_loading_frames=True,
727
  )
728
 
729
+ if _pm is not None:
730
+ _pl = getattr(self, '_perf_lock', None)
731
+ _d = (time.perf_counter() - _t_init) * 1000.0
732
+ if _pl:
733
+ with _pl: _pm["init_state_ms"] += _d
734
+ else:
735
+ _pm["init_state_ms"] += _d
736
+
737
  for start_idx in range(0, total_frames, step):
738
  logging.info("Processing keyframe %d / %d", start_idx, total_frames)
739
 
 
858
  continue
859
 
860
  # -- SAM2 video predictor: propagate masks --
861
+ # NOTE: propagate_segment() already accumulates into
862
+ # _pm["sam_video_total_ms"], so no outer timer here.
 
863
  segment_output = self.propagate_segment(
864
  inference_state, start_idx, mask_dict, step,
865
  )
 
880
  sam2_masks.mask_height = first_info.mask.shape[-2] if first_info.mask.ndim >= 2 else 0
881
  sam2_masks.mask_width = first_info.mask.shape[-1] if first_info.mask.ndim >= 2 else 0
882
 
 
 
 
 
 
 
 
 
883
  logging.info(
884
  "Grounded-SAM-2 tracking complete: %d frames, %d tracked objects",
885
  len(all_results), objects_count,