Spaces:
Running
on
A10G
Running
on
A10G
Zhen Ye
Claude Opus 4.6
committed on
Commit
·
2f284f5
1
Parent(s):
3015ea3
fix: correct benchmark metrics — remove double-counting and add missing timers
Browse files- Fix sam_video_total_ms double-counting in single-GPU process_video() path
(propagate_segment already accumulates, outer timer was adding it twice)
- Fix id_reconciliation_ms in multi-GPU path to measure actual IoU work
instead of wall-clock queue waits
- Add model_load_ms and init_state_ms metrics for both single/multi-GPU paths
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
- app.py +2 -0
- inference.py +23 -3
- models/segmenters/grounded_sam2.py +13 -11
app.py
CHANGED
|
@@ -880,6 +880,8 @@ async def benchmark_endpoint(
|
|
| 880 |
metrics = {
|
| 881 |
"end_to_end_ms": 0.0,
|
| 882 |
"frame_extraction_ms": 0.0,
|
|
|
|
|
|
|
| 883 |
"tracking_total_ms": 0.0,
|
| 884 |
"gdino_total_ms": 0.0,
|
| 885 |
"sam_image_total_ms": 0.0,
|
|
|
|
| 880 |
metrics = {
|
| 881 |
"end_to_end_ms": 0.0,
|
| 882 |
"frame_extraction_ms": 0.0,
|
| 883 |
+
"model_load_ms": 0.0,
|
| 884 |
+
"init_state_ms": 0.0,
|
| 885 |
"tracking_total_ms": 0.0,
|
| 886 |
"gdino_total_ms": 0.0,
|
| 887 |
"sam_image_total_ms": 0.0,
|
inference.py
CHANGED
|
@@ -2048,10 +2048,15 @@ def run_grounded_sam2_tracking(
|
|
| 2048 |
# ---------- Single-GPU fallback ----------
|
| 2049 |
device_str = "cuda:0" if torch.cuda.is_available() else "cpu"
|
| 2050 |
_seg_kw = {"num_maskmem": num_maskmem} if num_maskmem is not None else {}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2051 |
segmenter = load_segmenter_on_device(active_segmenter, device_str, **_seg_kw)
|
| 2052 |
_check_cancellation(job_id)
|
| 2053 |
|
| 2054 |
if _perf_metrics is not None:
|
|
|
|
| 2055 |
segmenter._perf_metrics = _perf_metrics
|
| 2056 |
segmenter._perf_lock = None
|
| 2057 |
|
|
@@ -2082,6 +2087,9 @@ def run_grounded_sam2_tracking(
|
|
| 2082 |
)
|
| 2083 |
|
| 2084 |
# Phase 1: Load one segmenter per GPU (parallel)
|
|
|
|
|
|
|
|
|
|
| 2085 |
segmenters = []
|
| 2086 |
with ThreadPoolExecutor(max_workers=num_gpus) as pool:
|
| 2087 |
_seg_kw_multi = {"num_maskmem": num_maskmem} if num_maskmem is not None else {}
|
|
@@ -2098,6 +2106,7 @@ def run_grounded_sam2_tracking(
|
|
| 2098 |
logging.info("Loaded %d segmenters", len(segmenters))
|
| 2099 |
|
| 2100 |
if _perf_metrics is not None:
|
|
|
|
| 2101 |
import threading as _th
|
| 2102 |
_actual_lock = _perf_lock or _th.Lock()
|
| 2103 |
for seg in segmenters:
|
|
@@ -2105,6 +2114,9 @@ def run_grounded_sam2_tracking(
|
|
| 2105 |
seg._perf_lock = _actual_lock
|
| 2106 |
|
| 2107 |
# Phase 2: Init SAM2 models/state per GPU (parallel)
|
|
|
|
|
|
|
|
|
|
| 2108 |
def _init_seg_state(seg):
|
| 2109 |
seg._ensure_models_loaded()
|
| 2110 |
return seg._video_predictor.init_state(
|
|
@@ -2118,6 +2130,7 @@ def run_grounded_sam2_tracking(
|
|
| 2118 |
inference_states = [f.result() for f in futs]
|
| 2119 |
|
| 2120 |
if _perf_metrics is not None:
|
|
|
|
| 2121 |
_t_track = time.perf_counter()
|
| 2122 |
|
| 2123 |
# Phase 3: Parallel segment processing (queue-based workers)
|
|
@@ -2226,8 +2239,7 @@ def run_grounded_sam2_tracking(
|
|
| 2226 |
|
| 2227 |
# Phase 4: Streaming reconciliation — process segments in order
|
| 2228 |
# as they arrive, feeding render_in incrementally.
|
| 2229 |
-
|
| 2230 |
-
_t_recon = time.perf_counter()
|
| 2231 |
|
| 2232 |
global_id_counter = 0
|
| 2233 |
sam2_masks = MaskDictionary()
|
|
@@ -2287,6 +2299,9 @@ def run_grounded_sam2_tracking(
|
|
| 2287 |
continue
|
| 2288 |
|
| 2289 |
# Normalize keyframe masks to CPU before cross-GPU IoU matching.
|
|
|
|
|
|
|
|
|
|
| 2290 |
for info in mask_dict.labels.values():
|
| 2291 |
info.mask = _mask_to_cpu(info.mask)
|
| 2292 |
|
|
@@ -2300,6 +2315,8 @@ def run_grounded_sam2_tracking(
|
|
| 2300 |
)
|
| 2301 |
|
| 2302 |
if not mask_dict.labels:
|
|
|
|
|
|
|
| 2303 |
for fi in range(
|
| 2304 |
start_idx, min(start_idx + step, total_frames)
|
| 2305 |
):
|
|
@@ -2327,6 +2344,9 @@ def run_grounded_sam2_tracking(
|
|
| 2327 |
)
|
| 2328 |
tracking_results[frame_idx] = remapped
|
| 2329 |
|
|
|
|
|
|
|
|
|
|
| 2330 |
# Update running tracker with last frame of this segment
|
| 2331 |
if segment_results:
|
| 2332 |
last_fi = max(segment_results.keys())
|
|
@@ -2354,7 +2374,7 @@ def run_grounded_sam2_tracking(
|
|
| 2354 |
t.join()
|
| 2355 |
|
| 2356 |
if _perf_metrics is not None:
|
| 2357 |
-
_perf_metrics["id_reconciliation_ms"] =
|
| 2358 |
_perf_metrics["tracking_total_ms"] = (time.perf_counter() - _t_track) * 1000.0
|
| 2359 |
|
| 2360 |
logging.info(
|
|
|
|
| 2048 |
# ---------- Single-GPU fallback ----------
|
| 2049 |
device_str = "cuda:0" if torch.cuda.is_available() else "cpu"
|
| 2050 |
_seg_kw = {"num_maskmem": num_maskmem} if num_maskmem is not None else {}
|
| 2051 |
+
|
| 2052 |
+
if _perf_metrics is not None:
|
| 2053 |
+
_t_load = time.perf_counter()
|
| 2054 |
+
|
| 2055 |
segmenter = load_segmenter_on_device(active_segmenter, device_str, **_seg_kw)
|
| 2056 |
_check_cancellation(job_id)
|
| 2057 |
|
| 2058 |
if _perf_metrics is not None:
|
| 2059 |
+
_perf_metrics["model_load_ms"] = (time.perf_counter() - _t_load) * 1000.0
|
| 2060 |
segmenter._perf_metrics = _perf_metrics
|
| 2061 |
segmenter._perf_lock = None
|
| 2062 |
|
|
|
|
| 2087 |
)
|
| 2088 |
|
| 2089 |
# Phase 1: Load one segmenter per GPU (parallel)
|
| 2090 |
+
if _perf_metrics is not None:
|
| 2091 |
+
_t_load = time.perf_counter()
|
| 2092 |
+
|
| 2093 |
segmenters = []
|
| 2094 |
with ThreadPoolExecutor(max_workers=num_gpus) as pool:
|
| 2095 |
_seg_kw_multi = {"num_maskmem": num_maskmem} if num_maskmem is not None else {}
|
|
|
|
| 2106 |
logging.info("Loaded %d segmenters", len(segmenters))
|
| 2107 |
|
| 2108 |
if _perf_metrics is not None:
|
| 2109 |
+
_perf_metrics["model_load_ms"] = (time.perf_counter() - _t_load) * 1000.0
|
| 2110 |
import threading as _th
|
| 2111 |
_actual_lock = _perf_lock or _th.Lock()
|
| 2112 |
for seg in segmenters:
|
|
|
|
| 2114 |
seg._perf_lock = _actual_lock
|
| 2115 |
|
| 2116 |
# Phase 2: Init SAM2 models/state per GPU (parallel)
|
| 2117 |
+
if _perf_metrics is not None:
|
| 2118 |
+
_t_init = time.perf_counter()
|
| 2119 |
+
|
| 2120 |
def _init_seg_state(seg):
|
| 2121 |
seg._ensure_models_loaded()
|
| 2122 |
return seg._video_predictor.init_state(
|
|
|
|
| 2130 |
inference_states = [f.result() for f in futs]
|
| 2131 |
|
| 2132 |
if _perf_metrics is not None:
|
| 2133 |
+
_perf_metrics["init_state_ms"] = (time.perf_counter() - _t_init) * 1000.0
|
| 2134 |
_t_track = time.perf_counter()
|
| 2135 |
|
| 2136 |
# Phase 3: Parallel segment processing (queue-based workers)
|
|
|
|
| 2239 |
|
| 2240 |
# Phase 4: Streaming reconciliation — process segments in order
|
| 2241 |
# as they arrive, feeding render_in incrementally.
|
| 2242 |
+
_recon_accum_ms = 0.0
|
|
|
|
| 2243 |
|
| 2244 |
global_id_counter = 0
|
| 2245 |
sam2_masks = MaskDictionary()
|
|
|
|
| 2299 |
continue
|
| 2300 |
|
| 2301 |
# Normalize keyframe masks to CPU before cross-GPU IoU matching.
|
| 2302 |
+
if _perf_metrics is not None:
|
| 2303 |
+
_t_rc = time.perf_counter()
|
| 2304 |
+
|
| 2305 |
for info in mask_dict.labels.values():
|
| 2306 |
info.mask = _mask_to_cpu(info.mask)
|
| 2307 |
|
|
|
|
| 2315 |
)
|
| 2316 |
|
| 2317 |
if not mask_dict.labels:
|
| 2318 |
+
if _perf_metrics is not None:
|
| 2319 |
+
_recon_accum_ms += (time.perf_counter() - _t_rc) * 1000.0
|
| 2320 |
for fi in range(
|
| 2321 |
start_idx, min(start_idx + step, total_frames)
|
| 2322 |
):
|
|
|
|
| 2344 |
)
|
| 2345 |
tracking_results[frame_idx] = remapped
|
| 2346 |
|
| 2347 |
+
if _perf_metrics is not None:
|
| 2348 |
+
_recon_accum_ms += (time.perf_counter() - _t_rc) * 1000.0
|
| 2349 |
+
|
| 2350 |
# Update running tracker with last frame of this segment
|
| 2351 |
if segment_results:
|
| 2352 |
last_fi = max(segment_results.keys())
|
|
|
|
| 2374 |
t.join()
|
| 2375 |
|
| 2376 |
if _perf_metrics is not None:
|
| 2377 |
+
_perf_metrics["id_reconciliation_ms"] = _recon_accum_ms
|
| 2378 |
_perf_metrics["tracking_total_ms"] = (time.perf_counter() - _t_track) * 1000.0
|
| 2379 |
|
| 2380 |
logging.info(
|
models/segmenters/grounded_sam2.py
CHANGED
|
@@ -717,12 +717,23 @@ class GroundedSAM2Segmenter(Segmenter):
|
|
| 717 |
|
| 718 |
with autocast_ctx:
|
| 719 |
# Init SAM2 video predictor state
|
|
|
|
|
|
|
|
|
|
| 720 |
inference_state = self._video_predictor.init_state(
|
| 721 |
video_path=frame_dir,
|
| 722 |
offload_video_to_cpu=True,
|
| 723 |
async_loading_frames=True,
|
| 724 |
)
|
| 725 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 726 |
for start_idx in range(0, total_frames, step):
|
| 727 |
logging.info("Processing keyframe %d / %d", start_idx, total_frames)
|
| 728 |
|
|
@@ -847,9 +858,8 @@ class GroundedSAM2Segmenter(Segmenter):
|
|
| 847 |
continue
|
| 848 |
|
| 849 |
# -- SAM2 video predictor: propagate masks --
|
| 850 |
-
|
| 851 |
-
|
| 852 |
-
|
| 853 |
segment_output = self.propagate_segment(
|
| 854 |
inference_state, start_idx, mask_dict, step,
|
| 855 |
)
|
|
@@ -870,14 +880,6 @@ class GroundedSAM2Segmenter(Segmenter):
|
|
| 870 |
sam2_masks.mask_height = first_info.mask.shape[-2] if first_info.mask.ndim >= 2 else 0
|
| 871 |
sam2_masks.mask_width = first_info.mask.shape[-1] if first_info.mask.ndim >= 2 else 0
|
| 872 |
|
| 873 |
-
if _pm is not None:
|
| 874 |
-
_pl = getattr(self, '_perf_lock', None)
|
| 875 |
-
_d = (time.perf_counter() - _t_sv) * 1000.0
|
| 876 |
-
if _pl:
|
| 877 |
-
with _pl: _pm["sam_video_total_ms"] += _d
|
| 878 |
-
else:
|
| 879 |
-
_pm["sam_video_total_ms"] += _d
|
| 880 |
-
|
| 881 |
logging.info(
|
| 882 |
"Grounded-SAM-2 tracking complete: %d frames, %d tracked objects",
|
| 883 |
len(all_results), objects_count,
|
|
|
|
| 717 |
|
| 718 |
with autocast_ctx:
|
| 719 |
# Init SAM2 video predictor state
|
| 720 |
+
if _pm is not None:
|
| 721 |
+
_t_init = time.perf_counter()
|
| 722 |
+
|
| 723 |
inference_state = self._video_predictor.init_state(
|
| 724 |
video_path=frame_dir,
|
| 725 |
offload_video_to_cpu=True,
|
| 726 |
async_loading_frames=True,
|
| 727 |
)
|
| 728 |
|
| 729 |
+
if _pm is not None:
|
| 730 |
+
_pl = getattr(self, '_perf_lock', None)
|
| 731 |
+
_d = (time.perf_counter() - _t_init) * 1000.0
|
| 732 |
+
if _pl:
|
| 733 |
+
with _pl: _pm["init_state_ms"] += _d
|
| 734 |
+
else:
|
| 735 |
+
_pm["init_state_ms"] += _d
|
| 736 |
+
|
| 737 |
for start_idx in range(0, total_frames, step):
|
| 738 |
logging.info("Processing keyframe %d / %d", start_idx, total_frames)
|
| 739 |
|
|
|
|
| 858 |
continue
|
| 859 |
|
| 860 |
# -- SAM2 video predictor: propagate masks --
|
| 861 |
+
# NOTE: propagate_segment() already accumulates into
|
| 862 |
+
# _pm["sam_video_total_ms"], so no outer timer here.
|
|
|
|
| 863 |
segment_output = self.propagate_segment(
|
| 864 |
inference_state, start_idx, mask_dict, step,
|
| 865 |
)
|
|
|
|
| 880 |
sam2_masks.mask_height = first_info.mask.shape[-2] if first_info.mask.ndim >= 2 else 0
|
| 881 |
sam2_masks.mask_width = first_info.mask.shape[-1] if first_info.mask.ndim >= 2 else 0
|
| 882 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 883 |
logging.info(
|
| 884 |
"Grounded-SAM-2 tracking complete: %d frames, %d tracked objects",
|
| 885 |
len(all_results), objects_count,
|