ColabWan / preprocessing /sam3 /model /sam3_multiplex_tracking.py
1ripon1's picture
Upload folder using huggingface_hub
7344bef verified
Raw
History Blame Contribute Delete
157 kB
from collections import defaultdict
from functools import reduce
from typing import Dict
import numpy as np
from ..model import sam3_multiplex_base
from ..model import sam3_video_base
import torch
import torch.distributed as dist
import torch.nn.functional as F
from .. import perflib
from ..logger import get_logger
from ..model.box_ops import box_xywh_to_cxcywh, box_xyxy_to_xywh
from ..model.data_misc import BatchedDatapoint
from ..model.device_utils import get_accelerator_device
from ..model.sam3_multiplex_base import MaskletConfirmationStatus, Sam3MultiplexBase
from ..model.sam3_tracker_utils import fill_holes_in_mask_scores
from ..model.sam3_video_inference import is_image_type
from ..perflib.compile import (
clone_output_wrapper,
compile_wrapper,
shape_logging_wrapper,
)
from ..perflib.masks_ops import mask_iou, masks_to_boxes as perf_masks_to_boxes
from torch import Tensor
from torchvision.ops import masks_to_boxes
from tqdm.auto import tqdm
logger = get_logger(__name__)
import gc
from collections.abc import Mapping, Sequence
from dataclasses import fields, is_dataclass
from typing import List
from ..model.data_misc import (
BatchedPointer,
convert_my_tensors,
FindStage,
NestedTensor,
)
from ..model.geometry_encoders import Prompt
from ..model.io_utils import load_resource_as_video_frames
def recursive_to(data, *args, **kwargs):
    """Recursively apply ``Tensor.to(*args, **kwargs)`` to every tensor in ``data``.

    Containers are rebuilt, never mutated in place:

    * ``torch.Tensor``      -> ``data.to(*args, **kwargs)``
    * ``Mapping``           -> new ``type(data)()`` filled key by key
    * ``tuple``             -> plain ``tuple`` of converted items
    * other ``Sequence``    -> new ``type(data)()`` filled via ``append``
    * dataclass *instance*  -> new instance built from the converted fields
    * anything else (including ``np.ndarray``, ``str``, ``bytes`` and
      dataclass *classes*) -> returned unchanged

    Args:
        data: arbitrarily nested structure to convert.
        *args, **kwargs: forwarded verbatim to ``torch.Tensor.to``.

    Returns:
        A structure of the same layout with every tensor converted.
    """
    if isinstance(data, torch.Tensor):
        return data.to(*args, **kwargs)

    # Leaves that must not be traversed. `bytes` is a registered `Sequence`,
    # but it is immutable and has no `append`, so it has to be handled here.
    if isinstance(data, (np.ndarray, str, bytes)):
        return data

    if isinstance(data, Mapping):
        converted = type(data)()
        for key in data:
            converted[key] = recursive_to(data[key], *args, **kwargs)
        return converted

    if isinstance(data, tuple):
        # build in one pass instead of repeated tuple concatenation
        return tuple(recursive_to(value, *args, **kwargs) for value in data)

    if isinstance(data, Sequence):
        converted = type(data)()
        for value in data:
            converted.append(recursive_to(value, *args, **kwargs))
        return converted

    # `is_dataclass` is also true for the dataclass *type* itself; only
    # instances can be rebuilt from their field values.
    if is_dataclass(data) and not isinstance(data, type):
        converted_fields = {
            field.name: recursive_to(getattr(data, field.name), *args, **kwargs)
            for field in fields(data)
        }
        return type(data)(**converted_fields)

    return data
# Placeholder yielded by `Sam3MultiplexTracking.propagate_in_video` in place of a
# post-processed output dict on ranks other than 0 ("No output on other GPUs").
DUMMY_OUTPUT = "DUMMY_OUTPUT"
class Sam3MultiplexTracking(Sam3MultiplexBase):
def __init__(
    self,
    image_size=1008,
    image_mean=(0.5, 0.5, 0.5),
    image_std=(0.5, 0.5, 0.5),
    compile_model=False,
    postprocess_batch_size=1,
    **kwargs,
):
    """
    Args:
        image_size: stored on the instance; passed to the frame loader in `init_state`.
        image_mean: stored on the instance; passed to the frame loader as `img_mean`.
        image_std: stored on the instance; passed to the frame loader as `img_std`.
        compile_model: stored on the instance and mirrored onto `self.detector`.
        postprocess_batch_size: int, the number of frames to accumulate before running
            postprocessing. Set to 1 to disable batching.
        **kwargs: forwarded unchanged to the parent constructor. The options below are
            presumably consumed there (they are not named in this signature):
            hotstart_delay: int, the delay (in #frames) before the model starts to yield
                output, 0 to disable hotstart delay.
            hotstart_unmatch_thresh: int, remove the object if it has this many unmatched
                frames within its hotstart_delay period. Ignored if `hotstart_delay` is 0.
            hotstart_dup_thresh: int, remove the object if it has overlapped with another
                object this many frames within its hotstart_delay period.
    """
    super().__init__(**kwargs)

    # frame preprocessing settings
    self.image_size, self.image_mean, self.image_std = (
        image_size,
        image_mean,
        image_std,
    )

    # keep the detector's compile flag in sync with ours
    self.compile_model = compile_model
    self.detector.compile_model = self.compile_model

    self.postprocess_batch_size = postprocess_batch_size
# Text-slot ids. NOTE(review): these appear to be positions in the `find_text_batch`
# list built by `_construct_initial_input_batch`
# (["<text placeholder>", "visual", "geometric"]) -- confirm against the callers.
TEXT_ID_FOR_TEXT = 0
TEXT_ID_FOR_VISUAL = 1
TEXT_ID_FOR_GEOMETRIC = 2
def _construct_initial_input_batch(self, inference_state, images):
    """Assemble the initial `BatchedDatapoint` and seed the per-frame state slots.

    Writes into `inference_state`: "input_batch", the cached
    "empty_geometric_prompt" under "constants", one placeholder entry per frame
    for every `per_frame_*` / "previous_stages_out" list, and `None` for the
    text prompt and the cached backbone / visual-prompt outputs.
    """
    frame_count = len(images)
    target_device = inference_state["device"]

    # 1) image batch
    image_batch = NestedTensor(tensors=images, mask=None)

    # 2) text batch -- "<text placeholder>" will be replaced by the actual
    #    text prompt when adding prompts
    text_batch = ["<text placeholder>", "visual", "geometric"]

    # 3) one blank find stage per frame
    box_embed_dim = 258  # historical default
    point_embed_dim = 257  # historical default
    shared_empty_ptrs = BatchedPointer(
        stage_ids=[], query_ids=[], object_ids=[], ptr_mask=[], ptr_types=[]
    )

    def blank_stage(stage_id):
        return FindStage(
            img_ids=[stage_id],
            img_ids_np=np.array([stage_id]),
            text_ids=[0],
            input_boxes=[torch.zeros(box_embed_dim)],
            input_boxes_before_embed=[torch.empty(0, 4)],
            input_boxes_mask=[torch.empty(0, dtype=torch.bool)],
            input_boxes_label=[torch.empty(0, dtype=torch.long)],
            input_points=[torch.empty(0, point_embed_dim)],
            input_points_before_embed=[torch.empty(0, 3)],
            input_points_mask=[torch.empty(0)],
            ptrs=shared_empty_ptrs,
            ptrs_seg=shared_empty_ptrs,
            object_ids=[],
        )

    frame_stages = [blank_stage(stage_id) for stage_id in range(frame_count)]
    with torch.profiler.record_function(
        "Sam3MultiplexTracking._construct_initial_input_batch"
    ):
        frame_stages = [convert_my_tensors(stage) for stage in frame_stages]

    # final `BatchedDatapoint`, then move everything to the target device
    datapoint = BatchedDatapoint(
        img_batch=image_batch,
        find_text_batch=text_batch,
        find_inputs=frame_stages,
        find_targets=[None] * frame_count,
        get_queries=None,
        find_metadatas=[None] * frame_count,
    )
    with torch.profiler.record_function("Sam3MultiplexTracking.recursive_to"):
        datapoint = recursive_to(datapoint, target_device, non_blocking=True)
    inference_state["input_batch"] = datapoint

    # placeholder (empty) geometric prompt for frames without interactive input
    bs = 1

    def device_zeros(*shape, **tensor_kwargs):
        return torch.zeros(*shape, device=target_device, **tensor_kwargs)

    inference_state["constants"]["empty_geometric_prompt"] = Prompt(
        box_embeddings=device_zeros(0, bs, 4),
        box_mask=device_zeros(bs, 0, dtype=torch.bool),
        box_labels=device_zeros(0, bs, dtype=torch.long),
        point_embeddings=device_zeros(0, bs, 2),
        point_mask=device_zeros(bs, 0, dtype=torch.bool),
        point_labels=device_zeros(0, bs, dtype=torch.long),
    )

    # per-frame output / prompt slots (all start out empty)
    inference_state["previous_stages_out"] = [None] * frame_count
    inference_state["text_prompt"] = None
    for slot_name in (
        "per_frame_raw_point_input",
        "per_frame_raw_box_input",
        "per_frame_visual_prompt",
        "per_frame_geometric_prompt",
    ):
        inference_state[slot_name] = [None] * frame_count
    inference_state["per_frame_cur_step"] = [0] * frame_count

    # placeholders for cached outputs
    # (note: currently, a single visual prompt embedding is shared for all frames)
    for cache_name in ("backbone_out", "visual_prompt_embed", "visual_prompt_mask"):
        inference_state[cache_name] = None
def _get_visual_prompt(self, inference_state, frame_idx, boxes_cxcywh, box_labels):
    """Wrap the given boxes and labels into a box-only `Prompt`.

    `frame_idx` is accepted but not used by this implementation.

    Returns:
        `(boxes_cxcywh, box_labels, prompt)` -- the two inputs are passed back
        unchanged together with the newly built prompt.
    """
    device = inference_state["device"]
    bs = 1

    # start from an empty prompt that carries no points at all
    prompt = Prompt(
        box_embeddings=torch.zeros(0, bs, 4, device=device),
        box_mask=torch.zeros(bs, 0, device=device, dtype=torch.bool),
        point_embeddings=None,
        point_mask=None,
    )

    boxes_on_device = boxes_cxcywh.view(-1, bs, 4).to(device)
    labels_on_device = box_labels.view(-1, bs).to(device)
    prompt.append_boxes(boxes=boxes_on_device, labels=labels_on_device)
    return boxes_cxcywh, box_labels, prompt
@torch.inference_mode()
def init_state(
    self,
    resource_path,
    offload_video_to_cpu=False,
    async_loading_frames=False,
    use_cv2=False,
    input_is_mp4=False,
):
    """Load the frames of `resource_path` and build a fresh inference state dict.

    Args:
        resource_path: resource handed to `load_resource_as_video_frames`.
        offload_video_to_cpu: forwarded to the frame loader.
        async_loading_frames: forwarded to the frame loader.
        use_cv2: select the "cv2" video loader instead of "ffmpeg".
        input_is_mp4: accepted but not used by this implementation.

    Returns:
        The newly created inference state dict.
    """
    # (inlined from Sam3DemoMixin.init_state)
    loader_kind = "ffmpeg"
    if use_cv2:
        loader_kind = "cv2"

    frames, height, width = load_resource_as_video_frames(
        resource_path=resource_path,
        image_size=self.image_size,
        offload_video_to_cpu=offload_video_to_cpu,
        img_mean=self.image_mean,
        img_std=self.image_std,
        async_loading_frames=async_loading_frames,
        video_loader_type=loader_kind,
    )

    state = {
        "image_size": self.image_size,
        "num_frames": len(frames),
        "device": get_accelerator_device(),
        "orig_height": height,
        "orig_width": width,
        "constants": {},
    }
    self._construct_initial_input_batch(state, frames)

    # extra states
    # sam2_inference_states will contain separate inference_states for each frame having
    # new objects if self.tracker.per_obj_inference is False (bucketized batching), or a
    # single inference_state containing all objects if self.tracker.per_obj_inference is
    # True (no batching at all).
    state.update(
        sam2_inference_states=[],
        tracker_metadata={},
        feature_cache={},
        cached_frame_outputs={},
        is_image_only=is_image_type(resource_path),
    )
    return state
def reset_state(self, inference_state):
    """Clear all prompts, cached outputs and tracker state held in `inference_state`.

    The state dict is modified in place; nothing is returned.
    """
    # (inlined from Sam3DemoMixin.reset_state)
    batch = inference_state["input_batch"]
    batch.find_text_batch[0] = "<text placeholder>"
    inference_state["text_prompt"] = None

    # value every per-frame slot is reset to
    per_frame_defaults = (
        ("previous_stages_out", None),
        ("per_frame_raw_point_input", None),
        ("per_frame_raw_box_input", None),
        ("per_frame_visual_prompt", None),
        ("per_frame_geometric_prompt", None),
        ("per_frame_cur_step", 0),
    )
    for frame_idx in range(inference_state["num_frames"]):
        batch.find_inputs[frame_idx].text_ids[...] = 0
        for slot_name, default in per_frame_defaults:
            inference_state[slot_name][frame_idx] = default

    for cache_name in ("backbone_out", "visual_prompt_embed", "visual_prompt_mask"):
        inference_state[cache_name] = None

    # extra states: emptied in place, except the frame-output cache which is rebound
    for container_name in ("sam2_inference_states", "tracker_metadata", "feature_cache"):
        inference_state[container_name].clear()
    inference_state["cached_frame_outputs"] = {}
def _get_processing_order(
self, inference_state, start_frame_idx, max_frame_num_to_track, reverse
):
num_frames = inference_state["num_frames"]
previous_stages_out = inference_state["previous_stages_out"]
if all(out is None for out in previous_stages_out) and start_frame_idx is None:
raise RuntimeError(
"No prompts are received on any frames. Please add prompt on at least one frame before propagation."
)
# set start index, end index, and processing order
if start_frame_idx is None:
# default: start from the earliest frame with input points
start_frame_idx = min(
t for t, out in enumerate(previous_stages_out) if out is not None
)
if max_frame_num_to_track is None:
# default: track all the frames in the video
max_frame_num_to_track = num_frames
if reverse:
end_frame_idx = start_frame_idx - max_frame_num_to_track
end_frame_idx = max(end_frame_idx, 0)
processing_order = range(start_frame_idx - 1, end_frame_idx - 1, -1)
else:
end_frame_idx = start_frame_idx + max_frame_num_to_track
end_frame_idx = min(end_frame_idx, num_frames - 1)
processing_order = range(start_frame_idx, end_frame_idx + 1)
return processing_order, end_frame_idx
@torch.inference_mode()
def propagate_in_video(
    self,
    inference_state,
    start_frame_idx=None,
    max_frame_num_to_track=None,
    reverse=False,
    output_prob_thresh=0.5,
    compute_stability_score=False,
    is_instance_processing=False,
    progress_callback=None,
    **kwargs,  # To support passing extra args to child classes
):
    """
    Propagate the prompts to get grounding results for the entire video. This method
    is a generator and yields inference outputs for all frames in the range specified
    by `start_frame_idx`, `max_frame_num_to_track`, and `reverse`.

    Args:
        inference_state: state dict; updated in place while propagating.
        start_frame_idx, max_frame_num_to_track, reverse: define the frame range and
            direction (resolved by `_get_processing_order`).
        output_prob_thresh, compute_stability_score: accepted but not used by this
            implementation.
        is_instance_processing: forwarded to `_run_single_frame_inference`.
        progress_callback: optional callable invoked on rank 0 as
            `progress_callback(frames_done, frames_total)` after every frame.
        **kwargs: accepted for child classes; not used here.

    Yields:
        `(frame_idx, outputs)` tuples. On rank 0 `outputs` is the post-processed
        output of that frame; on every other rank it is `DUMMY_OUTPUT`.
    """
    # compile the model (it's a no-op if the model is already compiled)
    # note that it's intentionally added to `self.propagate_in_video`, so that the first
    # `self.add_prompt` call will be done in eager mode to fill in the decoder buffers
    # such as positional encoding cache)
    self._compile_model()
    processing_order, end_frame_idx = self._get_processing_order(
        inference_state,
        start_frame_idx,
        max_frame_num_to_track,
        reverse=reverse,
    )
    # Store max_frame_num_to_track in feature_cache for downstream methods
    inference_state["feature_cache"]["tracking_bounds"] = {
        "max_frame_num_to_track": max_frame_num_to_track,
        "propagate_in_video_start_frame_idx": start_frame_idx,
    }
    hotstart_buffer = []
    hotstart_removed_obj_ids = set()
    # when deciding whether to output a masklet on `yield_frame_idx`, we check whether the object is confirmed
    # in a future frame (`unconfirmed_frame_delay` frames after the current frame). For example, if we require
    # an object to be detected in 3 consecutive frames to be confirmed, then we look 2 frames in the future --
    # e.g., we output an object on frame 4 only if it becomes confirmed on frame 6.
    unconfirmed_status_delay = self.masklet_confirmation_consecutive_det_thresh - 1
    unconfirmed_obj_ids_per_frame = {}  # frame_idx -> hidden_obj_ids
    # Batch postprocessing: accumulate yield_list entries and process every
    # `postprocess_batch_size` frames. A configured size below 1 is treated as 1:
    # with a size of 0 the draining loop below would slice off empty batches
    # forever and never terminate.
    postprocess_batch_size = max(1, self.postprocess_batch_size)
    postprocess_yield_list = []
    progress_done = 0
    progress_total = len(processing_order)
    for frame_idx in tqdm(
        processing_order, desc="propagate_in_video", disable=self.rank > 0
    ):
        out = self._run_single_frame_inference(
            inference_state,
            frame_idx,
            reverse,
            is_instance_processing=is_instance_processing,
        )
        progress_done += 1
        if progress_callback is not None and self.rank == 0:
            progress_callback(progress_done, progress_total)
        if self.hotstart_delay > 0:
            # accumulate the outputs for the first `hotstart_delay` frames
            hotstart_buffer.append([frame_idx, out])
            # update the object IDs removed by hotstart so that we don't output them
            # (`out` only carries "unconfirmed_obj_ids" on rank 0, so the lookup
            # below is a no-op elsewhere)
            if self.rank == 0:
                hotstart_removed_obj_ids.update(out["removed_obj_ids"])
                unconfirmed_obj_ids = out.get("unconfirmed_obj_ids", None)
                if unconfirmed_obj_ids is not None:
                    unconfirmed_obj_ids_per_frame[frame_idx] = unconfirmed_obj_ids
            if frame_idx == end_frame_idx:
                # we reached the end of propagation -- yield all frames in the buffer
                yield_list = hotstart_buffer
                hotstart_buffer = []
            elif len(hotstart_buffer) >= self.hotstart_delay:
                # we have enough frames -- yield and remove the first (oldest) frame from the buffer
                yield_list = hotstart_buffer[:1]
                hotstart_buffer = hotstart_buffer[1:]
            else:
                # not enough frames yet -- skip yielding
                yield_list = []
        else:
            yield_list = [(frame_idx, out)]  # output the current frame
        # Accumulate yield_list into postprocess_yield_list
        # Snapshot hotstart_removed_obj_ids at the time of accumulation to preserve
        # the correct state for each frame (important: this set is mutated over time)
        for yield_frame_idx, yield_out in yield_list:
            postprocess_yield_list.append(
                (yield_frame_idx, yield_out, set(hotstart_removed_obj_ids))
            )
        # Process batches while we have enough frames
        while len(postprocess_yield_list) >= postprocess_batch_size:
            batch_to_process = postprocess_yield_list[:postprocess_batch_size]
            postprocess_yield_list = postprocess_yield_list[postprocess_batch_size:]
            yield from self._postprocess_and_yield_batch(
                inference_state,
                batch_to_process,
                unconfirmed_obj_ids_per_frame,
                unconfirmed_status_delay,
                reverse,
            )
    # Flush any remaining frames in the postprocess buffer
    if len(postprocess_yield_list) > 0:
        yield from self._postprocess_and_yield_batch(
            inference_state,
            postprocess_yield_list,
            unconfirmed_obj_ids_per_frame,
            unconfirmed_status_delay,
            reverse,
        )
    if self.is_multiplex:
        # log the bucket utilization stats
        # bucket utilization rate is total valid objects / total capacity -> represents rooms for improvement
        # subscription rate is total valid objects / total number of buckets -> represents speedup
        total_valid_objects = 0
        total_num_buckets = 0
        for state in inference_state["sam2_inference_states"]:
            assert (
                len(state["obj_ids"])
                == state["multiplex_state"].total_valid_entries
            )
            total_valid_objects += len(state["obj_ids"])
            total_num_buckets += state["multiplex_state"].num_buckets
        if total_num_buckets > 0:
            bucket_utilization_rate = (
                total_valid_objects / (total_num_buckets * self.bucket_capacity)
            ) * 100
            subscription_rate = (total_valid_objects / total_num_buckets) * 100
            logger.info(
                f"Bucket utilization rate: {bucket_utilization_rate:.2f}%, subscription rate: {subscription_rate:.2f}%"
            )

def _postprocess_and_yield_batch(
    self,
    inference_state,
    batch_to_process,
    unconfirmed_obj_ids_per_frame,
    unconfirmed_status_delay,
    reverse,
):
    """Post-process one batch of buffered frames and yield `(frame_idx, output)` pairs.

    Private helper of `propagate_in_video`, shared by the regular batch path and
    the final flush.

    Args:
        inference_state: state dict of the running propagation.
        batch_to_process: list of `(frame_idx, out, removed_obj_ids_snapshot)` tuples.
        unconfirmed_obj_ids_per_frame: dict frame_idx -> unconfirmed object ids.
        unconfirmed_status_delay: how many frames ahead (behind when `reverse`) the
            confirmation status is looked up.
        reverse: whether propagation runs backwards.

    Yields:
        `(frame_idx, postprocessed_out)` on rank 0, `(frame_idx, DUMMY_OUTPUT)` on
        every other rank.
    """
    with torch.profiler.record_function(
        "Sam3MultiplexTracking.postprocess_output_batched"
    ):
        if self.rank != 0:
            # No output on other GPUs
            for yield_frame_idx, _, _ in batch_to_process:
                yield yield_frame_idx, DUMMY_OUTPUT
            return

        # Prepare batched inputs for postprocessing
        H_video, W_video = (
            inference_state["orig_height"],
            inference_state["orig_width"],
        )
        num_frames = inference_state["num_frames"]
        batched_outs = []
        frame_indices = []
        for yield_frame_idx, yield_out, removed_obj_ids_snapshot in batch_to_process:
            suppressed_obj_ids = yield_out["suppressed_obj_ids"]
            if reverse:
                unconfirmed_status_frame_idx = (
                    yield_frame_idx - unconfirmed_status_delay
                )
            else:
                unconfirmed_status_frame_idx = (
                    yield_frame_idx + unconfirmed_status_delay
                )
            # clamp the look-up frame to the valid frame range
            unconfirmed_status_frame_idx = max(
                0, min(unconfirmed_status_frame_idx, num_frames - 1)
            )
            unconfirmed_obj_ids = unconfirmed_obj_ids_per_frame.get(
                unconfirmed_status_frame_idx, None
            )
            batched_outs.append(
                (
                    yield_out,
                    removed_obj_ids_snapshot,
                    suppressed_obj_ids,
                    unconfirmed_obj_ids,
                )
            )
            frame_indices.append(yield_frame_idx)
            # Cache frame outputs
            self._cache_frame_outputs(
                inference_state,
                yield_frame_idx,
                yield_out["obj_id_to_mask"],
                suppressed_obj_ids=suppressed_obj_ids,
                removed_obj_ids=removed_obj_ids_snapshot,
                unconfirmed_obj_ids=unconfirmed_obj_ids,
            )

        if self.postprocess_batch_size > 1:
            # Process all frames in batch
            postprocessed_outs = self._postprocess_output_batched(
                H_video, W_video, batched_outs
            )
        else:
            # Process each frame individually but output together
            postprocessed_outs = [
                self._postprocess_output(inference_state, *frame_args)
                for frame_args in batched_outs
            ]

        # Yield results
        for yield_frame_idx, postprocessed_out in zip(
            frame_indices, postprocessed_outs
        ):
            yield yield_frame_idx, postprocessed_out
def _run_single_frame_inference(
self,
inference_state,
frame_idx,
reverse,
is_instance_processing=False,
):
"""
Perform inference on a single frame and get its inference results. This would
also update `inference_state`.

Args:
inference_state: state dict; "sam2_inference_states", "tracker_metadata",
"feature_cache" and "previous_stages_out" are written by this method.
frame_idx: index of the frame to process.
reverse: forwarded to `_det_track_one_frame`.
is_instance_processing: accepted but not referenced in this method's body.

Returns:
A dict with "obj_id_to_mask", "obj_id_to_score" and "obj_id_to_sam2_score",
plus rank-0 bookkeeping entries ("removed_obj_ids", "suppressed_obj_ids",
"unconfirmed_obj_ids", "frame_stats").
"""
# prepare inputs
input_batch = inference_state["input_batch"]
tracker_states_local = inference_state["sam2_inference_states"]
# fall back to the cached empty prompt when this frame has no geometric prompt
geometric_prompt = (
inference_state["constants"]["empty_geometric_prompt"]
if inference_state["per_frame_geometric_prompt"][frame_idx] is None
else inference_state["per_frame_geometric_prompt"][frame_idx]
)
# The text feature cache is rebuilt on every call from the cached backbone output,
# keyed by the current text batch. `inference_state["backbone_out"]` starts out as
# None, so it must have been populated before this point (subscripting None raises).
text_batch_key = tuple(input_batch.find_text_batch)
inference_state["feature_cache"]["text"] = {
text_batch_key: {
"language_features": inference_state["backbone_out"][
"language_features"
],
"language_mask": inference_state["backbone_out"]["language_mask"],
}
}
# run inference for the current frame
# (the last element returned by `_det_track_one_frame` is discarded)
(
obj_id_to_mask,
obj_id_to_score,
tracker_states_local_new,
tracker_metadata_new,
frame_stats,
_,
) = self._det_track_one_frame(
frame_idx=frame_idx,
num_frames=inference_state["num_frames"],
reverse=reverse,
input_batch=input_batch,
geometric_prompt=geometric_prompt,
tracker_states_local=tracker_states_local,
tracker_metadata_prev=inference_state["tracker_metadata"],
feature_cache=inference_state["feature_cache"],
orig_vid_height=inference_state["orig_height"],
orig_vid_width=inference_state["orig_width"],
is_image_only=inference_state["is_image_only"],
)
# update inference state
inference_state["sam2_inference_states"] = tracker_states_local_new
inference_state["tracker_metadata"] = tracker_metadata_new
# use a dummy string in "previous_stages_out" to indicate this frame has outputs
inference_state["previous_stages_out"][frame_idx] = "_THIS_FRAME_HAS_OUTPUTS_"
if self.rank == 0:
self._cache_frame_outputs(inference_state, frame_idx, obj_id_to_mask)
out = {
"obj_id_to_mask": obj_id_to_mask,
"obj_id_to_score": obj_id_to_score, # first frame detection score
"obj_id_to_sam2_score": tracker_metadata_new[
"obj_id_to_sam2_score_frame_wise"
][frame_idx],
}
# removed_obj_ids is only needed on rank 0 to handle hotstart delay buffer
if self.rank == 0:
rank0_metadata = tracker_metadata_new["rank0_metadata"]
removed_obj_ids = rank0_metadata["removed_obj_ids"]
out["removed_obj_ids"] = removed_obj_ids
out["suppressed_obj_ids"] = rank0_metadata["suppressed_obj_ids"][frame_idx]
out["frame_stats"] = frame_stats
if self.masklet_confirmation_enable:
# select the object ids whose confirmation status is still UNCONFIRMED
status = rank0_metadata["masklet_confirmation"]["status"]
is_unconfirmed = status == MaskletConfirmationStatus.UNCONFIRMED.value
out["unconfirmed_obj_ids"] = tracker_metadata_new["obj_ids_all_gpu"][
is_unconfirmed
].tolist()
else:
out["unconfirmed_obj_ids"] = []
return out
def _postprocess_output(
    self,
    inference_state,
    out,
    removed_obj_ids=None,
    suppressed_obj_ids=None,
    unconfirmed_obj_ids=None,
):
    """Turn the raw per-frame result `out` into the output dict for one frame.

    Objects with an all-false mask, and objects listed in any of the three
    "hide" id collections, are dropped. Boxes are derived from the remaining
    masks and normalised by the video width / height, and when more than one
    mask remains the tracker's object-wise non-overlapping constraint is applied.

    Args:
        inference_state: state dict; "orig_height" and "orig_width" are read.
        out: dict with "obj_id_to_mask", "obj_id_to_score" and
            "obj_id_to_sam2_score" (optionally "frame_stats").
        removed_obj_ids: optional iterable of object ids to hide.
        suppressed_obj_ids: optional iterable of object ids to hide.
        unconfirmed_obj_ids: optional iterable of object ids to hide.

    Returns:
        Dict of numpy arrays "out_obj_ids", "out_probs", "out_boxes_xywh" and
        "out_binary_masks", plus "frame_stats"; "out_centers" is added when
        `self.running_in_prod` is set.
    """
    obj_id_to_mask = out["obj_id_to_mask"]
    curr_obj_ids = sorted(obj_id_to_mask.keys())
    H_video, W_video = inference_state["orig_height"], inference_state["orig_width"]
    if len(curr_obj_ids) == 0:
        out_obj_ids = torch.zeros(0, dtype=torch.int64)
        out_probs = torch.zeros(0, dtype=torch.float32)
        out_binary_masks = torch.zeros(0, H_video, W_video, dtype=torch.bool)
        out_boxes_xywh = torch.zeros(0, 4, dtype=torch.float32)
    else:
        out_obj_ids = torch.tensor(curr_obj_ids, dtype=torch.int64, device="cpu")
        out_probs = torch.tensor(
            [out["obj_id_to_score"][obj_id] for obj_id in curr_obj_ids], device="cpu"
        )
        # objects without a tracker score default to 0.0
        out_sam2_probs = torch.tensor(
            [
                (
                    out["obj_id_to_sam2_score"][obj_id]
                    if obj_id in out["obj_id_to_sam2_score"]
                    else 0.0
                )
                for obj_id in curr_obj_ids
            ],
            device="cpu",
        )
        out_binary_masks = torch.cat(
            [obj_id_to_mask[obj_id] for obj_id in curr_obj_ids], dim=0
        )
        assert out_binary_masks.dtype == torch.bool
        keep = out_binary_masks.any(dim=(1, 2)).cpu()  # remove masks with 0 areas
        # hide outputs for those object IDs in `obj_ids_to_hide`
        obj_ids_to_hide = []
        if suppressed_obj_ids is not None:
            obj_ids_to_hide.extend(suppressed_obj_ids)
        if removed_obj_ids is not None:
            obj_ids_to_hide.extend(removed_obj_ids)
        if unconfirmed_obj_ids is not None:
            obj_ids_to_hide.extend(unconfirmed_obj_ids)
        if len(obj_ids_to_hide) > 0:
            obj_ids_to_hide_t = torch.tensor(
                obj_ids_to_hide, dtype=torch.int64, device="cpu"
            )
            keep &= ~torch.isin(out_obj_ids, obj_ids_to_hide_t)
        # slice those valid entries from the original outputs
        keep_idx = torch.nonzero(keep, as_tuple=True)[0]
        mask_device = out_binary_masks.device
        if mask_device.type == "cuda":
            # pinned host memory lets the index copy run asynchronously
            keep_idx_gpu = keep_idx.pin_memory().to(
                device=mask_device, non_blocking=True
            )
        else:
            # `pin_memory()` raises on hosts without CUDA, so when the masks
            # live on another device (e.g. CPU) do a plain transfer instead
            keep_idx_gpu = keep_idx.to(device=mask_device)
        out_obj_ids = torch.index_select(out_obj_ids, 0, keep_idx)
        out_probs = torch.index_select(out_probs, 0, keep_idx)
        out_sam2_probs = torch.index_select(out_sam2_probs, 0, keep_idx)
        out_binary_masks = torch.index_select(out_binary_masks, 0, keep_idx_gpu)
        if perflib.is_enabled:
            out_boxes_xyxy = perf_masks_to_boxes(
                out_binary_masks, out_obj_ids.tolist()
            )
        else:
            out_boxes_xyxy = masks_to_boxes(out_binary_masks)
        out_boxes_xywh = box_xyxy_to_xywh(out_boxes_xyxy)  # convert to xywh format
        # normalize boxes
        out_boxes_xywh[..., 0] /= W_video
        out_boxes_xywh[..., 1] /= H_video
        out_boxes_xywh[..., 2] /= W_video
        out_boxes_xywh[..., 3] /= H_video
    # apply non-overlapping constraints on the existing masklets
    # (only reachable with >1 masks, i.e. never for the empty case above)
    if out_binary_masks.shape[0] > 1:
        assert len(out_binary_masks) == len(out_sam2_probs)
        out_binary_masks = (
            self.tracker._apply_object_wise_non_overlapping_constraints(
                out_binary_masks.unsqueeze(1),
                out_sam2_probs.unsqueeze(1).to(out_binary_masks.device),
                background_value=0,
            ).squeeze(1)
        ) > 0
    prod_outputs = {}
    if self.running_in_prod:
        with torch.profiler.record_function(
            "Sam3MultiplexTracking._postprocess_output.prod_outputs"
        ):
            out_centers = torch.zeros(
                out_binary_masks.shape[0],
                2,
                dtype=torch.float32,
                device=out_binary_masks.device,
            )
            y_coords = torch.arange(
                H_video, device=out_binary_masks.device, dtype=torch.float32
            )
            x_coords = torch.arange(
                W_video, device=out_binary_masks.device, dtype=torch.float32
            )
            y_grid = y_coords.view(1, H_video, 1)
            x_grid = x_coords.view(1, 1, W_video)
            with torch.profiler.record_function(
                "Sam3MultiplexTracking._postprocess_output.prod_outputs.center"
            ):
                # mask centroid, normalised by the video size
                weighted_y_sum = (out_binary_masks * y_grid).sum(dim=(1, 2))
                weighted_x_sum = (out_binary_masks * x_grid).sum(dim=(1, 2))
                # clamp guards the division for empty masks
                total_mass = out_binary_masks.sum(dim=(1, 2)).clamp_min(1e-6)
                center_y = weighted_y_sum / total_mass / H_video
                center_x = weighted_x_sum / total_mass / W_video
                out_centers[:, 0] = center_x
                out_centers[:, 1] = center_y
            with torch.profiler.record_function(
                "Sam3MultiplexTracking._postprocess_output.prod_outputs.to_cpu"
            ):
                prod_outputs["out_centers"] = out_centers.cpu().numpy()
    outputs = {
        "out_obj_ids": out_obj_ids.cpu().numpy(),
        "out_probs": out_probs.cpu().numpy(),
        "out_boxes_xywh": out_boxes_xywh.cpu().numpy(),
        "out_binary_masks": out_binary_masks.cpu().numpy(),
        "frame_stats": out.get("frame_stats", None),
    } | prod_outputs
    return outputs
def _postprocess_output_batched(
self,
H_video,
W_video,
batched_outs,
):
"""
Batched version of _postprocess_output that batches GPU computations
(keep filtering, box computation) across frames for efficiency.
Args:
H_video: Video height
W_video: Video width
batched_outs: List of tuples, each containing:
(out, removed_obj_ids, suppressed_obj_ids, unconfirmed_obj_ids)
where out is the output dict from _run_single_frame_inference
Returns:
List of output dicts, one per frame in batched_outs
"""
batch_size = len(batched_outs)
if batch_size == 0:
return []
# ========== Phase 1: Collect per-frame data ==========
# We'll track: frame_data[i] = (obj_ids, probs, sam2_probs, masks, keep_mask, frame_stats)
# or None if frame has no objects
frame_data = []
device = None
for (
out,
removed_obj_ids,
suppressed_obj_ids,
unconfirmed_obj_ids,
) in batched_outs:
obj_id_to_mask = out["obj_id_to_mask"]
curr_obj_ids = sorted(obj_id_to_mask.keys())
frame_stats = out.get("frame_stats", None)
if len(curr_obj_ids) == 0:
frame_data.append((None, None, None, None, None, frame_stats))
continue
out_obj_ids = torch.tensor(curr_obj_ids, dtype=torch.int64, device="cpu")
obj_id_to_score_dict = out["obj_id_to_score"]
obj_id_to_sam2_score = out["obj_id_to_sam2_score"]
if device is None:
device = obj_id_to_mask[curr_obj_ids[0]].device
default_sam2_score = torch.zeros((), dtype=torch.float32, device=device)
probs_list = []
sam2_probs_list = []
binary_masks_list = []
for obj_id in curr_obj_ids:
probs_list.append(obj_id_to_score_dict[obj_id])
sam2_probs_list.append(
obj_id_to_sam2_score.get(obj_id, default_sam2_score)
)
binary_masks_list.append(obj_id_to_mask[obj_id])
out_probs = torch.tensor(probs_list, dtype=torch.float32, device="cpu")
out_sam2_probs_gpu = torch.stack(sam2_probs_list)
out_binary_masks = torch.cat(binary_masks_list, dim=0)
# Compute keep mask (which objects to hide)
obj_ids_to_hide = []
if suppressed_obj_ids is not None:
obj_ids_to_hide.extend(suppressed_obj_ids)
if removed_obj_ids is not None:
obj_ids_to_hide.extend(removed_obj_ids)
if unconfirmed_obj_ids is not None:
obj_ids_to_hide.extend(unconfirmed_obj_ids)
if len(obj_ids_to_hide) > 0:
obj_ids_to_hide_t = torch.tensor(obj_ids_to_hide, dtype=torch.int64, device="cpu")
hide_mask = torch.isin(out_obj_ids, obj_ids_to_hide_t)
else:
hide_mask = torch.zeros(len(out_obj_ids), dtype=torch.bool, device="cpu")
frame_data.append(
(
out_obj_ids,
out_probs,
out_sam2_probs_gpu,
out_binary_masks,
hide_mask,
frame_stats,
)
)
# ========== Phase 2: Batch concatenate masks for GPU operations ==========
# Collect frames with objects
frames_with_objects = []
frame_obj_counts = [] # Number of objects per frame (for frames with objects only)
all_masks_list = []
all_hide_masks_list = []
for i, data in enumerate(frame_data):
if data[0] is not None:
frames_with_objects.append(i)
frame_obj_counts.append(data[0].shape[0])
all_masks_list.append(data[3]) # binary_masks
all_hide_masks_list.append(data[4]) # hide_mask
# Handle case where all frames have 0 objects
if len(frames_with_objects) == 0:
outputs = []
for data in frame_data:
output_dict = {
"out_obj_ids": np.zeros(0, dtype=np.int64),
"out_probs": np.zeros(0, dtype=np.float32),
"out_boxes_xywh": np.zeros((0, 4), dtype=np.float32),
"out_binary_masks": np.zeros((0, H_video, W_video), dtype=bool),
"frame_stats": data[5],
}
if self.running_in_prod:
output_dict["out_centers"] = np.zeros((0, 2), dtype=np.float32)
outputs.append(output_dict)
return outputs
# Concatenate all masks for batched GPU operations
all_masks = torch.cat(all_masks_list, dim=0)
all_hide_masks = torch.cat(all_hide_masks_list, dim=0)
# ========== Phase 3: Batched keep mask computation on GPU ==========
# Compute which masks have non-zero area (batched on GPU)
has_area = all_masks.any(dim=(1, 2)) # GPU operation
# Combine with hide mask (move hide_mask to GPU for the operation)
all_hide_masks_gpu = all_hide_masks.to(device=all_masks.device)
keep_mask_gpu = has_area & ~all_hide_masks_gpu
# Get keep indices
keep_indices = torch.nonzero(keep_mask_gpu, as_tuple=True)[0]
if len(keep_indices) == 0:
# All objects filtered out
outputs = []
for data in frame_data:
output_dict = {
"out_obj_ids": np.zeros(0, dtype=np.int64),
"out_probs": np.zeros(0, dtype=np.float32),
"out_boxes_xywh": np.zeros((0, 4), dtype=np.float32),
"out_binary_masks": np.zeros((0, H_video, W_video), dtype=bool),
"frame_stats": data[5],
}
if self.running_in_prod:
output_dict["out_centers"] = np.zeros((0, 2), dtype=np.float32)
outputs.append(output_dict)
return outputs
# ========== Phase 4: Batched filtering and box computation ==========
# Filter masks on GPU
kept_masks = torch.index_select(all_masks, 0, keep_indices)
# Compute bounding boxes in batch on GPU
if perflib.is_enabled:
# Need to gather obj_ids for perflib
all_obj_ids_list = [frame_data[i][0] for i in frames_with_objects]
all_obj_ids_cat = torch.cat(all_obj_ids_list, dim=0)
kept_obj_ids_for_perf = torch.index_select(
all_obj_ids_cat, 0, keep_indices.cpu()
)
kept_boxes_xyxy = perf_masks_to_boxes(
kept_masks, kept_obj_ids_for_perf.tolist()
)
else:
kept_boxes_xyxy = masks_to_boxes(kept_masks)
kept_boxes_xywh = box_xyxy_to_xywh(kept_boxes_xyxy)
kept_boxes_xywh[..., 0] /= W_video
kept_boxes_xywh[..., 1] /= H_video
kept_boxes_xywh[..., 2] /= W_video
kept_boxes_xywh[..., 3] /= H_video
# ========== Phase 5: Split back to per-frame for non-overlapping ==========
# Compute how many objects were kept per frame
keep_indices_cpu = keep_indices.cpu()
keep_set = set(keep_indices_cpu.tolist())
kept_counts = []
offset = 0
for count in frame_obj_counts:
kept_in_frame = sum(
1 for j in range(offset, offset + count) if j in keep_set
)
kept_counts.append(kept_in_frame)
offset += count
# Split the kept tensors back to per-frame
split_masks = torch.split(kept_masks, kept_counts)
split_boxes = torch.split(kept_boxes_xywh, kept_counts)
# Also need to split obj_ids, probs, sam2_probs (filtering from original frame_data)
# We need to track which original indices were kept per frame
frame_kept_indices = [] # List of (local_kept_indices) per frame
offset = 0
for count in frame_obj_counts:
local_kept = []
for j in range(offset, offset + count):
if j in keep_set:
local_kept.append(j - offset) # Local index within frame
frame_kept_indices.append(local_kept)
offset += count
# ========== Phase 6: Apply non-overlapping per frame, collect final results ==========
final_results = [] # List of (frame_idx, obj_ids, probs, boxes, masks)
for idx, frame_i in enumerate(frames_with_objects):
data = frame_data[frame_i]
local_kept = frame_kept_indices[idx]
if len(local_kept) == 0:
continue
# Get the filtered data for this frame
local_kept_t = torch.tensor(local_kept, dtype=torch.int64, device="cpu")
out_obj_ids = torch.index_select(data[0], 0, local_kept_t)
out_probs = torch.index_select(data[1], 0, local_kept_t)
out_sam2_probs = torch.index_select(
data[2], 0, local_kept_t.to(data[2].device)
)
out_masks = split_masks[idx]
out_boxes = split_boxes[idx]
# Apply non-overlapping constraints (per-frame operation)
if out_masks.shape[0] > 1:
# Copy sam2_probs to CPU pinned memory then back to GPU for the operation
out_sam2_probs_cpu = torch.empty(
out_sam2_probs.shape, dtype=out_sam2_probs.dtype, device="cpu", pin_memory=True
)
out_sam2_probs_cpu.copy_(out_sam2_probs, non_blocking=True)
out_masks = (
self.tracker._apply_object_wise_non_overlapping_constraints(
out_masks.unsqueeze(1),
out_sam2_probs_cpu.unsqueeze(1).to(out_masks.device),
background_value=0,
).squeeze(1)
) > 0
final_results.append(
(frame_i, out_obj_ids, out_probs, out_boxes, out_masks)
)
# ========== Phase 6.5: Compute centers for prod ==========
all_centers = None
if self.running_in_prod and len(final_results) > 0:
with torch.profiler.record_function(
"Sam3MultiplexTracking._postprocess_output_batched.prod_outputs"
):
# Concatenate all masks for batched center computation
all_masks = torch.cat([r[4] for r in final_results], dim=0)
if all_masks.shape[0] > 0:
y_coords = torch.arange(
H_video, device=all_masks.device, dtype=torch.float32
)
x_coords = torch.arange(
W_video, device=all_masks.device, dtype=torch.float32
)
y_grid = y_coords.view(1, H_video, 1)
x_grid = x_coords.view(1, 1, W_video)
weighted_y_sum = (all_masks * y_grid).sum(dim=(1, 2))
weighted_x_sum = (all_masks * x_grid).sum(dim=(1, 2))
total_mass = all_masks.sum(dim=(1, 2)).clamp_min(1e-6)
center_y = weighted_y_sum / total_mass / H_video
center_x = weighted_x_sum / total_mass / W_video
all_centers = torch.stack([center_x, center_y], dim=1)
# Handle case where all filtered out
if len(final_results) == 0:
outputs = []
for data in frame_data:
output_dict = {
"out_obj_ids": np.zeros(0, dtype=np.int64),
"out_probs": np.zeros(0, dtype=np.float32),
"out_boxes_xywh": np.zeros((0, 4), dtype=np.float32),
"out_binary_masks": np.zeros((0, H_video, W_video), dtype=bool),
"frame_stats": data[5],
}
if self.running_in_prod:
output_dict["out_centers"] = np.zeros((0, 2), dtype=np.float32)
outputs.append(output_dict)
return outputs
# ========== Phase 7: Concatenate for batched GPU→CPU copy ==========
final_obj_ids = torch.cat([r[1] for r in final_results], dim=0)
final_probs = torch.cat([r[2] for r in final_results], dim=0)
final_boxes = torch.cat([r[3] for r in final_results], dim=0)
final_masks = torch.cat([r[4] for r in final_results], dim=0)
total_objects = final_obj_ids.shape[0]
# Initialize or resize batched CPU buffer
batched_buffer_size = self.postprocess_batch_size * self.max_num_objects
needs_buffer_init = not hasattr(self, "buffer_cpu_batched")
needs_buffer_resize = not needs_buffer_init and (
self.buffer_cpu_batched["out_binary_masks"].shape[0] != batched_buffer_size
or self.buffer_cpu_batched["out_binary_masks"].shape[1] != H_video
or self.buffer_cpu_batched["out_binary_masks"].shape[2] != W_video
)
if needs_buffer_init or needs_buffer_resize:
self.buffer_cpu_batched = {
"out_obj_ids": torch.zeros(
batched_buffer_size,
dtype=torch.int64,
device="cpu",
pin_memory=True,
),
"out_probs": torch.zeros(
batched_buffer_size,
dtype=torch.float32,
device="cpu",
pin_memory=True,
),
"out_boxes_xywh": torch.zeros(
batched_buffer_size,
4,
dtype=torch.float32,
device="cpu",
pin_memory=True,
),
"out_binary_masks": torch.zeros(
batched_buffer_size,
H_video,
W_video,
dtype=bool,
device="cpu",
pin_memory=True,
),
}
if self.running_in_prod:
self.buffer_cpu_batched["out_centers"] = torch.zeros(
batched_buffer_size,
2,
dtype=torch.float32,
device="cpu",
pin_memory=True,
)
self.buffer_cpu_batched["out_obj_ids"][:total_objects].copy_(final_obj_ids)
self.buffer_cpu_batched["out_probs"][:total_objects].copy_(final_probs)
self.buffer_cpu_batched["out_boxes_xywh"][:total_objects].copy_(final_boxes)
self.buffer_cpu_batched["out_binary_masks"][:total_objects].copy_(final_masks)
if all_centers is not None:
self.buffer_cpu_batched["out_centers"][:total_objects].copy_(all_centers)
# ========== Phase 8: Build output list ==========
# Create mapping from frame index to (offset, count) in the buffer
frame_to_offset_count = {}
offset = 0
for frame_i, obj_ids, _, _, _ in final_results:
count = obj_ids.shape[0]
frame_to_offset_count[frame_i] = (offset, count)
offset += count
outputs = []
for i, data in enumerate(frame_data):
frame_stats = data[5]
if i not in frame_to_offset_count:
# Frame has no objects (either originally or after filtering)
output_dict = {
"out_obj_ids": np.zeros(0, dtype=np.int64),
"out_probs": np.zeros(0, dtype=np.float32),
"out_boxes_xywh": np.zeros((0, 4), dtype=np.float32),
"out_binary_masks": np.zeros((0, H_video, W_video), dtype=bool),
"frame_stats": frame_stats,
}
if all_centers is not None:
output_dict["out_centers"] = np.zeros((0, 2), dtype=np.float32)
outputs.append(output_dict)
else:
buf_offset, num_objects = frame_to_offset_count[i]
output_dict = {
"out_obj_ids": self.buffer_cpu_batched["out_obj_ids"][
buf_offset : buf_offset + num_objects
]
.numpy()
.copy(),
"out_probs": self.buffer_cpu_batched["out_probs"][
buf_offset : buf_offset + num_objects
]
.numpy()
.copy(),
"out_boxes_xywh": self.buffer_cpu_batched["out_boxes_xywh"][
buf_offset : buf_offset + num_objects
]
.numpy()
.copy(),
"out_binary_masks": self.buffer_cpu_batched["out_binary_masks"][
buf_offset : buf_offset + num_objects
]
.numpy()
.copy(),
"frame_stats": frame_stats,
}
if all_centers is not None:
output_dict["out_centers"] = (
self.buffer_cpu_batched["out_centers"][
buf_offset : buf_offset + num_objects
]
.numpy()
.copy()
)
outputs.append(output_dict)
return outputs
def _cache_frame_outputs(
self,
inference_state,
frame_idx,
obj_id_to_mask,
suppressed_obj_ids=None,
removed_obj_ids=None,
unconfirmed_obj_ids=None,
):
if not inference_state.get("cache_frame_outputs", True):
return
if "cached_frame_outputs" not in inference_state:
inference_state["cached_frame_outputs"] = {}
objects_to_exclude = set()
if suppressed_obj_ids is not None:
objects_to_exclude.update(suppressed_obj_ids)
if removed_obj_ids is not None:
objects_to_exclude.update(removed_obj_ids)
if unconfirmed_obj_ids is not None:
objects_to_exclude.update(unconfirmed_obj_ids)
# This cache is only used for later fetch/refine output assembly. The tracker
# keeps its active low-res memory separately, so video-res masks should not pin VRAM.
inference_state["cached_frame_outputs"][frame_idx] = {
obj_id: self._cache_output_mask(mask)
for obj_id, mask in obj_id_to_mask.items()
if obj_id not in objects_to_exclude
}
def _build_sam2_output(
self, inference_state, frame_idx, refined_obj_id_to_mask=None
):
if frame_idx not in inference_state["cached_frame_outputs"]:
if refined_obj_id_to_mask is None:
return {}
return {
obj_id: self._cache_output_mask(mask)
for obj_id, mask in refined_obj_id_to_mask.items()
}
cached_outputs = inference_state["cached_frame_outputs"][frame_idx]
obj_id_to_mask = cached_outputs.copy()
# Update with refined masks if provided
if refined_obj_id_to_mask is not None:
for obj_id, refined_mask in refined_obj_id_to_mask.items():
assert refined_mask is not None, (
f"Refined mask data must be provided for obj_id {obj_id}"
)
obj_id_to_mask[obj_id] = self._cache_output_mask(refined_mask)
return obj_id_to_mask
@staticmethod
def _cache_output_mask(mask):
if torch.is_tensor(mask):
return mask.detach().to(device="cpu", non_blocking=True, copy=True)
return np.array(mask, copy=True)
def _compile_model(self):
"""Compile the SAM model with torch.compile for speedup."""
# TODO: compile SAM2 model components
is_compiled = getattr(self, "_model_is_compiled", False)
if is_compiled or not self.compile_model:
return
import torch._dynamo
# a larger cache size to hold varying number of shapes for torch.compile
# see https://github.com/pytorch/pytorch/blob/v2.5.1/torch/_dynamo/config.py#L42-L49
torch._dynamo.config.cache_size_limit = 128
torch._dynamo.config.accumulated_cache_size_limit = 2048
torch._dynamo.config.capture_scalar_outputs = True
torch._dynamo.config.suppress_errors = True
# Compile module components following https://www.internalfb.com/diff/D70935785
# skip compilation of `_encode_prompt` since it sometimes tiggger SymInt errors
# self._encode_prompt = clone_output_wrapper(
# torch.compile(self._encode_prompt, fullgraph=True, mode="max-autotune")
# )
## Compile SAM3 model components (matching OV: clone_output_wrapper(torch.compile(fn)))
if self.detector.backbone.language_backbone is not None:
self.detector.backbone.language_backbone.encoder.forward = clone_output_wrapper(
torch.compile(
self.detector.backbone.language_backbone.encoder.forward,
fullgraph=True,
mode="max-autotune",
)
)
self.detector.backbone.vision_backbone.forward = clone_output_wrapper(
torch.compile(
self.detector.backbone.vision_backbone.forward,
fullgraph=True,
mode="max-autotune",
)
)
self.detector.transformer.encoder.forward = clone_output_wrapper(
torch.compile(
self.detector.transformer.encoder.forward,
fullgraph=True,
mode="max-autotune",
)
)
self.detector.transformer.decoder.forward = clone_output_wrapper(
torch.compile(
self.detector.transformer.decoder.forward,
fullgraph=True,
mode="max-autotune",
dynamic=False, # note: FA decoder uses static shapes
)
)
self.detector.segmentation_head.forward = clone_output_wrapper(
torch.compile(
self.detector.segmentation_head.forward,
fullgraph=True,
mode="max-autotune",
)
)
## Compile SAM2 model components
self.tracker.maskmem_backbone.forward = compile_wrapper(
self.tracker.maskmem_backbone.forward,
mode="max-autotune",
fullgraph=True,
dynamic=False,
)
self.tracker.transformer.encoder.forward = shape_logging_wrapper(
compile_wrapper(
self.tracker.transformer.encoder.forward,
mode="max-autotune-no-cudagraphs",
fullgraph=True,
dynamic=True,
),
keep_kwargs=["src", "src_pos", "prompt", "prompt_pos"],
)
self.tracker.sam_mask_decoder.forward = compile_wrapper(
self.tracker.sam_mask_decoder.forward,
mode="max-autotune",
fullgraph=True,
dynamic=False, # Accuracy regression on True
)
sam3_video_base._associate_det_trk_compilable = compile_wrapper(
sam3_video_base._associate_det_trk_compilable,
mode="max-autotune-no-cudagraphs",
fullgraph=True,
dynamic=False,
)
self.tracker._suppress_object_pw_area_shrinkage = compile_wrapper(
self.tracker._suppress_object_pw_area_shrinkage,
mode="max-autotune-no-cudagraphs",
fullgraph=True,
dynamic=False,
)
self._model_is_compiled = True
def _warm_up_vg_propagation(self, inference_state, start_frame_idx=0):
    """
    Run dummy prompts and propagations so that compiled components see the
    input shapes they will be called with later.

    Three stages are run:
      1. text-prompt propagation (forward and reverse) with 0..num_obj_for_compile
         fake objects, repeated for a few rounds;
      2. direct calls of the tracker's transformer encoder with random inputs
         of varying sizes;
      3. text + box prompts with 1..max_num_kboxes random boxes.

    Args:
        inference_state: state dict to run the warm-up on.
        start_frame_idx: frame on which prompts are added and from which
            propagation starts.

    Returns:
        The (possibly replaced) inference_state.
    """
    num_objects_list = range(self.num_obj_for_compile + 1)
    num_rounds = 3
    # NOTE(review): new_det_thresh is saved and restored here but not changed
    # anywhere in this method; kept in case a callee modifies it.
    orig_new_det_thresh = self.new_det_thresh
    try:
        # ---- Stage 1: propagation with a varying number of fake objects ----
        for round_idx in range(num_rounds):
            for num_objects in num_objects_list:
                logger.info(
                    f"round {round_idx + 1}/{num_rounds} warming up model compilation -- simulating {num_objects}/{self.num_obj_for_compile} objects"
                )
                # Initialize text prompt and cache image features
                self.add_prompt(
                    inference_state, frame_idx=start_frame_idx, text_str="cat"
                )
                if num_objects > 0:
                    inference_state = self.add_fake_objects_to_inference_state(
                        inference_state, num_objects, frame_idx=start_frame_idx
                    )
                inference_state["tracker_metadata"]["rank0_metadata"].update(
                    {
                        "masklet_confirmation": {
                            "status": np.zeros(num_objects, dtype=np.int64),
                            "consecutive_det_num": np.zeros(
                                num_objects, dtype=np.int64
                            ),
                        }
                    }
                )
                for _ in self.propagate_in_video(
                    inference_state, start_frame_idx, reverse=False
                ):
                    pass
                for _ in self.propagate_in_video(
                    inference_state, start_frame_idx, reverse=True
                ):
                    pass
                self.reset_state(inference_state)
            logger.info(
                f"{round_idx + 1}/{num_rounds} warming up model compilation -- completed round {round_idx + 1} out of {num_rounds}"
            )

        # ---- Stage 2: SAM2 memory encoder with varying input shapes ----
        num_iters = 3
        tracker = self.tracker
        feat_size = tracker.sam_image_embedding_size**2  # 72 * 72 = 5184
        hidden_dim = tracker.hidden_dim  # 256
        mem_dim = tracker.mem_dim  # 64 for non-multiplex, 256 for multiplex
        is_multiplex = tracker.is_multiplex
        max_mem_frames = tracker.max_cond_frames_in_attn + tracker.num_maskmem
        max_ptr_frames = (
            tracker.max_cond_frames_in_attn + tracker.max_obj_ptrs_in_encoder
        )

        def _randn(*shape):
            return torch.randn(*shape, device=self.device)

        for _ in tqdm(range(num_iters)):
            for b in range(1, self.num_obj_for_compile + 1):
                for i in range(1, max_mem_frames):
                    for j in range(max_ptr_frames):
                        # src and src_pos have batch=b in both encoder variants
                        src = _randn(feat_size, b, hidden_dim)
                        src_pos = _randn(feat_size, b, hidden_dim)
                        if is_multiplex:
                            # Multiplex encoder: mem_dim == hidden_dim, uses decoupled cross-attention
                            # num_obj_ptr_tokens = j (since hidden_dim // mem_dim = 1)
                            num_obj_ptr_tokens = j
                            memory_seq_len = feat_size * i + num_obj_ptr_tokens
                            # memory has batch=num_buckets (b)
                            memory = _randn(memory_seq_len, b, hidden_dim)
                            memory_pos = _randn(memory_seq_len, b, hidden_dim)
                            # image and memory_image always have batch=1 (shared image features)
                            image = _randn(feat_size, 1, hidden_dim)
                            image_pos = _randn(feat_size, 1, hidden_dim)
                            memory_image = _randn(feat_size * i, 1, hidden_dim)
                            memory_image_pos = _randn(feat_size * i, 1, hidden_dim)
                            tracker.transformer.encoder.forward(
                                image=image,
                                src=src,
                                memory_image=memory_image,
                                memory=memory,
                                image_pos=image_pos,
                                src_pos=src_pos,
                                memory_image_pos=memory_image_pos,
                                memory_pos=memory_pos,
                                num_obj_ptr_tokens=num_obj_ptr_tokens,
                            )
                        else:
                            # Non-multiplex encoder: mem_dim = 64, uses standard cross-attention
                            # num_obj_ptr_tokens = (hidden_dim // mem_dim) * j = 4 * j
                            num_obj_ptr_tokens = (hidden_dim // mem_dim) * j
                            prompt_len = feat_size * i + num_obj_ptr_tokens
                            prompt = _randn(prompt_len, b, mem_dim)
                            prompt_pos = _randn(prompt_len, b, mem_dim)
                            tracker.transformer.encoder.forward(
                                src=src,
                                src_pos=src_pos,
                                prompt=prompt,
                                prompt_pos=prompt_pos,
                                num_obj_ptr_tokens=num_obj_ptr_tokens,
                            )

        # ---- Stage 3: different number of kbox ----
        for _ in tqdm(range(num_iters)):
            for num_kboxes in range(1, self.max_num_kboxes + 1):
                # Values in [0, 0.5): x + w and y + h stay below 1, which keeps
                # the boxes inside the normalized range add_prompt asserts on.
                kboxes = torch.rand(num_kboxes, 4, dtype=torch.float32) * 0.5
                logger.info(
                    "Warming up masks_to_boxes with %d kboxes.shape=%s",
                    num_kboxes,
                    kboxes.shape,
                )
                self.add_prompt(
                    inference_state,
                    frame_idx=start_frame_idx,
                    text_str="cat",
                    boxes_xywh=kboxes,
                    box_labels=[1] * len(kboxes),
                )
                for _ in self.propagate_in_video(
                    inference_state, start_frame_idx, reverse=False
                ):
                    pass
    finally:
        # Restore even if a warm-up step fails.
        self.new_det_thresh = orig_new_det_thresh
    return inference_state
def add_fake_objects_to_inference_state(
    self, inference_state, num_objects, frame_idx
):
    """
    Populate `inference_state` with `num_objects` synthetic all-ones objects.

    The objects (ids 0..num_objects-1) are registered with the tracker on
    `frame_idx`, their video-resolution masks are cached for every frame when
    this is rank 0, and `inference_state["tracker_metadata"]` is replaced by
    freshly built metadata describing them.

    Returns:
        The same `inference_state`, updated in place.
    """
    fake_obj_ids = np.arange(num_objects)
    mask_h, mask_w = self.tracker.maskmem_backbone.mask_downsampler.interpol_size
    fake_masks = torch.ones(len(fake_obj_ids), mask_h, mask_w, device=self.device)

    orig_height = inference_state["orig_height"]
    orig_width = inference_state["orig_width"]
    inference_state["sam2_inference_states"] = self._tracker_add_new_objects(
        frame_idx=frame_idx,
        num_frames=inference_state["num_frames"],
        new_obj_ids=fake_obj_ids,
        new_obj_masks=fake_masks,
        tracker_states_local=inference_state["sam2_inference_states"],
        orig_vid_height=orig_height,
        orig_vid_width=orig_width,
        feature_cache=inference_state["feature_cache"],
    )

    # Synthesize obj_id_to_mask data for cached_frame_outputs to support _build_sam2_output during warmup
    obj_id_to_mask = {}
    if num_objects > 0:
        # (num_objects, 1, H_video, W_video) after adding a channel dimension
        video_res_masks = F.interpolate(
            fake_masks.unsqueeze(1),
            size=(orig_height, orig_width),
            mode="bilinear",
            align_corners=False,
        )
        obj_id_to_mask = {
            obj_id: (obj_mask > 0.0).to(torch.bool)
            for obj_id, obj_mask in zip(fake_obj_ids, video_res_masks)
        }
    if self.rank == 0:
        for fidx in range(inference_state["num_frames"]):
            self._cache_frame_outputs(inference_state, fidx, obj_id_to_mask)

    def _zeros_long(*shape):
        return torch.zeros(*shape, dtype=torch.long, device=self.device)

    def _filled_bool(factory):
        return factory(num_objects, dtype=torch.bool, device=self.device)

    rank0_metadata = {
        "masklet_confirmation": {
            "status": np.zeros(num_objects, dtype=np.int64),
            "consecutive_det_num": np.zeros(num_objects, dtype=np.int64),
        },
        "removed_obj_ids": set(),
        "suppressed_obj_ids": defaultdict(set),
    }
    # gpu_metadata for hotstart tracking on GPU
    gpu_metadata = {
        "N_obj": num_objects,
        "obj_first_frame": _zeros_long(num_objects),
        "consecutive_unmatch_count": _zeros_long(num_objects),
        "trk_keep_alive": _filled_bool(torch.ones),
        "removed_mask": _filled_bool(torch.zeros),
        "overlap_pair_counts": _zeros_long(num_objects, num_objects),
        "last_occluded_tensor": _zeros_long(num_objects),
    }
    tracker_metadata = {
        "obj_ids_per_gpu": [np.arange(num_objects)],
        "obj_ids_all_gpu": np.arange(num_objects),  # Same as 1 GPU
        "num_obj_per_gpu": [num_objects],
        "obj_id_to_score": dict.fromkeys(range(num_objects), 1.0),
        "obj_id_to_sam2_score_frame_wise": defaultdict(dict),
        "obj_id_to_last_occluded": {},
        "max_obj_id": num_objects,
        "rank0_metadata": rank0_metadata,
        "gpu_metadata": gpu_metadata,
    }
    inference_state["tracker_metadata"] = tracker_metadata

    # Add num_buc_per_gpu for multiplex mode
    if self.is_multiplex:
        # Count actual buckets from the inference states
        bucket_count = self._count_buckets_in_states(
            inference_state["sam2_inference_states"]
        )
        tracker_metadata["num_buc_per_gpu"] = np.array(
            [bucket_count], dtype=np.int64
        )
    return inference_state
@torch.inference_mode()
@torch.autocast(device_type="cuda", dtype=torch.bfloat16)
def warm_up_compilation(self):
"""
Warm up the model by running a dummy inference to compile the model. This is
useful to avoid the compilation overhead in the first inference call.
"""
if not self.compile_model:
return
self._warm_up_complete = False
if self.device.type not in {"cuda", "mps"}:
raise RuntimeError(
f"The model must be on an accelerator for warm-up compilation, got {self.device=}."
)
# temporally set to single GPU temporarily for warm-up compilation
orig_rank = self.rank
orig_world_size = self.world_size
self.rank = self.detector.rank = 0
self.world_size = self.detector.world_size = 1
orig_recondition_every_nth_frame = self.recondition_every_nth_frame
# self.recondition_every_nth_frame = 2
# Get a random video
inference_state = self.init_state(resource_path="<load-zero-video-30>")
start_frame_idx = 0
# Run basic propagation warm-up
inference_state = self._warm_up_vg_propagation(inference_state, start_frame_idx)
logger.info("Warm-up compilation completed.")
# revert to the original GPU and rank
self.rank = self.detector.rank = orig_rank
self.world_size = self.detector.world_size = orig_world_size
self.recondition_every_nth_frame = orig_recondition_every_nth_frame
self._warm_up_complete = True
self.tracker.transformer.encoder.forward.set_logging(True)
@torch.inference_mode()
def add_prompt(
    self,
    inference_state,
    frame_idx,
    text_str=None,
    clear_old_points=True,
    points=None,
    point_labels=None,
    boxes_xywh=None,
    box_labels=None,
    clear_old_boxes=True,
    output_prob_thresh=0.5,
    preencoded_text_outputs=None,
):
    """
    Add a text and/or box prompt on a single frame. This method returns the
    inference outputs only on the prompted frame.

    Note that text prompts are NOT associated with a particular frame (i.e. they apply
    to all frames). However, we only run inference on the frame specified in `frame_idx`.

    Copied from sam3_demo.Sam3DemoMixin.add_prompt and simplified: point prompts
    are rejected, and previous boxes are always cleared.

    Args:
        inference_state: state dict; it is reset before the prompt is applied.
        frame_idx: frame to run inference on; must be in [0, num_frames).
        text_str: optional text prompt.
        clear_old_points, points, point_labels: must be left at their defaults.
        boxes_xywh: optional boxes in [xmin, ymin, width, height] format with
            normalized coordinates in [0, 1]; needs at least one box.
        box_labels: one label per box; required iff `boxes_xywh` is given.
        clear_old_boxes: must be True.
        output_prob_thresh: not used by this method.
        preencoded_text_outputs: optional pre-computed text encoder outputs,
            forwarded to `_init_backbone_out`.

    Returns:
        Tuple (frame_idx, postprocessed outputs on that frame).
    """
    logger.info("Running add_prompt on frame %d", frame_idx)

    num_frames = inference_state["num_frames"]
    assert text_str is not None or points is not None or boxes_xywh is not None, (
        "at least one type of prompt (text, points, boxes) must be provided"
    )
    assert 0 <= frame_idx < num_frames, (
        f"{frame_idx=} is out of range for a total of {num_frames} frames"
    )
    assert clear_old_boxes, "clear old boxes must be True"
    assert points is None and clear_old_points is True and point_labels is None, (
        "Point prompts not accepted"
    )

    # since it's a semantic prompt, we start over
    self.reset_state(inference_state)

    # 1) add text prompt
    if text_str is not None:
        inference_state["text_prompt"] = text_str
        # add the text prompt into the input batch (to be applied to *all* frames)
        inference_state["input_batch"].find_text_batch[0] = text_str
        text_id = self.TEXT_ID_FOR_TEXT
        for t in range(inference_state["num_frames"]):
            inference_state["input_batch"].find_inputs[t].text_ids[...] = text_id

    # 2) handle box prompt
    assert (boxes_xywh is not None) == (box_labels is not None)
    if boxes_xywh is not None:
        boxes_xywh = torch.as_tensor(boxes_xywh, dtype=torch.float32)
        box_labels = torch.as_tensor(box_labels, dtype=torch.long)
        # input boxes are expected to be [xmin, ymin, width, height] format
        # in normalized coordinates of range 0~1, similar to FA
        assert boxes_xywh.dim() == 2
        assert boxes_xywh.size(0) > 0 and boxes_xywh.size(-1) == 4
        assert box_labels.dim() == 1 and box_labels.size(0) == boxes_xywh.size(0)
        boxes_cxcywh = box_xywh_to_cxcywh(boxes_xywh)
        assert (boxes_xywh >= 0).all().item() and (boxes_xywh <= 1).all().item()
        assert (boxes_cxcywh >= 0).all().item() and (boxes_cxcywh <= 1).all().item()

        inference_state["per_frame_raw_box_input"][frame_idx] = (
            boxes_cxcywh,
            box_labels,
        )

        # handle the case of visual prompt (also added as an input box from the UI)
        _, _, geometric_prompt = self._get_visual_prompt(
            inference_state, frame_idx, boxes_cxcywh, box_labels
        )
        inference_state["per_frame_geometric_prompt"][frame_idx] = geometric_prompt

    with torch.profiler.record_function("add_prompt._init_backbone_out"):
        inference_state["backbone_out"] = self._init_backbone_out(
            inference_state, preencoded_text_outputs
        )

    out = self._run_single_frame_inference(
        inference_state,
        frame_idx,
        reverse=False,
    )
    return frame_idx, self._postprocess_output(inference_state, out)
def _init_backbone_out(self, inference_state, preencoded_text_outputs=None):
"""
Initialize a backbone_out dictionary and extract the text features.
Note that the visual features of each frame are not extracted here. They will be
extracted on the fly when running inference on each frame.
"""
input = inference_state["input_batch"]
device = self.device
backbone_out = {"img_batch_all_stages": input.img_batch}
if preencoded_text_outputs is None:
if self.detector.backbone.language_backbone is None:
raise RuntimeError("SAM3 text encoder is not loaded; preencoded_text_outputs is required.")
text_outputs = self.detector.backbone.forward_text(input.find_text_batch, device=device)
else:
text_outputs = {key: value.to(device=device, non_blocking=True) if torch.is_tensor(value) else value for key, value in preencoded_text_outputs.items()}
backbone_out.update(text_outputs)
return backbone_out
@torch.autocast(device_type="cuda", dtype=torch.bfloat16)
def forward(self, input: BatchedDatapoint, is_inference: bool = False):
    """
    This method is only used for benchmark eval (not used in the demo).

    Every text prompt in `input.find_text_batch` is added on frame 0 and
    propagated forward through the whole video. Object ids of later prompts
    are shifted so they do not collide with those of earlier prompts.

    The model is temporarily switched to a single-GPU configuration (rank 0,
    world size 1) to be compatible with the trainer; the original values are
    restored on exit, also when evaluation raises.

    Args:
        input: batched datapoint providing `find_metadatas`, `find_text_batch`
            and `raw_images`.
        is_inference: not used by this method.

    Returns:
        dict mapping the video id to the output of `prep_for_evaluator`.
    """
    # set the model to single GPU for benchmark evaluation (to be compatible with trainer)
    orig_rank = self.rank
    orig_world_size = self.world_size
    self.rank = self.detector.rank = 0
    self.world_size = self.detector.world_size = 1
    try:
        # get data
        text_prompt_ids = input.find_metadatas[0].original_category_id
        text_prompt_list = input.find_text_batch

        # loop over txt prompts
        tracking_res = defaultdict(dict)  # frame_idx --> {obj_id: mask}
        scores_labels = defaultdict(tuple)  # obj_id --> (score, text_prompt_id)
        inference_state = self.init_state(resource_path=input.raw_images)
        for prompt_id, prompt in zip(text_prompt_ids, text_prompt_list):
            self.add_prompt(inference_state, frame_idx=0, text_str=prompt)
            start_obj_id = max(scores_labels.keys(), default=-1) + 1  # prev max + 1

            # propagate the prompts
            obj_ids_this_prompt = set()
            for frame_idx, out in self.propagate_in_video(
                inference_state,
                start_frame_idx=0,
                max_frame_num_to_track=inference_state["num_frames"],
                reverse=False,
            ):
                # outputs may be torch tensors or already numpy arrays
                out_obj_ids = (
                    out["out_obj_ids"].numpy()
                    if isinstance(out["out_obj_ids"], torch.Tensor)
                    else out["out_obj_ids"]
                )
                out_binary_masks = (
                    out["out_binary_masks"].numpy()
                    if isinstance(out["out_binary_masks"], torch.Tensor)
                    else out["out_binary_masks"]
                )
                current_frame_res = tracking_res[frame_idx]
                for obj_id, mask in zip(out_obj_ids, out_binary_masks):
                    mask_tensor = torch.tensor(mask[None], dtype=torch.bool)
                    current_frame_res[obj_id + start_obj_id] = mask_tensor
                obj_ids_this_prompt.update(current_frame_res.keys())

            obj_id_to_score = inference_state["tracker_metadata"]["obj_id_to_score"]
            for obj_id, score in obj_id_to_score.items():
                if obj_id + start_obj_id in obj_ids_this_prompt:
                    score_tensor = torch.tensor(score, dtype=torch.float32)
                    scores_labels[obj_id + start_obj_id] = (score_tensor, prompt_id)

            self.reset_state(inference_state)

        video_id = input.find_metadatas[0].original_image_id[0].cpu().item()
        preds = self.prep_for_evaluator(input.raw_images, tracking_res, scores_labels)
    finally:
        # revert the model to the original GPU and rank, even on failure, so an
        # exception during eval does not leave the model in single-GPU mode
        self.rank = self.detector.rank = orig_rank
        self.world_size = self.detector.world_size = orig_world_size
    return {video_id: preds}
class Sam3MultiplexTrackingProd(Sam3MultiplexTracking):
    """
    Subclass of Sam3MultiplexTracking with support for batched processing.

    This class enables processing videos in batches rather than all at once by:
    1. Adding an `is_last_batch` parameter to control buffer flushing
    2. Persisting generator state (hotstart_buffer, hotstart_removed_obj_ids,
       unconfirmed_obj_ids_per_frame, postprocess_yield_list) in inference_state
       across generator instantiations

    This is useful for processing large videos in smaller chunks to manage memory
    or distribute processing across multiple calls.
    """

    @staticmethod
    def _make_generator_state():
        """Return a fresh, empty generator state used to persist buffers across batches."""
        return {
            "hotstart_buffer": [],
            "hotstart_removed_obj_ids": set(),
            "unconfirmed_obj_ids_per_frame": {},
            "postprocess_yield_list": [],
        }

    @torch.inference_mode()
    def init_state(
        self,
        resource_path,
        offload_video_to_cpu=False,
        async_loading_frames=False,
        use_cv2=False,
        input_is_mp4=False,
    ):
        """
        Build the parent inference state and attach an empty `generator_state`.

        All arguments are forwarded by keyword to the parent `init_state`.

        Returns:
            The inference state, with an extra "generator_state" entry.
        """
        inference_state = super().init_state(
            resource_path=resource_path,
            offload_video_to_cpu=offload_video_to_cpu,
            async_loading_frames=async_loading_frames,
            use_cv2=use_cv2,
            input_is_mp4=input_is_mp4,
        )
        # Initialize generator state for batched processing
        inference_state["generator_state"] = self._make_generator_state()
        return inference_state

    def reset_state(self, inference_state):
        """Reset the parent state, then replace `generator_state` with an empty one."""
        super().reset_state(inference_state)
        # Reset generator state for batched processing
        inference_state["generator_state"] = self._make_generator_state()

    def _postprocess_and_yield_frames(
        self,
        inference_state,
        frames,
        unconfirmed_obj_ids_per_frame,
        unconfirmed_status_delay,
        reverse,
    ):
        """
        Cache, postprocess and yield a group of frames that left the hotstart buffer.

        Args:
            inference_state: the inference state dict of the current video.
            frames: list of `(frame_idx, raw_out, removed_obj_ids_snapshot)` tuples.
            unconfirmed_obj_ids_per_frame: dict frame_idx -> unconfirmed object ids.
            unconfirmed_status_delay: number of frames to look ahead (or behind when
                `reverse` is True) for the confirmation status of a frame.
            reverse: whether propagation runs backwards in time.

        Yields:
            `(frame_idx, postprocessed_out)` on rank 0 and `(frame_idx, DUMMY_OUTPUT)`
            on every other rank, in the order given by `frames`.
        """
        with torch.profiler.record_function(
            "Sam3MultiplexTrackingProd.postprocess_output_batched"
        ):
            if self.rank == 0:
                # Prepare batched inputs for postprocessing
                H_video, W_video = (
                    inference_state["orig_height"],
                    inference_state["orig_width"],
                )
                num_frames = inference_state["num_frames"]
                batched_outs = []
                frame_indices = []
                for yield_frame_idx, yield_out, removed_obj_ids_snapshot in frames:
                    suppressed_obj_ids = yield_out["suppressed_obj_ids"]
                    # the confirmation status is read from a later frame (earlier
                    # when reversed), clamped to the valid frame range
                    unconfirmed_status_frame_idx = (
                        yield_frame_idx + unconfirmed_status_delay
                        if not reverse
                        else yield_frame_idx - unconfirmed_status_delay
                    )
                    unconfirmed_status_frame_idx = max(
                        0, min(unconfirmed_status_frame_idx, num_frames - 1)
                    )
                    unconfirmed_obj_ids = unconfirmed_obj_ids_per_frame.get(
                        unconfirmed_status_frame_idx, None
                    )
                    batched_outs.append(
                        (
                            yield_out,
                            removed_obj_ids_snapshot,
                            suppressed_obj_ids,
                            unconfirmed_obj_ids,
                        )
                    )
                    frame_indices.append(yield_frame_idx)
                    # Cache frame outputs
                    self._cache_frame_outputs(
                        inference_state,
                        yield_frame_idx,
                        yield_out["obj_id_to_mask"],
                        suppressed_obj_ids=suppressed_obj_ids,
                        removed_obj_ids=removed_obj_ids_snapshot,
                        unconfirmed_obj_ids=unconfirmed_obj_ids,
                    )
                # Process all frames in batch
                if self.postprocess_batch_size > 1:
                    postprocessed_outs = self._postprocess_output_batched(
                        H_video, W_video, batched_outs
                    )
                else:
                    # Process each frame individually but output together
                    postprocessed_outs = []
                    for (
                        yield_out,
                        removed_obj_ids_snapshot,
                        suppressed_obj_ids,
                        unconfirmed_obj_ids,
                    ) in batched_outs:
                        postprocessed_out = self._postprocess_output(
                            inference_state,
                            yield_out,
                            removed_obj_ids_snapshot,
                            suppressed_obj_ids,
                            unconfirmed_obj_ids,
                        )
                        postprocessed_outs.append(postprocessed_out)
                # Yield results
                for yield_frame_idx, postprocessed_out in zip(
                    frame_indices, postprocessed_outs
                ):
                    yield yield_frame_idx, postprocessed_out
            else:
                # No output on other GPUs
                for yield_frame_idx, _, _ in frames:
                    yield yield_frame_idx, DUMMY_OUTPUT

    @torch.inference_mode()
    def propagate_in_video(
        self,
        inference_state,
        start_frame_idx=None,
        max_frame_num_to_track=None,
        reverse=False,
        output_prob_thresh=0.5,
        compute_stability_score=False,
        is_instance_processing=False,
        is_last_batch=True,
        progress_callback=None,
    ):
        """
        Propagate the prompts to get grounding results for the entire video. This method
        is a generator and yields inference outputs for all frames in the range specified
        by `start_frame_idx`, `max_frame_num_to_track`, and `reverse`.

        Args:
            inference_state: the inference state dict of the current video.
            start_frame_idx, max_frame_num_to_track, reverse: passed to
                `_get_processing_order` to determine the frames to process.
            output_prob_thresh, compute_stability_score: accepted but not used here.
            is_instance_processing: forwarded to `_run_single_frame_inference`.
            is_last_batch: Whether this is the last batch in a batched processing scenario.
                When True (default), the hotstart buffer will be flushed at end_frame_idx.
                When False, the buffer is preserved in inference_state for the next batch.
                This flag should be set to False for all batches except the last one when
                processing a video in multiple batches.
            progress_callback: optional callable invoked on rank 0 as
                `progress_callback(frames_done, frames_total)` after each frame.

        Yields:
            `(frame_idx, output)` tuples; `output` is DUMMY_OUTPUT on ranks other than 0.

        NOTE(review): the generator state is written back to `inference_state` only
        after the loop finishes; if the caller stops iterating early the persisted
        state is not updated -- confirm that callers always exhaust the generator.
        """
        # compile the model (it's a no-op if the model is already compiled)
        # note that it's intentionally added to `self.propagate_in_video`, so that the first
        # `self.add_prompt` call will be done in eager mode to fill in the decoder buffers
        # such as positional encoding cache)
        self._compile_model()
        processing_order, end_frame_idx = self._get_processing_order(
            inference_state,
            start_frame_idx,
            max_frame_num_to_track,
            reverse=reverse,
        )
        # Store max_frame_num_to_track in feature_cache for downstream methods
        inference_state["feature_cache"]["tracking_bounds"] = {
            "max_frame_num_to_track": max_frame_num_to_track,
            "propagate_in_video_start_frame_idx": start_frame_idx,
        }
        # Initialize or retrieve generator state from inference_state to persist across batches
        if "generator_state" not in inference_state:
            inference_state["generator_state"] = self._make_generator_state()
        generator_state = inference_state["generator_state"]
        hotstart_buffer = generator_state["hotstart_buffer"]
        hotstart_removed_obj_ids = generator_state["hotstart_removed_obj_ids"]
        unconfirmed_obj_ids_per_frame = generator_state["unconfirmed_obj_ids_per_frame"]
        postprocess_yield_list = generator_state.get("postprocess_yield_list", [])
        # when deciding whether to output a masklet on `yield_frame_idx`, we check whether the object is confirmed
        # in a future frame (`unconfirmed_frame_delay` frames after the current frame). For example, if we require
        # an object to be detected in 3 consecutive frames to be confirmed, then we look 2 frames in the future --
        # e.g., we output an object on frame 4 only if it becomes confirmed on frame 6.
        unconfirmed_status_delay = self.masklet_confirmation_consecutive_det_thresh - 1
        progress_done = 0
        progress_total = len(processing_order)
        for frame_idx in tqdm(
            processing_order, desc="propagate_in_video", disable=self.rank > 0
        ):
            out = self._run_single_frame_inference(
                inference_state,
                frame_idx,
                reverse,
                is_instance_processing=is_instance_processing,
            )
            progress_done += 1
            if progress_callback is not None and self.rank == 0:
                progress_callback(progress_done, progress_total)
            if self.hotstart_delay > 0:
                # accumulate the outputs for the first `hotstart_delay` frames
                hotstart_buffer.append([frame_idx, out])
                # update the object IDs removed by hotstart so that we don't output them
                if self.rank == 0:
                    hotstart_removed_obj_ids.update(out["removed_obj_ids"])
                    unconfirmed_obj_ids = out.get("unconfirmed_obj_ids", None)
                    if unconfirmed_obj_ids is not None:
                        unconfirmed_obj_ids_per_frame[frame_idx] = unconfirmed_obj_ids
                if frame_idx == end_frame_idx and is_last_batch:
                    # we reached the end of propagation -- yield all frames in the buffer
                    yield_list = hotstart_buffer
                    hotstart_buffer = []
                elif len(hotstart_buffer) >= self.hotstart_delay:
                    # we have enough frames -- yield and remove the first (oldest) frame from the buffer
                    yield_list = hotstart_buffer[:1]
                    hotstart_buffer = hotstart_buffer[1:]
                else:
                    # not enough frames yet -- skip yielding
                    yield_list = []
            else:
                yield_list = [(frame_idx, out)]  # output the current frame
            # Accumulate yield_list into postprocess_yield_list
            # Snapshot hotstart_removed_obj_ids at the time of accumulation to preserve
            # the correct state for each frame (important: this set is mutated over time)
            for yield_frame_idx, yield_out in yield_list:
                postprocess_yield_list.append(
                    (yield_frame_idx, yield_out, set(hotstart_removed_obj_ids))
                )
            # Process batch when we have enough frames
            while len(postprocess_yield_list) >= self.postprocess_batch_size:
                batch_to_process = postprocess_yield_list[: self.postprocess_batch_size]
                postprocess_yield_list = postprocess_yield_list[
                    self.postprocess_batch_size :
                ]
                yield from self._postprocess_and_yield_frames(
                    inference_state,
                    batch_to_process,
                    unconfirmed_obj_ids_per_frame,
                    unconfirmed_status_delay,
                    reverse,
                )
        # Handle remaining frames in hotstart buffer at end of last batch
        if is_last_batch and len(hotstart_buffer) > 0:
            for yield_frame_idx, yield_out in hotstart_buffer:
                postprocess_yield_list.append(
                    (yield_frame_idx, yield_out, set(hotstart_removed_obj_ids))
                )
            hotstart_buffer = []
        # Flush any remaining frames in the postprocess buffer (even partial
        # batches) so that the caller gets results as soon as possible. This is
        # especially important for the first batch where hotstart_delay causes
        # only a few frames to exit the hotstart buffer — without this flush
        # the client would have to wait for the next batch before receiving any
        # output, hurting time-to-first-frame.
        if len(postprocess_yield_list) > 0:
            yield from self._postprocess_and_yield_frames(
                inference_state,
                postprocess_yield_list,
                unconfirmed_obj_ids_per_frame,
                unconfirmed_status_delay,
                reverse,
            )
            postprocess_yield_list = []
        # Store the generator state back to inference_state for persistence across batches
        generator_state["postprocess_yield_list"] = postprocess_yield_list
        generator_state["hotstart_buffer"] = hotstart_buffer
        generator_state["hotstart_removed_obj_ids"] = hotstart_removed_obj_ids
        generator_state["unconfirmed_obj_ids_per_frame"] = unconfirmed_obj_ids_per_frame
        if self.is_multiplex:
            # log the bucket utilization stats
            # bucket utilization rate is total valid objects / total capacity -> represents rooms for improvement
            # subscription rate is total valid objects / total number of buckets -> represents speedup
            total_valid_objects = 0
            total_num_buckets = 0
            for state in inference_state["sam2_inference_states"]:
                assert (
                    len(state["obj_ids"])
                    == state["multiplex_state"].total_valid_entries
                )
                total_valid_objects += len(state["obj_ids"])
                total_num_buckets += state["multiplex_state"].num_buckets
            if total_num_buckets > 0:
                bucket_utilization_rate = (
                    total_valid_objects / (total_num_buckets * self.bucket_capacity)
                ) * 100
                subscription_rate = (total_valid_objects / total_num_buckets) * 100
                logger.info(
                    f"Bucket utilization rate: {bucket_utilization_rate:.2f}%, subscription rate: {subscription_rate:.2f}%"
                )
class Sam3MultiplexTrackingWithInteractivity(Sam3MultiplexTracking):
def __init__(
    self,
    use_prev_mem_frame=False,
    use_stateless_refinement=False,
    refinement_detector_cond_frame_removal_window=30 * 4,
    **kwargs,
):
    """
    Construct the tracker and record the interactivity options.

    Args:
        use_prev_mem_frame: bool, whether to condition on previous memory frames
            for adding points.
        use_stateless_refinement: bool, whether to enable stateless refinement behavior.
        refinement_detector_cond_frame_removal_window: int, a detector conditioning
            frame is removed if it is within this many frames of a user refined
            frame. Set to a large value (e.g. 10000) to always remove detector
            conditioning frames if there is any user refinement in the video.
        **kwargs: forwarded unchanged to the parent constructor.
    """
    super().__init__(**kwargs)
    interactivity_options = (
        ("use_prev_mem_frame", use_prev_mem_frame),
        ("use_stateless_refinement", use_stateless_refinement),
        (
            "refinement_detector_cond_frame_removal_window",
            refinement_detector_cond_frame_removal_window,
        ),
    )
    for option_name, option_value in interactivity_options:
        setattr(self, option_name, option_value)
@torch.inference_mode()
def init_state(
    self,
    resource_path,
    offload_video_to_cpu=False,
    async_loading_frames=False,
    use_cv2=False,
    input_is_mp4=False,
):
    """
    Create the parent inference state and add the interactivity-specific entries.

    All arguments are forwarded by keyword to the parent `init_state`. The returned
    state additionally holds an empty "action_history" list and, when the tracker
    runs in per-object mode, a single pre-built SAM2 inference state.
    """
    state = super().init_state(
        resource_path=resource_path,
        offload_video_to_cpu=offload_video_to_cpu,
        async_loading_frames=async_loading_frames,
        use_cv2=use_cv2,
        input_is_mp4=input_is_mp4,
    )
    # user actions are logged here and later drive the choice of propagation type
    state["action_history"] = []
    if not self.tracker.per_obj_inference:
        return state
    # in per_obj mode only 1 inference state is needed, so it is created up front
    state["sam2_inference_states"] = [self._init_new_sam2_state(state)]
    return state
def reset_state(self, inference_state):
    """Reset the parent state, empty the action history and, in per-object mode, rebuild the single SAM2 state."""
    super().reset_state(inference_state)
    # the existing list object is emptied in place rather than replaced
    history = inference_state["action_history"]
    history.clear()
    if self.tracker.per_obj_inference:
        fresh_sam2_state = self._init_new_sam2_state(inference_state)
        inference_state["sam2_inference_states"] = [fresh_sam2_state]
def _init_new_sam2_state(self, inference_state):
return self.tracker.init_state(
cached_features=inference_state["feature_cache"],
video_height=inference_state["orig_height"],
video_width=inference_state["orig_width"],
num_frames=inference_state["num_frames"],
)
def cancel_propagation(self, inference_state):
    """
    Record that the ongoing propagation was cancelled.

    This logs the cancellation and appends a "propagation_cancel" entry to the
    action history; the next propagation call reads that entry to decide what to run.
    """
    logger.info("Cancelling ongoing propagation.")
    cancel_record = {
        "action_type": "propagation_cancel",
        "obj_ids": None,
        "frame_idx": None,
    }
    self.add_action_history(inference_state, **cancel_record)
def fetch_and_process_single_frame_results(self, inference_state, frame_idx):
    """
    Postprocess the cached prediction of one frame.

    Reads the cached masks of `frame_idx` together with the global and per-frame
    scores from the tracker metadata, and runs `_postprocess_output` with the
    suppressed object ids recorded for that frame.

    Returns:
        `(frame_idx, postprocessed_output)`.
    """
    metadata = inference_state["tracker_metadata"]
    cached_masks = inference_state["cached_frame_outputs"][frame_idx]
    global_scores = metadata["obj_id_to_score"]
    # post processing - remove suppressed obj_ids
    suppressed = metadata["rank0_metadata"]["suppressed_obj_ids"][frame_idx]
    frame_sam2_scores = metadata["obj_id_to_sam2_score_frame_wise"][frame_idx]
    frame_out = {
        "obj_id_to_mask": cached_masks,
        "obj_id_to_score": global_scores,
        "obj_id_to_sam2_score": frame_sam2_scores,
    }
    processed = self._postprocess_output(
        inference_state, frame_out, suppressed_obj_ids=suppressed
    )
    return frame_idx, processed
@torch.inference_mode()
def propagate_in_video(
self,
inference_state,
start_frame_idx=None,
max_frame_num_to_track=None,
reverse=False,
output_prob_thresh=0.5,
compute_stability_score=False,
is_instance_processing=False,
is_last_batch: bool = False,
progress_callback=None,
):
"""
Generator that yields `(frame_idx, output)` for the frames selected by
`start_frame_idx`, `max_frame_num_to_track` and `reverse`.
The kind of propagation is derived from the action history:
- "propagation_full": delegate to the parent `propagate_in_video`.
- "propagation_fetch": re-yield the cached predictions without running the model.
- "propagation_partial": run the tracker only for the added/refined objects,
share their scores and low-res masks across ranks, and merge them with the
existing predictions.
The chosen propagation is itself appended to the action history.
Ranks other than 0 yield `DUMMY_OUTPUT` in the fetch and partial paths.
`output_prob_thresh`, `compute_stability_score` and `is_instance_processing`
are accepted but not referenced in this method.
`progress_callback`, if given, is called as `progress_callback(done, total)`;
in the fetch and partial paths this happens on rank 0 only.
"""
# step 1: check which type of propagation to run, should be the same for all GPUs.
propagation_type, obj_ids = self.parse_action_history_for_propagation(
inference_state
)
self.add_action_history(
inference_state,
action_type=propagation_type,
obj_ids=obj_ids,
frame_idx=start_frame_idx,
)
# step 2: run full VG propagation
if propagation_type == "propagation_full":
logger.info(f"Running full VG propagation (reverse={reverse}).")
yield from super().propagate_in_video(
inference_state,
start_frame_idx=start_frame_idx,
max_frame_num_to_track=max_frame_num_to_track,
reverse=reverse,
is_last_batch=is_last_batch,
progress_callback=progress_callback,
)
return
# step 3: run SAM2 partial propagation or direct fetch existing predictions
assert propagation_type in ["propagation_partial", "propagation_fetch"]
logger.info(
f"Running SAM2 propagation for objects {obj_ids} and merging it with existing VG predictions (reverse={reverse})."
if propagation_type == "propagation_partial"
else f"Fetching existing VG predictions without running any propagation (reverse={reverse})."
)
processing_order, _end_frame_idx = self._get_processing_order(
inference_state,
start_frame_idx=start_frame_idx,
max_frame_num_to_track=max_frame_num_to_track,
reverse=reverse,
)
tracker_metadata = inference_state["tracker_metadata"]
# if fetch just return from output
if propagation_type == "propagation_fetch":
progress_done = 0
progress_total = len(processing_order)
for frame_idx in tqdm(processing_order):
if self.rank == 0:
frame_idx, out = self.fetch_and_process_single_frame_results(
inference_state, frame_idx
)
progress_done += 1
if progress_callback is not None:
progress_callback(progress_done, progress_total)
yield frame_idx, out
else:
yield frame_idx, DUMMY_OUTPUT # no output for other GPUs
return
# get SAM2 inference states containing selected obj_ids
# (after the assert and the fetch early-return above, only "propagation_partial" reaches here)
if propagation_type == "propagation_partial":
# can be empty for GPUs where objects are not in their inference states
tracker_states_local = self._get_sam2_inference_states_by_obj_ids(
inference_state, obj_ids
)
for sam2_state in tracker_states_local:
self.tracker.propagate_in_video_preflight(
sam2_state, run_mem_encoder=True
)
progress_done = 0
progress_total = len(processing_order)
for frame_idx in tqdm(processing_order):
# run SAM2 propagation
if propagation_type == "propagation_partial":
self._prepare_backbone_feats(inference_state, frame_idx, reverse)
obj_ids_local, low_res_masks_local, sam2_scores_local = (
self._propogate_tracker_one_frame_local_gpu(
tracker_states_local,
frame_idx=frame_idx,
reverse=reverse,
run_mem_encoder=True,
)
)
# broadcast refined object sam2 scores and masks to all GPUs
# handle multiple objects that can be located on different GPUs
refined_obj_data = {} # obj_id -> (score, mask_video_res)
# Collect data for objects on this GPU
local_obj_data = {}
for obj_id in obj_ids:
obj_rank = self._get_gpu_id_by_obj_id(inference_state, obj_id)
if self.rank == obj_rank and obj_id in obj_ids_local:
refined_obj_idx = obj_ids_local.index(obj_id)
refined_mask_low_res = low_res_masks_local[
refined_obj_idx
] # (H_low_res, W_low_res)
refined_score = sam2_scores_local[refined_obj_idx]
# Keep low resolution for broadcasting to reduce communication cost
local_obj_data[obj_id] = (refined_score, refined_mask_low_res)
# Broadcast data from each GPU that has refined objects
# every rank walks `obj_ids` in the same order and issues one broadcast per object
if self.world_size > 1:
for obj_id in obj_ids:
obj_rank = self._get_gpu_id_by_obj_id(inference_state, obj_id)
if self.rank == obj_rank:
# This GPU has the object, broadcast its data
data_to_broadcast = local_obj_data.get(obj_id, None)
data_list = [data_to_broadcast]
self.broadcast_python_obj_cpu(data_list, src=obj_rank)
if data_to_broadcast is not None:
refined_obj_data[obj_id] = data_to_broadcast
# NOTE(review): this condition is the exact complement of the `if` above, so it behaves like `else`
elif self.rank != obj_rank:
# This GPU doesn't have the object, receive data
data_list = [None]
self.broadcast_python_obj_cpu(data_list, src=obj_rank)
if data_list[0] is not None:
refined_obj_data[obj_id] = data_list[0]
else:
# Single GPU case
refined_obj_data = local_obj_data
# Update SAM2 scores for all refined objects
for obj_id, (refined_score, _) in refined_obj_data.items():
# After broadcast_python_obj_cpu in multi-GPU, tensors may become numpy scalars
# Ensure it's a GPU tensor for consistency with base class behavior
if not isinstance(refined_score, torch.Tensor):
refined_score = torch.tensor(
refined_score, dtype=torch.float32, device=self.device
)
tracker_metadata["obj_id_to_sam2_score_frame_wise"][
frame_idx
].update({obj_id: refined_score})
if self.rank == 0:
# get predictions from SAM2 inference states, it includes the original
# VG predictions and the refined predictions from interactivity.
# Prepare refined masks dictionary - upscale to video resolution after broadcast
refined_obj_id_to_mask = {}
for obj_id, (_, refined_mask_low_res) in refined_obj_data.items():
refined_mask_video_res = (
self._convert_low_res_mask_to_video_res(
refined_mask_low_res, inference_state
)
) # (1, H_video, W_video) bool
refined_obj_id_to_mask[obj_id] = refined_mask_video_res
obj_id_to_mask = self._build_sam2_output(
inference_state, frame_idx, refined_obj_id_to_mask
)
out = {
"obj_id_to_mask": obj_id_to_mask,
"obj_id_to_score": tracker_metadata["obj_id_to_score"],
"obj_id_to_sam2_score": tracker_metadata[
"obj_id_to_sam2_score_frame_wise"
][frame_idx],
}
suppressed_obj_ids = tracker_metadata["rank0_metadata"][
"suppressed_obj_ids"
][frame_idx]
self._cache_frame_outputs(
inference_state,
frame_idx,
obj_id_to_mask,
suppressed_obj_ids=suppressed_obj_ids,
)
# NOTE(review): same lookup expression as the one just before `_cache_frame_outputs` -- looks redundant, confirm
suppressed_obj_ids = tracker_metadata["rank0_metadata"][
"suppressed_obj_ids"
][frame_idx]
progress_done += 1
if progress_callback is not None:
progress_callback(progress_done, progress_total)
yield (
frame_idx,
self._postprocess_output(
inference_state, out, suppressed_obj_ids=suppressed_obj_ids
),
)
else:
yield frame_idx, DUMMY_OUTPUT # no output for other GPUs
def add_action_history(
    self, inference_state, action_type, frame_idx=None, obj_ids=None
):
    """
    Append one entry to the action history, which is used to automatically decide
    what to do during propagation.

    Args:
        inference_state: state dict holding the "action_history" list.
        action_type: one of ["add", "remove", "refine"] + ["propagation_full",
            "propagation_partial", "propagation_fetch", "propagation_cancel"].
        frame_idx: frame the action refers to, if any.
        obj_ids: object ids the action refers to, if any.
    """
    instance_actions = ["add", "remove", "refine"]
    propagation_actions = [
        "propagation_full",
        "propagation_partial",
        "propagation_fetch",
        "propagation_cancel",
    ]
    assert action_type in instance_actions + propagation_actions, (
        f"Invalid action type: {action_type}, must be one of {instance_actions + propagation_actions}"
    )
    history = inference_state["action_history"]
    history.append(
        {
            "type": action_type,
            "frame_idx": frame_idx,
            "obj_ids": obj_ids,
        }
    )
def _has_object_been_refined(self, inference_state, obj_id):
if "action_history" not in inference_state:
return False
action_history = inference_state["action_history"]
for action in action_history:
if action["type"] in ["add", "refine"] and action.get("obj_ids"):
if obj_id in action["obj_ids"]:
return True
return False
def parse_action_history_for_propagation(self, inference_state):
    """
    Decide which propagation to run next, taking cancelled propagations into account.

    If the last recorded action is a "propagation_cancel", the action before it is
    replayed (skipping one "propagation_fetch" if that is what precedes the cancel).
    When there is nothing to go back to, full propagation is requested. Otherwise
    the decision is delegated to `_parse_action_history_for_propagation`.

    Returns:
        `(action_type, obj_ids)` -- the type of the action to replay, or the
        propagation type chosen by `_parse_action_history_for_propagation`,
        together with the object ids it applies to (or None).
    """
    action_history = inference_state["action_history"]
    if action_history and action_history[-1]["type"] == "propagation_cancel":
        if len(action_history) == 1:
            # only one action and it is cancel, we do full propagation
            return "propagation_full", None
        # last action is cancel, we go back to the action before cancel
        action_before_cancelation = action_history[-2]
        # the action before cancellation can be a propagation_fetch from running both forward
        # and backward propagation as in webdemo interface, in that case we go back one more step
        if action_before_cancelation["type"] == "propagation_fetch":
            if len(action_history) < 3:
                # nothing precedes the fetch (indexing [-3] would raise IndexError),
                # so treat it like a lone cancel and run full propagation
                return "propagation_full", None
            action_before_cancelation = action_history[-3]
        return action_before_cancelation["type"], action_before_cancelation.get(
            "obj_ids", None
        )
    return self._parse_action_history_for_propagation(
        action_history, inference_state["num_frames"]
    )
def _parse_action_history_for_propagation(self, action_history, num_frames):
"""
Parse the actions in history before the last propagation and prepare for the next propagation.
We support multiple actions (add/remove/refine) between two propagations. If we had an action
history similar to this ["propagate", "add", "refine", "remove", "add"], the next propagation
would remove the removed object, and also propagate the two added/refined objects.
Returns:
propagation_type: one of ["propagation_full", "propagation_partial", "propagation_fetch"]
- "propagation_full": run VG propagation for all objects
- "propagation_partial": run SAM2 propagation for selected objects, useful for add/refine actions
- "propagation_fetch": fetch existing VG predictions without running any propagation
- "propagation_cancel": this will be handled in parse_action_history_for_propagation() not this function.
obj_ids: list of object ids to run SAM2 propagation on if propagation_type is "propagation_partial".
TODO: (Jie) this function works for our current workflows, but may need more tests to ensure it works
correctly with different action histories for future workflows.
"""
if len(action_history) == 0:
# we run propagation for the first time
return "propagation_full", None
if "propagation" in action_history[-1]["type"]:
if action_history[-1]["type"] in ["propagation_fetch"]:
# last propagation is direct fetch, we fetch existing predictions
return "propagation_fetch", None
elif action_history[-1]["type"] in [
"propagation_partial",
"propagation_full",
]:
# we do fetch prediction if we have already run propagation twice or we have run
# propagation once and it is from the first frame or last frame.
if (
len(action_history) > 1
and action_history[-2]["type"]
in ["propagation_partial", "propagation_full"]
) or action_history[-1]["frame_idx"] in [
0,
num_frames - 1,
]:
# we have run both forward and backward partial/full propagation
return "propagation_fetch", None
else:
# we have run partial/full forward or backward propagation once, need run it for the rest of the frames
return action_history[-1]["type"], action_history[-1]["obj_ids"]
# parse actions since last propagation
obj_ids = []
for action in action_history[::-1]:
if "propagation" in action["type"]:
# we reached the last propagation action, stop parsing
break
if action["type"] in ["add", "refine"]:
obj_ids.extend(action["obj_ids"])
# else action["type"] == "remove": noop
obj_ids = list(set(obj_ids)) if len(obj_ids) > 0 else None
propagation_type = (
"propagation_partial" if obj_ids is not None else "propagation_fetch"
)
return propagation_type, obj_ids
def remove_object(self, inference_state, obj_id, frame_idx=None, is_user_action=False):
    """
    We try to remove object from sam2 states on every GPU, it will do nothing
    for states without this object.

    Args:
        inference_state: the inference state dict of the current video.
        obj_id: id of the object to remove.
        frame_idx: optional frame whose cached results are re-processed and
            returned after the removal. Defaults to None (no results returned),
            so callers that only need the removal can omit it.
        is_user_action: when True, a "remove" entry is added to the action history.

    Returns:
        `(frame_idx, out)`; `out` is None unless `frame_idx` is given and this is rank 0.
    """
    obj_rank = self._get_gpu_id_by_obj_id(inference_state, obj_id)
    if obj_rank is None:
        # Object was already removed (e.g., by hotstart heuristics during
        # propagation). Log a warning and skip SAM2 state and metadata
        # removal, but still record action history and clean up cached outputs.
        logger.warning(
            f"Object {obj_id} not found in any GPU (already removed). "
            f"Skipping SAM2 state and metadata removal."
        )
    else:
        tracker_states_local = inference_state["sam2_inference_states"]
        if self.rank == obj_rank:
            self._tracker_remove_objects(tracker_states_local, [obj_id])
        # update metadata
        tracker_metadata = inference_state["tracker_metadata"]
        _obj_ids = tracker_metadata["obj_ids_per_gpu"][obj_rank]
        tracker_metadata["obj_ids_per_gpu"][obj_rank] = _obj_ids[_obj_ids != obj_id]
        tracker_metadata["num_obj_per_gpu"][obj_rank] = len(
            tracker_metadata["obj_ids_per_gpu"][obj_rank]
        )
        tracker_metadata["obj_ids_all_gpu"] = np.concatenate(
            tracker_metadata["obj_ids_per_gpu"]
        )
        tracker_metadata["obj_id_to_score"].pop(obj_id, None)
        # tracker_metadata["max_obj_id"] # we do not reuse the object id, so we do not update it here
    if is_user_action:
        self.add_action_history(
            inference_state, action_type="remove", obj_ids=[obj_id]
        )
    # Clean up cached frame outputs to remove references to the deleted object
    if "cached_frame_outputs" in inference_state:
        for frame_cache in inference_state["cached_frame_outputs"].values():
            if obj_id in frame_cache:
                del frame_cache[obj_id]
    out = None
    if frame_idx is not None and self.rank == 0:
        frame_idx, out = self.fetch_and_process_single_frame_results(
            inference_state, frame_idx
        )
    return frame_idx, out
def _get_gpu_id_by_obj_id(self, inference_state, obj_id):
"""
Locate GPU ID for a given object.
"""
obj_ids_per_gpu = inference_state["tracker_metadata"]["obj_ids_per_gpu"]
for rank, obj_ids in enumerate(obj_ids_per_gpu):
if obj_id in obj_ids:
return rank
return None # object not found in any GPU
def _get_sam2_inference_states_by_obj_ids(self, inference_state, obj_ids):
"""
Get the SAM2 inference states that contain the given object ids.
This is used to run partial SAM2 propagation on a single object/bucket.
Possibly multiple or zero states can be returned.
"""
states = [
state
for state in inference_state["sam2_inference_states"]
if set(obj_ids) & set(state["obj_ids"])
]
return states
def _prepare_backbone_feats(self, inference_state, frame_idx, reverse):
input_batch = inference_state["input_batch"]
feature_cache = inference_state["feature_cache"]
num_frames = inference_state["num_frames"]
geometric_prompt = (
inference_state["constants"]["empty_geometric_prompt"]
if inference_state["per_frame_geometric_prompt"][frame_idx] is None
else inference_state["per_frame_geometric_prompt"][frame_idx]
)
_ = self.run_backbone_and_detection(
frame_idx=frame_idx,
num_frames=num_frames,
reverse=reverse,
input_batch=input_batch,
geometric_prompt=geometric_prompt,
feature_cache=feature_cache,
)
@torch.inference_mode()
def add_prompt(
    self,
    inference_state,
    frame_idx,
    text_str=None,
    clear_old_points=True,
    points=None,
    point_labels=None,
    boxes_xywh=None,
    box_labels=None,
    clear_old_boxes=True,
    output_prob_thresh=0.5,
    obj_id=None,
    rel_coordinates=True,
    preencoded_text_outputs=None,
):
    """
    Add a prompt on `frame_idx`, routing by prompt kind.

    - With `points`: an instance (SAM2) prompt. `obj_id` is required, and
      `text_str` / `boxes_xywh` must be None. If `preencoded_text_outputs` is
      given, its tensors are moved to `self.device` and stored as the text entry
      of the feature cache first. The call is handed to `add_sam2_new_points`.
    - Without `points`: a SAM3 prompt handled by the parent `add_prompt`, with
      `use_batched_grounding` switched off for the duration of the call and
      restored afterwards (also on error).

    Returns:
        Whatever the selected handler returns.
    """
    if points is None:
        # SAM3 prompts — disable batched grounding for single-frame add_prompt
        saved_batched_flag = self.use_batched_grounding
        self.use_batched_grounding = False
        try:
            return super().add_prompt(
                inference_state,
                frame_idx,
                text_str=text_str,
                clear_old_points=clear_old_points,
                points=points,
                point_labels=point_labels,
                boxes_xywh=boxes_xywh,
                box_labels=box_labels,
                clear_old_boxes=clear_old_boxes,
                output_prob_thresh=output_prob_thresh,
                preencoded_text_outputs=preencoded_text_outputs,
            )
        finally:
            self.use_batched_grounding = saved_batched_flag

    if preencoded_text_outputs is not None:
        # seed the text cache, keyed by the current text batch
        text_batch_key = tuple(inference_state["input_batch"].find_text_batch)
        text_outputs = {}
        for key, value in preencoded_text_outputs.items():
            if torch.is_tensor(value):
                value = value.to(device=self.device, non_blocking=True)
            text_outputs[key] = value
        inference_state["feature_cache"]["text"] = {text_batch_key: text_outputs}
    # SAM2 instance prompts
    assert text_str is None and boxes_xywh is None, (
        "When points are provided, text_str and boxes_xywh must be None."
    )
    assert obj_id is not None, (
        "When points are provided, obj_id must be provided."
    )
    return self.add_sam2_new_points(
        inference_state,
        frame_idx,
        obj_id=obj_id,
        points=points,
        labels=point_labels,
        clear_old_points=clear_old_points,
        rel_coordinates=rel_coordinates,
        use_prev_mem_frame=self.use_prev_mem_frame,
    )
@torch.inference_mode()
def add_sam2_new_points(
    self,
    inference_state,
    frame_idx,
    obj_id,
    points,
    labels,
    clear_old_points,
    rel_coordinates=True,
    use_prev_mem_frame=False,
):
    """Add a new point prompt to SAM2, refining an existing object or adding a new one.

    Passing an ``obj_id`` that is already tracked refines that object; passing an
    unknown ``obj_id`` registers a new object (subject to ``self.max_num_objects``).
    ``use_prev_mem_frame=False`` disables cross attention to previous memory frames
    (the flag is forwarded to ``self.tracker.add_new_points``).

    Args:
        inference_state: Session state dict; this method reads and mutates
            ``tracker_metadata`` and ``sam2_inference_states`` among other keys.
        frame_idx: Frame the prompt applies to.
        obj_id: Object id to refine or create; must not be None.
        points, labels, clear_old_points, rel_coordinates: Forwarded unchanged to
            ``self.tracker.add_new_points``. An empty ``points`` on an image-only
            session triggers a fallback that re-feeds the stored mask input instead.

    Returns:
        ``(frame_idx, output)``. On rank 0 ``output`` is the result of
        ``self._postprocess_output`` built from all objects' masks on this frame
        (including objects not touched by the current points). On every other
        rank, and when the object limit blocks a new object, ``output`` is None.
    """
    assert obj_id is not None, "obj_id must be provided to add new points"
    tracker_metadata = inference_state["tracker_metadata"]
    if tracker_metadata == {}:
        # initialize masklet metadata if it's uninitialized (empty dict)
        tracker_metadata.update(self._initialize_metadata())
    # Rank that currently owns this object, or None if the object is unknown.
    obj_rank = self._get_gpu_id_by_obj_id(inference_state, obj_id)
    # prepare feature
    self._prepare_backbone_feats(inference_state, frame_idx, reverse=False)
    object_has_been_refined = self._has_object_been_refined(inference_state, obj_id)
    if (
        obj_rank is not None
        and self.use_stateless_refinement
        and not object_has_been_refined
    ):
        # The first time we start refinement on the object, we remove it.
        logger.info(
            f"[rank={self.rank}] Removing object {obj_id} before refinement."
        )
        self.remove_object(inference_state, obj_id, is_user_action=False)
        # Resetting obj_rank routes the object through the "new object" branch below.
        obj_rank = None
    elif obj_rank is not None and not object_has_been_refined:
        # Extract the object into its own singleton inference state if it belongs to a batch
        if self.rank == obj_rank and not self.tracker.per_obj_inference:
            tracker_states = self._get_sam2_inference_states_by_obj_ids(
                inference_state, [obj_id]
            )
            assert len(tracker_states) == 1
            # Check if this is a batched state (contains multiple objects)
            sam2_state = tracker_states[0]
            if len(sam2_state["obj_ids"]) > 1:
                logger.info(
                    f"[rank={self.rank}] Extracting object {obj_id} into singleton inference state."
                )
                self._extract_object_to_singleton_state(
                    inference_state, obj_id, obj_rank
                )
    if obj_rank is None:
        # new object, we assign it a GPU and create a new inference state if limit allows
        num_prev_obj = np.sum(tracker_metadata["num_obj_per_gpu"])
        if num_prev_obj >= self.max_num_objects:
            logger.warning(
                f"add_sam2_new_points: cannot add a new object as we are already tracking {num_prev_obj=} "
                f"masklets (under {self.max_num_objects=})"
            )
            return frame_idx, None
        new_det_gpu_ids = self._assign_new_det_to_gpus(
            new_det_num=1,
            prev_workload_per_gpu=tracker_metadata["num_obj_per_gpu"],
        )
        obj_rank = new_det_gpu_ids[0]
        # get sam2 inference state for the new object
        # (sam2_state is only bound on the owning rank; later uses are guarded by
        # the same `self.rank == obj_rank` check)
        if self.rank == obj_rank:
            if self.tracker.per_obj_inference:
                sam2_state = inference_state["sam2_inference_states"][0]
            else:
                # for batched inference, we create a new inference state
                sam2_state = self._init_new_sam2_state(inference_state)
                inference_state["sam2_inference_states"].append(sam2_state)
        # update metadata
        tracker_metadata["obj_ids_per_gpu"][obj_rank] = np.concatenate(
            [
                tracker_metadata["obj_ids_per_gpu"][obj_rank],
                np.array([obj_id], dtype=np.int64),
            ]
        )
        tracker_metadata["num_obj_per_gpu"][obj_rank] = len(
            tracker_metadata["obj_ids_per_gpu"][obj_rank]
        )
        tracker_metadata["obj_ids_all_gpu"] = np.concatenate(
            tracker_metadata["obj_ids_per_gpu"]
        )
        tracker_metadata["max_obj_id"] = max(tracker_metadata["max_obj_id"], obj_id)
        logger.info(
            f"[rank={self.rank}] Adding new object with id {obj_id} at frame {frame_idx}."
        )
        self.add_action_history(
            inference_state, "add", frame_idx=frame_idx, obj_ids=[obj_id]
        )
    else:
        # existing object, for refinement
        if self.rank == obj_rank:
            tracker_states = self._get_sam2_inference_states_by_obj_ids(
                inference_state, [obj_id]
            )
            assert len(tracker_states) == 1, (
                f"[rank={self.rank}] Multiple SAM2 inference states found for the same object id."
            )
            sam2_state = tracker_states[0]
        # log
        logger.info(
            f"[rank={self.rank}] Refining existing object with id {obj_id} at frame {frame_idx}."
        )
        self.add_action_history(
            inference_state, "refine", frame_idx=frame_idx, obj_ids=[obj_id]
        )
    # assign higher score to added/refined object
    tracker_metadata["obj_id_to_score"][obj_id] = 1.0
    tracker_metadata["obj_id_to_sam2_score_frame_wise"][frame_idx][obj_id] = (
        torch.tensor(1.0, dtype=torch.float32, device=self.device)
    )
    if self.rank == 0:
        # Rank-0 bookkeeping: a user-prompted object is no longer treated as
        # removed or suppressed, and its confirmation entry is overwritten.
        rank0_metadata = tracker_metadata.get("rank0_metadata", {})
        if "removed_obj_ids" in rank0_metadata:
            rank0_metadata["removed_obj_ids"].discard(obj_id)
        if "suppressed_obj_ids" in rank0_metadata:
            for frame_id in rank0_metadata["suppressed_obj_ids"]:
                rank0_metadata["suppressed_obj_ids"][frame_id].discard(obj_id)
        if "masklet_confirmation" in rank0_metadata:
            obj_ids_all_gpu = tracker_metadata["obj_ids_all_gpu"]
            obj_indices = np.where(obj_ids_all_gpu == obj_id)[0]
            if len(obj_indices) > 0:
                obj_idx = obj_indices[0]
                if obj_idx < len(rank0_metadata["masklet_confirmation"]["status"]):
                    # NOTE(review): the literal 1 presumably corresponds to a
                    # "confirmed" MaskletConfirmationStatus value — confirm.
                    rank0_metadata["masklet_confirmation"]["status"][obj_idx] = 1
                    rank0_metadata["masklet_confirmation"]["consecutive_det_num"][
                        obj_idx
                    ] = self.masklet_confirmation_consecutive_det_thresh
    if self.rank == obj_rank:
        # Only the owning rank runs the tracker on the prompt.
        should_fallback_to_original_mask = (
            len(points) == 0 and inference_state["is_image_only"]
        )
        if should_fallback_to_original_mask:
            # Probe for a usable stored mask; without one the fallback is abandoned
            # and the (empty) points go through add_new_points instead.
            mask_input = self._get_mask_input(sam2_state, frame_idx, obj_id)
            if mask_input is None or 0 in mask_input.shape:
                logger.warning(
                    f"Cannot retrieve original mask input for obj_id {obj_id} at frame {frame_idx} to fallback."
                )
                should_fallback_to_original_mask = False
        if should_fallback_to_original_mask:
            # When user cancels all points on an image, we recover the original mask
            # by re-feeding the detector mask to SAM2.
            mask_input = self._get_mask_input(sam2_state, frame_idx, obj_id)
            # clear out states related to this object to have a fresh start
            self.tracker.clear_all_points_in_frame(
                sam2_state, frame_idx, obj_id, need_output=False
            )
            frame_idx, obj_ids, low_res_masks, video_res_masks = (
                self.tracker.add_new_mask(
                    sam2_state,
                    frame_idx,
                    obj_id,
                    mask_input,
                )
            )
        else:
            frame_idx, obj_ids, low_res_masks, video_res_masks = (
                self.tracker.add_new_points(
                    inference_state=sam2_state,
                    frame_idx=frame_idx,
                    obj_id=obj_id,
                    points=points,
                    labels=labels,
                    clear_old_points=clear_old_points,
                    rel_coordinates=rel_coordinates,
                    use_prev_mem_frame=use_prev_mem_frame,
                )
            )
        if video_res_masks is not None and len(video_res_masks) > 0:
            video_res_masks = fill_holes_in_mask_scores(
                video_res_masks,  # shape (N, 1, H_video, W_video)
                fill_hole_area=self.fill_hole_area,
                sprinkle_removal_area=self.sprinkle_removal_area,
                fill_holes=True,
                remove_sprinkles=True,
            )
        # TODO: will this cause issue when user switching to refine another object?
        # Since the mem encoder has already run for the current input points?
        # FIX: Synchronize consolidated_frame_inds with actual point/mask
        # inputs before propagate_in_video_preflight. Two issues can cause
        # the `all_consolidated_frame_inds == input_frames_inds` assertion
        # to fail:
        #   1) VG detector conditioning frames in mask_inputs_per_obj without
        #      corresponding point inputs (stale VG entries).
        #   2) Previously consolidated point-input frames (from earlier
        #      add_points) whose consolidated_frame_inds entries were lost
        #      during subsequent propagation.
        # We fix both by: (a) clearing mask-only inputs, (b) rebuilding
        # consolidated_frame_inds from the remaining inputs, excluding
        # temp output frames (which preflight will add itself).
        # NOTE(review): step (a) runs for every object in sam2_state and also
        # after the fallback branch above, where the mask just re-added via
        # add_new_mask has no point input and would therefore be dropped here —
        # confirm this is intended.
        # (a) Clear detector-only mask inputs
        for _obj_idx in list(sam2_state["mask_inputs_per_obj"].keys()):
            _point_frames = set(
                sam2_state["point_inputs_per_obj"].get(_obj_idx, {}).keys()
            )
            _mask_only_frames = [
                f
                for f in list(sam2_state["mask_inputs_per_obj"][_obj_idx].keys())
                if f not in _point_frames
            ]
            for f in _mask_only_frames:
                sam2_state["mask_inputs_per_obj"][_obj_idx].pop(f, None)
        # (b) Rebuild consolidated_frame_inds from remaining inputs
        _input_frames = set()
        for _oi in sam2_state["point_inputs_per_obj"]:
            _input_frames.update(sam2_state["point_inputs_per_obj"][_oi].keys())
        for _oi in sam2_state["mask_inputs_per_obj"]:
            _input_frames.update(sam2_state["mask_inputs_per_obj"][_oi].keys())
        # Exclude temp output frames — preflight will consolidate those
        _temp_frames = set()
        for _obj_temp in sam2_state["temp_output_dict_per_obj"].values():
            _temp_frames.update(_obj_temp["cond_frame_outputs"].keys())
            _temp_frames.update(_obj_temp["non_cond_frame_outputs"].keys())
        _prev_frames = _input_frames - _temp_frames
        # Classify each remaining input frame by whether the consolidated
        # output_dict already holds it as a conditioning frame.
        _cond = set()
        _non_cond = set()
        for f in _prev_frames:
            if f in sam2_state["output_dict"].get("cond_frame_outputs", {}):
                _cond.add(f)
            else:
                _non_cond.add(f)
        sam2_state["consolidated_frame_inds"] = {
            "cond_frame_outputs": _cond,
            "non_cond_frame_outputs": _non_cond,
        }
        self.tracker.propagate_in_video_preflight(sam2_state, run_mem_encoder=True)
        if not inference_state["is_image_only"]:
            # Clear detector conditioning frames when user clicks are received to allow
            # model updating masks on these frames. It is a noop if user is refining on the
            # detector conditioning frames or adding new objects.
            self.clear_detector_added_cond_frame_in_sam2(
                sam2_state, obj_id, frame_idx
            )
    # fetch results from states and gather across GPUs
    # Use optimized caching approach to avoid reprocessing unmodified objects
    # (`obj_ids` / `video_res_masks` exist only on the owning rank; the `and`
    # short-circuit keeps other ranks from touching them)
    if self.rank == obj_rank and len(obj_ids) > 0:
        new_mask_data = (video_res_masks[obj_ids.index(obj_id)] > 0.0).to(
            torch.bool
        )
    else:
        new_mask_data = None
    # Broadcast the new mask data across all ranks for consistency
    if self.world_size > 1:
        data_list = [new_mask_data]
        self.broadcast_python_obj_cpu(data_list, src=obj_rank)
        new_mask_data = data_list[0]
    if self.rank == 0:
        obj_id_to_mask = self._build_sam2_output(
            inference_state,
            frame_idx,
            {obj_id: new_mask_data} if new_mask_data is not None else None,
        )
        # post processing - remove suppressed obj_ids
        obj_id_to_score = tracker_metadata["obj_id_to_score"]
        suppressed_obj_ids = tracker_metadata["rank0_metadata"][
            "suppressed_obj_ids"
        ][frame_idx]
        obj_id_to_sam2_score = tracker_metadata["obj_id_to_sam2_score_frame_wise"][
            frame_idx
        ]
        out = {
            "obj_id_to_mask": obj_id_to_mask,
            "obj_id_to_score": obj_id_to_score,
            "obj_id_to_sam2_score": obj_id_to_sam2_score,
        }
        self._cache_frame_outputs(
            inference_state,
            frame_idx,
            obj_id_to_mask,
            suppressed_obj_ids=suppressed_obj_ids,
        )
        return frame_idx, self._postprocess_output(
            inference_state, out, suppressed_obj_ids=suppressed_obj_ids
        )
    else:
        return frame_idx, None  # no output on other GPUs
def _get_mask_input(self, inference_state, frame_idx, obj_id):
    """Look up the mask prompt stored for ``obj_id`` on ``frame_idx``.

    Returns the stored tensor with its two leading dimensions squeezed
    (assumes it is stored as (1, 1, H, W), giving (H, W) — TODO confirm),
    or None (after logging) when no mask input exists for that frame.
    """
    per_frame_masks = inference_state["mask_inputs_per_obj"][
        self.tracker._obj_id_to_idx(inference_state, obj_id)
    ]
    if frame_idx in per_frame_masks:
        return per_frame_masks[frame_idx].squeeze(0, 1)  # (H, W)
    logger.info(
        f"frame {frame_idx} not in mask_inputs_per_frame for obj_id {obj_id}"
    )
    return None
def _gather_obj_id_to_mask_across_gpus(self, inference_state, obj_id_to_mask_local):
    """Stack this rank's low-res masks and all-gather them across ranks.

    Masks are ordered like ``tracker_metadata["obj_ids_per_gpu"][self.rank]``;
    an object missing from ``obj_id_to_mask_local`` gets a constant -1024.0
    placeholder of size (low_res_mask_size, low_res_mask_size).

    Returns:
        Tensor of shape (num_global_obj, H_mask, W_mask). With a single process
        this is just the local stack; otherwise the per-rank stacks (cast to
        float) concatenated in rank order. No resizing is performed here.
    """
    tracker_metadata = inference_state["tracker_metadata"]
    side = self.tracker.low_res_mask_size
    mask_hw = (side, side)

    # One entry per locally owned object, in metadata order.
    per_obj_masks = [
        obj_id_to_mask_local[local_id]
        if local_id in obj_id_to_mask_local
        else torch.full(mask_hw, -1024.0, device=self.device)
        for local_id in tracker_metadata["obj_ids_per_gpu"][self.rank]
    ]
    if per_obj_masks:
        local_stack = torch.stack(per_obj_masks, dim=0)  # (N, H, W)
        assert local_stack.shape[1:] == mask_hw
    else:
        local_stack = torch.zeros(0, side, side, device=self.device)

    if not self.world_size > 1:
        return local_stack

    # all_gather needs pre-sized receive buffers, one per rank, sized from the
    # per-rank object counts in the metadata.
    local_stack = local_stack.float().contiguous()
    peer_buffers = [
        local_stack.new_empty(peer_count, side, side)
        for peer_count in tracker_metadata["num_obj_per_gpu"]
    ]
    dist.all_gather(peer_buffers, local_stack)
    return torch.cat(peer_buffers, dim=0)
def _convert_low_res_mask_to_video_res(self, low_res_mask, inference_state):
    """
    Upsample one low-res mask to the video resolution and binarize it.

    Args:
        low_res_mask: Tensor of shape (H_low_res, W_low_res), or None
        inference_state: Provides "orig_height" and "orig_width"
    Returns:
        Bool tensor of shape (1, H_video, W_video) that is True where the
        bilinearly upsampled value is > 0; None when low_res_mask is None.
    """
    if low_res_mask is None:
        return None
    # interpolate() wants a batch and a channel axis: (H, W) -> (1, 1, H, W)
    batched_logits = low_res_mask.unsqueeze(0).unsqueeze(0).float()
    target_size = (inference_state["orig_height"], inference_state["orig_width"])
    upsampled = F.interpolate(
        batched_logits,
        size=target_size,
        mode="bilinear",
        align_corners=False,
    )  # (1, 1, H_video, W_video)
    # Drop only the batch axis so the result stays (1, H_video, W_video).
    binary_mask = upsampled.squeeze(0) > 0.0
    return binary_mask.to(torch.bool)
def clear_detector_added_cond_frame_in_sam2(
    self, sam2_state, obj_id, refined_frame_idx
):
    """Drop nearby mask-only conditioning frames so refinement can update them.

    A frame qualifies when ``obj_id`` has a mask input but no point input on it
    and it lies within ``self.refinement_detector_cond_frame_removal_window``
    frames of ``refined_frame_idx``. Each qualifying frame is cleared for every
    object in ``sam2_state`` via ``self.tracker.clear_all_points_in_frame``.
    Returns None.
    """
    target_idx = self.tracker._obj_id_to_idx(sam2_state, obj_id)
    max_distance = self.refinement_detector_cond_frame_removal_window
    # Collect first, clear afterwards, so the mask-input dict is not iterated
    # while the tracker is being asked to clear frames.
    frames_to_clear = [
        candidate
        for candidate in sam2_state["mask_inputs_per_obj"][target_idx]
        if candidate not in sam2_state["point_inputs_per_obj"][target_idx]
        and abs(candidate - refined_frame_idx) <= max_distance
    ]
    if not frames_to_clear:
        return

    for stale_frame in frames_to_clear:
        # every object in the state is cleared on this frame, since objects in
        # one state are bucket batched together
        for batched_obj_id in sam2_state["obj_id_to_idx"].keys():
            self.tracker.clear_all_points_in_frame(
                sam2_state, stale_frame, batched_obj_id, need_output=False
            )
    logger.info(
        f"Cleared detector mask only conditioning frames ({frames_to_clear}) in SAM2."
    )
    return
def _extract_object_to_singleton_state(self, inference_state, obj_id, obj_rank):
    """
    Extract an object from a batched inference state into its own singleton state.

    Runs only on the rank that owns the object (``self.rank == obj_rank``); other
    ranks return immediately. The object's slices of the consolidated outputs,
    its point/mask inputs and its per-object (temp) outputs are copied out, the
    object is removed from its source state through ``self.tracker.remove_object``,
    and a freshly initialised state holding only this object (at index 0) is
    appended to ``inference_state["sam2_inference_states"]``. If the source state
    ends up empty it is removed from that list. Nothing is returned.
    """
    if self.rank != obj_rank:
        return
    tracker_states_local = inference_state["sam2_inference_states"]
    # Find the inference state containing this object
    source_state = None
    source_state_idx = None
    for idx, state in enumerate(tracker_states_local):
        if obj_id in state["obj_ids"]:
            source_state = state
            source_state_idx = idx
            break
    assert source_state is not None
    if len(source_state["obj_ids"]) <= 1:
        # Already in a singleton state (the "not found" case is caught by the
        # assert above)
        return
    # Step 1: Extract all the object's state data before removing it
    obj_idx_in_source = source_state["obj_id_to_idx"][obj_id]
    multiplex_state = source_state.get("multiplex_state")
    # Extract consolidated outputs (obj_ptr, maskmem_features, etc.) BEFORE
    # remove_object modifies the source tensors.
    singleton_consolidated_outputs = {
        "cond_frame_outputs": {},
        "non_cond_frame_outputs": {},
    }
    if "output_dict" in source_state:
        for storage_key in ["cond_frame_outputs", "non_cond_frame_outputs"]:
            source_outputs = source_state["output_dict"].get(storage_key, {})
            for f_idx, source_frame_out in source_outputs.items():
                # Skip frames whose batch is too small to contain this object's row.
                if source_frame_out["pred_masks"].shape[0] < obj_idx_in_source + 1:
                    continue
                # Slicing with [i : i + 1] keeps the leading (object) dimension at
                # size 1; image features / pos enc are shared by reference.
                singleton_frame_out = {
                    "pred_masks": source_frame_out["pred_masks"][
                        obj_idx_in_source : obj_idx_in_source + 1
                    ].clone(),
                    "object_score_logits": source_frame_out["object_score_logits"][
                        obj_idx_in_source : obj_idx_in_source + 1
                    ].clone(),
                    "image_features": source_frame_out.get("image_features"),
                    "image_pos_enc": source_frame_out.get("image_pos_enc"),
                    "local_obj_id_to_idx": {obj_id: 0},
                }
                # Extract maskmem_features (demux from multiplex space)
                # A failed demux/slice silently degrades to None for this frame.
                maskmem_features = source_frame_out.get("maskmem_features")
                if maskmem_features is not None and multiplex_state is not None:
                    try:
                        demuxed = multiplex_state.demux(maskmem_features)
                        maskmem_features = demuxed[
                            obj_idx_in_source : obj_idx_in_source + 1
                        ].clone()
                    except (AssertionError, IndexError):
                        maskmem_features = None
                elif maskmem_features is not None:
                    maskmem_features = maskmem_features[
                        obj_idx_in_source : obj_idx_in_source + 1
                    ].clone()
                singleton_frame_out["maskmem_features"] = maskmem_features
                # Extract maskmem_pos_enc (demux level by level)
                maskmem_pos_enc = source_frame_out.get("maskmem_pos_enc")
                if maskmem_pos_enc is not None:
                    remapped = []
                    for level_enc in maskmem_pos_enc:
                        if level_enc is None:
                            remapped.append(None)
                            continue
                        if multiplex_state is not None:
                            try:
                                demuxed = multiplex_state.demux(level_enc)
                                remapped.append(
                                    demuxed[
                                        obj_idx_in_source : obj_idx_in_source + 1
                                    ].clone()
                                )
                            except (AssertionError, IndexError):
                                remapped.append(None)
                        else:
                            remapped.append(
                                level_enc[
                                    obj_idx_in_source : obj_idx_in_source + 1
                                ].clone()
                            )
                    maskmem_pos_enc = remapped
                singleton_frame_out["maskmem_pos_enc"] = maskmem_pos_enc
                # Extract obj_ptr (demux from multiplex space)
                # Unlike the maskmem tensors above, a demux failure here is not caught.
                if (
                    "obj_ptr" in source_frame_out
                    and self.tracker.use_obj_ptrs_in_encoder
                ):
                    source_obj_ptr = source_frame_out["obj_ptr"]
                    if multiplex_state is not None:
                        obj_ptr_data = multiplex_state.demux(source_obj_ptr)
                        singleton_frame_out["obj_ptr"] = obj_ptr_data[
                            obj_idx_in_source : obj_idx_in_source + 1
                        ].clone()
                    else:
                        singleton_frame_out["obj_ptr"] = source_obj_ptr[
                            obj_idx_in_source : obj_idx_in_source + 1
                        ].clone()
                # Extract conditioning_objects
                # Re-express membership in terms of the singleton's index 0.
                if "conditioning_objects" in source_frame_out:
                    if (
                        obj_idx_in_source
                        in source_frame_out["conditioning_objects"]
                    ):
                        singleton_frame_out["conditioning_objects"] = {0}
                    else:
                        singleton_frame_out["conditioning_objects"] = set()
                singleton_consolidated_outputs[storage_key][f_idx] = (
                    singleton_frame_out
                )
    # Extract point and mask inputs for this object
    # (.copy() is shallow: the per-frame entries are shared with the source dicts)
    extracted_point_inputs = {}
    extracted_mask_inputs = {}
    if (
        "point_inputs_per_obj" in source_state
        and obj_idx_in_source in source_state["point_inputs_per_obj"]
    ):
        extracted_point_inputs = source_state["point_inputs_per_obj"][
            obj_idx_in_source
        ].copy()
    if (
        "mask_inputs_per_obj" in source_state
        and obj_idx_in_source in source_state["mask_inputs_per_obj"]
    ):
        extracted_mask_inputs = source_state["mask_inputs_per_obj"][
            obj_idx_in_source
        ].copy()
    # Extract per-object outputs - these are already properly sliced for the object
    extracted_obj_cond_outputs = {}
    extracted_obj_non_cond_outputs = {}
    extracted_temp_cond_outputs = {}
    extracted_temp_non_cond_outputs = {}
    if (
        "output_dict_per_obj" in source_state
        and obj_idx_in_source in source_state["output_dict_per_obj"]
    ):
        obj_output_dict = source_state["output_dict_per_obj"][obj_idx_in_source]
        extracted_obj_cond_outputs = obj_output_dict.get(
            "cond_frame_outputs", {}
        ).copy()
        cond_input_keys = (
            extracted_point_inputs.keys() | extracted_mask_inputs.keys()
        )
        # we may have obj cond outputs for other objects in a batch, so limit to cond inputs for only this object
        extracted_obj_cond_outputs = {
            k: v
            for k, v in extracted_obj_cond_outputs.items()
            if k in cond_input_keys
        }
        extracted_obj_non_cond_outputs = obj_output_dict.get(
            "non_cond_frame_outputs", {}
        ).copy()
    if (
        "temp_output_dict_per_obj" in source_state
        and obj_idx_in_source in source_state["temp_output_dict_per_obj"]
    ):
        temp_obj_output_dict = source_state["temp_output_dict_per_obj"][
            obj_idx_in_source
        ]
        extracted_temp_cond_outputs = temp_obj_output_dict.get(
            "cond_frame_outputs", {}
        ).copy()
        extracted_temp_non_cond_outputs = temp_obj_output_dict.get(
            "non_cond_frame_outputs", {}
        ).copy()
    # Step 2: Remove the object from the source state
    remaining_obj_ids, _ = self.tracker.remove_object(
        source_state, obj_id, strict=False, need_output=False
    )
    # Step 3: Create a new singleton inference state
    new_sam2_state = self.tracker.init_state(
        cached_features=inference_state["feature_cache"],
        video_height=inference_state["orig_height"],
        video_width=inference_state["orig_width"],
        num_frames=inference_state["num_frames"],
    )
    # Step 4: Set up the singleton state structure for the extracted object
    # Map the object to index 0 in the new singleton state
    new_sam2_state["obj_id_to_idx"] = {obj_id: 0}
    new_sam2_state["obj_idx_to_id"] = {0: obj_id}
    new_sam2_state["obj_ids"] = [obj_id]
    # Step 5: Restore all the extracted state
    # Restore point and mask inputs
    new_sam2_state["point_inputs_per_obj"] = {0: extracted_point_inputs}
    new_sam2_state["mask_inputs_per_obj"] = {0: extracted_mask_inputs}
    # Restore per-object output dictionaries (already properly sliced)
    new_sam2_state["output_dict_per_obj"] = {
        0: {
            "cond_frame_outputs": extracted_obj_cond_outputs,
            "non_cond_frame_outputs": extracted_obj_non_cond_outputs,
        }
    }
    # Restore temporary outputs
    new_sam2_state["temp_output_dict_per_obj"] = {
        0: {
            "cond_frame_outputs": extracted_temp_cond_outputs,
            "non_cond_frame_outputs": extracted_temp_non_cond_outputs,
        }
    }
    # Step 6: Rebuild the consolidated output_dict for the singleton state
    # Use the extracted consolidated outputs which include obj_ptr,
    # maskmem_features, maskmem_pos_enc (not just pred_masks/object_score_logits)
    # Create singleton multiplex state and remux extracted tensors
    # NOTE(review): only obj_ptr is passed through mux() below; maskmem_features
    # and maskmem_pos_enc are cloned again but stay in demuxed form — confirm
    # that is what the singleton state expects. The "cuda" default is used only
    # when the source state carries no "device" entry.
    new_multiplex_state = self.tracker.multiplex_controller.get_state(
        num_valid_entries=1,
        device=source_state.get("device", "cuda"),
        dtype=torch.float32,
        random=False,
        object_ids=[obj_id],
    )
    new_sam2_state["multiplex_state"] = new_multiplex_state
    for storage_key in ["cond_frame_outputs", "non_cond_frame_outputs"]:
        for f_idx, frame_out in singleton_consolidated_outputs[storage_key].items():
            if frame_out.get("maskmem_features") is not None:
                frame_out["maskmem_features"] = frame_out[
                    "maskmem_features"
                ].clone()
            if frame_out.get("maskmem_pos_enc") is not None:
                frame_out["maskmem_pos_enc"] = [
                    level.clone() if level is not None else None
                    for level in frame_out["maskmem_pos_enc"]
                ]
            if "obj_ptr" in frame_out and self.tracker.use_obj_ptrs_in_encoder:
                frame_out["obj_ptr"] = new_multiplex_state.mux(frame_out["obj_ptr"])
    new_sam2_state["output_dict"] = singleton_consolidated_outputs
    # Step 7: Copy other important state if it exists
    for key in [
        "first_ann_frame_idx",
        "tracking_has_started",
    ]:
        if key in source_state:
            new_sam2_state[key] = source_state[key]
    # Leave consolidated_frame_inds empty so preflight reconstructs from per-obj data
    new_sam2_state["consolidated_frame_inds"] = {
        "cond_frame_outputs": set(),
        "non_cond_frame_outputs": set(),
    }
    # Step 8: Add the new singleton state to the list
    tracker_states_local.append(new_sam2_state)
    # Step 9: If the source state is now empty, remove it
    # (source_state_idx is still valid because the new state was appended at the end)
    if len(remaining_obj_ids) == 0:
        tracker_states_local.pop(source_state_idx)
        logger.info(
            f"Removed empty inference state after extracting object {obj_id}"
        )
    logger.info(f"Object {obj_id} successfully extracted to singleton state")