Spaces:

1ripon1
/

ColabWan

Build error

File size: 157,286 Bytes

7344bef

from collections import defaultdict
from functools import reduce
from typing import Dict

import numpy as np
from ..model import sam3_multiplex_base
from ..model import sam3_video_base
import torch
import torch.distributed as dist
import torch.nn.functional as F
from .. import perflib
from ..logger import get_logger
from ..model.box_ops import box_xywh_to_cxcywh, box_xyxy_to_xywh
from ..model.data_misc import BatchedDatapoint
from ..model.device_utils import get_accelerator_device
from ..model.sam3_multiplex_base import MaskletConfirmationStatus, Sam3MultiplexBase
from ..model.sam3_tracker_utils import fill_holes_in_mask_scores
from ..model.sam3_video_inference import is_image_type
from ..perflib.compile import (
    clone_output_wrapper,
    compile_wrapper,
    shape_logging_wrapper,
)
from ..perflib.masks_ops import mask_iou, masks_to_boxes as perf_masks_to_boxes
from torch import Tensor
from torchvision.ops import masks_to_boxes
from tqdm.auto import tqdm

# Module-level logger for this file (used further down to report bucket utilization stats).
logger = get_logger(__name__)

import gc
from collections.abc import Mapping, Sequence
from dataclasses import fields, is_dataclass
from typing import List

from ..model.data_misc import (
    BatchedPointer,
    convert_my_tensors,
    FindStage,
    NestedTensor,
)
from ..model.geometry_encoders import Prompt
from ..model.io_utils import load_resource_as_video_frames


def recursive_to(data, *args, **kwargs):
    """Recursively apply ``Tensor.to(*args, **kwargs)`` to every tensor nested in ``data``.

    Containers are rebuilt rather than modified in place:

    * ``torch.Tensor``   -> ``data.to(*args, **kwargs)``
    * ``Mapping``        -> a new ``type(data)()`` filled key by key
    * ``tuple``          -> a new plain ``tuple``
    * other ``Sequence`` -> a new ``type(data)()`` filled with ``append``
    * dataclass instance -> a new instance built from the converted fields
    * anything else (NumPy arrays, ``str``, ``bytes``, scalars, ``None``,
      dataclass *classes*, ...) is returned unchanged.

    Args:
        data: Arbitrarily nested structure that may contain tensors.
        *args, **kwargs: Forwarded verbatim to ``torch.Tensor.to``.

    Returns:
        A structure of the same shape as ``data`` with every tensor converted.
    """
    if isinstance(data, torch.Tensor):
        return data.to(*args, **kwargs)

    # NumPy arrays are passed through untouched. `str` and `bytes` are Sequences
    # too, but they are atomic values here; `bytes` in particular has no
    # `append`, so sending it down the generic Sequence branch would raise.
    if isinstance(data, (np.ndarray, str, bytes)):
        return data

    if isinstance(data, Mapping):
        ret = type(data)()
        for key in data:
            ret[key] = recursive_to(data[key], *args, **kwargs)
        return ret

    if isinstance(data, tuple):
        # build in one pass instead of repeated tuple concatenation
        return tuple(recursive_to(value, *args, **kwargs) for value in data)

    if isinstance(data, Sequence):
        ret = type(data)()
        for value in data:
            ret.append(recursive_to(value, *args, **kwargs))
        return ret

    # `is_dataclass` is also True for dataclass *classes*; only instances carry
    # field values that can be converted and re-packed.
    if is_dataclass(data) and not isinstance(data, type):
        converted = {
            field.name: recursive_to(getattr(data, field.name), *args, **kwargs)
            for field in fields(data)
        }
        return type(data)(**converted)

    return data


# Placeholder that `propagate_in_video` yields in place of real per-frame outputs
# on ranks other than 0.
DUMMY_OUTPUT = "DUMMY_OUTPUT"


class Sam3MultiplexTracking(Sam3MultiplexBase):
    def __init__(
        self,
        image_size=1008,
        image_mean=(0.5, 0.5, 0.5),
        image_std=(0.5, 0.5, 0.5),
        compile_model=False,
        postprocess_batch_size=1,
        **kwargs,
    ):
        """
        Store the inference-time settings of the tracker on top of `Sam3MultiplexBase`.

        image_size: value passed as `image_size` to the frame loader in `init_state`.
        image_mean: value passed as `img_mean` to the frame loader in `init_state`.
        image_std: value passed as `img_std` to the frame loader in `init_state`.
        compile_model: flag stored on this object and mirrored onto `self.detector`.
        postprocess_batch_size: int, the number of frames to accumulate before running postprocessing.
            Set to 1 to disable batching.
        **kwargs: forwarded unchanged to `Sam3MultiplexBase.__init__`.

        The options below are not parameters of this constructor; presumably they are consumed
        by the base class when supplied through `**kwargs` (TODO confirm against `Sam3MultiplexBase`):
        hotstart_delay: int, the delay (in #frames) before the model starts to yield output, 0 to disable hotstart delay.
        hotstart_unmatch_thresh: int, remove the object if it has this many unmatched frames within its hotstart_delay period.
            If `hotstart_delay` is set to 0, this parameter is ignored.
        hotstart_dup_thresh: int, remove the object if it has overlapped with another object this many frames within its hotstart_delay period.
        """
        super().__init__(**kwargs)

        # compilation flag, shared with the detector
        self.compile_model = compile_model
        self.detector.compile_model = self.compile_model

        # frame preprocessing settings
        self.image_size = image_size
        self.image_mean = image_mean
        self.image_std = image_std

        # number of frames postprocessed together in `propagate_in_video`
        self.postprocess_batch_size = postprocess_batch_size

    # Integer ids for the three prompt kinds. The values coincide with the positions of
    # "<text placeholder>", "visual" and "geometric" in the `find_text_batch` list built
    # by `_construct_initial_input_batch` -- presumably they index into it; confirm at use sites.
    TEXT_ID_FOR_TEXT = 0
    TEXT_ID_FOR_VISUAL = 1
    TEXT_ID_FOR_GEOMETRIC = 2

    def _construct_initial_input_batch(self, inference_state, images):
        """
        Build the initial `BatchedDatapoint` and the per-frame placeholders.

        One empty `FindStage` is created per frame, the whole batch is moved to
        `inference_state["device"]`, and `inference_state` is populated in place with
        the input batch, an empty geometric prompt (under "constants") and the
        per-frame / cached-output placeholders. Nothing is returned.
        """
        device = inference_state["device"]
        num_frames = len(images)

        # 1) image batch
        img_batch = NestedTensor(tensors=images, mask=None)

        # 2) text batch -- "<text placeholder>" will be replaced by the actual
        # text prompt when adding prompts
        find_text_batch = ["<text placeholder>", "visual", "geometric"]

        # 3) one prompt-free find stage per frame
        box_embed_dim = 258  # historical default
        point_embed_dim = 257  # historical default
        dummy_ptrs = BatchedPointer(
            stage_ids=[], query_ids=[], object_ids=[], ptr_mask=[], ptr_types=[]
        )

        def _blank_stage(stage_id):
            # a stage that refers to frame `stage_id` and carries no box/point prompts
            return FindStage(
                img_ids=[stage_id],
                img_ids_np=np.array([stage_id]),
                text_ids=[0],
                input_boxes=[torch.zeros(box_embed_dim)],
                input_boxes_before_embed=[torch.empty(0, 4)],
                input_boxes_mask=[torch.empty(0, dtype=torch.bool)],
                input_boxes_label=[torch.empty(0, dtype=torch.long)],
                input_points=[torch.empty(0, point_embed_dim)],
                input_points_before_embed=[torch.empty(0, 3)],
                input_points_mask=[torch.empty(0)],
                ptrs=dummy_ptrs,
                ptrs_seg=dummy_ptrs,
                object_ids=[],
            )

        stages = [_blank_stage(stage_id) for stage_id in range(num_frames)]
        with torch.profiler.record_function(
            "Sam3MultiplexTracking._construct_initial_input_batch"
        ):
            stages = [convert_my_tensors(stage) for stage in stages]

        # assemble the `BatchedDatapoint` and move it to the target device
        input_batch = BatchedDatapoint(
            img_batch=img_batch,
            find_text_batch=find_text_batch,
            find_inputs=stages,
            find_targets=[None] * num_frames,
            get_queries=None,
            find_metadatas=[None] * num_frames,
        )
        with torch.profiler.record_function("Sam3MultiplexTracking.recursive_to"):
            input_batch = recursive_to(input_batch, device, non_blocking=True)
        inference_state["input_batch"] = input_batch

        # placeholder geometric prompt with zero boxes and zero points
        prompt_bs = 1
        inference_state["constants"]["empty_geometric_prompt"] = Prompt(
            box_embeddings=torch.zeros(0, prompt_bs, 4, device=device),
            box_mask=torch.zeros(prompt_bs, 0, device=device, dtype=torch.bool),
            box_labels=torch.zeros(0, prompt_bs, device=device, dtype=torch.long),
            point_embeddings=torch.zeros(0, prompt_bs, 2, device=device),
            point_mask=torch.zeros(prompt_bs, 0, device=device, dtype=torch.bool),
            point_labels=torch.zeros(0, prompt_bs, device=device, dtype=torch.long),
        )

        # per-frame bookkeeping: every frame starts without outputs or prompts
        inference_state["previous_stages_out"] = [None] * num_frames
        inference_state["text_prompt"] = None
        for per_frame_key in (
            "per_frame_raw_point_input",
            "per_frame_raw_box_input",
            "per_frame_visual_prompt",
            "per_frame_geometric_prompt",
        ):
            inference_state[per_frame_key] = [None] * num_frames
        inference_state["per_frame_cur_step"] = [0] * num_frames

        # placeholders for cached outputs
        # (note: currently, a single visual prompt embedding is shared for all frames)
        for cache_key in ("backbone_out", "visual_prompt_embed", "visual_prompt_mask"):
            inference_state[cache_key] = None

    def _get_visual_prompt(self, inference_state, frame_idx, boxes_cxcywh, box_labels):
        """
        Wrap the given boxes and labels into a box-only `Prompt`.

        An empty prompt (no boxes, no point fields) is created on
        `inference_state["device"]`, then the boxes (viewed as `(-1, 1, 4)`) and
        labels (viewed as `(-1, 1)`) are appended to it. `frame_idx` is accepted
        but not used here.

        Returns the untouched `boxes_cxcywh` and `box_labels` together with the
        newly built prompt.
        """
        device = inference_state["device"]
        prompt_bs = 1

        empty_boxes = torch.zeros(0, prompt_bs, 4, device=device)
        empty_box_mask = torch.zeros(prompt_bs, 0, device=device, dtype=torch.bool)
        geometric_prompt = Prompt(
            box_embeddings=empty_boxes,
            box_mask=empty_box_mask,
            point_embeddings=None,
            point_mask=None,
        )

        boxes_on_device = boxes_cxcywh.view(-1, prompt_bs, 4).to(device)
        labels_on_device = box_labels.view(-1, prompt_bs).to(device)
        geometric_prompt.append_boxes(boxes=boxes_on_device, labels=labels_on_device)

        return boxes_cxcywh, box_labels, geometric_prompt

    @torch.inference_mode()
    def init_state(
        self,
        resource_path,
        offload_video_to_cpu=False,
        async_loading_frames=False,
        use_cv2=False,
        input_is_mp4=False,
    ):
        """
        Load the frames of `resource_path` and create a fresh inference state dict.

        resource_path: resource handed to `load_resource_as_video_frames` and `is_image_type`.
        offload_video_to_cpu / async_loading_frames: forwarded to the frame loader.
        use_cv2: selects the "cv2" video loader instead of "ffmpeg".
        input_is_mp4: accepted but not used in this method.

        Returns the populated `inference_state` dict.
        """
        # Initialize inference state (inlined from Sam3DemoMixin.init_state)
        if use_cv2:
            video_loader_type = "cv2"
        else:
            video_loader_type = "ffmpeg"
        images, orig_height, orig_width = load_resource_as_video_frames(
            resource_path=resource_path,
            image_size=self.image_size,
            offload_video_to_cpu=offload_video_to_cpu,
            img_mean=self.image_mean,
            img_std=self.image_std,
            async_loading_frames=async_loading_frames,
            video_loader_type=video_loader_type,
        )

        # basic video facts; "device" and "constants" must exist before the input batch is built
        inference_state = {
            "image_size": self.image_size,
            "num_frames": len(images),
            "device": get_accelerator_device(),
            "orig_height": orig_height,
            "orig_width": orig_width,
            "constants": {},
        }
        self._construct_initial_input_batch(inference_state, images)

        # extra states
        # sam2_inference_states will contain separate inference_states for each frame having new objects if
        # self.tracker.per_obj_inference is False (bucketized batching), or a single inference_state
        # containing all objects if self.tracker.per_obj_inference is True (no batching at all).
        inference_state.update(
            {
                "sam2_inference_states": [],
                "tracker_metadata": {},
                "feature_cache": {},
                "cached_frame_outputs": {},
                "is_image_only": is_image_type(resource_path),
            }
        )
        return inference_state

    def reset_state(self, inference_state):
        # Inlined from Sam3DemoMixin.reset_state
        inference_state["input_batch"].find_text_batch[0] = "<text placeholder>"
        inference_state["text_prompt"] = None
        for t in range(inference_state["num_frames"]):
            inference_state["input_batch"].find_inputs[t].text_ids[...] = 0
            inference_state["previous_stages_out"][t] = None
            inference_state["per_frame_raw_point_input"][t] = None
            inference_state["per_frame_raw_box_input"][t] = None
            inference_state["per_frame_visual_prompt"][t] = None
            inference_state["per_frame_geometric_prompt"][t] = None
            inference_state["per_frame_cur_step"][t] = 0
        inference_state["backbone_out"] = None
        inference_state["visual_prompt_embed"] = None
        inference_state["visual_prompt_mask"] = None
        # reset extra states
        inference_state["sam2_inference_states"].clear()
        inference_state["tracker_metadata"].clear()
        inference_state["feature_cache"].clear()
        inference_state["cached_frame_outputs"] = {}

    def _get_processing_order(
        self, inference_state, start_frame_idx, max_frame_num_to_track, reverse
    ):
        num_frames = inference_state["num_frames"]
        previous_stages_out = inference_state["previous_stages_out"]
        if all(out is None for out in previous_stages_out) and start_frame_idx is None:
            raise RuntimeError(
                "No prompts are received on any frames. Please add prompt on at least one frame before propagation."
            )
        # set start index, end index, and processing order
        if start_frame_idx is None:
            # default: start from the earliest frame with input points
            start_frame_idx = min(
                t for t, out in enumerate(previous_stages_out) if out is not None
            )
        if max_frame_num_to_track is None:
            # default: track all the frames in the video
            max_frame_num_to_track = num_frames
        if reverse:
            end_frame_idx = start_frame_idx - max_frame_num_to_track
            end_frame_idx = max(end_frame_idx, 0)
            processing_order = range(start_frame_idx - 1, end_frame_idx - 1, -1)
        else:
            end_frame_idx = start_frame_idx + max_frame_num_to_track
            end_frame_idx = min(end_frame_idx, num_frames - 1)
            processing_order = range(start_frame_idx, end_frame_idx + 1)
        return processing_order, end_frame_idx

    @torch.inference_mode()
    def propagate_in_video(
        self,
        inference_state,
        start_frame_idx=None,
        max_frame_num_to_track=None,
        reverse=False,
        output_prob_thresh=0.5,
        compute_stability_score=False,
        is_instance_processing=False,
        progress_callback=None,
        **kwargs,  # To support passing extra args to child classes
    ):
        """
        Propagate the prompts to get grounding results for the entire video. This method
        is a generator and yields inference outputs for all frames in the range specified
        by `start_frame_idx`, `max_frame_num_to_track`, and `reverse`.

        inference_state: state dict created by `init_state`; updated in place.
        start_frame_idx, max_frame_num_to_track, reverse: passed to
            `_get_processing_order` to select the frames and their order. The raw
            values are also stored under `feature_cache["tracking_bounds"]`.
        output_prob_thresh, compute_stability_score, **kwargs: accepted for interface
            compatibility; not read in this method body.
        is_instance_processing: forwarded to `_run_single_frame_inference`.
        progress_callback: optional callable invoked on rank 0 after every processed
            frame as `progress_callback(frames_done, frames_total)`.

        Yields `(frame_idx, postprocessed_output)` on rank 0 and
        `(frame_idx, DUMMY_OUTPUT)` on every other rank. When `self.hotstart_delay > 0`,
        outputs lag behind inference by up to that many frames and are flushed once the
        last frame has been processed.
        """
        # compile the model (it's a no-op if the model is already compiled)
        # note that it's intentionally added to `self.propagate_in_video`, so that the first
        # `self.add_prompt` call will be done in eager mode to fill in the decoder buffers
        # such as positional encoding cache)
        self._compile_model()

        processing_order, end_frame_idx = self._get_processing_order(
            inference_state,
            start_frame_idx,
            max_frame_num_to_track,
            reverse=reverse,
        )

        # Store max_frame_num_to_track in feature_cache for downstream methods
        inference_state["feature_cache"]["tracking_bounds"] = {
            "max_frame_num_to_track": max_frame_num_to_track,
            "propagate_in_video_start_frame_idx": start_frame_idx,
        }

        hotstart_buffer = []
        hotstart_removed_obj_ids = set()
        # when deciding whether to output a masklet on `yield_frame_idx`, we check whether the object is confirmed
        # in a future frame (`unconfirmed_frame_delay` frames after the current frame). For example, if we require
        # an object to be detected in 3 consecutive frames to be confirmed, then we look 2 frames in the future --
        # e.g., we output an object on frame 4 only if it becomes confirmed on frame 6.
        unconfirmed_status_delay = self.masklet_confirmation_consecutive_det_thresh - 1
        unconfirmed_obj_ids_per_frame = {}  # frame_idx -> hidden_obj_ids

        # Batch postprocessing: accumulate yield_list entries and process every `batch_size` frames.
        # A non-positive `postprocess_batch_size` would make the draining loop below spin forever
        # on empty slices, so it is treated as 1 (no batching).
        batch_size = max(1, self.postprocess_batch_size)
        postprocess_yield_list = []

        progress_done = 0
        progress_total = len(processing_order)
        for frame_idx in tqdm(
            processing_order, desc="propagate_in_video", disable=self.rank > 0
        ):
            out = self._run_single_frame_inference(
                inference_state,
                frame_idx,
                reverse,
                is_instance_processing=is_instance_processing,
            )
            progress_done += 1
            if progress_callback is not None and self.rank == 0:
                progress_callback(progress_done, progress_total)

            if self.hotstart_delay > 0:
                # accumulate the outputs for the first `hotstart_delay` frames
                hotstart_buffer.append([frame_idx, out])
                # update the object IDs removed by hotstart so that we don't output them
                if self.rank == 0:
                    hotstart_removed_obj_ids.update(out["removed_obj_ids"])
                    unconfirmed_obj_ids = out.get("unconfirmed_obj_ids", None)
                    if unconfirmed_obj_ids is not None:
                        unconfirmed_obj_ids_per_frame[frame_idx] = unconfirmed_obj_ids

                if frame_idx == end_frame_idx:
                    # we reached the end of propagation -- yield all frames in the buffer
                    yield_list = hotstart_buffer
                    hotstart_buffer = []
                elif len(hotstart_buffer) >= self.hotstart_delay:
                    # we have enough frames -- yield and remove the first (oldest) frame from the buffer
                    yield_list = hotstart_buffer[:1]
                    hotstart_buffer = hotstart_buffer[1:]
                else:
                    # not enough frames yet -- skip yielding
                    yield_list = []
            else:
                yield_list = [(frame_idx, out)]  # output the current frame

            # Accumulate yield_list into postprocess_yield_list
            # Snapshot hotstart_removed_obj_ids at the time of accumulation to preserve
            # the correct state for each frame (important: this set is mutated over time)
            for yield_frame_idx, yield_out in yield_list:
                postprocess_yield_list.append(
                    (yield_frame_idx, yield_out, set(hotstart_removed_obj_ids))
                )

            # Process batch when we have enough frames
            while len(postprocess_yield_list) >= batch_size:
                batch_to_process = postprocess_yield_list[:batch_size]
                postprocess_yield_list = postprocess_yield_list[batch_size:]

                with torch.profiler.record_function(
                    "Sam3MultiplexTracking.postprocess_output_batched"
                ):
                    yield from self._emit_postprocessed_batch(
                        inference_state,
                        batch_to_process,
                        reverse,
                        unconfirmed_status_delay,
                        unconfirmed_obj_ids_per_frame,
                    )

        # Flush any remaining frames in the postprocess buffer
        if len(postprocess_yield_list) > 0:
            with torch.profiler.record_function(
                "Sam3MultiplexTracking.postprocess_output_batched"
            ):
                yield from self._emit_postprocessed_batch(
                    inference_state,
                    postprocess_yield_list,
                    reverse,
                    unconfirmed_status_delay,
                    unconfirmed_obj_ids_per_frame,
                )

        if self.is_multiplex:
            # log the bucket utilization stats
            # bucket utilization rate is total valid objects / total capacity -> represents rooms for improvement
            # subscription rate is total valid objects / total number of buckets -> represents speedup
            total_valid_objects = 0
            total_num_buckets = 0
            for state in inference_state["sam2_inference_states"]:
                assert (
                    len(state["obj_ids"])
                    == state["multiplex_state"].total_valid_entries
                )
                total_valid_objects += len(state["obj_ids"])
                total_num_buckets += state["multiplex_state"].num_buckets
            if total_num_buckets > 0:
                bucket_utilization_rate = (
                    total_valid_objects / (total_num_buckets * self.bucket_capacity)
                ) * 100
                subscription_rate = (total_valid_objects / total_num_buckets) * 100
                logger.info(
                    f"Bucket utilization rate: {bucket_utilization_rate:.2f}%, subscription rate: {subscription_rate:.2f}%"
                )

    def _emit_postprocessed_batch(
        self,
        inference_state,
        batch,
        reverse,
        unconfirmed_status_delay,
        unconfirmed_obj_ids_per_frame,
    ):
        """
        Postprocess one batch of buffered frames and yield `(frame_idx, output)` pairs.

        batch: list of `(frame_idx, raw_out, removed_obj_ids_snapshot)` tuples.
        reverse / unconfirmed_status_delay: select which frame's unconfirmed-object
            list (looked up in `unconfirmed_obj_ids_per_frame`) is used to hide objects.

        On ranks other than 0 nothing is computed and `DUMMY_OUTPUT` is yielded for
        every frame. On rank 0 each frame is also re-cached via `_cache_frame_outputs`
        with its suppressed / removed / unconfirmed object ids.
        """
        if self.rank != 0:
            # No output on other GPUs
            for yield_frame_idx, _, _ in batch:
                yield yield_frame_idx, DUMMY_OUTPUT
            return

        H_video, W_video = (
            inference_state["orig_height"],
            inference_state["orig_width"],
        )
        num_frames = inference_state["num_frames"]

        batched_outs = []
        frame_indices = []
        for yield_frame_idx, yield_out, removed_obj_ids_snapshot in batch:
            suppressed_obj_ids = yield_out["suppressed_obj_ids"]
            # look `unconfirmed_status_delay` frames ahead in the propagation direction,
            # clamped to the valid frame range
            unconfirmed_status_frame_idx = (
                yield_frame_idx + unconfirmed_status_delay
                if not reverse
                else yield_frame_idx - unconfirmed_status_delay
            )
            unconfirmed_status_frame_idx = max(
                0, min(unconfirmed_status_frame_idx, num_frames - 1)
            )
            unconfirmed_obj_ids = unconfirmed_obj_ids_per_frame.get(
                unconfirmed_status_frame_idx, None
            )

            batched_outs.append(
                (
                    yield_out,
                    removed_obj_ids_snapshot,
                    suppressed_obj_ids,
                    unconfirmed_obj_ids,
                )
            )
            frame_indices.append(yield_frame_idx)

            # Cache frame outputs
            self._cache_frame_outputs(
                inference_state,
                yield_frame_idx,
                yield_out["obj_id_to_mask"],
                suppressed_obj_ids=suppressed_obj_ids,
                removed_obj_ids=removed_obj_ids_snapshot,
                unconfirmed_obj_ids=unconfirmed_obj_ids,
            )

        if self.postprocess_batch_size > 1:
            # Process all frames in batch
            postprocessed_outs = self._postprocess_output_batched(
                H_video, W_video, batched_outs
            )
        else:
            # Process each frame individually but output together
            postprocessed_outs = [
                self._postprocess_output(
                    inference_state,
                    yield_out,
                    removed_obj_ids_snapshot,
                    suppressed_obj_ids,
                    unconfirmed_obj_ids,
                )
                for (
                    yield_out,
                    removed_obj_ids_snapshot,
                    suppressed_obj_ids,
                    unconfirmed_obj_ids,
                ) in batched_outs
            ]

        # Yield results
        yield from zip(frame_indices, postprocessed_outs)

    def _run_single_frame_inference(
        self,
        inference_state,
        frame_idx,
        reverse,
        is_instance_processing=False,
    ):
        """
        Run detection + tracking on frame `frame_idx` and return its raw outputs.

        `inference_state` is updated in place: the text features are written to the
        feature cache, the tracker states and metadata are replaced by the new ones,
        and the frame is marked as having outputs. `is_instance_processing` is
        accepted but not used in this method.

        Returns a dict with "obj_id_to_mask", "obj_id_to_score" and
        "obj_id_to_sam2_score"; on rank 0 it additionally holds "removed_obj_ids",
        "suppressed_obj_ids", "frame_stats" and "unconfirmed_obj_ids".
        """
        # prepare inputs
        input_batch = inference_state["input_batch"]
        tracker_states_local = inference_state["sam2_inference_states"]

        # fall back to the shared empty prompt when this frame has none of its own
        geometric_prompt = inference_state["per_frame_geometric_prompt"][frame_idx]
        if geometric_prompt is None:
            geometric_prompt = inference_state["constants"]["empty_geometric_prompt"]

        # expose the language features under the current text-batch key
        text_batch_key = tuple(input_batch.find_text_batch)
        backbone_out = inference_state["backbone_out"]
        text_features = {
            "language_features": backbone_out["language_features"],
            "language_mask": backbone_out["language_mask"],
        }
        inference_state["feature_cache"]["text"] = {text_batch_key: text_features}

        # run inference for the current frame
        (
            obj_id_to_mask,
            obj_id_to_score,
            new_tracker_states,
            new_tracker_metadata,
            frame_stats,
            _unused,
        ) = self._det_track_one_frame(
            frame_idx=frame_idx,
            num_frames=inference_state["num_frames"],
            reverse=reverse,
            input_batch=input_batch,
            geometric_prompt=geometric_prompt,
            tracker_states_local=tracker_states_local,
            tracker_metadata_prev=inference_state["tracker_metadata"],
            feature_cache=inference_state["feature_cache"],
            orig_vid_height=inference_state["orig_height"],
            orig_vid_width=inference_state["orig_width"],
            is_image_only=inference_state["is_image_only"],
        )

        # update inference state
        inference_state["sam2_inference_states"] = new_tracker_states
        inference_state["tracker_metadata"] = new_tracker_metadata
        # use a dummy string in "previous_stages_out" to indicate this frame has outputs
        inference_state["previous_stages_out"][frame_idx] = "_THIS_FRAME_HAS_OUTPUTS_"

        if self.rank == 0:
            self._cache_frame_outputs(inference_state, frame_idx, obj_id_to_mask)

        sam2_scores_per_frame = new_tracker_metadata["obj_id_to_sam2_score_frame_wise"]
        out = {
            "obj_id_to_mask": obj_id_to_mask,
            "obj_id_to_score": obj_id_to_score,  # first frame detection score
            "obj_id_to_sam2_score": sam2_scores_per_frame[frame_idx],
        }
        if self.rank != 0:
            return out

        # the remaining entries are only needed on rank 0 (e.g. removed_obj_ids is
        # used there to handle the hotstart delay buffer)
        rank0_metadata = new_tracker_metadata["rank0_metadata"]
        out["removed_obj_ids"] = rank0_metadata["removed_obj_ids"]
        out["suppressed_obj_ids"] = rank0_metadata["suppressed_obj_ids"][frame_idx]
        out["frame_stats"] = frame_stats
        if not self.masklet_confirmation_enable:
            out["unconfirmed_obj_ids"] = []
            return out

        status = rank0_metadata["masklet_confirmation"]["status"]
        is_unconfirmed = status == MaskletConfirmationStatus.UNCONFIRMED.value
        all_obj_ids = new_tracker_metadata["obj_ids_all_gpu"]
        out["unconfirmed_obj_ids"] = all_obj_ids[is_unconfirmed].tolist()
        return out

    def _postprocess_output(
        self,
        inference_state,
        out,
        removed_obj_ids=None,
        suppressed_obj_ids=None,
        unconfirmed_obj_ids=None,
    ):
        obj_id_to_mask = out["obj_id_to_mask"]  # low res masks
        curr_obj_ids = sorted(obj_id_to_mask.keys())
        H_video, W_video = inference_state["orig_height"], inference_state["orig_width"]
        if len(curr_obj_ids) == 0:
            out_obj_ids = torch.zeros(0, dtype=torch.int64)
            out_probs = torch.zeros(0, dtype=torch.float32)
            out_binary_masks = torch.zeros(0, H_video, W_video, dtype=torch.bool)
            out_boxes_xywh = torch.zeros(0, 4, dtype=torch.float32)
        else:
            out_obj_ids = torch.tensor(curr_obj_ids, dtype=torch.int64, device="cpu")
            out_probs = torch.tensor(
                [out["obj_id_to_score"][obj_id] for obj_id in curr_obj_ids], device="cpu"
            )
            out_sam2_probs = torch.tensor(
                [
                    (
                        out["obj_id_to_sam2_score"][obj_id]
                        if obj_id in out["obj_id_to_sam2_score"]
                        else 0.0
                    )
                    for obj_id in curr_obj_ids
                ],
                device="cpu",
            )
            out_binary_masks = torch.cat(
                [obj_id_to_mask[obj_id] for obj_id in curr_obj_ids], dim=0
            )

            assert out_binary_masks.dtype == torch.bool
            keep = out_binary_masks.any(dim=(1, 2)).cpu()  # remove masks with 0 areas
            # hide outputs for those object IDs in `obj_ids_to_hide`
            obj_ids_to_hide = []
            if suppressed_obj_ids is not None:
                obj_ids_to_hide.extend(suppressed_obj_ids)
            if removed_obj_ids is not None:
                obj_ids_to_hide.extend(removed_obj_ids)
            if unconfirmed_obj_ids is not None:
                obj_ids_to_hide.extend(unconfirmed_obj_ids)
            if len(obj_ids_to_hide) > 0:
                obj_ids_to_hide_t = torch.tensor(obj_ids_to_hide, dtype=torch.int64, device="cpu")
                keep &= ~torch.isin(out_obj_ids, obj_ids_to_hide_t)

            # slice those valid entries from the original outputs
            keep_idx = torch.nonzero(keep, as_tuple=True)[0]
            keep_idx_gpu = keep_idx.pin_memory().to(
                device=out_binary_masks.device, non_blocking=True
            )

            out_obj_ids = torch.index_select(out_obj_ids, 0, keep_idx)
            out_probs = torch.index_select(out_probs, 0, keep_idx)
            out_sam2_probs = torch.index_select(out_sam2_probs, 0, keep_idx)
            out_binary_masks = torch.index_select(out_binary_masks, 0, keep_idx_gpu)

            if perflib.is_enabled:
                out_boxes_xyxy = perf_masks_to_boxes(
                    out_binary_masks, out_obj_ids.tolist()
                )
            else:
                out_boxes_xyxy = masks_to_boxes(out_binary_masks)

            out_boxes_xywh = box_xyxy_to_xywh(out_boxes_xyxy)  # convert to xywh format
            # normalize boxes
            out_boxes_xywh[..., 0] /= W_video
            out_boxes_xywh[..., 1] /= H_video
            out_boxes_xywh[..., 2] /= W_video
            out_boxes_xywh[..., 3] /= H_video

        # apply non-overlapping constraints on the existing masklets
        if out_binary_masks.shape[0] > 1:
            assert len(out_binary_masks) == len(out_sam2_probs)
            out_binary_masks = (
                self.tracker._apply_object_wise_non_overlapping_constraints(
                    out_binary_masks.unsqueeze(1),
                    out_sam2_probs.unsqueeze(1).to(out_binary_masks.device),
                    background_value=0,
                ).squeeze(1)
            ) > 0

        prod_outputs = {}
        if self.running_in_prod:
            with torch.profiler.record_function(
                "Sam3MultiplexTracking._postprocess_output.prod_outputs"
            ):
                out_centers = torch.zeros(
                    out_binary_masks.shape[0],
                    2,
                    dtype=torch.float32,
                    device=out_binary_masks.device,
                )

                y_coords = torch.arange(
                    H_video, device=out_binary_masks.device, dtype=torch.float32
                )
                x_coords = torch.arange(
                    W_video, device=out_binary_masks.device, dtype=torch.float32
                )
                y_grid = y_coords.view(1, H_video, 1)
                x_grid = x_coords.view(1, 1, W_video)
                with torch.profiler.record_function(
                    "Sam3MultiplexTracking._postprocess_output.prod_outputs.center"
                ):
                    weighted_y_sum = (out_binary_masks * y_grid).sum(dim=(1, 2))
                    weighted_x_sum = (out_binary_masks * x_grid).sum(dim=(1, 2))
                    total_mass = out_binary_masks.sum(dim=(1, 2)).clamp_min(1e-6)
                    center_y = weighted_y_sum / total_mass / H_video
                    center_x = weighted_x_sum / total_mass / W_video
                    out_centers[:, 0] = center_x
                    out_centers[:, 1] = center_y

                with torch.profiler.record_function(
                    "Sam3MultiplexTracking._postprocess_output.prod_outputs.to_cpu"
                ):
                    prod_outputs["out_centers"] = out_centers.cpu().numpy()

        outputs = {
            "out_obj_ids": out_obj_ids.cpu().numpy(),
            "out_probs": out_probs.cpu().numpy(),
            "out_boxes_xywh": out_boxes_xywh.cpu().numpy(),
            "out_binary_masks": out_binary_masks.cpu().numpy(),
            "frame_stats": out.get("frame_stats", None),
        } | prod_outputs

        return outputs

    def _postprocess_output_batched(
        self,
        H_video,
        W_video,
        batched_outs,
    ):
        """
        Batched version of _postprocess_output that batches GPU computations
        (keep filtering, box computation) across frames for efficiency.

        Args:
            H_video: Video height
            W_video: Video width
            batched_outs: List of tuples, each containing:
                (out, removed_obj_ids, suppressed_obj_ids, unconfirmed_obj_ids)
                where out is the output dict from _run_single_frame_inference

        Returns:
            List of output dicts, one per frame in batched_outs
        """
        batch_size = len(batched_outs)
        if batch_size == 0:
            return []

        # ========== Phase 1: Collect per-frame data ==========
        # We'll track: frame_data[i] = (obj_ids, probs, sam2_probs, masks, keep_mask, frame_stats)
        # or None if frame has no objects
        frame_data = []
        device = None

        for (
            out,
            removed_obj_ids,
            suppressed_obj_ids,
            unconfirmed_obj_ids,
        ) in batched_outs:
            obj_id_to_mask = out["obj_id_to_mask"]
            curr_obj_ids = sorted(obj_id_to_mask.keys())
            frame_stats = out.get("frame_stats", None)

            if len(curr_obj_ids) == 0:
                frame_data.append((None, None, None, None, None, frame_stats))
                continue

            out_obj_ids = torch.tensor(curr_obj_ids, dtype=torch.int64, device="cpu")
            obj_id_to_score_dict = out["obj_id_to_score"]
            obj_id_to_sam2_score = out["obj_id_to_sam2_score"]

            if device is None:
                device = obj_id_to_mask[curr_obj_ids[0]].device
            default_sam2_score = torch.zeros((), dtype=torch.float32, device=device)

            probs_list = []
            sam2_probs_list = []
            binary_masks_list = []

            for obj_id in curr_obj_ids:
                probs_list.append(obj_id_to_score_dict[obj_id])
                sam2_probs_list.append(
                    obj_id_to_sam2_score.get(obj_id, default_sam2_score)
                )
                binary_masks_list.append(obj_id_to_mask[obj_id])

            out_probs = torch.tensor(probs_list, dtype=torch.float32, device="cpu")
            out_sam2_probs_gpu = torch.stack(sam2_probs_list)
            out_binary_masks = torch.cat(binary_masks_list, dim=0)

            # Compute keep mask (which objects to hide)
            obj_ids_to_hide = []
            if suppressed_obj_ids is not None:
                obj_ids_to_hide.extend(suppressed_obj_ids)
            if removed_obj_ids is not None:
                obj_ids_to_hide.extend(removed_obj_ids)
            if unconfirmed_obj_ids is not None:
                obj_ids_to_hide.extend(unconfirmed_obj_ids)

            if len(obj_ids_to_hide) > 0:
                obj_ids_to_hide_t = torch.tensor(obj_ids_to_hide, dtype=torch.int64, device="cpu")
                hide_mask = torch.isin(out_obj_ids, obj_ids_to_hide_t)
            else:
                hide_mask = torch.zeros(len(out_obj_ids), dtype=torch.bool, device="cpu")

            frame_data.append(
                (
                    out_obj_ids,
                    out_probs,
                    out_sam2_probs_gpu,
                    out_binary_masks,
                    hide_mask,
                    frame_stats,
                )
            )

        # ========== Phase 2: Batch concatenate masks for GPU operations ==========
        # Collect frames with objects
        frames_with_objects = []
        frame_obj_counts = []  # Number of objects per frame (for frames with objects only)
        all_masks_list = []
        all_hide_masks_list = []

        for i, data in enumerate(frame_data):
            if data[0] is not None:
                frames_with_objects.append(i)
                frame_obj_counts.append(data[0].shape[0])
                all_masks_list.append(data[3])  # binary_masks
                all_hide_masks_list.append(data[4])  # hide_mask

        # Handle case where all frames have 0 objects
        if len(frames_with_objects) == 0:
            outputs = []
            for data in frame_data:
                output_dict = {
                    "out_obj_ids": np.zeros(0, dtype=np.int64),
                    "out_probs": np.zeros(0, dtype=np.float32),
                    "out_boxes_xywh": np.zeros((0, 4), dtype=np.float32),
                    "out_binary_masks": np.zeros((0, H_video, W_video), dtype=bool),
                    "frame_stats": data[5],
                }
                if self.running_in_prod:
                    output_dict["out_centers"] = np.zeros((0, 2), dtype=np.float32)
                outputs.append(output_dict)
            return outputs

        # Concatenate all masks for batched GPU operations
        all_masks = torch.cat(all_masks_list, dim=0)
        all_hide_masks = torch.cat(all_hide_masks_list, dim=0)

        # ========== Phase 3: Batched keep mask computation on GPU ==========
        # Compute which masks have non-zero area (batched on GPU)
        has_area = all_masks.any(dim=(1, 2))  # GPU operation

        # Combine with hide mask (move hide_mask to GPU for the operation)
        all_hide_masks_gpu = all_hide_masks.to(device=all_masks.device)
        keep_mask_gpu = has_area & ~all_hide_masks_gpu

        # Get keep indices
        keep_indices = torch.nonzero(keep_mask_gpu, as_tuple=True)[0]

        if len(keep_indices) == 0:
            # All objects filtered out
            outputs = []
            for data in frame_data:
                output_dict = {
                    "out_obj_ids": np.zeros(0, dtype=np.int64),
                    "out_probs": np.zeros(0, dtype=np.float32),
                    "out_boxes_xywh": np.zeros((0, 4), dtype=np.float32),
                    "out_binary_masks": np.zeros((0, H_video, W_video), dtype=bool),
                    "frame_stats": data[5],
                }
                if self.running_in_prod:
                    output_dict["out_centers"] = np.zeros((0, 2), dtype=np.float32)
                outputs.append(output_dict)
            return outputs

        # ========== Phase 4: Batched filtering and box computation ==========
        # Filter masks on GPU
        kept_masks = torch.index_select(all_masks, 0, keep_indices)

        # Compute bounding boxes in batch on GPU
        if perflib.is_enabled:
            # Need to gather obj_ids for perflib
            all_obj_ids_list = [frame_data[i][0] for i in frames_with_objects]
            all_obj_ids_cat = torch.cat(all_obj_ids_list, dim=0)
            kept_obj_ids_for_perf = torch.index_select(
                all_obj_ids_cat, 0, keep_indices.cpu()
            )
            kept_boxes_xyxy = perf_masks_to_boxes(
                kept_masks, kept_obj_ids_for_perf.tolist()
            )
        else:
            kept_boxes_xyxy = masks_to_boxes(kept_masks)

        kept_boxes_xywh = box_xyxy_to_xywh(kept_boxes_xyxy)
        kept_boxes_xywh[..., 0] /= W_video
        kept_boxes_xywh[..., 1] /= H_video
        kept_boxes_xywh[..., 2] /= W_video
        kept_boxes_xywh[..., 3] /= H_video

        # ========== Phase 5: Split back to per-frame for non-overlapping ==========
        # Compute how many objects were kept per frame
        keep_indices_cpu = keep_indices.cpu()
        keep_set = set(keep_indices_cpu.tolist())

        kept_counts = []
        offset = 0
        for count in frame_obj_counts:
            kept_in_frame = sum(
                1 for j in range(offset, offset + count) if j in keep_set
            )
            kept_counts.append(kept_in_frame)
            offset += count

        # Split the kept tensors back to per-frame
        split_masks = torch.split(kept_masks, kept_counts)
        split_boxes = torch.split(kept_boxes_xywh, kept_counts)

        # Also need to split obj_ids, probs, sam2_probs (filtering from original frame_data)
        # We need to track which original indices were kept per frame
        frame_kept_indices = []  # List of (local_kept_indices) per frame
        offset = 0
        for count in frame_obj_counts:
            local_kept = []
            for j in range(offset, offset + count):
                if j in keep_set:
                    local_kept.append(j - offset)  # Local index within frame
            frame_kept_indices.append(local_kept)
            offset += count

        # ========== Phase 6: Apply non-overlapping per frame, collect final results ==========
        final_results = []  # List of (frame_idx, obj_ids, probs, boxes, masks)

        for idx, frame_i in enumerate(frames_with_objects):
            data = frame_data[frame_i]
            local_kept = frame_kept_indices[idx]

            if len(local_kept) == 0:
                continue

            # Get the filtered data for this frame
            local_kept_t = torch.tensor(local_kept, dtype=torch.int64, device="cpu")
            out_obj_ids = torch.index_select(data[0], 0, local_kept_t)
            out_probs = torch.index_select(data[1], 0, local_kept_t)
            out_sam2_probs = torch.index_select(
                data[2], 0, local_kept_t.to(data[2].device)
            )
            out_masks = split_masks[idx]
            out_boxes = split_boxes[idx]

            # Apply non-overlapping constraints (per-frame operation)
            if out_masks.shape[0] > 1:
                # Copy sam2_probs to CPU pinned memory then back to GPU for the operation
                out_sam2_probs_cpu = torch.empty(
                    out_sam2_probs.shape, dtype=out_sam2_probs.dtype, device="cpu", pin_memory=True
                )
                out_sam2_probs_cpu.copy_(out_sam2_probs, non_blocking=True)
                out_masks = (
                    self.tracker._apply_object_wise_non_overlapping_constraints(
                        out_masks.unsqueeze(1),
                        out_sam2_probs_cpu.unsqueeze(1).to(out_masks.device),
                        background_value=0,
                    ).squeeze(1)
                ) > 0

            final_results.append(
                (frame_i, out_obj_ids, out_probs, out_boxes, out_masks)
            )

        # ========== Phase 6.5: Compute centers for prod ==========
        all_centers = None
        if self.running_in_prod and len(final_results) > 0:
            with torch.profiler.record_function(
                "Sam3MultiplexTracking._postprocess_output_batched.prod_outputs"
            ):
                # Concatenate all masks for batched center computation
                all_masks = torch.cat([r[4] for r in final_results], dim=0)
                if all_masks.shape[0] > 0:
                    y_coords = torch.arange(
                        H_video, device=all_masks.device, dtype=torch.float32
                    )
                    x_coords = torch.arange(
                        W_video, device=all_masks.device, dtype=torch.float32
                    )
                    y_grid = y_coords.view(1, H_video, 1)
                    x_grid = x_coords.view(1, 1, W_video)

                    weighted_y_sum = (all_masks * y_grid).sum(dim=(1, 2))
                    weighted_x_sum = (all_masks * x_grid).sum(dim=(1, 2))
                    total_mass = all_masks.sum(dim=(1, 2)).clamp_min(1e-6)
                    center_y = weighted_y_sum / total_mass / H_video
                    center_x = weighted_x_sum / total_mass / W_video
                    all_centers = torch.stack([center_x, center_y], dim=1)

        # Handle case where all filtered out
        if len(final_results) == 0:
            outputs = []
            for data in frame_data:
                output_dict = {
                    "out_obj_ids": np.zeros(0, dtype=np.int64),
                    "out_probs": np.zeros(0, dtype=np.float32),
                    "out_boxes_xywh": np.zeros((0, 4), dtype=np.float32),
                    "out_binary_masks": np.zeros((0, H_video, W_video), dtype=bool),
                    "frame_stats": data[5],
                }
                if self.running_in_prod:
                    output_dict["out_centers"] = np.zeros((0, 2), dtype=np.float32)
                outputs.append(output_dict)
            return outputs

        # ========== Phase 7: Concatenate for batched GPU→CPU copy ==========
        final_obj_ids = torch.cat([r[1] for r in final_results], dim=0)
        final_probs = torch.cat([r[2] for r in final_results], dim=0)
        final_boxes = torch.cat([r[3] for r in final_results], dim=0)
        final_masks = torch.cat([r[4] for r in final_results], dim=0)

        total_objects = final_obj_ids.shape[0]

        # Initialize or resize batched CPU buffer
        batched_buffer_size = self.postprocess_batch_size * self.max_num_objects
        needs_buffer_init = not hasattr(self, "buffer_cpu_batched")
        needs_buffer_resize = not needs_buffer_init and (
            self.buffer_cpu_batched["out_binary_masks"].shape[0] != batched_buffer_size
            or self.buffer_cpu_batched["out_binary_masks"].shape[1] != H_video
            or self.buffer_cpu_batched["out_binary_masks"].shape[2] != W_video
        )

        if needs_buffer_init or needs_buffer_resize:
            self.buffer_cpu_batched = {
                "out_obj_ids": torch.zeros(
                    batched_buffer_size,
                    dtype=torch.int64,
                    device="cpu",
                    pin_memory=True,
                ),
                "out_probs": torch.zeros(
                    batched_buffer_size,
                    dtype=torch.float32,
                    device="cpu",
                    pin_memory=True,
                ),
                "out_boxes_xywh": torch.zeros(
                    batched_buffer_size,
                    4,
                    dtype=torch.float32,
                    device="cpu",
                    pin_memory=True,
                ),
                "out_binary_masks": torch.zeros(
                    batched_buffer_size,
                    H_video,
                    W_video,
                    dtype=bool,
                    device="cpu",
                    pin_memory=True,
                ),
            }
            if self.running_in_prod:
                self.buffer_cpu_batched["out_centers"] = torch.zeros(
                    batched_buffer_size,
                    2,
                    dtype=torch.float32,
                    device="cpu",
                    pin_memory=True,
                )

        self.buffer_cpu_batched["out_obj_ids"][:total_objects].copy_(final_obj_ids)
        self.buffer_cpu_batched["out_probs"][:total_objects].copy_(final_probs)
        self.buffer_cpu_batched["out_boxes_xywh"][:total_objects].copy_(final_boxes)
        self.buffer_cpu_batched["out_binary_masks"][:total_objects].copy_(final_masks)

        if all_centers is not None:
            self.buffer_cpu_batched["out_centers"][:total_objects].copy_(all_centers)

        # ========== Phase 8: Build output list ==========
        # Create mapping from frame index to (offset, count) in the buffer
        frame_to_offset_count = {}
        offset = 0
        for frame_i, obj_ids, _, _, _ in final_results:
            count = obj_ids.shape[0]
            frame_to_offset_count[frame_i] = (offset, count)
            offset += count

        outputs = []
        for i, data in enumerate(frame_data):
            frame_stats = data[5]
            if i not in frame_to_offset_count:
                # Frame has no objects (either originally or after filtering)
                output_dict = {
                    "out_obj_ids": np.zeros(0, dtype=np.int64),
                    "out_probs": np.zeros(0, dtype=np.float32),
                    "out_boxes_xywh": np.zeros((0, 4), dtype=np.float32),
                    "out_binary_masks": np.zeros((0, H_video, W_video), dtype=bool),
                    "frame_stats": frame_stats,
                }
                if all_centers is not None:
                    output_dict["out_centers"] = np.zeros((0, 2), dtype=np.float32)
                outputs.append(output_dict)
            else:
                buf_offset, num_objects = frame_to_offset_count[i]
                output_dict = {
                    "out_obj_ids": self.buffer_cpu_batched["out_obj_ids"][
                        buf_offset : buf_offset + num_objects
                    ]
                    .numpy()
                    .copy(),
                    "out_probs": self.buffer_cpu_batched["out_probs"][
                        buf_offset : buf_offset + num_objects
                    ]
                    .numpy()
                    .copy(),
                    "out_boxes_xywh": self.buffer_cpu_batched["out_boxes_xywh"][
                        buf_offset : buf_offset + num_objects
                    ]
                    .numpy()
                    .copy(),
                    "out_binary_masks": self.buffer_cpu_batched["out_binary_masks"][
                        buf_offset : buf_offset + num_objects
                    ]
                    .numpy()
                    .copy(),
                    "frame_stats": frame_stats,
                }
                if all_centers is not None:
                    output_dict["out_centers"] = (
                        self.buffer_cpu_batched["out_centers"][
                            buf_offset : buf_offset + num_objects
                        ]
                        .numpy()
                        .copy()
                    )
                outputs.append(output_dict)

        return outputs

    def _cache_frame_outputs(
        self,
        inference_state,
        frame_idx,
        obj_id_to_mask,
        suppressed_obj_ids=None,
        removed_obj_ids=None,
        unconfirmed_obj_ids=None,
    ):
        if not inference_state.get("cache_frame_outputs", True):
            return

        if "cached_frame_outputs" not in inference_state:
            inference_state["cached_frame_outputs"] = {}

        objects_to_exclude = set()
        if suppressed_obj_ids is not None:
            objects_to_exclude.update(suppressed_obj_ids)
        if removed_obj_ids is not None:
            objects_to_exclude.update(removed_obj_ids)
        if unconfirmed_obj_ids is not None:
            objects_to_exclude.update(unconfirmed_obj_ids)

        # This cache is only used for later fetch/refine output assembly. The tracker
        # keeps its active low-res memory separately, so video-res masks should not pin VRAM.
        inference_state["cached_frame_outputs"][frame_idx] = {
            obj_id: self._cache_output_mask(mask)
            for obj_id, mask in obj_id_to_mask.items()
            if obj_id not in objects_to_exclude
        }

    def _build_sam2_output(
        self, inference_state, frame_idx, refined_obj_id_to_mask=None
    ):
        if frame_idx not in inference_state["cached_frame_outputs"]:
            if refined_obj_id_to_mask is None:
                return {}
            return {
                obj_id: self._cache_output_mask(mask)
                for obj_id, mask in refined_obj_id_to_mask.items()
            }

        cached_outputs = inference_state["cached_frame_outputs"][frame_idx]
        obj_id_to_mask = cached_outputs.copy()

        # Update with refined masks if provided
        if refined_obj_id_to_mask is not None:
            for obj_id, refined_mask in refined_obj_id_to_mask.items():
                assert refined_mask is not None, (
                    f"Refined mask data must be provided for obj_id {obj_id}"
                )
                obj_id_to_mask[obj_id] = self._cache_output_mask(refined_mask)

        return obj_id_to_mask

    @staticmethod
    def _cache_output_mask(mask):
        if torch.is_tensor(mask):
            return mask.detach().to(device="cpu", non_blocking=True, copy=True)
        return np.array(mask, copy=True)

    def _compile_model(self):
        """Compile the SAM model with torch.compile for speedup."""
        # TODO: compile SAM2 model components
        is_compiled = getattr(self, "_model_is_compiled", False)
        if is_compiled or not self.compile_model:
            return

        import torch._dynamo

        # a larger cache size to hold varying number of shapes for torch.compile
        # see https://github.com/pytorch/pytorch/blob/v2.5.1/torch/_dynamo/config.py#L42-L49
        torch._dynamo.config.cache_size_limit = 128
        torch._dynamo.config.accumulated_cache_size_limit = 2048
        torch._dynamo.config.capture_scalar_outputs = True
        torch._dynamo.config.suppress_errors = True

        # Compile module components following https://www.internalfb.com/diff/D70935785
        # skip compilation of `_encode_prompt` since it sometimes tiggger SymInt errors
        # self._encode_prompt = clone_output_wrapper(
        #     torch.compile(self._encode_prompt, fullgraph=True, mode="max-autotune")
        # )

        ## Compile SAM3 model components (matching OV: clone_output_wrapper(torch.compile(fn)))
        if self.detector.backbone.language_backbone is not None:
            self.detector.backbone.language_backbone.encoder.forward = clone_output_wrapper(
                torch.compile(
                    self.detector.backbone.language_backbone.encoder.forward,
                    fullgraph=True,
                    mode="max-autotune",
                )
            )

        self.detector.backbone.vision_backbone.forward = clone_output_wrapper(
            torch.compile(
                self.detector.backbone.vision_backbone.forward,
                fullgraph=True,
                mode="max-autotune",
            )
        )
        self.detector.transformer.encoder.forward = clone_output_wrapper(
            torch.compile(
                self.detector.transformer.encoder.forward,
                fullgraph=True,
                mode="max-autotune",
            )
        )
        self.detector.transformer.decoder.forward = clone_output_wrapper(
            torch.compile(
                self.detector.transformer.decoder.forward,
                fullgraph=True,
                mode="max-autotune",
                dynamic=False,  # note: FA decoder uses static shapes
            )
        )

        self.detector.segmentation_head.forward = clone_output_wrapper(
            torch.compile(
                self.detector.segmentation_head.forward,
                fullgraph=True,
                mode="max-autotune",
            )
        )

        ## Compile SAM2 model components
        self.tracker.maskmem_backbone.forward = compile_wrapper(
            self.tracker.maskmem_backbone.forward,
            mode="max-autotune",
            fullgraph=True,
            dynamic=False,
        )

        self.tracker.transformer.encoder.forward = shape_logging_wrapper(
            compile_wrapper(
                self.tracker.transformer.encoder.forward,
                mode="max-autotune-no-cudagraphs",
                fullgraph=True,
                dynamic=True,
            ),
            keep_kwargs=["src", "src_pos", "prompt", "prompt_pos"],
        )

        self.tracker.sam_mask_decoder.forward = compile_wrapper(
            self.tracker.sam_mask_decoder.forward,
            mode="max-autotune",
            fullgraph=True,
            dynamic=False,  # Accuracy regression on True
        )

        sam3_video_base._associate_det_trk_compilable = compile_wrapper(
            sam3_video_base._associate_det_trk_compilable,
            mode="max-autotune-no-cudagraphs",
            fullgraph=True,
            dynamic=False,
        )

        self.tracker._suppress_object_pw_area_shrinkage = compile_wrapper(
            self.tracker._suppress_object_pw_area_shrinkage,
            mode="max-autotune-no-cudagraphs",
            fullgraph=True,
            dynamic=False,
        )

        self._model_is_compiled = True

    def _warm_up_vg_propagation(self, inference_state, start_frame_idx=0):
        # use different tracking score thresholds for each round to simulate different number of output objects
        num_objects_list = range(self.num_obj_for_compile + 1)
        num_rounds = 3
        orig_new_det_thresh = self.new_det_thresh
        for i in range(num_rounds):
            for num_objects in num_objects_list:
                logger.info(
                    f"round {i + 1}/{num_rounds} warming up model compilation -- simulating {num_objects}/{self.num_obj_for_compile} objects"
                )
                # Initialize text prompt and cache image features
                self.add_prompt(
                    inference_state, frame_idx=start_frame_idx, text_str="cat"
                )
                if num_objects > 0:
                    inference_state = self.add_fake_objects_to_inference_state(
                        inference_state, num_objects, frame_idx=start_frame_idx
                    )
                inference_state["tracker_metadata"]["rank0_metadata"].update(
                    {
                        "masklet_confirmation": {
                            "status": np.zeros(num_objects, dtype=np.int64),
                            "consecutive_det_num": np.zeros(
                                num_objects, dtype=np.int64
                            ),
                        }
                    }
                )
                for _ in self.propagate_in_video(
                    inference_state, start_frame_idx, reverse=False
                ):
                    pass
                for _ in self.propagate_in_video(
                    inference_state, start_frame_idx, reverse=True
                ):
                    pass
                self.reset_state(inference_state)
                logger.info(
                    f"{i + 1}/{num_rounds} warming up model compilation -- completed round {i + 1} out of {num_rounds}"
                )

        # Warm up SAM2 memory encoder with varying input shapes
        num_iters = 3
        feat_size = self.tracker.sam_image_embedding_size**2  # 72 * 72 = 5184
        hidden_dim = self.tracker.hidden_dim  # 256
        mem_dim = self.tracker.mem_dim  # 64 for non-multiplex, 256 for multiplex
        is_multiplex = self.tracker.is_multiplex

        for _ in tqdm(range(num_iters)):
            for b in range(1, self.num_obj_for_compile + 1):
                for i in range(
                    1,
                    self.tracker.max_cond_frames_in_attn + self.tracker.num_maskmem,
                ):
                    for j in range(
                        self.tracker.max_cond_frames_in_attn
                        + self.tracker.max_obj_ptrs_in_encoder
                    ):
                        if is_multiplex:
                            # Multiplex encoder: mem_dim == hidden_dim, uses decoupled cross-attention
                            # num_obj_ptr_tokens = j (since hidden_dim // mem_dim = 1)
                            num_obj_ptr_tokens = j
                            memory_seq_len = feat_size * i + num_obj_ptr_tokens

                            # src and memory have batch=num_buckets (b)
                            src = torch.randn(
                                feat_size, b, hidden_dim, device=self.device
                            )
                            src_pos = torch.randn(
                                feat_size, b, hidden_dim, device=self.device
                            )
                            memory = torch.randn(
                                memory_seq_len, b, hidden_dim, device=self.device
                            )
                            memory_pos = torch.randn(
                                memory_seq_len, b, hidden_dim, device=self.device
                            )

                            # image and memory_image always have batch=1 (shared image features)
                            image = torch.randn(
                                feat_size, 1, hidden_dim, device=self.device
                            )
                            image_pos = torch.randn(
                                feat_size, 1, hidden_dim, device=self.device
                            )
                            memory_image = torch.randn(
                                feat_size * i, 1, hidden_dim, device=self.device
                            )
                            memory_image_pos = torch.randn(
                                feat_size * i, 1, hidden_dim, device=self.device
                            )

                            self.tracker.transformer.encoder.forward(
                                image=image,
                                src=src,
                                memory_image=memory_image,
                                memory=memory,
                                image_pos=image_pos,
                                src_pos=src_pos,
                                memory_image_pos=memory_image_pos,
                                memory_pos=memory_pos,
                                num_obj_ptr_tokens=num_obj_ptr_tokens,
                            )
                        else:
                            # Non-multiplex encoder: mem_dim = 64, uses standard cross-attention
                            # num_obj_ptr_tokens = (hidden_dim // mem_dim) * j = 4 * j
                            num_obj_ptr_tokens = (hidden_dim // mem_dim) * j
                            src = torch.randn(
                                feat_size, b, hidden_dim, device=self.device
                            )
                            src_pos = torch.randn(
                                feat_size, b, hidden_dim, device=self.device
                            )
                            prompt = torch.randn(
                                feat_size * i + num_obj_ptr_tokens,
                                b,
                                mem_dim,
                                device=self.device,
                            )
                            prompt_pos = torch.randn(
                                feat_size * i + num_obj_ptr_tokens,
                                b,
                                mem_dim,
                                device=self.device,
                            )

                            self.tracker.transformer.encoder.forward(
                                src=src,
                                src_pos=src_pos,
                                prompt=prompt,
                                prompt_pos=prompt_pos,
                                num_obj_ptr_tokens=num_obj_ptr_tokens,
                            )

        # Warm up different number of kbox
        for _ in tqdm(range(num_iters)):
            for i in range(1, self.max_num_kboxes + 1):
                kboxes = (
                    torch.rand(i, 4, dtype=torch.float32) * 0.5
                )  # Generate positive values between 0 and 1
                print(
                    "Warming up masks_to_boxes with",
                    i,
                    f"kboxes.shape={kboxes.shape}",
                )
                self.add_prompt(
                    inference_state,
                    frame_idx=start_frame_idx,
                    text_str="cat",
                    boxes_xywh=kboxes,
                    box_labels=[1] * len(kboxes),
                )

                for _ in self.propagate_in_video(
                    inference_state, start_frame_idx, reverse=False
                ):
                    pass

        self.new_det_thresh = orig_new_det_thresh
        return inference_state

    def add_fake_objects_to_inference_state(
        self, inference_state, num_objects, frame_idx
    ):
        """
        Register `num_objects` synthetic objects in `inference_state`.

        Each fake object (ids ``0 .. num_objects - 1``) gets an all-ones mask at the
        mask downsampler's interpolation size. The objects are handed to
        `_tracker_add_new_objects`, their masks (resized to the original video
        resolution and binarized) are cached for every frame on rank 0, and
        `tracker_metadata` is replaced with a single-GPU layout describing them.

        Args:
            inference_state: state dict; "num_frames", "orig_height", "orig_width",
                "feature_cache" and "sam2_inference_states" are read from it.
            num_objects: number of fake objects to create.
            frame_idx: frame on which the fake objects are added.

        Returns:
            The same `inference_state` dict (updated in place).
        """
        device = self.device
        fake_obj_ids = np.arange(num_objects)
        mask_h, mask_w = self.tracker.maskmem_backbone.mask_downsampler.interpol_size
        fake_masks = torch.ones((len(fake_obj_ids), mask_h, mask_w), device=device)

        inference_state["sam2_inference_states"] = self._tracker_add_new_objects(
            frame_idx=frame_idx,
            num_frames=inference_state["num_frames"],
            new_obj_ids=fake_obj_ids,
            new_obj_masks=fake_masks,
            tracker_states_local=inference_state["sam2_inference_states"],
            orig_vid_height=inference_state["orig_height"],
            orig_vid_width=inference_state["orig_width"],
            feature_cache=inference_state["feature_cache"],
        )

        # Build per-object masks at video resolution so that cached frame outputs
        # exist for `_build_sam2_output` during warm-up.
        obj_id_to_mask = {}
        if num_objects > 0:
            video_size = (inference_state["orig_height"], inference_state["orig_width"])
            # a channel dimension is required by F.interpolate -> (num_objects, 1, H, W)
            video_res_masks = F.interpolate(
                fake_masks.unsqueeze(1),
                size=video_size,
                mode="bilinear",
                align_corners=False,
            )
            obj_id_to_mask = {
                obj_id: (video_res_masks[pos] > 0.0).to(torch.bool)
                for pos, obj_id in enumerate(fake_obj_ids)
            }
        if self.rank == 0:
            for cached_frame_idx in range(inference_state["num_frames"]):
                self._cache_frame_outputs(
                    inference_state, cached_frame_idx, obj_id_to_mask
                )

        long_on_device = {"dtype": torch.long, "device": device}
        bool_on_device = {"dtype": torch.bool, "device": device}

        rank0_metadata = {
            "masklet_confirmation": {
                "status": np.zeros(num_objects, dtype=np.int64),
                "consecutive_det_num": np.zeros(num_objects, dtype=np.int64),
            },
            "removed_obj_ids": set(),
            "suppressed_obj_ids": defaultdict(set),
        }
        # gpu_metadata for hotstart tracking on GPU
        gpu_metadata = {
            "N_obj": num_objects,
            "obj_first_frame": torch.zeros(num_objects, **long_on_device),
            "consecutive_unmatch_count": torch.zeros(num_objects, **long_on_device),
            "trk_keep_alive": torch.ones(num_objects, **bool_on_device),
            "removed_mask": torch.zeros(num_objects, **bool_on_device),
            "overlap_pair_counts": torch.zeros(
                (num_objects, num_objects), **long_on_device
            ),
            "last_occluded_tensor": torch.zeros(num_objects, **long_on_device),
        }
        tracker_metadata = {
            "obj_ids_per_gpu": [np.arange(num_objects)],
            "obj_ids_all_gpu": np.arange(num_objects),  # same as the single-GPU list
            "num_obj_per_gpu": [num_objects],
            "obj_id_to_score": dict.fromkeys(range(num_objects), 1.0),
            "obj_id_to_sam2_score_frame_wise": defaultdict(dict),
            "obj_id_to_last_occluded": {},
            "max_obj_id": num_objects,
            "rank0_metadata": rank0_metadata,
            "gpu_metadata": gpu_metadata,
        }
        inference_state["tracker_metadata"] = tracker_metadata

        if self.is_multiplex:
            # multiplex mode additionally records the actual bucket count
            bucket_count = self._count_buckets_in_states(
                inference_state["sam2_inference_states"]
            )
            tracker_metadata["num_buc_per_gpu"] = np.array(
                [bucket_count], dtype=np.int64
            )

        return inference_state

    @torch.inference_mode()
    @torch.autocast(device_type="cuda", dtype=torch.bfloat16)
    def warm_up_compilation(self):
        """
        Warm up the model by running a dummy inference to compile the model. This is
        useful to avoid the compilation overhead in the first inference call.
        """
        if not self.compile_model:
            return
        self._warm_up_complete = False
        if self.device.type not in {"cuda", "mps"}:
            raise RuntimeError(
                f"The model must be on an accelerator for warm-up compilation, got {self.device=}."
            )

        # temporally set to single GPU temporarily for warm-up compilation
        orig_rank = self.rank
        orig_world_size = self.world_size
        self.rank = self.detector.rank = 0
        self.world_size = self.detector.world_size = 1
        orig_recondition_every_nth_frame = self.recondition_every_nth_frame
        # self.recondition_every_nth_frame = 2

        # Get a random video
        inference_state = self.init_state(resource_path="<load-zero-video-30>")
        start_frame_idx = 0

        # Run basic propagation warm-up
        inference_state = self._warm_up_vg_propagation(inference_state, start_frame_idx)

        logger.info("Warm-up compilation completed.")

        # revert to the original GPU and rank
        self.rank = self.detector.rank = orig_rank
        self.world_size = self.detector.world_size = orig_world_size
        self.recondition_every_nth_frame = orig_recondition_every_nth_frame
        self._warm_up_complete = True
        self.tracker.transformer.encoder.forward.set_logging(True)

    @torch.inference_mode()
    def add_prompt(
        self,
        inference_state,
        frame_idx,
        text_str=None,
        clear_old_points=True,
        points=None,
        point_labels=None,
        boxes_xywh=None,
        box_labels=None,
        clear_old_boxes=True,
        output_prob_thresh=0.5,
        preencoded_text_outputs=None,
    ):
        """
        Add text and/or box prompts on a single frame. This method returns the
        inference outputs only on the prompted frame.

        Note that text prompts are NOT associated with a particular frame (i.e. they
        apply to all frames). However, we only run inference on the frame specified in
        `frame_idx`.

        Copied from sam3_demo.Sam3DemoMixin.add_prompt and simplified: point prompts
        are rejected, and the state is always reset before the new prompt is added.

        Args:
            inference_state: state dict of the current video.
            frame_idx: index of the prompted frame, in ``[0, num_frames)``.
            text_str: optional text prompt, written into the input batch for all frames.
            clear_old_points: must be True (point prompts are not supported).
            points: must be None.
            point_labels: must be None.
            boxes_xywh: optional ``(N, 4)`` boxes as [xmin, ymin, width, height] in
                normalized 0~1 coordinates; must be given together with `box_labels`.
            box_labels: optional length-``N`` labels for `boxes_xywh`.
            clear_old_boxes: must be True.
            output_prob_thresh: accepted for interface compatibility; not read here.
            preencoded_text_outputs: optional pre-computed text features forwarded to
                `_init_backbone_out`.

        Returns:
            A ``(frame_idx, postprocessed_output)`` tuple for the prompted frame.

        Raises:
            AssertionError: if no prompt is given, `frame_idx` is out of range, point
                prompts are supplied, or the boxes are malformed / out of range.
        """
        logger.info("Running add_prompt on frame %d", frame_idx)

        num_frames = inference_state["num_frames"]
        assert text_str is not None or points is not None or boxes_xywh is not None, (
            "at least one type of prompt (text, points, boxes) must be provided"
        )
        assert 0 <= frame_idx < num_frames, (
            f"{frame_idx=} is out of range for a total of {num_frames} frames"
        )

        assert clear_old_boxes, "clear old boxes must be True"

        assert points is None and clear_old_points is True and point_labels is None, (
            "Point prompts not accepted"
        )

        # since it's a semantic prompt, we start over
        self.reset_state(inference_state)

        # 1) add text prompt
        if text_str is not None:
            inference_state["text_prompt"] = text_str
            # add the text prompt into the input batch (to be applied to *all* frames)
            input_batch = inference_state["input_batch"]
            input_batch.find_text_batch[0] = text_str
            text_id = self.TEXT_ID_FOR_TEXT
            for t in range(inference_state["num_frames"]):
                input_batch.find_inputs[t].text_ids[...] = text_id

        # 2) handle box prompt
        assert (boxes_xywh is not None) == (box_labels is not None)
        if boxes_xywh is not None:
            boxes_xywh = torch.as_tensor(boxes_xywh, dtype=torch.float32)
            box_labels = torch.as_tensor(box_labels, dtype=torch.long)
            # input boxes are expected to be [xmin, ymin, width, height] format
            # in normalized coordinates of range 0~1, similar to FA
            assert boxes_xywh.dim() == 2
            assert boxes_xywh.size(0) > 0 and boxes_xywh.size(-1) == 4
            assert box_labels.dim() == 1 and box_labels.size(0) == boxes_xywh.size(0)
            boxes_cxcywh = box_xywh_to_cxcywh(boxes_xywh)
            assert (boxes_xywh >= 0).all().item() and (boxes_xywh <= 1).all().item()
            assert (boxes_cxcywh >= 0).all().item() and (boxes_cxcywh <= 1).all().item()

            # keep the raw (un-split) box input for this frame
            inference_state["per_frame_raw_box_input"][frame_idx] = (
                boxes_cxcywh,
                box_labels,
            )

            # handle the case of visual prompt (also added as an input box from the UI)
            boxes_cxcywh, box_labels, geometric_prompt = self._get_visual_prompt(
                inference_state, frame_idx, boxes_cxcywh, box_labels
            )

            inference_state["per_frame_geometric_prompt"][frame_idx] = geometric_prompt

        with torch.profiler.record_function("add_prompt._init_backbone_out"):
            inference_state["backbone_out"] = self._init_backbone_out(
                inference_state, preencoded_text_outputs
            )
        out = self._run_single_frame_inference(
            inference_state,
            frame_idx,
            reverse=False,
        )
        return frame_idx, self._postprocess_output(inference_state, out)

    def _init_backbone_out(self, inference_state, preencoded_text_outputs=None):
        """
        Build the initial `backbone_out` dict holding the image batch and text features.

        Visual features of individual frames are not computed here; they are
        extracted on the fly when inference runs on each frame.

        Args:
            inference_state: state dict whose "input_batch" provides `img_batch` and
                `find_text_batch`.
            preencoded_text_outputs: optional mapping of pre-computed text outputs.
                Tensor values are moved to `self.device`; other values are passed
                through unchanged. When None, the detector's text encoder is run.

        Returns:
            Dict with "img_batch_all_stages" plus all text-output entries.

        Raises:
            RuntimeError: if no pre-encoded outputs are given and the detector has no
                language backbone loaded.
        """
        batch = inference_state["input_batch"]
        target_device = self.device
        backbone_out = {"img_batch_all_stages": batch.img_batch}

        if preencoded_text_outputs is not None:
            # reuse caller-provided text features, moving tensors onto our device
            text_outputs = {}
            for key, value in preencoded_text_outputs.items():
                if torch.is_tensor(value):
                    value = value.to(device=target_device, non_blocking=True)
                text_outputs[key] = value
        else:
            text_backbone = self.detector.backbone
            if text_backbone.language_backbone is None:
                raise RuntimeError("SAM3 text encoder is not loaded; preencoded_text_outputs is required.")
            text_outputs = self.detector.backbone.forward_text(
                batch.find_text_batch, device=target_device
            )

        backbone_out.update(text_outputs)
        return backbone_out

    @torch.autocast(device_type="cuda", dtype=torch.bfloat16)
    def forward(self, input: BatchedDatapoint, is_inference: bool = False):
        """
        This method is only used for benchmark eval (not used in the demo).

        For every text prompt in `input.find_text_batch`, the prompt is added on
        frame 0 and propagated through the whole video; the per-frame masks and
        per-object scores are collected and handed to `prep_for_evaluator`.

        The model (and its detector) is temporarily switched to a single-GPU setup
        (rank 0, world size 1) to be compatible with the trainer; the original values
        are restored on exit, even if evaluation raises.

        Args:
            input: batched datapoint providing `find_metadatas`, `find_text_batch`
                and `raw_images`.
            is_inference: accepted for interface compatibility; not read here.

        Returns:
            A single-entry dict ``{video_id: preds}``.
        """

        def _as_numpy(value):
            # Outputs may be numpy arrays already, or torch tensors that can live on an
            # accelerator; `.numpy()` only works on CPU tensors, so move them first.
            if isinstance(value, torch.Tensor):
                return value.cpu().numpy()
            return value

        # set the model to single GPU for benchmark evaluation (to be compatible with trainer)
        orig_rank = self.rank
        orig_world_size = self.world_size
        self.rank = self.detector.rank = 0
        self.world_size = self.detector.world_size = 1

        try:
            # get data
            text_prompt_ids = input.find_metadatas[0].original_category_id
            text_prompt_list = input.find_text_batch

            # loop over txt prompts
            tracking_res = defaultdict(dict)  # frame_idx --> {obj_id: mask}
            scores_labels = defaultdict(tuple)  # obj_id --> (score, text_prompt_id)
            inference_state = self.init_state(resource_path=input.raw_images)
            for prompt_id, prompt in zip(text_prompt_ids, text_prompt_list):
                self.add_prompt(inference_state, frame_idx=0, text_str=prompt)
                # object ids of this prompt are offset past all ids used so far
                start_obj_id = max(scores_labels.keys(), default=-1) + 1

                # propagate the prompts
                obj_ids_this_prompt = set()
                for frame_idx, out in self.propagate_in_video(
                    inference_state,
                    start_frame_idx=0,
                    max_frame_num_to_track=inference_state["num_frames"],
                    reverse=False,
                ):
                    out_obj_ids = _as_numpy(out["out_obj_ids"])
                    out_binary_masks = _as_numpy(out["out_binary_masks"])

                    current_frame_res = tracking_res[frame_idx]
                    for obj_id, mask in zip(out_obj_ids, out_binary_masks):
                        mask_tensor = torch.tensor(mask[None], dtype=torch.bool)
                        current_frame_res[obj_id + start_obj_id] = mask_tensor
                    obj_ids_this_prompt.update(current_frame_res.keys())

                obj_id_to_score = inference_state["tracker_metadata"]["obj_id_to_score"]
                for obj_id, score in obj_id_to_score.items():
                    if obj_id + start_obj_id in obj_ids_this_prompt:
                        score_tensor = torch.tensor(score, dtype=torch.float32)
                        scores_labels[obj_id + start_obj_id] = (score_tensor, prompt_id)

                self.reset_state(inference_state)

            video_id = input.find_metadatas[0].original_image_id[0].cpu().item()
            preds = self.prep_for_evaluator(
                input.raw_images, tracking_res, scores_labels
            )
            return {video_id: preds}
        finally:
            # revert the model to the original GPU and rank, also on failure
            self.rank = self.detector.rank = orig_rank
            self.world_size = self.detector.world_size = orig_world_size


class Sam3MultiplexTrackingProd(Sam3MultiplexTracking):
    """
    Subclass of Sam3MultiplexTracking with support for batched processing.

    This class enables processing videos in batches rather than all at once by:
    1. Adding an `is_last_batch` parameter to control buffer flushing
    2. Persisting generator state (hotstart_buffer, hotstart_removed_obj_ids,
       unconfirmed_obj_ids_per_frame, postprocess_yield_list) in inference_state
       across generator instantiations

    This is useful for processing large videos in smaller chunks to manage memory
    or distribute processing across multiple calls.
    """

    @staticmethod
    def _new_generator_state():
        """Return a fresh, empty generator state carried across batches."""
        return {
            "hotstart_buffer": [],
            "hotstart_removed_obj_ids": set(),
            "unconfirmed_obj_ids_per_frame": {},
            "postprocess_yield_list": [],
        }

    @torch.inference_mode()
    def init_state(
        self,
        resource_path,
        offload_video_to_cpu=False,
        async_loading_frames=False,
        use_cv2=False,
        input_is_mp4=False,
    ):
        """Create the base inference state and attach an empty generator state."""
        inference_state = super().init_state(
            resource_path=resource_path,
            offload_video_to_cpu=offload_video_to_cpu,
            async_loading_frames=async_loading_frames,
            use_cv2=use_cv2,
            input_is_mp4=input_is_mp4,
        )
        # Initialize generator state for batched processing
        inference_state["generator_state"] = self._new_generator_state()
        return inference_state

    def reset_state(self, inference_state):
        """Reset the base state and discard any buffered generator state."""
        super().reset_state(inference_state)
        # Reset generator state for batched processing
        inference_state["generator_state"] = self._new_generator_state()

    def _postprocess_and_yield(
        self,
        inference_state,
        entries,
        unconfirmed_obj_ids_per_frame,
        unconfirmed_status_delay,
        reverse,
    ):
        """
        Postprocess buffered frame outputs and yield `(frame_idx, output)` pairs.

        Args:
            inference_state: state dict of the current video.
            entries: list of `(frame_idx, raw_out, removed_obj_ids_snapshot)` tuples.
            unconfirmed_obj_ids_per_frame: dict mapping frame index to the
                unconfirmed object ids recorded on that frame.
            unconfirmed_status_delay: how many frames ahead (in propagation
                direction) the confirmation status is looked up.
            reverse: whether propagation runs backwards in time.

        On rank 0 every frame is cached via `_cache_frame_outputs` and then
        postprocessed (in one batched call when `postprocess_batch_size > 1`,
        otherwise frame by frame). Other ranks yield `DUMMY_OUTPUT` for every frame.
        """
        with torch.profiler.record_function(
            "Sam3MultiplexTrackingProd.postprocess_output_batched"
        ):
            if self.rank == 0:
                H_video, W_video = (
                    inference_state["orig_height"],
                    inference_state["orig_width"],
                )
                num_frames = inference_state["num_frames"]

                # Prepare batched inputs for postprocessing
                batched_outs = []
                frame_indices = []
                for yield_frame_idx, yield_out, removed_obj_ids_snapshot in entries:
                    suppressed_obj_ids = yield_out["suppressed_obj_ids"]
                    if reverse:
                        status_frame_idx = yield_frame_idx - unconfirmed_status_delay
                    else:
                        status_frame_idx = yield_frame_idx + unconfirmed_status_delay
                    # clamp the look-ahead frame to the valid frame range
                    status_frame_idx = max(0, min(status_frame_idx, num_frames - 1))
                    unconfirmed_obj_ids = unconfirmed_obj_ids_per_frame.get(
                        status_frame_idx, None
                    )

                    batched_outs.append(
                        (
                            yield_out,
                            removed_obj_ids_snapshot,
                            suppressed_obj_ids,
                            unconfirmed_obj_ids,
                        )
                    )
                    frame_indices.append(yield_frame_idx)

                    # Cache frame outputs
                    self._cache_frame_outputs(
                        inference_state,
                        yield_frame_idx,
                        yield_out["obj_id_to_mask"],
                        suppressed_obj_ids=suppressed_obj_ids,
                        removed_obj_ids=removed_obj_ids_snapshot,
                        unconfirmed_obj_ids=unconfirmed_obj_ids,
                    )

                if self.postprocess_batch_size > 1:
                    # Process all frames in one batched call
                    postprocessed_outs = self._postprocess_output_batched(
                        H_video, W_video, batched_outs
                    )
                else:
                    # Process each frame individually but output together
                    postprocessed_outs = []
                    for (
                        yield_out,
                        removed_obj_ids_snapshot,
                        suppressed_obj_ids,
                        unconfirmed_obj_ids,
                    ) in batched_outs:
                        postprocessed_outs.append(
                            self._postprocess_output(
                                inference_state,
                                yield_out,
                                removed_obj_ids_snapshot,
                                suppressed_obj_ids,
                                unconfirmed_obj_ids,
                            )
                        )

                # Yield results
                for yield_frame_idx, postprocessed_out in zip(
                    frame_indices, postprocessed_outs
                ):
                    yield yield_frame_idx, postprocessed_out
            else:
                # No output on other GPUs
                for yield_frame_idx, _, _ in entries:
                    yield yield_frame_idx, DUMMY_OUTPUT

    @torch.inference_mode()
    def propagate_in_video(
        self,
        inference_state,
        start_frame_idx=None,
        max_frame_num_to_track=None,
        reverse=False,
        output_prob_thresh=0.5,
        compute_stability_score=False,
        is_instance_processing=False,
        is_last_batch=True,
        progress_callback=None,
    ):
        """
        Propagate the prompts to get grounding results for the entire video. This method
        is a generator and yields inference outputs for all frames in the range specified
        by `start_frame_idx`, `max_frame_num_to_track`, and `reverse`.

        Args:
            inference_state: state dict of the current video.
            start_frame_idx: first frame to process (forwarded to
                `_get_processing_order`).
            max_frame_num_to_track: maximum number of frames to process (forwarded to
                `_get_processing_order`).
            reverse: whether to propagate backwards in time.
            output_prob_thresh: accepted for interface compatibility; not read here.
            compute_stability_score: accepted for interface compatibility; not read
                here.
            is_instance_processing: forwarded to `_run_single_frame_inference`.
            is_last_batch: Whether this is the last batch in a batched processing scenario.
                When True (default), the hotstart buffer will be flushed at end_frame_idx.
                When False, the buffer is preserved in inference_state for the next batch.
                This flag should be set to False for all batches except the last one when
                processing a video in multiple batches.
            progress_callback: optional callable invoked on rank 0 as
                ``progress_callback(frames_done, frames_total)`` after each frame.

        Yields:
            ``(frame_idx, output)`` pairs; ranks other than 0 yield `DUMMY_OUTPUT`.

        Note:
            The generator state is written back to `inference_state` only after the
            generator has been fully consumed.
        """
        # compile the model (it's a no-op if the model is already compiled)
        # note that it's intentionally added to `self.propagate_in_video`, so that the first
        # `self.add_prompt` call will be done in eager mode to fill in the decoder buffers
        # (such as positional encoding cache)
        self._compile_model()

        processing_order, end_frame_idx = self._get_processing_order(
            inference_state,
            start_frame_idx,
            max_frame_num_to_track,
            reverse=reverse,
        )

        # Store max_frame_num_to_track in feature_cache for downstream methods
        inference_state["feature_cache"]["tracking_bounds"] = {
            "max_frame_num_to_track": max_frame_num_to_track,
            "propagate_in_video_start_frame_idx": start_frame_idx,
        }

        # Initialize or retrieve generator state from inference_state to persist across batches
        if "generator_state" not in inference_state:
            inference_state["generator_state"] = self._new_generator_state()

        generator_state = inference_state["generator_state"]
        hotstart_buffer = generator_state["hotstart_buffer"]
        hotstart_removed_obj_ids = generator_state["hotstart_removed_obj_ids"]
        unconfirmed_obj_ids_per_frame = generator_state["unconfirmed_obj_ids_per_frame"]
        postprocess_yield_list = generator_state.get("postprocess_yield_list", [])

        # when deciding whether to output a masklet on `yield_frame_idx`, we check whether the object is confirmed
        # in a future frame (`unconfirmed_frame_delay` frames after the current frame). For example, if we require
        # an object to be detected in 3 consecutive frames to be confirmed, then we look 2 frames in the future --
        # e.g., we output an object on frame 4 only if it becomes confirmed on frame 6.
        unconfirmed_status_delay = self.masklet_confirmation_consecutive_det_thresh - 1

        progress_done = 0
        progress_total = len(processing_order)
        for frame_idx in tqdm(
            processing_order, desc="propagate_in_video", disable=self.rank > 0
        ):
            out = self._run_single_frame_inference(
                inference_state,
                frame_idx,
                reverse,
                is_instance_processing=is_instance_processing,
            )
            progress_done += 1
            if progress_callback is not None and self.rank == 0:
                progress_callback(progress_done, progress_total)

            if self.hotstart_delay > 0:
                # accumulate the outputs for the first `hotstart_delay` frames
                hotstart_buffer.append([frame_idx, out])
                # update the object IDs removed by hotstart so that we don't output them
                if self.rank == 0:
                    hotstart_removed_obj_ids.update(out["removed_obj_ids"])
                    unconfirmed_obj_ids = out.get("unconfirmed_obj_ids", None)
                    if unconfirmed_obj_ids is not None:
                        unconfirmed_obj_ids_per_frame[frame_idx] = unconfirmed_obj_ids

                if frame_idx == end_frame_idx and is_last_batch:
                    # we reached the end of propagation -- yield all frames in the buffer
                    yield_list = hotstart_buffer
                    hotstart_buffer = []
                elif len(hotstart_buffer) >= self.hotstart_delay:
                    # we have enough frames -- yield and remove the first (oldest) frame from the buffer
                    yield_list = hotstart_buffer[:1]
                    hotstart_buffer = hotstart_buffer[1:]
                else:
                    # not enough frames yet -- skip yielding
                    yield_list = []
            else:
                yield_list = [(frame_idx, out)]  # output the current frame

            # Accumulate yield_list into postprocess_yield_list
            # Snapshot hotstart_removed_obj_ids at the time of accumulation to preserve
            # the correct state for each frame (important: this set is mutated over time)
            for yield_frame_idx, yield_out in yield_list:
                postprocess_yield_list.append(
                    (yield_frame_idx, yield_out, set(hotstart_removed_obj_ids))
                )

            # Process batch when we have enough frames
            while len(postprocess_yield_list) >= self.postprocess_batch_size:
                batch_to_process = postprocess_yield_list[: self.postprocess_batch_size]
                postprocess_yield_list = postprocess_yield_list[
                    self.postprocess_batch_size :
                ]
                yield from self._postprocess_and_yield(
                    inference_state,
                    batch_to_process,
                    unconfirmed_obj_ids_per_frame,
                    unconfirmed_status_delay,
                    reverse,
                )

        # Handle remaining frames in hotstart buffer at end of last batch
        if is_last_batch and len(hotstart_buffer) > 0:
            for yield_frame_idx, yield_out in hotstart_buffer:
                postprocess_yield_list.append(
                    (yield_frame_idx, yield_out, set(hotstart_removed_obj_ids))
                )
            hotstart_buffer = []

        # Flush any remaining frames in the postprocess buffer (even partial
        # batches) so that the caller gets results as soon as possible. This is
        # especially important for the first batch where hotstart_delay causes
        # only a few frames to exit the hotstart buffer — without this flush
        # the client would have to wait for the next batch before receiving any
        # output, hurting time-to-first-frame.
        if len(postprocess_yield_list) > 0:
            yield from self._postprocess_and_yield(
                inference_state,
                postprocess_yield_list,
                unconfirmed_obj_ids_per_frame,
                unconfirmed_status_delay,
                reverse,
            )
            postprocess_yield_list = []

        # Store the generator state back to inference_state for persistence across batches
        generator_state["postprocess_yield_list"] = postprocess_yield_list
        generator_state["hotstart_buffer"] = hotstart_buffer
        generator_state["hotstart_removed_obj_ids"] = hotstart_removed_obj_ids
        generator_state["unconfirmed_obj_ids_per_frame"] = unconfirmed_obj_ids_per_frame

        if self.is_multiplex:
            # log the bucket utilization stats
            # bucket utilization rate is total valid objects / total capacity -> represents rooms for improvement
            # subscription rate is total valid objects / total number of buckets -> represents speedup
            total_valid_objects = 0
            total_num_buckets = 0
            for state in inference_state["sam2_inference_states"]:
                assert (
                    len(state["obj_ids"])
                    == state["multiplex_state"].total_valid_entries
                )
                total_valid_objects += len(state["obj_ids"])
                total_num_buckets += state["multiplex_state"].num_buckets
            if total_num_buckets > 0:
                bucket_utilization_rate = (
                    total_valid_objects / (total_num_buckets * self.bucket_capacity)
                ) * 100
                subscription_rate = (total_valid_objects / total_num_buckets) * 100
                logger.info(
                    f"Bucket utilization rate: {bucket_utilization_rate:.2f}%, subscription rate: {subscription_rate:.2f}%"
                )


class Sam3MultiplexTrackingWithInteractivity(Sam3MultiplexTracking):
    def __init__(
        self,
        use_prev_mem_frame=False,
        use_stateless_refinement=False,
        refinement_detector_cond_frame_removal_window=30 * 4,
        **kwargs,
    ):
        """Store the interactivity options on top of the parent tracker setup.

        Args:
            use_prev_mem_frame: bool, whether to condition on previous memory
                frames for adding points.
            use_stateless_refinement: bool, whether to enable stateless
                refinement behavior.
            refinement_detector_cond_frame_removal_window: int, a detector
                conditioning frame is removed if it is within this many frames
                of a user refined frame. Set to a large value (e.g. 10000) to
                always remove detector conditioning frames if there is any
                user refinement in the video.
            **kwargs: forwarded unchanged to the parent constructor.
        """
        super().__init__(**kwargs)
        removal_window = refinement_detector_cond_frame_removal_window
        self.refinement_detector_cond_frame_removal_window = removal_window
        self.use_stateless_refinement = use_stateless_refinement
        self.use_prev_mem_frame = use_prev_mem_frame

    @torch.inference_mode()
    def init_state(
        self,
        resource_path,
        offload_video_to_cpu=False,
        async_loading_frames=False,
        use_cv2=False,
        input_is_mp4=False,
    ):
        """Build the parent inference state and attach the interactivity extras.

        All arguments are forwarded by keyword to the parent ``init_state``.
        On top of what the parent returns, an empty ``"action_history"`` list is
        added, and in per-object inference mode a single SAM2 inference state is
        created right away.

        Returns:
            The inference state dict.
        """
        parent_kwargs = {
            "resource_path": resource_path,
            "offload_video_to_cpu": offload_video_to_cpu,
            "async_loading_frames": async_loading_frames,
            "use_cv2": use_cv2,
            "input_is_mp4": input_is_mp4,
        }
        state = super().init_state(**parent_kwargs)

        # log of user and propagation actions, consulted before each propagation
        state["action_history"] = []

        if not self.tracker.per_obj_inference:
            return state

        # per_obj mode needs exactly one SAM2 inference state, created up front
        state["sam2_inference_states"] = [self._init_new_sam2_state(state)]
        return state

    def reset_state(self, inference_state):
        """Reset the parent state, then the extras added by this class."""
        super().reset_state(inference_state)

        # forget all recorded actions (the same list object is emptied in place)
        inference_state["action_history"].clear()

        if not self.tracker.per_obj_inference:
            return

        # per_obj mode: start over from one fresh SAM2 inference state
        fresh_sam2_state = self._init_new_sam2_state(inference_state)
        inference_state["sam2_inference_states"] = [fresh_sam2_state]

    def _init_new_sam2_state(self, inference_state):
        return self.tracker.init_state(
            cached_features=inference_state["feature_cache"],
            video_height=inference_state["orig_height"],
            video_width=inference_state["orig_width"],
            num_frames=inference_state["num_frames"],
        )

    def cancel_propagation(self, inference_state):
        """
        Record that an ongoing propagation was cancelled.

        This only logs a message and appends a "propagation_cancel" entry (with no
        frame index and no object ids) to the action history; no other state is
        modified here.
        """
        logger.info("Cancelling ongoing propagation.")
        cancel_record = {
            "action_type": "propagation_cancel",
            "frame_idx": None,
            "obj_ids": None,
        }
        self.add_action_history(inference_state, **cancel_record)

    def fetch_and_process_single_frame_results(self, inference_state, frame_idx):
        """Post-process the already cached predictions of a single frame.

        The cached masks, the per-object scores, the per-frame SAM2 scores and the
        per-frame suppressed object ids are read from ``inference_state`` and handed
        to ``self._postprocess_output``.

        Returns:
            ``(frame_idx, postprocessed_output)``.
        """
        metadata = inference_state["tracker_metadata"]
        frame_masks = inference_state["cached_frame_outputs"][frame_idx]
        scores = metadata["obj_id_to_score"]
        # objects suppressed on this frame are filtered out during post-processing
        suppressed_per_frame = metadata["rank0_metadata"]["suppressed_obj_ids"]
        frame_suppressed = suppressed_per_frame[frame_idx]
        frame_sam2_scores = metadata["obj_id_to_sam2_score_frame_wise"][frame_idx]

        raw_output = {
            "obj_id_to_mask": frame_masks,
            "obj_id_to_score": scores,
            "obj_id_to_sam2_score": frame_sam2_scores,
        }
        processed = self._postprocess_output(
            inference_state, raw_output, suppressed_obj_ids=frame_suppressed
        )
        return frame_idx, processed

    @torch.inference_mode()
    def propagate_in_video(
        self,
        inference_state,
        start_frame_idx=None,
        max_frame_num_to_track=None,
        reverse=False,
        output_prob_thresh=0.5,
        compute_stability_score=False,
        is_instance_processing=False,
        is_last_batch: bool = False,
        progress_callback=None,
    ):
        """
        Generator that runs the propagation selected from the action history.

        The propagation type comes from `parse_action_history_for_propagation` and is
        itself appended to the action history before any work is done:
          - "propagation_full": delegate entirely to the parent `propagate_in_video`.
          - "propagation_fetch": re-yield the cached per-frame outputs, nothing is recomputed.
          - "propagation_partial": re-run the tracker only for the selected objects, share
            their scores/masks across ranks and merge them with the existing predictions.

        Args:
            inference_state: the inference state dict.
            start_frame_idx, max_frame_num_to_track, reverse: forwarded to the parent
                propagation or to `_get_processing_order` to select the frames.
            output_prob_thresh, compute_stability_score, is_instance_processing: not
                referenced in this method body.
            is_last_batch: only forwarded to the parent in the "propagation_full" case.
            progress_callback: optional callable invoked as `callback(done, total)`; in the
                fetch/partial paths it is only called on rank 0.

        Yields:
            `(frame_idx, output)` per processed frame. In the fetch/partial paths ranks
            other than 0 yield `DUMMY_OUTPUT` instead of a real output.
        """
        # step 1: check which type of propagation to run, should be the same for all GPUs.
        propagation_type, obj_ids = self.parse_action_history_for_propagation(
            inference_state
        )
        # record the decision so that the next call can take it into account
        self.add_action_history(
            inference_state,
            action_type=propagation_type,
            obj_ids=obj_ids,
            frame_idx=start_frame_idx,
        )

        # step 2: run full VG propagation
        if propagation_type == "propagation_full":
            logger.info(f"Running full VG propagation (reverse={reverse}).")
            yield from super().propagate_in_video(
                inference_state,
                start_frame_idx=start_frame_idx,
                max_frame_num_to_track=max_frame_num_to_track,
                reverse=reverse,
                is_last_batch=is_last_batch,
                progress_callback=progress_callback,
            )
            return

        # step 3: run SAM2 partial propagation or direct fetch existing predictions
        assert propagation_type in ["propagation_partial", "propagation_fetch"]
        logger.info(
            f"Running SAM2 propagation for objects {obj_ids} and merging it with existing VG predictions (reverse={reverse})."
            if propagation_type == "propagation_partial"
            else f"Fetching existing VG predictions without running any propagation (reverse={reverse})."
        )
        # the second return value (end frame index) is not used below
        processing_order, _end_frame_idx = self._get_processing_order(
            inference_state,
            start_frame_idx=start_frame_idx,
            max_frame_num_to_track=max_frame_num_to_track,
            reverse=reverse,
        )

        tracker_metadata = inference_state["tracker_metadata"]

        # if fetch just return from output
        if propagation_type == "propagation_fetch":
            progress_done = 0
            progress_total = len(processing_order)
            for frame_idx in tqdm(processing_order):
                if self.rank == 0:
                    frame_idx, out = self.fetch_and_process_single_frame_results(
                        inference_state, frame_idx
                    )
                    progress_done += 1
                    if progress_callback is not None:
                        progress_callback(progress_done, progress_total)
                    yield frame_idx, out
                else:
                    yield frame_idx, DUMMY_OUTPUT  # no output for other GPUs

            return

        # get SAM2 inference states containing selected obj_ids
        # (the condition below always holds here: "full" and "fetch" returned above)
        if propagation_type == "propagation_partial":
            # can be empty for GPUs where objects are not in their inference states
            tracker_states_local = self._get_sam2_inference_states_by_obj_ids(
                inference_state, obj_ids
            )
            for sam2_state in tracker_states_local:
                self.tracker.propagate_in_video_preflight(
                    sam2_state, run_mem_encoder=True
                )

        progress_done = 0
        progress_total = len(processing_order)
        for frame_idx in tqdm(processing_order):
            # run SAM2 propagation
            if propagation_type == "propagation_partial":
                # make sure backbone features of this frame are available before tracking
                self._prepare_backbone_feats(inference_state, frame_idx, reverse)
                obj_ids_local, low_res_masks_local, sam2_scores_local = (
                    self._propogate_tracker_one_frame_local_gpu(
                        tracker_states_local,
                        frame_idx=frame_idx,
                        reverse=reverse,
                        run_mem_encoder=True,
                    )
                )

                # broadcast refined object sam2 scores and masks to all GPUs
                # handle multiple objects that can be located on different GPUs
                refined_obj_data = {}  # obj_id -> (score, mask_low_res)

                # Collect data for objects on this GPU
                local_obj_data = {}
                for obj_id in obj_ids:
                    obj_rank = self._get_gpu_id_by_obj_id(inference_state, obj_id)
                    if self.rank == obj_rank and obj_id in obj_ids_local:
                        refined_obj_idx = obj_ids_local.index(obj_id)
                        refined_mask_low_res = low_res_masks_local[
                            refined_obj_idx
                        ]  # (H_low_res, W_low_res)
                        refined_score = sam2_scores_local[refined_obj_idx]

                        # Keep low resolution for broadcasting to reduce communication cost
                        local_obj_data[obj_id] = (refined_score, refined_mask_low_res)

                # Broadcast data from each GPU that has refined objects
                if self.world_size > 1:
                    # every rank walks obj_ids in the same order and calls
                    # broadcast_python_obj_cpu once per object with the same src
                    # (presumably a collective call that all ranks must enter - confirm)
                    for obj_id in obj_ids:
                        obj_rank = self._get_gpu_id_by_obj_id(inference_state, obj_id)
                        if self.rank == obj_rank:
                            # This GPU has the object, broadcast its data
                            data_to_broadcast = local_obj_data.get(obj_id, None)
                            data_list = [data_to_broadcast]
                            self.broadcast_python_obj_cpu(data_list, src=obj_rank)
                            if data_to_broadcast is not None:
                                refined_obj_data[obj_id] = data_to_broadcast
                        # NOTE(review): this condition is always true when reached, a plain
                        # `else` would be equivalent
                        elif self.rank != obj_rank:
                            # This GPU doesn't have the object, receive data
                            data_list = [None]
                            self.broadcast_python_obj_cpu(data_list, src=obj_rank)
                            if data_list[0] is not None:
                                refined_obj_data[obj_id] = data_list[0]
                else:
                    # Single GPU case
                    refined_obj_data = local_obj_data

                # Update SAM2 scores for all refined objects
                for obj_id, (refined_score, _) in refined_obj_data.items():
                    # After broadcast_python_obj_cpu in multi-GPU, tensors may become numpy scalars
                    # Ensure it's a GPU tensor for consistency with base class behavior
                    if not isinstance(refined_score, torch.Tensor):
                        refined_score = torch.tensor(
                            refined_score, dtype=torch.float32, device=self.device
                        )
                    tracker_metadata["obj_id_to_sam2_score_frame_wise"][
                        frame_idx
                    ].update({obj_id: refined_score})

                if self.rank == 0:
                    # get predictions from SAM2 inference states, it includes the original
                    # VG predictions and the refined predictions from interactivity.

                    # Prepare refined masks dictionary - upscale to video resolution after broadcast
                    refined_obj_id_to_mask = {}
                    for obj_id, (_, refined_mask_low_res) in refined_obj_data.items():
                        refined_mask_video_res = (
                            self._convert_low_res_mask_to_video_res(
                                refined_mask_low_res, inference_state
                            )
                        )  # (1, H_video, W_video) bool
                        refined_obj_id_to_mask[obj_id] = refined_mask_video_res

                    obj_id_to_mask = self._build_sam2_output(
                        inference_state, frame_idx, refined_obj_id_to_mask
                    )
                    out = {
                        "obj_id_to_mask": obj_id_to_mask,
                        "obj_id_to_score": tracker_metadata["obj_id_to_score"],
                        "obj_id_to_sam2_score": tracker_metadata[
                            "obj_id_to_sam2_score_frame_wise"
                        ][frame_idx],
                    }
                    suppressed_obj_ids = tracker_metadata["rank0_metadata"][
                        "suppressed_obj_ids"
                    ][frame_idx]
                    # refresh the cache so later "propagation_fetch" calls see the merged masks
                    self._cache_frame_outputs(
                        inference_state,
                        frame_idx,
                        obj_id_to_mask,
                        suppressed_obj_ids=suppressed_obj_ids,
                    )
                    # NOTE(review): same lookup as a few lines above; looks redundant unless
                    # _cache_frame_outputs replaces this entry - confirm before removing
                    suppressed_obj_ids = tracker_metadata["rank0_metadata"][
                        "suppressed_obj_ids"
                    ][frame_idx]
                    progress_done += 1
                    if progress_callback is not None:
                        progress_callback(progress_done, progress_total)
                    yield (
                        frame_idx,
                        self._postprocess_output(
                            inference_state, out, suppressed_obj_ids=suppressed_obj_ids
                        ),
                    )
                else:
                    yield frame_idx, DUMMY_OUTPUT  # no output for other GPUs

    def add_action_history(
        self, inference_state, action_type, frame_idx=None, obj_ids=None
    ):
        """
        Append one entry to inference_state["action_history"].

        The action history is what propagation consults to decide automatically what to do.
        action_type: one of ["add", "remove", "refine"] + ["propagation_full", "propagation_partial", "propagation_fetch", "propagation_cancel"]
        frame_idx / obj_ids: stored as-is in the entry (both default to None).
        """
        instance_actions = ["add", "remove", "refine"]
        propagation_actions = [
            "propagation_full",
            "propagation_partial",
            "propagation_fetch",
            "propagation_cancel",
        ]
        is_known_action = (
            action_type in instance_actions or action_type in propagation_actions
        )
        assert is_known_action, (
            f"Invalid action type: {action_type}, must be one of {instance_actions + propagation_actions}"
        )
        inference_state["action_history"].append(
            {"type": action_type, "frame_idx": frame_idx, "obj_ids": obj_ids}
        )

    def _has_object_been_refined(self, inference_state, obj_id):
        if "action_history" not in inference_state:
            return False
        action_history = inference_state["action_history"]
        for action in action_history:
            if action["type"] in ["add", "refine"] and action.get("obj_ids"):
                if obj_id in action["obj_ids"]:
                    return True
        return False

    def parse_action_history_for_propagation(self, inference_state):
        """Decide which propagation to run next, handling a trailing cancel.

        When the last recorded action is "propagation_cancel", the action that
        preceded it is repeated. Every other history is delegated to
        ``_parse_action_history_for_propagation``.

        Returns:
            ``(propagation_type, obj_ids)``. When resuming after a cancel, these
            are the "type" and "obj_ids" of the action being repeated.
        """
        action_history = inference_state["action_history"]
        if action_history and action_history[-1]["type"] == "propagation_cancel":
            if len(action_history) == 1:
                # only one action and it is cancel, we do full propagation
                return "propagation_full", None

            # last action is cancel, we go back to the action before cancel
            action_before_cancelation = action_history[-2]
            # the action before cancellation can be a propagation_fetch from running both
            # forward and backward propagation as in webdemo interface, in that case we go
            # back one more step. Only do so when such an earlier action exists: indexing
            # [-3] on a two-entry history would raise IndexError.
            if (
                action_before_cancelation["type"] == "propagation_fetch"
                and len(action_history) >= 3
            ):
                action_before_cancelation = action_history[-3]
            return action_before_cancelation["type"], action_before_cancelation.get(
                "obj_ids", None
            )

        return self._parse_action_history_for_propagation(
            action_history, inference_state["num_frames"]
        )

    def _parse_action_history_for_propagation(self, action_history, num_frames):
        """
        Parse the actions in history before the last propagation and prepare for the next propagation.
        We support multiple actions (add/remove/refine) between two propagations. If we had an action
        history similar to this ["propagate", "add", "refine", "remove", "add"], the next propagation
        would remove the removed object, and also propagate the two added/refined objects.

        Returns:
            propagation_type: one of ["propagation_full", "propagation_partial", "propagation_fetch"]
                - "propagation_full": run VG propagation for all objects
                - "propagation_partial": run SAM2 propagation for selected objects, useful for add/refine actions
                - "propagation_fetch": fetch existing VG predictions without running any propagation
                - "propagation_cancel": this will be handled in parse_action_history_for_propagation() not this function.
            obj_ids: list of object ids to run SAM2 propagation on if propagation_type is "propagation_partial".

        TODO: (Jie) this function works for our current workflows, but may need more tests to ensure it works
        correctly with different action histories for future workflows.
        """
        if len(action_history) == 0:
            # we run propagation for the first time
            return "propagation_full", None

        if "propagation" in action_history[-1]["type"]:
            if action_history[-1]["type"] in ["propagation_fetch"]:
                # last propagation is direct fetch, we fetch existing predictions
                return "propagation_fetch", None
            elif action_history[-1]["type"] in [
                "propagation_partial",
                "propagation_full",
            ]:
                # we do fetch prediction if we have already run propagation twice or we have run
                # propagation once and it is from the first frame or last frame.
                if (
                    len(action_history) > 1
                    and action_history[-2]["type"]
                    in ["propagation_partial", "propagation_full"]
                ) or action_history[-1]["frame_idx"] in [
                    0,
                    num_frames - 1,
                ]:
                    # we have run both forward and backward partial/full propagation
                    return "propagation_fetch", None
                else:
                    # we have run partial/full forward or backward propagation once, need run it for the rest of the frames
                    return action_history[-1]["type"], action_history[-1]["obj_ids"]

        # parse actions since last propagation
        obj_ids = []
        for action in action_history[::-1]:
            if "propagation" in action["type"]:
                # we reached the last propagation action, stop parsing
                break
            if action["type"] in ["add", "refine"]:
                obj_ids.extend(action["obj_ids"])
            # else action["type"] == "remove": noop
        obj_ids = list(set(obj_ids)) if len(obj_ids) > 0 else None
        propagation_type = (
            "propagation_partial" if obj_ids is not None else "propagation_fetch"
        )
        return propagation_type, obj_ids

    def remove_object(
        self, inference_state, obj_id, frame_idx=None, is_user_action=False
    ):
        """
        Remove an object from tracking and optionally re-fetch one frame's outputs.

        The object is removed from the SAM2 states only on the rank that owns it,
        while the shared metadata and the cached frame outputs are updated on every
        rank. If the object is not found on any rank, a warning is logged and only
        the action history and cached outputs are touched.

        Args:
            inference_state: the inference state dict.
            obj_id: id of the object to remove.
            frame_idx: frame whose outputs are fetched again after the removal.
                Defaults to None (skip the fetch) so that callers which only need
                the removal, and do not pass a frame index, keep working.
            is_user_action: when True, a "remove" entry is appended to the action
                history.

        Returns:
            ``(frame_idx, out)`` where ``out`` is the post-processed output of
            ``frame_idx`` on rank 0 when a frame index was given, and None otherwise.
        """
        obj_rank = self._get_gpu_id_by_obj_id(inference_state, obj_id)
        if obj_rank is None:
            # Object was already removed (e.g., by hotstart heuristics during
            # propagation). Log a warning and skip SAM2 state and metadata
            # removal, but still record action history and clean up cached outputs.
            logger.warning(
                f"Object {obj_id} not found in any GPU (already removed). "
                f"Skipping SAM2 state and metadata removal."
            )
        else:
            tracker_states_local = inference_state["sam2_inference_states"]
            if self.rank == obj_rank:
                self._tracker_remove_objects(tracker_states_local, [obj_id])

            # update metadata (kept identical on every rank)
            tracker_metadata = inference_state["tracker_metadata"]
            _obj_ids = tracker_metadata["obj_ids_per_gpu"][obj_rank]
            # boolean-mask indexing drops every occurrence of obj_id
            tracker_metadata["obj_ids_per_gpu"][obj_rank] = _obj_ids[_obj_ids != obj_id]
            tracker_metadata["num_obj_per_gpu"][obj_rank] = len(
                tracker_metadata["obj_ids_per_gpu"][obj_rank]
            )
            tracker_metadata["obj_ids_all_gpu"] = np.concatenate(
                tracker_metadata["obj_ids_per_gpu"]
            )
            tracker_metadata["obj_id_to_score"].pop(obj_id, None)
            # tracker_metadata["max_obj_id"] # we do not reuse the object id, so we do not update it here

        if is_user_action:
            self.add_action_history(
                inference_state, action_type="remove", obj_ids=[obj_id]
            )

        # Clean up cached frame outputs to remove references to the deleted object
        if "cached_frame_outputs" in inference_state:
            for _frame_idx in inference_state["cached_frame_outputs"]:
                frame_cache = inference_state["cached_frame_outputs"][_frame_idx]
                if obj_id in frame_cache:
                    del frame_cache[obj_id]

        out = None
        if frame_idx is not None and self.rank == 0:
            frame_idx, out = self.fetch_and_process_single_frame_results(
                inference_state, frame_idx
            )
        return frame_idx, out

    def _get_gpu_id_by_obj_id(self, inference_state, obj_id):
        """
        Locate GPU ID for a given object.
        """
        obj_ids_per_gpu = inference_state["tracker_metadata"]["obj_ids_per_gpu"]
        for rank, obj_ids in enumerate(obj_ids_per_gpu):
            if obj_id in obj_ids:
                return rank
        return None  # object not found in any GPU

    def _get_sam2_inference_states_by_obj_ids(self, inference_state, obj_ids):
        """
        Get the SAM2 inference states that contain the given object ids.
        This is used to run partial SAM2 propagation on a single object/bucket.
        Possibly multiple or zero states can be returned.
        """
        states = [
            state
            for state in inference_state["sam2_inference_states"]
            if set(obj_ids) & set(state["obj_ids"])
        ]
        return states

    def _prepare_backbone_feats(self, inference_state, frame_idx, reverse):
        input_batch = inference_state["input_batch"]
        feature_cache = inference_state["feature_cache"]
        num_frames = inference_state["num_frames"]
        geometric_prompt = (
            inference_state["constants"]["empty_geometric_prompt"]
            if inference_state["per_frame_geometric_prompt"][frame_idx] is None
            else inference_state["per_frame_geometric_prompt"][frame_idx]
        )
        _ = self.run_backbone_and_detection(
            frame_idx=frame_idx,
            num_frames=num_frames,
            reverse=reverse,
            input_batch=input_batch,
            geometric_prompt=geometric_prompt,
            feature_cache=feature_cache,
        )

    @torch.inference_mode()
    def add_prompt(
        self,
        inference_state,
        frame_idx,
        text_str=None,
        clear_old_points=True,
        points=None,
        point_labels=None,
        boxes_xywh=None,
        box_labels=None,
        clear_old_boxes=True,
        output_prob_thresh=0.5,
        obj_id=None,
        rel_coordinates=True,
        preencoded_text_outputs=None,
    ):
        """Dispatch a prompt to SAM2 point handling or to the parent SAM3 path.

        Without ``points`` the call is forwarded to the parent ``add_prompt`` with
        batched grounding switched off for the duration of the call. With
        ``points`` the prompt is an instance prompt handled by
        ``add_sam2_new_points``; in that case ``text_str`` and ``boxes_xywh`` must
        be None and ``obj_id`` must be given (checked with assertions).

        If ``preencoded_text_outputs`` is given together with ``points``, its tensor
        values are moved to ``self.device`` and the mapping is stored as the text
        entry of the feature cache before the point prompt is processed.

        Returns:
            Whatever the selected handler returns.
        """
        if points is None:
            # SAM3 prompts — disable batched grounding for single-frame add_prompt
            batched_grounding_before = self.use_batched_grounding
            self.use_batched_grounding = False
            try:
                return super().add_prompt(
                    inference_state,
                    frame_idx,
                    text_str=text_str,
                    clear_old_points=clear_old_points,
                    points=points,
                    point_labels=point_labels,
                    boxes_xywh=boxes_xywh,
                    box_labels=box_labels,
                    clear_old_boxes=clear_old_boxes,
                    output_prob_thresh=output_prob_thresh,
                    preencoded_text_outputs=preencoded_text_outputs,
                )
            finally:
                # always restore the flag, even if the parent call raises
                self.use_batched_grounding = batched_grounding_before

        if preencoded_text_outputs is not None:
            text_batch_key = tuple(inference_state["input_batch"].find_text_batch)
            text_outputs = {}
            for name, value in preencoded_text_outputs.items():
                if torch.is_tensor(value):
                    value = value.to(device=self.device, non_blocking=True)
                text_outputs[name] = value
            # replaces any previous text entry of the feature cache
            inference_state["feature_cache"]["text"] = {text_batch_key: text_outputs}

        # SAM2 instance prompts
        assert text_str is None and boxes_xywh is None, (
            "When points are provided, text_str and boxes_xywh must be None."
        )
        assert obj_id is not None, (
            "When points are provided, obj_id must be provided."
        )
        return self.add_sam2_new_points(
            inference_state,
            frame_idx,
            obj_id=obj_id,
            points=points,
            labels=point_labels,
            clear_old_points=clear_old_points,
            rel_coordinates=rel_coordinates,
            use_prev_mem_frame=self.use_prev_mem_frame,
        )

    @torch.inference_mode()
    def add_sam2_new_points(
        self,
        inference_state,
        frame_idx,
        obj_id,
        points,
        labels,
        clear_old_points,
        rel_coordinates=True,
        use_prev_mem_frame=False,
    ):
        """Add a new point prompt to SAM2. Suppporting instance refinement to existing
        objects by passing existing obj_id or adding a new object by passing a new obj_id.
        use_prev_mem_frame=False to disable cross attention to previous memory frames.
        Every GPU returns the same results, and results should contain all masks including
        these masks not refined or not added by the current user points.
        """
        assert obj_id is not None, "obj_id must be provided to add new points"
        tracker_metadata = inference_state["tracker_metadata"]
        if tracker_metadata == {}:
            # initialize masklet metadata if it's uninitialized (empty dict)
            tracker_metadata.update(self._initialize_metadata())

        obj_rank = self._get_gpu_id_by_obj_id(inference_state, obj_id)

        # prepare feature
        self._prepare_backbone_feats(inference_state, frame_idx, reverse=False)

        object_has_been_refined = self._has_object_been_refined(inference_state, obj_id)
        if (
            obj_rank is not None
            and self.use_stateless_refinement
            and not object_has_been_refined
        ):
            # The first time we start refinement on the object, we remove it.
            logger.info(
                f"[rank={self.rank}] Removing object {obj_id} before refinement."
            )
            self.remove_object(inference_state, obj_id, is_user_action=False)
            obj_rank = None
        elif obj_rank is not None and not object_has_been_refined:
            # Extract the object into its own singleton inference state if it belongs to a batch
            if self.rank == obj_rank and not self.tracker.per_obj_inference:
                tracker_states = self._get_sam2_inference_states_by_obj_ids(
                    inference_state, [obj_id]
                )
                assert len(tracker_states) == 1
                # Check if this is a batched state (contains multiple objects)
                sam2_state = tracker_states[0]
                if len(sam2_state["obj_ids"]) > 1:
                    logger.info(
                        f"[rank={self.rank}] Extracting object {obj_id} into singleton inference state."
                    )
                    self._extract_object_to_singleton_state(
                        inference_state, obj_id, obj_rank
                    )

        if obj_rank is None:
            # new object, we assign it a GPU and create a new inference state if limit allows
            num_prev_obj = np.sum(tracker_metadata["num_obj_per_gpu"])
            if num_prev_obj >= self.max_num_objects:
                logger.warning(
                    f"add_sam2_new_points: cannot add a new object as we are already tracking {num_prev_obj=} "
                    f"masklets (under {self.max_num_objects=})"
                )
                return frame_idx, None

            new_det_gpu_ids = self._assign_new_det_to_gpus(
                new_det_num=1,
                prev_workload_per_gpu=tracker_metadata["num_obj_per_gpu"],
            )
            obj_rank = new_det_gpu_ids[0]

            # get sam2 inference state for the new object
            if self.rank == obj_rank:
                if self.tracker.per_obj_inference:
                    sam2_state = inference_state["sam2_inference_states"][0]
                else:
                    # for batched inference, we create a new inference state
                    sam2_state = self._init_new_sam2_state(inference_state)
                    inference_state["sam2_inference_states"].append(sam2_state)

            # update metadata
            tracker_metadata["obj_ids_per_gpu"][obj_rank] = np.concatenate(
                [
                    tracker_metadata["obj_ids_per_gpu"][obj_rank],
                    np.array([obj_id], dtype=np.int64),
                ]
            )
            tracker_metadata["num_obj_per_gpu"][obj_rank] = len(
                tracker_metadata["obj_ids_per_gpu"][obj_rank]
            )
            tracker_metadata["obj_ids_all_gpu"] = np.concatenate(
                tracker_metadata["obj_ids_per_gpu"]
            )
            tracker_metadata["max_obj_id"] = max(tracker_metadata["max_obj_id"], obj_id)

            logger.info(
                f"[rank={self.rank}] Adding new object with id {obj_id} at frame {frame_idx}."
            )
            self.add_action_history(
                inference_state, "add", frame_idx=frame_idx, obj_ids=[obj_id]
            )
        else:
            # existing object, for refinement
            if self.rank == obj_rank:
                tracker_states = self._get_sam2_inference_states_by_obj_ids(
                    inference_state, [obj_id]
                )
                assert len(tracker_states) == 1, (
                    f"[rank={self.rank}] Multiple SAM2 inference states found for the same object id."
                )
                sam2_state = tracker_states[0]

            # log
            logger.info(
                f"[rank={self.rank}] Refining existing object with id {obj_id} at frame {frame_idx}."
            )
            self.add_action_history(
                inference_state, "refine", frame_idx=frame_idx, obj_ids=[obj_id]
            )

        # assign higher score to added/refined object
        tracker_metadata["obj_id_to_score"][obj_id] = 1.0
        tracker_metadata["obj_id_to_sam2_score_frame_wise"][frame_idx][obj_id] = (
            torch.tensor(1.0, dtype=torch.float32, device=self.device)
        )

        if self.rank == 0:
            rank0_metadata = tracker_metadata.get("rank0_metadata", {})

            if "removed_obj_ids" in rank0_metadata:
                rank0_metadata["removed_obj_ids"].discard(obj_id)

            if "suppressed_obj_ids" in rank0_metadata:
                for frame_id in rank0_metadata["suppressed_obj_ids"]:
                    rank0_metadata["suppressed_obj_ids"][frame_id].discard(obj_id)

            if "masklet_confirmation" in rank0_metadata:
                obj_ids_all_gpu = tracker_metadata["obj_ids_all_gpu"]
                obj_indices = np.where(obj_ids_all_gpu == obj_id)[0]
                if len(obj_indices) > 0:
                    obj_idx = obj_indices[0]
                    if obj_idx < len(rank0_metadata["masklet_confirmation"]["status"]):
                        rank0_metadata["masklet_confirmation"]["status"][obj_idx] = 1
                        rank0_metadata["masklet_confirmation"]["consecutive_det_num"][
                            obj_idx
                        ] = self.masklet_confirmation_consecutive_det_thresh

        if self.rank == obj_rank:
            should_fallback_to_original_mask = (
                len(points) == 0 and inference_state["is_image_only"]
            )
            if should_fallback_to_original_mask:
                mask_input = self._get_mask_input(sam2_state, frame_idx, obj_id)
                if mask_input is None or 0 in mask_input.shape:
                    logger.warning(
                        f"Cannot retrieve original mask input for obj_id {obj_id} at frame {frame_idx} to fallback."
                    )
                    should_fallback_to_original_mask = False
            if should_fallback_to_original_mask:
                # When user cancels all points on an image, we recover the original mask
                # by re-feeding the detector mask to SAM2.
                mask_input = self._get_mask_input(sam2_state, frame_idx, obj_id)
                # clear out states related to this object to have a fresh start
                self.tracker.clear_all_points_in_frame(
                    sam2_state, frame_idx, obj_id, need_output=False
                )
                frame_idx, obj_ids, low_res_masks, video_res_masks = (
                    self.tracker.add_new_mask(
                        sam2_state,
                        frame_idx,
                        obj_id,
                        mask_input,
                    )
                )
            else:
                frame_idx, obj_ids, low_res_masks, video_res_masks = (
                    self.tracker.add_new_points(
                        inference_state=sam2_state,
                        frame_idx=frame_idx,
                        obj_id=obj_id,
                        points=points,
                        labels=labels,
                        clear_old_points=clear_old_points,
                        rel_coordinates=rel_coordinates,
                        use_prev_mem_frame=use_prev_mem_frame,
                    )
                )

            if video_res_masks is not None and len(video_res_masks) > 0:
                video_res_masks = fill_holes_in_mask_scores(
                    video_res_masks,  # shape (N, 1, H_video, W_video)
                    fill_hole_area=self.fill_hole_area,
                    sprinkle_removal_area=self.sprinkle_removal_area,
                    fill_holes=True,
                    remove_sprinkles=True,
                )

            # TODO: will this cause issue when user switching to refine another object?
            # Since the mem encoder has already run for the current input points?
            # FIX: Synchronize consolidated_frame_inds with actual point/mask
            # inputs before propagate_in_video_preflight. Two issues can cause
            # the `all_consolidated_frame_inds == input_frames_inds` assertion
            # to fail:
            #  1) VG detector conditioning frames in mask_inputs_per_obj without
            #     corresponding point inputs (stale VG entries).
            #  2) Previously consolidated point-input frames (from earlier
            #     add_points) whose consolidated_frame_inds entries were lost
            #     during subsequent propagation.
            # We fix both by: (a) clearing mask-only inputs, (b) rebuilding
            # consolidated_frame_inds from the remaining inputs, excluding
            # temp output frames (which preflight will add itself).

            # (a) Clear detector-only mask inputs
            for _obj_idx in list(sam2_state["mask_inputs_per_obj"].keys()):
                _point_frames = set(
                    sam2_state["point_inputs_per_obj"].get(_obj_idx, {}).keys()
                )
                _mask_only_frames = [
                    f
                    for f in list(sam2_state["mask_inputs_per_obj"][_obj_idx].keys())
                    if f not in _point_frames
                ]
                for f in _mask_only_frames:
                    sam2_state["mask_inputs_per_obj"][_obj_idx].pop(f, None)

            # (b) Rebuild consolidated_frame_inds from remaining inputs
            _input_frames = set()
            for _oi in sam2_state["point_inputs_per_obj"]:
                _input_frames.update(sam2_state["point_inputs_per_obj"][_oi].keys())
            for _oi in sam2_state["mask_inputs_per_obj"]:
                _input_frames.update(sam2_state["mask_inputs_per_obj"][_oi].keys())
            # Exclude temp output frames — preflight will consolidate those
            _temp_frames = set()
            for _obj_temp in sam2_state["temp_output_dict_per_obj"].values():
                _temp_frames.update(_obj_temp["cond_frame_outputs"].keys())
                _temp_frames.update(_obj_temp["non_cond_frame_outputs"].keys())
            _prev_frames = _input_frames - _temp_frames
            _cond = set()
            _non_cond = set()
            for f in _prev_frames:
                if f in sam2_state["output_dict"].get("cond_frame_outputs", {}):
                    _cond.add(f)
                else:
                    _non_cond.add(f)
            sam2_state["consolidated_frame_inds"] = {
                "cond_frame_outputs": _cond,
                "non_cond_frame_outputs": _non_cond,
            }
            self.tracker.propagate_in_video_preflight(sam2_state, run_mem_encoder=True)
            if not inference_state["is_image_only"]:
                # Clear detector conditioning frames when user clicks are received to allow
                # model updating masks on these frames. It is a noop if user is refining on the
                # detector conditioning frames or adding new objects.
                self.clear_detector_added_cond_frame_in_sam2(
                    sam2_state, obj_id, frame_idx
                )

        # fetch results from states and gather across GPUs
        # Use optimized caching approach to avoid reprocessing unmodified objects
        if self.rank == obj_rank and len(obj_ids) > 0:
            new_mask_data = (video_res_masks[obj_ids.index(obj_id)] > 0.0).to(
                torch.bool
            )
        else:
            new_mask_data = None

        # Broadcast the new mask data across all ranks for consistency
        if self.world_size > 1:
            data_list = [new_mask_data]
            self.broadcast_python_obj_cpu(data_list, src=obj_rank)
            new_mask_data = data_list[0]

        if self.rank == 0:
            obj_id_to_mask = self._build_sam2_output(
                inference_state,
                frame_idx,
                {obj_id: new_mask_data} if new_mask_data is not None else None,
            )
            # post processing - remove suppressed obj_ids
            obj_id_to_score = tracker_metadata["obj_id_to_score"]
            suppressed_obj_ids = tracker_metadata["rank0_metadata"][
                "suppressed_obj_ids"
            ][frame_idx]
            obj_id_to_sam2_score = tracker_metadata["obj_id_to_sam2_score_frame_wise"][
                frame_idx
            ]

            out = {
                "obj_id_to_mask": obj_id_to_mask,
                "obj_id_to_score": obj_id_to_score,
                "obj_id_to_sam2_score": obj_id_to_sam2_score,
            }
            self._cache_frame_outputs(
                inference_state,
                frame_idx,
                obj_id_to_mask,
                suppressed_obj_ids=suppressed_obj_ids,
            )
            return frame_idx, self._postprocess_output(
                inference_state, out, suppressed_obj_ids=suppressed_obj_ids
            )
        else:
            return frame_idx, None  # no output on other GPUs

    def _get_mask_input(self, inference_state, frame_idx, obj_id):
        """Get the mask input for a specific object on a specific frame."""
        obj_idx = self.tracker._obj_id_to_idx(inference_state, obj_id)
        mask_inputs_per_frame = inference_state["mask_inputs_per_obj"][obj_idx]
        if frame_idx not in mask_inputs_per_frame:
            logger.info(
                f"frame {frame_idx} not in mask_inputs_per_frame for obj_id {obj_id}"
            )
            return None

        mask_inputs_orig = mask_inputs_per_frame[frame_idx].squeeze(0, 1)  # (H, W)
        return mask_inputs_orig

    def _gather_obj_id_to_mask_across_gpus(self, inference_state, obj_id_to_mask_local):
        """Gather obj_id_to_mask from all GPUs. Optionally resize the masks to the video resolution."""
        tracker_metadata = inference_state["tracker_metadata"]

        # concatenate the output masklets from all local inference states
        H_mask = W_mask = self.tracker.low_res_mask_size
        obj_ids_local = tracker_metadata["obj_ids_per_gpu"][self.rank]
        low_res_masks_local = []
        for obj_id in obj_ids_local:
            if obj_id in obj_id_to_mask_local:
                low_res_masks_local.append(obj_id_to_mask_local[obj_id])
            else:
                low_res_masks_local.append(
                    torch.full((H_mask, W_mask), -1024.0, device=self.device)
                )
        if len(low_res_masks_local) > 0:
            low_res_masks_local = torch.stack(low_res_masks_local, dim=0)  # (N, H, W)
            assert low_res_masks_local.shape[1:] == (H_mask, W_mask)
        else:
            low_res_masks_local = torch.zeros(0, H_mask, W_mask, device=self.device)

        # all-gather `low_res_masks_local` into `low_res_masks_global`
        # - low_res_masks_global: Tensor -- (num_global_obj, H_mask, W_mask)
        if self.world_size > 1:
            low_res_masks_local = low_res_masks_local.float().contiguous()
            low_res_masks_peers = [
                low_res_masks_local.new_empty(num_obj, H_mask, W_mask)
                for num_obj in tracker_metadata["num_obj_per_gpu"]
            ]
            dist.all_gather(low_res_masks_peers, low_res_masks_local)
            low_res_masks_global = torch.cat(low_res_masks_peers, dim=0)
        else:
            low_res_masks_global = low_res_masks_local
        return low_res_masks_global

    def _convert_low_res_mask_to_video_res(self, low_res_mask, inference_state):
        """
        Convert a low-res mask to video resolution, matching the format expected by _build_sam2_output.

        Args:
            low_res_mask: Tensor of shape (H_low_res, W_low_res)
            inference_state: Contains video dimensions

        Returns:
            video_res_mask: Tensor of shape (1, H_video, W_video) bool
        """
        if low_res_mask is None:
            return None

        # Convert to 3D for interpolation: (H_low_res, W_low_res) -> (1, H_low_res, W_low_res)
        low_res_mask_3d = low_res_mask.unsqueeze(0).unsqueeze(0)

        # Get video dimensions
        H_video = inference_state["orig_height"]
        W_video = inference_state["orig_width"]

        video_res_mask = F.interpolate(
            low_res_mask_3d.float(),
            size=(H_video, W_video),
            mode="bilinear",
            align_corners=False,
        )  # (1, H_video, W_video)

        # Convert to boolean - already in the right shape!
        return (video_res_mask.squeeze(0) > 0.0).to(torch.bool)

    def clear_detector_added_cond_frame_in_sam2(
        self, sam2_state, obj_id, refined_frame_idx
    ):
        """Clear mask-only conditioning frames close to a refined frame.

        A frame qualifies when the object has a mask input but no point input
        on it and it lies within
        ``self.refinement_detector_cond_frame_removal_window`` frames of
        ``refined_frame_idx``. Each qualifying frame is cleared for every
        object in ``sam2_state`` so the model can update masks there.

        Args:
            sam2_state: Tracker state holding the object.
            obj_id: Id of the object being refined.
            refined_frame_idx: Frame on which the refinement happened.
        """
        target_idx = self.tracker._obj_id_to_idx(sam2_state, obj_id)
        max_distance = self.refinement_detector_cond_frame_removal_window

        # frames with a mask input but no point input, within the window
        mask_only_cond_frame_indices = [
            candidate
            for candidate in sam2_state["mask_inputs_per_obj"][target_idx]
            if candidate not in sam2_state["point_inputs_per_obj"][target_idx]
            and abs(candidate - refined_frame_idx) <= max_distance
        ]
        if not mask_only_cond_frame_indices:
            return

        for stale_frame in mask_only_cond_frame_indices:
            # every object of the state is cleared on this frame, since the
            # objects of one state are bucket batched together
            for batched_obj_id in sam2_state["obj_id_to_idx"].keys():
                self.tracker.clear_all_points_in_frame(
                    sam2_state, stale_frame, batched_obj_id, need_output=False
                )
        logger.info(
            f"Cleared detector mask only conditioning frames ({mask_only_cond_frame_indices}) in SAM2."
        )
        return

    def _extract_object_to_singleton_state(self, inference_state, obj_id, obj_rank):
        """
        Extract an object from a batched inference state into its own singleton state.

        Does nothing on ranks other than ``obj_rank``, and nothing when the state
        holding the object contains at most one object. Otherwise the object's
        inputs and outputs are copied out of the source state, the object is
        removed from it with ``self.tracker.remove_object``, and a fresh state
        built with ``self.tracker.init_state`` is appended to
        ``inference_state["sam2_inference_states"]``.

        Args:
            inference_state: Top-level state; read for "sam2_inference_states",
                "feature_cache", "orig_height", "orig_width" and "num_frames".
            obj_id: Id of the object to extract.
            obj_rank: Rank expected to hold the object; all other ranks return
                immediately.

        Returns:
            None. ``inference_state["sam2_inference_states"]`` is modified in place.

        Raises:
            AssertionError: If no local state lists ``obj_id`` in its "obj_ids".
        """
        if self.rank != obj_rank:
            return

        tracker_states_local = inference_state["sam2_inference_states"]

        # Find the inference state containing this object
        source_state = None
        source_state_idx = None
        for idx, state in enumerate(tracker_states_local):
            if obj_id in state["obj_ids"]:
                source_state = state
                source_state_idx = idx
                break

        assert source_state is not None

        if len(source_state["obj_ids"]) <= 1:
            # Already in a singleton state (the "not found" case is excluded by
            # the assert above), so there is nothing to extract
            return

        # Step 1: Extract all the object's state data before removing it
        obj_idx_in_source = source_state["obj_id_to_idx"][obj_id]
        multiplex_state = source_state.get("multiplex_state")

        # Extract consolidated outputs (obj_ptr, maskmem_features, etc.) BEFORE
        # remove_object modifies the source tensors.
        singleton_consolidated_outputs = {
            "cond_frame_outputs": {},
            "non_cond_frame_outputs": {},
        }
        if "output_dict" in source_state:
            for storage_key in ["cond_frame_outputs", "non_cond_frame_outputs"]:
                source_outputs = source_state["output_dict"].get(storage_key, {})
                for f_idx, source_frame_out in source_outputs.items():
                    # Skip frames whose batched pred_masks have no row for this
                    # object's index
                    if source_frame_out["pred_masks"].shape[0] < obj_idx_in_source + 1:
                        continue
                    # Slice out this object's row (kept as a batch of size 1);
                    # image features / pos enc are passed through without slicing
                    singleton_frame_out = {
                        "pred_masks": source_frame_out["pred_masks"][
                            obj_idx_in_source : obj_idx_in_source + 1
                        ].clone(),
                        "object_score_logits": source_frame_out["object_score_logits"][
                            obj_idx_in_source : obj_idx_in_source + 1
                        ].clone(),
                        "image_features": source_frame_out.get("image_features"),
                        "image_pos_enc": source_frame_out.get("image_pos_enc"),
                        "local_obj_id_to_idx": {obj_id: 0},
                    }
                    # Extract maskmem_features (demux from multiplex space)
                    maskmem_features = source_frame_out.get("maskmem_features")
                    if maskmem_features is not None and multiplex_state is not None:
                        try:
                            demuxed = multiplex_state.demux(maskmem_features)
                            maskmem_features = demuxed[
                                obj_idx_in_source : obj_idx_in_source + 1
                            ].clone()
                        except (AssertionError, IndexError):
                            # Best effort: a failed demux/slice stores None
                            # instead of aborting the extraction
                            maskmem_features = None
                    elif maskmem_features is not None:
                        maskmem_features = maskmem_features[
                            obj_idx_in_source : obj_idx_in_source + 1
                        ].clone()
                    singleton_frame_out["maskmem_features"] = maskmem_features
                    # Extract maskmem_pos_enc (demux level by level)
                    maskmem_pos_enc = source_frame_out.get("maskmem_pos_enc")
                    if maskmem_pos_enc is not None:
                        remapped = []
                        for level_enc in maskmem_pos_enc:
                            if level_enc is None:
                                remapped.append(None)
                                continue
                            if multiplex_state is not None:
                                try:
                                    demuxed = multiplex_state.demux(level_enc)
                                    remapped.append(
                                        demuxed[
                                            obj_idx_in_source : obj_idx_in_source + 1
                                        ].clone()
                                    )
                                except (AssertionError, IndexError):
                                    # Same best-effort handling as for
                                    # maskmem_features: keep the level as None
                                    remapped.append(None)
                            else:
                                remapped.append(
                                    level_enc[
                                        obj_idx_in_source : obj_idx_in_source + 1
                                    ].clone()
                                )
                        maskmem_pos_enc = remapped
                    singleton_frame_out["maskmem_pos_enc"] = maskmem_pos_enc
                    # Extract obj_ptr (demux from multiplex space)
                    # NOTE(review): unlike the maskmem tensors above, a demux
                    # failure here is not caught and would propagate
                    if (
                        "obj_ptr" in source_frame_out
                        and self.tracker.use_obj_ptrs_in_encoder
                    ):
                        source_obj_ptr = source_frame_out["obj_ptr"]
                        if multiplex_state is not None:
                            obj_ptr_data = multiplex_state.demux(source_obj_ptr)
                            singleton_frame_out["obj_ptr"] = obj_ptr_data[
                                obj_idx_in_source : obj_idx_in_source + 1
                            ].clone()
                        else:
                            singleton_frame_out["obj_ptr"] = source_obj_ptr[
                                obj_idx_in_source : obj_idx_in_source + 1
                            ].clone()
                    # Extract conditioning_objects
                    # (re-expressed with the object's new index 0 in the singleton state)
                    if "conditioning_objects" in source_frame_out:
                        if (
                            obj_idx_in_source
                            in source_frame_out["conditioning_objects"]
                        ):
                            singleton_frame_out["conditioning_objects"] = {0}
                        else:
                            singleton_frame_out["conditioning_objects"] = set()
                    singleton_consolidated_outputs[storage_key][f_idx] = (
                        singleton_frame_out
                    )

        # Extract point and mask inputs for this object
        # (shallow copies of the per-frame dicts; the values are shared with the source)
        extracted_point_inputs = {}
        extracted_mask_inputs = {}

        if (
            "point_inputs_per_obj" in source_state
            and obj_idx_in_source in source_state["point_inputs_per_obj"]
        ):
            extracted_point_inputs = source_state["point_inputs_per_obj"][
                obj_idx_in_source
            ].copy()

        if (
            "mask_inputs_per_obj" in source_state
            and obj_idx_in_source in source_state["mask_inputs_per_obj"]
        ):
            extracted_mask_inputs = source_state["mask_inputs_per_obj"][
                obj_idx_in_source
            ].copy()

        # Extract per-object outputs - these are already properly sliced for the object
        extracted_obj_cond_outputs = {}
        extracted_obj_non_cond_outputs = {}
        extracted_temp_cond_outputs = {}
        extracted_temp_non_cond_outputs = {}

        if (
            "output_dict_per_obj" in source_state
            and obj_idx_in_source in source_state["output_dict_per_obj"]
        ):
            obj_output_dict = source_state["output_dict_per_obj"][obj_idx_in_source]
            extracted_obj_cond_outputs = obj_output_dict.get(
                "cond_frame_outputs", {}
            ).copy()
            cond_input_keys = (
                extracted_point_inputs.keys() | extracted_mask_inputs.keys()
            )
            # we may have obj cond outputs for other objects in a batch, so limit to cond inputs for only this object
            extracted_obj_cond_outputs = {
                k: v
                for k, v in extracted_obj_cond_outputs.items()
                if k in cond_input_keys
            }

            extracted_obj_non_cond_outputs = obj_output_dict.get(
                "non_cond_frame_outputs", {}
            ).copy()

        if (
            "temp_output_dict_per_obj" in source_state
            and obj_idx_in_source in source_state["temp_output_dict_per_obj"]
        ):
            temp_obj_output_dict = source_state["temp_output_dict_per_obj"][
                obj_idx_in_source
            ]
            extracted_temp_cond_outputs = temp_obj_output_dict.get(
                "cond_frame_outputs", {}
            ).copy()
            extracted_temp_non_cond_outputs = temp_obj_output_dict.get(
                "non_cond_frame_outputs", {}
            ).copy()

        # Step 2: Remove the object from the source state
        # (only the first returned value, the remaining object ids, is used below)
        remaining_obj_ids, _ = self.tracker.remove_object(
            source_state, obj_id, strict=False, need_output=False
        )

        # Step 3: Create a new singleton inference state
        new_sam2_state = self.tracker.init_state(
            cached_features=inference_state["feature_cache"],
            video_height=inference_state["orig_height"],
            video_width=inference_state["orig_width"],
            num_frames=inference_state["num_frames"],
        )

        # Step 4: Set up the singleton state structure for the extracted object
        # Map the object to index 0 in the new singleton state
        new_sam2_state["obj_id_to_idx"] = {obj_id: 0}
        new_sam2_state["obj_idx_to_id"] = {0: obj_id}
        new_sam2_state["obj_ids"] = [obj_id]

        # Step 5: Restore all the extracted state
        # Restore point and mask inputs
        new_sam2_state["point_inputs_per_obj"] = {0: extracted_point_inputs}
        new_sam2_state["mask_inputs_per_obj"] = {0: extracted_mask_inputs}

        # Restore per-object output dictionaries (already properly sliced)
        new_sam2_state["output_dict_per_obj"] = {
            0: {
                "cond_frame_outputs": extracted_obj_cond_outputs,
                "non_cond_frame_outputs": extracted_obj_non_cond_outputs,
            }
        }

        # Restore temporary outputs
        new_sam2_state["temp_output_dict_per_obj"] = {
            0: {
                "cond_frame_outputs": extracted_temp_cond_outputs,
                "non_cond_frame_outputs": extracted_temp_non_cond_outputs,
            }
        }

        # Step 6: Rebuild the consolidated output_dict for the singleton state
        # Use the extracted consolidated outputs which include obj_ptr,
        # maskmem_features, maskmem_pos_enc (not just pred_masks/object_score_logits)

        # Create singleton multiplex state and remux extracted tensors
        # (the device falls back to "cuda" when the source state has no "device" entry)
        new_multiplex_state = self.tracker.multiplex_controller.get_state(
            num_valid_entries=1,
            device=source_state.get("device", "cuda"),
            dtype=torch.float32,
            random=False,
            object_ids=[obj_id],
        )
        new_sam2_state["multiplex_state"] = new_multiplex_state

        # NOTE(review): only obj_ptr goes through mux() here; the maskmem tensors
        # are cloned again although they were already cloned during extraction -
        # confirm whether they are meant to be muxed as well
        for storage_key in ["cond_frame_outputs", "non_cond_frame_outputs"]:
            for f_idx, frame_out in singleton_consolidated_outputs[storage_key].items():
                if frame_out.get("maskmem_features") is not None:
                    frame_out["maskmem_features"] = frame_out[
                        "maskmem_features"
                    ].clone()
                if frame_out.get("maskmem_pos_enc") is not None:
                    frame_out["maskmem_pos_enc"] = [
                        level.clone() if level is not None else None
                        for level in frame_out["maskmem_pos_enc"]
                    ]
                if "obj_ptr" in frame_out and self.tracker.use_obj_ptrs_in_encoder:
                    frame_out["obj_ptr"] = new_multiplex_state.mux(frame_out["obj_ptr"])

        new_sam2_state["output_dict"] = singleton_consolidated_outputs

        # Step 7: Copy other important state if it exists
        for key in [
            "first_ann_frame_idx",
            "tracking_has_started",
        ]:
            if key in source_state:
                new_sam2_state[key] = source_state[key]

        # Leave consolidated_frame_inds empty so preflight reconstructs from per-obj data
        new_sam2_state["consolidated_frame_inds"] = {
            "cond_frame_outputs": set(),
            "non_cond_frame_outputs": set(),
        }

        # Step 8: Add the new singleton state to the list
        tracker_states_local.append(new_sam2_state)

        # Step 9: If the source state is now empty, remove it
        # (source_state_idx is still valid because the new state was appended at
        # the end of the list)
        # NOTE(review): the source held more than one object at this point, so
        # this branch presumably only triggers if remove_object drops more than
        # the requested object - confirm
        if len(remaining_obj_ids) == 0:
            tracker_states_local.pop(source_state_idx)
            logger.info(
                f"Removed empty inference state after extracting object {obj_id}"
            )

        logger.info(f"Object {obj_id} successfully extracted to singleton state")