Spaces:

1ripon1
/

ColabWan

Build error

App Files Files Community

ColabWan / preprocessing /sam3 /model /video_tracking_multiplex_demo.py

1ripon1

Upload folder using huggingface_hub

7344bef verified about 1 month ago

Raw

History Blame Contribute Delete

173 kB

	import logging
	from collections import OrderedDict
	from copy import deepcopy
	from typing import Iterable, Optional

	import numpy as np
	import torch
	from ..model.data_misc import NestedTensor
	from ..model.device_utils import get_accelerator_device
	from ..model.io_utils import load_video_frames
	from ..model.multiplex_utils import MultiplexState
	from ..model.sam3_tracker_utils import fill_holes_in_mask_scores
	from ..model.video_tracking_multiplex import (
	concat_points,
	NO_OBJ_SCORE,
	VideoTrackingDynamicMultiplex,
	)
	from tqdm import tqdm


	class VideoTrackingMultiplexDemo(VideoTrackingDynamicMultiplex):
	"""
	The demo class that extends the `VideoTrackingDynamicMultiplex` to handle user interactions
	and manage inference states, with support for multi-object tracking.

	Interactions are not yet implemented.
	"""

	def __init__(
	self,
	# whether to clear non-conditioning memory of the surrounding frames (which may contain outdated information) after adding correction clicks;
	# note that this would only apply to single-object tracking unless `clear_non_cond_mem_for_multi_obj` is also set to True)
	clear_non_cond_mem_around_input=False,
	# whether to also clear non-conditioning memory of the surrounding frames (only effective when `clear_non_cond_mem_around_input` is True).
	clear_non_cond_mem_for_multi_obj=False,
	# if fill_hole_area > 0, we fill small holes in the final masks up to this area (after resizing them to the original video resolution)
	fill_hole_area=0,
	# if always_start_from_first_ann_frame is True, we always start tracking from the frame where we receive the first annotation (clicks or mask)
	# and ignore the `start_frame_idx` passed to `propagate_in_video`
	always_start_from_first_ann_frame=False,
	# the maximum number of points to be used in the prompt encoder, which reduce the domain gap between training (that only has 8 points)
	# - if it's set to a positive integer, we only take the `max_point_num_in_prompt_enc//2` points and
	# the last `(max_point_num_in_prompt_enc - max_point_num_in_prompt_enc//2)` points in the prompt encoder
	# - if it's set to 0 or negative, this option is turned off and we use all points in the prompt encoder
	max_point_num_in_prompt_enc=16,
	non_overlap_masks_for_output=True,
	**kwargs,
	):
	super().__init__(**kwargs)

	self.clear_non_cond_mem_around_input = clear_non_cond_mem_around_input
	self.clear_non_cond_mem_for_multi_obj = clear_non_cond_mem_for_multi_obj
	self.fill_hole_area = fill_hole_area
	self.always_start_from_first_ann_frame = always_start_from_first_ann_frame
	self.max_point_num_in_prompt_enc = max_point_num_in_prompt_enc
	self.non_overlap_masks_for_output = non_overlap_masks_for_output

	@torch.inference_mode()
	def init_state(
	self,
	video_path,
	offload_video_to_cpu,
	offload_state_to_cpu,
	async_loading_frames=False,
	use_cv2=False,
	):
	"""Initialize a inference state."""
	# Make sure that sigmoid is used on mask logits (should be True for all our recent models).
	# Since we rely on large negative values as scores for missing objects, the raw logits
	# cannot be consumed directly and must be converted into 0~1 range via sigmoid first.
	if not self.apply_sigmoid_to_mask_logits_for_mem_enc:
	raise NotImplementedError(
	"Multi-object tracking requires sigmoid in memory encoder for non-overlapping constraints."
	)

	images, video_height, video_width = load_video_frames(
	video_path=video_path,
	image_size=self.image_size,
	offload_video_to_cpu=offload_video_to_cpu,
	async_loading_frames=async_loading_frames,
	use_cv2=use_cv2,
	)
	inference_state = {}
	inference_state["images"] = images
	inference_state["num_frames"] = len(images)
	# whether to offload the video frames to CPU memory
	# turning on this option saves the GPU memory with only a very small overhead
	inference_state["offload_video_to_cpu"] = offload_video_to_cpu
	# whether to offload the inference state to CPU memory
	# turning on this option saves the GPU memory at the cost of a lower tracking fps
	# (e.g. in a test case of 768x768 model, fps dropped from 27 to 24 when tracking one object
	# and from 24 to 21 when tracking two objects)
	inference_state["offload_state_to_cpu"] = offload_state_to_cpu
	# the original video height and width, used for resizing final output scores
	inference_state["video_height"] = video_height
	inference_state["video_width"] = video_width
	inference_state["device"] = get_accelerator_device()
	if offload_state_to_cpu:
	inference_state["storage_device"] = torch.device("cpu")
	else:
	inference_state["storage_device"] = get_accelerator_device()
	# inputs on each frame
	inference_state["point_inputs_per_obj"] = {}
	inference_state["mask_inputs_per_obj"] = {}
	# visual features on a small number of recently visited frames for quick interactions
	inference_state["cached_features"] = {}
	# values that don't change across frames (so we only need to hold one copy of them)
	inference_state["constants"] = {}
	# mapping between client-side object id and model-side object index
	inference_state["obj_id_to_idx"] = OrderedDict()
	inference_state["obj_idx_to_id"] = OrderedDict()
	inference_state["obj_ids"] = []
	# A storage to hold the model's tracking results and states on each frame
	inference_state["output_dict"] = {
	"cond_frame_outputs": {}, # dict containing {frame_idx: <out>}
	"non_cond_frame_outputs": {}, # dict containing {frame_idx: <out>}
	}
	# The index of the frame that received the first annotation
	inference_state["first_ann_frame_idx"] = None
	# Slice (view) of each object tracking results, sharing the same memory with "output_dict"
	inference_state["output_dict_per_obj"] = {}
	# A temporary storage to hold new outputs when user interact with a frame
	# to add clicks or mask (it's merged into "output_dict" before propagation starts)
	inference_state["temp_output_dict_per_obj"] = {}
	# Frames that already holds consolidated outputs from click or mask inputs
	# (we directly use their consolidated outputs during tracking)
	inference_state["consolidated_frame_inds"] = {
	"cond_frame_outputs": set(), # set containing frame indices
	"non_cond_frame_outputs": set(), # set containing frame indices
	}
	# metadata for each tracking frame (e.g. which direction it's tracked)
	inference_state["tracking_has_started"] = False
	inference_state["frames_already_tracked"] = {}
	inference_state["multiplex_state"] = None
	# Track which frames have been refined by user interaction (per object)
	# This is used to distinguish first refinement (fresh) vs subsequent refinements (incremental)
	inference_state["user_refined_frames_per_obj"] = {}
	# # Warm up the whole model and cache the image feature on frame 0
	# # by making a dummy click on the first frame (and then cleaning it up)
	# self.add_new_points(
	# inference_state=inference_state,
	# frame_idx=0,
	# obj_id=1,
	# points=torch.tensor([[0.5, 0.5]], dtype=torch.float32),
	# labels=torch.tensor([1], dtype=torch.int32),
	# clear_old_points=True,
	# rel_coordinates=True,
	# )
	# self.clear_all_points_in_video(inference_state)
	return inference_state

	def _obj_id_to_idx(self, inference_state, obj_id, error_if_new=False):
	"""Map client-side object id to model-side object index."""
	obj_idx = inference_state["obj_id_to_idx"].get(obj_id, None)
	if obj_idx is not None:
	return obj_idx

	if (
	self.is_dynamic_model or not inference_state["tracking_has_started"]
	) and not error_if_new:
	# get the next object slot
	obj_idx = len(inference_state["obj_id_to_idx"])
	inference_state["obj_id_to_idx"][obj_id] = obj_idx
	inference_state["obj_idx_to_id"][obj_idx] = obj_id
	inference_state["obj_ids"] = list(inference_state["obj_id_to_idx"])
	# set up input and output structures for this object
	inference_state["point_inputs_per_obj"][obj_idx] = {}
	inference_state["mask_inputs_per_obj"][obj_idx] = {}
	inference_state["output_dict_per_obj"][obj_idx] = {
	"cond_frame_outputs": {}, # dict containing {frame_idx: <out>}
	"non_cond_frame_outputs": {}, # dict containing {frame_idx: <out>}
	}
	inference_state["temp_output_dict_per_obj"][obj_idx] = {
	"cond_frame_outputs": {}, # dict containing {frame_idx: <out>}
	"non_cond_frame_outputs": {}, # dict containing {frame_idx: <out>}
	}
	return obj_idx
	else:
	raise RuntimeError(
	f"Cannot add new object id {obj_id}. "
	f"All existing object ids: {inference_state['obj_ids']}."
	)

	def _obj_idx_to_id(self, inference_state, obj_idx):
	"""Map model-side object index to client-side object id."""
	return inference_state["obj_idx_to_id"][obj_idx]

	def _get_obj_num(self, inference_state):
	"""Get the total number of unique object ids received so far in this session."""
	# return len(inference_state["obj_idx_to_id"])
	return inference_state["multiplex_state"].total_valid_entries

	@torch.inference_mode()
	def _extract_object_for_interaction(self, inference_state, obj_id, frame_idx):
	"""
	Extract a single object from multiplex state for singleton interaction.
	Adapted from sam3_multiplex_tracking._extract_object_to_singleton_state()

	Returns:
	singleton_state: New inference state containing only this object
	obj_idx_in_source: Original object index before removal (for merging back)
	"""
	source_state = inference_state
	obj_idx_in_source = source_state["obj_id_to_idx"][obj_id]

	# Step 1: Extract all object data BEFORE removing it
	multiplex_state = source_state.get("multiplex_state")

	# Extract consolidated outputs (slice NOW before remove_object modifies tensors)
	singleton_consolidated_outputs = {
	"cond_frame_outputs": {},
	"non_cond_frame_outputs": {},
	}

	if "output_dict" in source_state:
	for storage_key in ["cond_frame_outputs", "non_cond_frame_outputs"]:
	source_outputs = source_state["output_dict"].get(storage_key, {})

	for f_idx, source_frame_out in source_outputs.items():
	# Check if this frame has valid data for this object
	has_valid_data = (
	source_frame_out["pred_masks"].shape[0] >= obj_idx_in_source + 1
	)

	if has_valid_data:
	# Create singleton frame output by slicing
	singleton_frame_out = {
	"pred_masks": source_frame_out["pred_masks"][
	obj_idx_in_source : obj_idx_in_source + 1
	].clone(),
	"object_score_logits": source_frame_out[
	"object_score_logits"
	][obj_idx_in_source : obj_idx_in_source + 1].clone(),
	# image_features and image_pos_enc remain shared (not in multiplex space)
	"image_features": source_frame_out.get("image_features"),
	"image_pos_enc": source_frame_out.get("image_pos_enc"),
	"local_obj_id_to_idx": {obj_id: 0},
	}

	# Handle maskmem_features by converting from multiplex space to data space
	maskmem_features = source_frame_out.get("maskmem_features")
	if maskmem_features is not None:
	if multiplex_state is not None:
	expected_buckets = multiplex_state.num_buckets
	expected_multiplex = multiplex_state.multiplex_count
	if (
	maskmem_features.dim() >= 2
	and maskmem_features.shape[0] == expected_buckets
	and maskmem_features.shape[1] == expected_multiplex
	):
	try:
	demuxed_features = multiplex_state.demux(
	maskmem_features
	)
	except AssertionError as exc:
	logging.warning(
	"[EXTRACT] demux failed for maskmem_features shape %s: %s",
	tuple(maskmem_features.shape),
	exc,
	)
	demuxed_features = None
	if demuxed_features is not None:
	maskmem_features = demuxed_features[
	obj_idx_in_source : obj_idx_in_source + 1
	].clone()
	else:
	maskmem_features = maskmem_features[
	obj_idx_in_source : obj_idx_in_source + 1
	].clone()
	elif maskmem_features.shape[0] == 0:
	# No entries for this object yet; treat as missing without warning
	maskmem_features = None
	elif maskmem_features.shape[0] >= obj_idx_in_source + 1:
	# Already in data space; slice directly
	maskmem_features = maskmem_features[
	obj_idx_in_source : obj_idx_in_source + 1
	].clone()
	else:
	logging.warning(
	"[EXTRACT] maskmem_features shape %s incompatible with multiplex state; dropping tensor",
	tuple(maskmem_features.shape),
	)
	maskmem_features = None
	else:
	maskmem_features = maskmem_features[
	obj_idx_in_source : obj_idx_in_source + 1
	].clone()
	singleton_frame_out["maskmem_features"] = maskmem_features

	# Handle maskmem_pos_enc similarly, level by level
	maskmem_pos_enc = source_frame_out.get("maskmem_pos_enc")
	if maskmem_pos_enc is not None:
	remapped_pos_enc = []
	for level_enc in maskmem_pos_enc:
	if level_enc is None:
	remapped_pos_enc.append(None)
	continue
	if multiplex_state is not None:
	expected_buckets = multiplex_state.num_buckets
	expected_multiplex = multiplex_state.multiplex_count
	if (
	level_enc.dim() >= 2
	and level_enc.shape[0] == expected_buckets
	and level_enc.shape[1] == expected_multiplex
	):
	try:
	demuxed_level = multiplex_state.demux(
	level_enc
	)
	except AssertionError as exc:
	logging.warning(
	"[EXTRACT] demux failed for maskmem_pos_enc level shape %s: %s",
	tuple(level_enc.shape),
	exc,
	)
	demuxed_level = None
	if demuxed_level is not None:
	remapped_pos_enc.append(
	demuxed_level[
	obj_idx_in_source : obj_idx_in_source
	+ 1
	].clone()
	)
	elif (
	level_enc.shape[0] >= obj_idx_in_source + 1
	):
	remapped_pos_enc.append(
	level_enc[
	obj_idx_in_source : obj_idx_in_source
	+ 1
	].clone()
	)
	else:
	logging.warning(
	"[EXTRACT] maskmem_pos_enc level shape %s incompatible with multiplex state; dropping level",
	tuple(level_enc.shape),
	)
	remapped_pos_enc.append(None)
	elif level_enc.shape[0] >= obj_idx_in_source + 1:
	remapped_pos_enc.append(
	level_enc[
	obj_idx_in_source : obj_idx_in_source
	+ 1
	].clone()
	)
	else:
	logging.warning(
	"[EXTRACT] maskmem_pos_enc level shape %s incompatible with multiplex state; dropping level",
	tuple(level_enc.shape),
	)
	remapped_pos_enc.append(None)
	else:
	remapped_pos_enc.append(
	level_enc[
	obj_idx_in_source : obj_idx_in_source + 1
	].clone()
	)
	maskmem_pos_enc = remapped_pos_enc
	singleton_frame_out["maskmem_pos_enc"] = maskmem_pos_enc

	# Handle obj_ptr (must demux from multiplex space first)
	if (
	"obj_ptr" in source_frame_out
	and self.use_obj_ptrs_in_encoder
	):
	source_obj_ptr = source_frame_out["obj_ptr"]
	if multiplex_state is not None:
	# Demux: multiplex space → data space
	obj_ptr_data_space = multiplex_state.demux(
	source_obj_ptr
	)
	# Slice for this object
	singleton_frame_out["obj_ptr"] = obj_ptr_data_space[
	obj_idx_in_source : obj_idx_in_source + 1
	].clone()
	else:
	singleton_frame_out["obj_ptr"] = source_obj_ptr[
	obj_idx_in_source : obj_idx_in_source + 1
	].clone()

	# Convert conditioning_objects
	if "conditioning_objects" in source_frame_out:
	if (
	obj_idx_in_source
	in source_frame_out["conditioning_objects"]
	):
	singleton_frame_out["conditioning_objects"] = {0}
	else:
	singleton_frame_out["conditioning_objects"] = set()

	singleton_consolidated_outputs[storage_key][f_idx] = (
	singleton_frame_out
	)

	# Extract point and mask inputs
	extracted_point_inputs = {}
	extracted_mask_inputs = {}

	if (
	"point_inputs_per_obj" in source_state
	and obj_idx_in_source in source_state["point_inputs_per_obj"]
	):
	extracted_point_inputs = source_state["point_inputs_per_obj"][
	obj_idx_in_source
	].copy()

	if (
	"mask_inputs_per_obj" in source_state
	and obj_idx_in_source in source_state["mask_inputs_per_obj"]
	):
	extracted_mask_inputs = source_state["mask_inputs_per_obj"][
	obj_idx_in_source
	].copy()

	# Extract per-object outputs
	extracted_obj_cond_outputs = {}
	extracted_obj_non_cond_outputs = {}
	extracted_temp_cond_outputs = {}
	extracted_temp_non_cond_outputs = {}

	if (
	"output_dict_per_obj" in source_state
	and obj_idx_in_source in source_state["output_dict_per_obj"]
	):
	obj_output_dict = source_state["output_dict_per_obj"][obj_idx_in_source]
	extracted_obj_cond_outputs = obj_output_dict.get(
	"cond_frame_outputs", {}
	).copy()
	extracted_obj_non_cond_outputs = obj_output_dict.get(
	"non_cond_frame_outputs", {}
	).copy()

	if (
	"temp_output_dict_per_obj" in source_state
	and obj_idx_in_source in source_state["temp_output_dict_per_obj"]
	):
	temp_obj_output_dict = source_state["temp_output_dict_per_obj"][
	obj_idx_in_source
	]
	extracted_temp_cond_outputs = temp_obj_output_dict.get(
	"cond_frame_outputs", {}
	).copy()
	extracted_temp_non_cond_outputs = temp_obj_output_dict.get(
	"non_cond_frame_outputs", {}
	).copy()

	# Step 2: Remove the object from source state
	remaining_obj_ids, _ = self.remove_object(
	source_state,
	obj_id,
	strict=False,
	need_output=False,
	clear_user_refined_map=False,
	)

	# If multiplex state became empty, reset it so downstream code can reinitialize
	updated_multiplex_state = source_state.get("multiplex_state")
	if updated_multiplex_state is not None:
	if (
	getattr(updated_multiplex_state, "assignments", None) is None
	or updated_multiplex_state.total_valid_entries == 0
	):
	source_state["multiplex_state"] = None

	# Step 3: Create new singleton inference state
	singleton_state = self.init_state(
	cached_features=source_state["cached_features"],
	video_height=source_state["video_height"],
	video_width=source_state["video_width"],
	num_frames=source_state["num_frames"],
	)

	# Step 4: Set up singleton state structure
	singleton_state["obj_id_to_idx"] = {obj_id: 0}
	singleton_state["obj_idx_to_id"] = {0: obj_id}
	singleton_state["obj_ids"] = [obj_id]
	singleton_state["point_inputs_per_obj"] = {0: extracted_point_inputs}
	singleton_state["mask_inputs_per_obj"] = {0: extracted_mask_inputs}
	singleton_state["output_dict_per_obj"] = {
	0: {
	"cond_frame_outputs": extracted_obj_cond_outputs,
	"non_cond_frame_outputs": extracted_obj_non_cond_outputs,
	}
	}
	singleton_state["temp_output_dict_per_obj"] = {
	0: {
	"cond_frame_outputs": extracted_temp_cond_outputs,
	"non_cond_frame_outputs": extracted_temp_non_cond_outputs,
	}
	}
	singleton_state["frames_already_tracked"] = source_state[
	"frames_already_tracked"
	].copy()

	# Step 5: Create new singleton multiplex state (even for 1 object, needed for obj_ptr)
	new_multiplex_state = self.multiplex_controller.get_state(
	num_valid_entries=1,
	device=source_state["device"],
	dtype=torch.float32,
	random=False,
	object_ids=[obj_id],
	)
	singleton_state["multiplex_state"] = new_multiplex_state

	# Step 6: Remux extracted tensors into the singleton multiplex space
	for storage_key in ["cond_frame_outputs", "non_cond_frame_outputs"]:
	for f_idx, frame_out in singleton_consolidated_outputs[storage_key].items():
	# mask memory features
	if frame_out.get("maskmem_features") is not None:
	# Keep mask memory features in data space (num_objects, C, H, W)
	frame_out["maskmem_features"] = frame_out[
	"maskmem_features"
	].clone()

	if frame_out.get("maskmem_pos_enc") is not None:
	remapped_levels = []
	for level_enc in frame_out["maskmem_pos_enc"]:
	if level_enc is None:
	remapped_levels.append(None)
	continue
	remapped_levels.append(level_enc.clone())
	frame_out["maskmem_pos_enc"] = remapped_levels

	# object pointers
	if "obj_ptr" in frame_out and self.use_obj_ptrs_in_encoder:
	# Mux: data space [1, D] → singleton multiplex space [1, 1, D]
	frame_out["obj_ptr"] = new_multiplex_state.mux(frame_out["obj_ptr"])

	singleton_state["output_dict"] = singleton_consolidated_outputs

	return singleton_state, obj_idx_in_source

	@torch.inference_mode()
	def _merge_singleton_interaction_result(
	self,
	inference_state,
	singleton_state,
	obj_id,
	original_obj_idx,
	):
	"""
	Merge singleton interaction result back into multiplex state.

	SIMPLIFIED APPROACH: Add object back at the END (new index), not at original position.
	This avoids complex index shifting and works with multiplex controller's add_objects() API.

	Args:
	inference_state: The main multiplex inference state
	singleton_state: The singleton state with interaction results
	obj_id: The object ID
	original_obj_idx: The original index before extraction (unused - we add at end instead)
	"""
	# Determine new index (add at end)
	new_obj_idx = len(inference_state["obj_ids"])

	# Step 1: Add object mappings at new index
	inference_state["obj_ids"].append(obj_id)
	inference_state["obj_id_to_idx"][obj_id] = new_obj_idx

	# Create entry in output_dict_per_obj and temp_output_dict_per_obj for new index
	# These are DICTIONARIES indexed by obj_idx, not lists!
	inference_state["output_dict_per_obj"][new_obj_idx] = {
	"cond_frame_outputs": {},
	"non_cond_frame_outputs": {},
	}
	inference_state["temp_output_dict_per_obj"][new_obj_idx] = {
	"cond_frame_outputs": {},
	"non_cond_frame_outputs": {},
	}

	inference_state["obj_idx_to_id"][new_obj_idx] = obj_id

	# Step 2: Add object to multiplex state buckets using proper API
	multiplex_state = inference_state.get("multiplex_state")

	assignments = (
	getattr(multiplex_state, "assignments", None)
	if multiplex_state is not None
	else None
	)
	total_valid_entries = (
	getattr(multiplex_state, "total_valid_entries", 0)
	if multiplex_state is not None and assignments is not None
	else 0
	)
	need_state_reinit = (
	multiplex_state is None or assignments is None or total_valid_entries == 0
	)

	if not need_state_reinit and getattr(multiplex_state, "object_ids", None):
	if obj_id in multiplex_state.object_ids:
	old_idx = multiplex_state.object_ids.index(obj_id)
	multiplex_state.remove_objects(object_indices=[old_idx], strict=False)
	assignments = getattr(multiplex_state, "assignments", None)
	total_valid_entries = (
	getattr(multiplex_state, "total_valid_entries", 0)
	if assignments is not None
	else 0
	)
	need_state_reinit = assignments is None or total_valid_entries == 0

	if need_state_reinit:
	inference_state["multiplex_state"] = self.multiplex_controller.get_state(
	num_valid_entries=len(inference_state["obj_ids"]),
	device=inference_state["device"],
	dtype=torch.float32,
	random=False,
	object_ids=list(inference_state["obj_ids"]),
	)
	multiplex_state = inference_state["multiplex_state"]
	else:
	# Allow new buckets since we're adding at a new index (the old bucket slot may have been removed)
	multiplex_state.add_objects(
	object_indices=[new_obj_idx],
	object_ids=[obj_id],
	allow_new_buckets=True, # May need new bucket if old slot was compacted
	)

	# Step 3: Restore point and mask inputs at new index
	singleton_obj_idx = 0 # Object is always at index 0 in singleton state
	if (
	"point_inputs_per_obj" in singleton_state
	and singleton_obj_idx in singleton_state["point_inputs_per_obj"]
	):
	if "point_inputs_per_obj" not in inference_state:
	inference_state["point_inputs_per_obj"] = {}
	inference_state["point_inputs_per_obj"][new_obj_idx] = singleton_state[
	"point_inputs_per_obj"
	][singleton_obj_idx].copy()

	if (
	"mask_inputs_per_obj" in singleton_state
	and singleton_obj_idx in singleton_state["mask_inputs_per_obj"]
	):
	if "mask_inputs_per_obj" not in inference_state:
	inference_state["mask_inputs_per_obj"] = {}
	inference_state["mask_inputs_per_obj"][new_obj_idx] = singleton_state[
	"mask_inputs_per_obj"
	][singleton_obj_idx].copy()

	# Step 4: Restore per-object outputs at new index
	if (
	"output_dict_per_obj" in singleton_state
	and singleton_obj_idx in singleton_state["output_dict_per_obj"]
	):
	if "output_dict_per_obj" not in inference_state:
	inference_state["output_dict_per_obj"] = {}
	inference_state["output_dict_per_obj"][new_obj_idx] = singleton_state[
	"output_dict_per_obj"
	][singleton_obj_idx].copy()

	if (
	"temp_output_dict_per_obj" in singleton_state
	and singleton_obj_idx in singleton_state["temp_output_dict_per_obj"]
	):
	if "temp_output_dict_per_obj" not in inference_state:
	inference_state["temp_output_dict_per_obj"] = {}
	inference_state["temp_output_dict_per_obj"][new_obj_idx] = singleton_state[
	"temp_output_dict_per_obj"
	][singleton_obj_idx].copy()

	# Step 5: Merge consolidated outputs back into multiplex (append at new_obj_idx)
	# Preserve each frame's original storage key from the singleton state so that
	# conditioning frames remain in cond_frame_outputs after the merge.
	if "output_dict" in singleton_state:
	singleton_multiplex_state = singleton_state.get("multiplex_state")
	for singleton_storage_key in [
	"cond_frame_outputs",
	"non_cond_frame_outputs",
	]:
	singleton_outputs = singleton_state["output_dict"].get(
	singleton_storage_key, {}
	)

	# Skip if singleton doesn't have any frames in this storage_key
	if not singleton_outputs:
	continue

	for frame_idx, singleton_frame_out in singleton_outputs.items():
	# Get or create frame output in main state at the EXPECTED storage_key
	if "output_dict" not in inference_state:
	inference_state["output_dict"] = {
	"cond_frame_outputs": {},
	"non_cond_frame_outputs": {},
	}

	if (
	frame_idx
	not in inference_state["output_dict"][singleton_storage_key]
	):
	# Frame doesn't exist - create with singleton results at new_obj_idx
	num_objs = len(inference_state["obj_ids"])

	# Ensure num_objs is at least new_obj_idx + 1
	# (in case obj_ids list is somehow inconsistent)
	if num_objs <= new_obj_idx:
	num_objs = new_obj_idx + 1

	new_maskmem_features = None
	new_maskmem_pos_enc = None
	if (
	singleton_frame_out.get("maskmem_features") is not None
	and multiplex_state is not None
	):
	# Check if singleton features are in multiplexed format and demux if needed
	singleton_features_muxed = singleton_frame_out[
	"maskmem_features"
	]
	if singleton_features_muxed.shape[:2] == (
	singleton_multiplex_state.num_buckets,
	singleton_multiplex_state.multiplex_count,
	):
	# Singleton features are multiplexed, need to demux
	singleton_features_data = (
	singleton_multiplex_state.demux(
	singleton_features_muxed
	)
	)
	else:
	# Singleton features are in data space
	singleton_features_data = singleton_features_muxed

	feature_shape = (num_objs,) + singleton_features_data.shape[
	1:
	]
	maskmem_features_data = torch.zeros(
	feature_shape,
	dtype=singleton_features_data.dtype,
	device=singleton_features_data.device,
	)
	maskmem_features_data[new_obj_idx : new_obj_idx + 1] = (
	singleton_features_data
	)
	# Mux using destination multiplex state
	new_maskmem_features = multiplex_state.mux(
	maskmem_features_data
	)

	if (
	singleton_frame_out.get("maskmem_pos_enc") is not None
	and multiplex_state is not None
	):
	new_maskmem_pos_enc = []
	for level_enc in singleton_frame_out["maskmem_pos_enc"]:
	if level_enc is None:
	new_maskmem_pos_enc.append(None)
	continue
	# Check if singleton pos_enc is in multiplexed format and demux if needed
	if level_enc.shape[:2] == (
	singleton_multiplex_state.num_buckets,
	singleton_multiplex_state.multiplex_count,
	):
	# Singleton pos_enc is multiplexed, need to demux
	level_data = singleton_multiplex_state.demux(
	level_enc
	)
	else:
	# Singleton pos_enc is in data space
	level_data = level_enc

	level_shape = (num_objs,) + level_data.shape[1:]
	level_tensor = torch.zeros(
	level_shape,
	dtype=level_data.dtype,
	device=level_data.device,
	)
	level_tensor[new_obj_idx : new_obj_idx + 1] = level_data
	# Mux using destination multiplex state to store in multiplex format
	new_maskmem_pos_enc.append(
	multiplex_state.mux(level_tensor)
	)

	inference_state["output_dict"][singleton_storage_key][
	frame_idx
	] = {
	"maskmem_features": new_maskmem_features,
	"maskmem_pos_enc": new_maskmem_pos_enc,
	"image_features": singleton_frame_out.get("image_features"),
	"image_pos_enc": singleton_frame_out.get("image_pos_enc"),
	"local_obj_id_to_idx": {obj_id: new_obj_idx},
	"conditioning_objects": (
	set([new_obj_idx])
	if singleton_obj_idx
	in singleton_frame_out.get(
	"conditioning_objects", set()
	)
	else set()
	),
	"pred_masks": torch.zeros(
	(
	num_objs,
	1,
	singleton_frame_out["pred_masks"].shape[2],
	singleton_frame_out["pred_masks"].shape[3],
	),
	dtype=singleton_frame_out["pred_masks"].dtype,
	device=singleton_frame_out["pred_masks"].device,
	),
	"object_score_logits": torch.full(
	(num_objs, 1),
	NO_OBJ_SCORE,
	dtype=singleton_frame_out["object_score_logits"].dtype,
	device=singleton_frame_out[
	"object_score_logits"
	].device,
	),
	}
	# Set singleton results at new_obj_idx
	inference_state["output_dict"][singleton_storage_key][
	frame_idx
	]["pred_masks"][
	new_obj_idx : new_obj_idx + 1
	] = singleton_frame_out["pred_masks"]
	inference_state["output_dict"][singleton_storage_key][
	frame_idx
	]["object_score_logits"][
	new_obj_idx : new_obj_idx + 1
	] = singleton_frame_out["object_score_logits"]

	# Also copy pred_masks_video_res if it exists in singleton output
	if "pred_masks_video_res" in singleton_frame_out:
	inference_state["output_dict"][singleton_storage_key][
	frame_idx
	]["pred_masks_video_res"] = torch.zeros(
	(
	num_objs,
	1,
	singleton_frame_out["pred_masks_video_res"].shape[
	2
	],
	singleton_frame_out["pred_masks_video_res"].shape[
	3
	],
	),
	dtype=singleton_frame_out["pred_masks_video_res"].dtype,
	device=singleton_frame_out[
	"pred_masks_video_res"
	].device,
	)
	inference_state["output_dict"][singleton_storage_key][
	frame_idx
	]["pred_masks_video_res"][
	new_obj_idx : new_obj_idx + 1
	] = singleton_frame_out["pred_masks_video_res"]

	# Handle obj_ptr if present
	if (
	"obj_ptr" in singleton_frame_out
	and self.use_obj_ptrs_in_encoder
	):
	singleton_obj_ptr_data = singleton_multiplex_state.demux(
	singleton_frame_out["obj_ptr"]
	)
	obj_ptr_data = torch.zeros(
	(num_objs, singleton_obj_ptr_data.shape[1]),
	dtype=singleton_obj_ptr_data.dtype,
	device=singleton_obj_ptr_data.device,
	)
	obj_ptr_data[new_obj_idx : new_obj_idx + 1] = (
	singleton_obj_ptr_data
	)
	inference_state["output_dict"][singleton_storage_key][
	frame_idx
	]["obj_ptr"] = multiplex_state.mux(obj_ptr_data)
	else:
	# Frame exists - expand tensors and add singleton results
	main_frame_out = inference_state["output_dict"][
	singleton_storage_key
	][frame_idx]

	num_objs_total = len(inference_state["obj_ids"])

	if (
	singleton_frame_out.get("maskmem_features") is not None
	and multiplex_state is not None
	):
	# Check if singleton features are in multiplexed format and demux if needed
	singleton_features_muxed = singleton_frame_out[
	"maskmem_features"
	]
	if singleton_features_muxed.shape[:2] == (
	singleton_multiplex_state.num_buckets,
	singleton_multiplex_state.multiplex_count,
	):
	# Singleton features are multiplexed, need to demux
	singleton_features_data = (
	singleton_multiplex_state.demux(
	singleton_features_muxed
	)
	)
	else:
	# Singleton features are in data space
	singleton_features_data = singleton_features_muxed

	existing_features_muxed = main_frame_out.get(
	"maskmem_features"
	)
	if existing_features_muxed is not None:
	# Check if features are in multiplex format before demuxing
	if existing_features_muxed.shape[:2] == (
	multiplex_state.num_buckets,
	multiplex_state.multiplex_count,
	):
	# Features are in multiplex format, demux them
	existing_features_data = multiplex_state.demux(
	existing_features_muxed
	)
	else:
	# Features are already in data space, use directly
	existing_features_data = existing_features_muxed
	else:
	existing_features_data = None

	if existing_features_data is None:
	feature_shape = (
	num_objs_total,
	) + singleton_features_data.shape[1:]
	existing_features_data = torch.zeros(
	feature_shape,
	dtype=singleton_features_data.dtype,
	device=singleton_features_data.device,
	)
	elif existing_features_data.shape[0] < num_objs_total:
	pad_size = (
	num_objs_total - existing_features_data.shape[0]
	)
	pad = torch.zeros(
	(pad_size,) + existing_features_data.shape[1:],
	dtype=existing_features_data.dtype,
	device=existing_features_data.device,
	)
	existing_features_data = torch.cat(
	[existing_features_data, pad], dim=0
	)

	existing_features_data[new_obj_idx : new_obj_idx + 1] = (
	singleton_features_data
	)
	main_frame_out["maskmem_features"] = multiplex_state.mux(
	existing_features_data
	)

	if (
	singleton_frame_out.get("maskmem_pos_enc") is not None
	and multiplex_state is not None
	):
	existing_pos_enc_list = (
	main_frame_out.get("maskmem_pos_enc") or []
	)
	new_maskmem_pos_enc = []
	max_levels = max(
	len(singleton_frame_out["maskmem_pos_enc"]),
	len(existing_pos_enc_list),
	)
	for level_idx in range(max_levels):
	singleton_level_muxed = (
	singleton_frame_out["maskmem_pos_enc"][level_idx]
	if level_idx
	< len(singleton_frame_out["maskmem_pos_enc"])
	else None
	)
	existing_level_muxed = (
	existing_pos_enc_list[level_idx]
	if level_idx < len(existing_pos_enc_list)
	else None
	)

	if singleton_level_muxed is None:
	# Keep existing entry (which may also be None)
	new_maskmem_pos_enc.append(existing_level_muxed)
	continue

	# Check if singleton pos_enc is in multiplexed format and demux if needed
	if singleton_level_muxed.shape[:2] == (
	singleton_multiplex_state.num_buckets,
	singleton_multiplex_state.multiplex_count,
	):
	# Singleton pos_enc is multiplexed, need to demux
	singleton_level_data = (
	singleton_multiplex_state.demux(
	singleton_level_muxed
	)
	)
	else:
	# Singleton pos_enc is in data space
	singleton_level_data = singleton_level_muxed

	if existing_level_muxed is not None:
	# Check if pos_enc is in multiplex format before demuxing
	if existing_level_muxed.shape[:2] == (
	multiplex_state.num_buckets,
	multiplex_state.multiplex_count,
	):
	# Positional encoding is in multiplex format, demux it
	existing_level_data = multiplex_state.demux(
	existing_level_muxed
	)
	else:
	# Positional encoding is already in data space, use directly
	existing_level_data = existing_level_muxed
	else:
	existing_level_data = None

	if existing_level_data is None:
	level_shape = (
	num_objs_total,
	) + singleton_level_data.shape[1:]
	existing_level_data = torch.zeros(
	level_shape,
	dtype=singleton_level_data.dtype,
	device=singleton_level_data.device,
	)
	elif existing_level_data.shape[0] < num_objs_total:
	pad_size = (
	num_objs_total - existing_level_data.shape[0]
	)
	pad = torch.zeros(
	(pad_size,) + existing_level_data.shape[1:],
	dtype=existing_level_data.dtype,
	device=existing_level_data.device,
	)
	existing_level_data = torch.cat(
	[existing_level_data, pad], dim=0
	)

	existing_level_data[new_obj_idx : new_obj_idx + 1] = (
	singleton_level_data
	)
	new_maskmem_pos_enc.append(
	multiplex_state.mux(existing_level_data)
	)

	main_frame_out["maskmem_pos_enc"] = new_maskmem_pos_enc

	singleton_pred_masks = singleton_frame_out[
	"pred_masks"
	] # [1, 1, H, W]
	singleton_scores = singleton_frame_out[
	"object_score_logits"
	] # [1, 1]

	# Expand tensors if needed
	num_existing_objs = main_frame_out["pred_masks"].shape[0]
	if new_obj_idx >= num_existing_objs:
	num_objs_needed = new_obj_idx + 1
	pad_size = num_objs_needed - num_existing_objs

	main_frame_out["pred_masks"] = torch.cat(
	[
	main_frame_out["pred_masks"],
	torch.zeros(
	(
	pad_size,
	1,
	singleton_pred_masks.shape[2],
	singleton_pred_masks.shape[3],
	),
	dtype=singleton_pred_masks.dtype,
	device=singleton_pred_masks.device,
	),
	],
	dim=0,
	)

	main_frame_out["object_score_logits"] = torch.cat(
	[
	main_frame_out["object_score_logits"],
	torch.full(
	(pad_size, 1),
	NO_OBJ_SCORE,
	dtype=singleton_scores.dtype,
	device=singleton_scores.device,
	),
	],
	dim=0,
	)

	# Set singleton results at new_obj_idx
	main_frame_out["pred_masks"][new_obj_idx : new_obj_idx + 1] = (
	singleton_pred_masks
	)
	main_frame_out["object_score_logits"][
	new_obj_idx : new_obj_idx + 1
	] = singleton_scores
	# Initialize local_obj_id_to_idx if missing (e.g., frame
	# output was created by VG propagation's track_step which
	# does not populate this field).
	if "local_obj_id_to_idx" not in main_frame_out:
	main_frame_out["local_obj_id_to_idx"] = deepcopy(
	inference_state["obj_id_to_idx"]
	)
	main_frame_out["local_obj_id_to_idx"][obj_id] = new_obj_idx

	# Also expand and copy pred_masks_video_res if it exists in singleton output
	if "pred_masks_video_res" in singleton_frame_out:
	if "pred_masks_video_res" in main_frame_out:
	# Expand existing video_res masks
	if (
	main_frame_out["pred_masks_video_res"].shape[0]
	< new_obj_idx + 1
	):
	pad_size = (
	new_obj_idx
	+ 1
	- main_frame_out["pred_masks_video_res"].shape[
	0
	]
	)
	main_frame_out["pred_masks_video_res"] = torch.cat(
	[
	main_frame_out["pred_masks_video_res"],
	torch.zeros(
	(
	pad_size,
	1,
	singleton_frame_out[
	"pred_masks_video_res"
	].shape[2],
	singleton_frame_out[
	"pred_masks_video_res"
	].shape[3],
	),
	dtype=singleton_frame_out[
	"pred_masks_video_res"
	].dtype,
	device=singleton_frame_out[
	"pred_masks_video_res"
	].device,
	),
	],
	dim=0,
	)
	else:
	# Create new video_res masks tensor
	num_objs = len(inference_state["obj_ids"])
	main_frame_out["pred_masks_video_res"] = torch.zeros(
	(
	num_objs,
	1,
	singleton_frame_out[
	"pred_masks_video_res"
	].shape[2],
	singleton_frame_out[
	"pred_masks_video_res"
	].shape[3],
	),
	dtype=singleton_frame_out[
	"pred_masks_video_res"
	].dtype,
	device=singleton_frame_out[
	"pred_masks_video_res"
	].device,
	)
	# Set singleton video_res mask
	main_frame_out["pred_masks_video_res"][
	new_obj_idx : new_obj_idx + 1
	] = singleton_frame_out["pred_masks_video_res"]

	# Handle obj_ptr
	if (
	"obj_ptr" in singleton_frame_out
	and self.use_obj_ptrs_in_encoder
	):
	singleton_obj_ptr_data = singleton_multiplex_state.demux(
	singleton_frame_out["obj_ptr"]
	) # [1, D]

	if "obj_ptr" in main_frame_out:
	# The existing obj_ptr may have been created with a DIFFERENT number of buckets
	# (before we called multiplex_state.add_objects() which may have created new buckets).
	# We need to infer the OLD bucket count from the tensor shape to demux it correctly.

	old_obj_ptr_muxed = main_frame_out["obj_ptr"]
	# Infer old bucket count: shape is [B_old, M_old, D]
	old_num_buckets = old_obj_ptr_muxed.shape[1]

	# Create temporary multiplex state with old bucket count to demux
	if old_num_buckets != multiplex_state.num_buckets:
	# Bucket count changed - cannot safely demux old obj_ptr
	# Instead, create new obj_ptr from scratch for all objects
	num_objs = len(inference_state["obj_ids"])
	obj_ptr_data = torch.zeros(
	(num_objs, singleton_obj_ptr_data.shape[1]),
	dtype=singleton_obj_ptr_data.dtype,
	device=singleton_obj_ptr_data.device,
	)
	# Only set the singleton object's ptr, leave others as zeros
	obj_ptr_data[new_obj_idx : new_obj_idx + 1] = (
	singleton_obj_ptr_data
	)
	main_frame_out["obj_ptr"] = multiplex_state.mux(
	obj_ptr_data
	)
	else:
	# Bucket count matches - safe to demux
	main_obj_ptr_data = multiplex_state.demux(
	old_obj_ptr_muxed
	)

	# Expand if needed
	if main_obj_ptr_data.shape[0] < new_obj_idx + 1:
	pad_size = (
	new_obj_idx + 1 - main_obj_ptr_data.shape[0]
	)
	main_obj_ptr_data = torch.cat(
	[
	main_obj_ptr_data,
	torch.zeros(
	(
	pad_size,
	main_obj_ptr_data.shape[1],
	),
	dtype=main_obj_ptr_data.dtype,
	device=main_obj_ptr_data.device,
	),
	],
	dim=0,
	)

	main_obj_ptr_data[new_obj_idx : new_obj_idx + 1] = (
	singleton_obj_ptr_data
	)
	main_frame_out["obj_ptr"] = multiplex_state.mux(
	main_obj_ptr_data
	)
	else:
	# Create new obj_ptr
	num_objs = len(inference_state["obj_ids"])
	obj_ptr_data = torch.zeros(
	(num_objs, singleton_obj_ptr_data.shape[1]),
	dtype=singleton_obj_ptr_data.dtype,
	device=singleton_obj_ptr_data.device,
	)
	obj_ptr_data[new_obj_idx : new_obj_idx + 1] = (
	singleton_obj_ptr_data
	)
	main_frame_out["obj_ptr"] = multiplex_state.mux(
	obj_ptr_data
	)

	# Update conditioning_objects
	if singleton_obj_idx in singleton_frame_out.get(
	"conditioning_objects", set()
	):
	main_frame_out["conditioning_objects"].add(new_obj_idx)

	@torch.inference_mode()
	def add_new_points(
	self,
	inference_state,
	frame_idx,
	obj_id,
	points,
	labels,
	clear_old_points,
	rel_coordinates=True,
	use_prev_mem_frame=False,
	):
	"""
	Add new points to create a new object in the multiplex model.

	This method converts point inputs to masks via the interactivity head and adds
	the new object to the existing multiplex bucket (for dynamic models).

	Args:
	inference_state: Current inference state
	frame_idx: Frame index to add points
	obj_id: Object ID (will be auto-created if new)
	points: Point coordinates tensor
	labels: Point labels tensor (1 for positive, 0 for negative)
	clear_old_points: Whether to clear old points on this frame
	rel_coordinates: Whether points are in relative coordinates [0, 1]
	use_prev_mem_frame: Whether to use previous memory frames (for compatibility)

	Returns:
	Tuple of (frame_idx, obj_ids, low_res_masks, video_res_masks)
	"""
	obj_idx = self._obj_id_to_idx(inference_state, obj_id)
	obj_idxs = [obj_idx]
	obj_ids = [obj_id]

	point_inputs_per_frame = inference_state["point_inputs_per_obj"][obj_idx]
	mask_inputs_per_frame = inference_state["mask_inputs_per_obj"][obj_idx]

	if points.dim() == 2:
	points = points.unsqueeze(0)
	if labels.dim() == 1:
	labels = labels.unsqueeze(0)

	if rel_coordinates:
	points = points * self.image_size

	points = points.to(inference_state["device"])
	labels = labels.to(inference_state["device"])

	if not clear_old_points:
	old_point_inputs = point_inputs_per_frame.get(frame_idx, None)
	else:
	old_point_inputs = None

	point_inputs = concat_points(old_point_inputs, points, labels)
	point_inputs_per_frame[frame_idx] = point_inputs

	is_init_cond_frame = frame_idx not in inference_state["frames_already_tracked"]

	if is_init_cond_frame:
	reverse = False
	else:
	reverse = inference_state["frames_already_tracked"][frame_idx]["reverse"]

	is_cond = is_init_cond_frame or self.add_all_frames_to_correct_as_cond
	storage_key = "cond_frame_outputs" if is_cond else "non_cond_frame_outputs"

	multiplex_state = inference_state["multiplex_state"]
	is_new_state = multiplex_state is None

	if is_new_state:
	multiplex_state = self.multiplex_controller.get_state(
	num_valid_entries=1,
	device=inference_state["device"],
	dtype=torch.float32,
	random=False,
	object_ids=obj_ids,
	)
	inference_state["multiplex_state"] = multiplex_state

	# Determine interaction case:
	# - New object: never seen before
	# - Refine: existing mask on tracked frame
	# - Gap fill: object exists but frame has no output
	is_existing_object = (
	not is_new_state
	and multiplex_state is not None
	and obj_id in multiplex_state.object_ids
	)

	if is_existing_object:
	if is_init_cond_frame:
	is_new_obj = False
	is_refine = False
	is_gap_fill_case = True
	else:
	is_new_obj = False
	is_refine = True
	is_gap_fill_case = False
	else:
	is_new_obj = True
	is_refine = False
	is_gap_fill_case = False

	if is_new_obj:
	should_add_to_existing = not is_new_state
	allow_new_buckets_local = True
	prefer_new_buckets_local = True

	current_out, _ = self._run_single_frame_inference(
	inference_state=inference_state,
	output_dict=inference_state["output_dict"],
	frame_idx=frame_idx,
	batch_size=1,
	is_init_cond_frame=True,
	point_inputs=point_inputs,
	mask_inputs=None,
	reverse=False,
	run_mem_encoder=False,
	prev_sam_mask_logits=None,
	add_to_existing_state=should_add_to_existing,
	new_obj_idxs=obj_idxs,
	new_obj_ids=obj_ids,
	allow_new_buckets=allow_new_buckets_local,
	prefer_new_buckets=prefer_new_buckets_local,
	objects_to_interact=None,
	)
	elif is_refine:
	singleton_state, original_obj_idx = self._extract_object_for_interaction(
	inference_state, obj_id, frame_idx
	)

	user_refined_frames_map = inference_state.get(
	"user_refined_frames_per_obj", {}
	)
	user_refined_frames = user_refined_frames_map.get(obj_id)
	if user_refined_frames is None:
	user_refined_frames = set()
	is_first_refinement = frame_idx not in user_refined_frames

	prev_sam_mask_logits_singleton = None
	if not is_first_refinement:
	singleton_obj_idx = 0
	singleton_output_dict = singleton_state["output_dict_per_obj"][
	singleton_obj_idx
	]
	singleton_temp_output_dict = singleton_state[
	"temp_output_dict_per_obj"
	][singleton_obj_idx]

	# Check BOTH storage keys since previous refinement might be in a different key
	# (e.g., first refinement creates cond_frame, but after propagation,
	# second refinement on same frame would look for non_cond_frame)
	prev_out = None

	storage_key_current = (
	"cond_frame_outputs" if is_cond else "non_cond_frame_outputs"
	)
	prev_out = singleton_temp_output_dict[storage_key_current].get(
	frame_idx
	)

	if prev_out is None:
	prev_out = singleton_output_dict["cond_frame_outputs"].get(
	frame_idx
	)
	if prev_out is None:
	prev_out = singleton_output_dict["non_cond_frame_outputs"].get(
	frame_idx
	)

	if prev_out is not None and prev_out["pred_masks"] is not None:
	prev_sam_mask_logits_singleton = prev_out["pred_masks"].cuda(
	non_blocking=True
	)
	prev_sam_mask_logits_singleton = torch.clamp(
	prev_sam_mask_logits_singleton, -32.0, 32.0
	)

	if is_first_refinement:
	# ALWAYS use is_init_cond_frame=True to force interaction_only mode
	# for fresh segmentation from points (not refinement of propagated mask).
	singleton_is_init_cond = True
	singleton_objects_to_interact = None
	else:
	# Second+ refinement: Incremental refinement for quality improvement
	singleton_is_init_cond = False
	singleton_objects_to_interact = (
	[0] if prev_sam_mask_logits_singleton is not None else None
	)

	singleton_obj_idx = 0
	singleton_obj_idxs = [singleton_obj_idx]
	singleton_obj_ids = [obj_id]

	current_out, _ = self._run_single_frame_inference(
	inference_state=singleton_state,
	output_dict=singleton_state["output_dict"],
	frame_idx=frame_idx,
	batch_size=1,
	is_init_cond_frame=singleton_is_init_cond,
	point_inputs=point_inputs,
	mask_inputs=None,
	reverse=False,
	run_mem_encoder=False,
	prev_sam_mask_logits=prev_sam_mask_logits_singleton,
	add_to_existing_state=False,
	new_obj_idxs=singleton_obj_idxs,
	new_obj_ids=singleton_obj_ids,
	allow_new_buckets=False,
	objects_to_interact=singleton_objects_to_interact,
	)

	singleton_storage_key = (
	"cond_frame_outputs"
	if singleton_is_init_cond
	else "non_cond_frame_outputs"
	)

	_, singleton_video_res_masks = self._get_orig_video_res_output(
	singleton_state, current_out["pred_masks"]
	)
	current_out["pred_masks_video_res"] = singleton_video_res_masks

	singleton_state["output_dict"][singleton_storage_key][frame_idx] = (
	current_out
	)

	self._merge_singleton_interaction_result(
	inference_state, singleton_state, obj_id, original_obj_idx
	)

	obj_idx = inference_state["obj_id_to_idx"][obj_id]
	obj_idxs = [obj_idx]

	if "user_refined_frames_per_obj" not in inference_state:
	inference_state["user_refined_frames_per_obj"] = {}
	if obj_id not in inference_state["user_refined_frames_per_obj"]:
	inference_state["user_refined_frames_per_obj"][obj_id] = set()

	inference_state["user_refined_frames_per_obj"][obj_id].add(frame_idx)

	merged_frame_out = inference_state["output_dict"][singleton_storage_key][
	frame_idx
	]
	obj_output_dict = inference_state["output_dict_per_obj"][obj_idx]
	obj_temp_output_dict = inference_state["temp_output_dict_per_obj"][obj_idx]

	if "pred_masks_video_res" in merged_frame_out:
	pred_masks_video_res_slice = merged_frame_out["pred_masks_video_res"][
	obj_idx : obj_idx + 1
	]
	else:
	_, video_res_masks = self._get_orig_video_res_output(
	inference_state, merged_frame_out["pred_masks"]
	)
	pred_masks_video_res_slice = video_res_masks[obj_idx : obj_idx + 1]

	pred_masks_slice = merged_frame_out["pred_masks"][obj_idx : obj_idx + 1]

	obj_temp_output_dict[singleton_storage_key][frame_idx] = {
	"pred_masks": pred_masks_slice,
	"pred_masks_video_res": pred_masks_video_res_slice,
	"object_score_logits": merged_frame_out["object_score_logits"][
	obj_idx : obj_idx + 1
	],
	}
	obj_output_dict[singleton_storage_key][frame_idx] = obj_temp_output_dict[
	singleton_storage_key
	][frame_idx]

	elif is_gap_fill_case:
	# Gap fill: Run inference directly in multiplex mode (no singleton extraction)
	# Even though is_init_cond_frame=True, we use add_to_existing_state=False
	# because the object ALREADY EXISTS in multiplex state.
	obj_idx = inference_state["obj_id_to_idx"][obj_id]
	obj_idxs = [obj_idx]
	batch_size = self._get_obj_num(inference_state)

	obj_output_dict = inference_state["output_dict_per_obj"][obj_idx]
	obj_temp_output_dict = inference_state["temp_output_dict_per_obj"][obj_idx]

	current_out, _ = self._run_single_frame_inference(
	inference_state=inference_state,
	output_dict=inference_state["output_dict"],
	frame_idx=frame_idx,
	batch_size=batch_size,
	is_init_cond_frame=True,
	point_inputs=point_inputs,
	mask_inputs=None,
	reverse=False,
	run_mem_encoder=False,
	prev_sam_mask_logits=None,
	add_to_existing_state=False,
	new_obj_idxs=[obj_idx],
	new_obj_ids=[obj_id],
	allow_new_buckets=False,
	prefer_new_buckets=False,
	objects_to_interact=[obj_idx],
	)

	current_out["local_obj_id_to_idx"] = deepcopy(
	inference_state["obj_id_to_idx"]
	)

	_, video_res_masks = self._get_orig_video_res_output(
	inference_state, current_out["pred_masks"]
	)
	current_out["pred_masks_video_res"] = video_res_masks

	is_cond = storage_key == "cond_frame_outputs"
	if (
	is_cond
	and frame_idx
	in inference_state["output_dict"]["non_cond_frame_outputs"]
	):
	del inference_state["output_dict"]["non_cond_frame_outputs"][frame_idx]
	if "consolidated_frame_inds" in inference_state:
	inference_state["consolidated_frame_inds"][
	"non_cond_frame_outputs"
	].discard(frame_idx)

	# Store consolidated output (has obj_ptr, maskmem_features, etc.)
	inference_state["output_dict"][storage_key][frame_idx] = current_out

	# Mark as consolidated
	if "consolidated_frame_inds" in inference_state:
	inference_state["consolidated_frame_inds"][storage_key].add(frame_idx)

	# Also store per-object slices in temp_output_dict_per_obj
	obj_temp_output_dict[storage_key][frame_idx] = {
	"pred_masks": current_out["pred_masks"][obj_idx : obj_idx + 1],
	"pred_masks_video_res": video_res_masks[obj_idx : obj_idx + 1],
	"object_score_logits": current_out["object_score_logits"][
	obj_idx : obj_idx + 1
	],
	}
	obj_output_dict[storage_key][frame_idx] = obj_temp_output_dict[storage_key][
	frame_idx
	]

	# Store outputs and prepare return values
	obj_output_dict = inference_state["output_dict_per_obj"][obj_idx]
	obj_temp_output_dict = inference_state["temp_output_dict_per_obj"][obj_idx]

	# For refinement/gap fill (singleton extraction), handle singleton output specially
	if is_refine or is_gap_fill_case:
	# Singleton case: The merge already updated the consolidated output_dict during merge.
	# However, we need to ensure the frame is properly stored and marked.

	singleton_obj_idx = 0

	# Get video resolution masks from singleton output
	_, video_res_masks_singleton = self._get_orig_video_res_output(
	inference_state, current_out["pred_masks"]
	)

	# Mark frame as consolidated (prevents double consolidation in preflight)
	if "consolidated_frame_inds" in inference_state:
	inference_state["consolidated_frame_inds"][storage_key].add(frame_idx)

	# For return value, use singleton masks
	video_res_masks_to_return = video_res_masks_singleton[
	singleton_obj_idx : singleton_obj_idx + 1
	]
	else:
	# Standard multiplex output - use obj_idx
	_, video_res_masks = self._get_orig_video_res_output(
	inference_state, current_out["pred_masks"]
	)

	current_out["pred_masks_video_res"] = video_res_masks
	current_out["local_obj_id_to_idx"] = deepcopy(
	inference_state["obj_id_to_idx"]
	)

	# Remove from non_cond if this becomes a cond frame
	if (
	is_cond
	and frame_idx
	in inference_state["output_dict"]["non_cond_frame_outputs"]
	):
	del inference_state["output_dict"]["non_cond_frame_outputs"][frame_idx]
	# Also update consolidated_frame_inds
	if "consolidated_frame_inds" in inference_state:
	inference_state["consolidated_frame_inds"][
	"non_cond_frame_outputs"
	].discard(frame_idx)

	inference_state["output_dict"][storage_key][frame_idx] = current_out

	# Update consolidated_frame_inds to track this frame
	if "consolidated_frame_inds" in inference_state:
	inference_state["consolidated_frame_inds"][storage_key].add(frame_idx)

	# Store per-object outputs (slice from the full multiplex output)
	obj_temp_output_dict[storage_key][frame_idx] = {
	"pred_masks_video_res": current_out["pred_masks_video_res"][
	obj_idx : obj_idx + 1
	],
	"pred_masks": current_out["pred_masks"][obj_idx : obj_idx + 1],
	"object_score_logits": current_out["object_score_logits"][
	obj_idx : obj_idx + 1
	],
	}

	obj_output_dict[storage_key][frame_idx] = obj_temp_output_dict[storage_key][
	frame_idx
	]

	video_res_masks_to_return = video_res_masks[obj_idx : obj_idx + 1]

	low_res_masks = None
	return frame_idx, obj_ids, low_res_masks, video_res_masks_to_return

	@torch.inference_mode()
	def add_new_masks(
	self,
	inference_state,
	frame_idx,
	obj_ids,
	masks,
	# for compatibility with per_obj_inference class, not used here
	add_mask_to_memory=False,
	# for object reconditioning; do not update the multiplex state
	reconditioning=False,
	):
	"""Add new mask to a frame."""
	if isinstance(obj_ids, np.ndarray):
	obj_ids = obj_ids.tolist()
	obj_idxs = [
	self._obj_id_to_idx(inference_state, obj_id, error_if_new=reconditioning)
	for obj_id in obj_ids
	]
	point_inputs_per_frame = [
	inference_state["point_inputs_per_obj"][obj_idx] for obj_idx in obj_idxs
	]
	mask_inputs_per_frame = [
	inference_state["mask_inputs_per_obj"][obj_idx] for obj_idx in obj_idxs
	]

	assert masks.dim() == 3
	num_objects, mask_H, mask_W = masks.shape
	assert num_objects == len(obj_ids)
	masks_inputs_orig = masks[:, None, :, :] # add channel dimension
	masks_inputs_orig = masks_inputs_orig.float().to(inference_state["device"])

	# resize the mask if it doesn't match the model's input mask size
	if mask_H != self.input_mask_size or mask_W != self.input_mask_size:
	mask_inputs = torch.nn.functional.interpolate(
	masks_inputs_orig,
	size=(self.input_mask_size, self.input_mask_size),
	align_corners=False,
	mode="bilinear",
	antialias=True, # use antialias for downsampling
	)
	else:
	mask_inputs = masks_inputs_orig

	# also get the mask at the original video resolution (for outputting)
	video_H = inference_state["video_height"]
	video_W = inference_state["video_width"]
	if mask_H != video_H or mask_W != video_W:
	mask_inputs_video_res = torch.nn.functional.interpolate(
	masks_inputs_orig,
	size=(video_H, video_W),
	align_corners=False,
	mode="bilinear",
	antialias=True, # use antialias for potential downsampling
	)
	else:
	mask_inputs_video_res = masks_inputs_orig
	# convert mask_inputs_video_res to binary (threshold at 0.5 as it is in range 0~1)
	mask_inputs_video_res = mask_inputs_video_res > 0.5

	multiplex_state = inference_state["multiplex_state"]
	is_new_state = multiplex_state is None

	if not reconditioning:
	if is_new_state:
	multiplex_state = self.multiplex_controller.get_state(
	num_valid_entries=num_objects,
	device=inference_state["device"],
	dtype=torch.float32, # lower precision is also fine
	random=False,
	object_ids=obj_ids,
	)
	inference_state["multiplex_state"] = multiplex_state
	else:
	assert self.is_dynamic_model, (
	"New objects are not allowed after state creation"
	)

	for i in range(num_objects):
	mask_inputs_per_frame[i][frame_idx] = mask_inputs_video_res[i : i + 1]
	point_inputs_per_frame[i].pop(frame_idx, None)
	# If this frame hasn't been tracked before, we treat it as an initial conditioning
	# frame, meaning that the inputs points are to generate segments on this frame without
	# using any memory from other frames, like in SAM. Otherwise (if it has been tracked),
	# the input points will be used to correct the already tracked masks.
	is_init_cond_frame = frame_idx not in inference_state["frames_already_tracked"]
	# whether to track in reverse time order
	if is_init_cond_frame:
	reverse = False
	else:
	reverse = inference_state["frames_already_tracked"][frame_idx]["reverse"]
	obj_output_dicts = [
	inference_state["output_dict_per_obj"][obj_idx] for obj_idx in obj_idxs
	]
	obj_temp_output_dicts = [
	inference_state["temp_output_dict_per_obj"][obj_idx] for obj_idx in obj_idxs
	]
	# Add a frame to conditioning output if it's an initial conditioning frame or
	# if the model sees all frames receiving clicks/mask as conditioning frames.
	is_cond = is_init_cond_frame or self.add_all_frames_to_correct_as_cond
	storage_key = "cond_frame_outputs" if is_cond else "non_cond_frame_outputs"

	# Allow creating a new bucket only when existing buckets cannot fit the new objects
	allow_new_buckets_local = False
	if not is_new_state and not reconditioning and multiplex_state is not None:
	if multiplex_state.available_slots < num_objects:
	allow_new_buckets_local = True

	current_out, _ = self._run_single_frame_inference(
	inference_state=inference_state,
	output_dict=inference_state["output_dict"],
	frame_idx=frame_idx,
	batch_size=num_objects,
	is_init_cond_frame=is_init_cond_frame,
	point_inputs=None,
	mask_inputs=mask_inputs,
	reverse=reverse,
	# Skip the memory encoder when adding clicks or mask. We execute the memory encoder
	# at the beginning of `propagate_in_video` (after user finalize their clicks). This
	# allows us to enforce non-overlapping constraints on all objects before encoding
	# them into memory.
	run_mem_encoder=False,
	add_to_existing_state=not is_new_state and not reconditioning,
	new_obj_idxs=obj_idxs,
	new_obj_ids=obj_ids,
	allow_new_buckets=allow_new_buckets_local,
	reconditioning=reconditioning,
	)
	# We directly use the input mask at video resolution as the output mask for a better
	# video editing experience (so that the masks don't change after each brushing).
	# Here NO_OBJ_SCORE is a large negative value to represent the background and
	# similarly -NO_OBJ_SCORE is a large positive value to represent the foreground.
	_, video_res_masks = self._get_orig_video_res_output(
	inference_state, current_out["pred_masks"]
	)
	obj_idxs_t = torch.as_tensor(obj_idxs, device=video_res_masks.device)
	video_res_masks[obj_idxs_t] = torch.where(
	mask_inputs_video_res, -NO_OBJ_SCORE, NO_OBJ_SCORE
	)

	current_out["pred_masks_video_res"] = video_res_masks
	with torch.profiler.record_function("add_new_masks._deepcopy"):
	current_out["local_obj_id_to_idx"] = deepcopy(
	inference_state["obj_id_to_idx"]
	)
	if (
	is_cond
	and frame_idx in inference_state["output_dict"]["non_cond_frame_outputs"]
	):
	del inference_state["output_dict"]["non_cond_frame_outputs"][frame_idx]
	# Also update consolidated_frame_inds
	if "consolidated_frame_inds" in inference_state:
	inference_state["consolidated_frame_inds"][
	"non_cond_frame_outputs"
	].discard(frame_idx)

	inference_state["output_dict"][storage_key][frame_idx] = current_out

	# Update consolidated_frame_inds to track this frame
	if "consolidated_frame_inds" in inference_state:
	inference_state["consolidated_frame_inds"][storage_key].add(frame_idx)

	with torch.profiler.record_function("add_new_masks.obj_loop"):
	# Step 1: Set all new object masks first (batched)
	for i, obj_idx in enumerate(obj_idxs):
	# Add the predicted masks to the output dict
	# NOTE: object ordering matters here but I guess this is the same for the per-object implementation
	obj_temp_output_dicts[i][storage_key][frame_idx] = {
	"pred_masks_video_res": current_out["pred_masks_video_res"][
	obj_idx : obj_idx + 1
	]
	}
	obj_output_dicts[i][storage_key][frame_idx] = obj_temp_output_dicts[i][
	storage_key
	][frame_idx]

	# Step 2: Precompute suppress masks to avoid O(n*m) torch.where calls
	# Combined mask of all new objects (for existing objects)
	combined_new_mask = mask_inputs_video_res.any(
	dim=0, keepdim=True
	) # (1, 1, H, W)

	# Precompute exclude-self masks for new objects (if there are multiple new objects)
	num_new = len(obj_idxs)
	exclude_self_masks = {}
	if num_new > 1:
	for i in range(num_new):
	other_indices = torch.cat(
	[
	torch.arange(i, device=mask_inputs_video_res.device),
	torch.arange(
	i + 1, num_new, device=mask_inputs_video_res.device
	),
	]
	)
	exclude_self_masks[obj_idxs[i]] = mask_inputs_video_res[
	other_indices
	].any(dim=0, keepdim=True)

	# Step 3: Apply suppression to all objects in a single pass
	temp_output_dict_per_obj = inference_state["temp_output_dict_per_obj"]
	obj_idxs_set = set(obj_idxs)

	for obj_idx2, obj_temp_output_dict2 in temp_output_dict_per_obj.items():
	current_out2 = obj_temp_output_dict2[storage_key].get(frame_idx, None)
	if current_out2 is None:
	continue

	if obj_idx2 not in obj_idxs_set:
	# Existing object: suppress by all new masks
	suppress_mask = combined_new_mask
	elif obj_idx2 in exclude_self_masks:
	# New object: suppress by other new objects' masks
	suppress_mask = exclude_self_masks[obj_idx2]
	else:
	# Only one new object - nothing to suppress for itself
	continue

	current_out2["pred_masks_video_res"] = torch.where(
	suppress_mask,
	NO_OBJ_SCORE,
	current_out2["pred_masks_video_res"],
	)

	# Resize the output mask to the original video resolution
	obj_ids = inference_state["obj_ids"]
	consolidated_out = self._consolidate_temp_output_across_obj(
	inference_state,
	frame_idx,
	is_cond=is_cond,
	run_mem_encoder=False,
	consolidate_at_video_res=True,
	)
	_, video_res_masks = self._get_orig_video_res_output(
	inference_state, consolidated_out["pred_masks_video_res"]
	)
	low_res_masks = None # not needed by the demo

	consolidated_out["local_obj_id_to_idx"] = current_out["local_obj_id_to_idx"]

	return frame_idx, obj_ids, low_res_masks, video_res_masks

	def _get_orig_video_res_output(self, inference_state, any_res_masks):
	"""
	Resize the object scores to the original video resolution (video_res_masks)
	and apply non-overlapping constraints for final output.
	"""
	device = inference_state["device"]
	video_H = inference_state["video_height"]
	video_W = inference_state["video_width"]
	any_res_masks = any_res_masks.to(device, non_blocking=True)
	if any_res_masks.shape[-2:] == (video_H, video_W):
	video_res_masks = any_res_masks
	else:
	video_res_masks = torch.nn.functional.interpolate(
	any_res_masks,
	size=(video_H, video_W),
	mode="bilinear",
	align_corners=False,
	)
	if self.non_overlap_masks_for_output:
	video_res_masks = self._apply_non_overlapping_constraints(video_res_masks)
	# potentially fill holes in the predicted masks
	if self.fill_hole_area > 0:
	video_res_masks = fill_holes_in_mask_scores(
	video_res_masks, self.fill_hole_area
	)
	return any_res_masks, video_res_masks

	def _consolidate_temp_output_across_obj(
	self,
	inference_state,
	frame_idx,
	is_cond,
	run_mem_encoder,
	consolidate_at_video_res=False,
	):
	"""
	Consolidate the per-object temporary outputs in `temp_output_dict_per_obj` on
	a frame into a single output for all objects, including
	1) fill any missing objects either from `output_dict_per_obj` (if they exist in
	`output_dict_per_obj` for this frame) or leave them as placeholder values
	(if they don't exist in `output_dict_per_obj` for this frame);
	2) if specified, rerun memory encoder after apply non-overlapping constraints
	on the object scores.
	"""
	batch_size = self._get_obj_num(inference_state)
	storage_key = "cond_frame_outputs" if is_cond else "non_cond_frame_outputs"

	# After singleton merge, objects can be added at indices beyond batch_size
	# We need to find the maximum object index that has temp or regular outputs to size the tensor correctly
	max_obj_idx = batch_size - 1 # Default to batch_size - 1

	# Check both temp and regular output dicts to find max index
	for obj_idx in inference_state["temp_output_dict_per_obj"].keys():
	if obj_idx > max_obj_idx:
	max_obj_idx = obj_idx
	for obj_idx in inference_state["output_dict_per_obj"].keys():
	if obj_idx > max_obj_idx:
	max_obj_idx = obj_idx

	# Size the consolidated tensor to accommodate all object indices (not just count)
	consolidated_batch_size = max(max_obj_idx + 1, 0) # Ensure non-negative

	# Optionally, we allow consolidating the temporary outputs at the original
	# video resolution (to provide a better editing experience for mask prompts).
	if consolidate_at_video_res:
	assert not run_mem_encoder, "memory encoder cannot run at video resolution"
	consolidated_H = inference_state["video_height"]
	consolidated_W = inference_state["video_width"]
	consolidated_mask_key = "pred_masks_video_res"
	else:
	consolidated_H = consolidated_W = self.low_res_mask_size
	consolidated_mask_key = "pred_masks"

	# Initialize `consolidated_out`. Its "maskmem_features" and "maskmem_pos_enc"
	# will be added when rerunning the memory encoder after applying non-overlapping
	# constraints to object scores. Its "pred_masks" are prefilled with a large
	# negative value (NO_OBJ_SCORE) to represent missing objects.

	consolidated_out = {
	"conditioning_objects": None,
	"maskmem_features": None,
	"maskmem_pos_enc": None,
	"image_features": None,
	"image_pos_enc": None,
	"obj_ptr": None,
	consolidated_mask_key: torch.full(
	size=(
	consolidated_batch_size,
	1,
	consolidated_H,
	consolidated_W,
	), # Use consolidated_batch_size, not batch_size!
	fill_value=NO_OBJ_SCORE,
	dtype=torch.float32,
	device=inference_state["storage_device"],
	),
	}

	all_out = inference_state["output_dict"]["cond_frame_outputs"].get(
	frame_idx, None
	)
	if all_out is None:
	all_out = inference_state["output_dict"]["non_cond_frame_outputs"].get(
	frame_idx, None
	)

	# Handle the case where output_dict is empty (e.g., during demo VG propagation)
	# In this case, we'll reconstruct the consolidated output from per-object outputs
	need_to_reconstruct_from_per_obj = all_out is None

	if need_to_reconstruct_from_per_obj:
	# Initialize fields that will be populated from per-object outputs or later
	# Determine which objects are conditioned by checking if they have point/mask inputs on this frame
	conditioning_objects = set()
	for obj_idx in range(batch_size):
	# Check if this object has point inputs on this frame
	if obj_idx in inference_state["point_inputs_per_obj"]:
	point_inputs = inference_state["point_inputs_per_obj"][obj_idx]
	if (
	frame_idx in point_inputs
	and point_inputs[frame_idx] is not None
	):
	conditioning_objects.add(obj_idx)
	continue

	# Check if this object has mask inputs on this frame
	if obj_idx in inference_state["mask_inputs_per_obj"]:
	mask_inputs = inference_state["mask_inputs_per_obj"][obj_idx]
	if frame_idx in mask_inputs and mask_inputs[frame_idx] is not None:
	conditioning_objects.add(obj_idx)

	consolidated_out["conditioning_objects"] = conditioning_objects
	# Shared features will be populated when running memory encoder
	# Note: obj_ptr and object_score_logits will be populated from per-object outputs below
	else:
	# Normal case: populate from existing consolidated output
	consolidated_out["conditioning_objects"] = all_out.get(
	"conditioning_objects", set()
	)
	consolidated_out["obj_ptr"] = all_out["obj_ptr"]
	consolidated_out["object_score_logits"] = all_out["object_score_logits"]
	if self.use_memory_selection:
	consolidated_out["iou_score"] = all_out["iou_score"]
	# These fields might not exist in per-object outputs (e.g., after singleton extraction)
	consolidated_out["maskmem_features"] = all_out.get("maskmem_features")
	consolidated_out["maskmem_pos_enc"] = all_out.get("maskmem_pos_enc")
	consolidated_out["image_features"] = all_out.get("image_features")
	consolidated_out["image_pos_enc"] = all_out.get("image_pos_enc")
	consolidated_out["local_obj_id_to_idx"] = all_out.get(
	"local_obj_id_to_idx", {}
	)
	consolidated_out["obj_ptr"] = all_out["obj_ptr"]
	consolidated_out["object_score_logits"] = all_out["object_score_logits"]
	if self.use_memory_selection:
	consolidated_out["iou_score"] = all_out["iou_score"]
	# These fields might not exist in per-object outputs (e.g., after singleton extraction)
	consolidated_out["maskmem_features"] = all_out.get("maskmem_features")
	consolidated_out["maskmem_pos_enc"] = all_out.get("maskmem_pos_enc")
	consolidated_out["image_features"] = all_out.get("image_features")
	consolidated_out["image_pos_enc"] = all_out.get("image_pos_enc")
	consolidated_out["local_obj_id_to_idx"] = all_out.get(
	"local_obj_id_to_idx", {}
	)
	all_mask = all_out.get("pred_masks_video_res", all_out["pred_masks"])
	# Ensure masks are at the correct consolidated resolution
	# This handles the case where all_out has interactive resolution (288) masks
	# that need to be resized to SAM2's low_res_mask_size (256) for consistency
	if all_mask.shape[-2:] == (consolidated_H, consolidated_W):
	consolidated_out[consolidated_mask_key] = all_mask
	else:
	# Resize first if mask has a different resolution (e.g., 288 from interactive)
	# Determine if we're downsampling or upsampling
	is_downsampling = all_mask.shape[-1] > consolidated_W
	resized_mask = torch.nn.functional.interpolate(
	all_mask,
	size=(consolidated_H, consolidated_W),
	mode="bilinear",
	align_corners=False,
	antialias=is_downsampling, # use antialias for downsampling
	)
	consolidated_out[consolidated_mask_key] = resized_mask

	# Collect per-object outputs (masks and scores) to build consolidated output
	# When reconstructing from per-object outputs, we also need to collect obj_ptr and object_score_logits
	obj_score_logits_list = []
	obj_ptr_list = [] if need_to_reconstruct_from_per_obj else None
	iou_scores_list = (
	[]
	if need_to_reconstruct_from_per_obj and self.use_memory_selection
	else None
	)

	# When reconstructing from per-object outputs, initialize the mask tensor
	# with the correct size (consolidated_batch_size, not batch_size)
	if (
	need_to_reconstruct_from_per_obj
	and consolidated_mask_key not in consolidated_out
	):
	# Initialize with zeros - will be populated from per-object outputs below
	consolidated_out[consolidated_mask_key] = torch.zeros(
	(consolidated_batch_size, 1, consolidated_H, consolidated_W),
	dtype=torch.float32,
	device=inference_state["storage_device"],
	)
	consolidated_out["object_score_logits"] = torch.full(
	(consolidated_batch_size, 1),
	NO_OBJ_SCORE,
	dtype=torch.float32,
	device=inference_state["storage_device"],
	)

	for obj_idx in range(
	consolidated_batch_size
	): # Use consolidated_batch_size instead of batch_size
	# Check if this object index exists in temp/output dicts (it may not if object was just added)
	if obj_idx not in inference_state["temp_output_dict_per_obj"]:
	continue
	if obj_idx not in inference_state["output_dict_per_obj"]:
	continue
	obj_temp_output_dict = inference_state["temp_output_dict_per_obj"][obj_idx]
	obj_output_dict = inference_state["output_dict_per_obj"][obj_idx]
	out = obj_temp_output_dict[storage_key].get(frame_idx, None)
	# If the object doesn't appear in "temp_output_dict_per_obj" on this frame,
	# we fall back and look up its previous output in "output_dict_per_obj".
	# We look up both "cond_frame_outputs" and "non_cond_frame_outputs" in
	# "output_dict_per_obj" to find a previous output for this object.
	if out is None:
	out = obj_output_dict["cond_frame_outputs"].get(frame_idx, None)
	if out is None:
	out = obj_output_dict["non_cond_frame_outputs"].get(frame_idx, None)
	if out is None:
	# object pointers are filled globally above; we don't need empty_mask_ptr
	continue
	# Add the temporary object output mask to consolidated output mask
	# (use "pred_masks_video_res" if it's available)
	obj_mask = out.get("pred_masks_video_res")
	if obj_mask is None:
	obj_mask = out.get("pred_masks")
	consolidated_pred_masks = consolidated_out[consolidated_mask_key]

	# If obj_idx is beyond the consolidated_pred_masks size,
	# we need to expand it (can happen after singleton merge adds object at end)
	if obj_idx >= consolidated_pred_masks.shape[0]:
	pad_size = obj_idx + 1 - consolidated_pred_masks.shape[0]
	consolidated_pred_masks = torch.cat(
	[
	consolidated_pred_masks,
	torch.zeros(
	(
	pad_size,
	1,
	consolidated_pred_masks.shape[-2],
	consolidated_pred_masks.shape[-1],
	),
	dtype=consolidated_pred_masks.dtype,
	device=consolidated_pred_masks.device,
	),
	],
	dim=0,
	)
	consolidated_out[consolidated_mask_key] = consolidated_pred_masks
	# Also expand object_score_logits if present
	if "object_score_logits" in consolidated_out:
	consolidated_scores = consolidated_out["object_score_logits"]
	consolidated_scores = torch.cat(
	[
	consolidated_scores,
	torch.full(
	(pad_size, 1),
	NO_OBJ_SCORE,
	dtype=consolidated_scores.dtype,
	device=consolidated_scores.device,
	),
	],
	dim=0,
	)
	consolidated_out["object_score_logits"] = consolidated_scores

	if obj_mask.shape[-2:] == consolidated_pred_masks.shape[-2:]:
	# Ensure dtype match between source and destination before assignment
	if obj_mask.dtype != consolidated_pred_masks.dtype:
	obj_mask = obj_mask.to(consolidated_pred_masks.dtype)
	consolidated_pred_masks[obj_idx : obj_idx + 1] = obj_mask
	else:
	# Resize first if temporary object mask has a different resolution
	is_downsampling = "pred_masks_video_res" in out
	resized_obj_mask = torch.nn.functional.interpolate(
	obj_mask,
	size=consolidated_pred_masks.shape[-2:],
	mode="bilinear",
	align_corners=False,
	antialias=is_downsampling, # use antialias for downsampling
	)
	# Ensure dtype match between source and destination before assignment
	if resized_obj_mask.dtype != consolidated_pred_masks.dtype:
	resized_obj_mask = resized_obj_mask.to(
	consolidated_pred_masks.dtype
	)
	consolidated_pred_masks[obj_idx : obj_idx + 1] = resized_obj_mask

	# When reconstructing from per-object outputs, also collect scores
	if need_to_reconstruct_from_per_obj:
	if "object_score_logits" in out:
	obj_score_logits_list.append(out["object_score_logits"])
	if self.use_memory_selection and "iou_score" in out:
	iou_scores_list.append(out["iou_score"])

	# If we reconstructed from per-object outputs, consolidate the score fields
	if need_to_reconstruct_from_per_obj:
	# Check if we have ANY valid per-object outputs
	# If not, we're trying to consolidate a VG-propagated frame that was never
	# stored in output_dict (only in cached_frame_outputs)
	# In this case, we SKIP memory encoding during preflight and will do it
	# during the first propagation step instead
	if not obj_score_logits_list and run_mem_encoder:
	run_mem_encoder = False # Skip for now, will encode during propagation

	if obj_score_logits_list:
	consolidated_out["object_score_logits"] = torch.cat(
	obj_score_logits_list, dim=0
	)
	else:
	# Create placeholder scores - these will be replaced when memory encoder runs
	device = inference_state["device"]
	consolidated_out["object_score_logits"] = torch.zeros(
	(batch_size, 1),
	dtype=torch.float32,
	device=device,
	)

	if self.use_memory_selection:
	if iou_scores_list:
	consolidated_out["iou_score"] = torch.cat(iou_scores_list, dim=0)
	else:
	consolidated_out["iou_score"] = None

	# obj_ptr will be populated by memory encoder, set to None for now
	consolidated_out["obj_ptr"] = None

	# Optionally, apply non-overlapping constraints on the consolidated scores
	# and rerun the memory encoder
	if run_mem_encoder:
	device = inference_state["device"]
	high_res_masks = torch.nn.functional.interpolate(
	consolidated_out["pred_masks"].to(device, non_blocking=True),
	size=(self.image_size, self.image_size),
	mode="bilinear",
	align_corners=False,
	)
	high_res_masks = self._apply_non_overlapping_constraints(high_res_masks)
	maskmem_features, maskmem_pos_enc, image_features, image_pos_enc = (
	self._run_memory_encoder(
	inference_state=inference_state,
	frame_idx=frame_idx,
	batch_size=batch_size,
	high_res_masks=high_res_masks,
	object_score_logits=consolidated_out["object_score_logits"],
	is_mask_from_pts=True, # these frames are what the user interacted with
	conditioning_objects=consolidated_out[
	"conditioning_objects"
	], # Pass conditioning_objects
	)
	)
	consolidated_out["maskmem_features"] = maskmem_features
	consolidated_out["maskmem_pos_enc"] = maskmem_pos_enc
	consolidated_out["image_features"] = image_features
	consolidated_out["image_pos_enc"] = image_pos_enc

	return consolidated_out

	@torch.inference_mode()
	def propagate_in_video_preflight(self, inference_state, run_mem_encoder=True):
	"""Prepare inference_state and consolidate temporary outputs before tracking."""
	inference_state["tracking_has_started"] = True
	batch_size = self._get_obj_num(inference_state)

	# Consolidate per-object temporary outputs in "temp_output_dict_per_obj" and
	# add them into "output_dict".
	temp_output_dict_per_obj = inference_state["temp_output_dict_per_obj"]
	output_dict = inference_state["output_dict"]
	# "consolidated_frame_inds" contains indices of those frames where consolidated
	# temporary outputs have been added (either in this call or any previous calls
	# to `propagate_in_video_preflight`).
	consolidated_frame_inds = inference_state["consolidated_frame_inds"]
	for is_cond in [False, True]:
	# Separately consolidate conditioning and non-conditioning temp outptus
	storage_key = "cond_frame_outputs" if is_cond else "non_cond_frame_outputs"
	# Find all the frames that contain temporary outputs for any objects
	# (these should be the frames that have just received clicks for mask inputs
	# via `add_new_points` or `add_new_mask`)
	temp_frame_inds = set()
	for obj_temp_output_dict in temp_output_dict_per_obj.values():
	temp_frame_inds.update(obj_temp_output_dict[storage_key].keys())
	consolidated_frame_inds[storage_key].update(temp_frame_inds)
	# consolidate the temprary output across all objects on this frame
	for frame_idx in temp_frame_inds:
	consolidated_out = self._consolidate_temp_output_across_obj(
	inference_state,
	frame_idx,
	is_cond=is_cond,
	run_mem_encoder=run_mem_encoder,
	)
	# merge them into "output_dict" and also create per-object slices
	output_dict[storage_key][frame_idx] = consolidated_out
	self._add_output_per_object(
	inference_state, frame_idx, consolidated_out, storage_key
	)
	clear_non_cond_mem = self.clear_non_cond_mem_around_input and (
	self.clear_non_cond_mem_for_multi_obj or batch_size <= 1
	)
	if clear_non_cond_mem:
	# clear non-conditioning memory of the surrounding frames
	self._clear_non_cond_mem_around_input(inference_state, frame_idx)

	# clear temporary outputs in `temp_output_dict_per_obj`
	for obj_temp_output_dict in temp_output_dict_per_obj.values():
	obj_temp_output_dict[storage_key].clear()

	# edge case: if an output is added to "cond_frame_outputs", we remove any prior
	# output on the same frame in "non_cond_frame_outputs"
	for frame_idx in output_dict["cond_frame_outputs"]:
	output_dict["non_cond_frame_outputs"].pop(frame_idx, None)
	for obj_output_dict in inference_state["output_dict_per_obj"].values():
	for frame_idx in obj_output_dict["cond_frame_outputs"]:
	obj_output_dict["non_cond_frame_outputs"].pop(frame_idx, None)
	for frame_idx in consolidated_frame_inds["cond_frame_outputs"]:
	assert frame_idx in output_dict["cond_frame_outputs"]
	consolidated_frame_inds["non_cond_frame_outputs"].discard(frame_idx)

	# Make sure that the frame indices in "consolidated_frame_inds" are exactly those frames
	# with either points or mask inputs (which should be true under a correct demo workflow).
	all_consolidated_frame_inds = (
	consolidated_frame_inds["cond_frame_outputs"]
	\| consolidated_frame_inds["non_cond_frame_outputs"]
	)

	input_frames_inds = set()
	for point_inputs_per_frame in inference_state["point_inputs_per_obj"].values():
	input_frames_inds.update(point_inputs_per_frame.keys())
	for mask_inputs_per_frame in inference_state["mask_inputs_per_obj"].values():
	input_frames_inds.update(mask_inputs_per_frame.keys())
	assert all_consolidated_frame_inds == input_frames_inds
	# Record the first interacted frame index (for tracking start)
	if inference_state["first_ann_frame_idx"] is None:
	inference_state["first_ann_frame_idx"] = min(
	input_frames_inds, default=None
	)
	# In case `first_ann_frame_idx` is not in the conditioning frames (e.g. because
	# we cleared the input points on that frame), pick the first conditioning frame
	if (
	inference_state["first_ann_frame_idx"]
	not in output_dict["cond_frame_outputs"]
	):
	inference_state["first_ann_frame_idx"] = min(
	output_dict["cond_frame_outputs"], default=None
	)

	def _get_processing_order(
	self, inference_state, start_frame_idx, max_frame_num_to_track, reverse
	):
	num_frames = inference_state["num_frames"]
	# set start index, end index, and processing order
	if self.always_start_from_first_ann_frame:
	# in this case, we always start tracking from the frame where we receive
	# the initial annotation and ignore the provided start_frame_idx
	start_frame_idx = inference_state["first_ann_frame_idx"]
	if start_frame_idx is None:
	# default: start from the earliest frame with input points
	start_frame_idx = min(inference_state["output_dict"]["cond_frame_outputs"])
	if max_frame_num_to_track is None:
	# default: track all the frames in the video
	max_frame_num_to_track = num_frames
	if reverse:
	end_frame_idx = max(start_frame_idx - max_frame_num_to_track, 0)
	if start_frame_idx > 0:
	processing_order = range(start_frame_idx, end_frame_idx - 1, -1)
	else:
	# TODO: Jie - this is the edge case that we start from frame 0 and track in reverse order;
	# and in the case we track a single frame for dense tracking, it should still run 1 frame (idx=0).
	# Not sure if this has any side effect.
	# processing_order = [] # skip reverse tracking if starting from frame 0 <-- original behaviour
	processing_order = [0]
	else:
	end_frame_idx = min(
	start_frame_idx + max_frame_num_to_track, num_frames - 1
	)
	processing_order = range(start_frame_idx, end_frame_idx + 1)
	return processing_order

	@torch.inference_mode()
	def propagate_in_video(
	self,
	inference_state,
	start_frame_idx,
	max_frame_num_to_track,
	reverse,
	tqdm_disable=False,
	obj_ids=None,
	run_mem_encoder=True,
	):
	"""Propagate the input points across frames to track in the entire video."""
	output_dict = inference_state["output_dict"]
	consolidated_frame_inds = inference_state["consolidated_frame_inds"]
	if obj_ids is not None:
	raise NotImplementedError(
	"Per-object tracking yet for batched inference if not implemented."
	)
	obj_ids = inference_state["obj_ids"]
	batch_size = self._get_obj_num(inference_state)
	if len(output_dict["cond_frame_outputs"]) == 0:
	raise RuntimeError("No points are provided; please add points first")
	clear_non_cond_mem = self.clear_non_cond_mem_around_input and (
	self.clear_non_cond_mem_for_multi_obj or batch_size <= 1
	)
	assert clear_non_cond_mem is False, "Not implemented"

	processing_order = self._get_processing_order(
	inference_state,
	start_frame_idx,
	max_frame_num_to_track,
	reverse,
	)

	for frame_idx in tqdm(
	processing_order, desc="propagate in video", disable=tqdm_disable
	):
	# We skip those frames already in consolidated outputs (these are frames
	# that received input clicks or mask). Note that we cannot directly run
	# batched forward on them via `_run_single_frame_inference` because the
	# number of clicks on each object might be different.
	if frame_idx in consolidated_frame_inds["cond_frame_outputs"]:
	storage_key = "cond_frame_outputs"
	current_out = output_dict[storage_key][frame_idx]
	pred_masks = current_out["pred_masks"]
	if clear_non_cond_mem:
	# clear non-conditioning memory of the surrounding frames
	self._clear_non_cond_mem_around_input(inference_state, frame_idx)
	elif frame_idx in consolidated_frame_inds["non_cond_frame_outputs"]:
	storage_key = "non_cond_frame_outputs"
	current_out = output_dict[storage_key][frame_idx]
	pred_masks = current_out["pred_masks"]
	else:
	storage_key = "non_cond_frame_outputs"
	with torch.profiler.record_function(
	"VideoTrackingMultiplexDemo._run_single_frame_inference"
	):
	current_out, pred_masks = self._run_single_frame_inference(
	inference_state=inference_state,
	output_dict=output_dict,
	frame_idx=frame_idx,
	batch_size=batch_size,
	is_init_cond_frame=False,
	point_inputs=None,
	mask_inputs=None,
	reverse=reverse,
	run_mem_encoder=run_mem_encoder,
	)
	current_out["local_obj_id_to_idx"] = deepcopy(
	inference_state["obj_id_to_idx"]
	)
	output_dict[storage_key][frame_idx] = current_out
	# Create slices of per-object outputs for subsequent interaction with each
	# individual object after tracking.
	self._add_output_per_object(
	inference_state, frame_idx, current_out, storage_key
	)
	inference_state["frames_already_tracked"][frame_idx] = {"reverse": reverse}

	# Resize the output mask to the original video resolution (we directly use
	# the mask scores on GPU for output to avoid any CPU conversion in between)
	low_res_masks, video_res_masks = self._get_orig_video_res_output(
	inference_state, pred_masks
	)
	yield frame_idx, obj_ids, low_res_masks, video_res_masks

	def _add_output_per_object(
	self, inference_state, frame_idx, current_out, storage_key
	):
	"""
	Split a multi-object output into per-object output slices and add them into
	`output_dict_per_obj`. The resulting slices share the same tensor storage.
	"""
	# Note for the multiplex model: we don't store the maskmem features
	# because we don't use the memory during interaction

	output_dict_per_obj = inference_state["output_dict_per_obj"]
	for obj_idx, obj_output_dict in output_dict_per_obj.items():
	obj_slice = slice(obj_idx, obj_idx + 1)
	obj_out = {
	"pred_masks": current_out["pred_masks"][obj_slice],
	"object_score_logits": current_out["object_score_logits"][obj_slice],
	}
	if self.use_memory_selection:
	obj_out["iou_score"] = current_out["iou_score"][obj_slice]
	obj_output_dict[storage_key][frame_idx] = obj_out

	@torch.inference_mode()
	def clear_all_points_in_frame(
	self,
	inference_state,
	frame_idx,
	obj_id,
	need_output=True,
	preserve_user_refined: bool = False,
	):
	"""Remove all input points or mask in a specific frame for a given object."""
	obj_idx = self._obj_id_to_idx(inference_state, obj_id)

	# Clear the conditioning information on the given frame
	inference_state["point_inputs_per_obj"][obj_idx].pop(frame_idx, None)
	inference_state["mask_inputs_per_obj"][obj_idx].pop(frame_idx, None)

	# Clear user refinement tracking for this frame and object unless preserving it
	if (
	not preserve_user_refined
	and "user_refined_frames_per_obj" in inference_state
	):
	user_refined_map = inference_state["user_refined_frames_per_obj"]
	if obj_id in user_refined_map:
	user_refined_map[obj_id].discard(frame_idx)

	temp_output_dict_per_obj = inference_state["temp_output_dict_per_obj"]
	temp_output_dict_per_obj[obj_idx]["cond_frame_outputs"].pop(frame_idx, None)
	temp_output_dict_per_obj[obj_idx]["non_cond_frame_outputs"].pop(frame_idx, None)

	# Check and see if there are still any inputs left on this frame
	batch_size = self._get_obj_num(inference_state)
	frame_has_input = False
	for obj_idx2 in range(batch_size):
	# Skip if this object doesn't exist in the input dictionaries
	if obj_idx2 not in inference_state["point_inputs_per_obj"]:
	continue
	if obj_idx2 not in inference_state["mask_inputs_per_obj"]:
	continue
	if frame_idx in inference_state["point_inputs_per_obj"][obj_idx2]:
	frame_has_input = True
	break
	if frame_idx in inference_state["mask_inputs_per_obj"][obj_idx2]:
	frame_has_input = True
	break

	# If this frame has no remaining inputs for any objects, we further clear its
	# conditioning frame status
	if not frame_has_input:
	output_dict = inference_state["output_dict"]
	consolidated_frame_inds = inference_state["consolidated_frame_inds"]
	consolidated_frame_inds["cond_frame_outputs"].discard(frame_idx)
	consolidated_frame_inds["non_cond_frame_outputs"].discard(frame_idx)
	# Remove the frame's conditioning output (possibly downgrading it to non-conditioning)
	out = output_dict["cond_frame_outputs"].pop(frame_idx, None)
	if out is not None:
	# The frame is not a conditioning frame anymore since it's not receiving inputs,
	# so we "downgrade" its output (if exists) to a non-conditioning frame output.
	output_dict["non_cond_frame_outputs"][frame_idx] = out
	inference_state["frames_already_tracked"].pop(frame_idx, None)
	# Similarly, do it for the sliced output on each object.
	for obj_idx2 in range(batch_size):
	# Skip if this object doesn't exist in the output dictionary
	if obj_idx2 not in inference_state["output_dict_per_obj"]:
	continue
	obj_output_dict = inference_state["output_dict_per_obj"][obj_idx2]
	obj_out = obj_output_dict["cond_frame_outputs"].pop(frame_idx, None)
	if obj_out is not None:
	obj_output_dict["non_cond_frame_outputs"][frame_idx] = obj_out

	# If all the conditioning frames have been removed, we also clear the tracking outputs
	if len(output_dict["cond_frame_outputs"]) == 0:
	self._reset_tracking_results(inference_state)

	if not need_output:
	return
	# Finally, output updated masks per object (after removing the inputs above)
	obj_ids = inference_state["obj_ids"]
	is_cond = any(
	frame_idx in obj_temp_output_dict["cond_frame_outputs"]
	for obj_temp_output_dict in temp_output_dict_per_obj.values()
	)
	consolidated_out = self._consolidate_temp_output_across_obj(
	inference_state,
	frame_idx,
	is_cond=is_cond,
	run_mem_encoder=False,
	consolidate_at_video_res=True,
	)
	_, video_res_masks = self._get_orig_video_res_output(
	inference_state, consolidated_out["pred_masks_video_res"]
	)
	low_res_masks = None # not needed by the demo
	return frame_idx, obj_ids, low_res_masks, video_res_masks

	@torch.inference_mode()
	def clear_all_points_in_video(self, inference_state):
	"""Remove all input points or mask in all frames throughout the video."""
	self._reset_tracking_results(inference_state)
	# Remove all object ids
	inference_state["obj_id_to_idx"].clear()
	inference_state["obj_idx_to_id"].clear()
	inference_state["obj_ids"].clear()
	inference_state["point_inputs_per_obj"].clear()
	inference_state["mask_inputs_per_obj"].clear()
	inference_state["output_dict_per_obj"].clear()
	inference_state["temp_output_dict_per_obj"].clear()
	inference_state["multiplex_state"] = None

	def _reset_tracking_results(self, inference_state):
	"""Reset all tracking inputs and results across the videos."""
	for v in inference_state["point_inputs_per_obj"].values():
	v.clear()
	for v in inference_state["mask_inputs_per_obj"].values():
	v.clear()
	for v in inference_state["output_dict_per_obj"].values():
	v["cond_frame_outputs"].clear()
	v["non_cond_frame_outputs"].clear()
	for v in inference_state["temp_output_dict_per_obj"].values():
	v["cond_frame_outputs"].clear()
	v["non_cond_frame_outputs"].clear()
	inference_state["output_dict"]["cond_frame_outputs"].clear()
	inference_state["output_dict"]["non_cond_frame_outputs"].clear()
	inference_state["consolidated_frame_inds"]["cond_frame_outputs"].clear()
	inference_state["consolidated_frame_inds"]["non_cond_frame_outputs"].clear()
	inference_state["tracking_has_started"] = False
	inference_state["frames_already_tracked"].clear()
	inference_state["first_ann_frame_idx"] = None

	def _get_image_feature(self, inference_state, frame_idx, batch_size):
	"""Compute the image features on a given frame."""
	# Look up in the cache first
	image, backbone_out = inference_state["cached_features"].get(
	frame_idx, (None, None)
	)
	if backbone_out is None:
	# Cache miss -- we will run inference on a single image
	image = inference_state["images"][frame_idx].cuda().float().unsqueeze(0)
	# TODO: We should optimize this because we don't always need all three outs
	backbone_out = self.forward_image(
	NestedTensor(tensors=image, mask=None),
	need_sam3_out=True,
	need_interactive_out=True,
	need_propagation_out=True,
	)
	# Cache the most recent frame's feature (for repeated interactions with
	# a frame; we can use an LRU cache for more frames in the future).
	inference_state["cached_features"] = {frame_idx: (image, backbone_out)}

	features = self._prepare_backbone_features(backbone_out)
	return image, features

	def _run_single_frame_inference(
	self,
	inference_state,
	output_dict,
	frame_idx,
	batch_size,
	is_init_cond_frame,
	point_inputs,
	mask_inputs,
	reverse,
	run_mem_encoder,
	prev_sam_mask_logits=None,
	add_to_existing_state: bool = False,
	new_obj_idxs: Optional[list[int]] = None,
	new_obj_ids: Optional[list[int]] = None,
	allow_new_buckets: bool = False,
	prefer_new_buckets: bool = False,
	reconditioning: bool = False,
	objects_to_interact: Optional[list[int]] = None,
	):
	"""Run tracking on a single frame based on current inputs and previous memory."""
	# Retrieve correct image features
	with torch.profiler.record_function(
	"VideoTrackingMultiplexDemo._get_image_feature"
	):
	image, backbone_features = self._get_image_feature(
	inference_state, frame_idx, batch_size
	)

	if add_to_existing_state or reconditioning:
	assert new_obj_idxs is not None
	assert new_obj_ids is not None

	backbone_features_interactive = backbone_features["interactive"]
	backbone_features_propagation = backbone_features["sam2_backbone_out"]

	if add_to_existing_state or reconditioning:
	with torch.profiler.record_function(
	"VideoTrackingMultiplexDemo.add_new_masks_to_existing_state"
	):
	# Get existing output from current frame to modify in-place
	# Try both storage keys since the output could be in either location
	existing_out = output_dict["cond_frame_outputs"].get(frame_idx)
	if existing_out is None:
	existing_out = output_dict["non_cond_frame_outputs"].get(frame_idx)
	if existing_out is None:
	raise RuntimeError(
	f"No existing output found for frame {frame_idx} in either storage"
	)

	# Prepare interactive features
	interactive_pix_feat = self._get_interactive_pix_mem(
	backbone_features_interactive["vision_feats"],
	backbone_features_interactive["feat_sizes"],
	)

	# High-resolution feature maps for the SAM head, reshape (HW)BC => BCHW
	interactive_high_res_features = [
	x.permute(1, 2, 0).view(x.size(1), x.size(2), *s)
	for x, s in zip(
	backbone_features_interactive["vision_feats"][:-1],
	backbone_features_interactive["feat_sizes"][:-1],
	)
	]

	# Prepare propagation features for memory encoding
	propagation_vision_feats = (
	backbone_features_propagation["vision_feats"]
	if run_mem_encoder
	else None
	)
	propagation_feat_sizes = (
	backbone_features_propagation["feat_sizes"]
	if run_mem_encoder
	else None
	)

	# Add new masks to existing state
	if reconditioning:
	self.recondition_masks_in_existing_state(
	interactive_pix_feat=interactive_pix_feat,
	interactive_high_res_features=interactive_high_res_features,
	propagation_vision_feats=propagation_vision_feats,
	propagation_feat_sizes=propagation_feat_sizes,
	new_masks=mask_inputs,
	obj_idxs_in_mask=new_obj_idxs,
	obj_ids_in_mask=new_obj_ids,
	prev_output=existing_out,
	multiplex_state=inference_state["multiplex_state"],
	add_mask_to_memory=run_mem_encoder,
	)
	else:
	# If we are adding to existing state using points (mask_inputs is None),
	# first convert points -> masks via the interactivity head.
	new_masks_from_points = None
	if mask_inputs is None and point_inputs is not None:
	with torch.profiler.record_function(
	"VideoTrackingMultiplexDemo.points_to_masks"
	):
	multimask_output = self._use_multimask(
	is_init_cond_frame, point_inputs=point_inputs
	)
	interaction_out = self._forward_sam_heads(
	backbone_features=interactive_pix_feat,
	point_inputs=point_inputs,
	mask_inputs=None,
	interactive_high_res_features=interactive_high_res_features,
	multimask_output=multimask_output,
	objects_to_interact=new_obj_idxs,
	multiplex_state=inference_state["multiplex_state"],
	)
	new_masks_from_points = interaction_out["low_res_masks"]

	self.add_new_masks_to_existing_state(
	interactive_pix_feat=interactive_pix_feat,
	interactive_high_res_features=interactive_high_res_features,
	propagation_vision_feats=propagation_vision_feats,
	propagation_feat_sizes=propagation_feat_sizes,
	new_masks=(
	mask_inputs
	if mask_inputs is not None
	else new_masks_from_points
	),
	obj_idxs_in_mask=new_obj_idxs,
	obj_ids_in_mask=new_obj_ids,
	prev_output=existing_out,
	multiplex_state=inference_state["multiplex_state"],
	add_mask_to_memory=run_mem_encoder,
	are_masks_from_pts=(mask_inputs is None),
	allow_new_buckets=allow_new_buckets,
	prefer_new_buckets=prefer_new_buckets,
	)

	# Return the modified existing output
	current_out = existing_out
	else:
	# point and mask should not appear as input simultaneously on the same frame
	assert point_inputs is None or mask_inputs is None
	with torch.profiler.record_function(
	"VideoTrackingMultiplexDemo.track_step"
	):
	current_out = self.track_step(
	frame_idx=frame_idx,
	is_init_cond_frame=is_init_cond_frame,
	backbone_features_interactive=backbone_features_interactive,
	backbone_features_propagation=backbone_features_propagation,
	image=image,
	point_inputs=point_inputs,
	mask_inputs=mask_inputs,
	gt_masks=None,
	frames_to_add_correction_pt=[],
	output_dict=output_dict,
	num_frames=inference_state["num_frames"],
	track_in_reverse=reverse,
	run_mem_encoder=run_mem_encoder,
	prev_sam_mask_logits=prev_sam_mask_logits,
	multiplex_state=inference_state["multiplex_state"],
	objects_to_interact=objects_to_interact,
	)

	# optionally offload the output to CPU memory to save GPU space
	storage_device = inference_state["storage_device"]
	if current_out.get("maskmem_features") is not None:
	maskmem_features = current_out["maskmem_features"]
	maskmem_features = maskmem_features.to(
	device=storage_device, dtype=torch.bfloat16, non_blocking=True
	)
	else:
	maskmem_features = None

	if current_out.get("image_features") is not None:
	assert "image_pos_enc" in current_out
	image_features = current_out["image_features"].to(
	storage_device, non_blocking=True
	)
	image_pos_enc = current_out["image_pos_enc"].to(
	storage_device, non_blocking=True
	)
	else:
	image_features = image_pos_enc = None

	pred_masks_gpu = current_out["pred_masks"]
	pred_masks = pred_masks_gpu.to(storage_device, non_blocking=True)
	# "maskmem_pos_enc" is the same across frames, so we only need to store one copy of it
	with torch.profiler.record_function(
	"VideoTrackingMultiplexDemo.maskmem_pos_enc"
	):
	maskmem_pos_enc = self._get_maskmem_pos_enc(inference_state, current_out)
	# object pointer is a small tensor, so we always keep it on GPU memory for fast access
	obj_ptr = current_out["obj_ptr"]
	object_score_logits = current_out["object_score_logits"]
	conditioning_objects = current_out["conditioning_objects"]
	# make a compact version of this frame's output to reduce the state size
	compact_current_out = {
	"maskmem_features": maskmem_features,
	"maskmem_pos_enc": maskmem_pos_enc,
	"image_features": image_features,
	"image_pos_enc": image_pos_enc,
	"pred_masks": pred_masks,
	"obj_ptr": obj_ptr,
	"object_score_logits": object_score_logits,
	"conditioning_objects": conditioning_objects,
	}
	if self.use_memory_selection:
	with torch.profiler.record_function(
	"VideoTrackingMultiplexDemo.use_memory_selection"
	):
	compact_current_out["iou_score"] = current_out["iou_score"]
	compact_current_out["eff_iou_score"] = self.cal_mem_score(
	object_score_logits, current_out["iou_score"]
	)
	return compact_current_out, pred_masks_gpu

	def _run_memory_encoder(
	self,
	inference_state,
	frame_idx,
	batch_size,
	high_res_masks,
	object_score_logits,
	is_mask_from_pts,
	conditioning_objects=None, # Accept as parameter
	):
	"""
	Run the memory encoder on `high_res_masks`. This is usually after applying
	non-overlapping constraints to object scores. Since their scores changed, their
	memory also need to be computed again with the memory encoder.
	"""
	# Retrieve correct image features
	image, backbone_features = self._get_image_feature(
	inference_state, frame_idx, batch_size
	)
	backbone_features_propagation = backbone_features["sam2_backbone_out"]
	propagation_vision_feats = backbone_features_propagation["vision_feats"]
	propagation_vision_pos_embeds = backbone_features_propagation[
	"vision_pos_embeds"
	]
	propagation_feat_sizes = backbone_features_propagation["feat_sizes"]

	# If conditioning_objects is not provided, look it up from output_dict
	if conditioning_objects is None:
	output_dict = inference_state["output_dict"]
	for storage_key in ["cond_frame_outputs", "non_cond_frame_outputs"]:
	storage = output_dict[storage_key]
	if frame_idx not in storage:
	continue
	conditioning_objects = storage[frame_idx]["conditioning_objects"]
	break
	else:
	raise ValueError(f"conditioning objects not found at {frame_idx=}")

	maskmem_features, maskmem_pos_enc = self._encode_new_memory(
	image=image,
	current_vision_feats=propagation_vision_feats,
	feat_sizes=propagation_feat_sizes,
	pred_masks_high_res=high_res_masks,
	object_score_logits=object_score_logits,
	is_mask_from_pts=is_mask_from_pts,
	conditioning_objects=conditioning_objects,
	multiplex_state=inference_state["multiplex_state"],
	)

	# optionally offload the output to CPU memory to save GPU space
	storage_device = inference_state["storage_device"]
	maskmem_features = maskmem_features.to(torch.bfloat16)
	maskmem_features = maskmem_features.to(storage_device, non_blocking=True)
	# "maskmem_pos_enc" is the same across frames, so we only need to store one copy of it
	maskmem_pos_enc = self._get_maskmem_pos_enc(
	inference_state, {"maskmem_pos_enc": maskmem_pos_enc}
	)

	image_features = propagation_vision_feats[-1]
	image_features = image_features.to(storage_device, non_blocking=True)
	image_pos_enc = propagation_vision_pos_embeds[-1]
	image_pos_enc = image_pos_enc.to(storage_device, non_blocking=True)
	return maskmem_features, maskmem_pos_enc, image_features, image_pos_enc

	def _get_maskmem_pos_enc(self, inference_state, current_out):
	"""
	`maskmem_pos_enc` is the same across frames and objects, so we cache it as
	a constant in the inference session to reduce session storage size.
	"""
	model_constants = inference_state["constants"]
	# "out_maskmem_pos_enc" should be either a list of tensors or None
	out_maskmem_pos_enc = current_out.get("maskmem_pos_enc")
	if out_maskmem_pos_enc is not None:
	if "maskmem_pos_enc" not in model_constants:
	assert isinstance(out_maskmem_pos_enc, list)
	# only take the slice for one object, since it's same across objects
	maskmem_pos_enc = [x[0:1].clone() for x in out_maskmem_pos_enc]
	model_constants["maskmem_pos_enc"] = maskmem_pos_enc
	else:
	maskmem_pos_enc = model_constants["maskmem_pos_enc"]
	# expand the cached maskmem_pos_enc to the actual batch size
	batch_size = out_maskmem_pos_enc[0].size(0)
	expanded_maskmem_pos_enc = [
	x.expand(batch_size, -1, -1, -1) for x in maskmem_pos_enc
	]
	else:
	expanded_maskmem_pos_enc = None
	return expanded_maskmem_pos_enc

	@torch.inference_mode()
	def remove_object(
	self,
	inference_state,
	obj_id: int,
	strict=False,
	need_output=True,
	clear_user_refined_map: bool = True,
	):
	"""
	Remove a single object from the tracking state.

	This is a convenience wrapper around remove_objects() for removing a single object.

	Args:
	inference_state: Current inference state
	obj_id: Object ID to remove
	strict: If True, raise error if object doesn't exist
	need_output: Whether to return updated frames

	Returns:
	Tuple of (remaining_obj_ids, updated_frames)
	"""
	return self.remove_objects(
	inference_state,
	obj_ids=[obj_id],
	strict=strict,
	need_output=need_output,
	clear_user_refined_map=clear_user_refined_map,
	)

	@torch.inference_mode()
	def remove_objects(
	self,
	inference_state,
	obj_ids: Iterable[int],
	strict=False,
	need_output=True,
	clear_user_refined_map: bool = True,
	):
	"""
	Remove a list of object ids from the tracking state. If strict is True, we check whether
	the object ids actually exist and raise an error if any of them don't exist.
	"""
	obj_ids = list(obj_ids)
	old_obj_idxs_to_rm = [
	inference_state["obj_id_to_idx"].get(obj_id, None) for obj_id in obj_ids
	]
	updated_frames = []
	actually_used_obj_ids = []
	removing_any = False
	for old_obj_idx_to_rm, obj_id in zip(old_obj_idxs_to_rm, obj_ids, strict=True):
	if old_obj_idx_to_rm is None:
	if strict:
	raise ValueError(
	f"Object id {obj_id} does not exist in the tracking state."
	)
	else:
	actually_used_obj_ids.append(obj_id)
	removing_any = True
	if not removing_any:
	return inference_state["obj_ids"], updated_frames

	# ignore any object IDs that don't exist
	old_obj_idxs_to_rm = [x for x in old_obj_idxs_to_rm if x is not None]
	obj_ids = actually_used_obj_ids
	removed_obj_ids = list(obj_ids)

	# There are still remaining objects after removing this object id. In this case,
	# we need to delete the object storage from inference state tensors.
	# Step 0: clear the input on those frames where this object id has point or mask input
	# (note that this step is required as it might downgrade conditioning frames to
	# non-conditioning ones)
	if clear_user_refined_map and "user_refined_frames_per_obj" in inference_state:
	user_refined_map = inference_state["user_refined_frames_per_obj"]
	for removed_obj_id in removed_obj_ids:
	if removed_obj_id in user_refined_map:
	user_refined_map.pop(removed_obj_id, None)

	all_obj_input_frames_inds = set()
	for old_obj_idx_to_rm, obj_id in zip(old_obj_idxs_to_rm, obj_ids, strict=True):
	obj_input_frames_inds = set()
	obj_input_frames_inds.update(
	inference_state["point_inputs_per_obj"][old_obj_idx_to_rm]
	)
	obj_input_frames_inds.update(
	inference_state["mask_inputs_per_obj"][old_obj_idx_to_rm]
	)
	for frame_idx in obj_input_frames_inds:
	self.clear_all_points_in_frame(
	inference_state,
	frame_idx,
	obj_id,
	need_output=False,
	preserve_user_refined=not clear_user_refined_map,
	)
	all_obj_input_frames_inds.update(obj_input_frames_inds)

	# Step 1: Update the object id mapping (note that it must be done after Step 0,
	# since Step 0 still requires the old object id mappings in inference_state)
	old_obj_ids = inference_state["obj_ids"]
	old_obj_inds = list(range(len(old_obj_ids)))
	remain_old_obj_inds = old_obj_inds.copy()
	for old_obj_idx_to_rm in old_obj_idxs_to_rm:
	remain_old_obj_inds.remove(old_obj_idx_to_rm)
	new_obj_ids = [old_obj_ids[old_idx] for old_idx in remain_old_obj_inds]
	new_obj_inds = list(range(len(new_obj_ids)))
	# build new mappings
	old_idx_to_new_idx = dict(zip(remain_old_obj_inds, new_obj_inds))
	inference_state["obj_id_to_idx"] = dict(zip(new_obj_ids, new_obj_inds))
	inference_state["obj_idx_to_id"] = dict(zip(new_obj_inds, new_obj_ids))
	inference_state["obj_ids"] = new_obj_ids

	if len(new_obj_ids) == 0:
	return new_obj_ids, updated_frames

	# Step 2: For per-object tensor storage, we shift their obj_idx in the dict keys.
	# (note that "consolidated_frame_inds" doesn't need to be updated in this step as
	# it's already handled in Step 0)
	def _map_keys(container):
	new_kvs = []
	for k in old_obj_inds:
	v = container.pop(k)
	if k in old_idx_to_new_idx:
	new_kvs.append((old_idx_to_new_idx[k], v))
	container.update(new_kvs)

	_map_keys(inference_state["point_inputs_per_obj"])
	_map_keys(inference_state["mask_inputs_per_obj"])
	_map_keys(inference_state["output_dict_per_obj"])
	_map_keys(inference_state["temp_output_dict_per_obj"])

	multiplex_state: MultiplexState = inference_state["multiplex_state"]
	# strict is set to True because we have done the filtering above
	buckets_to_keep = multiplex_state.remove_objects(
	old_obj_idxs_to_rm, strict=True
	)
	obj_ids = set(obj_ids)

	# Step 3: For packed tensor storage, we index the remaining ids and rebuild the per-bucket/per-object slices.
	def _slice_state(output_dict, storage_key):
	for frame_idx, out in output_dict[storage_key].items():
	if out.get("maskmem_features") is not None:
	out["maskmem_features"] = out["maskmem_features"][buckets_to_keep]
	if out.get("maskmem_pos_enc") is not None:
	out["maskmem_pos_enc"] = [
	x[buckets_to_keep] for x in out["maskmem_pos_enc"]
	]
	# "maskmem_pos_enc" is the same across frames, so we only need to store one copy of it
	out["maskmem_pos_enc"] = self._get_maskmem_pos_enc(inference_state, out)
	if out.get("obj_ptr") is not None:
	out["obj_ptr"] = out["obj_ptr"][buckets_to_keep]

	# Note that pred_maks and score_logits are stored in a per-object manner
	# When we add new objects, obj_id_to_idx mapping could be different
	# locally (at this past frame) versus globally (at the current frame),
	# so we need to use a local copy of this mapping
	local_obj_id_to_idx = out["local_obj_id_to_idx"]

	# Find which local indices correspond to the remaining old object indices
	local_remain_old_obj_inds = [
	obj_idx
	for obj_id, obj_idx in local_obj_id_to_idx.items()
	if obj_id not in obj_ids
	]

	# Guard against stale indices by intersecting with available rows
	max_pred = out["pred_masks"].shape[0]
	max_scores = out["object_score_logits"].shape[0]
	keep_indices = [
	idx
	for idx in local_remain_old_obj_inds
	if 0 <= idx < max_pred and 0 <= idx < max_scores
	]
	out["pred_masks"] = out["pred_masks"][keep_indices]
	out["object_score_logits"] = out["object_score_logits"][keep_indices]
	if self.use_memory_selection:
	out["iou_score"] = out["iou_score"][keep_indices]
	out["eff_iou_score"] = self.cal_mem_score(
	out["object_score_logits"], out["iou_score"]
	) # recalculate the memory frame score
	sliced_conditioning_objects = set()

	# Update local_obj_id_to_idx to reflect the new indices after removal
	new_local_obj_id_to_idx = {}
	old_to_new = {
	old_idx: new_i for new_i, old_idx in enumerate(keep_indices)
	}
	for obj_id, old_idx in local_obj_id_to_idx.items():
	if obj_id not in obj_ids: # Keep objects not being removed
	# Find the new index for this object if it was kept
	if old_idx in old_to_new:
	new_idx = old_to_new[old_idx]
	new_local_obj_id_to_idx[obj_id] = new_idx
	if old_idx in out["conditioning_objects"]:
	sliced_conditioning_objects.add(new_idx)

	out["local_obj_id_to_idx"] = new_local_obj_id_to_idx
	out["conditioning_objects"] = sliced_conditioning_objects

	# also update the per-object slices
	self._add_output_per_object(
	inference_state, frame_idx, out, storage_key
	)

	_slice_state(inference_state["output_dict"], "cond_frame_outputs")
	_slice_state(inference_state["output_dict"], "non_cond_frame_outputs")

	# Step 4: Further collect the outputs on those frames in `obj_input_frames_inds`, which
	# could show an updated mask for objects previously occluded by the object being removed
	if need_output:
	temp_output_dict_per_obj = inference_state["temp_output_dict_per_obj"]
	for frame_idx in all_obj_input_frames_inds:
	is_cond = any(
	frame_idx in obj_temp_output_dict["cond_frame_outputs"]
	for obj_temp_output_dict in temp_output_dict_per_obj.values()
	)
	consolidated_out = self._consolidate_temp_output_across_obj(
	inference_state,
	frame_idx,
	is_cond=is_cond,
	run_mem_encoder=False,
	consolidate_at_video_res=True,
	)
	_, video_res_masks = self._get_orig_video_res_output(
	inference_state, consolidated_out["pred_masks_video_res"]
	)
	updated_frames.append((frame_idx, video_res_masks))

	return inference_state["obj_ids"], updated_frames

	def _clear_non_cond_mem_around_input(self, inference_state, frame_idx):
	"""
	Remove the non-conditioning memory around the input frame. When users provide
	correction clicks, the surrounding frames' non-conditioning memories can still
	contain outdated object appearance information and could confuse the model.

	This function clears those non-conditioning memories surrounding the interacted
	frame to avoid giving the model both old and new information about the object.
	"""
	r = self.memory_temporal_stride_for_eval
	frame_idx_begin = frame_idx - r * self.num_maskmem
	frame_idx_end = frame_idx + r * self.num_maskmem
	output_dict = inference_state["output_dict"]
	non_cond_frame_outputs = output_dict["non_cond_frame_outputs"]
	for t in range(frame_idx_begin, frame_idx_end + 1):
	non_cond_frame_outputs.pop(t, None)
	for obj_output_dict in inference_state["output_dict_per_obj"].values():
	obj_output_dict["non_cond_frame_outputs"].pop(t, None)

	@torch.inference_mode()
	@torch.autocast(device_type="cuda", dtype=torch.bfloat16)
	def warm_up_compilation(
	self, offload_video_to_cpu=False, offload_state_to_cpu=False
	):
	"""
	Warm up the model by running a dummy inference to compile the model. This is
	useful to avoid the compilation overhead in the first inference call.
	"""
	if not self.compile_all_components:
	return

	raise NotImplementedError(
	"Please use `VideoTrackingMultiplexDemoPerBucketInference` instead for full model compilation."
	)


	class Sam3VideoTrackingMultiplexDemo(VideoTrackingMultiplexDemo):
	@torch.inference_mode()
	def init_state(
	self,
	video_height,
	video_width,
	num_frames,
	cached_features=None,
	offload_video_to_cpu=False,
	offload_state_to_cpu=False,
	):
	"""Initialize a inference state."""
	# Make sure that sigmoid is used on mask logits (should be True for all our recent models).
	# Since we rely on large negative values as scores for missing objects, the raw logits
	# cannot be consumed directly and must be converted into 0~1 range via sigmoid first.
	if not self.apply_sigmoid_to_mask_logits_for_mem_enc:
	raise NotImplementedError(
	"Multi-object tracking requires sigmoid in memory encoder for non-overlapping constraints."
	)
	inference_state = {}
	# inference_state["images"] = images
	inference_state["num_frames"] = num_frames
	# whether to offload the video frames to CPU memory
	# turning on this option saves the GPU memory with only a very small overhead
	inference_state["offload_video_to_cpu"] = offload_video_to_cpu
	# whether to offload the inference state to CPU memory
	# turning on this option saves the GPU memory at the cost of a lower tracking fps
	# (e.g. in a test case of 768x768 model, fps dropped from 27 to 24 when tracking one object
	# and from 24 to 21 when tracking two objects)
	inference_state["offload_state_to_cpu"] = offload_state_to_cpu
	# the original video height and width, used for resizing final output scores
	inference_state["video_height"] = video_height
	inference_state["video_width"] = video_width
	inference_state["device"] = get_accelerator_device()
	if offload_state_to_cpu:
	inference_state["storage_device"] = torch.device("cpu")
	else:
	inference_state["storage_device"] = get_accelerator_device()
	# inputs on each frame
	inference_state["point_inputs_per_obj"] = {}
	inference_state["mask_inputs_per_obj"] = {}
	# visual features on a small number of recently visited frames for quick interactions
	inference_state["cached_features"] = (
	{} if cached_features is None else cached_features
	)
	# values that don't change across frames (so we only need to hold one copy of them)
	inference_state["constants"] = {}
	# mapping between client-side object id and model-side object index
	inference_state["obj_id_to_idx"] = OrderedDict()
	inference_state["obj_idx_to_id"] = OrderedDict()
	inference_state["obj_ids"] = []
	# A storage to hold the model's tracking results and states on each frame
	inference_state["output_dict"] = {
	"cond_frame_outputs": {}, # dict containing {frame_idx: <out>}
	"non_cond_frame_outputs": {}, # dict containing {frame_idx: <out>}
	}
	# The index of the frame that received the first annotation
	inference_state["first_ann_frame_idx"] = None
	# Slice (view) of each object tracking results, sharing the same memory with "output_dict"
	inference_state["output_dict_per_obj"] = {}
	# A temporary storage to hold new outputs when user interact with a frame
	# to add clicks or mask (it's merged into "output_dict" before propagation starts)
	inference_state["temp_output_dict_per_obj"] = {}
	# Frames that already holds consolidated outputs from click or mask inputs
	# (we directly use their consolidated outputs during tracking)
	inference_state["consolidated_frame_inds"] = {
	"cond_frame_outputs": set(), # set containing frame indices
	"non_cond_frame_outputs": set(), # set containing frame indices
	}
	# metadata for each tracking frame (e.g. which direction it's tracked)
	inference_state["tracking_has_started"] = False
	inference_state["frames_already_tracked"] = {}
	inference_state["multiplex_state"] = None
	# Warm up the whole model and cache the image feature on frame 0
	# by making a dummy click on the first frame (and then cleaning it up)
	# self.add_new_points(
	# inference_state=inference_state,
	# frame_idx=0,
	# obj_id=1,
	# points=torch.tensor([[0.5, 0.5]], dtype=torch.float32),
	# labels=torch.tensor([1], dtype=torch.int32),
	# clear_old_points=True,
	# rel_coordinates=True,
	# )
	self.clear_all_points_in_video(inference_state)
	return inference_state

	def _suppress_shrinked_masks(
	self, pred_masks, new_pred_masks, shrink_threshold=0.3
	):
	area_before = (pred_masks > 0).sum(dim=(-1, -2))
	area_after = (new_pred_masks > 0).sum(dim=(-1, -2))
	area_before = torch.clamp(area_before, min=1.0)
	area_ratio = area_after / area_before
	keep = area_ratio >= shrink_threshold
	keep_mask = keep[..., None, None].expand_as(pred_masks)
	pred_masks_after = torch.where(
	keep_mask, pred_masks, torch.clamp(pred_masks, max=-10.0)
	)
	return pred_masks_after

	@staticmethod
	def _suppress_object_pw_area_shrinkage(pred_masks):
	"""
	This function suppresses masks that shrink in area after applying pixelwise non-overlapping constriants.
	Note that the final output can still be overlapping.
	"""
	# Apply pixel-wise non-overlapping constraint based on mask scores
	# pixel_level_non_overlapping_masks = super()._apply_non_overlapping_constraints(
	# pred_masks
	# )

	batch_size = pred_masks.size(0)
	if batch_size == 1:
	return pred_masks

	device = pred_masks.device
	# "max_obj_inds": object index of the object with the highest score at each location
	max_obj_inds = torch.argmax(pred_masks, dim=0, keepdim=True)
	# "batch_obj_inds": object index of each object slice (along dim 0) in `pred_masks`
	batch_obj_inds = torch.arange(batch_size, device=device)[:, None, None, None]
	keep = max_obj_inds == batch_obj_inds
	# suppress overlapping regions' scores below -10.0 so that the foreground regions
	# don't overlap (here sigmoid(-10.0)=4.5398e-05)
	pixel_level_non_overlapping_masks = torch.where(
	keep, pred_masks, torch.clamp(pred_masks, max=-10.0)
	)

	# Fully suppress masks with high shrinkage (probably noisy) based on the pixel wise non-overlapping constraints
	# NOTE: The output of this function can be a no op if none of the masks shrinked by a large factor.
	# pred_masks = self._suppress_shrinked_masks(
	# pred_masks, pixel_level_non_overlapping_masks
	# )

	shrink_threshold = 0.3
	area_before = (pred_masks > 0).sum(dim=(-1, -2))
	area_after = (pixel_level_non_overlapping_masks > 0).sum(dim=(-1, -2))
	area_before = torch.clamp(area_before, min=1.0)
	area_ratio = area_after / area_before
	keep = area_ratio >= shrink_threshold
	keep_mask = keep[..., None, None].expand_as(pred_masks)
	pred_masks_after = torch.where(
	keep_mask, pred_masks, torch.clamp(pred_masks, max=-10.0)
	)

	return pred_masks_after

	def _apply_object_wise_non_overlapping_constraints(
	self, pred_masks, obj_scores, background_value=-10.0
	):
	"""
	Applies non-overlapping constraints object wise (i.e. only one object can claim the overlapping region)
	"""
	# TODO: Try suppression based on IoM here as well.
	# Replace pixel scores with object scores
	pred_masks_single_score = torch.where(
	pred_masks > 0, obj_scores[..., None, None], background_value
	)
	# Apply pixel-wise non-overlapping constraint based on mask scores
	pixel_level_non_overlapping_masks = super()._apply_non_overlapping_constraints(
	pred_masks_single_score
	)
	# Replace object scores with pixel scores. Note, that now only one object can claim the overlapping region
	pred_masks = torch.where(
	pixel_level_non_overlapping_masks > 0,
	pred_masks,
	torch.clamp(pred_masks, max=background_value),
	)
	return pred_masks

	@torch.inference_mode()
	def propagate_in_video(
	self,
	inference_state,
	start_frame_idx,
	max_frame_num_to_track,
	reverse,
	tqdm_disable=False,
	obj_ids=None,
	run_mem_encoder=True,
	):
	"""Propagate the input points across frames to track in the entire video."""
	# NOTE: This is a copy from the parent class, except that we return object scores as well.
	output_dict = inference_state["output_dict"]
	consolidated_frame_inds = inference_state["consolidated_frame_inds"]
	if obj_ids is not None:
	raise NotImplementedError(
	"Per-object tracking yet for batched inference if not implemented."
	)
	obj_ids = inference_state["obj_ids"]
	batch_size = self._get_obj_num(inference_state)
	if len(output_dict["cond_frame_outputs"]) == 0:
	raise RuntimeError("No points are provided; please add points first")
	clear_non_cond_mem = self.clear_non_cond_mem_around_input and (
	self.clear_non_cond_mem_for_multi_obj or batch_size <= 1
	)

	processing_order = self._get_processing_order(
	inference_state,
	start_frame_idx,
	max_frame_num_to_track,
	reverse,
	)

	for frame_idx in tqdm(
	processing_order, desc="propagate in video", disable=tqdm_disable
	):
	# We skip those frames already in consolidated outputs (these are frames
	# that received input clicks or mask). Note that we cannot directly run
	# batched forward on them via `_run_single_frame_inference` because the
	# number of clicks on each object might be different.
	if frame_idx in consolidated_frame_inds["cond_frame_outputs"]:
	storage_key = "cond_frame_outputs"
	current_out = output_dict[storage_key][frame_idx]
	pred_masks = current_out["pred_masks"]
	obj_scores = current_out["object_score_logits"]
	if clear_non_cond_mem:
	# clear non-conditioning memory of the surrounding frames
	self._clear_non_cond_mem_around_input(inference_state, frame_idx)
	elif frame_idx in consolidated_frame_inds["non_cond_frame_outputs"]:
	storage_key = "non_cond_frame_outputs"
	current_out = output_dict[storage_key][frame_idx]
	pred_masks = current_out["pred_masks"]
	obj_scores = current_out["object_score_logits"]
	else:
	storage_key = "non_cond_frame_outputs"
	with torch.profiler.record_function(
	"VideoTrackingMultiplexDemo._run_single_frame_inference"
	):
	current_out, pred_masks = self._run_single_frame_inference(
	inference_state=inference_state,
	output_dict=output_dict,
	frame_idx=frame_idx,
	batch_size=batch_size,
	is_init_cond_frame=False,
	point_inputs=None,
	mask_inputs=None,
	reverse=reverse,
	run_mem_encoder=run_mem_encoder,
	)
	obj_scores = current_out["object_score_logits"]
	current_out["local_obj_id_to_idx"] = deepcopy(
	inference_state["obj_id_to_idx"]
	)
	output_dict[storage_key][frame_idx] = current_out

	# Create slices of per-object outputs for subsequent interaction with each
	# individual object after tracking.
	self._add_output_per_object(
	inference_state, frame_idx, current_out, storage_key
	)
	inference_state["frames_already_tracked"][frame_idx] = {"reverse": reverse}

	# Resize the output mask to the original video resolution (we directly use
	# the mask scores on GPU for output to avoid any CPU conversion in between)
	low_res_masks, video_res_masks = self._get_orig_video_res_output(
	inference_state, pred_masks
	)
	yield frame_idx, obj_ids, low_res_masks, video_res_masks, obj_scores