Spaces:

1ripon1
/

ColabWan

Build error

App Files Files Community

ColabWan / preprocessing /sam3 /model /sam3_multiplex_detector.py

1ripon1

Upload folder using huggingface_hub

7344bef verified 26 days ago

Raw

History Blame Contribute Delete

40.4 kB

	import os

	import torch
	from ..model.vl_combiner import SAM3VLBackbone

	try:
	from ..model.vl_combiner import SAM3VLBackboneTri
	except ImportError:
	SAM3VLBackboneTri = None
	from typing import Dict, List, Optional

	import numpy as np
	from ..model.data_misc import BatchedDatapoint, FindStage
	from ..model.geometry_encoders import Prompt
	from ..model.model_misc import SAM3Output
	from ..model.sam3_image import Sam3Image
	from ..model.sam3_multiplex_detector_utils import nms_masks


	class Sam3MultiplexImageBase(Sam3Image):
	"""A wrapper class to run Sam3Image on videos for per-frame detection (no tracking)."""

	def __init__(
	    self,
	    *args,
	    tracking_score_thresh: float = 0.0,
	    offload_outputs_to_cpu_for_eval: bool = False,
	    **kwargs,
	):
	    """Initialise the per-frame detection wrapper.

	    Args:
	        *args: Positional arguments forwarded unchanged to the parent
	            constructor.
	        tracking_score_thresh: Threshold that ``_get_dummy_object_ids``
	            compares the raw ``pred_logits`` against; queries that are not
	            strictly above it get the object id ``-1``.
	        offload_outputs_to_cpu_for_eval: When True, ``forward`` moves each
	            frame's outputs to CPU before storing them.
	        **kwargs: Keyword arguments forwarded unchanged to the parent
	            constructor.
	    """
	    # Both collections must be unpacked: passing ``args`` as a single tuple
	    # and splatting the *keys* of ``kwargs`` positionally would hand the
	    # parent constructor the wrong arguments.
	    super().__init__(*args, **kwargs)
	    self.tracking_score_thresh = tracking_score_thresh
	    self.offload_outputs_to_cpu_for_eval = offload_outputs_to_cpu_for_eval
	    self.trim_outputs_for_eval = True  # dummy option -- it doesn't do anything

	def forward(
	    self,
	    input: BatchedDatapoint,
	    is_inference=False,  # (a dummy parameter not used anymore)
	):
	    """Run detection independently on every frame of ``input``.

	    The text prompts are encoded once via ``self.backbone.forward_text``
	    and shared by all frames; each frame is then sent through
	    ``forward_video_grounding`` and its result appended as a new stage.

	    Args:
	        input: Batched datapoint providing ``img_batch``,
	            ``find_text_batch``, ``find_inputs`` and ``find_targets``.
	        is_inference: Ignored; kept for signature compatibility.

	    Returns:
	        ``(stages, None)`` where ``stages`` is a ``SAM3Output`` holding one
	        single-step stage per frame.
	    """
	    assert not self.training, (
	        "Sam3MultiplexImageBase should only be used in eval mode."
	    )

	    device = self.device
	    backbone_out = {"img_batch_all_stages": input.img_batch}
	    backbone_out.update(
	        self.backbone.forward_text(input.find_text_batch, device=device)
	    )

	    stages = SAM3Output(iter_mode=SAM3Output.IterMode.LAST_STEP_PER_STAGE)
	    for frame_idx, frame_find_input in enumerate(input.find_inputs):
	        frame_out, _ = self.forward_video_grounding(
	            backbone_out=backbone_out,
	            find_input=frame_find_input,
	            find_target=input.find_targets[frame_idx],
	            geometric_prompt=self._get_geo_prompt_from_find_input(
	                frame_find_input
	            ),
	        )
	        if self.offload_outputs_to_cpu_for_eval:
	            # Keep GPU memory flat during evaluation by parking results on CPU.
	            frame_out = {name: tensor.cpu() for name, tensor in frame_out.items()}
	        stages.append([frame_out])

	    return stages, None

	def forward_video_grounding(
	    self,
	    backbone_out,
	    find_input,
	    find_target,
	    geometric_prompt: Prompt,
	    **kwargs,
	):
	    """Run image grounding on one frame and keep only the essential outputs.

	    Extra keyword arguments are accepted and ignored.

	    Returns:
	        ``(out, backbone_out)`` where ``out`` holds ``pred_logits``,
	        ``pred_boxes``, ``pred_boxes_xyxy``, ``pred_masks`` and the
	        placeholder ``pred_object_ids``; ``backbone_out`` is returned as
	        received.
	    """
	    # Delegate to the single-image grounding path.
	    full_out = self.forward_grounding(
	        backbone_out=backbone_out,
	        find_input=find_input,
	        find_target=find_target,
	        geometric_prompt=geometric_prompt,
	    )
	    # Drop everything except the keys needed downstream.
	    kept_keys = ("pred_logits", "pred_boxes", "pred_boxes_xyxy", "pred_masks")
	    trimmed = {key: full_out[key] for key in kept_keys}
	    trimmed["pred_object_ids"] = self._get_dummy_object_ids(
	        full_out["pred_logits"]
	    )
	    return trimmed, backbone_out

	def _get_dummy_object_ids(self, pred_logits):
	"""Generate dummy object IDs for the detected objects, based on their detection query indices."""
	# Assuming pred_logits has shape [batch_size, num_queries, num_classes]
	B, Q, _ = pred_logits.shape
	is_above_thresh = pred_logits.squeeze(2) > self.tracking_score_thresh
	dummy_obj_ids = torch.arange(Q, device=self.device).expand(B, -1)
	dummy_obj_ids = torch.where(is_above_thresh, dummy_obj_ids, -1)
	return dummy_obj_ids

	def _trim_outputs(self, args, *kwargs):
	pass # not needed for image-on-video

	def _batch_find_inputs(
	    self,
	    find_inputs: List[FindStage],
	    chunk_start: int,
	    chunk_end: int,
	) -> FindStage:
	    """Merge the per-frame ``FindStage`` objects of a chunk into one.

	    ``img_ids`` / ``img_ids_np`` are regenerated as the absolute frame
	    indices ``chunk_start .. chunk_end - 1``; the frames themselves are
	    looked up modulo ``len(find_inputs)``. Every other tensor field is
	    concatenated along dim 0. Apart from ``text_ids``, a field is treated
	    as absent for the whole chunk (``None``) when it is ``None`` on the
	    chunk's first frame.

	    Args:
	        find_inputs: List of FindStage objects for all frames.
	        chunk_start: Start index of the chunk.
	        chunk_end: End index of the chunk (exclusive).

	    Returns:
	        A single FindStage with batched tensors; ``ptrs``, ``ptrs_seg``
	        and ``object_ids`` are left as ``None``.
	    """
	    frame_ids = list(range(chunk_start, chunk_end))
	    chunk_find_inputs = [find_inputs[i % len(find_inputs)] for i in frame_ids]

	    # Take device/dtype from the first frame so the regenerated ids match
	    # the existing ones.
	    reference_ids = chunk_find_inputs[0].img_ids
	    device = reference_ids.device
	    dtype = reference_ids.dtype

	    def cat_optional(field_name):
	        """Concatenate one field over the chunk, or None if it is unset."""
	        per_frame = [getattr(fi, field_name) for fi in chunk_find_inputs]
	        if per_frame[0] is None:
	            return None
	        return torch.cat(per_frame, dim=0)

	    return FindStage(
	        img_ids=torch.tensor(frame_ids, device=device, dtype=dtype),
	        img_ids_np=np.array(frame_ids),
	        text_ids=torch.cat([fi.text_ids for fi in chunk_find_inputs], dim=0),
	        input_boxes=cat_optional("input_boxes"),
	        input_boxes_mask=cat_optional("input_boxes_mask"),
	        input_boxes_label=cat_optional("input_boxes_label"),
	        input_points=cat_optional("input_points"),
	        input_points_mask=cat_optional("input_points_mask"),
	        ptrs=None,  # Not batching pointers for now
	        ptrs_seg=None,
	        object_ids=None,
	        input_boxes_before_embed=cat_optional("input_boxes_before_embed"),
	        input_points_before_embed=cat_optional("input_points_before_embed"),
	    )

	def _batch_geometric_prompts(
	self,
	geometric_prompts: List[Prompt],
	chunk_start: int,
	chunk_end: int,
	) -> Prompt:
	"""
	Batch multiple Prompt objects into a single batched Prompt.

	Args:
	geometric_prompts: List of Prompt objects for all frames.
	chunk_start: Start index of the chunk.
	chunk_end: End index of the chunk (exclusive).

	Returns:
	A single Prompt with batched tensors.
	"""
	chunk_prompts = [geometric_prompts[i] for i in range(chunk_start, chunk_end)]
	return self._batch_geometric_prompts_from_list(chunk_prompts)

	def _batch_geometric_prompts_from_list(
	    self,
	    chunk_prompts: List[Prompt],
	) -> Prompt:
	    """Combine several ``Prompt`` objects into a single batched ``Prompt``.

	    Each field is concatenated along the dimension used as its batch axis
	    here:

	    - ``box_embeddings``, ``box_labels``, ``point_embeddings``,
	      ``point_labels``: dim 1
	    - ``box_mask``, ``point_mask``: dim 0

	    A field that is ``None`` on the first prompt is returned as ``None``.

	    Args:
	        chunk_prompts: List of Prompt objects to batch.

	    Returns:
	        A single Prompt with batched tensors.
	    """

	    def cat_field(field_name, dim):
	        """Concatenate one Prompt field across the chunk (None if unset)."""
	        tensors = [getattr(prompt, field_name) for prompt in chunk_prompts]
	        if tensors[0] is None:
	            return None
	        return torch.cat(tensors, dim=dim)

	    return Prompt(
	        box_embeddings=cat_field("box_embeddings", 1),
	        box_mask=cat_field("box_mask", 0),
	        box_labels=cat_field("box_labels", 1),
	        point_embeddings=cat_field("point_embeddings", 1),
	        point_mask=cat_field("point_mask", 0),
	        point_labels=cat_field("point_labels", 1),
	    )


	class Sam3MultiplexDetector(Sam3MultiplexImageBase):
	def __init__(
	    self,
	    *args,
	    async_all_gather=True,
	    gather_backbone_out=None,
	    is_multiplex=False,
	    **kwargs,
	):
	    """Initialise the multi-GPU detector.

	    Args:
	        *args: Positional arguments forwarded unchanged to the parent
	            constructor.
	        async_all_gather: Passed as ``async_op`` to
	            ``torch.distributed.all_gather`` in ``_gather_tensor``.
	        gather_backbone_out: Whether backbone features are gathered along
	            with the detection outputs. ``None`` means "decide from the
	            backbone type": enabled for ``SAM3VLBackbone`` and, when that
	            class could be imported, ``SAM3VLBackboneTri``.
	        is_multiplex: Enables handling of the additional "interactive"
	            backbone features.
	        **kwargs: Keyword arguments forwarded unchanged to the parent
	            constructor.
	    """
	    # Both collections must be unpacked: passing ``args`` as a single tuple
	    # and splatting the *keys* of ``kwargs`` positionally would hand the
	    # parent constructor the wrong arguments.
	    super().__init__(*args, **kwargs)
	    # Distributed layout is read from the standard launcher environment
	    # variables, defaulting to a single process.
	    self.rank = int(os.getenv("RANK", "0"))
	    self.world_size = int(os.getenv("WORLD_SIZE", "1"))
	    self.async_all_gather = async_all_gather

	    # if gather_backbone is not set, default to gathering only for `SAM3VLBackbone`
	    if gather_backbone_out is None:
	        gather_backbone_out = isinstance(self.backbone, SAM3VLBackbone) or (
	            SAM3VLBackboneTri is not None
	            and isinstance(self.backbone, SAM3VLBackboneTri)
	        )
	    self.gather_backbone_out = gather_backbone_out
	    self.is_multiplex = is_multiplex

	def forward_video_grounding_multigpu(
	    self,
	    backbone_out,
	    find_inputs,
	    geometric_prompt: Prompt,
	    frame_idx,
	    num_frames,
	    # `multigpu_buffer` is a dict to cache FA outputs in a chunk between different calls
	    multigpu_buffer,
	    track_in_reverse=False,
	    # whether to also return the SAM2 backbone features (in addition to FA results)
	    return_sam2_backbone_feats=False,
	    # whether to perform NMS and suppress the scores of those detections removed by NMS
	    run_nms=False,
	    nms_prob_thresh=None,
	    nms_iou_thresh=None,
	    nms_use_iom=False,
	    # tracking bounds to respect max_frame_num_to_track
	    max_frame_num_to_track=None,
	    propagate_in_video_start_frame_idx=None,
	    # feature_cache for buffered backbone computation
	    feature_cache=None,
	    **kwargs,
	):
	    """
	    Compute the FA detection outputs in a distributed manner, where all GPUs process
	    a chunk of frames (equal to the number of GPUs) at once and store them in cache.

	    The call (1) makes sure the chunk containing ``frame_idx`` is in
	    ``multigpu_buffer`` and reads the frame's entry, (2) evicts the chunk
	    processed just before it, and (3) pre-computes the following chunk so
	    that its all-gather can overlap with later computation.

	    Args:
	        backbone_out: Backbone outputs; forwarded to the chunk builder and
	            returned unchanged.
	        find_inputs: Per-frame find inputs.
	        geometric_prompt: Prompt forwarded to the chunk builder.
	        frame_idx: Frame whose outputs are requested.
	        num_frames: Total number of frames.
	        multigpu_buffer: Dict ``frame_idx -> {key: (value, handle)}``,
	            mutated in place.
	        track_in_reverse: True when frames are visited in decreasing order.
	        return_sam2_backbone_feats: When False, backbone feature entries
	            are left out of the result (keys prefixed
	            ``interactive_backbone_`` / ``propagation_backbone_`` in
	            multiplex mode, ``sam2_backbone_`` otherwise).
	        run_nms, nms_prob_thresh, nms_iou_thresh, nms_use_iom: NMS
	            settings forwarded to the chunk builder.
	        max_frame_num_to_track: Optional cap on how far chunks may extend
	            from ``propagate_in_video_start_frame_idx`` (treated as 0 when
	            ``None``).
	        propagate_in_video_start_frame_idx: Frame where propagation began.
	        feature_cache: Forwarded to the chunk builder.
	        **kwargs: Accepted and ignored.

	    Returns:
	        ``(out, backbone_out)`` where ``out`` maps output names to the
	        values stored for ``frame_idx``.
	    """
	    world_size = self.world_size

	    # Calculate valid frame range based on max_frame_num_to_track
	    # We prevent pre-fetching beyond the tracking window relative to current frame
	    if max_frame_num_to_track is not None:
	        if propagate_in_video_start_frame_idx is None:
	            propagate_in_video_start_frame_idx = 0
	        if track_in_reverse:
	            # When going backwards, limit how far back we can go from current frame
	            valid_frame_start = max(
	                0,
	                propagate_in_video_start_frame_idx - max_frame_num_to_track + 1,
	            )
	            valid_frame_end = num_frames
	        else:
	            # When going forwards, limit how far ahead we can go from current frame
	            valid_frame_start = 0
	            valid_frame_end = min(
	                num_frames,
	                propagate_in_video_start_frame_idx + max_frame_num_to_track,
	            )
	    else:
	        # No tracking limit specified, use full video range
	        valid_frame_start = 0
	        valid_frame_end = num_frames

	    # NMS / cache settings shared by both chunk-building calls below, so
	    # the current chunk and the pre-fetched chunk are always post-processed
	    # with the same configuration (including `nms_use_iom`).
	    build_kwargs = dict(
	        backbone_out=backbone_out,
	        find_inputs=find_inputs,
	        geometric_prompt=geometric_prompt,
	        num_frames=num_frames,
	        multigpu_buffer=multigpu_buffer,
	        run_nms=run_nms,
	        nms_prob_thresh=nms_prob_thresh,
	        nms_iou_thresh=nms_iou_thresh,
	        nms_use_iom=nms_use_iom,
	        feature_cache=feature_cache,
	    )

	    # Step 1: fetch the FA outputs in the current chunk from buffer
	    frame_idx_curr_b = frame_idx - frame_idx % world_size
	    frame_idx_curr_e = min(frame_idx_curr_b + world_size, num_frames)

	    # Clamp the current chunk to the valid tracking range
	    frame_idx_curr_b = max(frame_idx_curr_b, valid_frame_start)
	    frame_idx_curr_e = min(frame_idx_curr_e, valid_frame_end)
	    # in case the current frame's FA results are not in the buffer yet, build the current chunk
	    # (this should only happen on the first chunk, since we are also building the next chunk below)
	    if frame_idx not in multigpu_buffer:
	        with torch.profiler.record_function("build_multigpu_buffer_next_chunk1"):
	            self._build_multigpu_buffer_next_chunk(
	                frame_idx_begin=frame_idx_curr_b,
	                frame_idx_end=frame_idx_curr_e,
	                **build_kwargs,
	            )

	    # read out the current frame's results from `multigpu_buffer`
	    if self.is_multiplex:
	        backbone_prefixes = ("interactive_backbone_", "propagation_backbone_")
	    else:
	        backbone_prefixes = ("sam2_backbone_",)
	    out = {}
	    for k, (v, handle) in multigpu_buffer[frame_idx].items():
	        if not return_sam2_backbone_feats and k.startswith(backbone_prefixes):
	            continue
	        if handle is not None:
	            handle.wait()  # wait for async all-gather to finish
	        out[k] = v

	    # Step 2: remove FA outputs of the previous chunk from cache to save GPU memory
	    if not track_in_reverse and frame_idx_curr_b - world_size >= 0:
	        frame_idx_prev_e = frame_idx_curr_b
	        frame_idx_prev_b = frame_idx_curr_b - world_size
	    elif track_in_reverse and frame_idx_curr_e < num_frames:
	        frame_idx_prev_b = frame_idx_curr_e
	        frame_idx_prev_e = min(frame_idx_prev_b + world_size, num_frames)
	    else:
	        frame_idx_prev_b = frame_idx_prev_e = None
	    if frame_idx_prev_b is not None:
	        for frame_idx_rm in range(frame_idx_prev_b, frame_idx_prev_e):
	            multigpu_buffer.pop(frame_idx_rm, None)

	    # Step 3: compute and cache FA outputs of the next chunk ahead of time
	    # (so that we can overlap computation with all-gather transfer)
	    # Respect tracking bounds when calculating next chunk
	    if not track_in_reverse and frame_idx_curr_e < valid_frame_end:
	        frame_idx_next_b = frame_idx_curr_e
	        frame_idx_next_e = min(frame_idx_next_b + world_size, valid_frame_end)
	    elif track_in_reverse and frame_idx_curr_b - world_size >= valid_frame_start:
	        frame_idx_next_e = frame_idx_curr_b
	        frame_idx_next_b = max(frame_idx_curr_b - world_size, valid_frame_start)
	    else:
	        frame_idx_next_b = frame_idx_next_e = None
	    if frame_idx_next_b is not None and frame_idx_next_b not in multigpu_buffer:
	        with torch.profiler.record_function("build_multigpu_buffer_next_chunk2"):
	            self._build_multigpu_buffer_next_chunk(
	                frame_idx_begin=frame_idx_next_b,
	                frame_idx_end=frame_idx_next_e,
	                **build_kwargs,
	            )

	    return out, backbone_out

	def _build_multigpu_buffer_next_chunk(
	self,
	backbone_out,
	find_inputs,
	geometric_prompt: Prompt,
	frame_idx_begin,
	frame_idx_end,
	num_frames,
	multigpu_buffer,
	run_nms=False,
	nms_prob_thresh=None,
	nms_iou_thresh=None,
	nms_use_iom=False,
	feature_cache=None,
	):
	"""Compute FA outputs on a chunk of frames and store their results in multigpu_buffer."""
	# each GPU computes FA on one frame in the chunk (in a round-robin manner)
	frame_idx_local_gpu = min(frame_idx_begin + self.rank, frame_idx_end - 1)
	# `forward_grounding` (from base class `Sam3MultiplexImageBase`) runs FA on a single frame
	with torch.profiler.record_function("forward_grounding"):
	out_local = self.forward_grounding(
	backbone_out=backbone_out,
	# HACK: Since find_inputs is on GPU having to realloc is expensive so changing the values in place for the prod usecase
	# i.e. when using the streaming frame loader resource instead of local file. For non-prod is always
	# frame_idx_local_gpu < len(find_inputs) so should be a no-op
	find_input=find_inputs[frame_idx_local_gpu % len(find_inputs)],
	find_target=None,
	geometric_prompt=geometric_prompt,
	feature_cache=feature_cache,
	)
	if run_nms:
	with torch.profiler.record_function("nms_masks"):
	# run NMS as a post-processing step on top of the detection outputs
	assert nms_prob_thresh is not None and nms_iou_thresh is not None
	pred_probs = out_local["pred_logits"].squeeze(-1).sigmoid()
	pred_masks = out_local["pred_masks"]
	# loop over text prompts (not an overhead for demo where there's only 1 prompt)
	for prompt_idx in range(pred_probs.size(0)):
	keep = nms_masks(
	pred_probs=pred_probs[prompt_idx],
	pred_masks=pred_masks[prompt_idx],
	prob_threshold=nms_prob_thresh,
	iou_threshold=nms_iou_thresh,
	nms_use_iom=nms_use_iom,
	do_compile=getattr(self, "compile_model", False),
	running_in_prod=getattr(self, "running_in_prod", False),
	)
	# set a very low threshold for those detections removed by NMS
	out_local["pred_logits"][prompt_idx, :, 0] -= 1e4 * (~keep).float()

	if self.gather_backbone_out:
	# gather the SAM 2 backbone features across GPUs
	if self.is_multiplex:
	# Note that we should not need to compute the interaction features every frame
	# TODO: rooms for optimization

	# Interaction features
	inte_feats = out_local["prev_encoder_out"]["backbone_out"][
	"interactive"
	]
	assert inte_feats["vision_mask"] is None
	assert (
	len(inte_feats["backbone_fpn"]) == 3
	) # SAM2 backbone always have 3 levels
	assert all(x.mask is None for x in inte_feats["backbone_fpn"])
	# cast the SAM2 backbone features to bfloat16 for all-gather (this is usually
	# a no-op, SAM2 backbone features are likely already in bfloat16 due to AMP)
	inte_backbone_fpn_bf16 = [
	x.to(torch.bfloat16) for x in inte_feats["backbone_fpn"]
	]
	inte_fpn0, inte_fpn_handle0 = self._gather_tensor(
	inte_backbone_fpn_bf16[0].tensors
	)
	inte_fpn1, inte_fpn_handle1 = self._gather_tensor(
	inte_backbone_fpn_bf16[1].tensors
	)
	inte_fpn2, inte_fpn_handle2 = self._gather_tensor(
	inte_backbone_fpn_bf16[2].tensors
	)
	# vision_pos_enc is the same on all frames, so no need to all-gather them
	inte_vision_pos_enc = inte_feats["vision_pos_enc"]

	feats = out_local["prev_encoder_out"]["backbone_out"]["sam2_backbone_out"]
	assert feats["vision_mask"] is None
	assert len(feats["backbone_fpn"]) == 3 # SAM2 backbone always have 3 levels
	assert all(x.mask is None for x in feats["backbone_fpn"])
	# cast the SAM2 backbone features to bfloat16 for all-gather (this is usually
	# a no-op, SAM2 backbone features are likely already in bfloat16 due to AMP)
	backbone_fpn_bf16 = [x.to(torch.bfloat16) for x in feats["backbone_fpn"]]
	fpn0, fpn_handle0 = self._gather_tensor(backbone_fpn_bf16[0].tensors)
	fpn1, fpn_handle1 = self._gather_tensor(backbone_fpn_bf16[1].tensors)
	fpn2, fpn_handle2 = self._gather_tensor(backbone_fpn_bf16[2].tensors)
	# vision_pos_enc is the same on all frames, so no need to all-gather them
	vision_pos_enc = feats["vision_pos_enc"]

	# trim the FA output to only include the necessary keys
	out_local = {
	"pred_logits": out_local["pred_logits"],
	"pred_boxes": out_local["pred_boxes"],
	"pred_boxes_xyxy": out_local["pred_boxes_xyxy"],
	"pred_masks": out_local["pred_masks"],
	"pred_object_ids": self._get_dummy_object_ids(out_local["pred_logits"]),
	}

	# gather the results: after this step, each GPU will receive FA outputs on
	# all frames in the chunk and store them in `multigpu_buffer`
	out_gathered = {k: self._gather_tensor(v) for k, v in out_local.items()}
	for rank in range(self.world_size):
	frame_idx_to_save = frame_idx_begin + rank
	if frame_idx_to_save >= num_frames:
	continue
	frame_buffer = {
	k: (v[rank], handle) for k, (v, handle) in out_gathered.items()
	}
	if self.gather_backbone_out:
	# also add gathered SAM 2 backbone features to frame_buffer
	if self.is_multiplex:
	frame_buffer["interactive_backbone_fpn_0"] = (
	inte_fpn0[rank],
	inte_fpn_handle0,
	)
	frame_buffer["interactive_backbone_fpn_1"] = (
	inte_fpn1[rank],
	inte_fpn_handle1,
	)
	frame_buffer["interactive_backbone_fpn_2"] = (
	inte_fpn2[rank],
	inte_fpn_handle2,
	)
	frame_buffer["interactive_backbone_pos_enc"] = (
	inte_vision_pos_enc,
	None,
	)
	frame_buffer["sam2_backbone_fpn_0"] = (fpn0[rank], fpn_handle0)
	frame_buffer["sam2_backbone_fpn_1"] = (fpn1[rank], fpn_handle1)
	frame_buffer["sam2_backbone_fpn_2"] = (fpn2[rank], fpn_handle2)
	frame_buffer["sam2_backbone_pos_enc"] = (vision_pos_enc, None)

	multigpu_buffer[frame_idx_to_save] = frame_buffer

	def _gather_tensor(self, x):
	if self.world_size == 1:
	return [x], None

	async_op = self.async_all_gather
	# here `.contiguous()` is required -- otherwise NCCL all_gather
	# sometimes gives wrong results (based on Ronghang's observations)
	x = x.contiguous() # ensure contiguous memory for NCCL
	output_list = [torch.empty_like(x) for _ in range(self.world_size)]
	handle = torch.distributed.all_gather(output_list, x, async_op=async_op)
	return output_list, handle

	def forward_video_grounding_batched_multigpu(
	    self,
	    backbone_out,
	    find_inputs,
	    geometric_prompt: Prompt,
	    frame_idx,
	    num_frames,
	    # `grounding_cache` is a dict to cache FA outputs in a chunk between different calls
	    grounding_cache,
	    track_in_reverse=False,
	    # whether to also return the SAM2 backbone features (in addition to FA results)
	    return_sam2_backbone_feats=False,
	    # whether to perform NMS and suppress the scores of those detections removed by NMS
	    run_nms=False,
	    nms_prob_thresh=None,
	    nms_iou_thresh=None,
	    nms_use_iom=False,
	    # tracking bounds to respect max_frame_num_to_track
	    max_frame_num_to_track=None,
	    propagate_in_video_start_frame_idx=None,
	    # feature_cache for buffered backbone computation
	    feature_cache=None,
	    # batch_size for batched forward_grounding (default: 16)
	    batch_size=16,
	):
	    """
	    Batched forward_grounding that processes chunks of frames together.

	    Unlike forward_video_grounding_multigpu which processes 1 frame per GPU per chunk,
	    this method runs ``forward_grounding`` once on up to ``batch_size``
	    consecutive frames, caches the batched result under
	    ``grounding_cache["grounding_buffer"][(chunk_start, chunk_end)]`` and
	    returns the slice belonging to ``frame_idx``. No cross-GPU
	    communication is performed by this method itself.

	    Args:
	        backbone_out: Dictionary containing backbone outputs and image batch.
	        find_inputs: List of FindStage objects for all frames.
	        geometric_prompt: Not read here; per-frame prompts are rebuilt
	            from ``find_inputs`` when a chunk is processed.
	        frame_idx: Current frame index to process.
	        num_frames: Total number of frames in the video.
	        grounding_cache: Dictionary to cache grounding outputs.
	        track_in_reverse: If True, processing in reverse frame order.
	        return_sam2_backbone_feats: Whether to also return SAM2 backbone features.
	        run_nms: Whether to perform NMS on detection outputs.
	        nms_prob_thresh: Probability threshold for NMS.
	        nms_iou_thresh: IoU threshold for NMS.
	        nms_use_iom: Whether to use IoM for NMS.
	        max_frame_num_to_track: Maximum number of frames to track.
	        propagate_in_video_start_frame_idx: Start frame index for propagation
	            (treated as 0 when ``None`` and a tracking limit is given).
	        feature_cache: Optional dictionary for backbone feature caching.
	        batch_size: Number of frames to batch together (default: 16).

	    Returns:
	        Tuple of (out, backbone_out) where out contains detection results for frame_idx.
	    """
	    # Exclusive upper bound of the frames a chunk may cover.
	    if max_frame_num_to_track is not None:
	        if propagate_in_video_start_frame_idx is None:
	            propagate_in_video_start_frame_idx = 0
	        if track_in_reverse:
	            # Reverse propagation visits start, start - 1, ...; the bound is
	            # exclusive, so it must sit one past the start frame or that
	            # frame would be cut out of its own chunk.
	            valid_frame_end = propagate_in_video_start_frame_idx + 1
	        else:
	            valid_frame_end = (
	                propagate_in_video_start_frame_idx + max_frame_num_to_track
	            )
	    else:
	        valid_frame_end = num_frames

	    # Initialize grounding_buffer if not present
	    if "grounding_buffer" not in grounding_cache:
	        grounding_cache["grounding_buffer"] = {}
	    grounding_buffer = grounding_cache["grounding_buffer"]

	    # Calculate chunk boundaries - use batch_size instead of world_size
	    chunk_start = (frame_idx // batch_size) * batch_size
	    chunk_end = min(chunk_start + batch_size, valid_frame_end)
	    # The chunk must always contain the requested frame; otherwise it would
	    # be empty or the per-frame slice below would fall outside the batch.
	    chunk_end = max(chunk_end, frame_idx + 1)
	    chunk_key = (chunk_start, chunk_end)

	    # Process chunk if not already cached
	    if chunk_key not in grounding_buffer:
	        # Free the neighbouring chunk first so both are never resident at once.
	        self._cleanup_previous_chunks_multigpu(
	            grounding_cache=grounding_cache,
	            current_chunk_key=chunk_key,
	            batch_size=batch_size,
	            num_frames=num_frames,
	            track_in_reverse=track_in_reverse,
	        )
	        with torch.profiler.record_function(
	            "forward_grounding_batched.process_chunk"
	        ):
	            chunk_outputs = self._process_grounding_chunk_batched(
	                backbone_out=backbone_out,
	                find_inputs=find_inputs,
	                chunk_start=chunk_start,
	                chunk_end=chunk_end,
	                run_nms=run_nms,
	                nms_prob_thresh=nms_prob_thresh,
	                nms_iou_thresh=nms_iou_thresh,
	                nms_use_iom=nms_use_iom,
	                feature_cache=feature_cache,
	                return_sam2_backbone_feats=return_sam2_backbone_feats,
	            )
	        grounding_buffer[chunk_key] = chunk_outputs

	    # Auto-cleanup previous chunks
	    self._cleanup_previous_chunks_multigpu(
	        grounding_cache=grounding_cache,
	        current_chunk_key=chunk_key,
	        batch_size=batch_size,
	        num_frames=num_frames,
	        track_in_reverse=track_in_reverse,
	    )

	    # Retrieve the cached output for this frame and slice it out
	    chunk_outputs = grounding_buffer[chunk_key]
	    local_idx = frame_idx - chunk_start
	    out = self._slice_batched_output(
	        chunk_outputs, local_idx, return_sam2_backbone_feats
	    )

	    return out, backbone_out

	def _process_grounding_chunk_batched(
	self,
	backbone_out,
	find_inputs,
	chunk_start: int,
	chunk_end: int,
	run_nms: bool,
	nms_prob_thresh,
	nms_iou_thresh,
	nms_use_iom: bool,
	feature_cache,
	return_sam2_backbone_feats: bool,
	):
	"""
	Process a chunk of frames through the full forward_grounding pipeline in batch.
	"""
	chunk_size = chunk_end - chunk_start

	# Build geometric prompts for the chunk
	chunk_geo_prompts = [
	self._get_geo_prompt_from_find_input(find_inputs[i % len(find_inputs)])
	for i in range(chunk_start, chunk_end)
	]

	# Batch the find_inputs for this chunk
	batched_find_input = self._batch_find_inputs(
	find_inputs, chunk_start, chunk_end
	)

	# Batch the geometric prompts
	batched_geometric_prompt = self._batch_geometric_prompts_from_list(
	chunk_geo_prompts
	)

	# Run forward_grounding on the batched input
	with torch.profiler.record_function("forward_grounding_batched.forward"):
	out = self.forward_grounding(
	backbone_out=backbone_out,
	find_input=batched_find_input,
	find_target=None,
	geometric_prompt=batched_geometric_prompt,
	feature_cache=feature_cache,
	)

	# Apply NMS per frame in the batch
	if run_nms:
	with torch.profiler.record_function("forward_grounding_batched.nms"):
	assert nms_prob_thresh is not None and nms_iou_thresh is not None
	pred_probs = out["pred_logits"].squeeze(-1).sigmoid()
	pred_masks = out["pred_masks"]
	# pred_probs shape: [batch_size, num_queries]
	# pred_masks shape: [batch_size, num_queries, H, W]
	# Use batched NMS to process all frames at once
	keep = nms_masks(
	pred_probs=pred_probs,
	pred_masks=pred_masks,
	prob_threshold=nms_prob_thresh,
	iou_threshold=nms_iou_thresh,
	nms_use_iom=nms_use_iom,
	do_compile=getattr(self, "compile_model", False),
	running_in_prod=getattr(self, "running_in_prod", False),
	)
	# Set a very low threshold for detections removed by NMS
	# keep shape: [batch_size, num_queries]
	out["pred_logits"][:, :, 0] -= 1e4 * (~keep).float()

	# Extract SAM2 backbone features if requested
	if return_sam2_backbone_feats and "prev_encoder_out" in out:
	backbone_data = out["prev_encoder_out"]["backbone_out"]
	if self.is_multiplex and "interactive" in backbone_data:
	out["_interactive_backbone"] = backbone_data["interactive"]
	if "sam2_backbone_out" in backbone_data:
	out["_sam2_backbone"] = backbone_data["sam2_backbone_out"]

	out["_chunk_size"] = chunk_size
	return out

	def _slice_batched_output(
	self,
	chunk_outputs,
	local_idx: int,
	return_sam2_backbone_feats: bool,
	):
	"""
	Slice a single frame's output from the batched chunk outputs.
	"""
	out = {}

	# Keys to slice at batch dimension
	batch_dim_keys = {
	"pred_logits",
	"pred_boxes",
	"pred_boxes_xyxy",
	"pred_masks",
	"pred_logits_o2m",
	"pred_boxes_o2m",
	"pred_boxes_xyxy_o2m",
	"pred_masks_o2m",
	"queries",
	"presence_logit_dec",
	}

	# Keys to skip
	skip_keys = {
	"_chunk_size",
	"_interactive_backbone",
	"_sam2_backbone",
	"prev_encoder_out",
	"encoder_hidden_states",
	"aux_outputs",
	}

	for key, value in chunk_outputs.items():
	if key in skip_keys:
	continue
	if key in batch_dim_keys and isinstance(value, torch.Tensor):
	out[key] = value[local_idx : local_idx + 1]
	elif isinstance(value, torch.Tensor):
	try:
	out[key] = value[local_idx : local_idx + 1]
	except (IndexError, RuntimeError):
	out[key] = value

	# Add object IDs
	if "pred_logits" in out:
	out["pred_object_ids"] = self._get_dummy_object_ids(out["pred_logits"])

	# Add SAM2 backbone features if requested
	if return_sam2_backbone_feats:
	if "_sam2_backbone" in chunk_outputs:
	sam2_bb = chunk_outputs["_sam2_backbone"]
	out["sam2_backbone_fpn_0"] = sam2_bb["backbone_fpn"][0].tensors[
	local_idx : local_idx + 1
	]
	out["sam2_backbone_fpn_1"] = sam2_bb["backbone_fpn"][1].tensors[
	local_idx : local_idx + 1
	]
	out["sam2_backbone_fpn_2"] = sam2_bb["backbone_fpn"][2].tensors[
	local_idx : local_idx + 1
	]
	out["sam2_backbone_pos_enc"] = [
	x[local_idx : local_idx + 1] for x in sam2_bb["vision_pos_enc"]
	]

	if self.is_multiplex and "_interactive_backbone" in chunk_outputs:
	inte_bb = chunk_outputs["_interactive_backbone"]
	out["interactive_backbone_fpn_0"] = inte_bb["backbone_fpn"][0].tensors[
	local_idx : local_idx + 1
	]
	out["interactive_backbone_fpn_1"] = inte_bb["backbone_fpn"][1].tensors[
	local_idx : local_idx + 1
	]
	out["interactive_backbone_fpn_2"] = inte_bb["backbone_fpn"][2].tensors[
	local_idx : local_idx + 1
	]
	out["interactive_backbone_pos_enc"] = [
	x[local_idx : local_idx + 1] for x in inte_bb["vision_pos_enc"]
	]

	return out

	def _cleanup_previous_chunks_multigpu(
	self,
	grounding_cache,
	current_chunk_key,
	batch_size: int,
	num_frames: int,
	track_in_reverse: bool,
	):
	"""Remove previous chunks from cache to save GPU memory."""
	chunk_start, chunk_end = current_chunk_key

	if not track_in_reverse:
	prev_chunk_start = chunk_start - batch_size
	if prev_chunk_start >= 0:
	prev_chunk_end = chunk_start
	prev_chunk_key = (prev_chunk_start, prev_chunk_end)

	# Cleanup grounding_buffer entry
	chunk = grounding_cache["grounding_buffer"].pop(prev_chunk_key, None)
	if chunk is not None:
	del chunk
	else:
	next_chunk_start = chunk_end
	if next_chunk_start < num_frames:
	next_chunk_end = min(next_chunk_start + batch_size, num_frames)
	next_chunk_key = (next_chunk_start, next_chunk_end)
	grounding_cache["grounding_buffer"].pop(next_chunk_key, None)