UWGZQ committed on
Commit
f72dd03
·
verified ·
1 Parent(s): 611ab28

Upload folder using huggingface_hub

Browse files
Files changed (2) hide show
  1. inference.py +0 -1
  2. resampler_utils/token_arrangement.py +111 -235
inference.py CHANGED
@@ -160,7 +160,6 @@ def run_single_video(model, processor, video_path, mask_path, out_dir, device, a
160
  text_token_ids_per_sample=text_token_ids_per_sample,
161
  timestamp_token_ids_per_batch=timestamp_token_ids_per_batch,
162
  grids_per_temporal_window_per_batch=grids_per_window_batch,
163
- use_resampler=True
164
  )
165
 
166
  gen_out = model.generate(
 
160
  text_token_ids_per_sample=text_token_ids_per_sample,
161
  timestamp_token_ids_per_batch=timestamp_token_ids_per_batch,
162
  grids_per_temporal_window_per_batch=grids_per_window_batch,
 
163
  )
164
 
165
  gen_out = model.generate(
resampler_utils/token_arrangement.py CHANGED
@@ -6,49 +6,31 @@ import math
6
 
7
  def rearrange_token(
8
  model,
9
- input_ids: torch.LongTensor, # [B, L]
10
- attention_mask: torch.LongTensor, # [B, L]
11
- pixel_values: Optional[torch.FloatTensor], # unused here (image path kept for API compatibility)
12
- image_grid_thw: Optional[torch.LongTensor], # unused here (image path kept for API compatibility)
13
- pixel_values_videos: Optional[torch.FloatTensor], # may be None
14
- video_grid_thw: Optional[torch.LongTensor], # may be None
15
- second_per_grid_ts: Optional[torch.Tensor], # may be None
16
-
17
- # Per-sample list of objects; each object is a 1D LongTensor of relative video-token indices (in the original video token stream)
18
  obj_token_indices_per_sample: List[List[torch.Tensor]],
19
 
20
- # Only mode3_traj_and_text is kept:
21
  obj_traj_start_id: Optional[int] = None,
22
  obj_traj_end_id: Optional[int] = None,
23
 
24
- # Required: List[sample][object] -> 1D LongTensor(ids)
25
  text_token_ids_per_sample: Optional[List[List[torch.Tensor]]] = None,
26
 
27
- timestamp_token_ids_per_batch=None, # List[sample][1D LongTensor(ids)]
28
- grids_per_temporal_window_per_batch=None, # List[sample] number of grids per temporal window
29
 
30
  labels: Optional[torch.LongTensor] = None,
31
  IGNORE_ID: int = -100,
32
 
33
- use_resampler: bool = True, # True → per-object resampling + linear (1D) positions
34
  use_second_resampler: bool = True,
35
- add_timestamp_token: bool = True, # whether to add timestamp token for each object window
36
  ):
37
- """
38
- Fixed simplifications:
39
- - insert_where: only "in_order" (no argument kept)
40
- - insertion_mode: only "mode3_traj_and_text"
41
- - perceiver_injection: only "visuals" (no time tokens injected into resampler)
42
-
43
- Returns:
44
- new_inputs_embeds: [B, Lmax, D]
45
- new_position_ids: [3, B, Lmax] (int32)
46
- new_attention_mask: [B, Lmax] (bool)
47
- rope_deltas: [B, 1] (long)
48
- cache_position: [Lmax] (int32)
49
- new_input_ids: [B, Lmax] (long)
50
- new_labels: [B, Lmax] or None (long)
51
- """
52
  dev = input_ids.device
53
  B, L = input_ids.shape
54
  cpu = torch.device("cpu")
@@ -62,7 +44,6 @@ def rearrange_token(
62
  assert grids_per_temporal_window_per_batch is not None and len(grids_per_temporal_window_per_batch) == B, \
63
  "add_timestamp_token=True requires grids_per_temporal_window_per_batch with length B."
64
  else:
65
- # still needed for window indexing if use_resampler path uses temporal windows
66
  assert grids_per_temporal_window_per_batch is not None and len(grids_per_temporal_window_per_batch) == B, \
67
  "grids_per_temporal_window_per_batch is required."
68
 
@@ -70,14 +51,14 @@ def rearrange_token(
70
  vt_id = int(model.config.video_token_id)
71
  vs_id = getattr(model.config, "vision_start_token_id", None)
72
  ve_id = getattr(model.config, "vision_end_token_id", None)
73
- pad_id = 151643 # align with original implementation
74
 
75
  # ---- (0+) temporal window meta ----
76
  assert video_grid_thw is not None, "video_grid_thw is required for temporal windowing"
77
  assert video_grid_thw.shape[0] == B and video_grid_thw.shape[1] == 3, \
78
  f"video_grid_thw should be ({B},3), got {video_grid_thw.shape}"
79
 
80
- grid_area_batch: List[int] = [] # per-sample spatial token count (H*W/4)
81
  temporal_window_size_batch = grids_per_temporal_window_per_batch
82
 
83
  # ---- (0) Compute visual features (with grad) ----
@@ -86,7 +67,7 @@ def rearrange_token(
86
  _vid = model.model.get_video_features(
87
  pixel_values_videos.type(model.model.visual.dtype), video_grid_thw
88
  )
89
- video_embeds = torch.cat(_vid, dim=0) if isinstance(_vid, (list, tuple)) else _vid # [N_vid, D]
90
  del pixel_values_videos, _vid
91
 
92
  # ---- (0.1) Resamplers ----
@@ -106,30 +87,18 @@ def rearrange_token(
106
  second_resampler_num_latents = int(second_resampler.n_latents)
107
 
108
  # ---- (1) Position ids preparation ----
109
- need_3d_rope = (not use_resampler)
110
- if need_3d_rope:
111
- with torch.no_grad():
112
- position_ids_full, _ = model.model.get_rope_index(
113
- input_ids=input_ids,
114
- image_grid_thw=image_grid_thw,
115
- video_grid_thw=video_grid_thw,
116
- second_per_grid_ts=second_per_grid_ts,
117
- attention_mask=attention_mask,
118
- ).to(cpu) # (3, B, L)
119
- else:
120
- position_ids_full = None
121
 
122
  # ---- (2) Move to CPU for sequence planning ----
123
  attn_cpu = attention_mask.to(cpu, dtype=torch.bool)
124
  ids_cpu = input_ids.to(cpu)
125
- pid_cpu = position_ids_full.to(cpu, dtype=torch.int32) if need_3d_rope else None
126
  lbls_cpu = labels.to(cpu) if labels is not None else None
127
 
128
  eff_lens: List[int] = []
129
  vid_idx_list: List[torch.Tensor] = []
130
  for b in range(B):
131
  video_grid_thw_b = video_grid_thw[b]
132
- # H*W/4 as integer
133
  grid_area = (int(video_grid_thw_b[1].item()) * int(video_grid_thw_b[2].item())) // 4
134
  grid_area_batch.append(int(grid_area))
135
 
@@ -144,7 +113,6 @@ def rearrange_token(
144
  else:
145
  vid_idx_list.append(torch.empty(0, dtype=torch.long))
146
 
147
- # ---- Global offsets into concatenated video_embeds for each sample ----
148
  vid_counts = [int(v.numel()) for v in vid_idx_list]
149
  vid_offsets: List[int] = [0] * B
150
  running = 0
@@ -154,26 +122,17 @@ def rearrange_token(
154
 
155
  # ---- (3) Length planning ----
156
  def _object_block_len(b: int, obj_i: int, sel_latent_len: int, rel_temporal_window_idx: torch.Tensor) -> int:
157
- """
158
- mode3_traj_and_text block length:
159
- [<traj_start>?] + [text] + [<VS>?] + [<ts>* + <vt_latents>*] + [<VE>?] + [<traj_end>?]
160
- where <ts>* and <vt_latents>* repeat per non-empty temporal window (resampler path),
161
- or raw selected video tokens (non-resampler path).
162
- """
163
  add = 0
164
 
165
  if obj_traj_start_id is not None:
166
  add += 1
167
 
168
- # text
169
  tlen = int(text_token_ids_per_sample[b][obj_i].numel())
170
  add += tlen
171
 
172
- # VS
173
  if vs_id is not None:
174
  add += 1
175
 
176
- # timestamps per unique window (if enabled)
177
  if add_timestamp_token and timestamp_token_ids_per_batch is not None:
178
  locs = rel_temporal_window_idx.unique()
179
  for loc in locs:
@@ -183,7 +142,6 @@ def rearrange_token(
183
  else:
184
  add += int(timestamp_token_ids_per_batch[b][-1].numel())
185
 
186
- # visual placeholder length (either resampled latents or raw selected tokens)
187
  add += int(sel_latent_len)
188
 
189
  # VE
@@ -230,19 +188,14 @@ def rearrange_token(
230
  rel = rel.to(cpu, dtype=torch.long)
231
  sel_len = int(rel.numel())
232
 
233
- if use_resampler:
234
- tokens_per_window = int(grid_area_batch[b] * int(temporal_window_size_batch[b]))
235
- rel_temporal_window_idx = rel // tokens_per_window if (tokens_per_window > 0) else torch.zeros_like(rel)
236
- nonempty_windows = int(rel_temporal_window_idx.unique().numel())
237
 
238
- if use_second_resampler and second_resampler_num_latents is not None:
239
- sel_len = int(second_resampler_num_latents) + int(resampler_num_latents) * nonempty_windows
240
- else:
241
- sel_len = int(resampler_num_latents) * nonempty_windows
242
  else:
243
- # Non-resampler: keep raw selected video tokens count
244
- tokens_per_window = int(grid_area_batch[b] * int(temporal_window_size_batch[b]))
245
- rel_temporal_window_idx = rel // tokens_per_window if (tokens_per_window > 0) else torch.zeros_like(rel)
246
 
247
  cur_total += _object_block_len(b, i, sel_len, rel_temporal_window_idx)
248
 
@@ -260,10 +213,10 @@ def rearrange_token(
260
 
261
  rows_for_video: List[torch.Tensor] = [torch.empty(0, dtype=torch.long) for _ in range(B)]
262
 
263
- batched_obj_rows: List[torch.Tensor] = [] # each: rows into video_embeds (visual-only)
264
- batched_obj_pos: List[torch.Tensor] = [] # each: destination positions [R]
265
  batched_obj_bids: List[int] = []
266
- batched_obj_lens: List[int] = [] # visual token lengths per (object-window)
267
 
268
  batched_second_rows: List[torch.Tensor] = []
269
  batched_second_pos: List[torch.Tensor] = []
@@ -289,16 +242,12 @@ def rearrange_token(
289
 
290
  dst = 0
291
 
292
- # No video tokens: copy through
293
  if vid_idx.numel() == 0:
294
  new_input_ids_cpu[b, :L_eff] = ids_b
295
  new_attention_mask_cpu[b, :L_eff] = msk_b
296
  if new_labels_cpu is not None and labs_b is not None:
297
  new_labels_cpu[b, :L_eff] = labs_b
298
- if need_3d_rope:
299
- new_position_ids_cpu[:, b, :L_eff] = pid_cpu[:, b, :L_eff]
300
- else:
301
- new_position_ids_cpu[:, b, :L_eff] = _text_pos_block(0, L_eff, dtype=torch.int32)
302
  continue
303
 
304
  v_s = int(vid_idx[0].item())
@@ -313,34 +262,14 @@ def rearrange_token(
313
  prefix_len = v_s
314
  suffix_len = L_eff - (v_e + 1)
315
 
316
- if need_3d_rope:
317
- pid_b = pid_cpu[:, b, :L_eff]
318
- pos_scalar = pid_b.max(dim=0).values
319
- first_video_scalar = int(pos_scalar[v_s + (1 if has_vs else 0)].item())
320
- last_video_scalar = int(pos_scalar[v_e - (1 if has_ve else 0)].item())
321
- vs_scalar = int(pos_scalar[v_s].item()) if has_vs else None
322
-
323
- min_video_scalar_base = int(first_video_scalar)
324
- max_video_scalar_base = int(last_video_scalar)
325
-
326
- # prefix
327
  if prefix_len > 0:
328
  new_input_ids_cpu[b, dst:dst + prefix_len] = ids_b[:prefix_len]
329
  new_attention_mask_cpu[b, dst:dst + prefix_len] = msk_b[:prefix_len]
330
  if new_labels_cpu is not None and labs_b is not None:
331
  new_labels_cpu[b, dst:dst + prefix_len] = labs_b[:prefix_len]
332
- if need_3d_rope:
333
- new_position_ids_cpu[:, b, dst:dst + prefix_len] = pid_b[:, :prefix_len]
334
- else:
335
- new_position_ids_cpu[:, b, dst:dst + prefix_len] = _text_pos_block(dst, prefix_len, dtype=torch.int32)
336
  dst += prefix_len
337
 
338
- # in_order only:
339
- if need_3d_rope:
340
- cursor = int(vs_scalar) if has_vs else int(first_video_scalar)
341
- else:
342
- cursor = dst
343
-
344
  Nv = int(vid_idx.numel())
345
  pos2rank = torch.full((L_eff,), -1, dtype=torch.long, device=cpu)
346
  if Nv > 0:
@@ -359,170 +288,128 @@ def rearrange_token(
359
  # (1) <obj_traj_start> (optional)
360
  if obj_traj_start_id is not None:
361
  new_input_ids_cpu[b, dst] = int(obj_traj_start_id)
362
- new_position_ids_cpu[:, b, dst:dst + 1] = _text_pos_block(cursor if need_3d_rope else dst, 1, dtype=torch.int32)
363
  if new_labels_cpu is not None:
364
  new_labels_cpu[b, dst] = IGNORE_ID
365
  new_attention_mask_cpu[b, dst] = True
366
  dst += 1
367
- if need_3d_rope:
368
- cursor += 1
369
 
370
  # (2) text tokens (required)
371
  txt_ids = text_token_ids_per_sample[b][i].to(cpu, dtype=torch.long)
372
  k = int(txt_ids.numel())
373
  if k > 0:
374
  new_input_ids_cpu[b, dst:dst + k] = txt_ids
375
- new_position_ids_cpu[:, b, dst:dst + k] = _text_pos_block(cursor if need_3d_rope else dst, k, dtype=torch.int32)
376
  if new_labels_cpu is not None:
377
  new_labels_cpu[b, dst:dst + k] = IGNORE_ID
378
  new_attention_mask_cpu[b, dst:dst + k] = True
379
  dst += k
380
- if need_3d_rope:
381
- cursor += k
382
 
383
  # (3) <VS> (optional)
384
  if vs_id is not None:
385
  new_input_ids_cpu[b, dst] = int(vs_id)
386
- new_position_ids_cpu[:, b, dst:dst + 1] = _text_pos_block(cursor if need_3d_rope else dst, 1, dtype=torch.int32)
387
  if new_labels_cpu is not None:
388
  new_labels_cpu[b, dst] = IGNORE_ID
389
  new_attention_mask_cpu[b, dst] = True
390
  dst += 1
391
- if need_3d_rope:
392
- cursor += 1
393
 
394
  # (4) video tokens
395
  if g.numel() > 0:
396
- if use_resampler:
397
- tokens_per_window = int(grid_area_batch[b] * int(temporal_window_size_batch[b]))
398
- rel_temporal_window_idx = rel // tokens_per_window if (tokens_per_window > 0) else torch.zeros_like(rel)
399
-
400
- # Loop only over windows that actually appear in rel (robust)
401
- W_eff = int(rel_temporal_window_idx.max().item()) + 1 if rel_temporal_window_idx.numel() > 0 else 0
402
-
403
- all_rows_list = []
404
- for w in range(W_eff):
405
- m_w = (rel_temporal_window_idx == w)
406
- if not torch.any(m_w):
407
- all_rows_list.append(torch.empty(0, dtype=torch.long, device=cpu))
408
- continue
409
- rel_w = rel[m_w]
410
- rows_w = rel_w + vid_offset
411
- all_rows_list.append(rows_w)
412
-
413
- # second resampler: global object summary
414
- if use_second_resampler and second_resampler is not None:
415
- rows_all = torch.cat([x for x in all_rows_list if x.numel() > 0], dim=0) if any(x.numel() > 0 for x in all_rows_list) \
416
- else torch.empty(0, dtype=torch.long, device=cpu)
417
-
418
- if rows_all.numel() > 0:
419
- R2 = int(second_resampler_num_latents)
420
- new_input_ids_cpu[b, dst:dst + R2] = int(vt_id)
421
- new_position_ids_cpu[:, b, dst:dst + R2] = _text_pos_block(cursor if need_3d_rope else dst, R2, dtype=torch.int32)
422
- if new_labels_cpu is not None:
423
- new_labels_cpu[b, dst:dst + R2] = IGNORE_ID
424
- new_attention_mask_cpu[b, dst:dst + R2] = True
425
-
426
- pos_idx2 = torch.arange(dst, dst + R2, dtype=torch.long, device=cpu)
427
- batched_second_rows.append(rows_all)
428
- batched_second_pos.append(pos_idx2)
429
- batched_second_bids.append(b)
430
- batched_second_oids.append(i)
431
-
432
- dst += R2
433
- if need_3d_rope:
434
- cursor += R2
435
-
436
- R = int(resampler_num_latents)
437
-
438
- for w in range(W_eff):
439
- m_w = (rel_temporal_window_idx == w)
440
- if not torch.any(m_w):
441
- continue
442
-
443
- # timestamp tokens (text-only; NOT injected into resampler)
444
- if add_timestamp_token and (timestamp_token_ids_per_batch is not None):
445
- loc = w
446
- if loc < len(timestamp_token_ids_per_batch[b]):
447
- ts_ids = timestamp_token_ids_per_batch[b][loc].to(cpu, dtype=torch.long)
448
- else:
449
- ts_ids = timestamp_token_ids_per_batch[b][-1].to(cpu, dtype=torch.long)
450
- kt = int(ts_ids.numel())
451
- assert kt > 0, "Timestamp token ids should not be empty."
452
-
453
- new_input_ids_cpu[b, dst:dst + kt] = ts_ids
454
- new_position_ids_cpu[:, b, dst:dst + kt] = _text_pos_block(cursor if need_3d_rope else dst, kt, dtype=torch.int32)
455
- if new_labels_cpu is not None:
456
- new_labels_cpu[b, dst:dst + kt] = IGNORE_ID
457
- new_attention_mask_cpu[b, dst:dst + kt] = True
458
- dst += kt
459
- if need_3d_rope:
460
- cursor += kt
461
-
462
- # reserve R vt slots for resampled latents
463
- new_input_ids_cpu[b, dst:dst + R] = int(vt_id)
464
- new_position_ids_cpu[:, b, dst:dst + R] = _text_pos_block(cursor if need_3d_rope else dst, R, dtype=torch.int32)
465
- if new_labels_cpu is not None:
466
- new_labels_cpu[b, dst:dst + R] = IGNORE_ID
467
- new_attention_mask_cpu[b, dst:dst + R] = True
468
-
469
- rel_w = rel[m_w]
470
- rows_w = rel_w + vid_offset
471
- pos_idx = torch.arange(dst, dst + R, dtype=torch.long, device=cpu)
472
 
473
- batched_obj_rows.append(rows_w)
474
- batched_obj_pos.append(pos_idx)
475
- batched_obj_bids.append(b)
476
- batched_obj_lens.append(int(rows_w.numel())) # visuals-only
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
477
 
478
- dst += R
479
- if need_3d_rope:
480
- cursor += R
481
 
482
- else:
483
- # Non-resampler: 3D RoPE positions for selected raw video tokens
484
- assert need_3d_rope, "Non-resampler path requires 3D RoPE positions."
485
- pid_vid = pid_b.index_select(1, g) # (3, Lv_sel)
486
-
487
- # in_order only: shift selected pid by delta
488
- delta = int(cursor - min_video_scalar_base)
489
- if delta != 0:
490
- pid_vid = pid_vid + delta
491
- cursor = max_video_scalar_base + delta + 1
492
-
493
- Lv_sel = int(g.numel())
494
- new_input_ids_cpu[b, dst:dst + Lv_sel] = int(vt_id)
495
- new_position_ids_cpu[:, b, dst:dst + Lv_sel] = pid_vid
496
  if new_labels_cpu is not None:
497
- new_labels_cpu[b, dst:dst + Lv_sel] = IGNORE_ID
498
- new_attention_mask_cpu[b, dst:dst + Lv_sel] = True
 
 
 
 
499
 
500
- ranks = pos2rank.index_select(0, g)
501
- rows = ranks + vid_offset
502
- rows_for_video[b] = torch.cat([rows_for_video[b], rows], dim=0)
503
- dst += Lv_sel
504
 
 
505
  # (5) <VE> (optional)
506
  if ve_id is not None:
507
  new_input_ids_cpu[b, dst] = int(ve_id)
508
- new_position_ids_cpu[:, b, dst:dst + 1] = _text_pos_block(cursor if need_3d_rope else dst, 1, dtype=torch.int32)
509
  if new_labels_cpu is not None:
510
  new_labels_cpu[b, dst] = IGNORE_ID
511
  new_attention_mask_cpu[b, dst] = True
512
  dst += 1
513
- if need_3d_rope:
514
- cursor += 1
515
 
516
  # (6) <obj_traj_end> (optional)
517
  if obj_traj_end_id is not None:
518
  new_input_ids_cpu[b, dst] = int(obj_traj_end_id)
519
- new_position_ids_cpu[:, b, dst:dst + 1] = _text_pos_block(cursor if need_3d_rope else dst, 1, dtype=torch.int32)
520
  if new_labels_cpu is not None:
521
  new_labels_cpu[b, dst] = IGNORE_ID
522
  new_attention_mask_cpu[b, dst] = True
523
  dst += 1
524
- if need_3d_rope:
525
- cursor += 1
526
 
527
  # suffix
528
  if suffix_len > 0:
@@ -533,7 +420,7 @@ def rearrange_token(
533
  new_attention_mask_cpu[b, dst:dst + seg] = msk_b[src_lo:src_hi]
534
  if new_labels_cpu is not None and labs_b is not None:
535
  new_labels_cpu[b, dst:dst + seg] = labs_b[src_lo:src_hi]
536
- new_position_ids_cpu[:, b, dst:dst + seg] = _text_pos_block(dst, seg, dtype=torch.int32) if not need_3d_rope else _text_pos_block(cursor, seg, dtype=torch.int32)
537
  dst += seg
538
 
539
  assert dst == L_new_each[b], f"sample {b}: dst={dst}, L_new={L_new_each[b]}"
@@ -547,17 +434,6 @@ def rearrange_token(
547
  base = tok_embed(new_input_ids)
548
  new_inputs_embeds = base.clone()
549
 
550
- # Non-resampler: copy raw video features at vt positions
551
- if (video_embeds is not None) and (not use_resampler) and any(r.numel() > 0 for r in rows_for_video):
552
- vemb = video_embeds.to(dev, dtype=new_inputs_embeds.dtype, non_blocking=True)
553
- for b in range(B):
554
- rows = rows_for_video[b]
555
- if rows.numel() == 0:
556
- continue
557
- vt_pos = torch.nonzero(new_input_ids[b] == vt_id, as_tuple=False).flatten()
558
- assert vt_pos.numel() == rows.numel(), f"video rows mismatch for sample {b}"
559
- new_inputs_embeds[b].index_copy_(0, vt_pos.to(dev), vemb.index_select(0, rows.to(dev)))
560
-
561
  # ---- (5.1) second resampler: object-level global summary ----
562
  if use_resampler and use_second_resampler and len(batched_second_rows) > 0:
563
  if video_embeds is None:
@@ -582,7 +458,7 @@ def rearrange_token(
582
  ar2 = torch.arange(L2_max, device=dev_emb).unsqueeze(0) if L2_max > 0 else torch.zeros(1, 0, device=dev_emb, dtype=torch.long)
583
  mask2 = (ar2 < lens2_t.unsqueeze(1)) if L2_max > 0 else torch.zeros(0, 0, device=dev_emb, dtype=torch.bool)
584
 
585
- y2 = second_resampler(x2, attention_mask=mask2) # [N_obj2, R2, D]
586
  y2 = y2.to(new_inputs_embeds.dtype)
587
 
588
  for j in range(N_obj2):
@@ -590,7 +466,7 @@ def rearrange_token(
590
  pos2 = batched_second_pos[j].to(dev)
591
  new_inputs_embeds[b_cur, pos2] = y2[j]
592
 
593
- # ---- (5.2) main resampler: visuals-only ----
594
  if use_resampler and len(batched_obj_rows) > 0:
595
  if video_embeds is None:
596
  raise RuntimeError("use_resampler=True but video_embeds is None.")
@@ -599,7 +475,7 @@ def rearrange_token(
599
  D = video_embeds.shape[-1]
600
 
601
  N_obj = len(batched_obj_rows)
602
- lens = torch.tensor(batched_obj_lens, device=dev_emb, dtype=torch.long) # [N_obj]
603
  L_max = int(lens.max().item()) if lens.numel() > 0 else 0
604
 
605
  seqs = []
@@ -607,13 +483,13 @@ def rearrange_token(
607
  if rows.numel() == 0:
608
  seqs.append(torch.zeros(0, D, device=dev_emb, dtype=dtype_emb))
609
  else:
610
- seqs.append(video_embeds.index_select(0, rows.to(dev_emb))) # [Lv_sel, D]
611
  x = torch.nn.utils.rnn.pad_sequence(seqs, batch_first=True) if len(seqs) > 0 else torch.zeros(0, 0, D, device=dev_emb, dtype=dtype_emb)
612
 
613
  ar = torch.arange(L_max, device=dev_emb).unsqueeze(0) if L_max > 0 else torch.zeros(1, 0, device=dev_emb, dtype=torch.long)
614
  mask = (ar < lens.unsqueeze(1)) if L_max > 0 else torch.zeros(0, 0, device=dev_emb, dtype=torch.bool)
615
 
616
- y = resampler(x, attention_mask=mask) # [N_obj, R, D]
617
  y = y.to(new_inputs_embeds.dtype)
618
 
619
  per_b_indices: List[List[int]] = [[] for _ in range(B)]
@@ -633,7 +509,7 @@ def rearrange_token(
633
  new_inputs_embeds[b, pos_b] = emb_b
634
 
635
  # ---- (6) rope_deltas / cache_position ----
636
- maxpos = new_position_ids.max(dim=0)[0].max(dim=1, keepdim=True)[0] # [B,1]
637
  rope_deltas = (maxpos + 1 - new_inputs_embeds.shape[1]).to(dtype=torch.long, device=dev)
638
  cache_position = torch.arange(new_inputs_embeds.shape[1], device=dev, dtype=torch.int32)
639
 
 
6
 
7
  def rearrange_token(
8
  model,
9
+ input_ids: torch.LongTensor,
10
+ attention_mask: torch.LongTensor,
11
+ pixel_values: Optional[torch.FloatTensor],
12
+ image_grid_thw: Optional[torch.LongTensor],
13
+ pixel_values_videos: Optional[torch.FloatTensor],
14
+ video_grid_thw: Optional[torch.LongTensor],
15
+ second_per_grid_ts: Optional[torch.Tensor],
16
+
 
17
  obj_token_indices_per_sample: List[List[torch.Tensor]],
18
 
 
19
  obj_traj_start_id: Optional[int] = None,
20
  obj_traj_end_id: Optional[int] = None,
21
 
 
22
  text_token_ids_per_sample: Optional[List[List[torch.Tensor]]] = None,
23
 
24
+ timestamp_token_ids_per_batch=None,
25
+ grids_per_temporal_window_per_batch=None,
26
 
27
  labels: Optional[torch.LongTensor] = None,
28
  IGNORE_ID: int = -100,
29
 
30
+ use_resampler: bool = True,
31
  use_second_resampler: bool = True,
32
+ add_timestamp_token: bool = True,
33
  ):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  dev = input_ids.device
35
  B, L = input_ids.shape
36
  cpu = torch.device("cpu")
 
44
  assert grids_per_temporal_window_per_batch is not None and len(grids_per_temporal_window_per_batch) == B, \
45
  "add_timestamp_token=True requires grids_per_temporal_window_per_batch with length B."
46
  else:
 
47
  assert grids_per_temporal_window_per_batch is not None and len(grids_per_temporal_window_per_batch) == B, \
48
  "grids_per_temporal_window_per_batch is required."
49
 
 
51
  vt_id = int(model.config.video_token_id)
52
  vs_id = getattr(model.config, "vision_start_token_id", None)
53
  ve_id = getattr(model.config, "vision_end_token_id", None)
54
+ pad_id = 151643
55
 
56
  # ---- (0+) temporal window meta ----
57
  assert video_grid_thw is not None, "video_grid_thw is required for temporal windowing"
58
  assert video_grid_thw.shape[0] == B and video_grid_thw.shape[1] == 3, \
59
  f"video_grid_thw should be ({B},3), got {video_grid_thw.shape}"
60
 
61
+ grid_area_batch: List[int] = []
62
  temporal_window_size_batch = grids_per_temporal_window_per_batch
63
 
64
  # ---- (0) Compute visual features (with grad) ----
 
67
  _vid = model.model.get_video_features(
68
  pixel_values_videos.type(model.model.visual.dtype), video_grid_thw
69
  )
70
+ video_embeds = torch.cat(_vid, dim=0) if isinstance(_vid, (list, tuple)) else _vid
71
  del pixel_values_videos, _vid
72
 
73
  # ---- (0.1) Resamplers ----
 
87
  second_resampler_num_latents = int(second_resampler.n_latents)
88
 
89
  # ---- (1) Position ids preparation ----
90
+ position_ids_full = None
 
 
 
 
 
 
 
 
 
 
 
91
 
92
  # ---- (2) Move to CPU for sequence planning ----
93
  attn_cpu = attention_mask.to(cpu, dtype=torch.bool)
94
  ids_cpu = input_ids.to(cpu)
95
+ pid_cpu = None
96
  lbls_cpu = labels.to(cpu) if labels is not None else None
97
 
98
  eff_lens: List[int] = []
99
  vid_idx_list: List[torch.Tensor] = []
100
  for b in range(B):
101
  video_grid_thw_b = video_grid_thw[b]
 
102
  grid_area = (int(video_grid_thw_b[1].item()) * int(video_grid_thw_b[2].item())) // 4
103
  grid_area_batch.append(int(grid_area))
104
 
 
113
  else:
114
  vid_idx_list.append(torch.empty(0, dtype=torch.long))
115
 
 
116
  vid_counts = [int(v.numel()) for v in vid_idx_list]
117
  vid_offsets: List[int] = [0] * B
118
  running = 0
 
122
 
123
  # ---- (3) Length planning ----
124
  def _object_block_len(b: int, obj_i: int, sel_latent_len: int, rel_temporal_window_idx: torch.Tensor) -> int:
 
 
 
 
 
 
125
  add = 0
126
 
127
  if obj_traj_start_id is not None:
128
  add += 1
129
 
 
130
  tlen = int(text_token_ids_per_sample[b][obj_i].numel())
131
  add += tlen
132
 
 
133
  if vs_id is not None:
134
  add += 1
135
 
 
136
  if add_timestamp_token and timestamp_token_ids_per_batch is not None:
137
  locs = rel_temporal_window_idx.unique()
138
  for loc in locs:
 
142
  else:
143
  add += int(timestamp_token_ids_per_batch[b][-1].numel())
144
 
 
145
  add += int(sel_latent_len)
146
 
147
  # VE
 
188
  rel = rel.to(cpu, dtype=torch.long)
189
  sel_len = int(rel.numel())
190
 
191
+ tokens_per_window = int(grid_area_batch[b] * int(temporal_window_size_batch[b]))
192
+ rel_temporal_window_idx = rel // tokens_per_window if (tokens_per_window > 0) else torch.zeros_like(rel)
193
+ nonempty_windows = int(rel_temporal_window_idx.unique().numel())
 
194
 
195
+ if use_second_resampler and second_resampler_num_latents is not None:
196
+ sel_len = int(second_resampler_num_latents) + int(resampler_num_latents) * nonempty_windows
 
 
197
  else:
198
+ sel_len = int(resampler_num_latents) * nonempty_windows
 
 
199
 
200
  cur_total += _object_block_len(b, i, sel_len, rel_temporal_window_idx)
201
 
 
213
 
214
  rows_for_video: List[torch.Tensor] = [torch.empty(0, dtype=torch.long) for _ in range(B)]
215
 
216
+ batched_obj_rows: List[torch.Tensor] = []
217
+ batched_obj_pos: List[torch.Tensor] = []
218
  batched_obj_bids: List[int] = []
219
+ batched_obj_lens: List[int] = []
220
 
221
  batched_second_rows: List[torch.Tensor] = []
222
  batched_second_pos: List[torch.Tensor] = []
 
242
 
243
  dst = 0
244
 
 
245
  if vid_idx.numel() == 0:
246
  new_input_ids_cpu[b, :L_eff] = ids_b
247
  new_attention_mask_cpu[b, :L_eff] = msk_b
248
  if new_labels_cpu is not None and labs_b is not None:
249
  new_labels_cpu[b, :L_eff] = labs_b
250
+ new_position_ids_cpu[:, b, :L_eff] = _text_pos_block(0, L_eff, dtype=torch.int32)
 
 
 
251
  continue
252
 
253
  v_s = int(vid_idx[0].item())
 
262
  prefix_len = v_s
263
  suffix_len = L_eff - (v_e + 1)
264
 
 
 
 
 
 
 
 
 
 
 
 
265
  if prefix_len > 0:
266
  new_input_ids_cpu[b, dst:dst + prefix_len] = ids_b[:prefix_len]
267
  new_attention_mask_cpu[b, dst:dst + prefix_len] = msk_b[:prefix_len]
268
  if new_labels_cpu is not None and labs_b is not None:
269
  new_labels_cpu[b, dst:dst + prefix_len] = labs_b[:prefix_len]
270
+ new_position_ids_cpu[:, b, dst:dst + prefix_len] = _text_pos_block(dst, prefix_len, dtype=torch.int32)
 
 
 
271
  dst += prefix_len
272
 
 
 
 
 
 
 
273
  Nv = int(vid_idx.numel())
274
  pos2rank = torch.full((L_eff,), -1, dtype=torch.long, device=cpu)
275
  if Nv > 0:
 
288
  # (1) <obj_traj_start> (optional)
289
  if obj_traj_start_id is not None:
290
  new_input_ids_cpu[b, dst] = int(obj_traj_start_id)
291
+ new_position_ids_cpu[:, b, dst:dst + 1] = _text_pos_block(dst, 1, dtype=torch.int32)
292
  if new_labels_cpu is not None:
293
  new_labels_cpu[b, dst] = IGNORE_ID
294
  new_attention_mask_cpu[b, dst] = True
295
  dst += 1
 
 
296
 
297
  # (2) text tokens (required)
298
  txt_ids = text_token_ids_per_sample[b][i].to(cpu, dtype=torch.long)
299
  k = int(txt_ids.numel())
300
  if k > 0:
301
  new_input_ids_cpu[b, dst:dst + k] = txt_ids
302
+ new_position_ids_cpu[:, b, dst:dst + k] = _text_pos_block(dst, k, dtype=torch.int32)
303
  if new_labels_cpu is not None:
304
  new_labels_cpu[b, dst:dst + k] = IGNORE_ID
305
  new_attention_mask_cpu[b, dst:dst + k] = True
306
  dst += k
 
 
307
 
308
  # (3) <VS> (optional)
309
  if vs_id is not None:
310
  new_input_ids_cpu[b, dst] = int(vs_id)
311
+ new_position_ids_cpu[:, b, dst:dst + 1] = _text_pos_block(dst, 1, dtype=torch.int32)
312
  if new_labels_cpu is not None:
313
  new_labels_cpu[b, dst] = IGNORE_ID
314
  new_attention_mask_cpu[b, dst] = True
315
  dst += 1
 
 
316
 
317
  # (4) video tokens
318
  if g.numel() > 0:
319
+ tokens_per_window = int(grid_area_batch[b] * int(temporal_window_size_batch[b]))
320
+ rel_temporal_window_idx = rel // tokens_per_window if (tokens_per_window > 0) else torch.zeros_like(rel)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
321
 
322
+ W_eff = int(rel_temporal_window_idx.max().item()) + 1 if rel_temporal_window_idx.numel() > 0 else 0
323
+
324
+ all_rows_list = []
325
+ for w in range(W_eff):
326
+ m_w = (rel_temporal_window_idx == w)
327
+ if not torch.any(m_w):
328
+ all_rows_list.append(torch.empty(0, dtype=torch.long, device=cpu))
329
+ continue
330
+ rel_w = rel[m_w]
331
+ rows_w = rel_w + vid_offset
332
+ all_rows_list.append(rows_w)
333
+
334
+ # second resampler: global object summary
335
+ if use_second_resampler and second_resampler is not None:
336
+ rows_all = torch.cat([x for x in all_rows_list if x.numel() > 0], dim=0) if any(x.numel() > 0 for x in all_rows_list) \
337
+ else torch.empty(0, dtype=torch.long, device=cpu)
338
+
339
+ if rows_all.numel() > 0:
340
+ R2 = int(second_resampler_num_latents)
341
+ new_input_ids_cpu[b, dst:dst + R2] = int(vt_id)
342
+ new_position_ids_cpu[:, b, dst:dst + R2] = _text_pos_block( dst, R2, dtype=torch.int32)
343
+ if new_labels_cpu is not None:
344
+ new_labels_cpu[b, dst:dst + R2] = IGNORE_ID
345
+ new_attention_mask_cpu[b, dst:dst + R2] = True
346
+
347
+ pos_idx2 = torch.arange(dst, dst + R2, dtype=torch.long, device=cpu)
348
+ batched_second_rows.append(rows_all)
349
+ batched_second_pos.append(pos_idx2)
350
+ batched_second_bids.append(b)
351
+ batched_second_oids.append(i)
352
+
353
+ dst += R2
354
+
355
+ R = int(resampler_num_latents)
356
+
357
+ for w in range(W_eff):
358
+ m_w = (rel_temporal_window_idx == w)
359
+ if not torch.any(m_w):
360
+ continue
361
+
362
+ # timestamp tokens (text-only; NOT injected into resampler)
363
+ if add_timestamp_token and (timestamp_token_ids_per_batch is not None):
364
+ loc = w
365
+ if loc < len(timestamp_token_ids_per_batch[b]):
366
+ ts_ids = timestamp_token_ids_per_batch[b][loc].to(cpu, dtype=torch.long)
367
+ else:
368
+ ts_ids = timestamp_token_ids_per_batch[b][-1].to(cpu, dtype=torch.long)
369
+ kt = int(ts_ids.numel())
370
+ assert kt > 0, "Timestamp token ids should not be empty."
371
+
372
+ new_input_ids_cpu[b, dst:dst + kt] = ts_ids
373
+ new_position_ids_cpu[:, b, dst:dst + kt] = _text_pos_block(dst, kt, dtype=torch.int32)
374
+ if new_labels_cpu is not None:
375
+ new_labels_cpu[b, dst:dst + kt] = IGNORE_ID
376
+ new_attention_mask_cpu[b, dst:dst + kt] = True
377
+ dst += kt
378
 
 
 
 
379
 
380
+ new_input_ids_cpu[b, dst:dst + R] = int(vt_id)
381
+ new_position_ids_cpu[:, b, dst:dst + R] = _text_pos_block(dst, R, dtype=torch.int32)
 
 
 
 
 
 
 
 
 
 
 
 
382
  if new_labels_cpu is not None:
383
+ new_labels_cpu[b, dst:dst + R] = IGNORE_ID
384
+ new_attention_mask_cpu[b, dst:dst + R] = True
385
+
386
+ rel_w = rel[m_w]
387
+ rows_w = rel_w + vid_offset
388
+ pos_idx = torch.arange(dst, dst + R, dtype=torch.long, device=cpu)
389
 
390
+ batched_obj_rows.append(rows_w)
391
+ batched_obj_pos.append(pos_idx)
392
+ batched_obj_bids.append(b)
393
+ batched_obj_lens.append(int(rows_w.numel()))
394
 
395
+ dst += R
396
  # (5) <VE> (optional)
397
  if ve_id is not None:
398
  new_input_ids_cpu[b, dst] = int(ve_id)
399
+ new_position_ids_cpu[:, b, dst:dst + 1] = _text_pos_block(dst, 1, dtype=torch.int32)
400
  if new_labels_cpu is not None:
401
  new_labels_cpu[b, dst] = IGNORE_ID
402
  new_attention_mask_cpu[b, dst] = True
403
  dst += 1
 
 
404
 
405
  # (6) <obj_traj_end> (optional)
406
  if obj_traj_end_id is not None:
407
  new_input_ids_cpu[b, dst] = int(obj_traj_end_id)
408
+ new_position_ids_cpu[:, b, dst:dst + 1] = _text_pos_block(dst, 1, dtype=torch.int32)
409
  if new_labels_cpu is not None:
410
  new_labels_cpu[b, dst] = IGNORE_ID
411
  new_attention_mask_cpu[b, dst] = True
412
  dst += 1
 
 
413
 
414
  # suffix
415
  if suffix_len > 0:
 
420
  new_attention_mask_cpu[b, dst:dst + seg] = msk_b[src_lo:src_hi]
421
  if new_labels_cpu is not None and labs_b is not None:
422
  new_labels_cpu[b, dst:dst + seg] = labs_b[src_lo:src_hi]
423
+ new_position_ids_cpu[:, b, dst:dst + seg] = _text_pos_block(dst, seg, dtype=torch.int32)
424
  dst += seg
425
 
426
  assert dst == L_new_each[b], f"sample {b}: dst={dst}, L_new={L_new_each[b]}"
 
434
  base = tok_embed(new_input_ids)
435
  new_inputs_embeds = base.clone()
436
 
 
 
 
 
 
 
 
 
 
 
 
437
  # ---- (5.1) second resampler: object-level global summary ----
438
  if use_resampler and use_second_resampler and len(batched_second_rows) > 0:
439
  if video_embeds is None:
 
458
  ar2 = torch.arange(L2_max, device=dev_emb).unsqueeze(0) if L2_max > 0 else torch.zeros(1, 0, device=dev_emb, dtype=torch.long)
459
  mask2 = (ar2 < lens2_t.unsqueeze(1)) if L2_max > 0 else torch.zeros(0, 0, device=dev_emb, dtype=torch.bool)
460
 
461
+ y2 = second_resampler(x2, attention_mask=mask2)
462
  y2 = y2.to(new_inputs_embeds.dtype)
463
 
464
  for j in range(N_obj2):
 
466
  pos2 = batched_second_pos[j].to(dev)
467
  new_inputs_embeds[b_cur, pos2] = y2[j]
468
 
469
+ # ---- (5.2) main resampler: temporal resampler----
470
  if use_resampler and len(batched_obj_rows) > 0:
471
  if video_embeds is None:
472
  raise RuntimeError("use_resampler=True but video_embeds is None.")
 
475
  D = video_embeds.shape[-1]
476
 
477
  N_obj = len(batched_obj_rows)
478
+ lens = torch.tensor(batched_obj_lens, device=dev_emb, dtype=torch.long)
479
  L_max = int(lens.max().item()) if lens.numel() > 0 else 0
480
 
481
  seqs = []
 
483
  if rows.numel() == 0:
484
  seqs.append(torch.zeros(0, D, device=dev_emb, dtype=dtype_emb))
485
  else:
486
+ seqs.append(video_embeds.index_select(0, rows.to(dev_emb)))
487
  x = torch.nn.utils.rnn.pad_sequence(seqs, batch_first=True) if len(seqs) > 0 else torch.zeros(0, 0, D, device=dev_emb, dtype=dtype_emb)
488
 
489
  ar = torch.arange(L_max, device=dev_emb).unsqueeze(0) if L_max > 0 else torch.zeros(1, 0, device=dev_emb, dtype=torch.long)
490
  mask = (ar < lens.unsqueeze(1)) if L_max > 0 else torch.zeros(0, 0, device=dev_emb, dtype=torch.bool)
491
 
492
+ y = resampler(x, attention_mask=mask)
493
  y = y.to(new_inputs_embeds.dtype)
494
 
495
  per_b_indices: List[List[int]] = [[] for _ in range(B)]
 
509
  new_inputs_embeds[b, pos_b] = emb_b
510
 
511
  # ---- (6) rope_deltas / cache_position ----
512
+ maxpos = new_position_ids.max(dim=0)[0].max(dim=1, keepdim=True)[0]
513
  rope_deltas = (maxpos + 1 - new_inputs_embeds.shape[1]).to(dtype=torch.long, device=dev)
514
  cache_position = torch.arange(new_inputs_embeds.shape[1], device=dev, dtype=torch.int32)
515