Update code for transformers 5.5.4

#4
by sjzhou - opened
Files changed (1)
  1. modeling_moss_vl.py +268 -1317
modeling_moss_vl.py CHANGED
@@ -14,21 +14,18 @@
14
  # limitations under the License.
15
  """PyTorch MossVL model - Qwen3VL Vision + Text with Cross Attention"""
16
 
17
- import copy
18
  from dataclasses import dataclass
19
- import queue
20
- import threading
21
- from typing import Any, Callable, Dict, Optional, Union, Tuple, List
22
 
23
  import torch
24
  import torch.nn as nn
25
  import torch.nn.functional as F
26
 
 
 
27
  from transformers.activations import ACT2FN
28
  from transformers.cache_utils import Cache, DynamicCache
29
  from transformers.generation import GenerationMixin
30
- from transformers.generation.stopping_criteria import StoppingCriteria, StoppingCriteriaList
31
- from transformers.generation.streamers import TextIteratorStreamer
32
  from transformers.integrations import use_kernel_forward_from_hub
33
  from transformers.masking_utils import create_causal_mask
34
  from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
@@ -39,7 +36,8 @@ from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
39
  from transformers.processing_utils import Unpack
40
  from transformers.utils import TransformersKwargs, auto_docstring, is_torchdynamo_compiling, logging
41
  from transformers.utils.deprecation import deprecate_kwarg
42
- from transformers.utils.generic import OutputRecorder
 
43
 
44
  from .configuration_moss_vl import MossVLConfig, MossVLTextConfig, MossVLVisionConfig
45
 
@@ -47,58 +45,6 @@ from .configuration_moss_vl import MossVLConfig, MossVLTextConfig, MossVLVisionC
47
 
48
  logger = logging.get_logger(__name__)
49
 
50
- _OFFLINE_SYSTEM_PROMPTS = {
51
- "no_thinking": {
52
- "text_image": "You are a helpful AI assistant. Respond to the user's request based on the provided text and/or images.",
53
- "video": "You are a helpful AI assistant specializing in video analysis. Respond to the user's request based on the provided video content.",
54
- },
55
- "deep_thinking": {
56
- "text_image": "A conversation between User and Assistant. The user makes a request, and the assistant responds to it based on the provided text and/or images. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <thinking></thinking> and <answer></answer> tags, respectively, i.e., <thinking>reasoning process here</thinking><answer>answer here</answer>.",
57
- "video": "A conversation between User and Assistant specializing in video analysis. The user makes a request, and the assistant responds to it based on the provided video content. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <thinking></thinking> and <answer></answer> tags, respectively, i.e., <thinking>reasoning process here</thinking><answer>answer here</answer>.",
58
- },
59
- }
60
-
61
-
62
- class _OfflineCancelStoppingCriteria(StoppingCriteria):
63
- def __init__(self, cancel_event: threading.Event):
64
- self.cancel_event = cancel_event
65
-
66
- def __call__(self, input_ids, scores, **kwargs) -> bool:
67
- return self.cancel_event.is_set()
68
-
69
-
70
- class _OfflineQueueStreamer(TextIteratorStreamer):
71
- def __init__(self, tokenizer, output_text_queue: "queue.Queue[str]"):
72
- super().__init__(tokenizer, skip_prompt=True, skip_special_tokens=True)
73
- self.output_text_queue = output_text_queue
74
- self.collected_chunks: List[str] = []
75
-
76
- def on_finalized_text(self, text: str, stream_end: bool = False):
77
- if text:
78
- self.collected_chunks.append(text)
79
- self.output_text_queue.put(text)
80
- super().on_finalized_text(text, stream_end=stream_end)
81
-
82
-
83
- _OFFLINE_THINKING_MODE_ALIASES = {
84
- "no_thinking": "no_thinking",
85
- "default": "no_thinking",
86
- "standard": "no_thinking",
87
- "deep_thinking": "deep_thinking",
88
- "thinking": "deep_thinking",
89
- "reasoning": "deep_thinking",
90
- }
91
-
92
- _OFFLINE_SYSTEM_PROMPT_TYPE_ALIASES = {
93
- "text_image": "text_image",
94
- "text-image": "text_image",
95
- "image_text": "text_image",
96
- "image-text": "text_image",
97
- "text": "text_image",
98
- "image": "text_image",
99
- "video": "video",
100
- }
101
-
102
 
103
  @dataclass
104
  class MossVLModelOutputWithPast(ModelOutput):
@@ -198,13 +144,21 @@ class MossVLVisionPatchEmbed(nn.Module):
198
 
199
 
200
  class MossVLVisionRotaryEmbedding(nn.Module):
201
- inv_freq: torch.Tensor
202
 
203
  def __init__(self, dim: int, theta: float = 10000.0) -> None:
204
  super().__init__()
205
- inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
 
 
 
 
 
206
  self.register_buffer("inv_freq", inv_freq, persistent=False)
207
 
 
 
 
208
  def forward(self, seqlen: int) -> torch.Tensor:
209
  seq = torch.arange(seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
210
  freqs = torch.outer(seq, self.inv_freq)
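Reviewer note: the frequency table built here follows the standard RoPE recipe (the `inv_freq` formula on the removed line above is the usual one). A minimal standalone sketch of the same computation:

```python
import torch

def rope_freq_table(seqlen: int, dim: int, theta: float = 10000.0) -> torch.Tensor:
    # One inverse frequency per pair of channels, as in the removed inv_freq line.
    inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
    seq = torch.arange(seqlen, dtype=inv_freq.dtype)
    # Outer product -> (seqlen, dim // 2) table of rotation angles.
    return torch.outer(seq, inv_freq)

print(rope_freq_table(16, 64).shape)  # torch.Size([16, 32])
```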
@@ -233,11 +187,16 @@ class MossVLVisionPatchMerger(nn.Module):
233
  self.act_fn = nn.GELU()
234
  self.linear_fc2 = nn.Linear(self.input_hidden_size, config.out_hidden_size)
235
 
236
- def forward(self, last_hidden_state: torch.Tensor, deepstack_features: List[torch.Tensor] = []) -> torch.Tensor:
 
 
 
 
237
  # 1. Collect all features: [last_hidden_state, deepstack_1, deepstack_2, ...]
238
  # self.norms[0] corresponds to last_hidden_state
239
  # self.norms[1:] corresponds to deepstack_features
240
-
 
241
  all_inputs = [last_hidden_state] + deepstack_features
242
 
243
  # 2. Apply Norm independently
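For context, the merger pattern described by these comments appears to be: normalize each feature stream separately, concatenate along the channel dimension, then project through fc1 → GELU → fc2. A minimal sketch under that assumption (layer widths below are illustrative, not the model's actual ones):

```python
import torch
import torch.nn as nn

class PatchMergerSketch(nn.Module):
    def __init__(self, hidden_size: int, out_hidden_size: int, num_deepstack: int):
        super().__init__()
        num_streams = 1 + num_deepstack               # last_hidden_state + deepstack features
        self.norms = nn.ModuleList(nn.LayerNorm(hidden_size) for _ in range(num_streams))
        input_hidden_size = hidden_size * num_streams
        self.linear_fc1 = nn.Linear(input_hidden_size, input_hidden_size)
        self.act_fn = nn.GELU()
        self.linear_fc2 = nn.Linear(input_hidden_size, out_hidden_size)

    def forward(self, last_hidden_state, deepstack_features):
        all_inputs = [last_hidden_state] + list(deepstack_features)
        normed = [norm(x) for norm, x in zip(self.norms, all_inputs)]   # norm each stream
        merged = torch.cat(normed, dim=-1)                              # concat channels
        return self.linear_fc2(self.act_fn(self.linear_fc1(merged)))

out = PatchMergerSketch(8, 16, num_deepstack=2)(torch.randn(4, 8), [torch.randn(4, 8)] * 2)
print(out.shape)  # torch.Size([4, 16])
```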
@@ -346,11 +305,11 @@ class MossVLVisionAttention(nn.Module):
346
  key_states = key_states.transpose(0, 1).unsqueeze(0)
347
  value_states = value_states.transpose(0, 1).unsqueeze(0)
348
 
349
- attention_interface: Callable = eager_attention_forward
350
- if self.config._attn_implementation != "eager":
351
- attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
352
 
353
- if self.config._attn_implementation == "flash_attention_2":
354
  max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max()
355
  attn_output, _ = attention_interface(
356
  self,
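A small worked example of the `cu_seqlens` bookkeeping used above (packed sequences, flash-attention style):

```python
import torch

# cu_seqlens holds cumulative boundaries of packed sequences:
# three sequences of lengths 4, 7 and 2 packed into one batch of 13 tokens.
cu_seqlens = torch.tensor([0, 4, 11, 13])
seq_lengths = cu_seqlens[1:] - cu_seqlens[:-1]   # tensor([4, 7, 2])
max_seqlen = seq_lengths.max()                   # tensor(7), passed to the attention kernel
print(seq_lengths.tolist(), int(max_seqlen))     # [4, 7, 2] 7
```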
@@ -429,26 +388,44 @@ class MossVLTextRotaryEmbedding(nn.Module):
429
 
430
  def __init__(self, config: MossVLTextConfig, device=None):
431
  super().__init__()
432
- # BC: "rope_type" was originally "type"
433
- if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
434
- self.rope_type = config.rope_scaling.get("rope_type", "default")
435
- else:
436
- self.rope_type = "default"
437
  self.max_seq_len_cached = config.max_position_embeddings
438
  self.original_max_seq_len = config.max_position_embeddings
439
 
440
  self.config = config
441
- self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
 
 
442
 
443
- inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
 
 
 
 
 
444
  self.register_buffer("inv_freq", inv_freq, persistent=False)
445
- self.original_inv_freq = self.inv_freq
446
 
 
447
 
448
- if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
449
- self.mrope_section = config.rope_scaling.get("mrope_section", [24, 20, 20])
450
- else:
451
- self.mrope_section = [24, 20, 20]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
452
 
453
  def apply_interleaved_mrope(self, freqs, mrope_section):
454
  """Apply interleaved MRoPE to 3D rotary embeddings.
@@ -470,7 +447,6 @@ class MossVLTextRotaryEmbedding(nn.Module):
470
  @torch.no_grad()
471
  @dynamic_rope_update
472
  def forward(self, x, position_ids):
473
-
474
  if position_ids.ndim == 2:
475
  position_ids = position_ids[None, ...].expand(3, position_ids.shape[0], -1)
476
 
@@ -571,12 +547,11 @@ class MossVLTextSelfAttention(nn.Module):
571
  query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
572
 
573
  if past_key_values is not None:
574
- cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
575
- key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
576
 
577
- attention_interface: Callable = eager_attention_forward
578
- if self.config._attn_implementation != "eager":
579
- attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
580
 
581
  attn_output, attn_weights = attention_interface(
582
  self,
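The `past_key_values.update(...)` call on the removed line above is the standard `DynamicCache` API from `transformers.cache_utils`. A minimal usage sketch (API as of recent transformers releases; shapes are illustrative):

```python
import torch
from transformers.cache_utils import DynamicCache

cache = DynamicCache()
k = torch.randn(1, 2, 4, 8)   # (batch, num_kv_heads, seq_len, head_dim)
v = torch.randn(1, 2, 4, 8)
cache.update(k, v, layer_idx=0)                       # prefill: 4 cached positions
k1, v1 = torch.randn(1, 2, 1, 8), torch.randn(1, 2, 1, 8)
k_all, v_all = cache.update(k1, v1, layer_idx=0)      # decode step appends one position
print(k_all.shape)                                    # torch.Size([1, 2, 5, 8])
```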
@@ -625,7 +600,7 @@ class MossVLTextCrossAttention(nn.Module):
625
  attention_mask: Optional[torch.Tensor] = None,
626
  past_key_values: Optional[Cache] = None,
627
  use_cache: bool = None,
628
- cache_position: Optional[torch.LongTensor] = None, # vision_cache_position
629
  query_position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
630
  vision_position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
631
  **kwargs,
@@ -659,9 +634,7 @@ class MossVLTextCrossAttention(nn.Module):
659
  if past_key_values is not None:
660
  # If we have a new image plus new tokens, key_states was only computed on that new image;
661
  # we still update the cached cross-attention key/value states (past image + new image) and use the full set.
662
- key_states, value_states = past_key_values.update(
663
- key_states, value_states, self.layer_idx, {"cache_position": cache_position}
664
- )
665
 
666
  elif cache_position[0] != 0:
667
  key_states, value_states = (
@@ -673,13 +646,13 @@ class MossVLTextCrossAttention(nn.Module):
673
  "Cross attention layer can't find neither `cross_attn_states` nor cached values for key/values!"
674
  )
675
 
676
- attention_interface: Callable = eager_attention_forward
677
- if self.config._attn_implementation != "eager":
678
- # If flash attention 2/3 is selected, fall back to sdpa_attention_forward for cross attention
679
- if self.config._attn_implementation == "flash_attention_3" or self.config._attn_implementation == "flash_attention_2":
680
- attention_interface = ALL_ATTENTION_FUNCTIONS["sdpa"]
681
- else:
682
- attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
683
 
684
  attn_output, attn_weights = attention_interface(
685
  self,
@@ -740,14 +713,14 @@ class MossVLSelfAttentionDecoderLayer(GradientCheckpointingLayer):
740
  use_cache: Optional[bool] = False,
741
  cache_position: Optional[torch.LongTensor] = None,
742
  vision_position_ids: Optional[torch.LongTensor] = None,
743
- vision_cache_position: Optional[torch.LongTensor] = None,
744
  vision_position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
 
745
  **kwargs: Unpack[TransformersKwargs],
746
- ) -> torch.Tensor:
747
  # Self Attention
748
  residual = hidden_states
749
  hidden_states = self.input_layernorm(hidden_states)
750
- hidden_states, _ = self.self_attn(
751
  hidden_states=hidden_states,
752
  attention_mask=attention_mask,
753
  past_key_values=past_key_values,
@@ -762,8 +735,11 @@ class MossVLSelfAttentionDecoderLayer(GradientCheckpointingLayer):
762
  hidden_states = self.post_attention_layernorm(hidden_states)
763
  hidden_states = self.mlp(hidden_states)
764
  hidden_states = residual + hidden_states
765
-
766
- return hidden_states
 
 
 
767
 
768
 
769
  class MossVLCrossAttentionDecoderLayer(GradientCheckpointingLayer):
@@ -799,21 +775,21 @@ class MossVLCrossAttentionDecoderLayer(GradientCheckpointingLayer):
799
  use_cache: Optional[bool] = False,
800
  cache_position: Optional[torch.LongTensor] = None,
801
  vision_position_ids: Optional[torch.LongTensor] = None,
802
- vision_cache_position: Optional[torch.LongTensor] = None,
803
  vision_position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
 
804
  **kwargs: Unpack[TransformersKwargs],
805
- ) -> torch.Tensor:
806
  # Cross Attention
807
  residual = hidden_states
808
  hidden_states = self.input_layernorm(hidden_states)
809
 
810
- hidden_states, _ = self.cross_attn(
811
  hidden_states=hidden_states,
812
  cross_attention_states=cross_attention_states,
813
  attention_mask=cross_attention_mask,
814
  past_key_values=past_key_values,
815
  use_cache=use_cache,
816
- cache_position=vision_cache_position,
817
  query_position_embeddings=position_embeddings,
818
  vision_position_embeddings=vision_position_embeddings,
819
  )
@@ -830,8 +806,11 @@ class MossVLCrossAttentionDecoderLayer(GradientCheckpointingLayer):
830
  hidden_states = full_text_row_masked_out_mask[:, 0] * hidden_states
831
 
832
  hidden_states = residual + self.cross_attn_mlp_gate.tanh() * hidden_states
833
-
834
- return hidden_states
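Worth noting for reviewers: `cross_attn_mlp_gate.tanh()` is the usual gated cross-attention residual. Assuming the gate parameter is zero-initialized (an assumption, not shown in this diff), the cross-attention branch starts as a no-op and the model learns how much visual signal to mix in:

```python
import torch
import torch.nn as nn

gate = nn.Parameter(torch.zeros(1))              # assumed zero init
residual = torch.randn(2, 5, 8)
cross_branch = torch.randn(2, 5, 8)
hidden_states = residual + gate.tanh() * cross_branch
assert torch.allclose(hidden_states, residual)   # no-op until the gate moves away from 0
```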
 
 
 
835
 
836
 
837
 
@@ -857,32 +836,10 @@ class MossVLPreTrainedModel(PreTrainedModel):
857
 
858
  def _init_weights(self, module):
859
  """Initialize the weights.
860
-
861
- Note: For loading pretrained weights:
862
- - Cross attention: can be initialized from the previous layer's self attention weights
863
  """
864
- std = getattr(self.config, "initializer_range", 0.02)
865
- if hasattr(self.config, "text_config") and hasattr(self.config.text_config, "initializer_range"):
866
- std = self.config.text_config.initializer_range
867
-
868
- if isinstance(module, MossVLVisionPatchMerger):
869
- # Initialize merger weights
870
- # Input: hidden_size * (1 + num_deepstack_features) -> Output: out_hidden_size
871
- # This projection handles concatenated features, so we might want specific initialization
872
- module.linear_fc1.weight.data.normal_(mean=0.0, std=std)
873
- module.linear_fc2.weight.data.normal_(mean=0.0, std=std)
874
- if module.linear_fc1.bias is not None:
875
- module.linear_fc1.bias.data.zero_()
876
- if module.linear_fc2.bias is not None:
877
- module.linear_fc2.bias.data.zero_()
878
-
879
- # Initialize separate LayerNorms
880
- if hasattr(module, "norms"):
881
- for norm in module.norms:
882
- if hasattr(norm, "weight") and norm.weight is not None:
883
- norm.weight.data.fill_(1.0)
884
- if hasattr(norm, "bias") and norm.bias is not None:
885
- norm.bias.data.zero_()
886
 
887
 
888
 
@@ -958,13 +915,15 @@ class MossVLVisionModel(MossVLPreTrainedModel):
958
 
959
  def fast_pos_embed_interpolate(self, grid_thw):
960
  grid_ts, grid_hs, grid_ws = grid_thw[:, 0], grid_thw[:, 1], grid_thw[:, 2]
 
 
961
 
962
- idx_list = [[] for _ in range(4)]
963
- weight_list = [[] for _ in range(4)]
964
 
965
  for t, h, w in zip(grid_ts, grid_hs, grid_ws):
966
- h_idxs = torch.linspace(0, self.num_grid_per_side - 1, h)
967
- w_idxs = torch.linspace(0, self.num_grid_per_side - 1, w)
968
 
969
  h_idxs_floor = h_idxs.int()
970
  w_idxs_floor = w_idxs.int()
@@ -992,13 +951,11 @@ class MossVLVisionModel(MossVLPreTrainedModel):
992
  ]
993
 
994
  for i in range(4):
995
- idx_list[i].extend(indices[i].tolist())
996
- weight_list[i].extend(weights[i].tolist())
997
 
998
- idx_tensor = torch.tensor(idx_list, dtype=torch.long, device=self.pos_embed.weight.device)
999
- weight_tensor = torch.tensor(
1000
- weight_list, dtype=self.pos_embed.weight.dtype, device=self.pos_embed.weight.device
1001
- )
1002
  pos_embeds = self.pos_embed(idx_tensor) * weight_tensor[:, :, None]
1003
  patch_pos_embeds = pos_embeds[0] + pos_embeds[1] + pos_embeds[2] + pos_embeds[3]
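The interpolation above is plain bilinear resampling of a fixed G x G grid of learned position embeddings onto an h x w patch grid; the four index/weight pairs come from the floor/ceil split. A small self-contained illustration of the weight construction:

```python
import torch

G, h, w = 4, 6, 5
h_idxs = torch.linspace(0, G - 1, h)
w_idxs = torch.linspace(0, G - 1, w)
h_floor, w_floor = h_idxs.int(), w_idxs.int()
h_ceil = (h_floor + 1).clamp(max=G - 1)          # pairs with h_floor for the 4 corners
w_ceil = (w_floor + 1).clamp(max=G - 1)
dh, dw = h_idxs - h_floor, w_idxs - w_floor
w00 = (1 - dh)[:, None] * (1 - dw)[None, :]      # weight for (h_floor, w_floor)
w01 = (1 - dh)[:, None] * dw[None, :]            # (h_floor, w_ceil)
w10 = dh[:, None] * (1 - dw)[None, :]            # (h_ceil, w_floor)
w11 = dh[:, None] * dw[None, :]                  # (h_ceil, w_ceil)
assert torch.allclose(w00 + w01 + w10 + w11, torch.ones(h, w))
```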
1004
 
@@ -1127,7 +1084,9 @@ class MossVLTextModel(MossVLPreTrainedModel):
1127
  vision_position_ids: Optional[torch.LongTensor] = None,
1128
  use_cache: Optional[bool] = None,
1129
  cache_position: Optional[torch.LongTensor] = None,
1130
- vision_cache_position: Optional[torch.LongTensor] = None,
 
 
1131
  **kwargs: Unpack[FlashAttentionKwargs],
1132
  ) -> Union[tuple, BaseModelOutputWithPast]:
1133
  """
@@ -1140,9 +1099,15 @@ class MossVLTextModel(MossVLPreTrainedModel):
1140
  Attention mask for cross-attention between text and vision. Shape: `(batch_size, 1, text_seq_len, vision_seq_len)`.
1141
  vision_position_ids (`torch.LongTensor`, *optional*):
1142
  Position IDs for vision tokens used in cross-attention. Shape: `(batch_size, vision_seq_len)`.
1143
- vision_cache_position (`torch.LongTensor`, *optional*):
1144
- Cache position for vision tokens. Shape: `(vision_seq_len,)`.
1145
  """
 
 
 
 
 
 
1146
  if (input_ids is None) ^ (inputs_embeds is not None):
1147
  raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
1148
 
@@ -1164,7 +1129,7 @@ class MossVLTextModel(MossVLPreTrainedModel):
1164
 
1165
  attention_mask = create_causal_mask(
1166
  config=self.config,
1167
- input_embeds=inputs_embeds,
1168
  attention_mask=attention_mask,
1169
  cache_position=cache_position,
1170
  past_key_values=past_key_values,
@@ -1179,14 +1144,15 @@ class MossVLTextModel(MossVLPreTrainedModel):
1179
  # Compute vision position embeddings (for cross-attention key/value) if needed
1180
  vision_position_embeddings = None
1181
 
1182
- if vision_cache_position is None:
1183
- # TODO:use cache_position now
1184
- vision_cache_position = cache_position
1185
-
1186
  if cross_attention_states is not None:
1187
  if vision_position_ids is not None:
1188
  vision_position_embeddings = self.rotary_emb(cross_attention_states, vision_position_ids)
1189
 
 
 
 
 
 
1190
 
1191
  for idx, decoder_layer in enumerate(self.layers):
1192
  # For text-only path we should skip cross attention layers.
@@ -1211,17 +1177,35 @@ class MossVLTextModel(MossVLPreTrainedModel):
1211
  cross_attention_states=cross_attention_states,
1212
  cross_attention_mask=cross_attention_mask,
1213
  vision_position_ids=vision_position_ids,
1214
- vision_cache_position=vision_cache_position,
1215
  vision_position_embeddings=vision_position_embeddings,
 
1216
  **kwargs,
1217
  )
1218
- hidden_states = layer_outputs
 
 
 
 
 
 
1219
 
1220
  hidden_states = self.norm(hidden_states)
 
 
 
 
 
 
 
 
 
 
1221
 
1222
  return BaseModelOutputWithPast(
1223
  last_hidden_state=hidden_states,
1224
  past_key_values=past_key_values,
 
 
1225
  )
1226
 
1227
 
@@ -1240,8 +1224,6 @@ class MossVLModel(MossVLPreTrainedModel):
1240
  super().__init__(config)
1241
  self.visual = MossVLVisionModel._from_config(config.vision_config)
1242
  self.language_model = MossVLTextModel._from_config(config.text_config)
1243
- self.vision_token_info = None # cache vision_token_info here for decode stage
1244
- self.rope_deltas = None # cache position deltas for decode stage
1245
 
1246
  # Learnable Separator Token: inserted after each image/frame's vision tokens
1247
  # Initialized from LLM's separator_token_init_id embedding
@@ -1550,7 +1532,7 @@ class MossVLModel(MossVLPreTrainedModel):
1550
  continue
1551
 
1552
  # Collect repetition counts for all frames in this sample
1553
- repeats = []
1554
  for media in medias:
1555
  num_frames = media.get('num_frames', 1)
1556
  length = media['length']
@@ -1565,25 +1547,30 @@ class MossVLModel(MossVLPreTrainedModel):
1565
 
1566
  # In convert_packed_to_batch we enforce strictly regular frames
1567
  # so we can assume all frames have the same number of tokens
1568
- repeats.extend([tokens_per_frame_with_sep] * num_frames)
 
 
 
 
 
 
 
1569
 
1570
- num_valid_frames = len(repeats)
1571
  if num_valid_frames == 0:
1572
  continue
1573
 
1574
  # If cross_attention_mask has more frames (e.g. padded), slice it
1575
  # If it has fewer (shouldn't happen), slice repeats
1576
  valid_mask_frames = min(num_valid_frames, cross_attention_mask.shape[-1])
 
1577
  if valid_mask_frames < num_valid_frames:
1578
- repeats = repeats[:valid_mask_frames]
1579
 
1580
  # Extract valid columns for this sample
1581
  # (1, text_len, valid_mask_frames)
1582
  source_mask = cross_attention_mask[i, :, :, :valid_mask_frames]
1583
 
1584
- # Convert repeats to tensor
1585
- repeats_tensor = torch.tensor(repeats, device=cross_attention_mask.device)
1586
-
1587
  # Expand using repeat_interleave
1588
  # output shape: (1, text_len, sum(repeats))
1589
  expanded_mask = source_mask.repeat_interleave(repeats_tensor, dim=-1)
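A toy example of the frame-to-token expansion performed here: each frame-level mask column is repeated once per vision token belonging to that frame (tokens per frame plus the separator in the real code; 3 per frame below for brevity):

```python
import torch

source_mask = torch.tensor([[1, 0],
                            [1, 1]])                 # (text_len=2, num_frames=2)
repeats = torch.tensor([3, 3])                       # vision tokens per frame
expanded = source_mask.repeat_interleave(repeats, dim=-1)
print(expanded)
# tensor([[1, 1, 1, 0, 0, 0],
#         [1, 1, 1, 1, 1, 1]])
```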
@@ -1602,7 +1589,8 @@ class MossVLModel(MossVLPreTrainedModel):
1602
  self,
1603
  input_ids: torch.Tensor,
1604
  attention_mask: Optional[torch.Tensor] = None,
1605
- cache_position: Optional[torch.LongTensor] = None,
 
1606
  ) -> torch.Tensor:
1607
  """
1608
  Compute 3D position IDs for text tokens with special handling for image tokens.
@@ -1617,7 +1605,7 @@ class MossVLModel(MossVLPreTrainedModel):
1617
  Args:
1618
  input_ids: (batch_size, seq_len)
1619
  attention_mask: (batch_size, seq_len), optional
1620
- cache_position: (seq_len,), position in cache
1621
 
1622
  Returns:
1623
  position_ids: (3, batch_size, seq_len)
@@ -1626,25 +1614,17 @@ class MossVLModel(MossVLPreTrainedModel):
1626
  device = input_ids.device
1627
  image_token_id = self.config.image_token_id
1628
 
1629
- # Decode stage: use cached rope_deltas for fast computation
1630
- if cache_position is not None and cache_position[0] != 0 and self.rope_deltas is not None:
1631
- # In decode, position = cache_position + rope_deltas
1632
- # rope_deltas is per-sample: (batch_size,)
1633
  position_ids = torch.arange(seq_len, device=device, dtype=torch.long)
1634
- position_ids = position_ids.unsqueeze(0).expand(batch_size, -1) # (batch, seq_len)
1635
-
1636
- # Add cache_position offset
1637
- if cache_position is not None:
1638
- position_ids = position_ids + cache_position[0]
1639
-
1640
- # Add rope_deltas (position offset due to vision tokens)
1641
- # self.rope_deltas shape: (batch_size,), need to unsqueeze for broadcasting
1642
- position_ids = position_ids + self.rope_deltas.unsqueeze(1) # (batch, seq_len)
1643
-
1644
- # Expand to 3D: (3, batch, seq_len)
1645
- position_ids = position_ids.unsqueeze(0).expand(3, -1, -1)
1646
-
1647
- return position_ids
1648
 
1649
  # Prefill stage: compute full position_ids with image token awareness
1650
  # Vectorized implementation
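How the (now removed) decode shortcut used `rope_deltas`, in plain numbers: prefill records how far the 3D positions ran past the raw text length, and each decode step adds that per-sample offset to `cache_position`. The values below are made up for illustration:

```python
import torch

seq_len = 10                                              # prompt length in text tokens
max_position_after_prefill = torch.tensor([14, 11])       # per-sample max 3D position id
rope_deltas = max_position_after_prefill + 1 - seq_len    # tensor([5, 2])
cache_position = torch.tensor([10])                       # first decode step
decode_positions = cache_position + rope_deltas           # tensor([15, 12])
print(rope_deltas.tolist(), decode_positions.tolist())
```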
@@ -1723,7 +1703,7 @@ class MossVLModel(MossVLPreTrainedModel):
1723
  rope_deltas: (batch_size,) - position offset due to vision tokens
1724
  """
1725
  batch_size, max_vision_seq_len, _ = cross_attention_states.shape
1726
- device = position_ids.device if position_ids is not None else input_ids.device
1727
  image_token_id = self.config.image_token_id
1728
  merge_size = self.visual.spatial_merge_size
1729
 
@@ -1731,15 +1711,14 @@ class MossVLModel(MossVLPreTrainedModel):
1731
  # We need to flatten the nested vision_token_info structure to align with image tokens in input_ids
1732
 
1733
  # Find all image tokens in text: (num_occurrences, 2) -> [batch_idx, seq_idx]
1734
- image_token_indices = (input_ids == image_token_id).nonzero().to(device)
1735
 
1736
  # Flatten vision_token_info to parallel lists
1737
  # We assume the order of medias in vision_token_info matches the appearance of image tokens in input_ids
1738
- flat_eff_h = []
1739
- flat_eff_w = []
1740
- flat_vis_starts = []
1741
- flat_batch_indices = []
1742
-
1743
  # Processing metadata on CPU (fast enough for typical batch sizes)
1744
  for b_idx, info in enumerate(vision_token_info):
1745
  medias = info.get('medias', [])
@@ -1750,13 +1729,11 @@ class MossVLModel(MossVLPreTrainedModel):
1750
  start = media['start']
1751
  tok_per_frame = media['vision_tokens_per_frame']
1752
  stride = tok_per_frame + 1 # +1 for separator
1753
-
1754
- # Generate entries for all frames in this media
1755
- for f in range(num_frames):
1756
- flat_eff_h.append(eh)
1757
- flat_eff_w.append(ew)
1758
- flat_vis_starts.append(start + f * stride)
1759
- flat_batch_indices.append(b_idx)
1760
 
1761
  # Pre-allocate output
1762
  vision_pos_ids = torch.zeros(
@@ -1766,17 +1743,19 @@ class MossVLModel(MossVLPreTrainedModel):
1766
  )
1767
 
1768
  # Handle case where no image tokens or info
1769
- if len(flat_eff_h) == 0 or len(image_token_indices) == 0:
1770
  rope_deltas = position_ids.max(dim=0).values.max(dim=-1).values + 1 - input_ids.shape[1]
1771
  return vision_pos_ids, position_ids, rope_deltas
1772
 
 
 
 
 
1773
  # Align lengths (handle truncation if text has fewer tokens or vice versa)
1774
- num_matches = min(len(flat_eff_h), len(image_token_indices))
1775
-
1776
- # Convert to tensors
1777
- flat_eff_h = torch.tensor(flat_eff_h[:num_matches], device=device, dtype=torch.long)
1778
- flat_eff_w = torch.tensor(flat_eff_w[:num_matches], device=device, dtype=torch.long)
1779
- flat_vis_starts = torch.tensor(flat_vis_starts[:num_matches], device=device, dtype=torch.long)
1780
 
1781
  # Get corresponding text positions
1782
  target_indices = image_token_indices[:num_matches]
@@ -1942,53 +1921,6 @@ class MossVLModel(MossVLPreTrainedModel):
1942
  )
1943
  return vision_embeds, vision_token_info
1944
 
1945
- def get_vision_features_chunked(
1946
- self,
1947
- pixel_values: torch.FloatTensor,
1948
- grid_thw: Optional[torch.LongTensor] = None,
1949
- media_nums_per_sample: Optional[List[int]] = None,
1950
- vision_chunked_length: Optional[int] = None,
1951
- ):
1952
- """
1953
- Chunk the visual encoder forward by media items, then reuse the same
1954
- packed-to-batch conversion logic. This keeps output semantics identical
1955
- to `get_vision_features(...)` while reducing prefill memory pressure.
1956
- """
1957
- if (
1958
- vision_chunked_length is None
1959
- or vision_chunked_length <= 0
1960
- or grid_thw is None
1961
- or grid_thw.shape[0] <= vision_chunked_length
1962
- ):
1963
- return self.get_vision_features(pixel_values, grid_thw, media_nums_per_sample)
1964
-
1965
- pixel_values = pixel_values.type(self.visual.dtype)
1966
- token_counts = (grid_thw[:, 0] * grid_thw[:, 1] * grid_thw[:, 2]).tolist()
1967
-
1968
- hidden_state_chunks = []
1969
- token_offset = 0
1970
- for media_start in range(0, grid_thw.shape[0], vision_chunked_length):
1971
- media_end = min(media_start + vision_chunked_length, grid_thw.shape[0])
1972
- chunk_grid_thw = grid_thw[media_start:media_end]
1973
- chunk_token_count = sum(token_counts[media_start:media_end])
1974
- chunk_pixel_values = pixel_values[token_offset:token_offset + chunk_token_count]
1975
- token_offset += chunk_token_count
1976
-
1977
- hidden_state_chunks.append(
1978
- self.visual(
1979
- chunk_pixel_values,
1980
- grid_thw=chunk_grid_thw,
1981
- )
1982
- )
1983
-
1984
- hidden_states = torch.cat(hidden_state_chunks, dim=0)
1985
- vision_embeds, vision_token_info = self.convert_packed_to_batch(
1986
- hidden_states,
1987
- grid_thw,
1988
- media_nums_per_sample,
1989
- )
1990
- return vision_embeds, vision_token_info
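For reference, the chunking removed here relied on `grid_thw` to know how many packed vision tokens each media item owns (t * h * w), which gives the slice boundaries into `pixel_values`; a small illustration:

```python
import torch

grid_thw = torch.tensor([[1, 4, 4],     # image: 1 frame, 4 x 4 patches -> 16 tokens
                         [2, 4, 4],     # video: 2 frames               -> 32 tokens
                         [1, 2, 2]])    # small image                   ->  4 tokens
token_counts = (grid_thw[:, 0] * grid_thw[:, 1] * grid_thw[:, 2]).tolist()
offsets = [0]
for count in token_counts:
    offsets.append(offsets[-1] + count)
print(token_counts, offsets)            # [16, 32, 4] [0, 16, 48, 52]
```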
1991
-
1992
 
1993
 
1994
  @auto_docstring
@@ -2004,7 +1936,11 @@ class MossVLModel(MossVLPreTrainedModel):
2004
  media_nums_per_sample: Optional[List[int]] = None,
2005
  vision_position_ids: Optional[torch.LongTensor] = None,
2006
  cross_attention_mask: Optional[torch.Tensor] = None,
2007
- cache_position: Optional[torch.LongTensor] = None,
 
 
 
 
2008
  **kwargs: Unpack[TransformersKwargs],
2009
  ) -> Union[tuple, BaseModelOutputWithPast]:
2010
  """
@@ -2021,11 +1957,20 @@ class MossVLModel(MossVLPreTrainedModel):
2021
  cross_attention_mask (`torch.Tensor` of shape `(batch_size, 1, text_seq_len, vision_seq_len)`, *optional*):
2022
  Attention mask for cross-attention between text and vision. Controls which vision tokens each text
2023
  token can attend to, enforcing causal visibility for video frames.
2024
- vision_chunked_length (`int`, *optional*):
2025
- Number of media items to process per visual-encoder chunk during prefill. This only changes
2026
- how the vision tower is executed, not the final prompt or decoding logic.
 
 
 
2027
  """
2028
- vision_chunked_length = kwargs.pop("vision_chunked_length", None)
 
 
 
 
 
 
2029
  if (input_ids is None) ^ (inputs_embeds is not None):
2030
  raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
2031
 
@@ -2034,8 +1979,7 @@ class MossVLModel(MossVLPreTrainedModel):
2034
 
2035
  # Process vision features (images and videos are already merged by processor)
2036
  cross_attention_states = None
2037
- num_vision_tokens = 0
2038
-
2039
  if pixel_values is not None:
2040
  # Determine batch size
2041
  batch_size = inputs_embeds.shape[0]
@@ -2050,23 +1994,12 @@ class MossVLModel(MossVLPreTrainedModel):
2050
 
2051
  # Process all vision inputs together through VIT
2052
  # pixel_values and grid_thw are already ordered by appearance in text
2053
- vision_embeds, vision_token_info = self.get_vision_features_chunked(
2054
- pixel_values,
2055
- grid_thw,
2056
- media_nums_per_sample,
2057
- vision_chunked_length=vision_chunked_length,
2058
  )
2059
 
2060
  # vision_embeds: [batch_size, max_seq_len, hidden_size]
2061
  cross_attention_states = vision_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
2062
- num_vision_tokens = cross_attention_states.shape[1]
2063
-
2064
- # Cache vision_token_info for decode stage (prefill only)
2065
-
2066
- self.vision_token_info = vision_token_info
2067
- else:
2068
- # In decode stage, use cached vision_token_info
2069
- vision_token_info = self.vision_token_info
2070
 
2071
  # Generate 3D position IDs for text if not provided
2072
  if position_ids is None:
@@ -2075,7 +2008,8 @@ class MossVLModel(MossVLPreTrainedModel):
2075
  position_ids = self.compute_position_ids(
2076
  input_ids=input_ids,
2077
  attention_mask=attention_mask,
2078
- cache_position=cache_position,
 
2079
  )
2080
 
2081
  # Compute cross_attention_mask, vision_position_ids, and full_text_row_masked_out_mask
@@ -2099,8 +2033,6 @@ class MossVLModel(MossVLPreTrainedModel):
2099
  (cross_attention_mask != negative_inf_value).any(dim=-1).type_as(cross_attention_mask)[..., None]
2100
  )
2101
  cross_attention_mask = cross_attention_mask * full_text_row_masked_out_mask
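A toy version of the masking step above, with shapes as documented earlier (`(batch, 1, text_len, vision_len)`); `negative_inf_value` is assumed here to be the dtype's most negative finite float:

```python
import torch

min_value = torch.finfo(torch.float32).min               # assumed value of negative_inf_value
mask = torch.full((1, 1, 3, 4), min_value)               # 3 text tokens, 4 vision tokens
mask[0, 0, 1, :2] = 0.0                                  # only text token 1 may see vision tokens 0-1
full_text_row_masked_out_mask = (mask != min_value).any(dim=-1).type_as(mask)[..., None]
print(full_text_row_masked_out_mask[0, 0, :, 0])         # tensor([0., 1., 0.])
mask = mask * full_text_row_masked_out_mask              # fully-masked text rows collapse to 0
```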
2102
-
2103
-
2104
 
2105
  if vision_position_ids is None and cross_attention_states is not None and input_ids is not None:
2106
  vision_position_ids, position_ids, rope_deltas = self.compute_vision_position_ids(
@@ -2110,14 +2042,6 @@ class MossVLModel(MossVLPreTrainedModel):
2110
  cross_attention_states,
2111
  attention_mask
2112
  )
2113
-
2114
- # Cache rope_deltas for decode stage (only in prefill)
2115
- # rope_deltas = max_position - sequence_length
2116
- # This allows fast position computation in decode: position = cache_position + rope_deltas
2117
- if cache_position is not None and cache_position[0] == 0:
2118
- self.rope_deltas = rope_deltas
2119
-
2120
-
2121
 
2122
  outputs = self.language_model(
2123
  input_ids=None,
@@ -2130,16 +2054,33 @@ class MossVLModel(MossVLPreTrainedModel):
2130
  cross_attention_mask=cross_attention_mask,
2131
  vision_position_ids=vision_position_ids,
2132
  full_text_row_masked_out_mask=full_text_row_masked_out_mask,
 
 
 
2133
  **kwargs,
2134
  )
2135
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2136
  return MossVLModelOutputWithPast(
2137
  last_hidden_state=outputs.last_hidden_state,
2138
  past_key_values=outputs.past_key_values,
2139
  hidden_states=outputs.hidden_states,
2140
  attentions=outputs.attentions,
2141
- vision_token_info=self.vision_token_info,
2142
- rope_deltas=self.rope_deltas,
2143
  )
2144
 
2145
 
@@ -2161,7 +2102,6 @@ class MossVLForConditionalGeneration(MossVLPreTrainedModel, GenerationMixin):
2161
  super().__init__(config)
2162
  self.model = MossVLModel(config)
2163
  self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
2164
- self._offline_processor_lock = threading.RLock()
2165
 
2166
  self.post_init()
2167
 
@@ -2219,9 +2159,12 @@ class MossVLForConditionalGeneration(MossVLPreTrainedModel, GenerationMixin):
2219
  media_nums_per_sample: Optional[List[int]] = None,
2220
  vision_position_ids: Optional[torch.LongTensor] = None,
2221
  cross_attention_mask: Optional[torch.Tensor] = None,
2222
- cache_position: Optional[torch.LongTensor] = None,
2223
- vision_chunked_length: Optional[int] = None,
2224
  logits_to_keep: Union[int, torch.Tensor] = 0,
 
 
 
2225
  **kwargs: Unpack[TransformersKwargs],
2226
  ) -> Union[tuple, CausalLMOutputWithPast]:
2227
  """
@@ -2238,10 +2181,13 @@ class MossVLForConditionalGeneration(MossVLPreTrainedModel, GenerationMixin):
2238
  cross_attention_mask (`torch.Tensor` of shape `(batch_size, 1, text_seq_len, vision_seq_len)`, *optional*):
2239
  Attention mask for cross-attention between text and vision. Controls which vision tokens each text
2240
  token can attend to, enforcing causal visibility for video frames.
2241
- vision_chunked_length (`int`, *optional*):
2242
- Number of media items to process per visual-encoder chunk during prefill. This only changes
2243
- how the vision tower is executed, not the final prompt or decoding logic.
 
 
2244
  """
 
2245
  outputs = self.model(
2246
  input_ids=input_ids,
2247
  pixel_values=pixel_values,
@@ -2253,12 +2199,17 @@ class MossVLForConditionalGeneration(MossVLPreTrainedModel, GenerationMixin):
2253
  cross_attention_mask=cross_attention_mask,
2254
  past_key_values=past_key_values,
2255
  inputs_embeds=inputs_embeds,
 
 
 
 
 
2256
  cache_position=cache_position,
2257
- vision_chunked_length=vision_chunked_length,
2258
  **kwargs,
2259
  )
2260
 
2261
- hidden_states = outputs[0]
 
2262
 
2263
  slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
2264
  logits = self.lm_head(hidden_states[:, slice_indices, :])
@@ -2267,6 +2218,11 @@ class MossVLForConditionalGeneration(MossVLPreTrainedModel, GenerationMixin):
2267
  if labels is not None:
2268
  loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size)
2269
 
 
 
 
 
 
2270
  return MossVLCausalLMOutputWithPast(
2271
  loss=loss,
2272
  logits=logits,
@@ -2283,15 +2239,15 @@ class MossVLForConditionalGeneration(MossVLPreTrainedModel, GenerationMixin):
2283
  past_key_values=None,
2284
  attention_mask=None,
2285
  inputs_embeds=None,
2286
- cache_position=None,
2287
  position_ids=None,
2288
  use_cache=True,
2289
  pixel_values=None,
2290
  grid_thw=None,
2291
  media_nums_per_sample=None, # One video is one media.
2292
  vision_position_ids=None,
 
 
2293
  cross_attention_mask=None,
2294
- vision_chunked_length=None,
2295
  **kwargs,
2296
  ):
2297
  """
@@ -2304,12 +2260,12 @@ class MossVLForConditionalGeneration(MossVLPreTrainedModel, GenerationMixin):
2304
  Args:
2305
  media_nums_per_sample: One video counts as one media item (regardless of frame count)
2306
  """
 
2307
  model_inputs = super().prepare_inputs_for_generation(
2308
  input_ids,
2309
  past_key_values=past_key_values,
2310
  attention_mask=attention_mask,
2311
  inputs_embeds=inputs_embeds,
2312
- cache_position=cache_position,
2313
  position_ids=position_ids,
2314
  pixel_values=pixel_values,
2315
  grid_thw=grid_thw,
@@ -2318,21 +2274,27 @@ class MossVLForConditionalGeneration(MossVLPreTrainedModel, GenerationMixin):
2318
  **kwargs,
2319
  )
2320
 
 
 
 
 
 
2321
 
2322
- # For decoding stage, if position_ids are generated by GenerationMixin (2D),
2323
- # we can set them to None to let forward recompute them from cache_position.
2324
  model_inputs["position_ids"] = None
 
 
2325
 
2326
  # Handle cross attention mask
2327
  if cross_attention_mask is not None:
2328
- # Slice to current sequence length on text dimension (dim=2)
2329
- # Shape: [batch, 1, text_len, vision_len] -> [batch, 1, cache_len, vision_len]
2330
- cross_attention_mask = cross_attention_mask[:, :, -cache_position.shape[0]:, :]
2331
  model_inputs["cross_attention_mask"] = cross_attention_mask
2332
 
2333
- # Vision inputs are only needed in prefill stage (cache_position[0] == 0)
2334
  # In decode stage, vision features are retrieved from cross attention cache
2335
- if cache_position[0] != 0:
2336
  model_inputs["pixel_values"] = None
2337
  model_inputs["grid_thw"] = None
2338
  model_inputs["media_nums_per_sample"] = None
@@ -2341,7 +2303,6 @@ class MossVLForConditionalGeneration(MossVLPreTrainedModel, GenerationMixin):
2341
  else:
2342
  # In prefill stage, include all vision-related inputs
2343
  model_inputs["vision_position_ids"] = vision_position_ids
2344
- model_inputs["vision_chunked_length"] = vision_chunked_length
2345
 
2346
  return model_inputs
2347
 
@@ -2362,1026 +2323,16 @@ class MossVLForConditionalGeneration(MossVLPreTrainedModel, GenerationMixin):
2362
  **kwargs,
2363
  )
2364
 
2365
- # Extend cross_attention_mask for the new token
2366
- # Copy the last token's mask pattern for the newly generated token
2367
  if cross_attention_mask_prev is not None:
2368
- model_kwargs["cross_attention_mask"] = torch.cat(
2369
- [cross_attention_mask_prev, cross_attention_mask_prev[:, :, -1:, :]],
2370
- dim=2 # Concatenate along text sequence dimension
2371
- )
 
 
2372
 
2373
  return model_kwargs
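The removed lines above grew the mask by duplicating its last text row for each newly generated token; in isolation:

```python
import torch

prev = torch.zeros(1, 1, 4, 6)                  # (batch, 1, text_len=4, vision_len=6)
prev[0, 0, -1, :3] = 1.0                        # last text token attends to 3 vision tokens
extended = torch.cat([prev, prev[:, :, -1:, :]], dim=2)
print(extended.shape)                           # torch.Size([1, 1, 5, 6])
assert torch.equal(extended[0, 0, -1], prev[0, 0, -1])   # new row copies the previous last row
```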
2374
 
2375
- @staticmethod
2376
- def _offline_flatten_content_with_vision_tokens(content) -> str:
2377
- if isinstance(content, str):
2378
- return content
2379
- if not isinstance(content, list):
2380
- return str(content) if content else ""
2381
-
2382
- parts = []
2383
- for item in content:
2384
- if isinstance(item, dict):
2385
- if item.get("type") == "image" or "image" in item:
2386
- parts.append("<|image|>")
2387
- elif item.get("type") == "video" or "video" in item:
2388
- parts.append("<|video|>")
2389
- if "text" in item:
2390
- parts.append(str(item["text"]))
2391
- elif isinstance(item, str):
2392
- parts.append(item)
2393
- return "".join(parts)
2394
-
2395
- @staticmethod
2396
- def _offline_sanitize_prompt_text(processor, text: Any) -> str:
2397
- if text is None:
2398
- return ""
2399
-
2400
- sanitized = str(text)
2401
- replacements = [
2402
- (getattr(processor, "image_placeholder", None), ""),
2403
- (getattr(processor, "video_placeholder", None), ""),
2404
- (getattr(processor, "image_token", None), ""),
2405
- (getattr(processor, "video_token", None), ""),
2406
- ]
2407
- for needle, replacement in replacements:
2408
- if needle:
2409
- sanitized = sanitized.replace(needle, replacement)
2410
- return sanitized.lstrip("\n")
2411
-
2412
- def _offline_sanitize_message_content(self, processor, content: Any) -> Any:
2413
- if isinstance(content, str):
2414
- return self._offline_sanitize_prompt_text(processor, content)
2415
- if not isinstance(content, list):
2416
- return content
2417
-
2418
- sanitized_items = []
2419
- for item in content:
2420
- if isinstance(item, dict):
2421
- item_copy = dict(item)
2422
- if "text" in item_copy:
2423
- item_copy["text"] = self._offline_sanitize_prompt_text(processor, item_copy.get("text"))
2424
- sanitized_items.append(item_copy)
2425
- elif isinstance(item, str):
2426
- sanitized_items.append(self._offline_sanitize_prompt_text(processor, item))
2427
- else:
2428
- sanitized_items.append(item)
2429
- return sanitized_items
2430
-
2431
- def _offline_prepare_messages(self, processor, query: Dict[str, Any]) -> List[Dict[str, Any]]:
2432
- messages = query.get("messages")
2433
- if messages:
2434
- prepared_messages = []
2435
- for message in messages:
2436
- if not isinstance(message, dict):
2437
- continue
2438
- message_copy = dict(message)
2439
- message_copy["content"] = self._offline_sanitize_message_content(
2440
- processor,
2441
- message_copy.get("content", ""),
2442
- )
2443
- prepared_messages.append(message_copy)
2444
- if prepared_messages:
2445
- return prepared_messages
2446
-
2447
- prompt = self._offline_sanitize_prompt_text(processor, query.get("prompt", ""))
2448
- images = list(query.get("images") or [])
2449
- videos = list(query.get("videos") or [])
2450
-
2451
- content = []
2452
- for image in images:
2453
- content.append({"type": "image", "image": image})
2454
- for video in videos:
2455
- content.append({"type": "video", "video": video})
2456
- if prompt:
2457
- content.append({"type": "text", "text": prompt.lstrip("\n")})
2458
-
2459
- if not content:
2460
- content = [{"type": "text", "text": ""}]
2461
-
2462
- return [{"role": "user", "content": content}]
2463
-
2464
- @staticmethod
2465
- def _offline_extract_content_parts(content: Any) -> Tuple[str, List[Any], List[Any]]:
2466
- if isinstance(content, str):
2467
- return content, [], []
2468
- if not isinstance(content, list):
2469
- return (str(content) if content else ""), [], []
2470
-
2471
- text_parts: List[str] = []
2472
- images: List[Any] = []
2473
- videos: List[Any] = []
2474
- for item in content:
2475
- if isinstance(item, dict):
2476
- if item.get("type") == "image" or "image" in item or "image_url" in item:
2477
- image = item.get("image") or item.get("image_url")
2478
- if image is not None:
2479
- images.append(image)
2480
- elif item.get("type") == "video" or "video" in item or "video_path" in item:
2481
- video = item.get("video") or item.get("video_path")
2482
- if video is not None:
2483
- videos.append(video)
2484
-
2485
- if "text" in item and item["text"] is not None:
2486
- text_parts.append(str(item["text"]))
2487
- elif isinstance(item, str):
2488
- text_parts.append(item)
2489
-
2490
- return "".join(text_parts), images, videos
2491
-
2492
- @staticmethod
2493
- def _offline_resolve_use_template(query: Dict[str, Any]) -> bool:
2494
- return bool(query.get("use_template", False))
2495
-
2496
- def _offline_prepare_input_text(
2497
- self,
2498
- processor,
2499
- messages: List[Dict[str, Any]],
2500
- use_template: bool,
2501
- ) -> str:
2502
- if not use_template:
2503
- if any(isinstance(message, dict) and message.get("role") == "system" for message in messages):
2504
- raise ValueError("system messages require use_template=True")
2505
-
2506
- parts = ["<|im_start|>"]
2507
- for message in messages:
2508
- role = message.get("role", "user") if isinstance(message, dict) else "user"
2509
- content = message.get("content", "") if isinstance(message, dict) else message
2510
- text, msg_images, msg_videos = self._offline_extract_content_parts(content)
2511
-
2512
- if role == "user":
2513
- media_tokens = ""
2514
- if msg_images:
2515
- media_tokens += "<|image|>" * len(msg_images)
2516
- if msg_videos:
2517
- media_tokens += "<|video|>" * len(msg_videos)
2518
- parts.append(f"{media_tokens}{text}")
2519
- else:
2520
- parts.append(f"{text}<|im_end|>")
2521
- return "".join(parts)
2522
-
2523
- processed_messages = []
2524
- for message in messages:
2525
- message_copy = dict(message)
2526
- message_copy["content"] = self._offline_flatten_content_with_vision_tokens(
2527
- message_copy.get("content", "")
2528
- )
2529
- processed_messages.append(message_copy)
2530
- return processor.apply_chat_template(
2531
- processed_messages,
2532
- tokenize=False,
2533
- add_generation_prompt=True,
2534
- )
2535
-
2536
- @staticmethod
2537
- def _offline_collect_media(messages: List[Dict[str, Any]]) -> tuple[List[Any], List[Any]]:
2538
- all_images: List[Any] = []
2539
- all_videos: List[Any] = []
2540
-
2541
- for message in messages:
2542
- content = message.get("content")
2543
- if isinstance(content, list):
2544
- for item in content:
2545
- if not isinstance(item, dict):
2546
- continue
2547
- if item.get("type") == "image" or "image" in item:
2548
- image = item.get("image") or item.get("image_url")
2549
- if image is not None:
2550
- all_images.append(image)
2551
- elif item.get("type") == "video" or "video" in item:
2552
- video = item.get("video")
2553
- if video is not None:
2554
- all_videos.append(video)
2555
-
2556
- return all_images, all_videos
2557
-
2558
- def _offline_build_processor_kwargs(
2559
- self,
2560
- input_text: Union[str, List[str]],
2561
- all_images: List[Any],
2562
- all_videos: List[Any],
2563
- media_kwargs: Dict[str, Any],
2564
- ) -> Dict[str, Any]:
2565
- processor_kwargs: Dict[str, Any] = {
2566
- "text": input_text,
2567
- "images": all_images or None,
2568
- "videos": all_videos or None,
2569
- "return_tensors": "pt",
2570
- "padding": False,
2571
- }
2572
-
2573
- if media_kwargs.get("min_pixels") is not None:
2574
- processor_kwargs["min_pixels"] = media_kwargs["min_pixels"]
2575
- if media_kwargs.get("max_pixels") is not None:
2576
- processor_kwargs["max_pixels"] = media_kwargs["max_pixels"]
2577
- if media_kwargs.get("video_fps") is not None:
2578
- processor_kwargs["video_fps"] = media_kwargs["video_fps"]
2579
-
2580
- min_frames = media_kwargs.get("min_frames", media_kwargs.get("video_minlen"))
2581
- max_frames = media_kwargs.get("max_frames", media_kwargs.get("video_maxlen"))
2582
- if min_frames is not None:
2583
- processor_kwargs["min_frames"] = min_frames
2584
- if max_frames is not None:
2585
- processor_kwargs["max_frames"] = max_frames
2586
-
2587
- return processor_kwargs
2588
-
2589
- def _offline_prepare_inputs(self, processor, query: Dict[str, Any]):
2590
- messages = self._offline_prepare_messages(processor, query)
2591
- input_text = self._offline_prepare_input_text(
2592
- processor,
2593
- messages,
2594
- use_template=self._offline_resolve_use_template(query),
2595
- )
2596
- all_images, all_videos = self._offline_collect_media(messages)
2597
- media_kwargs = dict(query.get("media_kwargs") or {})
2598
- processor_kwargs = self._offline_build_processor_kwargs(
2599
- input_text,
2600
- all_images,
2601
- all_videos,
2602
- media_kwargs,
2603
- )
2604
-
2605
- image_proc = getattr(processor, "image_processor", None)
2606
- video_proc = getattr(processor, "video_processor", None)
2607
- modified_multi_image = False
2608
- modified_video = False
2609
-
2610
- with self._offline_processor_lock:
2611
- try:
2612
- multi_image_max_pixels = media_kwargs.get("multi_image_max_pixels")
2613
- if multi_image_max_pixels is not None and image_proc is not None:
2614
- orig_multi_image_max_pixels = getattr(image_proc, "multi_image_max_pixels", None)
2615
- image_proc.multi_image_max_pixels = multi_image_max_pixels
2616
- modified_multi_image = True
2617
-
2618
- video_max_pixels = media_kwargs.get("video_max_pixels")
2619
- if video_max_pixels is not None and video_proc is not None:
2620
- orig_video_max_pixels = getattr(video_proc, "video_max_pixels", None)
2621
- video_proc.video_max_pixels = video_max_pixels
2622
- modified_video = True
2623
-
2624
- inputs = processor(**processor_kwargs)
2625
- finally:
2626
- if modified_multi_image and image_proc is not None:
2627
- image_proc.multi_image_max_pixels = orig_multi_image_max_pixels
2628
- if modified_video and video_proc is not None:
2629
- video_proc.video_max_pixels = orig_video_max_pixels
2630
-
2631
- text_device = self.get_input_embeddings().weight.device
2632
- vision_device = self.visual.patch_embed.proj.weight.device
2633
- vision_input_keys = {"pixel_values", "grid_thw"}
2634
-
2635
- for key, value in list(inputs.items()):
2636
- if not isinstance(value, torch.Tensor):
2637
- continue
2638
-
2639
- target_device = vision_device if key in vision_input_keys else text_device
2640
- moved_value = value.to(target_device)
2641
- if moved_value.dtype == torch.float32:
2642
- moved_value = moved_value.to(torch.bfloat16)
2643
- inputs[key] = moved_value
2644
-
2645
- return inputs, input_text
2646
-
2647
- def _offline_build_session_messages(
2648
- self,
2649
- processor,
2650
- query: Dict[str, Any],
2651
- session_messages: List[Dict[str, Any]],
2652
- ) -> List[Dict[str, Any]]:
2653
- has_explicit_messages = bool(query.get("messages"))
2654
- if has_explicit_messages and not query.get("append_messages_to_session", False):
2655
- base_messages: List[Dict[str, Any]] = []
2656
- else:
2657
- base_messages = [dict(message) for message in session_messages]
2658
-
2659
- turn_messages = self._offline_prepare_messages(processor, query)
2660
- has_system_message = any(
2661
- isinstance(message, dict) and message.get("role") == "system"
2662
- for message in (base_messages + turn_messages)
2663
- )
2664
-
2665
- should_add_system_prompt = (
2666
- query.get("use_default_system_prompt", False)
2667
- or query.get("system_prompt") is not None
2668
- or query.get("system_prompt_type") is not None
2669
- or query.get("thinking_mode") is not None
2670
- )
2671
-
2672
- if not base_messages and not has_system_message and should_add_system_prompt:
2673
- system_prompt = self._offline_resolve_system_prompt(query, turn_messages)
2674
- if system_prompt is not None:
2675
- base_messages.append({"role": "system", "content": system_prompt})
2676
-
2677
- return base_messages + turn_messages
2678
-
2679
- @staticmethod
2680
- def _offline_query_contains_video(query: Dict[str, Any], messages: List[Dict[str, Any]]) -> bool:
2681
- if query.get("videos"):
2682
- return True
2683
-
2684
- for message in messages:
2685
- content = message.get("content") if isinstance(message, dict) else None
2686
- if isinstance(content, list) and any(
2687
- isinstance(item, dict) and (item.get("type") == "video" or "video" in item)
2688
- for item in content
2689
- ):
2690
- return True
2691
- return False
2692
-
2693
- @staticmethod
2694
- def _offline_normalize_thinking_mode(value: Optional[str]) -> str:
2695
- if value is None:
2696
- return "no_thinking"
2697
-
2698
- normalized = _OFFLINE_THINKING_MODE_ALIASES.get(str(value).strip().lower())
2699
- if normalized is None:
2700
- allowed = ", ".join(sorted(set(_OFFLINE_THINKING_MODE_ALIASES.values())))
2701
- raise ValueError(f"Unsupported thinking_mode: {value!r}. Supported values: {allowed}")
2702
- return normalized
2703
-
2704
- @staticmethod
2705
- def _offline_normalize_system_prompt_type(value: Optional[str], has_video: bool) -> str:
2706
- if value is None:
2707
- return "video" if has_video else "text_image"
2708
-
2709
- normalized_key = str(value).strip().lower().replace("/", "_").replace(" ", "_")
2710
- while "__" in normalized_key:
2711
- normalized_key = normalized_key.replace("__", "_")
2712
-
2713
- normalized = _OFFLINE_SYSTEM_PROMPT_TYPE_ALIASES.get(normalized_key)
2714
- if normalized is None:
2715
- allowed = ", ".join(sorted(set(_OFFLINE_SYSTEM_PROMPT_TYPE_ALIASES.values())))
2716
- raise ValueError(f"Unsupported system_prompt_type: {value!r}. Supported values: {allowed}")
2717
- return normalized
2718
-
2719
- def _offline_resolve_system_prompt(
2720
- self,
2721
- query: Dict[str, Any],
2722
- turn_messages: List[Dict[str, Any]],
2723
- ) -> Optional[str]:
2724
- explicit_system_prompt = query.get("system_prompt")
2725
- if explicit_system_prompt is not None:
2726
- return str(explicit_system_prompt)
2727
-
2728
- has_video = self._offline_query_contains_video(query, turn_messages)
2729
- thinking_mode = self._offline_normalize_thinking_mode(query.get("thinking_mode"))
2730
- system_prompt_type = self._offline_normalize_system_prompt_type(
2731
- query.get("system_prompt_type"),
2732
- has_video=has_video,
2733
- )
2734
- return _OFFLINE_SYSTEM_PROMPTS[thinking_mode][system_prompt_type]
2735
-
2736
- @staticmethod
2737
- def _offline_finalize_session_messages(
2738
- working_messages: List[Dict[str, Any]],
2739
- assistant_text: str,
2740
- ) -> List[Dict[str, Any]]:
2741
- next_messages = [dict(message) for message in working_messages]
2742
- next_messages.append({"role": "assistant", "content": assistant_text})
2743
- return next_messages
2744
-
2745
- def _offline_prepare_generation(self, processor, query: Dict[str, Any]):
2746
- inputs, input_text = self._offline_prepare_inputs(processor, query)
2747
- generate_kwargs = dict(query.get("generate_kwargs") or {})
2748
-
2749
- max_new_tokens = generate_kwargs.pop("max_new_tokens", 1024)
2750
- temperature = generate_kwargs.pop("temperature", 1.0)
2751
- top_k = generate_kwargs.pop("top_k", 50)
2752
- top_p = generate_kwargs.pop("top_p", 1.0)
2753
- repetition_penalty = generate_kwargs.pop("repetition_penalty", 1.0)
2754
- do_sample = generate_kwargs.pop("do_sample", False)
2755
- vision_chunked_length = generate_kwargs.pop("vision_chunked_length", None)
2756
-
2757
- if temperature is None:
2758
- temperature = 1.0
2759
- if temperature <= 0:
2760
- temperature = 1.0
2761
- do_sample = False
2762
-
2763
- call_kwargs = dict(
2764
- max_new_tokens=max_new_tokens,
2765
- temperature=temperature,
2766
- top_k=top_k,
2767
- top_p=top_p,
2768
- repetition_penalty=repetition_penalty,
2769
- do_sample=do_sample,
2770
- vision_chunked_length=vision_chunked_length,
2771
- **generate_kwargs,
2772
- )
2773
- return inputs, input_text, call_kwargs
2774
-
2775
- @staticmethod
2776
- def _offline_normalize_shared_mapping(
2777
- values: List[Dict[str, Any]],
2778
- mapping_name: str,
2779
- ) -> Dict[str, Any]:
2780
- normalized_values = [dict(value or {}) for value in values]
2781
- if not normalized_values:
2782
- return {}
2783
-
2784
- all_keys = set()
2785
- for value in normalized_values:
2786
- all_keys.update(value.keys())
2787
-
2788
- merged: Dict[str, Any] = {}
2789
- mismatched_keys: List[str] = []
2790
- for key in sorted(all_keys):
2791
- unique_values = {repr(value.get(key)) for value in normalized_values}
2792
- if len(unique_values) > 1:
2793
- mismatched_keys.append(key)
2794
- else:
2795
- merged[key] = normalized_values[0].get(key)
2796
-
2797
- if mismatched_keys:
2798
- mismatch_text = ", ".join(mismatched_keys)
2799
- raise ValueError(
2800
- f"All batch queries must share the same {mapping_name}. "
2801
- f"Mismatched keys: {mismatch_text}"
2802
- )
2803
- return merged
2804
-
2805
- def _offline_prepare_batch_generation(
2806
- self,
2807
- processor,
2808
- queries: List[Dict[str, Any]],
2809
- session_states: Optional[List[List[Dict[str, Any]]]] = None,
2810
- ):
2811
- if not queries:
2812
- raise ValueError("`queries` must contain at least one query.")
2813
-
2814
- if session_states is None:
2815
- session_states = [[] for _ in queries]
2816
- elif len(session_states) != len(queries):
2817
- raise ValueError("`session_states` must have the same length as `queries`.")
2818
-
2819
- working_messages_list: List[List[Dict[str, Any]]] = []
2820
- input_texts: List[str] = []
2821
- all_images_per_query: List[List[Any]] = []
2822
- all_videos_per_query: List[List[Any]] = []
2823
-
2824
- for query, session_state in zip(queries, session_states):
2825
- if not isinstance(query, dict):
2826
- raise TypeError("Each batch query must be a dict.")
2827
- if query.get("stop_offline_generate"):
2828
- raise ValueError("`stop_offline_generate` is not supported in offline_batch_generate.")
2829
- if query.get("stream_output", query.get("stream", False)):
2830
- raise ValueError("Streaming is not supported in offline_batch_generate.")
2831
- if query.get("cancel_current_generate") or query.get("stop_generation"):
2832
- raise ValueError("Cancel / stop controls are not supported in offline_batch_generate.")
2833
-
2834
- current_session = [] if query.get("reset_session") or query.get("clear_history") else session_state
2835
- working_messages = self._offline_build_session_messages(
2836
- processor,
2837
- query,
2838
- current_session,
2839
- )
2840
- working_messages_list.append(working_messages)
2841
- input_texts.append(
2842
- self._offline_prepare_input_text(
2843
- processor,
2844
- working_messages,
2845
- use_template=self._offline_resolve_use_template(query),
2846
- )
2847
- )
2848
-
2849
- all_images, all_videos = self._offline_collect_media(working_messages)
2850
- all_images_per_query.append(all_images)
2851
- all_videos_per_query.append(all_videos)
2852
-
2853
- media_kwargs = self._offline_normalize_shared_mapping(
2854
- [query.get("media_kwargs") or {} for query in queries],
2855
- mapping_name="media_kwargs",
2856
- )
2857
- processor_kwargs = self._offline_build_processor_kwargs(
2858
- input_text=input_texts,
2859
- all_images=[image for images in all_images_per_query for image in images],
2860
- all_videos=[video for videos in all_videos_per_query for video in videos],
2861
- media_kwargs=media_kwargs,
2862
- )
2863
- processor_kwargs["padding"] = True
2864
-
2865
- image_proc = getattr(processor, "image_processor", None)
2866
- video_proc = getattr(processor, "video_processor", None)
2867
- tokenizer = getattr(processor, "tokenizer", None)
2868
- modified_multi_image = False
2869
- modified_video = False
2870
- orig_padding_side = None
2871
-
2872
- with self._offline_processor_lock:
2873
- try:
2874
- multi_image_max_pixels = media_kwargs.get("multi_image_max_pixels")
2875
- if multi_image_max_pixels is not None and image_proc is not None:
2876
- orig_multi_image_max_pixels = getattr(image_proc, "multi_image_max_pixels", None)
2877
- image_proc.multi_image_max_pixels = multi_image_max_pixels
2878
- modified_multi_image = True
2879
-
2880
- video_max_pixels = media_kwargs.get("video_max_pixels")
2881
- if video_max_pixels is not None and video_proc is not None:
2882
- orig_video_max_pixels = getattr(video_proc, "video_max_pixels", None)
2883
- video_proc.video_max_pixels = video_max_pixels
2884
- modified_video = True
2885
-
2886
- if tokenizer is not None and hasattr(tokenizer, "padding_side"):
2887
- orig_padding_side = tokenizer.padding_side
2888
- tokenizer.padding_side = "left"
2889
-
2890
- inputs = processor(**processor_kwargs)
2891
- finally:
2892
- if modified_multi_image and image_proc is not None:
2893
- image_proc.multi_image_max_pixels = orig_multi_image_max_pixels
2894
- if modified_video and video_proc is not None:
2895
- video_proc.video_max_pixels = orig_video_max_pixels
2896
- if tokenizer is not None and orig_padding_side is not None:
2897
- tokenizer.padding_side = orig_padding_side
2898
-
2899
- text_device = self.get_input_embeddings().weight.device
2900
- vision_device = self.visual.patch_embed.proj.weight.device
2901
- vision_input_keys = {"pixel_values", "grid_thw"}
2902
-
2903
- for key, value in list(inputs.items()):
2904
- if not isinstance(value, torch.Tensor):
2905
- continue
2906
-
2907
- target_device = vision_device if key in vision_input_keys else text_device
2908
- moved_value = value.to(target_device)
2909
- if moved_value.dtype == torch.float32:
2910
- moved_value = moved_value.to(torch.bfloat16)
2911
- inputs[key] = moved_value
2912
-
2913
- generate_kwargs = self._offline_normalize_shared_mapping(
2914
- [query.get("generate_kwargs") or {} for query in queries],
2915
- mapping_name="generate_kwargs",
2916
- )
2917
- max_new_tokens = generate_kwargs.pop("max_new_tokens", 1024)
2918
- temperature = generate_kwargs.pop("temperature", 1.0)
2919
- top_k = generate_kwargs.pop("top_k", 50)
2920
- top_p = generate_kwargs.pop("top_p", 1.0)
2921
- repetition_penalty = generate_kwargs.pop("repetition_penalty", 1.0)
2922
- do_sample = generate_kwargs.pop("do_sample", False)
2923
- vision_chunked_length = generate_kwargs.pop("vision_chunked_length", None)
2924
-
2925
- if temperature is None:
2926
- temperature = 1.0
2927
- if temperature <= 0:
2928
- temperature = 1.0
2929
- do_sample = False
2930
-
2931
- call_kwargs = dict(
2932
- max_new_tokens=max_new_tokens,
2933
- temperature=temperature,
2934
- top_k=top_k,
2935
- top_p=top_p,
2936
- repetition_penalty=repetition_penalty,
2937
- do_sample=do_sample,
2938
- vision_chunked_length=vision_chunked_length,
2939
- **generate_kwargs,
2940
- )
2941
- return inputs, input_texts, working_messages_list, call_kwargs
2942
-
2943
- def offline_batch_generate(
2944
- self,
2945
- processor,
2946
- queries: List[Dict[str, Any]],
2947
- session_states: Optional[List[List[Dict[str, Any]]]] = None,
2948
- vision_chunked_length: int = 64,
2949
- ) -> Dict[str, Any]:
2950
- """
2951
- Batch offline generation for multiple independent samples.
2952
-
2953
- This method supports:
2954
- - batched single-turn generation
2955
- - batched multi-turn continuation through `session_states`
2956
-
2957
- It intentionally does not support queue-style controls such as:
2958
- - `stream_output`
2959
- - `cancel_current_generate`
2960
- - `stop_generation`
2961
- - `stop_offline_generate`
2962
- """
2963
- if not queries:
2964
- return {"results": [], "session_states": []}
2965
-
2966
- prepared_queries = [dict(query) for query in queries]
2967
- for query in prepared_queries:
2968
- generate_kwargs = query.setdefault("generate_kwargs", {})
2969
- generate_kwargs.setdefault("vision_chunked_length", vision_chunked_length)
2970
- if session_states is None:
2971
- session_states = [[] for _ in prepared_queries]
2972
- elif len(session_states) != len(prepared_queries):
2973
- raise ValueError("`session_states` must have the same length as `queries`.")
2974
-
2975
- tokenizer = getattr(processor, "tokenizer", None)
2976
- bucketed_indices: Dict[Any, List[int]] = {}
2977
- for index, (query, session_state) in enumerate(zip(prepared_queries, session_states)):
2978
- current_session = [] if query.get("reset_session") or query.get("clear_history") else session_state
2979
- working_messages = self._offline_build_session_messages(processor, query, current_session)
2980
- input_text = self._offline_prepare_input_text(
2981
- processor,
2982
- working_messages,
2983
- use_template=self._offline_resolve_use_template(query),
2984
- )
2985
-
2986
- if tokenizer is not None:
2987
- token_ids = tokenizer(input_text, add_special_tokens=False)["input_ids"]
2988
- bucket_key = len(token_ids)
2989
- else:
2990
- bucket_key = len(input_text)
2991
- bucketed_indices.setdefault(bucket_key, []).append(index)
2992
-
2993
- results: List[Optional[Dict[str, Any]]] = [None] * len(prepared_queries)
2994
- next_session_states: List[Optional[List[Dict[str, Any]]]] = [None] * len(prepared_queries)
2995
-
2996
- for bucket_indices in bucketed_indices.values():
2997
- bucket_queries = [prepared_queries[index] for index in bucket_indices]
2998
- bucket_session_states = [session_states[index] for index in bucket_indices]
2999
- inputs, input_texts, working_messages_list, call_kwargs = self._offline_prepare_batch_generation(
3000
- processor,
3001
- bucket_queries,
3002
- session_states=bucket_session_states,
3003
- )
3004
-
3005
- with torch.no_grad():
3006
- outputs = self.generate(
3007
- **inputs,
3008
- **call_kwargs,
3009
- )
3010
-
3011
- input_seq_len = inputs["input_ids"].shape[1]
3012
- generated_tokens = outputs[:, input_seq_len:]
3013
- decoded_texts = processor.batch_decode(generated_tokens, skip_special_tokens=True)
3014
-
3015
- for local_index, (query, input_text, working_messages, text) in enumerate(
3016
- zip(bucket_queries, input_texts, working_messages_list, decoded_texts)
3017
- ):
3018
- original_index = bucket_indices[local_index]
3019
- if query.get("persist_session", True):
3020
- next_session_state = self._offline_finalize_session_messages(working_messages, text)
3021
- else:
3022
- next_session_state = working_messages
3023
- next_session_states[original_index] = next_session_state
3024
- results[original_index] = {
3025
- "index": original_index,
3026
- "text": text,
3027
- "input_text": input_text,
3028
- "messages": working_messages,
3029
- }
3030
-
3031
- return {
3032
- "results": [item for item in results if item is not None],
3033
- "session_states": [item for item in next_session_states if item is not None],
3034
- }
3035
-
3036
- def _offline_generate_one(self, processor, query: Dict[str, Any]) -> str:
3037
- working_messages = self._offline_build_session_messages(processor, query, [])
3038
- generation_query = dict(query)
3039
- generation_query["messages"] = working_messages
3040
- inputs, _, call_kwargs = self._offline_prepare_generation(processor, generation_query)
3041
-
3042
- with torch.no_grad():
3043
- outputs = self.generate(
3044
- **inputs,
3045
- **call_kwargs,
3046
- )
3047
-
3048
- new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
3049
- return processor.decode(new_tokens, skip_special_tokens=True)
3050
-
3051
- @staticmethod
3052
- def _offline_capture_processor_attrs(target, overrides: Optional[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
3053
- if target is None or not overrides:
3054
- return None
3055
- return {name: copy.deepcopy(getattr(target, name)) for name in overrides}
3056
-
3057
- @staticmethod
3058
- def _offline_apply_processor_attrs(target, overrides: Optional[Dict[str, Any]]) -> None:
3059
- if target is None or not overrides:
3060
- return
3061
- for name, value in overrides.items():
3062
- setattr(target, name, copy.deepcopy(value))
3063
-
3064
- @staticmethod
3065
- def _offline_restore_processor_attrs(target, snapshot: Optional[Dict[str, Any]]) -> None:
3066
- if target is None or snapshot is None:
3067
- return
3068
- for name, value in snapshot.items():
3069
- setattr(target, name, copy.deepcopy(value))
3070
-
3071
- def _offline_generate_one_with_processor_overrides(
3072
- self,
3073
- processor,
3074
- query: Dict[str, Any],
3075
- image_processor_overrides: Optional[Dict[str, Any]] = None,
3076
- video_processor_overrides: Optional[Dict[str, Any]] = None,
3077
- ) -> str:
3078
- image_proc = getattr(processor, "image_processor", None)
3079
- video_proc = getattr(processor, "video_processor", None)
3080
- image_snapshot = self._offline_capture_processor_attrs(image_proc, image_processor_overrides)
3081
- video_snapshot = self._offline_capture_processor_attrs(video_proc, video_processor_overrides)
3082
-
3083
- with self._offline_processor_lock:
3084
- try:
3085
- self._offline_apply_processor_attrs(image_proc, image_processor_overrides)
3086
- self._offline_apply_processor_attrs(video_proc, video_processor_overrides)
3087
- return self._offline_generate_one(processor, query)
3088
- finally:
3089
- self._offline_restore_processor_attrs(image_proc, image_snapshot)
3090
- self._offline_restore_processor_attrs(video_proc, video_snapshot)
3091
-
3092
- def offline_image_generate(
3093
- self,
3094
- processor,
3095
- prompt: str = "",
3096
- image: Any = None,
3097
- *,
3098
- shortest_edge: int = 4096,
3099
- longest_edge: int = 16777216,
3100
- multi_image_max_pixels: int = 201326592,
3101
- patch_size: int = 16,
3102
- temporal_patch_size: int = 1,
3103
- merge_size: int = 2,
3104
- image_mean: Optional[Union[List[float], Tuple[float, ...]]] = (0.5, 0.5, 0.5),
3105
- image_std: Optional[Union[List[float], Tuple[float, ...]]] = (0.5, 0.5, 0.5),
3106
- max_new_tokens: int = 1024,
3107
- temperature: float = 1.0,
3108
- top_k: int = 50,
3109
- top_p: float = 1.0,
3110
- repetition_penalty: float = 1.0,
3111
- do_sample: bool = False,
3112
- vision_chunked_length: int = 64,
3113
- use_template: bool = False,
3114
- thinking_mode: Optional[str] = None,
3115
- system_prompt_type: Optional[str] = None,
3116
- system_prompt: Optional[str] = None,
3117
- ) -> str:
3118
- """
3119
- Single-image offline generation with explicit image preprocessor defaults.
3120
-
3121
- The default values mirror `preprocessor_config.json` so README examples can
3122
- surface the full image preprocessing setup without requiring a batch wrapper.
3123
- """
3124
- if image is None:
3125
- raise ValueError("`image` is required.")
3126
- query: Dict[str, Any] = {
3127
- "prompt": prompt,
3128
- "images": [image],
3129
- "videos": [],
3130
- "media_kwargs": {
3131
- "min_pixels": shortest_edge,
3132
- "max_pixels": longest_edge,
3133
- "multi_image_max_pixels": multi_image_max_pixels,
3134
- },
3135
- "generate_kwargs": {
3136
- "max_new_tokens": max_new_tokens,
3137
- "temperature": temperature,
3138
- "top_k": top_k,
3139
- "top_p": top_p,
3140
- "repetition_penalty": repetition_penalty,
3141
- "do_sample": do_sample,
3142
- "vision_chunked_length": vision_chunked_length,
3143
- },
3144
- "use_template": use_template,
3145
- }
3146
- if thinking_mode is not None:
3147
- query["thinking_mode"] = thinking_mode
3148
- if system_prompt_type is not None:
3149
- query["system_prompt_type"] = system_prompt_type
3150
- if system_prompt is not None:
3151
- query["system_prompt"] = system_prompt
3152
-
3153
- image_processor_overrides = {
3154
- "size": {"shortest_edge": shortest_edge, "longest_edge": longest_edge},
3155
- "multi_image_max_pixels": multi_image_max_pixels,
3156
- "patch_size": patch_size,
3157
- "temporal_patch_size": temporal_patch_size,
3158
- "merge_size": merge_size,
3159
- "image_mean": list(image_mean) if image_mean is not None else None,
3160
- "image_std": list(image_std) if image_std is not None else None,
3161
- }
3162
- return self._offline_generate_one_with_processor_overrides(
3163
- processor,
3164
- query,
3165
- image_processor_overrides=image_processor_overrides,
3166
- )
3167
-
3168
- def offline_video_generate(
3169
- self,
3170
- processor,
3171
- prompt: str = "",
3172
- video: Any = None,
3173
- *,
3174
- shortest_edge: int = 4096,
3175
- longest_edge: int = 16777216,
3176
- video_max_pixels: int = 201326592,
3177
- patch_size: int = 16,
3178
- temporal_patch_size: int = 1,
3179
- merge_size: int = 2,
3180
- video_fps: float = 1.0,
3181
- min_frames: int = 1,
3182
- max_frames: int = 256,
3183
- num_extract_threads: int = 4,
3184
- image_mean: Optional[Union[List[float], Tuple[float, ...]]] = (0.5, 0.5, 0.5),
3185
- image_std: Optional[Union[List[float], Tuple[float, ...]]] = (0.5, 0.5, 0.5),
3186
- max_new_tokens: int = 1024,
3187
- temperature: float = 1.0,
3188
- top_k: int = 50,
3189
- top_p: float = 1.0,
3190
- repetition_penalty: float = 1.0,
3191
- do_sample: bool = False,
3192
- vision_chunked_length: int = 64,
3193
- use_template: bool = False,
3194
- thinking_mode: Optional[str] = None,
3195
- system_prompt_type: Optional[str] = None,
3196
- system_prompt: Optional[str] = None,
3197
- ) -> str:
3198
- """
3199
- Single-video offline generation with explicit video preprocessor defaults.
3200
-
3201
- The default values mirror `video_preprocessor_config.json` so README examples
3202
- can show a standalone video entry point with the effective preprocessing knobs.
3203
- """
3204
- if video is None:
3205
- raise ValueError("`video` is required.")
3206
- query: Dict[str, Any] = {
3207
- "prompt": prompt,
3208
- "images": [],
3209
- "videos": [video],
3210
- "media_kwargs": {
3211
- "min_pixels": shortest_edge,
3212
- "max_pixels": longest_edge,
3213
- "video_max_pixels": video_max_pixels,
3214
- "video_fps": video_fps,
3215
- "min_frames": min_frames,
3216
- "max_frames": max_frames,
3217
- },
3218
- "generate_kwargs": {
3219
- "max_new_tokens": max_new_tokens,
3220
- "temperature": temperature,
3221
- "top_k": top_k,
3222
- "top_p": top_p,
3223
- "repetition_penalty": repetition_penalty,
3224
- "do_sample": do_sample,
3225
- "vision_chunked_length": vision_chunked_length,
3226
- },
3227
- "use_template": use_template,
3228
- }
3229
- if thinking_mode is not None:
3230
- query["thinking_mode"] = thinking_mode
3231
- if system_prompt_type is not None:
3232
- query["system_prompt_type"] = system_prompt_type
3233
- if system_prompt is not None:
3234
- query["system_prompt"] = system_prompt
3235
-
3236
- video_processor_overrides = {
3237
- "size": {"shortest_edge": shortest_edge, "longest_edge": longest_edge},
3238
- "video_max_pixels": video_max_pixels,
3239
- "patch_size": patch_size,
3240
- "temporal_patch_size": temporal_patch_size,
3241
- "merge_size": merge_size,
3242
- "video_fps": video_fps,
3243
- "min_frames": min_frames,
3244
- "max_frames": max_frames,
3245
- "num_extract_threads": num_extract_threads,
3246
- "image_mean": list(image_mean) if image_mean is not None else None,
3247
- "image_std": list(image_std) if image_std is not None else None,
3248
- }
3249
- return self._offline_generate_one_with_processor_overrides(
3250
- processor,
3251
- query,
3252
- video_processor_overrides=video_processor_overrides,
3253
- )
3254
-
3255
- def offline_generate(
3256
- self,
3257
- processor,
3258
- new_queries: "queue.Queue[dict]",
3259
- output_text_queue: "queue.Queue[str]",
3260
- vision_chunked_length: int = 64,
3261
- ) -> None:
3262
- """
3263
- HF-style offline inference wrapper aligned with the previous backend output path.
3264
-
3265
- This method intentionally reuses the checkpoint's existing processor and
3266
- `generate()` flow so that outputs stay consistent with the old external
3267
- backend inference implementation.
3268
-
3269
- Supported query keys include:
3270
- - `prompt` / `messages`
3271
- - `images` / `videos`
3272
- - `media_kwargs` / `generate_kwargs`
3273
- - `use_template` to switch between backend-style pretrain prompting
3274
- (`False`, default for base) and tokenizer chat template prompting (`True`)
3275
- - `thinking_mode` (`no_thinking` or `deep_thinking`, plus compatible aliases)
3276
- - `system_prompt_type` (`text_image` or `video`, plus compatible aliases)
3277
- - `system_prompt` for an explicit override
3278
- - `stream_output` / `stream`
3279
- - `reset_session` / `clear_history`
3280
- - `cancel_current_generate` / `stop_generation` / `stop_offline_generate`
3281
- """
3282
- buffered_queries: List[Dict[str, Any]] = []
3283
- session_messages: List[Dict[str, Any]] = []
3284
-
3285
- while True:
3286
- if buffered_queries:
3287
- query = buffered_queries.pop(0)
3288
- else:
3289
- query = new_queries.get()
3290
- if not isinstance(query, dict):
3291
- continue
3292
-
3293
- if query.get("stop_offline_generate"):
3294
- break
3295
-
3296
- if query.get("reset_session") or query.get("clear_history"):
3297
- session_messages = []
3298
-
3299
- try:
3300
- generate_kwargs = query.setdefault("generate_kwargs", {})
3301
- generate_kwargs.setdefault("vision_chunked_length", vision_chunked_length)
3302
- working_messages = self._offline_build_session_messages(
3303
- processor,
3304
- query,
3305
- session_messages,
3306
- )
3307
-
3308
- generation_query = dict(query)
3309
- generation_query["messages"] = working_messages
3310
- inputs, input_text, call_kwargs = self._offline_prepare_generation(processor, generation_query)
3311
-
3312
- stream_output = bool(query.get("stream_output", query.get("stream", False)))
3313
- cancel_event = threading.Event()
3314
- stopping_criteria = StoppingCriteriaList([_OfflineCancelStoppingCriteria(cancel_event)])
3315
- generation_state: Dict[str, Any] = {}
3316
-
3317
- if stream_output:
3318
- output_text_queue.put("<|round_start|>")
3319
- streamer = _OfflineQueueStreamer(getattr(processor, "tokenizer", processor), output_text_queue)
3320
- else:
3321
- streamer = None
3322
-
3323
- def _run_generation():
3324
- try:
3325
- with torch.no_grad():
3326
- generation_state["outputs"] = self.generate(
3327
- **inputs,
3328
- stopping_criteria=stopping_criteria,
3329
- streamer=streamer,
3330
- **call_kwargs,
3331
- )
3332
- except Exception as exc:
3333
- generation_state["exception"] = exc
3334
-
3335
- worker = threading.Thread(target=_run_generation, daemon=True)
3336
- worker.start()
3337
-
3338
- stop_conversation_after_turn = False
3339
- while worker.is_alive():
3340
- try:
3341
- control_query = new_queries.get(timeout=0.1)
3342
- except queue.Empty:
3343
- continue
3344
-
3345
- if not isinstance(control_query, dict):
3346
- continue
3347
-
3348
- if control_query.get("cancel_current_generate") or control_query.get("stop_generation"):
3349
- cancel_event.set()
3350
- stop_conversation_after_turn = stop_conversation_after_turn or control_query.get("stop_offline_generate", False)
3351
- continue
3352
-
3353
- if control_query.get("stop_offline_generate"):
3354
- cancel_event.set()
3355
- stop_conversation_after_turn = True
3356
- continue
3357
-
3358
- buffered_queries.append(control_query)
3359
-
3360
- worker.join()
3361
- was_cancelled = cancel_event.is_set()
3362
-
3363
- if "exception" in generation_state:
3364
- raise generation_state["exception"]
3365
-
3366
- if stream_output and streamer is not None:
3367
- text = "".join(streamer.collected_chunks)
3368
- else:
3369
- outputs = generation_state["outputs"]
3370
- new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
3371
- text = processor.decode(new_tokens, skip_special_tokens=True)
3372
- output_text_queue.put(text)
3373
-
3374
- if query.get("persist_session", True) and (not was_cancelled or query.get("persist_cancelled_turn", False)):
3375
- session_messages = self._offline_finalize_session_messages(working_messages, text)
3376
-
3377
- output_text_queue.put("<|round_end|>")
3378
-
3379
- if stop_conversation_after_turn:
3380
- break
3381
- except Exception as exc:
3382
- output_text_queue.put(f"[ERROR] {exc}")
3383
- output_text_queue.put("<|round_end|>")
3384
-
3385
 
3386
  __all__ = [
3387
  "MossVLVisionModel",
 
14
  # limitations under the License.
15
  """PyTorch MossVL model - Qwen3VL Vision + Text with Cross Attention"""
16
 
 
17
  from dataclasses import dataclass
18
+ from typing import Any, Callable, Optional, Union, Tuple, List
 
 
19
 
20
  import torch
21
  import torch.nn as nn
22
  import torch.nn.functional as F
23
 
24
+ from transformers import initialization as init
25
+
26
  from transformers.activations import ACT2FN
27
  from transformers.cache_utils import Cache, DynamicCache
28
  from transformers.generation import GenerationMixin
 
 
29
  from transformers.integrations import use_kernel_forward_from_hub
30
  from transformers.masking_utils import create_causal_mask
31
  from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
 
36
  from transformers.processing_utils import Unpack
37
  from transformers.utils import TransformersKwargs, auto_docstring, is_torchdynamo_compiling, logging
38
  from transformers.utils.deprecation import deprecate_kwarg
39
+ from transformers.utils.generic import is_flash_attention_requested
40
+ from transformers.utils.output_capturing import OutputRecorder
41
 
42
  from .configuration_moss_vl import MossVLConfig, MossVLTextConfig, MossVLVisionConfig
43
 
 
45
 
46
  logger = logging.get_logger(__name__)
47
 
48
 
49
  @dataclass
50
  class MossVLModelOutputWithPast(ModelOutput):
 
144
 
145
 
146
  class MossVLVisionRotaryEmbedding(nn.Module):
147
+ inv_freq: torch.Tensor # fix linting for `register_buffer`
148
 
149
  def __init__(self, dim: int, theta: float = 10000.0) -> None:
150
  super().__init__()
151
+ # Keep dim / theta so that `_init_weights` can rebuild `inv_freq` after
152
+ # from_pretrained materializes the module (it is a non-persistent buffer
153
+ # and therefore never populated by the checkpoint).
154
+ self.dim = dim
155
+ self.theta = theta
156
+ inv_freq = self.compute_inv_freq()
157
  self.register_buffer("inv_freq", inv_freq, persistent=False)
158
 
159
+ def compute_inv_freq(self) -> torch.Tensor:
160
+ return 1.0 / (self.theta ** (torch.arange(0, self.dim, 2, dtype=torch.float) / self.dim))
161
+
162
  def forward(self, seqlen: int) -> torch.Tensor:
163
  seq = torch.arange(seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
164
  freqs = torch.outer(seq, self.inv_freq)
 
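As a quick sanity check of the frequency table defined above (a standalone sketch with toy sizes, not the model's real dimensions), `compute_inv_freq` is the standard RoPE inverse-frequency ladder and `forward` just takes its outer product with the position index:

import torch

dim, theta, seqlen = 8, 10000.0, 4  # toy values for illustration only
inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
freqs = torch.outer(torch.arange(seqlen, dtype=torch.float), inv_freq)
print(inv_freq.shape, freqs.shape)  # torch.Size([4]) torch.Size([4, 4])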
187
  self.act_fn = nn.GELU()
188
  self.linear_fc2 = nn.Linear(self.input_hidden_size, config.out_hidden_size)
189
 
190
+ def forward(
191
+ self,
192
+ last_hidden_state: torch.Tensor,
193
+ deepstack_features: Optional[List[torch.Tensor]] = None,
194
+ ) -> torch.Tensor:
195
  # 1. Collect all features: [last_hidden_state, deepstack_1, deepstack_2, ...]
196
  # self.norms[0] corresponds to last_hidden_state
197
  # self.norms[1:] corresponds to deepstack_features
198
+ if deepstack_features is None:
199
+ deepstack_features = []
200
  all_inputs = [last_hidden_state] + deepstack_features
201
 
202
  # 2. Apply Norm independently
 
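A minimal sketch of the norm/feature pairing described in the comments above, assuming `self.norms` holds one norm per input; the concatenation at the end is illustrative, not necessarily the module's exact merge step:

import torch
import torch.nn as nn

hidden = 16
norms = nn.ModuleList([nn.LayerNorm(hidden) for _ in range(3)])  # norms[0] -> last_hidden_state
last_hidden_state = torch.randn(5, hidden)
deepstack_features = [torch.randn(5, hidden), torch.randn(5, hidden)]
all_inputs = [last_hidden_state] + deepstack_features
normed = [norm(x) for norm, x in zip(norms, all_inputs)]
print(torch.cat(normed, dim=-1).shape)  # torch.Size([5, 48])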
305
  key_states = key_states.transpose(0, 1).unsqueeze(0)
306
  value_states = value_states.transpose(0, 1).unsqueeze(0)
307
 
308
+ attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
309
+ self.config._attn_implementation, eager_attention_forward
310
+ )
311
 
312
+ if is_flash_attention_requested(self.config):
313
  max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max()
314
  attn_output, _ = attention_interface(
315
  self,
 
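The selection logic the new lines depend on can be pictured as a registry lookup with an eager fallback. This standalone sketch mirrors (but does not call) `ALL_ATTENTION_FUNCTIONS.get_interface` and `is_flash_attention_requested`, whose exact behavior is taken on trust from transformers 5.x:

from typing import Callable, Dict

def pick_attention(registry: Dict[str, Callable], attn_implementation: str, eager_fn: Callable) -> Callable:
    # Fall back to the eager implementation when no kernel is registered under that name.
    return registry.get(attn_implementation, eager_fn)

def eager_attention(*args, **kwargs):
    return "eager"  # stand-in for eager_attention_forward

print(pick_attention({"sdpa": lambda *a, **k: "sdpa"}, "flash_attention_2", eager_attention)())  # eager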
388
 
389
  def __init__(self, config: MossVLTextConfig, device=None):
390
  super().__init__()
 
 
 
 
 
391
  self.max_seq_len_cached = config.max_position_embeddings
392
  self.original_max_seq_len = config.max_position_embeddings
393
 
394
  self.config = config
395
+ rope_parameters = getattr(config, "rope_parameters", None)
396
+ if rope_parameters is None:
397
+ rope_parameters = getattr(config, "rope_scaling", None) or {"rope_type": "default"}
398
 
399
+ self.rope_type = rope_parameters.get("rope_type", rope_parameters.get("type", "default"))
400
+ rope_init_fn: Callable = self.compute_default_rope_parameters
401
+ if self.rope_type != "default":
402
+ rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
403
+
404
+ inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
405
  self.register_buffer("inv_freq", inv_freq, persistent=False)
406
+ self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
407
 
408
+ self.mrope_section = rope_parameters.get("mrope_section", [24, 20, 20])
409
 
410
+ @staticmethod
411
+ def compute_default_rope_parameters(
412
+ config: Optional[MossVLTextConfig] = None,
413
+ device: Optional[torch.device] = None,
414
+ seq_len: Optional[int] = None,
415
+ ) -> tuple[torch.Tensor, float]:
416
+ rope_parameters = getattr(config, "rope_parameters", None) or {}
417
+ base = rope_parameters.get("rope_theta", getattr(config, "rope_theta", 10000.0))
418
+ head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads
419
+ partial_rotary_factor = rope_parameters.get(
420
+ "partial_rotary_factor", getattr(config, "partial_rotary_factor", 1.0)
421
+ )
422
+ dim = int(head_dim * partial_rotary_factor)
423
+
424
+ attention_factor = 1.0
425
+ inv_freq = 1.0 / (
426
+ base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim)
427
+ )
428
+ return inv_freq, attention_factor
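Worked example of the default branch above with toy numbers (head_dim = 8, rope_theta = 10000, partial_rotary_factor = 1.0, none of which are the checkpoint's real values):

import torch

base, head_dim, partial_rotary_factor = 10000.0, 8, 1.0
dim = int(head_dim * partial_rotary_factor)
inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float() / dim))
print(inv_freq)  # tensor([1.0000, 0.1000, 0.0100, 0.0010])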
429
 
430
  def apply_interleaved_mrope(self, freqs, mrope_section):
431
  """Apply interleaved MRoPE to 3D rotary embeddings.
 
447
  @torch.no_grad()
448
  @dynamic_rope_update
449
  def forward(self, x, position_ids):
 
450
  if position_ids.ndim == 2:
451
  position_ids = position_ids[None, ...].expand(3, position_ids.shape[0], -1)
452
 
 
547
  query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
548
 
549
  if past_key_values is not None:
550
+ key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx)
 
551
 
552
+ attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
553
+ self.config._attn_implementation, eager_attention_forward
554
+ )
555
 
556
  attn_output, attn_weights = attention_interface(
557
  self,
 
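A standalone sketch of the `Cache.update` contract the new call sites use (toy shapes; `DynamicCache` is already imported at the top of this file): prefill returns the states unchanged, and each decode step returns the concatenation of cached and new states for that layer.

import torch
from transformers.cache_utils import DynamicCache

cache = DynamicCache()
key = torch.randn(1, 2, 4, 8)    # (batch, num_heads, seq_len, head_dim)
value = torch.randn(1, 2, 4, 8)
k, v = cache.update(key, value, layer_idx=0)                          # prefill: seq_len 4
k, v = cache.update(key[:, :, -1:], value[:, :, -1:], layer_idx=0)    # decode: seq_len 5
print(k.shape)  # torch.Size([1, 2, 5, 8])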
600
  attention_mask: Optional[torch.Tensor] = None,
601
  past_key_values: Optional[Cache] = None,
602
  use_cache: bool = None,
603
+ cache_position: Optional[torch.LongTensor] = None,
604
  query_position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
605
  vision_position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
606
  **kwargs,
 
634
  if past_key_values is not None:
635
  # if we have a new image + new tokens, we only computed key_states on that new image
636
  # we still update the cross key states, past_image, new_image. And use it!
637
+ key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx)
 
 
638
 
639
  elif cache_position[0] != 0:
640
  key_states, value_states = (
 
646
  "Cross attention layer can't find neither `cross_attn_states` nor cached values for key/values!"
647
  )
648
 
649
+ if is_flash_attention_requested(self.config):
650
+ # Cross attention still relies on an explicit dense mask, so fall back to SDPA here.
651
+ attention_interface: Callable = ALL_ATTENTION_FUNCTIONS["sdpa"]
652
+ else:
653
+ attention_interface = ALL_ATTENTION_FUNCTIONS.get_interface(
654
+ self.config._attn_implementation, eager_attention_forward
655
+ )
656
 
657
  attn_output, attn_weights = attention_interface(
658
  self,
 
713
  use_cache: Optional[bool] = False,
714
  cache_position: Optional[torch.LongTensor] = None,
715
  vision_position_ids: Optional[torch.LongTensor] = None,
 
716
  vision_position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
717
+ output_attentions: bool = False,
718
  **kwargs: Unpack[TransformersKwargs],
719
+ ) -> tuple[torch.Tensor, ...]:
720
  # Self Attention
721
  residual = hidden_states
722
  hidden_states = self.input_layernorm(hidden_states)
723
+ hidden_states, attn_weights = self.self_attn(
724
  hidden_states=hidden_states,
725
  attention_mask=attention_mask,
726
  past_key_values=past_key_values,
 
735
  hidden_states = self.post_attention_layernorm(hidden_states)
736
  hidden_states = self.mlp(hidden_states)
737
  hidden_states = residual + hidden_states
738
+
739
+ outputs = (hidden_states,)
740
+ if output_attentions:
741
+ outputs += (attn_weights,)
742
+ return outputs
743
 
744
 
745
  class MossVLCrossAttentionDecoderLayer(GradientCheckpointingLayer):
 
775
  use_cache: Optional[bool] = False,
776
  cache_position: Optional[torch.LongTensor] = None,
777
  vision_position_ids: Optional[torch.LongTensor] = None,
 
778
  vision_position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
779
+ output_attentions: bool = False,
780
  **kwargs: Unpack[TransformersKwargs],
781
+ ) -> tuple[torch.Tensor, ...]:
782
  # Cross Attention
783
  residual = hidden_states
784
  hidden_states = self.input_layernorm(hidden_states)
785
 
786
+ hidden_states, attn_weights = self.cross_attn(
787
  hidden_states=hidden_states,
788
  cross_attention_states=cross_attention_states,
789
  attention_mask=cross_attention_mask,
790
  past_key_values=past_key_values,
791
  use_cache=use_cache,
792
+ cache_position=cache_position,
793
  query_position_embeddings=position_embeddings,
794
  vision_position_embeddings=vision_position_embeddings,
795
  )
 
806
  hidden_states = full_text_row_masked_out_mask[:, 0] * hidden_states
807
 
808
  hidden_states = residual + self.cross_attn_mlp_gate.tanh() * hidden_states
809
+
810
+ outputs = (hidden_states,)
811
+ if output_attentions:
812
+ outputs += (attn_weights,)
813
+ return outputs
814
 
815
 
816
 
 
836
 
837
  def _init_weights(self, module):
838
  """Initialize the weights.
 
 
 
839
  """
840
+ super()._init_weights(module)
841
+ if isinstance(module, MossVLVisionRotaryEmbedding):
842
+ init.copy_(module.inv_freq, module.compute_inv_freq())
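Why the override matters, shown with a standalone toy module: buffers registered with `persistent=False` are excluded from the state_dict, so nothing in the checkpoint restores `inv_freq` and it must be recomputed after loading (the `init.copy_` helper above is assumed to perform an in-place copy into the buffer):

import torch
import torch.nn as nn

class Toy(nn.Module):
    def __init__(self):
        super().__init__()
        self.register_buffer("inv_freq", torch.ones(4), persistent=False)

print("inv_freq" in Toy().state_dict())  # False -> must be rebuilt after from_pretrained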
843
 
844
 
845
 
 
915
 
916
  def fast_pos_embed_interpolate(self, grid_thw):
917
  grid_ts, grid_hs, grid_ws = grid_thw[:, 0], grid_thw[:, 1], grid_thw[:, 2]
918
+ device = self.pos_embed.weight.device
919
+ dtype = self.pos_embed.weight.dtype
920
 
921
+ idx_parts = [[] for _ in range(4)]
922
+ weight_parts = [[] for _ in range(4)]
923
 
924
  for t, h, w in zip(grid_ts, grid_hs, grid_ws):
925
+ h_idxs = torch.linspace(0, self.num_grid_per_side - 1, h, device=device)
926
+ w_idxs = torch.linspace(0, self.num_grid_per_side - 1, w, device=device)
927
 
928
  h_idxs_floor = h_idxs.int()
929
  w_idxs_floor = w_idxs.int()
 
951
  ]
952
 
953
  for i in range(4):
954
+ idx_parts[i].append(indices[i])
955
+ weight_parts[i].append(weights[i])
956
 
957
+ idx_tensor = torch.stack([torch.cat(parts) for parts in idx_parts]).to(dtype=torch.long)
958
+ weight_tensor = torch.stack([torch.cat(parts) for parts in weight_parts]).to(dtype=dtype)
 
 
959
  pos_embeds = self.pos_embed(idx_tensor) * weight_tensor[:, :, None]
960
  patch_pos_embeds = pos_embeds[0] + pos_embeds[1] + pos_embeds[2] + pos_embeds[3]
961
 
 
1084
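Toy illustration of the per-axis bilinear setup feeding the gather above (names are local to this sketch, not module attributes): each target coordinate gets a floor/ceil source index pair whose weights sum to one.

import torch

num_grid_per_side, h = 4, 6
h_idxs = torch.linspace(0, num_grid_per_side - 1, h)
floor = h_idxs.int()
ceil = torch.clamp(floor + 1, max=num_grid_per_side - 1)
frac = h_idxs - floor
print(floor.tolist(), ceil.tolist())   # [0, 0, 1, 1, 2, 3] [1, 1, 2, 2, 3, 3]
print(((1 - frac) + frac).tolist())    # weights for the two neighbours sum to 1 (up to float rounding)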
  vision_position_ids: Optional[torch.LongTensor] = None,
1085
  use_cache: Optional[bool] = None,
1086
  cache_position: Optional[torch.LongTensor] = None,
1087
+ output_attentions: Optional[bool] = None,
1088
+ output_hidden_states: Optional[bool] = None,
1089
+ return_dict: Optional[bool] = None,
1090
  **kwargs: Unpack[FlashAttentionKwargs],
1091
  ) -> Union[tuple, BaseModelOutputWithPast]:
1092
  """
 
1099
  Attention mask for cross-attention between text and vision. Shape: `(batch_size, 1, text_seq_len, vision_seq_len)`.
1100
  vision_position_ids (`torch.LongTensor`, *optional*):
1101
  Position IDs for vision tokens used in cross-attention. Shape: `(batch_size, vision_seq_len)`.
1102
+ cache_position (`torch.LongTensor`, *optional*):
1103
+ Absolute cache positions for the current text tokens during incremental decoding.
1104
  """
1105
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1106
+ output_hidden_states = (
1107
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1108
+ )
1109
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1110
+
1111
  if (input_ids is None) ^ (inputs_embeds is not None):
1112
  raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
1113
 
 
1129
 
1130
  attention_mask = create_causal_mask(
1131
  config=self.config,
1132
+ inputs_embeds=inputs_embeds,
1133
  attention_mask=attention_mask,
1134
  cache_position=cache_position,
1135
  past_key_values=past_key_values,
 
1144
  # Compute vision position embeddings (for cross-attention key/value) if needed
1145
  vision_position_embeddings = None
1146
 
 
 
 
 
1147
  if cross_attention_states is not None:
1148
  if vision_position_ids is not None:
1149
  vision_position_embeddings = self.rotary_emb(cross_attention_states, vision_position_ids)
1150
 
1151
+ all_hidden_states = () if output_hidden_states else None
1152
+ all_attentions = () if output_attentions else None
1153
+
1154
+ if output_hidden_states:
1155
+ all_hidden_states += (hidden_states,)
1156
 
1157
  for idx, decoder_layer in enumerate(self.layers):
1158
  # For text-only path we should skip cross attention layers.
 
1177
  cross_attention_states=cross_attention_states,
1178
  cross_attention_mask=cross_attention_mask,
1179
  vision_position_ids=vision_position_ids,
 
1180
  vision_position_embeddings=vision_position_embeddings,
1181
+ output_attentions=output_attentions,
1182
  **kwargs,
1183
  )
1184
+ hidden_states = layer_outputs[0]
1185
+
1186
+ if output_attentions:
1187
+ all_attentions += (layer_outputs[1],)
1188
+
1189
+ if output_hidden_states:
1190
+ all_hidden_states += (hidden_states,)
1191
 
1192
  hidden_states = self.norm(hidden_states)
1193
+ if output_hidden_states:
1194
+ all_hidden_states = all_hidden_states[:-1] + (hidden_states,)
1195
+
1196
+ if not return_dict:
1197
+ outputs = (hidden_states, past_key_values)
1198
+ if output_hidden_states:
1199
+ outputs += (all_hidden_states,)
1200
+ if output_attentions:
1201
+ outputs += (all_attentions,)
1202
+ return outputs
1203
 
1204
  return BaseModelOutputWithPast(
1205
  last_hidden_state=hidden_states,
1206
  past_key_values=past_key_values,
1207
+ hidden_states=all_hidden_states,
1208
+ attentions=all_attentions,
1209
  )
1210
 
1211
 
 
1224
  super().__init__(config)
1225
  self.visual = MossVLVisionModel._from_config(config.vision_config)
1226
  self.language_model = MossVLTextModel._from_config(config.text_config)
 
 
1227
 
1228
  # Learnable Separator Token: inserted after each image/frame's vision tokens
1229
  # Initialized from LLM's separator_token_init_id embedding
 
1532
  continue
1533
 
1534
  # Collect repetition counts for all frames in this sample
1535
+ repeats_parts = []
1536
  for media in medias:
1537
  num_frames = media.get('num_frames', 1)
1538
  length = media['length']
 
1547
 
1548
  # In convert_packed_to_batch we enforce strictly regular frames
1549
  # so we can assume all frames have the same number of tokens
1550
+ repeats_parts.append(
1551
+ torch.full(
1552
+ (num_frames,),
1553
+ tokens_per_frame_with_sep,
1554
+ dtype=torch.long,
1555
+ device=cross_attention_mask.device,
1556
+ )
1557
+ )
1558
 
1559
+ num_valid_frames = sum(part.numel() for part in repeats_parts)
1560
  if num_valid_frames == 0:
1561
  continue
1562
 
1563
  # If cross_attention_mask has more frames (e.g. padded), slice it
1564
  # If it has fewer (shouldn't happen), slice repeats
1565
  valid_mask_frames = min(num_valid_frames, cross_attention_mask.shape[-1])
1566
+ repeats_tensor = torch.cat(repeats_parts)
1567
  if valid_mask_frames < num_valid_frames:
1568
+ repeats_tensor = repeats_tensor[:valid_mask_frames]
1569
 
1570
  # Extract valid columns for this sample
1571
  # (1, text_len, valid_mask_frames)
1572
  source_mask = cross_attention_mask[i, :, :, :valid_mask_frames]
1573
 
 
 
 
1574
  # Expand using repeat_interleave
1575
  # output shape: (1, text_len, sum(repeats))
1576
  expanded_mask = source_mask.repeat_interleave(repeats_tensor, dim=-1)
 
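Worked toy example of the expansion above: with two frames and three tokens per frame (including the separator), every frame-level mask column is stretched into three token-level columns.

import torch

source_mask = torch.tensor([[[1.0, 0.0]]])   # (1, text_len=1, num_frames=2)
repeats_tensor = torch.tensor([3, 3])        # tokens_per_frame_with_sep for each frame
expanded_mask = source_mask.repeat_interleave(repeats_tensor, dim=-1)
print(expanded_mask)  # tensor([[[1., 1., 1., 0., 0., 0.]]])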
1589
  self,
1590
  input_ids: torch.Tensor,
1591
  attention_mask: Optional[torch.Tensor] = None,
1592
+ past_key_values: Optional[Cache] = None,
1593
+ rope_deltas: Optional[torch.LongTensor] = None,
1594
  ) -> torch.Tensor:
1595
  """
1596
  Compute 3D position IDs for text tokens with special handling for image tokens.
 
1605
  Args:
1606
  input_ids: (batch_size, seq_len)
1607
  attention_mask: (batch_size, seq_len), optional
1608
+ past_key_values: cache object used to infer decode offset from the current text cache length
1609
 
1610
  Returns:
1611
  position_ids: (3, batch_size, seq_len)
 
1614
  device = input_ids.device
1615
  image_token_id = self.config.image_token_id
1616
 
1617
+ # Decode stage: always advance positions from the current text cache length.
1618
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
1619
+ if past_seen_tokens > 0:
 
1620
  position_ids = torch.arange(seq_len, device=device, dtype=torch.long)
1621
+ position_ids = position_ids.unsqueeze(0).expand(batch_size, -1)
1622
+ position_ids = position_ids + past_seen_tokens
1623
+
1624
+ if rope_deltas is not None:
1625
+ position_ids = position_ids + rope_deltas.unsqueeze(1)
1626
+
1627
+ return position_ids.unsqueeze(0).expand(3, -1, -1)
 
 
 
 
 
 
 
1628
 
1629
  # Prefill stage: compute full position_ids with image token awareness
1630
  # Vectorized implementation
 
1703
  rope_deltas: (batch_size,) - position offset due to vision tokens
1704
  """
1705
  batch_size, max_vision_seq_len, _ = cross_attention_states.shape
1706
+ device = cross_attention_states.device
1707
  image_token_id = self.config.image_token_id
1708
  merge_size = self.visual.spatial_merge_size
1709
 
 
1711
  # We need to flatten the nested vision_token_info structure to align with image tokens in input_ids
1712
 
1713
  # Find all image tokens in text: (num_occurrences, 2) -> [batch_idx, seq_idx]
1714
+ image_token_indices = (input_ids == image_token_id).nonzero()
1715
 
1716
  # Flatten vision_token_info to parallel lists
1717
  # We assume the order of medias in vision_token_info matches the appearance of image tokens in input_ids
1718
+ flat_eff_h_parts = []
1719
+ flat_eff_w_parts = []
1720
+ flat_vis_start_parts = []
1721
+
 
1722
  # Processing metadata on CPU (fast enough for typical batch sizes)
1723
  for b_idx, info in enumerate(vision_token_info):
1724
  medias = info.get('medias', [])
 
1729
  start = media['start']
1730
  tok_per_frame = media['vision_tokens_per_frame']
1731
  stride = tok_per_frame + 1 # +1 for separator
1732
+
1733
+ frame_offsets = start + torch.arange(num_frames, device=device, dtype=torch.long) * stride
1734
+ flat_vis_start_parts.append(frame_offsets)
1735
+ flat_eff_h_parts.append(torch.full((num_frames,), eh, device=device, dtype=torch.long))
1736
+ flat_eff_w_parts.append(torch.full((num_frames,), ew, device=device, dtype=torch.long))
 
 
1737
 
1738
  # Pre-allocate output
1739
  vision_pos_ids = torch.zeros(
 
1743
  )
1744
 
1745
  # Handle case where no image tokens or info
1746
+ if len(flat_eff_h_parts) == 0 or len(image_token_indices) == 0:
1747
  rope_deltas = position_ids.max(dim=0).values.max(dim=-1).values + 1 - input_ids.shape[1]
1748
  return vision_pos_ids, position_ids, rope_deltas
1749
 
1750
+ flat_eff_h = torch.cat(flat_eff_h_parts)
1751
+ flat_eff_w = torch.cat(flat_eff_w_parts)
1752
+ flat_vis_starts = torch.cat(flat_vis_start_parts)
1753
+
1754
  # Align lengths (handle truncation if text has fewer tokens or vice versa)
1755
+ num_matches = min(flat_eff_h.shape[0], image_token_indices.shape[0])
1756
+ flat_eff_h = flat_eff_h[:num_matches]
1757
+ flat_eff_w = flat_eff_w[:num_matches]
1758
+ flat_vis_starts = flat_vis_starts[:num_matches]
 
 
1759
 
1760
  # Get corresponding text positions
1761
  target_indices = image_token_indices[:num_matches]
 
1921
  )
1922
  return vision_embeds, vision_token_info
1923
 
1924
 
1925
 
1926
  @auto_docstring
 
1936
  media_nums_per_sample: Optional[List[int]] = None,
1937
  vision_position_ids: Optional[torch.LongTensor] = None,
1938
  cross_attention_mask: Optional[torch.Tensor] = None,
1939
+ vision_token_info: Optional[List[dict]] = None,
1940
+ rope_deltas: Optional[torch.LongTensor] = None,
1941
+ output_attentions: Optional[bool] = None,
1942
+ output_hidden_states: Optional[bool] = None,
1943
+ return_dict: Optional[bool] = None,
1944
  **kwargs: Unpack[TransformersKwargs],
1945
  ) -> Union[tuple, BaseModelOutputWithPast]:
1946
  """
 
1957
  cross_attention_mask (`torch.Tensor` of shape `(batch_size, 1, text_seq_len, vision_seq_len)`, *optional*):
1958
  Attention mask for cross-attention between text and vision. Controls which vision tokens each text
1959
  token can attend to, enforcing causal visibility for video frames.
1960
+ vision_token_info (`List[dict]`, *optional*):
1961
+ Cached metadata describing how packed vision tokens were regrouped per sample. Reused in decode
1962
+ to expand frame-level cross-attention masks to token-level masks without recomputing vision features.
1963
+ rope_deltas (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1964
+ Cached offsets between text sequence length and multimodal RoPE positions. Reused in decode to
1965
+ reconstruct text position ids from the current cache length.
1966
  """
1967
+ cache_position = kwargs.pop("cache_position", None)
1968
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1969
+ output_hidden_states = (
1970
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1971
+ )
1972
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1973
+
1974
  if (input_ids is None) ^ (inputs_embeds is not None):
1975
  raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
1976
 
 
1979
 
1980
  # Process vision features (images and videos are already merged by processor)
1981
  cross_attention_states = None
1982
+
 
1983
  if pixel_values is not None:
1984
  # Determine batch size
1985
  batch_size = inputs_embeds.shape[0]
 
1994
 
1995
  # Process all vision inputs together through VIT
1996
  # pixel_values and grid_thw are already ordered by appearance in text
1997
+ vision_embeds, vision_token_info = self.get_vision_features(
1998
+ pixel_values, grid_thw, media_nums_per_sample
 
 
 
1999
  )
2000
 
2001
  # vision_embeds: [batch_size, max_seq_len, hidden_size]
2002
  cross_attention_states = vision_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
 
 
 
 
 
 
 
 
2003
 
2004
  # Generate 3D position IDs for text if not provided
2005
  if position_ids is None:
 
2008
  position_ids = self.compute_position_ids(
2009
  input_ids=input_ids,
2010
  attention_mask=attention_mask,
2011
+ past_key_values=past_key_values,
2012
+ rope_deltas=rope_deltas,
2013
  )
2014
 
2015
  # Compute cross_attention_mask, vision_position_ids, and full_text_row_masked_out_mask
 
2033
  (cross_attention_mask != negative_inf_value).any(dim=-1).type_as(cross_attention_mask)[..., None]
2034
  )
2035
  cross_attention_mask = cross_attention_mask * full_text_row_masked_out_mask
 
 
2036
 
2037
  if vision_position_ids is None and cross_attention_states is not None and input_ids is not None:
2038
  vision_position_ids, position_ids, rope_deltas = self.compute_vision_position_ids(
 
2042
  cross_attention_states,
2043
  attention_mask
2044
  )
 
 
 
 
 
 
 
 
2045
 
2046
  outputs = self.language_model(
2047
  input_ids=None,
 
2054
  cross_attention_mask=cross_attention_mask,
2055
  vision_position_ids=vision_position_ids,
2056
  full_text_row_masked_out_mask=full_text_row_masked_out_mask,
2057
+ output_attentions=output_attentions,
2058
+ output_hidden_states=output_hidden_states,
2059
+ return_dict=return_dict,
2060
  **kwargs,
2061
  )
2062
 
2063
+ if not return_dict:
2064
+ last_hidden_state = outputs[0]
2065
+ model_outputs = (
2066
+ last_hidden_state,
2067
+ outputs[1] if len(outputs) > 1 else past_key_values,
2068
+ )
2069
+ if output_hidden_states:
2070
+ model_outputs += (outputs[2],)
2071
+ if output_attentions:
2072
+ attn_idx = 3 if output_hidden_states else 2
2073
+ model_outputs += (outputs[attn_idx],)
2074
+ model_outputs += (vision_token_info, rope_deltas)
2075
+ return model_outputs
2076
+
2077
  return MossVLModelOutputWithPast(
2078
  last_hidden_state=outputs.last_hidden_state,
2079
  past_key_values=outputs.past_key_values,
2080
  hidden_states=outputs.hidden_states,
2081
  attentions=outputs.attentions,
2082
+ vision_token_info=vision_token_info,
2083
+ rope_deltas=rope_deltas,
2084
  )
2085
 
2086
 
 
2102
  super().__init__(config)
2103
  self.model = MossVLModel(config)
2104
  self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
 
2105
 
2106
  self.post_init()
2107
 
 
2159
  media_nums_per_sample: Optional[List[int]] = None,
2160
  vision_position_ids: Optional[torch.LongTensor] = None,
2161
  cross_attention_mask: Optional[torch.Tensor] = None,
2162
+ vision_token_info: Optional[List[dict]] = None,
2163
+ rope_deltas: Optional[torch.LongTensor] = None,
2164
  logits_to_keep: Union[int, torch.Tensor] = 0,
2165
+ output_attentions: Optional[bool] = None,
2166
+ output_hidden_states: Optional[bool] = None,
2167
+ return_dict: Optional[bool] = None,
2168
  **kwargs: Unpack[TransformersKwargs],
2169
  ) -> Union[tuple, CausalLMOutputWithPast]:
2170
  """
 
2181
  cross_attention_mask (`torch.Tensor` of shape `(batch_size, 1, text_seq_len, vision_seq_len)`, *optional*):
2182
  Attention mask for cross-attention between text and vision. Controls which vision tokens each text
2183
  token can attend to, enforcing causal visibility for video frames.
2184
+ vision_token_info (`List[dict]`, *optional*):
2185
+ Cached metadata describing how packed vision tokens were regrouped per sample. Reused across decode
2186
+ steps to expand cross-attention masks without re-running the vision encoder.
2187
+ rope_deltas (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
2188
+ Cached multimodal RoPE offsets returned by the base model during prefill and reused during decode.
2189
  """
2190
+ cache_position = kwargs.pop("cache_position", None)
2191
  outputs = self.model(
2192
  input_ids=input_ids,
2193
  pixel_values=pixel_values,
 
2199
  cross_attention_mask=cross_attention_mask,
2200
  past_key_values=past_key_values,
2201
  inputs_embeds=inputs_embeds,
2202
+ vision_token_info=vision_token_info,
2203
+ rope_deltas=rope_deltas,
2204
+ output_attentions=output_attentions,
2205
+ output_hidden_states=output_hidden_states,
2206
+ return_dict=return_dict,
2207
  cache_position=cache_position,
 
2208
  **kwargs,
2209
  )
2210
 
2211
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
2212
+ hidden_states = outputs[0] if not return_dict else outputs.last_hidden_state
2213
 
2214
  slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
2215
  logits = self.lm_head(hidden_states[:, slice_indices, :])
 
2218
  if labels is not None:
2219
  loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size)
2220
 
2221
+ if not return_dict:
2222
+ output = (logits,)
2223
+ output += outputs[1:]
2224
+ return ((loss,) + output) if loss is not None else output
2225
+
2226
  return MossVLCausalLMOutputWithPast(
2227
  loss=loss,
2228
  logits=logits,
 
2239
  past_key_values=None,
2240
  attention_mask=None,
2241
  inputs_embeds=None,
 
2242
  position_ids=None,
2243
  use_cache=True,
2244
  pixel_values=None,
2245
  grid_thw=None,
2246
  media_nums_per_sample=None, # One video counts as one media item.
2247
  vision_position_ids=None,
2248
+ vision_token_info=None,
2249
+ rope_deltas=None,
2250
  cross_attention_mask=None,
 
2251
  **kwargs,
2252
  ):
2253
  """
 
2260
  Args:
2261
  media_nums_per_sample: One video counts as one media item (regardless of frame count)
2262
  """
2263
+ kwargs.pop("cache_position", None)
2264
  model_inputs = super().prepare_inputs_for_generation(
2265
  input_ids,
2266
  past_key_values=past_key_values,
2267
  attention_mask=attention_mask,
2268
  inputs_embeds=inputs_embeds,
 
2269
  position_ids=position_ids,
2270
  pixel_values=pixel_values,
2271
  grid_thw=grid_thw,
 
2274
  **kwargs,
2275
  )
2276
 
2277
+ model_input = model_inputs.get("input_ids")
2278
+ if model_input is None:
2279
+ model_input = model_inputs.get("inputs_embeds")
2280
+ current_length = model_input.shape[1]
2281
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
2282
 
2283
+ # Let the model recompute multimodal position ids from the current cache length.
 
2284
  model_inputs["position_ids"] = None
2285
+ model_inputs["vision_token_info"] = vision_token_info
2286
+ model_inputs["rope_deltas"] = rope_deltas
2287
 
2288
  # Handle cross attention mask
2289
  if cross_attention_mask is not None:
2290
+ # Slice along the text dimension (dim=2) to keep only the current step's text tokens.
2291
+ # Shape: [batch, 1, text_len, vision_len] -> [batch, 1, current_len, vision_len]
2292
+ cross_attention_mask = cross_attention_mask[:, :, -current_length:, :]
2293
  model_inputs["cross_attention_mask"] = cross_attention_mask
2294
 
2295
+ # Vision inputs are only needed during the prefill stage.
2296
  # In decode stage, vision features are retrieved from cross attention cache
2297
+ if past_seen_tokens > 0:
2298
  model_inputs["pixel_values"] = None
2299
  model_inputs["grid_thw"] = None
2300
  model_inputs["media_nums_per_sample"] = None
 
2303
  else:
2304
  # In prefill stage, include all vision-related inputs
2305
  model_inputs["vision_position_ids"] = vision_position_ids
 
2306
 
2307
  return model_inputs
2308
 
 
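Taken together, the prefill/decode split above is what a plain `generate` call exercises: vision tensors are consumed once at prefill, and `vision_token_info` / `rope_deltas` are carried across steps by `_update_model_kwargs_for_generation`. A hedged usage sketch follows; the checkpoint id, auto class mapping, prompt format, and blank test image are placeholders, not guaranteed by this repository.

import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor

model_id = "OpenMOSS/MOSS-VL"  # placeholder id
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_id, dtype=torch.bfloat16, trust_remote_code=True)

image = Image.new("RGB", (448, 448))  # stand-in for a real image
inputs = processor(text=["Describe the image."], images=[image], return_tensors="pt")
with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=64)
print(processor.batch_decode(output_ids[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True))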
2323
  **kwargs,
2324
  )
2325
 
 
 
2326
  if cross_attention_mask_prev is not None:
2327
+ model_kwargs["cross_attention_mask"] = cross_attention_mask_prev
2328
+
2329
+ if getattr(outputs, "vision_token_info", None) is not None:
2330
+ model_kwargs["vision_token_info"] = outputs.vision_token_info
2331
+ if getattr(outputs, "rope_deltas", None) is not None:
2332
+ model_kwargs["rope_deltas"] = outputs.rope_deltas
2333
 
2334
  return model_kwargs
2335
 
 
2336
 
2337
  __all__ = [
2338
  "MossVLVisionModel",