InnovatorLab
/

Innovator-VL-8B-Thinking

@@ -28,46 +28,29 @@ import torch.nn as nn
 import torch.nn.functional as F
 import torch.utils.checkpoint
 from torch.nn import LayerNorm
-from transformers import AutoConfig, AutoModelForCausalLM
 from transformers.activations import ACT2FN
-from transformers.cache_utils import (
-    Cache,
-    DynamicCache,
-    SlidingWindowCache,
-    StaticCache,
-)
 from transformers.generation import GenerationMixin
-from transformers.integrations import use_kernel_forward_from_hub
-from transformers.masking_utils import (
-    create_causal_mask,
-    create_sliding_window_causal_mask,
-)
 from transformers.modeling_attn_mask_utils import AttentionMaskConverter
-from transformers.modeling_flash_attention_utils import (
-    FlashAttentionKwargs,
-    flash_attn_supports_top_left_mask,
-    is_flash_attn_available,
-)
 from transformers.modeling_outputs import BaseModelOutputWithPast, ModelOutput
 from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from transformers.modeling_utils import PreTrainedModel
 from transformers.processing_utils import Unpack
-from transformers.utils import (
-    auto_docstring,
-    can_return_tuple,
-    is_torch_flex_attn_available,
-    is_torchdynamo_compiling,
-    logging,
-)
 from .configuration_innovator_vl import InnovatorVLConfig, InnovatorVLTextConfig, RiceConfig
 if is_flash_attn_available():
-    from flash_attn import flash_attn_varlen_func
-    from transformers.modeling_flash_attention_utils import _flash_attention_forward
 if is_torch_flex_attn_available():
     from torch.nn.attention.flex_attention import BlockMask
     from transformers.integrations.flex_attention import make_flex_block_causal_mask
@@ -75,7 +58,7 @@ logger = logging.get_logger(__name__)
 @dataclass
-class LLaVAOneVision1_5_ModelOutputWithPast(ModelOutput):
     """
     Base class for Llava outputs, with hidden states and attentions.
@@ -111,7 +94,7 @@ class LLaVAOneVision1_5_ModelOutputWithPast(ModelOutput):
 @dataclass
-class LLaVAOneVision1_5_CausalLMOutputWithPast(ModelOutput):
     """
     Base class for LLaVAOneVision1.5 causal language model (or autoregressive) outputs.
@@ -149,8 +132,8 @@ class LLaVAOneVision1_5_CausalLMOutputWithPast(ModelOutput):
     rope_deltas: Optional[torch.LongTensor] = None
-class LLaVAOneVision1_5_RotaryEmbedding(nn.Module):
-    def __init__(self, config: LLaVAOneVision1_5_TextConfig, device=None):
         super().__init__()
         # BC: "rope_type" was originally "type"
         if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
@@ -465,10 +448,10 @@ class RiceBlock(nn.Module):
 @use_kernel_forward_from_hub("RMSNorm")
-class LLaVAOneVision1_5_RMSNorm(nn.Module):
     def __init__(self, hidden_size, eps=1e-6):
         """
-        LLaVAOneVision1_5_RMSNorm is equivalent to T5LayerNorm
         """
         super().__init__()
         self.weight = nn.Parameter(torch.ones(hidden_size))
@@ -486,7 +469,7 @@ class LLaVAOneVision1_5_RMSNorm(nn.Module):
-class LLaVAOneVision1_5_MLP(nn.Module):
     def __init__(self, config):
         super().__init__()
         self.config = config
@@ -515,13 +498,13 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
     return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
-class LLaVAOneVision1_5_Attention(nn.Module):
     """
     Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer
     and "Generating Long Sequences with Sparse Transformers".
     """
-    def __init__(self, config: LLaVAOneVision1_5_TextConfig, layer_idx: Optional[int] = None):
         super().__init__()
         self.config = config
         self.layer_idx = layer_idx
@@ -544,8 +527,8 @@ class LLaVAOneVision1_5_Attention(nn.Module):
         self.o_proj = nn.Linear(
             config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
         )
-        self.q_norm = LLaVAOneVision1_5_RMSNorm(self.head_dim, eps=config.rms_norm_eps)  # unlike olmo, only on the head dim!
-        self.k_norm = LLaVAOneVision1_5_RMSNorm(self.head_dim, eps=config.rms_norm_eps)  # thus post q_norm does not need reshape
         self.sliding_window = config.sliding_window if config.layer_types[layer_idx] == "sliding_attention" else None
     def forward(
@@ -612,9 +595,9 @@ class LLaVAOneVision1_5_Attention(nn.Module):
         return attn_output, attn_weights, past_key_value
-class LLaVAOneVision1_5_FlashAttention2(LLaVAOneVision1_5_Attention):
     """
-    LLaVAOneVision1_5 flash attention module, following Qwen2VL attention module. This module inherits from `LLaVAOneVision1_5_Attention`
     as the weights of the module stays untouched. The only required change would be on the forward pass
     where it needs to correctly call the public API of flash attention and deal with padding tokens
     in case the input contains any of them. Additionally, for sliding window attention, we apply SWA only to the bottom
@@ -718,14 +701,14 @@ class LLaVAOneVision1_5_FlashAttention2(LLaVAOneVision1_5_Attention):
         return attn_output, attn_weights, past_key_value
-class LLaVAOneVision1_5_SdpaAttention(LLaVAOneVision1_5_Attention):
     """
     LLaVAOneVision1.5 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
-    `LLaVAOneVision1_5_Attention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
     SDPA API.
     """
-    # Adapted from LLaVAOneVision1_5_Attention.forward
     def forward(
         self,
         hidden_states: torch.Tensor,
@@ -805,28 +788,28 @@ class LLaVAOneVision1_5_SdpaAttention(LLaVAOneVision1_5_Attention):
         return attn_output, None, past_key_value
-LLaVAOneVision1_5_ATTENTION_CLASSES = {
-    "eager": LLaVAOneVision1_5_Attention,
-    "flash_attention_2": LLaVAOneVision1_5_FlashAttention2,
-    "sdpa": LLaVAOneVision1_5_SdpaAttention,
 }
-class LLaVAOneVision1_5_DecoderLayer(nn.Module):
-    def __init__(self, config: LLaVAOneVision1_5_TextConfig, layer_idx: int):
         super().__init__()
         self.hidden_size = config.hidden_size
         if config.use_sliding_window and config._attn_implementation != "flash_attention_2":
             logger.warning_once(
                 f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; "
                 "unexpected results may be encountered."
             )
-        self.self_attn = LLaVAOneVision1_5_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)
-        self.mlp = LLaVAOneVision1_5_MLP(config)
-        self.input_layernorm = LLaVAOneVision1_5_RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.post_attention_layernorm = LLaVAOneVision1_5_RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.attention_type = config.layer_types[layer_idx]
     def forward(
         self,
@@ -895,10 +878,10 @@ class LLaVAOneVision1_5_DecoderLayer(nn.Module):
 @auto_docstring
 class Qwen2VLPreTrainedModel(PreTrainedModel):
-    config_class = Llavaonevision1_5Config
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
-    _no_split_modules = ["LLaVAOneVision1_5_DecoderLayer", "RiceBlock"]
     _skip_keys_device_placement = "past_key_values"
     _supports_flash_attn_2 = True
     _supports_sdpa = True
@@ -918,7 +901,7 @@ class Qwen2VLPreTrainedModel(PreTrainedModel):
         elif isinstance(module, nn.LayerNorm):
             module.weight.data.fill_(1.0)
             module.bias.data.zero_()
-        elif isinstance(module, LLaVAOneVision1_5_RMSNorm):
             module.weight.data.fill_(1.0)
@@ -1103,23 +1086,21 @@ class RiceTransformerPretrainedModel(Qwen2VLPreTrainedModel):
 @auto_docstring
-class LLaVAOneVision1_5_TextModel(Qwen2VLPreTrainedModel):
-    config_class = LLaVAOneVision1_5_TextConfig
-    def __init__(self, config: LLaVAOneVision1_5_TextConfig):
         super().__init__(config)
         self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size
         self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
         self.layers = nn.ModuleList(
-            [LLaVAOneVision1_5_DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
         )
         self._attn_implementation = config._attn_implementation
-        self.norm = LLaVAOneVision1_5_RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.rotary_emb = LLaVAOneVision1_5_RotaryEmbedding(config=config)
-        self.has_sliding_layers = "sliding_attention" in self.config.layer_types
         self.gradient_checkpointing = False
         # Initialize weights and apply final processing
@@ -1182,24 +1163,9 @@ class LLaVAOneVision1_5_TextModel(Qwen2VLPreTrainedModel):
         # elif position_ids.dim() == 2: # this branch is in preparation for 3D RoPE
         #     position_ids = position_ids[None, ...].expand(3, position_ids.shape[0], -1)
-        # It may already have been prepared by e.g. `generate`
-        if not isinstance(causal_mask_mapping := attention_mask, dict):
-            # Prepare mask arguments
-            mask_kwargs = {
-                "config": self.config,
-                "input_embeds": inputs_embeds,
-                "attention_mask": attention_mask,
-                "cache_position": cache_position,
-                "past_key_values": past_key_values,
-                "position_ids": position_ids,
-            }
-            # Create the masks
-            causal_mask_mapping = {
-                "full_attention": create_causal_mask(**mask_kwargs),
-            }
-            # The sliding window alternating layers are not always activated depending on the config
-            if self.has_sliding_layers:
-                causal_mask_mapping["sliding_attention"] = create_sliding_window_causal_mask(**mask_kwargs)
         hidden_states = inputs_embeds
@@ -1211,7 +1177,6 @@ class LLaVAOneVision1_5_TextModel(Qwen2VLPreTrainedModel):
         all_self_attns = () if output_attentions else None
         next_decoder_cache = None
         for decoder_layer in self.layers:
             if output_hidden_states:
                 all_hidden_states += (hidden_states,)
@@ -1220,7 +1185,7 @@ class LLaVAOneVision1_5_TextModel(Qwen2VLPreTrainedModel):
                 layer_outputs = self._gradient_checkpointing_func(
                     decoder_layer.__call__,
                     hidden_states,
-                    causal_mask_mapping[decoder_layer.attention_type],
                     position_ids,
                     past_key_values,
                     output_attentions,
@@ -1231,7 +1196,7 @@ class LLaVAOneVision1_5_TextModel(Qwen2VLPreTrainedModel):
             else:
                 layer_outputs = decoder_layer(
                     hidden_states,
-                    attention_mask=causal_mask_mapping[decoder_layer.attention_type],
                     position_ids=position_ids,
                     past_key_value=past_key_values,
                     output_attentions=output_attentions,
@@ -1359,7 +1324,7 @@ class LLaVAOneVision1_5_TextModel(Qwen2VLPreTrainedModel):
         dtype: torch.dtype,
         cache_position: torch.Tensor,
         batch_size: int,
-        config: Llavaonevision1_5Config,
         past_key_values: Cache,
     ):
         """
@@ -1379,7 +1344,7 @@ class LLaVAOneVision1_5_TextModel(Qwen2VLPreTrainedModel):
                 Indices depicting the position of the input sequence tokens in the sequence.
             batch_size (`int`):
                 Batch size.
-            config (`Llavaonevision1_5Config`):
                 The model's configuration class
             past_key_values (`Cache`):
                 The cache class that is being used currently to generate
@@ -1422,14 +1387,14 @@ class LLaVAOneVision1_5_TextModel(Qwen2VLPreTrainedModel):
 @auto_docstring
-class LLaVAOneVision1_5_Model(Qwen2VLPreTrainedModel):
     base_model_prefix = ""
     _checkpoint_conversion_mapping = {"^model": "language_model"}
-    def __init__(self, config: Llavaonevision1_5Config):
         super().__init__(config)
         self.visual = RiceTransformerPretrainedModel._from_config(config.vision_config)
-        self.language_model = LLaVAOneVision1_5_TextModel._from_config(config.text_config)
         self.rope_deltas = None  # cache rope_deltas here
         # Initialize weights and apply final processing
@@ -1638,7 +1603,7 @@ class LLaVAOneVision1_5_Model(Qwen2VLPreTrainedModel):
         video_grid_thw: Optional[torch.LongTensor] = None,
         rope_deltas: Optional[torch.LongTensor] = None,
         cache_position: Optional[torch.LongTensor] = None,
-    ) -> Union[Tuple, LLaVAOneVision1_5_ModelOutputWithPast]:
         r"""
         pixel_values_videos (`torch.FloatTensor` of shape `(seq_length, num_channels * temporal_size * image_size * image_size)`):
             The tensors corresponding to the input videos. Pixel values can be obtained using
@@ -1665,25 +1630,9 @@ class LLaVAOneVision1_5_Model(Qwen2VLPreTrainedModel):
                 n_image_tokens = (input_ids == self.config.image_token_id).sum().item()
                 n_image_features = image_embeds.shape[0]
                 if not is_torchdynamo_compiling() and n_image_tokens != n_image_features:
-                    if abs(n_image_tokens - n_image_features) <= 10:
-                        logger.warning_once(
-                            f"!!!!!!!! Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}. "
-                            f"This may be caused by dynamic image sizes during training. Try to fix it. !!!!!!!!"
-                        )
-                        if n_image_tokens > n_image_features:
-                            diff = n_image_tokens - n_image_features
-                            pad_embeds = torch.zeros(
-                                (diff, image_embeds.shape[1]),
-                                dtype=image_embeds.dtype,
-                                device=image_embeds.device,
-                            )
-                            image_embeds = torch.cat([image_embeds, pad_embeds], dim=0)
-                        else:
-                            image_embeds = image_embeds[:n_image_tokens, :]
-                    else:
-                        raise ValueError(
-                            f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
-                        )
                 image_mask = (
                     (input_ids == self.config.image_token_id)
                     .unsqueeze(-1)
@@ -1710,7 +1659,7 @@ class LLaVAOneVision1_5_Model(Qwen2VLPreTrainedModel):
                 video_embeds = video_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
                 inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)
-            if attention_mask is not None and isinstance(attention_mask, torch.Tensor):
                 attention_mask = attention_mask.to(inputs_embeds.device)
         if use_cache and past_key_values is None:
@@ -1739,7 +1688,7 @@ class LLaVAOneVision1_5_Model(Qwen2VLPreTrainedModel):
             cache_position=cache_position,
         )
-        output = LLaVAOneVision1_5_ModelOutputWithPast(
             last_hidden_state=outputs.last_hidden_state,
             past_key_values=outputs.past_key_values,
             hidden_states=outputs.hidden_states,
@@ -1805,7 +1754,7 @@ class LLaVAOneVision1_5_Model(Qwen2VLPreTrainedModel):
         return causal_mask
-class LLaVAOneVision1_5_ForConditionalGeneration(Qwen2VLPreTrainedModel, GenerationMixin):
     _checkpoint_conversion_mapping = {
         "^visual": "model.visual",
         r"^model(?!\.(language_model|visual))": "model.language_model",
@@ -1814,7 +1763,7 @@ class LLaVAOneVision1_5_ForConditionalGeneration(Qwen2VLPreTrainedModel, Generat
     def __init__(self, config):
         super().__init__(config)
-        self.model = LLaVAOneVision1_5_Model(config)
         self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
         self.post_init()
@@ -1866,9 +1815,7 @@ class LLaVAOneVision1_5_ForConditionalGeneration(Qwen2VLPreTrainedModel, Generat
         video_grid_thw: Optional[torch.LongTensor] = None,
         rope_deltas: Optional[torch.LongTensor] = None,
         cache_position: Optional[torch.LongTensor] = None,
-        *args,
-        **kwargs,
-    ) -> Union[Tuple, LLaVAOneVision1_5_CausalLMOutputWithPast]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
@@ -1915,7 +1862,6 @@ class LLaVAOneVision1_5_ForConditionalGeneration(Qwen2VLPreTrainedModel, Generat
         >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
         "The image shows a street scene with a red stop sign in the foreground. In the background, there is a large red gate with Chinese characters ..."
         ```"""
-        position_ids = None
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -1945,23 +1891,11 @@ class LLaVAOneVision1_5_ForConditionalGeneration(Qwen2VLPreTrainedModel, Generat
         hidden_states = outputs[0]
         logits = self.lm_head(hidden_states)
-        # with torch.no_grad():
-        #     log_probs = torch.nn.functional.log_softmax(logits.float() / 1, dim=-1)
-        #     entropy = -torch.sum(log_probs.exp() * log_probs, dim=-1).squeeze()
-        #     if entropy.ndim != 1:
-        #         entropy = entropy.unsqueeze(0)
-        #     if hasattr(self, "entropy"):
-        #         self.entropy = torch.cat([self.entropy, entropy], dim=-1)
-        #     else:
-        #         self.entropy = entropy
-        #     print(self.entropy.mean())
         loss = None
         if labels is not None:
             loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size)
-        return LLaVAOneVision1_5_CausalLMOutputWithPast(
             loss=loss,
             logits=logits,
             past_key_values=outputs.past_key_values,
@@ -2133,8 +2067,13 @@ class LLaVAOneVision1_5_ForConditionalGeneration(Qwen2VLPreTrainedModel, Generat
         return input_ids, model_kwargs
-__all__ = ["LLaVAOneVision1_5_ForConditionalGeneration", "LLaVAOneVision1_5_Model", "Qwen2VLPreTrainedModel", "LLaVAOneVision1_5_TextModel"]
-AutoConfig.register("llavaonevision1_5", Llavaonevision1_5Config)
-AutoModelForCausalLM.register(Llavaonevision1_5Config, LLaVAOneVision1_5_ForConditionalGeneration)

 import torch.nn.functional as F
 import torch.utils.checkpoint
 from torch.nn import LayerNorm
 from transformers.activations import ACT2FN
+from transformers.cache_utils import Cache, DynamicCache, SlidingWindowCache, StaticCache
 from transformers.generation import GenerationMixin
 from transformers.modeling_attn_mask_utils import AttentionMaskConverter
+from transformers.modeling_flash_attention_utils import flash_attn_supports_top_left_mask, is_flash_attn_available
 from transformers.modeling_outputs import BaseModelOutputWithPast, ModelOutput
 from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from transformers.modeling_utils import PreTrainedModel
+from transformers.utils import auto_docstring, can_return_tuple, is_torch_flex_attn_available, is_torchdynamo_compiling, logging
+from transformers.integrations import use_kernel_forward_from_hub
 from transformers.processing_utils import Unpack
+from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
+from transformers import AutoModelForCausalLM, AutoConfig
 from .configuration_innovator_vl import InnovatorVLConfig, InnovatorVLTextConfig, RiceConfig
 if is_flash_attn_available():
+    from transformers.modeling_flash_attention_utils import _flash_attention_forward, flash_attn_varlen_func
 if is_torch_flex_attn_available():
     from torch.nn.attention.flex_attention import BlockMask
     from transformers.integrations.flex_attention import make_flex_block_causal_mask
 @dataclass
+class InnovatorVLModelOutputWithPast(ModelOutput):
     """
     Base class for Llava outputs, with hidden states and attentions.
 @dataclass
+class InnovatorVLCausalLMOutputWithPast(ModelOutput):
     """
     Base class for LLaVAOneVision1.5 causal language model (or autoregressive) outputs.
     rope_deltas: Optional[torch.LongTensor] = None
+class InnovatorVL_RotaryEmbedding(nn.Module):
+    def __init__(self, config: InnovatorVLTextConfig, device=None):
         super().__init__()
         # BC: "rope_type" was originally "type"
         if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
 @use_kernel_forward_from_hub("RMSNorm")
+class InnovatorVL_RMSNorm(nn.Module):
     def __init__(self, hidden_size, eps=1e-6):
         """
+        InnovatorVL_RMSNorm is equivalent to T5LayerNorm
         """
         super().__init__()
         self.weight = nn.Parameter(torch.ones(hidden_size))
+class InnovatorVL_MLP(nn.Module):
     def __init__(self, config):
         super().__init__()
         self.config = config
     return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+class InnovatorVL_Attention(nn.Module):
     """
     Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer
     and "Generating Long Sequences with Sparse Transformers".
     """
+    def __init__(self, config: InnovatorVLTextConfig, layer_idx: Optional[int] = None):
         super().__init__()
         self.config = config
         self.layer_idx = layer_idx
         self.o_proj = nn.Linear(
             config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
         )
+        self.q_norm = InnovatorVL_RMSNorm(self.head_dim, eps=config.rms_norm_eps)  # unlike olmo, only on the head dim!
+        self.k_norm = InnovatorVL_RMSNorm(self.head_dim, eps=config.rms_norm_eps)  # thus post q_norm does not need reshape
         self.sliding_window = config.sliding_window if config.layer_types[layer_idx] == "sliding_attention" else None
     def forward(
         return attn_output, attn_weights, past_key_value
+class InnovatorVL_FlashAttention2(InnovatorVL_Attention):
     """
+    LLaVAOneVision1_5 flash attention module, following Qwen2VL attention module. This module inherits from `InnovatorVL_Attention`
     as the weights of the module stays untouched. The only required change would be on the forward pass
     where it needs to correctly call the public API of flash attention and deal with padding tokens
     in case the input contains any of them. Additionally, for sliding window attention, we apply SWA only to the bottom
         return attn_output, attn_weights, past_key_value
+class InnovatorVL_SdpaAttention(InnovatorVL_Attention):
     """
     LLaVAOneVision1.5 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
+    `InnovatorVL_Attention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
     SDPA API.
     """
+    # Adapted from InnovatorVL_Attention.forward
     def forward(
         self,
         hidden_states: torch.Tensor,
         return attn_output, None, past_key_value
+InnovatorVL_ATTENTION_CLASSES = {
+    "eager": InnovatorVL_Attention,
+    "flash_attention_2": InnovatorVL_FlashAttention2,
+    "sdpa": InnovatorVL_SdpaAttention,
 }
+class InnovatorVL_DecoderLayer(nn.Module):
+    def __init__(self, config: InnovatorVLTextConfig, layer_idx: int):
         super().__init__()
         self.hidden_size = config.hidden_size
         if config.use_sliding_window and config._attn_implementation != "flash_attention_2":
             logger.warning_once(
                 f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; "
                 "unexpected results may be encountered."
             )
+        self.self_attn = InnovatorVL_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)
+        self.mlp = InnovatorVL_MLP(config)
+        self.input_layernorm = InnovatorVL_RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = InnovatorVL_RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
     def forward(
         self,
 @auto_docstring
 class Qwen2VLPreTrainedModel(PreTrainedModel):
+    config_class = InnovatorVLConfig
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
+    _no_split_modules = ["InnovatorVL_DecoderLayer", "RiceBlock"]
     _skip_keys_device_placement = "past_key_values"
     _supports_flash_attn_2 = True
     _supports_sdpa = True
         elif isinstance(module, nn.LayerNorm):
             module.weight.data.fill_(1.0)
             module.bias.data.zero_()
+        elif isinstance(module, InnovatorVL_RMSNorm):
             module.weight.data.fill_(1.0)
 @auto_docstring
+class InnovatorVLTextModel(Qwen2VLPreTrainedModel):
+    config_class = InnovatorVLTextConfig
+    def __init__(self, config: InnovatorVLTextConfig):
         super().__init__(config)
         self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size
         self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
         self.layers = nn.ModuleList(
+            [InnovatorVL_DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
         )
         self._attn_implementation = config._attn_implementation
+        self.norm = InnovatorVL_RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.rotary_emb = InnovatorVL_RotaryEmbedding(config=config)
         self.gradient_checkpointing = False
         # Initialize weights and apply final processing
         # elif position_ids.dim() == 2: # this branch is in preparation for 3D RoPE
         #     position_ids = position_ids[None, ...].expand(3, position_ids.shape[0], -1)
+        causal_mask = self._update_causal_mask(
+            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
+        )
         hidden_states = inputs_embeds
         all_self_attns = () if output_attentions else None
         next_decoder_cache = None
         for decoder_layer in self.layers:
             if output_hidden_states:
                 all_hidden_states += (hidden_states,)
                 layer_outputs = self._gradient_checkpointing_func(
                     decoder_layer.__call__,
                     hidden_states,
+                    causal_mask,
                     position_ids,
                     past_key_values,
                     output_attentions,
             else:
                 layer_outputs = decoder_layer(
                     hidden_states,
+                    attention_mask=causal_mask,
                     position_ids=position_ids,
                     past_key_value=past_key_values,
                     output_attentions=output_attentions,
         dtype: torch.dtype,
         cache_position: torch.Tensor,
         batch_size: int,
+        config: InnovatorVLConfig,
         past_key_values: Cache,
     ):
         """
                 Indices depicting the position of the input sequence tokens in the sequence.
             batch_size (`int`):
                 Batch size.
+            config (`InnovatorVLConfig`):
                 The model's configuration class
             past_key_values (`Cache`):
                 The cache class that is being used currently to generate
 @auto_docstring
+class InnovatorVLModel(Qwen2VLPreTrainedModel):
     base_model_prefix = ""
     _checkpoint_conversion_mapping = {"^model": "language_model"}
+    def __init__(self, config: InnovatorVLConfig):
         super().__init__(config)
         self.visual = RiceTransformerPretrainedModel._from_config(config.vision_config)
+        self.language_model = InnovatorVLTextModel._from_config(config.text_config)
         self.rope_deltas = None  # cache rope_deltas here
         # Initialize weights and apply final processing
         video_grid_thw: Optional[torch.LongTensor] = None,
         rope_deltas: Optional[torch.LongTensor] = None,
         cache_position: Optional[torch.LongTensor] = None,
+    ) -> Union[Tuple, InnovatorVLModelOutputWithPast]:
         r"""
         pixel_values_videos (`torch.FloatTensor` of shape `(seq_length, num_channels * temporal_size * image_size * image_size)`):
             The tensors corresponding to the input videos. Pixel values can be obtained using
                 n_image_tokens = (input_ids == self.config.image_token_id).sum().item()
                 n_image_features = image_embeds.shape[0]
                 if not is_torchdynamo_compiling() and n_image_tokens != n_image_features:
+                    raise ValueError(
+                        f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
+                    )
                 image_mask = (
                     (input_ids == self.config.image_token_id)
                     .unsqueeze(-1)
                 video_embeds = video_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
                 inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)
+            if attention_mask is not None:
                 attention_mask = attention_mask.to(inputs_embeds.device)
         if use_cache and past_key_values is None:
             cache_position=cache_position,
         )
+        output = InnovatorVLModelOutputWithPast(
             last_hidden_state=outputs.last_hidden_state,
             past_key_values=outputs.past_key_values,
             hidden_states=outputs.hidden_states,
         return causal_mask
+class InnovatorVLForConditionalGeneration(Qwen2VLPreTrainedModel, GenerationMixin):
     _checkpoint_conversion_mapping = {
         "^visual": "model.visual",
         r"^model(?!\.(language_model|visual))": "model.language_model",
     def __init__(self, config):
         super().__init__(config)
+        self.model = InnovatorVLModel(config)
         self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
         self.post_init()
         video_grid_thw: Optional[torch.LongTensor] = None,
         rope_deltas: Optional[torch.LongTensor] = None,
         cache_position: Optional[torch.LongTensor] = None,
+    ) -> Union[Tuple, InnovatorVLCausalLMOutputWithPast]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
         >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
         "The image shows a street scene with a red stop sign in the foreground. In the background, there is a large red gate with Chinese characters ..."
         ```"""
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
         hidden_states = outputs[0]
         logits = self.lm_head(hidden_states)
         loss = None
         if labels is not None:
             loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size)
+        return InnovatorVLCausalLMOutputWithPast(
             loss=loss,
             logits=logits,
             past_key_values=outputs.past_key_values,
         return input_ids, model_kwargs
+__all__ = [
+    "InnovatorVLForConditionalGeneration",
+    "InnovatorVLModel",
+    "InnovatorVLTextModel",
+    "Qwen2VLPreTrainedModel",
+]
+AutoConfig.register("innovator_vl", InnovatorVLConfig)
+AutoModelForCausalLM.register(InnovatorVLConfig, InnovatorVLForConditionalGeneration)