#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
#           This file was automatically generated from src/transformers/models/llava_onevision/modular_llava_onevision.py.
#               Do NOT edit this file manually as any edits will be overwritten by the generation of
#             the file from the modular. If any change should be done, please apply the change to the
#                          modular_llava_onevision.py file directly. One of our CI enforces this.
#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# coding=utf-8
# Copyright 2024 the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union

import numpy as np
import torch
from torch import nn
import torch.distributed as dist

from transformers.activations import ACT2FN
from transformers.generation import GenerationMixin
from transformers.image_processing_utils import select_best_resolution
from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
from transformers.modeling_outputs import BaseModelOutputWithPast, ModelOutput
from transformers.modeling_utils import PreTrainedModel
from transformers.processing_utils import Unpack
from transformers.utils import (
    LossKwargs,
    auto_docstring,
    can_return_tuple,
    is_torchdynamo_compiling,
    logging,
)
from transformers.models.auto import AutoModel
from torch.nn.attention.flex_attention import create_block_mask
from .configuration_llava_onevision import LlavaOnevisionConfig
from .fused_linear_diffusion_cross_entropy import FusedLinearDiffusionCrossEntropyLoss

logger = logging.get_logger(__name__)


@dataclass
class LlavaOnevisionModelOutputWithPast(BaseModelOutputWithPast):
    """
    Base class for Llava outputs, with hidden states and attentions.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        image_hidden_states (`torch.FloatTensor`, *optional*):
            A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
            image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.

        video_hidden_states (`torch.FloatTensor`, *optional*):
            A `torch.FloatTensor`  of size `(batch_size * num_frames, num_videos, sequence_length, hidden_size)`.
            video_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
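        logits_to_keep_half (`torch.BoolTensor`, *optional*):
            Boolean mask over the original packed sequence marking the positions that were replaced by the mask
            embedding during block-diffusion training (returned only in training mode).
        logits_to_keep (`torch.BoolTensor`, *optional*):
            The same mask scattered into the noisy half of the doubled (noisy + clean) sequence of length
            `2 * sequence_length` (returned only in training mode).
        p_mask (`torch.FloatTensor`, *optional*):
            Noise rate of each masked position, as produced by `forward_add_noise_packed` (returned only in
            training mode).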
    """

    image_hidden_states: Optional[torch.FloatTensor] = None

    video_hidden_states: Optional[torch.FloatTensor] = None
    
    logits_to_keep_half: Optional[torch.BoolTensor] = None

    logits_to_keep: Optional[torch.BoolTensor] = None
    
    p_mask: Optional[torch.FloatTensor] = None



@dataclass
class LlavaOnevisionCausalLMOutputWithPast(ModelOutput):
    """
    Base class for LlavaOnevision causal language model (or autoregressive) outputs.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        image_hidden_states (`torch.FloatTensor`, *optional*):
            A `torch.FloatTensor` of size `(batch_size * num_patches, num_images, sequence_length, hidden_size)`.
            image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.

        video_hidden_states (`torch.FloatTensor`, *optional*):
            A `torch.FloatTensor`  of size `(batch_size * num_frames, num_videos, sequence_length, hidden_size)`.
            video_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    past_key_values: Optional[List[torch.FloatTensor]] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
    image_hidden_states: Optional[torch.FloatTensor] = None

    video_hidden_states: Optional[torch.FloatTensor] = None


class LlavaOnevisionPooler(nn.Module):
    def __init__(self, config):
        super().__init__()

        mode = config.spatial_pool_mode
        stride = config.spatial_pool_stride
        out_channels = getattr(config, "spatial_pool_out_channels", config.vision_config.hidden_size)
        self.image_size = (config.vision_config.image_size // config.vision_config.patch_size) ** 2

        if mode == "average":
            self.pool = nn.AvgPool2d(kernel_size=stride, stride=stride)
        elif mode == "max":
            self.pool = nn.MaxPool2d(kernel_size=stride, stride=stride)
        elif mode == "conv":
            self.pool = nn.Conv2d(
                in_channels=config.vision_config.hidden_size,
                out_channels=out_channels,
                kernel_size=stride,
                stride=stride,
            )
        else:
            raise ValueError(f"Unknown pooling mode: {mode}. Has to be one of [`average`, `max`, `conv`]")

    def forward(self, image_features):
        ori_width = int(math.sqrt(image_features.shape[1] * self.image_size // self.image_size))
        ori_height = int(ori_width * self.image_size // self.image_size)

        batch_size, _, dim = image_features.shape
        image_features_spatial = image_features.view(batch_size, ori_height, ori_height, dim).permute(0, 3, 1, 2)
        image_features_spatial_pool = self.pool(image_features_spatial)

        return image_features_spatial_pool.flatten(2).transpose(1, 2).contiguous()


class LlavaOnevisionMultiModalProjector(nn.Module):
    def __init__(self, config: LlavaOnevisionConfig):
        super().__init__()
        # We have hidden_size * the number of vision feature layers
        num_feature_layers = 1 if isinstance(config.vision_feature_layer, int) else len(config.vision_feature_layer)
        self.linear_1 = nn.Linear(
            config.vision_config.hidden_size * num_feature_layers,
            config.text_config.hidden_size,
            bias=config.multimodal_projector_bias,
        )
        self.act = ACT2FN[config.projector_hidden_act]
        self.linear_2 = nn.Linear(
            config.text_config.hidden_size, config.text_config.hidden_size, bias=config.multimodal_projector_bias
        )

    def forward(self, image_features):
        hidden_states = self.linear_1(image_features)
        hidden_states = self.act(hidden_states)
        hidden_states = self.linear_2(hidden_states)
        return hidden_states


def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size):
    """
    Calculate the shape of the image patch grid after the preprocessing for images of any resolution.

    Args:
        image_size (`tuple`):
            The size of the input image in the format (width, height).
        grid_pinpoints (`List`):
            A list containing possible resolutions. Each item in the list should be a tuple or list
            of the form `(height, width)`.
        patch_size (`int`):
            The size of each image patch.

    Returns:
        tuple: The shape of the image patch grid in the format (width, height).
    """
    if not isinstance(grid_pinpoints, list):
        raise TypeError("grid_pinpoints should be a list of tuples or lists")

    # ! VERY IMPORTANT: if image_size is a tensor, it must be converted to a list first, otherwise the calculation will be wrong
    if not isinstance(image_size, (list, tuple)):
        if not isinstance(image_size, (torch.Tensor, np.ndarray)):
            raise TypeError(
                f"image_size invalid type: {type(image_size)} not valid, should be either list, tuple, np.ndarray or tensor"
            )
        image_size = image_size.tolist()

    height, width = select_best_resolution(image_size, grid_pinpoints)
    return height // patch_size, width // patch_size
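
# Illustrative example (not executed; the exact result depends on `select_best_resolution`): with
# grid_pinpoints=[[336, 672], [672, 336]], image_size=(672, 336) and patch_size=336, the exactly
# matching resolution (672, 336) is selected, giving a patch grid of (2, 1).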


def image_size_to_num_patches(image_size, grid_pinpoints, patch_size: int):
    """
    Calculate the number of patches after the preprocessing for images of any resolution.

    Args:
        image_size (`torch.LongTensor` or `np.ndarray` or `Tuple[int, int]`):
            The size of the input image in the format (height, width).
        grid_pinpoints (`List`):
            A list containing possible resolutions. Each item in the list should be a tuple or list
            of the form `(height, width)`.
        patch_size (`int`):
            The size of each image patch.

    Returns:
        int: the number of patches
    """
    if not isinstance(grid_pinpoints, list):
        raise TypeError("grid_pinpoints should be a list of tuples or lists")

    # ! VERY IMPORTANT: if image_size is a tensor, it must be converted to a list first, otherwise the calculation will be wrong
    if not isinstance(image_size, (list, tuple)):
        if not isinstance(image_size, (torch.Tensor, np.ndarray)):
            raise TypeError(f"image_size invalid type {type(image_size)} with value {image_size}")
        image_size = image_size.tolist()

    best_resolution = select_best_resolution(image_size, grid_pinpoints)
    height, width = best_resolution
    num_patches = 0
    # consider changing this to ceil(height / patch_size) * ceil(width / patch_size) + 1
    for i in range(0, height, patch_size):
        for j in range(0, width, patch_size):
            num_patches += 1
    # add the base patch
    num_patches += 1
    return num_patches
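
# Illustrative example (not executed): if the best resolution selected for the image is (672, 336)
# and patch_size=336, the double loop visits a 2 x 1 grid of patches, and the extra base patch
# brings the total to 3.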


def unpad_image(tensor, original_size):
    """
    Unpads a PyTorch tensor of a padded and resized image.

    Args:
        tensor (`torch.Tensor`):
            The image tensor, assumed to be of shape (num_channels, height, width).
        original_size (`tuple`):
            The original size of the image (height, width).

    Returns:
        `torch.Tensor`: The unpadded image tensor.
    """
    if not isinstance(original_size, (list, tuple)):
        if not isinstance(original_size, (torch.Tensor, np.ndarray)):
            raise TypeError(
                f"image_size invalid type: {type(original_size)} not valid, should be either list, tuple, np.ndarray or tensor"
            )
        original_size = original_size.tolist()
    original_height, original_width = original_size
    current_height, current_width = tensor.shape[1:]

    original_aspect_ratio = original_width / original_height
    current_aspect_ratio = current_width / current_height

    if original_aspect_ratio > current_aspect_ratio:
        scale_factor = current_width / original_width
        new_height = int(round(original_height * scale_factor, 7))
        padding = (current_height - new_height) // 2
        unpadded_tensor = tensor[:, padding : current_height - padding, :]
    else:
        scale_factor = current_height / original_height
        new_width = int(round(original_width * scale_factor, 7))
        padding = (current_width - new_width) // 2
        unpadded_tensor = tensor[:, :, padding : current_width - padding]

    return unpadded_tensor
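
# Worked example (not executed): for a tensor of shape (C, 48, 48) coming from an original image of
# size (100, 200) (height, width), the original aspect ratio 2.0 exceeds the current 1.0, so
# scale_factor = 48 / 200 = 0.24, new_height = 24, padding = 12, and rows 12:36 are kept.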


@auto_docstring
class LlavaOnevisionPreTrainedModel(PreTrainedModel):
    config_class = LlavaOnevisionConfig
    base_model_prefix = ""
    supports_gradient_checkpointing = True
    _no_split_modules = ["LlamaDecoderLayer"]
    _skip_keys_device_placement = "past_key_values"
    _supports_cache_class = True
    _supports_flash_attn_2 = True
    _supports_sdpa = True
    _supports_quantized_cache = True
    _supports_static_cache = True
    _supports_attention_backend = True

    def _init_weights(self, module):
        std = getattr(self.config, "initializer_range", self.config.get_text_config().initializer_range)

        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, LlavaOnevisionModel):
            embed_std = 1 / math.sqrt(self.config.text_config.hidden_size)
            module.image_newline.data.normal_(mean=0.0, std=embed_std)


def modify_padded_position_ids(position_ids: torch.Tensor) -> torch.Tensor:
    """
    使用 PyTorch Tensor 操作修改 packed position_ids 中尾部 padding 的值。
    这个函数假设输入是一个 1D Tensor。
    Args:
        position_ids: 一维 PyTorch Tensor.
    Returns:
        修改后的 position_ids Tensor.
    """
    seq_len = position_ids.size(0)
    # 找到所有非零元素的索引
    nonzero_indices = (position_ids != 0).nonzero().squeeze()

    # 确定 padding 开始的位置
    if nonzero_indices.numel() > 0:
        # 如果存在非零元素,padding 从最后一个非零元素的下一个位置开始
        last_nonzero_idx = nonzero_indices.max().item()
        pad_start_idx = last_nonzero_idx + 1
    else:
        pad_start_idx = 0

    # 如果有需要修改的 padding 部分
    if pad_start_idx < seq_len:
        pad_length = seq_len - pad_start_idx
        new_pad_values = torch.arange(pad_length, device=position_ids.device, dtype=position_ids.dtype)
        position_ids[pad_start_idx:] = new_pad_values

    return position_ids
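
# Worked example (not executed):
#   modify_padded_position_ids(torch.tensor([0, 1, 2, 3, 0, 0, 0]))
#   -> tensor([0, 1, 2, 3, 0, 1, 2])
# The trailing zeros after the last packed sequence are rewritten to a fresh 0, 1, 2, ... ramp so
# that each padding token gets its own position value.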


def modify_padded_position_ids_2d(position_ids: torch.LongTensor) -> torch.LongTensor:
    """
    Rewrites the trailing padding of a batch of packed position_ids using fully vectorized PyTorch operations.
    This function assumes the input is a 2D tensor of shape (batch_size, sequence_length).
    Each row of the batch is processed independently.

    Args:
        position_ids: 2D PyTorch tensor of shape (batch_size, sequence_length).

    Returns:
        The modified position_ids tensor of shape (batch_size, sequence_length).
    """
    if position_ids.dim() != 2:
        raise ValueError(f"Input tensor must be 2D, but got {position_ids.dim()} dimensions.")
        
    batch_size, seq_len = position_ids.shape
    device = position_ids.device

    col_indices = torch.arange(seq_len, device=device, dtype=position_ids.dtype).expand(batch_size, -1)
    mask = (position_ids != 0)

    masked_indices = col_indices * mask
    last_nonzero_idx = torch.max(masked_indices, dim=1).values
    has_nonzero = torch.any(mask, dim=1)
    pad_start_idx = torch.where(has_nonzero, last_nonzero_idx + 1, torch.tensor(0, device=device, dtype=position_ids.dtype))

    padding_mask = col_indices >= pad_start_idx.unsqueeze(1)
    new_pad_values = col_indices - pad_start_idx.unsqueeze(1)
    position_ids = torch.where(padding_mask, new_pad_values, position_ids)

    return position_ids
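
# Worked example (not executed): the batched version applies the same rewrite row by row, e.g.
#   modify_padded_position_ids_2d(torch.tensor([[0, 1, 2, 3, 0, 0, 0]]))
#   -> tensor([[0, 1, 2, 3, 0, 1, 2]])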


def calculate_token_nums(position_ids: torch.Tensor):
    """
    使用 PyTorch 高效计算一个批次中每个打包序列的长度。

    Args:
        position_ids (torch.Tensor): 一个 2D Tensor,形状为 (batch_size, sequence_length)。
                                     例如:tensor([[0,1,2,3,4,0,1,2,3,4,5,0,1,2,3,0,0,0]])
    Returns:
        list[list[int]]: 一个嵌套列表,包含每个批次项中各个序列的长度。
                         例如:[[5, 6, 4, 1, 1, 1]]
    """
    # 检查输入是否为 2D Tensor
    if position_ids.dim() != 2:
        raise ValueError(f"输入必须是 2D Tensor,但得到了 {position_ids.dim()}D")

    all_lengths = []
    
    # 我们按批次逐行处理。因为每行的序列长度数量不同(ragged),
    # 所以 Python 循环在批次维度上是最高效且最清晰的写法。
    # 循环内部的操作是完全向量化的。
    for pids_row in position_ids:
        # 获取当前行的总长度
        seq_len = pids_row.shape[0]
        
        # 1. 找到所有值为 0 的元素的索引
        # pids_row == 0 会返回一个布尔 Tensor: [True, False, ..., True, ...]
        # torch.nonzero 会返回这些 True 值的索引
        # .flatten() 将其从 (N, 1) 形状的 Tensor 变为 (N,) 形状
        zero_indices = torch.nonzero(pids_row == 0).flatten()
        
        # 2. 将序列的总长度作为一个额外的切分点添加到末尾
        # 这对于计算最后一个序列的长度至关重要
        # 注意:要确保新创建的 tensor 和原始 tensor 在同一个设备上 (cpu/cuda)
        split_points = torch.cat([
            zero_indices,
            torch.tensor([seq_len], device=pids_row.device, dtype=zero_indices.dtype)
        ])
        
        # 3. 计算相邻切分点之间的差值,这就是我们想要的长度
        # torch.diff([a, b, c, d]) 会返回 [b-a, c-b, d-c]
        lengths = torch.diff(split_points)

        all_lengths.append(lengths)

    return all_lengths


def forward_add_noise_packed(
    inputs_embeds: torch.Tensor,
    num_tokens_list: List[torch.Tensor],
    prompt_mask: torch.Tensor,
    mask_embed: torch.Tensor,
    eps: float = 1e-3,
    max_tries: int = 10,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Adds noise to the embeddings of a batch of packed sequences.

    The function draws an independent random noise rate for every logical sample (the samples
    concatenated inside each batch item) and randomly replaces a fraction of the token embeddings
    with mask_embed. Positions flagged by prompt_mask are never noised.

    Args:
        inputs_embeds (torch.Tensor):
            Input embedding tensor of shape (bsz, total_tokens, embed_dim).
        num_tokens_list (List[torch.Tensor]):
            A list of tensors of length bsz. Each tensor records the length of every logical sample
            in the corresponding batch item, e.g. [tensor([len1, len2]), tensor([len3, len4, len5])].
        prompt_mask (torch.Tensor):
            Boolean tensor of shape (bsz, total_tokens); positions set to True are prompt tokens
            and must not be noised.
        mask_embed (torch.Tensor):
            The mask embedding used as replacement, of shape (embed_dim,) or (1, embed_dim).
        eps (float):
            Small value that keeps the noise rate t away from exactly 0, ensuring p_mask > 0.
        max_tries (int):
            Maximum number of attempts per batch item to guarantee that at least one non-prompt
            token gets masked.

    Returns:
        Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        - noisy_embeds (torch.Tensor):
            The noised embedding tensor of shape (bsz, total_tokens, embed_dim).
        - final_masked_indices (torch.Tensor):
            Boolean tensor of shape (bsz, total_tokens) marking which positions were actually masked.
        - p_mask (torch.Tensor):
            1D tensor containing the noise rate of every masked position.
    """
    # 1. Validate inputs and get shapes
    bsz, total_tokens, embed_dim = inputs_embeds.shape
    device = inputs_embeds.device

    # Check input consistency
    assert len(num_tokens_list) == bsz, f"len(num_tokens_list) ({len(num_tokens_list)}) must equal bsz ({bsz})"
    assert prompt_mask.shape == (bsz, total_tokens), f"prompt_mask shape mismatch, expected {(bsz, total_tokens)}, got {prompt_mask.shape}"

    # Prepare result containers
    noisy_embeds_list = []
    final_masked_indices_list = []
    p_masks_list = []

    # Reshape mask_embed for broadcasting: (dim,) -> (1, 1, dim)
    mask_embed_view = mask_embed.view(1, 1, embed_dim)

    # 2. Iterate over the batch dimension
    # This is the most direct way to handle batch items with different packing structures
    for i in range(bsz):
        # Extract the data of the current batch item
        current_embeds = inputs_embeds[i:i+1]  # shape: (1, total_tokens, embed_dim)
        current_num_tokens = num_tokens_list[i]
        current_prompt_mask = prompt_mask[i:i+1]  # shape: (1, total_tokens)

        num_samples_in_item = len(current_num_tokens)
        assert total_tokens == torch.sum(current_num_tokens), \
            f"Sum of num_tokens for batch item {i} does not match the total sequence length"

        eligible_for_masking = ~current_prompt_mask

        # If no token can be masked, keep the original input unchanged
        if not eligible_for_masking.any():
            noisy_embeds_list.append(current_embeds)
            final_masked_indices_list.append(torch.zeros_like(current_prompt_mask, dtype=torch.bool))
            # keep shape (1, total_tokens) so the final torch.cat over dim=0 stays consistent
            p_masks_list.append(torch.full((1, total_tokens), eps, device=device))
            continue

        # --- Try to generate a mask, ensuring at least one token gets masked ---
        final_masked_indices_item = torch.zeros_like(current_prompt_mask, dtype=torch.bool)
        p_mask_per_token = None
        for _ in range(max_tries):
            t = torch.rand(num_samples_in_item, device=device)
            p_mask_per_sample = (1 - eps) * t + eps

            p_mask_per_token_1d = torch.repeat_interleave(p_mask_per_sample, current_num_tokens)
            p_mask_per_token = p_mask_per_token_1d.unsqueeze(0)

            masked_indices = torch.rand_like(p_mask_per_token) < p_mask_per_token
            final_masked_indices_item = masked_indices & eligible_for_masking

            if final_masked_indices_item.any():
                break

        # --- Build the noisy embeddings from the final mask ---
        noisy_embeds_item = torch.where(
            final_masked_indices_item.unsqueeze(-1),
            mask_embed_view,
            current_embeds
        )

        # Store the results for this batch item
        noisy_embeds_list.append(noisy_embeds_item)
        final_masked_indices_list.append(final_masked_indices_item)

        p_masks_list.append(p_mask_per_token)

    # 3. Stack the per-item results into the final batched tensors
    final_noisy_embeds = torch.cat(noisy_embeds_list, dim=0)
    final_masked_indices = torch.cat(final_masked_indices_list, dim=0)
    p_mask = torch.cat(p_masks_list, dim=0)
    return final_noisy_embeds, final_masked_indices, p_mask[final_masked_indices]
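
# Usage sketch (not executed; shapes are illustrative): for a batch of one packed row containing two
# logical samples of lengths 3 and 4,
#   noisy, masked, p = forward_add_noise_packed(
#       inputs_embeds=torch.randn(1, 7, 16),
#       num_tokens_list=[torch.tensor([3, 4])],
#       prompt_mask=torch.zeros(1, 7, dtype=torch.bool),
#       mask_embed=torch.randn(16),
#   )
# `noisy` has shape (1, 7, 16), `masked` has shape (1, 7), and `p` is a 1D tensor with one noise
# rate per masked position.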


def block_diff_mask(b, h, q_idx, kv_idx, block_size=None, n=None):
    """
    Constructs the specialized block diffusion attention mask for training
    composed of three masks:
    - **Block Diagonal Mask (M_BD)**: Self-attention within noised blocks
    - **Offset Block Causal Mask (M_OBC)**: Cross-attention for conditional context
    - **Block Causal Mask (M_BC)**: Attention to update x0

    Args:
        b, h: Batch and head indices (ignored for mask logic).
        q_idx, kv_idx: Query and Key indices.
        block_size: Defines the block structure.
        n: Length of the noised sequence xt; token indices >= n belong to the clean sequence x0.

    Returns:
        A boolean attention mask.
    """

    # Indicate whether token belongs to xt or x0
    x0_flag_q = q_idx >= n
    x0_flag_kv = kv_idx >= n

    # Compute block indices
    block_q = torch.where(
        x0_flag_q == 1, (q_idx - n) // block_size, q_idx // block_size
    )
    block_kv = torch.where(
        x0_flag_kv == 1, (kv_idx - n) // block_size, kv_idx // block_size
    )

    # **1. Block Diagonal Mask (M_BD) **
    block_diagonal = (block_q == block_kv) & (x0_flag_q == x0_flag_kv)

    # **2. Offset Block-Causal Mask (M_OBC) **
    offset_block_causal = (block_q > block_kv) & (
        x0_flag_kv == 1) & (x0_flag_q == 0)

    # **3. Block-Causal Mask (M_BC) **
    block_causal = (block_q >= block_kv) & (x0_flag_kv == 1) & (x0_flag_q == 1)

    # **4. Combine Masks **
    return block_diagonal | offset_block_causal | block_causal
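
# Illustrative example (not executed): with n=2 and block_size=1, indices 0-1 are the noised tokens
# xt and indices 2-3 are the clean tokens x0. Each xt token attends within its own noised block
# (M_BD) and to the x0 tokens of strictly earlier blocks (M_OBC), while x0 tokens attend
# block-causally to other x0 tokens only (M_BC); an xt token never attends to the x0 copy of its
# own block.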


def block_attn_mask(num_tokens, block_size, device):
    masks = []
    for i in range(len(num_tokens)):
        cur_masks = []
        for num in num_tokens[i]:
            # build the full (2 * num, 2 * num) block-diffusion mask for this logical sample (xt tokens first, then x0 tokens)
            single_mask = block_diff_mask(
                b=None,
                h=None,
                q_idx=torch.arange(num * 2, device=device)[:, None],
                kv_idx=torch.arange(num * 2, device=device)[None, :],
                block_size=block_size,
                n=num,
            )
            cur_masks.append(single_mask)
        masks.append(torch.block_diag(*cur_masks))
    masks = torch.stack(masks, dim=0)
    return masks
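
# Shape sketch (not executed): for num_tokens=[torch.tensor([3, 2])] and block_size=1, each logical
# sample yields a (2 * num, 2 * num) mask; the per-sample masks are placed block-diagonally, so the
# returned tensor has shape (1, 10, 10).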


@auto_docstring(
    custom_intro="""
    The Llava-Next model which consists of a vision backbone and a language model without language modeling head.
    """
)
class LlavaOnevisionModel(LlavaOnevisionPreTrainedModel):
    _checkpoint_conversion_mapping = {"language_model.model": "language_model"}

    def __init__(self, config):
        super().__init__(config)
        self.vision_tower = AutoModel.from_config(config.vision_config)

        self.multi_modal_projector = LlavaOnevisionMultiModalProjector(config)
        embed_std = 1 / math.sqrt(config.text_config.hidden_size)
        self.image_newline = nn.Parameter(torch.randn(config.text_config.hidden_size, dtype=self.dtype) * embed_std)

        self.vocab_size = config.text_config.vocab_size
        if "auto_map" in config.text_config.to_dict():
            logger.warning_once(
                "The text_config of this model contains `auto_map` in its configuration. This might result in errors when using `from_pretrained` to load the model. Please make sure that the `auto_map` is correct."
            )
            config.text_config._name_or_path = config._name_or_path
            self.language_model = AutoModel.from_config(config.text_config, trust_remote_code=True)
        else:
            self.language_model = AutoModel.from_config(config.text_config)

        self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
        self.post_init()

    def get_input_embeddings(self):
        return self.language_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.language_model.set_input_embeddings(value)

    def pack_image_features(self, image_features, image_sizes, image_newline=None, vision_aspect_ratio="anyres_max_9"):
        """
        Reshape, unpad and then pack each image_feature into a single image_features tensor containing all visual vectors.

        Args:
            image_features (`List[torch.Tensor]` of length num_images, each of shape `(num_patches, image_length, embed_dim)`)
                List of image feature tensor, each contains all the visual feature of all patches.
            image_sizes (`torch.Tensor` of shape `(num_images, 2)`)
                Actual image size of each image (H, W).
            image_newline (`torch.Tensor` of shape `(embed_dim)`)
                New line embedding vector.
            vision_aspect_ratio (`str`, *optional*, defaults to `"anyres_max_9"`):
                Aspect ratio used when processing image features. The default value is "anyres_max_9".
        Returns:
            image_features (`torch.Tensor` of shape `(all_feat_len, embed_dim)`)
            feature_lens (`List[int]`)
                token length of each image in image_features
        """
        new_image_features = []
        feature_lens = []
        for image_idx, image_feature in enumerate(image_features):
            if image_feature.shape[0] > 1:
                base_image_feature = image_feature[0]
                image_feature = image_feature[1:]
                height = width = self.config.vision_config.image_size // self.config.vision_config.patch_size
                if height * width != base_image_feature.shape[0]:
                    raise ValueError("The number of patches is not consistent with the image size.")
                num_patch_height, num_patch_width = get_anyres_image_grid_shape(
                    image_sizes[image_idx],
                    self.config.image_grid_pinpoints,
                    self.config.vision_config.image_size,
                )
                image_feature = image_feature.view(num_patch_height, num_patch_width, height, width, -1)
                image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous()
                image_feature = image_feature.flatten(1, 2).flatten(2, 3)
                image_feature = unpad_image(image_feature, image_sizes[image_idx])
                max_num_patches = int(vision_aspect_ratio.strip("anyres_max_"))
                channels, curr_height, curr_width = image_feature.shape
                ratio = math.sqrt(curr_height * curr_width / (max_num_patches * height**2))
                if ratio > 1.1:
                    image_feature = image_feature[None]
                    image_feature = nn.functional.interpolate(
                        image_feature, [int(curr_height // ratio), int(curr_width // ratio)], mode="bilinear"
                    )[0]
                if image_newline is not None:
                    image_feature = torch.cat(
                        (
                            image_feature,
                            image_newline[:, None, None]
                            .expand(*image_feature.shape[:-1], 1)
                            .to(image_feature.device, image_feature.dtype),
                        ),
                        dim=-1,
                    )
                image_feature = image_feature.flatten(1, 2).transpose(0, 1)
                image_feature = torch.cat((base_image_feature, image_feature), dim=0)
            else:
                image_feature = image_feature[0]
                if image_newline is not None:
                    image_feature = torch.cat((image_feature, image_newline[None].to(image_feature)), dim=0)
            new_image_features.append(image_feature)
            feature_lens.append(image_feature.size(0))
        image_features = torch.cat(new_image_features, dim=0)
        feature_lens = torch.tensor(feature_lens, dtype=torch.long, device=image_features.device)
        return image_features, feature_lens

    def get_image_features(
        self,
        pixel_values: torch.FloatTensor,
        image_sizes: torch.Tensor,
        vision_feature_layer: Optional[Union[int, List[int]]] = None,
        vision_feature_select_strategy: Optional[str] = None,
    ):
        """
        Obtains image last hidden states from the vision tower and apply multimodal projection.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_patches, channels, height, width)`)
                The tensors corresponding to the input images.
            image_sizes (`torch.Tensor` of shape `(num_images, 2)`)
                Actual image size of each image (H, W).
            vision_feature_layer (`Union[int, List[int]]`, *optional*):
                The index of the layer to select the vision feature. If multiple indices are provided,
                the vision feature of the corresponding indices will be concatenated to form the
                vision features.
            vision_feature_select_strategy (`str`, *optional*):
                The feature selection strategy used to select the vision feature from the vision backbone.
                Can be one of `"default"` or `"full"`
        Returns:
            image_features (List[`torch.Tensor`]): List of image feature tensor, each contains all the visual feature of all patches
            and are of shape `(num_patches, image_length, embed_dim)`).
        """
        vision_feature_layer = (
            vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
        )
        vision_feature_select_strategy = (
            vision_feature_select_strategy
            if vision_feature_select_strategy is not None
            else self.config.vision_feature_select_strategy
        )

        # ! infer image_num_patches from image_sizes
        image_num_patches = [
            image_size_to_num_patches(
                image_size=imsize,
                grid_pinpoints=self.config.image_grid_pinpoints,
                patch_size=self.config.vision_config.image_size,
            )
            for imsize in image_sizes
        ]
        if pixel_values.dim() == 5:
            # stacked if input is (batch_size, num_patches, num_channels, height, width)
            _pixel_values_list = [pix_val[:num_patch] for pix_val, num_patch in zip(pixel_values, image_num_patches)]
            pixel_values = torch.cat(_pixel_values_list, dim=0)
        elif pixel_values.dim() != 4:
            # otherwise has to be stacked from list of (num_patches, num_channels, height, width)
            raise ValueError(f"pixel_values of shape {pixel_values.shape}, expect to be of 4 or 5 dimensions")

        image_features = self.vision_tower(pixel_values, output_hidden_states=True)
        # If we have one vision feature layer, return the corresponding hidden states,
        # otherwise, select the hidden states of each feature layer and concatenate them
        if isinstance(vision_feature_layer, int):
            selected_image_feature = image_features.hidden_states[vision_feature_layer]
        else:
            hs_pool = [image_features.hidden_states[layer_idx] for layer_idx in vision_feature_layer]
            selected_image_feature = torch.cat(hs_pool, dim=-1)

        if vision_feature_select_strategy == "default":
            selected_image_feature = selected_image_feature[:, 1:]
        elif vision_feature_select_strategy == "full":
            selected_image_feature = selected_image_feature
        image_features = self.multi_modal_projector(selected_image_feature)
        image_features = torch.split(image_features, image_num_patches, dim=0)
        return image_features

    def _get_mask_embedding(self):
        device = self.get_input_embeddings().weight.device
        mask_token_tensor = torch.tensor(self.config.text_config.mask_token_id, device=device)
        return self.get_input_embeddings()(mask_token_tensor)

    def prepare_for_bd_training(self, inputs_embeds, position_ids, prompt_mask):
        bsz, seq_len, _ = inputs_embeds.shape
        num_tokens = calculate_token_nums(position_ids) # List[torch.Tensor]
        noisy_inputs_embeds, logits_to_keep_half, p_mask = forward_add_noise_packed(
            inputs_embeds=inputs_embeds,
            num_tokens_list=num_tokens,
            prompt_mask=prompt_mask,
            mask_embed=self._get_mask_embedding(),
        )
        router_noisy_part_list = []
        for i in range(bsz):
            cur_router_noisy_part = (torch.arange(num_tokens[i].shape[0] * 2) % 2 == 0).to(inputs_embeds.device)
            cur_router_noisy_part = cur_router_noisy_part.repeat_interleave(num_tokens[i].repeat_interleave(2))
            router_noisy_part_list.append(cur_router_noisy_part)
        router_noisy_part = torch.stack(router_noisy_part_list, dim=0)
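
        # Resulting layout per row: [noisy sample 1 | clean sample 1 | noisy sample 2 | clean sample 2 | ...];
        # router_noisy_part is True on the noisy copy of each logical sample and False on the clean copy.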

        # concatenated inputs_embeds: (bsz, seq_len * 2, dim)
        concat_inputs_embeds = inputs_embeds.repeat(1, 2, 1)
        # concatenated logits_to_keep: (bsz, seq_len * 2)
        logits_to_keep = torch.zeros(
                    bsz, 2 * seq_len, dtype=torch.bool, device=inputs_embeds.device)
        # concatenated position_ids: (bsz, seq_len * 2)
        concat_position_ids = torch.zeros(
                    bsz, 2 * seq_len, dtype=position_ids.dtype, device=position_ids.device)
        for i in range(bsz):
            concat_inputs_embeds[i][router_noisy_part[i]] = noisy_inputs_embeds[i]
            concat_inputs_embeds[i][~router_noisy_part[i]] = inputs_embeds[i]

            logits_to_keep[i][router_noisy_part[i]] = logits_to_keep_half[i]

            concat_position_ids[i][router_noisy_part[i]] = position_ids[i]
            concat_position_ids[i][~router_noisy_part[i]] = position_ids[i]

        # create flex_attention mask
        attention_mask = block_attn_mask(num_tokens, self.config.text_config.block_size, inputs_embeds.device)
        flex_attention_mask_3d = create_block_mask(
                            lambda b, h, q_idx, kv_idx: attention_mask[b, q_idx, kv_idx],
                            B=attention_mask.size(0), H=None,
                            Q_LEN=attention_mask.size(1), KV_LEN=attention_mask.size(2),
        )

        return concat_inputs_embeds, concat_position_ids, flex_attention_mask_3d, logits_to_keep_half, logits_to_keep, p_mask


    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        pixel_values: torch.FloatTensor = None,
        image_sizes: Optional[torch.LongTensor] = None,
        pixel_values_videos: torch.FloatTensor = None,
        image_sizes_videos: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        prompt_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        vision_feature_layer: Optional[Union[int, List[int]]] = None,
        vision_feature_select_strategy: Optional[str] = None,
        vision_aspect_ratio: Optional[str] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Union[Tuple, LlavaOnevisionModelOutputWithPast]:
        r"""
        pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, frames, num_channels, image_size, image_size)`):
            The tensors corresponding to the input videos. Pixel values can be obtained using
            [`LlavaNextVideoProcessor`]. See [`LlavaNextVideoProcessor.__call__`] for details. [`LlavaProcessor`] uses
            [`LlavaNextVideoProcessor`] for processing videos.
        image_sizes_videos (`torch.LongTensor` of shape `(batch_size, frames, 2)`, *optional*):
            The sizes of the videos in the batch, being (height, width) for each frame in the video.
        vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`):
            The feature selection strategy used to select the vision feature from the vision backbone.
            Can be one of `"default"` or `"full"`. If `"default"`, the CLS token is removed from the vision features.
            If `"full"`, the full vision features are used.
        vision_aspect_ratio (`str`, *optional*, defaults to `"anyres_max_9"`):
            Aspect ratio used when processing image features. The default value is "anyres_max_9".
        """

        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        vision_feature_layer = (
            vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
        )
        vision_feature_select_strategy = (
            vision_feature_select_strategy
            if vision_feature_select_strategy is not None
            else self.config.vision_feature_select_strategy
        )
        vision_aspect_ratio = (
            vision_aspect_ratio if vision_aspect_ratio is not None else self.config.vision_aspect_ratio
        )

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if (pixel_values is not None or pixel_values_videos is not None) and inputs_embeds is not None:
            raise ValueError(
                "You cannot specify both `pixel_values`/`pixel_values_videos` and `inputs_embeds` at the same time, "
                "and must specify either one"
            )

        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings()(input_ids)

        # Images are processed with Anyres
        if pixel_values is not None:
            image_features = self.get_image_features(
                pixel_values,
                image_sizes,
                vision_feature_layer=vision_feature_layer,
                vision_feature_select_strategy=vision_feature_select_strategy,
            )
            image_features, feature_lens = self.pack_image_features(
                image_features,
                image_sizes,
                image_newline=self.image_newline,
                vision_aspect_ratio=vision_aspect_ratio,
            )

            special_image_mask = (input_ids == self.config.image_token_id).unsqueeze(-1)
            special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device)
            if not is_torchdynamo_compiling() and inputs_embeds[special_image_mask].numel() != image_features.numel():
                n_image_tokens = (input_ids == self.config.image_token_id).sum()
                n_image_features = image_features.shape[0]
                raise ValueError(
                    f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
                )
            image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)

        # Video are simply embedded and further pooled to decrease seq len
        if pixel_values_videos is not None:
            video_features = self.get_video_features(
                pixel_values_videos,
                vision_feature_layer=vision_feature_layer,
                vision_feature_select_strategy=vision_feature_select_strategy,
            )
            if isinstance(video_features, tuple):
                image_newline = self.image_newline[None, :].to(video_features[0].device)
                video_features = [torch.cat((single_video_feature, image_newline), dim=0) for single_video_feature in video_features]
                video_features = torch.cat(video_features, dim=0)
            else:
                image_newline = (
                    self.image_newline[None, None, :].repeat(video_features.shape[0], 1, 1).to(video_features.device)
                )
                video_features = torch.cat((video_features, image_newline), dim=1)
                video_features = video_features.flatten(0, 1)

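            # As with images, replace the special video placeholder tokens with the pooled video features.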
            special_video_mask = (input_ids == self.config.video_token_id).unsqueeze(-1)
            special_video_mask = special_video_mask.expand_as(inputs_embeds).to(inputs_embeds.device)
            if not is_torchdynamo_compiling() and inputs_embeds[special_video_mask].numel() != video_features.numel():
                n_video_tokens = (input_ids == self.config.video_token_id).sum()
                n_video_features = video_features.shape[0]
                raise ValueError(
                    f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}"
                )
            video_features = video_features.to(inputs_embeds.device, inputs_embeds.dtype)
            inputs_embeds = inputs_embeds.masked_scatter(special_video_mask, video_features)

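        # Training path: `prepare_for_bd_training` rearranges the batch for block-diffusion training and
        # returns the concatenated embeddings and position ids, a 3D flex-attention mask, the index tensors
        # (`logits_to_keep_half`, `logits_to_keep`) selecting the positions used for the loss, and `p_mask`.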
        if self.training:
            position_ids = modify_padded_position_ids_2d(position_ids)
            (
                concat_inputs_embeds,
                concat_position_ids,
                flex_attention_mask_3d,
                logits_to_keep_half,
                logits_to_keep,
                p_mask,
            ) = self.prepare_for_bd_training(inputs_embeds, position_ids, prompt_mask)
            outputs = self.language_model(
                attention_mask=flex_attention_mask_3d,
                position_ids=concat_position_ids,
                inputs_embeds=concat_inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=True,
                cache_position=cache_position,
                **kwargs,
            )
        else:
            # Inference path: a regular forward pass through the language model, with cache support.
            outputs = self.language_model(
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_values=past_key_values,
                inputs_embeds=inputs_embeds,
                use_cache=use_cache,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=True,
                cache_position=cache_position,
                **kwargs,
            )

        return LlavaOnevisionModelOutputWithPast(
            last_hidden_state=outputs.last_hidden_state,
            logits_to_keep_half=logits_to_keep_half if self.training else None,
            logits_to_keep=logits_to_keep if self.training else None,
            p_mask=p_mask if self.training else None,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_hidden_states=image_features if pixel_values is not None else None,
            video_hidden_states=video_features if pixel_values_videos is not None else None,
        )

    def get_video_features(
        self,
        pixel_values: torch.FloatTensor,
        vision_feature_layer: Union[int, List[int]],
        vision_feature_select_strategy: str,
    ):
        """
        Obtains video last hidden states from the vision tower, applies multimodal projection and pooling.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_frames, channels, height, width)`):
               The tensors corresponding to the input video.
            vision_feature_layer (`Union[int, List[int]]`, *optional*, defaults to -2):
                The index of the layer to select the vision feature. If multiple indices are provided,
                the vision feature of the corresponding indices will be concatenated to form the
                vision features.
            vision_feature_select_strategy (`str`):
                The feature selection strategy used to select the vision feature from the vision backbone.
                Can be one of `"default"` or `"full"`
        Returns:
            video_features (List[`torch.Tensor`]): List of video feature tensors, each containing the visual
            features of all patches, of shape `(num_videos, video_length, embed_dim)`.
        """
        has_variable_frames = isinstance(pixel_values, list)
        if has_variable_frames:
            # Videos may have different numbers of frames; concatenate them along the frame axis.
            frame_nums = [video.size(0) for video in pixel_values]
            pixel_values = torch.cat(pixel_values, dim=0)  # Shape: (total_frames, C, H, W)
        else:
            # All videos have the same number of frames.
            batch_size, frames, channels, height, width = pixel_values.shape
            pixel_values = pixel_values.view(batch_size * frames, channels, height, width)
        video_features = self.vision_tower(pixel_values, output_hidden_states=True)
        # If we have one vision feature layer, return the corresponding hidden states,
        # otherwise, select the hidden states of each feature layer and concatenate them
        if isinstance(vision_feature_layer, int):
            selected_video_feature = video_features.hidden_states[vision_feature_layer]
        else:
            hs_pool = [video_features.hidden_states[layer_idx] for layer_idx in vision_feature_layer]
            selected_video_feature = torch.cat(hs_pool, dim=-1)

        if vision_feature_select_strategy == "default":
            selected_video_feature = selected_video_feature[:, 1:]
        elif vision_feature_select_strategy == "full":
            selected_video_feature = selected_video_feature
        video_features = self.multi_modal_projector(selected_video_feature)

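        # Pool each frame's token grid (see `apply_pooling`) to shorten the per-frame sequence.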
        video_features = self.apply_pooling(video_features)

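        # Re-split the pooled features into per-video chunks when videos have different frame counts.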
        if has_variable_frames:
            tokens_per_frame = video_features.shape[1]
            video_features = video_features.flatten(0, 1)
            video_tokens_lengths = [num_frames * tokens_per_frame for num_frames in frame_nums]
            video_features = torch.split(video_features, video_tokens_lengths, dim=0)
        else:
            video_features = video_features.reshape(batch_size, frames * video_features.shape[1], -1)

        return video_features

    def apply_pooling(self, image_features):
        height = width = self.config.vision_config.image_size // self.config.vision_config.patch_size
        batch_frames, seq_len, dim = image_features.shape
        # Reinterpret the flat token sequence as a (height, width) grid of patch embeddings.
        image_features = image_features.view(batch_frames, height, width, -1)
        image_features = image_features.permute(0, 3, 1, 2).contiguous()

        # Bilinearly downsample the grid by 2x along each spatial dimension.
        height, width = image_features.shape[2:]
        scaled_shape = [math.ceil(height / 2), math.ceil(width / 2)]
        image_features = nn.functional.interpolate(image_features, size=scaled_shape, mode="bilinear")

        # Flatten the pooled grid back into a token sequence.
        image_features = image_features.permute(0, 2, 3, 1)
        image_features = image_features.view(batch_frames, -1, dim)
        return image_features


class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ...


@auto_docstring(
    custom_intro="""
    The LLaVA-Onevision model which consists of a vision backbone and a language model.
    """
)
class LlavaOnevisionForConditionalGeneration(LlavaOnevisionPreTrainedModel, GenerationMixin):
    _checkpoint_conversion_mapping = {
        "^language_model.model": "model.language_model",
        "^vision_tower": "model.vision_tower",
        "^multi_modal_projector": "model.multi_modal_projector",
        "^image_newline": "model.image_newline",
        "^language_model.lm_head": "lm_head",
    }
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config: LlavaOnevisionConfig):
        super().__init__(config)
        self.model = LlavaOnevisionModel(config)
        self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
        self.post_init()

    def get_input_embeddings(self):
        return self.model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.model.set_input_embeddings(value)

    def get_output_embeddings(self) -> nn.Module:
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    # Make modules available through the conditional class for BC
    @property
    def language_model(self):
        return self.model.language_model

    @property
    def vision_tower(self):
        return self.model.vision_tower

    @property
    def multi_modal_projector(self):
        return self.model.multi_modal_projector

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        pixel_values: torch.FloatTensor = None,
        image_sizes: Optional[torch.LongTensor] = None,
        pixel_values_videos: torch.FloatTensor = None,
        image_sizes_videos: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        vision_feature_layer: Optional[Union[int, List[int]]] = None,
        vision_feature_select_strategy: Optional[str] = None,
        vision_aspect_ratio: Optional[str] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs: Unpack[KwargsForCausalLM],
    ) -> Union[Tuple, LlavaOnevisionCausalLMOutputWithPast]:
        r"""
        pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, frames, num_channels, image_size, image_size)`):
            The tensors corresponding to the input videos. Pixel values can be obtained using
            [`LlavaNextVideoProcessor`]. See [`LlavaNextVideoProcessor.__call__`] for details. [`LlavaProcessor`] uses
            [`LlavaNextVideoProcessor`] for processing videos.
        image_sizes_videos (`torch.LongTensor` of shape `(batch_size, frames, 2)`, *optional*):
            The sizes of the videos in the batch, being (height, width) for each frame in the video.
        vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`):
            The feature selection strategy used to select the vision feature from the vision backbone.
            Can be one of `"default"` or `"full"`. If `"default"`, the CLS token is removed from the vision features.
            If `"full"`, the full vision features are used.
        vision_aspect_ratio (`str`, *optional*, defaults to `"anyres_max_9"`):
            Aspect ratio used when processing image features. The default value is "anyres_max_9".
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> import torch
        >>> from transformers import LlavaOnevisionProcessor, LlavaOnevisionForConditionalGeneration

        >>> model = LlavaOnevisionForConditionalGeneration.from_pretrained("llava-hf/llava-onevision-qwen2-7b-ov-hf", torch_dtype="float16", device_map="cuda:0")
        >>> processor = LlavaOnevisionProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-7b-ov-hf")

        >>> conversation = [
        ...     {
        ...       "role": "user",
        ...       "content": [
        ...           {"type": "text", "text": "What is shown in this image?"},
        ...           {"type": "image"},
        ...         ],
        ...     },
        ... ]
        >>> prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

        >>> image_file = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> raw_image = Image.open(requests.get(image_file, stream=True).raw)
        >>> inputs = processor(text=prompt, images=raw_image, return_tensors='pt').to(0, torch.float16)

        >>> output = model.generate(**inputs, max_new_tokens=20, do_sample=False)
        >>> processor.batch_decode(output, skip_special_tokens=True)[0]
        "user\n\nWhat is shown in this image?\nassistant\ncat"
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        vision_feature_layer = (
            vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
        )
        vision_feature_select_strategy = (
            vision_feature_select_strategy
            if vision_feature_select_strategy is not None
            else self.config.vision_feature_select_strategy
        )
        vision_aspect_ratio = (
            vision_aspect_ratio if vision_aspect_ratio is not None else self.config.vision_aspect_ratio
        )
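        # Label positions set to -100 (ignored by the loss) are treated as prompt tokens when preparing
        # the block-diffusion inputs.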
        prompt_mask = (labels == -100) if labels is not None else None
        outputs = self.model(
            input_ids=input_ids,
            pixel_values=pixel_values,
            pixel_values_videos=pixel_values_videos,
            image_sizes=image_sizes,
            image_sizes_videos=image_sizes_videos,
            vision_aspect_ratio=vision_aspect_ratio,
            vision_feature_layer=vision_feature_layer,
            vision_feature_select_strategy=vision_feature_select_strategy,
            attention_mask=attention_mask,
            prompt_mask=prompt_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
            cache_position=cache_position,
            logits_to_keep=logits_to_keep,
            **kwargs,
        )

        hidden_states = outputs[0]
        # During training only the hidden states at the positions selected by `logits_to_keep` are used for
        # the loss; at inference the full logits are computed from the last hidden state.

        loss = None
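        # The fused linear + cross-entropy kernel below consumes hidden states and the `lm_head` weights
        # directly, so the full logits tensor is never materialized during training. `p_mask` from input
        # preparation is forwarded to it (in diffusion-LM objectives this typically reweights tokens by
        # their masking probability), and the summed loss is normalized by the number of label positions.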
        if self.training:
            assert labels is not None, "Labels must be provided for training."
            hidden_states = hidden_states[outputs.logits_to_keep].contiguous()
            labels = labels[outputs.logits_to_keep_half].contiguous()
            loss_fct = FusedLinearDiffusionCrossEntropyLoss(reduction='sum')
            loss = loss_fct(  # it will return (sum_loss, unreduced_loss)
                    # conduct `view(-1, V)` inside the function
                    x=hidden_states,
                    target=labels,
                    weight=self.lm_head.weight,
                    bias=self.lm_head.bias,
                    p_mask=outputs.p_mask,
                )
            loss = loss / labels.numel()
            logits = None
        else:
            logits = self.lm_head(hidden_states)

        return LlavaOnevisionCausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_hidden_states=outputs.image_hidden_states,
            video_hidden_states=outputs.video_hidden_states,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        inputs_embeds=None,
        pixel_values=None,
        image_sizes=None,
        pixel_values_videos=None,
        image_sizes_videos=None,
        attention_mask=None,
        cache_position=None,
        logits_to_keep=None,
        **kwargs,
    ):
        # Overwritten -- in specific circumstances we don't want to forward image inputs to the model

        model_inputs = super().prepare_inputs_for_generation(
            input_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            cache_position=cache_position,
            logits_to_keep=logits_to_keep,
            **kwargs,
        )

        if cache_position[0] == 0:
            # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
            # Otherwise we need pixel values to be passed to model
            model_inputs["pixel_values"] = pixel_values
            model_inputs["image_sizes"] = image_sizes
            model_inputs["pixel_values_videos"] = pixel_values_videos
            model_inputs["image_sizes_videos"] = image_sizes_videos

        return model_inputs

    @staticmethod
    def _prepare_4d_causal_attention_mask_with_cache_position(
        attention_mask: torch.Tensor,
        sequence_length: int,
        target_length: int,
        dtype: torch.dtype,
        cache_position: torch.Tensor,
        batch_size: int,
        **kwargs,
    ):
        """
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        """
        if attention_mask is not None and attention_mask.dim() == 4:
            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
            causal_mask = attention_mask
        else:
            min_dtype = torch.finfo(dtype).min
            causal_mask = torch.full(
                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
            )
            if sequence_length != 1:
                causal_mask = torch.triu(causal_mask, diagonal=1)
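            # Keep the masking only for key positions beyond each query's cache position, so tokens already
            # in the cache remain visible.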
            causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
            if attention_mask is not None:
                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
                    causal_mask.device
                )
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                    padding_mask, min_dtype
                )

        return causal_mask


__all__ = ["LlavaOnevisionModel", "LlavaOnevisionForConditionalGeneration", "LlavaOnevisionPreTrainedModel"]